[med-svn] r399 - in trunk/packages/clustalw/trunk: . debian debian/patches
charles-guest at alioth.debian.org
charles-guest at alioth.debian.org
Sun Aug 12 15:08:40 UTC 2007
Author: charles-guest
Date: 2007-08-12 15:08:40 +0000 (Sun, 12 Aug 2007)
New Revision: 399
Added:
trunk/packages/clustalw/trunk/debian/patches/
trunk/packages/clustalw/trunk/debian/patches/amenu.c.patch
trunk/packages/clustalw/trunk/debian/patches/clustal-help.patch
trunk/packages/clustalw/trunk/debian/patches/clustalw.h.patch
trunk/packages/clustalw/trunk/debian/patches/clustalx.html.patch
trunk/packages/clustalw/trunk/debian/patches/clustalx_help.patch
trunk/packages/clustalw/trunk/debian/patches/interface.c.patch
trunk/packages/clustalw/trunk/debian/patches/makefile.patch
trunk/packages/clustalw/trunk/debian/patches/sequence.c.patch
trunk/packages/clustalw/trunk/debian/patches/series
trunk/packages/clustalw/trunk/debian/patches/trees.c.patch
trunk/packages/clustalw/trunk/debian/patches/util.c.patch
trunk/packages/clustalw/trunk/debian/patches/xmenu.c.patch
Removed:
trunk/packages/clustalw/trunk/README_W
trunk/packages/clustalw/trunk/README_X
trunk/packages/clustalw/trunk/alnscore.c
trunk/packages/clustalw/trunk/amenu.c
trunk/packages/clustalw/trunk/calcgapcoeff.c
trunk/packages/clustalw/trunk/calcprf1.c
trunk/packages/clustalw/trunk/calcprf2.c
trunk/packages/clustalw/trunk/calctree.c
trunk/packages/clustalw/trunk/clustalv.doc
trunk/packages/clustalw/trunk/clustalw.c
trunk/packages/clustalw/trunk/clustalw.doc
trunk/packages/clustalw/trunk/clustalw.h
trunk/packages/clustalw/trunk/clustalw.ms
trunk/packages/clustalw/trunk/clustalw.new
trunk/packages/clustalw/trunk/clustalw_help
trunk/packages/clustalw/trunk/clustalx.c
trunk/packages/clustalw/trunk/clustalx.html
trunk/packages/clustalw/trunk/clustalx_help
trunk/packages/clustalw/trunk/coldna.par
trunk/packages/clustalw/trunk/colprint.par
trunk/packages/clustalw/trunk/colprot.par
trunk/packages/clustalw/trunk/dayhoff.h
trunk/packages/clustalw/trunk/gcgcheck.c
trunk/packages/clustalw/trunk/general.h
trunk/packages/clustalw/trunk/globin.pep
trunk/packages/clustalw/trunk/gon90.bla
trunk/packages/clustalw/trunk/interface.c
trunk/packages/clustalw/trunk/makefile
trunk/packages/clustalw/trunk/makefile.alpha
trunk/packages/clustalw/trunk/makefile.linux
trunk/packages/clustalw/trunk/makefile.sgi
trunk/packages/clustalw/trunk/makefile.sun
trunk/packages/clustalw/trunk/malign.c
trunk/packages/clustalw/trunk/matrices.h
trunk/packages/clustalw/trunk/matrixseries.gon
trunk/packages/clustalw/trunk/pairalign.c
trunk/packages/clustalw/trunk/param.h
trunk/packages/clustalw/trunk/prfalign.c
trunk/packages/clustalw/trunk/random.c
trunk/packages/clustalw/trunk/readmat.c
trunk/packages/clustalw/trunk/sequence.c
trunk/packages/clustalw/trunk/showpair.c
trunk/packages/clustalw/trunk/trees.c
trunk/packages/clustalw/trunk/util.c
trunk/packages/clustalw/trunk/xcolor.c
trunk/packages/clustalw/trunk/xdisplay.c
trunk/packages/clustalw/trunk/xmenu.c
trunk/packages/clustalw/trunk/xmenu.h
trunk/packages/clustalw/trunk/xscore.c
trunk/packages/clustalw/trunk/xutils.c
Modified:
trunk/packages/clustalw/trunk/
trunk/packages/clustalw/trunk/debian/
trunk/packages/clustalw/trunk/debian/changelog
trunk/packages/clustalw/trunk/debian/control
trunk/packages/clustalw/trunk/debian/rules
Log:
patchification
Property changes on: trunk/packages/clustalw/trunk
___________________________________________________________________
Name: mergeWithUpstream
+ 1
Deleted: trunk/packages/clustalw/trunk/README_W
===================================================================
--- trunk/packages/clustalw/trunk/README_W 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/README_W 2007-08-12 15:08:40 UTC (rev 399)
@@ -1,280 +0,0 @@
-******************************************************************************
-
- CLUSTAL W Multiple Sequence Alignment Program
- (version 1.83, Feb 2003)
-
-******************************************************************************
-
-
-Please send bug reports, comments etc. to one of:-
- gibson at embl-heidelberg.de
- thompson at igbmc.u-strasbg.fr
- d.higgins at ucc.ie
-
-
-******************************************************************************
-
- POLICY ON COMMERCIAL DISTRIBUTION OF CLUSTAL W
-
-Clustal W is freely available to the user community. However, Clustal W is
-increasingly being distributed as part of commercial sequence analysis
-packages. To help us safeguard future maintenance and development, commercial
-distributors of Clustal W must take out a NON-EXCLUSIVE LICENCE. Anyone
-wishing to commercially distribute version 1.81 of Clustal W should contact the
-authors unless they have previously taken out a licence.
-
-******************************************************************************
-
-Clustal W is written in ANSI-C and can be run on any machine with an ANSI-C
-compiler. Executables are provided for several major platforms.
-
-Changes since CLUSTAL X Version 1.82
-------------------------------------
-
-1. The FASTA format has been added to the list of alignment output options.
-
-2. It is now possible to save the residue ranges (appended after the sequence
-names) when saving a specified range of the alignment.
-
-3. The efficiency of the neighbour-joining algorithm has been improved. This
-work was done by Tadashi Koike at the Center for Information Biology and DNA Data
-Bank of Japan and FUJITSU Limited.
-
-Some example speedups are given below : (timings on a SPARC64 CPU)
-
-No. of sequences original NJ new NJ
- 200 0' 12" 0.1"
- 500 9' 19" 1.4"
- 1000 XXXX 0' 31"
-
-Changes since version 1.8
---------------------------
-
-1. ClustalW now returns error codes for some common errors when exiting. This
-may be useful for people who run clustalw automatically from within a script.
-Error codes are:
- 1 bad command line option
- 2 cannot open sequence file
- 3 wrong format in sequence file
- 4 sequence file contains only 1 sequence (for multiple alignments)
-
-2. Alignments can now be saved in Nexus format, for compatibility with PAUP,
-MacClade etc. For a description of the Nexus format, see:
-Maddison, D. R., D. L. Swofford and W. P. Maddison. 1997.
-NEXUS: an extensible file format for systematic information.
-Systematic Biology 46:590-621.
-
-3. Phylogenetic trees can also be saved in nexus format.
-
-4. A ClustalW icon has been designed for MAC and PC systems.
-
-
-Changes since version 1.74
---------------------------
-
-1. Some work has been done to automatically select the optimal parameters
-depending on the set of sequences to be aligned. The Gonnet series of residue
-comparison matrices are now used by default. The Blosum series remains as an
-option. The default gap extension penalty for proteins has been changed to 0.2
-(was 0.05). The 'delay divergent sequences' option has been changed to 30%
-residue identity (was 40%).
-
-2. The default parameters used when the 'Negative matrix' option is selected
-have been optimised. This option may help when the sequences to be aligned are
-not superposable over their whole lengths (e.g. in the presence of N/C terminal
-extensions).
-
-3. A bug in the calculation of phylogenetic trees for 2 sequences has been
-fixed.
-
-4. A command line option has been added to turn off the sequence weighting
-calculation.
-
-5. The phylogenetic tree calculation now ignores any ambiguity codes in the
-sequences.
-
-6. A bug in the memory access during the calculation of profiles has been
-fixed. (Thanks to Haruna Cofer at SGI).
-
-7. A bug has been fixed in the 'transition weight' option for nucleic acid
-sequences. (Thanks to Chanan Rubin at Compugen).
-
-8. An option has been added to read in a series of comparison matrices from a
-file. This option is only applicable for protein sequences. For details of the
-file format, see the on-line documentation.
-
-9. The MSF output file format has been changed. The sequence weights
-calculated by Clustal W are now included in the header.
-
-10. Two bugs in the FAST/APPROXIMATE pairwise alignments have been fixed. One
-involved the alignment of new sequences to an existing profile using the fast
-pairwise alignment option; the second was caused by changing the default
-options for the fast pairwise alignments.
-
-11. A bug in the alignment of a small number of sequences has been fixed.
-Previously a Guide Tree was not calculated for less than 4 sequences.
-
-
-Changes since version 1.6
--------------------------
-
-1. The static arrays used by clustalw for storing the alignment data have been
-replaced by dynamically allocated memory. There is now no limit on the number
-or length of sequences which can be input.
-
-2. The alignment of DNA sequences now offers a new hard-coded matrix, as well
-as the identity matrix used previously. The new matrix is the default scoring
-matrix used by the BESTFIT program of the GCG package for the comparison of
-nucleic acid sequences. X's and N's are treated as matches to any IUB ambiguity
-symbol. All matches score 1.9; all mismatches for IUB symbols score 0.0.
-
-3. The transition weight option for aligning nucleotide sequences has been
-changed from an on/off toggle to a weight between 0 and 1. A weight of zero
-means that the transitions are scored as mismatches; a weight of 1 gives
-transitions the full match score. For distantly related DNA sequences, the
-weight should be near to zero; for closely related sequences it can be useful
-to assign a higher score.
-
-4. The RSF sequence alignment file format used by GCG Version 9 can now be
-read.
-
-5. The clustal sequence alignment file format has been changed to allow
-sequence names longer than 10 characters. The maximum length allowed is set in
-clustalw.h by the statement:
-#define MAXNAMES 10
-
-For the fasta format, the name is taken as the first string after the '>'
-character, stopping at the first white space. (Previously, the first 10
-characters were taken, replacing blanks by underscores).
-
-6. The bootstrap values written in the phylip tree file format can be assigned
-either to branches or nodes. The default is to write the values on the nodes,
-as this can be read by several commonly-used tree display programs. But note
-that this can lead to confusion if the tree is rooted and the bootstraps may
-be better attached to the internal branches: Software developers should ensure
-they can read the branch label format.
-
-7. The sequence weighting used during sequence to profile alignments has been
-changed. The tree weight is now multiplied by the percent identity of the
-new sequence compared with the most closely related sequence in the profile.
-
-8. The sequence weighting used during profile to profile alignments has been
-changed. A guide tree is now built for each profile separately and the
-sequence weights calculated from the two trees. The weights for each
-sequence are then multiplied by the percent identity of the sequence compared
-with the most closely related sequence in the opposite profile.
-
-9. The adjustment of the Gap Opening and Gap Extension Penalties for sequences
-of unequal length has been improved.
-
-10. The default order of the sequences in the output alignment file has been
-changed. Previously the default was to output the sequences in the same order
-as the input file. Now the default is to use the order in which the sequences
-were aligned (from the guide tree/dendrogram), thus automatically grouping
-closely related sequences.
-
-11. The option to 'Reset Gaps between alignments' has been switched off by
-default.
-
-12. The conservation line output in the clustal format alignment file has been
-changed. Three characters are now used:
-'*' indicates positions which have a single, fully conserved residue
-':' indicates that one of the following 'strong' groups is fully conserved:-
- STA
- NEQK
- NHQK
- NDEQ
- QHRK
- MILV
- MILF
- HY
- FYW
-
-'.' indicates that one of the following 'weaker' groups is fully conserved:-
- CSA
- ATV
- SAG
- STNK
- STPA
- SGND
- SNDEQK
- NDEQHK
- NEQHRK
- FVLIM
- HFY
-
-These are all the positively scoring groups that occur in the Gonnet Pam250
-matrix. The strong and weak groups are defined as strong score >0.5 and weak
-score =<0.5 respectively.
-
-13. A bug in the modification of the Myers and Miller alignment algorithm
-for residue-specific gap penalties has been fixed. This occasionally caused
-new gaps to be opened a few residues away from the optimal position.
-
-14. The GCG/MSF input format no longer needs the word PILEUP on the first
-line. Several versions can now be recognised:-
- 1. The word PILEUP as the first word in the file
- 2. The word !!AA_MULTIPLE_ALIGNMENT or !!NA_MULTIPLE_ALIGNMENT
- as the first word in the file
- 3. The characters MSF on the first line of the file, and the
- characters .. at the end of that line.
-
-15. The standard command line separator for UNIX systems has been changed from
-'/' to '-'. ie. to give options on the command line, you now type
-
- clustalw input.aln -gapopen=8.0
-
-instead of clustalw input.aln /gapopen=8.0
-
-
- ATTENTION SOFTWARE DEVELOPERS!!
- -------------------------------
-
-The CLUSTAL sequence alignment output format was modified from version 1.7:
-
-1. Names longer than 10 chars are now allowed. (The maximum is specified in
-clustalw.h by '#define MAXNAMES'.)
-
-2. The consensus line now consists of three characters: '*',':' and '.'. (Only
-the '*' and '.' were previously used.)
-
-3. An option (not the default) has been added, allowing the user to print out
-sequence numbers at the end of each line of the alignment output.
-
-4. Both RNA bases (U) and base ambiguities are now supported in nucleic acid
-sequences. In the past, all characters (upper or lower case) other than
-a,c,g,t or u were converted to N. Now the following characters are recognised
-and retained in the alignment output: ABCDGHKMNRSTUVWXY (upper or lower case).
-
-5. A Blank line inadvertently added in the version 1.6 header has been taken
-out again.
-
- CLUSTAL REFERENCES
- ------------------
-
-Details of algorithms, implementation and useful tips on usage of Clustal
-programs can be found in the following publications:
-
-Jeanmougin,F., Thompson,J.D., Gouy,M., Higgins,D.G. and Gibson,T.J. (1998)
-Multiple sequence alignment with Clustal X. Trends Biochem Sci, 23, 403-5.
-
-Thompson,J.D., Gibson,T.J., Plewniak,F., Jeanmougin,F. and Higgins,D.G. (1997)
-The ClustalX windows interface: flexible strategies for multiple sequence
-alignment aided by quality analysis tools. Nucleic Acids Research, 25:4876-4882.
-
-Higgins, D. G., Thompson, J. D. and Gibson, T. J. (1996) Using CLUSTAL for
-multiple sequence alignments. Methods Enzymol., 266, 383-402.
-
-Thompson, J.D., Higgins, D.G. and Gibson, T.J. (1994) CLUSTAL W: improving the
-sensitivity of progressive multiple sequence alignment through sequence
-weighting, positions-specific gap penalties and weight matrix choice. Nucleic
-Acids Research, 22:4673-4680.
-
-Higgins,D.G., Bleasby,A.J. and Fuchs,R. (1992) CLUSTAL V: improved software for
-multiple sequence alignment. CABIOS 8,189-191.
-
-Higgins,D.G. and Sharp,P.M. (1989) Fast and sensitive multiple sequence
-alignments on a microcomputer. CABIOS 5,151-153.
-
-Higgins,D.G. and Sharp,P.M. (1988) CLUSTAL: a package for performing multiple
-sequence alignment on a microcomputer. Gene 73,237-244.
Deleted: trunk/packages/clustalw/trunk/README_X
===================================================================
--- trunk/packages/clustalw/trunk/README_X 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/README_X 2007-08-12 15:08:40 UTC (rev 399)
@@ -1,392 +0,0 @@
-******************************************************************************
-
- CLUSTAL X Multiple Sequence Alignment Program
- (version 1.83, Feb 2003)
-
-******************************************************************************
-
-This README contains notes on version CHANGES and help with INSTALLATION
-
-Clustal X provides a new window-based user interface to the Clustal W multiple
-alignment program. It uses the Vibrant multi-platform user interface
-development library, developed by the National Center for Biotechnology
-Information (Bldg 38A, NIH 8600 Rockville Pike,Bethesda, MD 20894) as part of
-their NCBI SOFTWARE DEVELOPMENT TOOLKIT. The toolkit is available by
-anonymous ftp from ncbi.nlm.nih.gov
-
-Please e-mail bug reports/complaints/suggestions (polite if possible) to
- Julie Thompson at julie at igbmc.u-strasbg.fr
- or Toby Gibson at gibson at embl-heidelberg.de
-
-
-******************************************************************************
-
- POLICY ON COMMERCIAL DISTRIBUTION OF CLUSTAL W and X
-
-Clustal W and X are freely available to the user community. However, Clustal W
-is increasingly being distributed as part of commercial sequence analysis
-packages. To help us safeguard future maintenance and development, commercial
-distributors of Clustal X must take out a non-exclusive licence. Anyone
-wishing to commercially distribute version 1.81 of Clustal X should contact the
-authors unless they have previously taken out a licence.
-
-******************************************************************************
-
-Changes since CLUSTAL X Version 1.82
-------------------------------------
-
-1. The FASTA format has been added to the list of alignment output options.
-
-2. It is now possible to save the residue ranges (appended after the sequence
-names) when saving a specified range of the alignment.
-
-3. The efficiency of the neighbour-joining algorithm has been improved. This
-work was done by Tadashi Koike at the Center for Information Biology and DNA Data
-Bank of Japan and FUJITSU Limited.
-
-Some example speedups are given below : (timings on a SPARC64 CPU)
-
-No. of sequences original NJ new NJ
- 200 0' 12" 0.1"
- 500 9' 19" 1.4"
- 1000 XXXX 0' 31"
-
-
-Changes since CLUSTAL X Version 1.8
------------------------------------
-
-1. ClustalX now returns error codes for some common errors when exiting. This
-may be useful for people who run clustalx automatically from within a script.
-Error codes are:
- 1 bad command line option
- 2 cannot open sequence file
- 3 wrong format in sequence file
- 4 sequence file contains only 1 sequence (for multiple alignments)
-
-2. Alignments can now be saved in Nexus format, for compatibility with PAUP,
-MacClade etc. For a description of the Nexus format, see:
-Maddison, D. R., D. L. Swofford and W. P. Maddison. 1997.
-NEXUS: an extensible file format for systematic information.
-Systematic Biology 46:590-621.
-
-3. Phylogenetic trees can also be saved in nexus format.
-
-4. A bug causing ClustalX to crash during cut-and-paste operations has been fixed.
-
-5. A bug on PC systems, causing an error message when writing to files with
-space characters in the filename has been fixed.
-
-6. The Quality Curve is now displayed as a bar chart, instead of a line plot.
-(Thanks to Michele Clamp, michele at ebi.ac.uk, who used this format in the JalView
-editor.)
-
-7. A bug in the 'Save Profile' option, causing the default profile filename to
-be lost has been fixed.
-
-8. A ClustalX icon has been designed for MAC and PC systems.
-
-
-Changes since CLUSTAL X Version 1.65b
--------------------------------------
-
-1. Some work has been done to automatically select the optimal parameters
-depending on the set of sequences to be aligned. The Gonnet series of residue
-comparison matrices are now used by default. The Blosum series remains as an
-option. The default gap extension penalty for proteins has been changed to 0.2
-(was 0.05). The 'delay divergent sequences' option has been changed to 30%
-residue identity (was 40%).
-
-2. The default parameters used when the 'Negative matrix' option is selected
-have been optimised. This option may help when the sequences to be aligned are
-not superposable over their whole lengths (e.g. in the presence of N/C terminal
-extensions).
-
-3. An option has been added to save the quality scores displayed underneath the
-sequence window to a text file.
-
-4. The 'Hide Low-scoring segments' option has been moved from the Low-scoring
-parameter window to the Quality menu, and has been changed to 'Show Low-scoring
-segments'.
-
-5. An option has been added to allow the user to search for a string in the
-sequences.
-
-6. An option has been added to the postscript output to print on US Letter size
-paper.
-
-7. A bug in the display of the message at the bottom of the window causing the
-text to disappear when the window was resized has been fixed.
-
-8. The font for the Help window has been changed to Courier.
-
-9. A bug in the calculation of phylogenetic trees for 2 sequences has been
-fixed.
-
-10. A command line option has been added to turn off the sequence weighting
-calculation.
-
-11. The phylogenetic tree calculation now ignores any ambiguity codes in the
-sequences.
-
-12. A bug in the memory access during the calculation of profiles has been
-fixed. (Thanks to Haruna Cofer at SGI).
-
-13. A bug has been fixed in the 'transition weight' option for nucleic acid
-sequences. (Thanks to Chanan Rubin at Compugen).
-
-14. An option has been added to allow the user to read in a series of residue
-comparison matrices from a file.
-
-15. The MSF output file format has been changed. The sequence weights
-calculated by ClustalX are now included in the header.
-
-16. Two bugs in the FAST/APPROXIMATE pairwise alignments have been fixed. One
-involved the alignment of new sequences to an existing profile using the fast
-pairwise alignment option; the second was caused by changing the default
-options for the fast pairwise alignments.
-
-17. A bug in the alignment of a small number of sequences has been fixed.
-Previously a Guide Tree was not calculated for less than 4 sequences.
-
-18. Several bugs affecting use of secondary structure masks in Clustal X (but
-not in Clustal W) have been fixed.
-
-
-Changes since Version 1.5b
---------------------------
-
-1. The window displayed under MS Windows has previously been a fixed size. The
-window can now be resized by dragging the window frame.
-
-2. An option has been added to read in a series of comparison matrices from a
-file. This option is only applicable for protein sequences. For details of
-the file format, see the on-line documentation.
-
-3. A new DNA comparison matrix has been added. This is the default scoring
-matrix used by BESTFIT for the comparison of nucleic acid sequences. X's and N's
-are treated as matches to any IUB ambiguity symbol. All matches score 1.9; all
-mismatches for IUB symbols score 0.
-The previous system used by ClustalW, in which matches score 1.0 and mismatches
-score 0 remains as an option. All matches for IUB symbols will also score 0.
-
-4. You can now read a comparison matrix for DNA sequences from a file. The
-matrix file should be in the same format as for the Blast program.
-
-5. The 'Reset gaps before alignment' has been changed to 'Reset new gaps
-before alignments'. A new option 'Reset ALL gaps before alignment' has been
-added.
-RESET NEW GAPS BEFORE ALIGNMENT will remove any new gaps introduced into the
-sequences during multiple alignment if you wish to change the parameters and
-try again.
-RESET ALL GAPS BEFORE ALIGNMENT will remove all gaps in the sequences including
-gaps which were read in from the sequence input file.
-
-6. The 'Realign Residue Range' option has been changed. By default, gap
-opening and extension penalties are now applied to the ends of the alignment
-range in order to penalise terminal gaps. If the REALIGN SEGMENT END GAP
-PENALTIES option is switched off, gaps can be introduced at the ends of the
-residue range at no cost.
-
-7. The MSF output file format has been changed. The sequence weights calculated
-by ClustalX are now included in the header.
-
-8. Two bugs in the FAST/APPROXIMATE pairwise alignments have been fixed. One
-involved the alignment of new sequences to an existing profile using the
-fast pairwise alignment option; the second was caused by changing the default
-options for the fast pairwise alignments.
-
-9. A bug in the postscript output file has been fixed. The residue numbers
-printed at the right hand side of the alignment were not always correct.
-
-10. A bug in the alignment of a small number of sequences has been fixed.
-Previously a Guide Tree was not calculated for less than 4 sequences.
-
-11. A bug which occurred after frequent cut-and-paste operations has been
-fixed.
-
-12. A new file called clustalx.html contains an html'ised version of the
-on-line help. The file can be viewed using a World Wide Web viewer, such as
-Netscape.
-
-
-New Features since ClustalW
----------------------------
-
-1. A subset of sequences in an alignment may be selected and realigned to a
-profile made from the unselected sequences. This may be useful when trying to
-align very divergent sequences which have been badly aligned in the initial
-full multiple alignment.
-
-
-2. A range of the sequence alignment can be selected for realignment. A new
-phylogenetic guide tree is built based only on the residue range selected.
-The selected residues are then aligned, and pasted back into the full sequence
-alignment. This may be useful for aligning small sections of the alignment
-which have been badly aligned in the full sequence alignment, or which have a
-very different guide tree structure from the tree built using the full
-sequences.
-
-
-3. Clustal X provides a versatile coloring scheme for the sequence alignment
-display. The sequences (or profiles) are colored automatically, when they are
-loaded. Sequences can be colored either by assigning a color to specific
-residues, or on the basis of an alignment consensus. In the latter case,
-the alignment consensus is calculated automatically, and the residues in each
-column are colored according to the consensus character assigned to the column.
-In this way, for example, conserved hydrophilic or hydrophobic positions can
-be highlighted.
-
-
-4. An 'Alignment Quality Score' is plotted below the alignment. This is an
-estimate of the conservation of each column in the alignment. Highly conserved
-columns will have a high quality score, less conserved positions will be
-marked by a low score.
-
-
-5. 'Exceptional' residues in the alignment that cause the low quality scores
-described above, can be highlighted. These can be expected to occur at a
-moderate frequency in all the sequences because of their steady divergence
-due to the natural processes of evolution. However, clustering of highlighted
-residues is a strong indication of misalignment.
-Occasionally, highlighted residues may also point to regions of some biological
-significance.
-
-6. Low-scoring segments in the alignment can be highlighted. The segments are
-defined as those regions which score negatively in a forward and backward
-summation of the alignment profile scores. See the online help for more
-details.
-
-7. The new GCG9 MSF,RSF formats are now recognised as input formats for
-clustalx. The alignments cannot be written out in these formats however.
-
-The code has been tested on UNIX (SGI, SUN, DIGITAL) and Macintosh. Compiled
-executables are provided for these systems. If you wish to recompile the
-source files, you will first need to install the NCBI toolkit on your machine.
-Then, to compile the program on UNIX, edit the makefile to point to your NCBI
-include and library files, and type:
-
- make -f makefile.sun
-or make -f makefile.sgi
-or make -f makefile.osf
-
-
-To run the program, type clustalx. A window is displayed with a pull-down menu
-bar which allows all functions to be selected and all alignment parameters
-to be modified, if desired.
-
-
-Documentation for ClustalW (clustalw.doc) is included in the directory. Online
-help is also available for most options of Clustal X by selecting HELP from
-the menu bar.
-
-Help is also available on the WWW at
-
-www-igbmc.u-strasbg.fr/BioInfo/ClustalX/
-www-igbmc.u-strasbg.fr/BioInfo/ClustalW/
-www.U.arizona.edu/~schluter/ClustalW/index.html
-
-
-INSTALLATION (for Unix, PC and MAC)
-------------
-
-UNIX
-----
-
-Executables are provided in the appropriate archives for Digital UNIX 4.0 on
-Alphas, Sun OS 5.6, Silicon Graphics IRIX 6.2 and LINUX (libc6 must be
-installed). If you wish to run on another platform, you will need to recompile
-Clustal X for yourself.
-
-The executable file clustalx should be copied to one of the directories
-specified in your PATH environment variable. The files called *.par and
-clustalx_help should also be copied to the same directory.
-
-Recompiling ClustalX:
-
-First of all, you need the NCBI Vibrant toolkit installed on your machine. If
-this is not already done, you can get the toolkit by anonymous ftp to
-ncbi.nlm.nih.gov.
-You should then copy one of the makefiles supplied in the unix archives to
-'makefile' and edit it, changing the NCBI_INC and NCBI_LIB paths for your
-system.
-
-You make the program with:
-make -f makefile
-
-This produces the executable file clustalx. You can then proceed with the
-installation as described above.
-
-
-MS WINDOWS
-----------
-
-We supply an executable file (clustalx.exe) which will run under MS Windows
-(32 bit). The directory containing the executable (plus the files named *.par,
-and clustalx.hlp) should be added to your path defined in the autoexec.bat
-file.
-
-
-Recompiling ClustalX:
-
-First of all, you need the NCBI Vibrant toolkit installed on your machine. If
-this is not already done, you can get the toolkit by anonymous ftp to
-ncbi.nlm.nih.gov.
-
-A makefile is supplied which can be used as a guide for recompiling the
-ClustalX source code. You will need to edit it for your system. In
-particular the NCBI_INC and NCBI_LIB paths should point to your installation.
-
-
-MAC
----
-
-An executable program called clustalx is supplied for Power Macintoshes.
-For 68K machines, you will need to recompile the code yourself. The
-program may need up to 10m of memory to run depending on the number and
-length of your sequences. The memory allocation can be adjusted with the
-Get Info (%I) command from the Finder if you have problems. Just double click
-the executable file name or icon and off you go (we hope). The files *.par and
-clustalx_help should be stored in the same directory as the clustalx program.
-
-Recompiling ClustalX:
-
-First of all, you need the NCBI Vibrant toolkit installed on your machine. If
-this is not already done, you can get the toolkit by anonymous ftp to
-ncbi.nlm.nih.gov.
-
-We used the Metrowerks CodeWarrior C compiler to compile the ClustalX files,
-but another ANSI C compiler should work. You need to compile all the *.c
-files supplied in the archive, then link them together with the NCBI Toolkit
-libraries 'ncbi' and 'vibrant'.
-
-
- CLUSTAL REFERENCES
- ------------------
-
-Details of algorithms, implementation and useful tips on usage of Clustal
-programs can be found in the following publications:
-
-Jeanmougin,F., Thompson,J.D., Gouy,M., Higgins,D.G. and Gibson,T.J. (1998)
-Multiple sequence alignment with Clustal X. Trends Biochem Sci, 23, 403-5.
-
-Thompson,J.D., Gibson,T.J., Plewniak,F., Jeanmougin,F. and Higgins,D.G. (1997)
-The ClustalX windows interface: flexible strategies for multiple sequence
-alignment aided by quality analysis tools. Nucleic Acids Research, 25:4876-4882.
-
-Higgins, D. G., Thompson, J. D. and Gibson, T. J. (1996) Using CLUSTAL for
-multiple sequence alignments. Methods Enzymol., 266, 383-402.
-
-Thompson, J.D., Higgins, D.G. and Gibson, T.J. (1994) CLUSTAL W: improving the
-sensitivity of progressive multiple sequence alignment through sequence
-weighting, positions-specific gap penalties and weight matrix choice. Nucleic
-Acids Research, 22:4673-4680.
-
-Higgins,D.G., Bleasby,A.J. and Fuchs,R. (1992) CLUSTAL V: improved software for
-multiple sequence alignment. CABIOS 8,189-191.
-
-Higgins,D.G. and Sharp,P.M. (1989) Fast and sensitive multiple sequence
-alignments on a microcomputer. CABIOS 5,151-153.
-
-Higgins,D.G. and Sharp,P.M. (1988) CLUSTAL: a package for performing multiple
-sequence alignment on a microcomputer. Gene 73,237-244.
-
Deleted: trunk/packages/clustalw/trunk/alnscore.c
===================================================================
--- trunk/packages/clustalw/trunk/alnscore.c 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/alnscore.c 2007-08-12 15:08:40 UTC (rev 399)
@@ -1,114 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include "clustalw.h"
-
-#define MAX(a,b) ((a)>(b)?(a):(b))
-#define MIN(a,b) ((a)<(b)?(a):(b))
-
-/*
- * Prototypes
- */
-
-static sint count_gaps(sint s1, sint s2, sint l);
-
-/*
- * Global Variables
- */
-
-extern float gap_open;
-extern sint nseqs;
-extern sint *seqlen_array;
-extern short blosum45mt[];
-extern short def_aa_xref[];
-extern sint debug;
-extern sint max_aa;
-extern char **seq_array;
-
-
-void aln_score(void)
-{
- static short *mat_xref, *matptr;
- static sint maxres;
- static sint s1,s2,c1,c2;
- static sint ngaps;
- static sint i,l1,l2;
- static lint score;
- static sint matrix[NUMRES][NUMRES];
-
-/* calculate an overall score for the alignment by summing the
-scores for each pairwise alignment */
-
- matptr = blosum45mt;
- mat_xref = def_aa_xref;
- maxres = get_matrix(matptr, mat_xref, matrix, TRUE, 100);
- if (maxres == 0)
- {
- fprintf(stdout,"Error: matrix blosum30 not found\n");
- return;
- }
-
- score=0;
- for (s1=1;s1<=nseqs;s1++)
- {
- for (s2=1;s2<s1;s2++)
- {
-
- l1 = seqlen_array[s1];
- l2 = seqlen_array[s2];
- for (i=1;i<l1 && i<l2;i++)
- {
- c1 = seq_array[s1][i];
- c2 = seq_array[s2][i];
- if ((c1>=0) && (c1<=max_aa) && (c2>=0) && (c2<=max_aa))
- score += matrix[c1][c2];
- }
-
- ngaps = count_gaps(s1, s2, l1);
-
- score -= 100 * gap_open * ngaps;
-
- }
- }
-
- score /= 100;
-
- info("Alignment Score %d", (pint)score);
-
-}
-
-static sint count_gaps(sint s1, sint s2, sint l)
-{
- sint i, g;
- sint q, r, *Q, *R;
-
-
- Q = (sint *)ckalloc((l+2) * sizeof(sint));
- R = (sint *)ckalloc((l+2) * sizeof(sint));
-
- Q[0] = R[0] = g = 0;
-
- for (i=1;i<l;i++)
- {
- if (seq_array[s1][i] > max_aa) q = 1;
- else q = 0;
- if (seq_array[s2][i] > max_aa) r = 1;
- else r = 0;
-
- if (((Q[i-1] <= R[i-1]) && (q != 0) && (1-r != 0)) ||
- ((Q[i-1] >= R[i-1]) && (1-q != 0) && (r != 0)))
- g += 1;
- if (q != 0) Q[i] = Q[i-1]+1;
- else Q[i] = 0;
-
- if (r != 0) R[i] = R[i-1]+1;
- else R[i] = 0;
- }
-
- Q=ckfree((void *)Q);
- R=ckfree((void *)R);
-
- return(g);
-}
-
-
Deleted: trunk/packages/clustalw/trunk/amenu.c
===================================================================
--- trunk/packages/clustalw/trunk/amenu.c 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/amenu.c 2007-08-12 15:08:40 UTC (rev 399)
@@ -1,1317 +0,0 @@
-/* Menus and command line interface for Clustal W */
-/* DES was here MARCH. 1994 */
-/* DES was here SEPT. 1994 */
-#include <stdio.h>
-#include <string.h>
-#include <ctype.h>
-#include <stdlib.h>
-#include <stdarg.h>
-#include <signal.h>
-#include <setjmp.h>
-#include "clustalw.h"
-
-static jmp_buf jmpbuf;
-#ifndef VMS
-#ifndef AIX
-#define BADSIG (void (*)())-1
-#endif
-#endif
-
-static void jumper(int);
-
-static void jumper(int i)
-{
- longjmp(jmpbuf,1);
-}
-
-
-/*
-* Prototypes
-*/
-
-
-static void pair_menu(void);
-static void multi_menu(void);
-static void gap_penalties_menu(void);
-static void multiple_align_menu(void); /* multiple alignments menu */
-static void profile_align_menu(void); /* profile " " */
-static void phylogenetic_tree_menu(void); /* NJ trees/distances menu */
-static void format_options_menu(void); /* format of alignment output */
-static void tree_format_options_menu(void); /* format of tree output */
-static void ss_options_menu(void);
-static sint secstroutput_options(void);
-static sint read_matrix(char *title,MatMenu menu, char *matnam, sint matn, short *mat, short *xref);
-
-/*
-* Global variables
-*/
-
-extern float gap_open, gap_extend;
-extern float dna_gap_open, dna_gap_extend;
-extern float prot_gap_open, prot_gap_extend;
-extern float pw_go_penalty, pw_ge_penalty;
-extern float dna_pw_go_penalty, dna_pw_ge_penalty;
-extern float prot_pw_go_penalty, prot_pw_ge_penalty;
-extern float transition_weight;
-extern char revision_level[];
-extern sint wind_gap,ktup,window,signif;
-extern sint dna_wind_gap, dna_ktup, dna_window, dna_signif;
-extern sint prot_wind_gap,prot_ktup,prot_window,prot_signif;
-extern sint nseqs;
-extern sint divergence_cutoff;
-extern sint debug;
-extern Boolean neg_matrix;
-extern Boolean quick_pairalign;
-extern Boolean reset_alignments_new; /* DES */
-extern Boolean reset_alignments_all; /* DES */
-extern sint gap_dist;
-extern Boolean no_var_penalties, no_hyd_penalties, no_pref_penalties;
-extern sint output_order;
-extern sint profile_no;
-extern short usermat[], pw_usermat[];
-extern short aa_xref[], pw_aa_xref[];
-extern short userdnamat[], pw_userdnamat[];
-extern short dna_xref[], pw_dna_xref[];
-
-extern Boolean lowercase; /* Flag for GDE output - set on comm. line*/
-extern Boolean cl_seq_numbers;
-extern Boolean seqRange; /* to append sequence range with seq names, Ranu */
-
-extern Boolean output_clustal, output_nbrf, output_phylip, output_gcg, output_gde, output_nexus;
-extern Boolean output_fasta; /* Ramu */
-
-extern Boolean output_tree_clustal, output_tree_phylip, output_tree_distances,output_tree_nexus;
-extern sint bootstrap_format;
-extern Boolean tossgaps, kimura;
-extern Boolean percent;
-extern Boolean usemenu;
-extern Boolean showaln, save_parameters;
-extern Boolean dnaflag;
-extern Boolean use_ambiguities;
-
-
-extern char hyd_residues[];
-extern char mtrxname[], pw_mtrxname[];
-extern char dnamtrxname[], pw_dnamtrxname[];
-extern char seqname[];
-
-extern sint output_struct_penalties;
-extern Boolean use_ss1, use_ss2;
-
-extern Boolean empty;
-extern Boolean profile1_empty, profile2_empty; /* whether or not profiles */
-
-extern char profile1_name[FILENAMELEN+1];
-extern char profile2_name[FILENAMELEN+1];
-
-extern Boolean use_endgaps;
-extern sint matnum,pw_matnum;
-extern sint dnamatnum,pw_dnamatnum;
-
-extern sint helix_penalty;
-extern sint strand_penalty;
-extern sint loop_penalty;
-extern sint helix_end_minus;
-extern sint helix_end_plus;
-extern sint strand_end_minus;
-extern sint strand_end_plus;
-extern sint helix_end_penalty;
-extern sint strand_end_penalty;
-
-extern MatMenu matrix_menu;
-extern MatMenu pw_matrix_menu;
-extern MatMenu dnamatrix_menu;
-
-static char phylip_name[FILENAMELEN]="";
-static char clustal_name[FILENAMELEN]="";
-static char dist_name[FILENAMELEN]="";
-static char nexus_name[FILENAMELEN]="";
-static char fasta_name[FILENAMELEN]="";
-
-static char p1_tree_name[FILENAMELEN]="";
-static char p2_tree_name[FILENAMELEN]="";
-
-static char *secstroutput_txt[] = {
- "Secondary Structure",
- "Gap Penalty Mask",
- "Structure and Penalty Mask",
- "None" };
-
-
-static char *lin1, *lin2, *lin3;
-
-static int firstres =0; /* range of alignment for saving as ... */
-static int lastres = 0;
-
-void init_amenu(void)
-{
-
- lin1 = (char *)ckalloc( (MAXLINE+1) * sizeof (char) );
- lin2 = (char *)ckalloc( (MAXLINE+1) * sizeof (char) );
- lin3 = (char *)ckalloc( (MAXLINE+1) * sizeof (char) );
-}
-
-void main_menu(void)
-{
- int catchint;
-
- catchint = signal(SIGINT, SIG_IGN) != SIG_IGN;
- if (catchint) {
- if (setjmp(jmpbuf) != 0)
- fprintf(stdout,"\n.. Interrupt\n");
-#ifdef UNIX
- if (signal(SIGINT,jumper) == BADSIG)
- fprintf(stdout,"Error: signal\n");
-#else
- if (signal(SIGINT,SIG_DFL) == BADSIG)
- fprintf(stdout,"Error: signal\n");
-#endif
- }
-
- while(TRUE) {
- fprintf(stdout,"\n\n\n");
- fprintf(stdout," **************************************************************\n");
- fprintf(stdout," ******** CLUSTAL %s Multiple Sequence Alignments ********\n",revision_level);
- fprintf(stdout," **************************************************************\n");
- fprintf(stdout,"\n\n");
-
- fprintf(stdout," 1. Sequence Input From Disc\n");
- fprintf(stdout," 2. Multiple Alignments\n");
- fprintf(stdout," 3. Profile / Structure Alignments\n");
- fprintf(stdout," 4. Phylogenetic trees\n");
- fprintf(stdout,"\n");
- fprintf(stdout," S. Execute a system command\n");
- fprintf(stdout," H. HELP\n");
- fprintf(stdout," X. EXIT (leave program)\n\n\n");
-
- getstr("Your choice",MAXLINE+1,lin1);
-
- switch(toupper(*lin1)) {
- case '1': seq_input(FALSE);
- phylip_name[0]=EOS;
- clustal_name[0]=EOS;
- dist_name[0]=EOS;
- nexus_name[0]=EOS;
- break;
- case '2': multiple_align_menu();
- break;
- case '3': profile_align_menu();
- break;
- case '4': phylogenetic_tree_menu();
- break;
- case 'S': do_system();
- break;
- case '?':
- case 'H': get_help('1');
- break;
- case 'Q':
- case 'X': exit(0);
- break;
- default: fprintf(stdout,"\n\nUnrecognised Command\n\n");
- break;
- }
- }
-}
-
-
-
-
-
-
-
-
-
-static void multiple_align_menu(void)
-{
- int catchint;
-
- catchint = signal(SIGINT, SIG_IGN) != SIG_IGN;
- if (catchint) {
- if (setjmp(jmpbuf) != 0)
- fprintf(stdout,"\n.. Interrupt\n");
-#ifdef UNIX
- if (signal(SIGINT,jumper) == BADSIG)
- fprintf(stdout,"Error: signal\n");
-#else
- if (signal(SIGINT,SIG_DFL) == BADSIG)
- fprintf(stdout,"Error: signal\n");
-#endif
- }
-
-
- while(TRUE)
- {
- fprintf(stdout,"\n\n\n");
- fprintf(stdout,"****** MULTIPLE ALIGNMENT MENU ******\n");
- fprintf(stdout,"\n\n");
-
-
- fprintf(stdout," 1. Do complete multiple alignment now (%s)\n",
- (!quick_pairalign) ? "Slow/Accurate" : "Fast/Approximate");
- fprintf(stdout," 2. Produce guide tree file only\n");
- fprintf(stdout," 3. Do alignment using old guide tree file\n\n");
- fprintf(stdout," 4. Toggle Slow/Fast pairwise alignments = %s\n\n",
- (!quick_pairalign) ? "SLOW" : "FAST");
- fprintf(stdout," 5. Pairwise alignment parameters\n");
- fprintf(stdout," 6. Multiple alignment parameters\n\n");
- fprintf(stdout," 7. Reset gaps before alignment?");
- if(reset_alignments_new)
- fprintf(stdout," = ON\n");
- else
- fprintf(stdout," = OFF\n");
- fprintf(stdout," 8. Toggle screen display = %s\n",
- (!showaln) ? "OFF" : "ON");
- fprintf(stdout," 9. Output format options\n");
- fprintf(stdout,"\n");
-
- fprintf(stdout," S. Execute a system command\n");
- fprintf(stdout," H. HELP\n");
- fprintf(stdout," or press [RETURN] to go back to main menu\n\n\n");
-
- getstr("Your choice",MAXLINE+1,lin1);
- if(*lin1 == EOS) return;
-
- switch(toupper(*lin1))
- {
- case '1': align(phylip_name);
- break;
- case '2': make_tree(phylip_name);
- break;
- case '3': get_tree(phylip_name);
- break;
- case '4': quick_pairalign ^= TRUE;
- break;
- case '5': pair_menu();
- break;
- case '6': multi_menu();
- break;
- case '7': reset_alignments_new ^= TRUE;
- if(reset_alignments_new==TRUE)
- reset_alignments_all=FALSE;
- break;
- case '8': showaln ^= TRUE;
- break;
- case '9': format_options_menu();
- break;
- case 'S': do_system();
- break;
- case '?':
- case 'H': get_help('2');
- break;
- case 'Q':
- case 'X': return;
-
- default: fprintf(stdout,"\n\nUnrecognised Command\n\n");
- break;
- }
- }
-}
-
-
-
-
-
-
-
-
-
-static void profile_align_menu(void)
-{
- int catchint;
-
- catchint = signal(SIGINT, SIG_IGN) != SIG_IGN;
- if (catchint) {
- if (setjmp(jmpbuf) != 0)
- fprintf(stdout,"\n.. Interrupt\n");
-#ifdef UNIX
- if (signal(SIGINT,jumper) == BADSIG)
- fprintf(stdout,"Error: signal\n");
-#else
- if (signal(SIGINT,SIG_DFL) == BADSIG)
- fprintf(stdout,"Error: signal\n");
-#endif
- }
-
-
- while(TRUE)
- {
- fprintf(stdout,"\n\n\n");
- fprintf(stdout,"****** PROFILE AND STRUCTURE ALIGNMENT MENU ******\n");
- fprintf(stdout,"\n\n");
-
- fprintf(stdout," 1. Input 1st. profile ");
- if (!profile1_empty) fprintf(stdout,"(loaded)");
- fprintf(stdout,"\n");
- fprintf(stdout," 2. Input 2nd. profile/sequences ");
- if (!profile2_empty) fprintf(stdout,"(loaded)");
- fprintf(stdout,"\n\n");
- fprintf(stdout," 3. Align 2nd. profile to 1st. profile\n");
- fprintf(stdout," 4. Align sequences to 1st. profile (%s)\n\n",
- (!quick_pairalign) ? "Slow/Accurate" : "Fast/Approximate");
- fprintf(stdout," 5. Toggle Slow/Fast pairwise alignments = %s\n\n",
- (!quick_pairalign) ? "SLOW" : "FAST");
- fprintf(stdout," 6. Pairwise alignment parameters\n");
- fprintf(stdout," 7. Multiple alignment parameters\n\n");
- fprintf(stdout," 8. Toggle screen display = %s\n",
- (!showaln) ? "OFF" : "ON");
- fprintf(stdout," 9. Output format options\n");
- fprintf(stdout," 0. Secondary structure options\n");
- fprintf(stdout,"\n");
- fprintf(stdout," S. Execute a system command\n");
- fprintf(stdout," H. HELP\n");
- fprintf(stdout," or press [RETURN] to go back to main menu\n\n\n");
-
- getstr("Your choice",MAXLINE+1,lin1);
- if(*lin1 == EOS) return;
-
- switch(toupper(*lin1))
- {
- case '1': profile_no = 1; /* 1 => 1st profile */
- profile_input();
- strcpy(profile1_name, seqname);
- break;
- case '2': profile_no = 2; /* 2 => 2nd profile */
- profile_input();
- strcpy(profile2_name, seqname);
- break;
- case '3': profile_align(p1_tree_name,p2_tree_name); /* align the 2 alignments now */
- break;
- case '4': new_sequence_align(phylip_name); /* align new sequences to profile 1 */
- break;
- case '5': quick_pairalign ^= TRUE;
- break;
- case '6': pair_menu();
- break;
- case '7': multi_menu();
- break;
- case '8': showaln ^= TRUE;
- break;
- case '9': format_options_menu();
- break;
- case '0': ss_options_menu();
- break;
- case 'S': do_system();
- break;
- case '?':
- case 'H': get_help('6');
- break;
- case 'Q':
- case 'X': return;
-
- default: fprintf(stdout,"\n\nUnrecognised Command\n\n");
- break;
- }
- }
-}
-
-
-static void ss_options_menu(void)
-{
- int catchint;
-
- catchint = signal(SIGINT, SIG_IGN) != SIG_IGN;
- if (catchint) {
- if (setjmp(jmpbuf) != 0)
- fprintf(stdout,"\n.. Interrupt\n");
-#ifdef UNIX
- if (signal(SIGINT,jumper) == BADSIG)
- fprintf(stdout,"Error: signal\n");
-#else
- if (signal(SIGINT,SIG_DFL) == BADSIG)
- fprintf(stdout,"Error: signal\n");
-#endif
- }
-
-
- while(TRUE) {
-
- fprintf(stdout,"\n\n\n");
- fprintf(stdout," ********* SECONDARY STRUCTURE OPTIONS *********\n");
- fprintf(stdout,"\n\n");
-
- fprintf(stdout," 1. Use profile 1 secondary structure / penalty mask ");
- if(use_ss1)
- fprintf(stdout,"= YES\n");
- else
- fprintf(stdout,"= NO\n");
- fprintf(stdout," 2. Use profile 2 secondary structure / penalty mask ");
- if(use_ss2)
- fprintf(stdout,"= YES\n");
- else
- fprintf(stdout,"= NO\n");
- fprintf(stdout,"\n");
- fprintf(stdout," 3. Output in alignment ");
- fprintf(stdout,"= %s\n",secstroutput_txt[output_struct_penalties]);
- fprintf(stdout,"\n");
-
- fprintf(stdout," 4. Helix gap penalty :%d\n",(pint)helix_penalty);
- fprintf(stdout," 5. Strand gap penalty :%d\n",(pint)strand_penalty);
- fprintf(stdout," 6. Loop gap penalty :%d\n",(pint)loop_penalty);
-
- fprintf(stdout," 7. Secondary structure terminal penalty :%d\n",(pint)helix_end_penalty);
- fprintf(stdout," 8. Helix terminal positions within :%d outside :%d\n",
- (pint)helix_end_minus,(pint)helix_end_plus);
- fprintf(stdout," 9. Strand terminal positions within :%d outside :%d\n",
- (pint)strand_end_minus,(pint)strand_end_plus);
-
- fprintf(stdout,"\n\n");
- fprintf(stdout," H. HELP\n\n\n");
-
- getstr("Enter number (or [RETURN] to exit)",MAXLINE+1,lin2);
- if( *lin2 == EOS) {
- return;
- }
-
- switch(toupper(*lin2)) {
- case '1': use_ss1 ^= TRUE;
- break;
- case '2': use_ss2 ^= TRUE;
- break;
- case '3': output_struct_penalties = secstroutput_options();
- break;
- case '4':
- fprintf(stdout,"Helix Penalty Currently: %d\n",(pint)helix_penalty);
- helix_penalty=getint("Enter number",1,9,helix_penalty);
- break;
- case '5':
- fprintf(stdout,"Strand Gap Penalty Currently: %d\n",(pint)strand_penalty);
- strand_penalty=getint("Enter number",1,9,strand_penalty);
- break;
- case '6':
- fprintf(stdout,"Loop Gap Penalty Currently: %d\n",(pint)loop_penalty);
- loop_penalty=getint("Enter number",1,9,loop_penalty);
- break;
- case '7':
- fprintf(stdout,"Secondary Structure Terminal Penalty Currently: %d\n",
- (pint)helix_end_penalty);
- helix_end_penalty=getint("Enter number",1,9,helix_end_penalty);
- strand_end_penalty = helix_end_penalty;
- break;
- case '8':
- fprintf(stdout,"Helix Terminal Positions Currently: \n");
- fprintf(stdout," within helix: %d outside helix: %d\n",
- (pint)helix_end_minus,(pint)helix_end_plus);
- helix_end_minus=getint("Enter number of residues within helix",0,3,helix_end_minus);
- helix_end_plus=getint("Enter number of residues outside helix",0,3,helix_end_plus);
- break;
- case '9':
- fprintf(stdout,"Strand Terminal Positions Currently: \n");
- fprintf(stdout," within strand: %d outside strand: %d\n",
- (pint)strand_end_minus,(pint)strand_end_plus);
- strand_end_minus=getint("Enter number of residues within strand",0,3,strand_end_minus);
- strand_end_plus=getint("Enter number of residues outside strand",0,3,strand_end_plus);
- break;
- case '?':
- case 'H':
- get_help('B');
- break;
- default:
- fprintf(stdout,"\n\nUnrecognised Command\n\n");
- break;
- }
- }
-}
-
-
-static sint secstroutput_options(void)
-{
-
- while(TRUE)
- {
- fprintf(stdout,"\n\n\n");
- fprintf(stdout," ********* Secondary Structure Output Menu *********\n");
- fprintf(stdout,"\n\n");
-
-
- fprintf(stdout," 1. %s\n",secstroutput_txt[0]);
- fprintf(stdout," 2. %s\n",secstroutput_txt[1]);
- fprintf(stdout," 3. %s\n",secstroutput_txt[2]);
- fprintf(stdout," 4. %s\n",secstroutput_txt[3]);
- fprintf(stdout," H. HELP\n\n");
- fprintf(stdout,
-" -- Current output is %s ",secstroutput_txt[output_struct_penalties]);
- fprintf(stdout,"--\n");
-
-
- getstr("\n\nEnter number (or [RETURN] to exit)",MAXLINE+1,lin2);
- if(*lin2 == EOS) return(output_struct_penalties);
-
- switch(toupper(*lin2))
- {
- case '1': return(0);
- case '2': return(1);
- case '3': return(2);
- case '4': return(3);
- case '?':
- case 'H': get_help('C');
- case 'Q':
- case 'X': return(0);
-
- default: fprintf(stdout,"\n\nUnrecognised Command\n\n");
- break;
- }
- }
-}
-
-
-
-
-
-
-
-static void phylogenetic_tree_menu(void)
-{
- int catchint;
-
- catchint = signal(SIGINT, SIG_IGN) != SIG_IGN;
- if (catchint) {
- if (setjmp(jmpbuf) != 0)
- fprintf(stdout,"\n.. Interrupt\n");
-#ifdef UNIX
- if (signal(SIGINT,jumper) == BADSIG)
- fprintf(stdout,"Error: signal\n");
-#else
- if (signal(SIGINT,SIG_DFL) == BADSIG)
- fprintf(stdout,"Error: signal\n");
-#endif
- }
-
-
- while(TRUE)
- {
- fprintf(stdout,"\n\n\n");
- fprintf(stdout,"****** PHYLOGENETIC TREE MENU ******\n");
- fprintf(stdout,"\n\n");
-
- fprintf(stdout," 1. Input an alignment\n");
- fprintf(stdout," 2. Exclude positions with gaps? ");
- if(tossgaps)
- fprintf(stdout,"= ON\n");
- else
- fprintf(stdout,"= OFF\n");
- fprintf(stdout," 3. Correct for multiple substitutions? ");
- if(kimura)
- fprintf(stdout,"= ON\n");
- else
- fprintf(stdout,"= OFF\n");
- fprintf(stdout," 4. Draw tree now\n");
- fprintf(stdout," 5. Bootstrap tree\n");
- fprintf(stdout," 6. Output format options\n");
- fprintf(stdout,"\n");
- fprintf(stdout," S. Execute a system command\n");
- fprintf(stdout," H. HELP\n");
- fprintf(stdout," or press [RETURN] to go back to main menu\n\n\n");
-
- getstr("Your choice",MAXLINE+1,lin1);
- if(*lin1 == EOS) return;
-
- switch(toupper(*lin1))
- {
- case '1': seq_input(FALSE);
- phylip_name[0]=EOS;
- clustal_name[0]=EOS;
- dist_name[0]=EOS;
- nexus_name[0]=EOS;
- break;
- case '2': tossgaps ^= TRUE;
- break;
- case '3': kimura ^= TRUE;;
- break;
- case '4': phylogenetic_tree(phylip_name,clustal_name,dist_name,nexus_name,"amenu.pim");
- break;
- case '5': bootstrap_tree(phylip_name,clustal_name,nexus_name);
- break;
- case '6': tree_format_options_menu();
- break;
- case 'S': do_system();
- break;
- case '?':
- case 'H': get_help('7');
- break;
- case 'Q':
- case 'X': return;
-
- default: fprintf(stdout,"\n\nUnrecognised Command\n\n");
- break;
- }
- }
-}
-
-
-
-
-
-
-static void tree_format_options_menu(void) /* format of tree output */
-{
- int catchint;
-
- catchint = signal(SIGINT, SIG_IGN) != SIG_IGN;
- if (catchint) {
- if (setjmp(jmpbuf) != 0)
- fprintf(stdout,"\n.. Interrupt\n");
-#ifdef UNIX
- if (signal(SIGINT,jumper) == BADSIG)
- fprintf(stdout,"Error: signal\n");
-#else
- if (signal(SIGINT,SIG_DFL) == BADSIG)
- fprintf(stdout,"Error: signal\n");
-#endif
- }
-
-
- while(TRUE) {
- fprintf(stdout,"\n\n\n");
- fprintf(stdout," ****** Format of Phylogenetic Tree Output ******\n");
- fprintf(stdout,"\n\n");
- fprintf(stdout," 1. Toggle CLUSTAL format tree output = %s\n",
- (!output_tree_clustal) ? "OFF" : "ON");
- fprintf(stdout," 2. Toggle Phylip format tree output = %s\n",
- (!output_tree_phylip) ? "OFF" : "ON");
- fprintf(stdout," 3. Toggle Phylip distance matrix output = %s\n",
- (!output_tree_distances)? "OFF" : "ON");
- fprintf(stdout," 4. Toggle Nexus format tree output = %s\n\n",
- (!output_tree_nexus)? "OFF" : "ON");
- fprintf(stdout," 5. Toggle Phylip bootstrap positions = %s\n\n",
-(bootstrap_format==BS_NODE_LABELS) ? "NODE LABELS" : "BRANCH LABELS");
- fprintf(stdout,"\n");
- fprintf(stdout," H. HELP\n\n\n");
-
- getstr("Enter number (or [RETURN] to exit)",MAXLINE+1,lin2);
- if(*lin2 == EOS) return;
-
- switch(toupper(*lin2)) {
- case '1':
- output_tree_clustal ^= TRUE;
- break;
- case '2':
- output_tree_phylip ^= TRUE;
- break;
- case '3':
- output_tree_distances ^= TRUE;
- break;
- case '4':
- output_tree_nexus ^= TRUE;
- break;
- case '5':
- if (bootstrap_format == BS_NODE_LABELS)
- bootstrap_format = BS_BRANCH_LABELS;
- else
- bootstrap_format = BS_NODE_LABELS;
- break;
- case '?':
- case 'H':
- get_help('0');
- break;
- default:
- fprintf(stdout,"\n\nUnrecognised Command\n\n");
- break;
- }
- }
-}
-
-
-static void format_options_menu(void) /* format of alignment output */
-{
- sint i;
- sint length = 0;
- char path[FILENAMELEN+1];
- int catchint;
-
- catchint = signal(SIGINT, SIG_IGN) != SIG_IGN;
- if (catchint) {
- if (setjmp(jmpbuf) != 0)
- fprintf(stdout,"\n.. Interrupt\n");
-#ifdef UNIX
- if (signal(SIGINT,jumper) == BADSIG)
- fprintf(stdout,"Error: signal\n");
-#else
- if (signal(SIGINT,SIG_DFL) == BADSIG)
- fprintf(stdout,"Error: signal\n");
-#endif
- }
-
-
- while(TRUE) {
- fprintf(stdout,"\n\n\n");
- fprintf(stdout," ********* Format of Alignment Output *********\n");
- fprintf(stdout,"\n\n");
- fprintf(stdout," F. Toggle FASTA format output = %s\n\n",
- (!output_fasta) ? "OFF" : "ON");
- fprintf(stdout," 1. Toggle CLUSTAL format output = %s\n",
- (!output_clustal) ? "OFF" : "ON");
- fprintf(stdout," 2. Toggle NBRF/PIR format output = %s\n",
- (!output_nbrf) ? "OFF" : "ON");
- fprintf(stdout," 3. Toggle GCG/MSF format output = %s\n",
- (!output_gcg) ? "OFF" : "ON");
- fprintf(stdout," 4. Toggle PHYLIP format output = %s\n",
- (!output_phylip) ? "OFF" : "ON");
- fprintf(stdout," 5. Toggle NEXUS format output = %s\n",
- (!output_nexus) ? "OFF" : "ON");
- fprintf(stdout," 6. Toggle GDE format output = %s\n\n",
- (!output_gde) ? "OFF" : "ON");
- fprintf(stdout," 7. Toggle GDE output case = %s\n",
- (!lowercase) ? "UPPER" : "LOWER");
-
- fprintf(stdout," 8. Toggle CLUSTALW sequence numbers = %s\n",
- (!cl_seq_numbers) ? "OFF" : "ON");
- fprintf(stdout," 9. Toggle output order = %s\n\n",
- (output_order==0) ? "INPUT FILE" : "ALIGNED");
-
- fprintf(stdout," 0. Create alignment output file(s) now?\n\n");
- fprintf(stdout," T. Toggle parameter output = %s\n",
- (!save_parameters) ? "OFF" : "ON");
- fprintf(stdout," R. Toggle sequence range numbers = %s\n",
- (!seqRange) ? "OFF" : "ON");
- fprintf(stdout,"\n");
- fprintf(stdout," H. HELP\n\n\n");
-
- getstr("Enter number (or [RETURN] to exit)",MAXLINE+1,lin2);
- if(*lin2 == EOS) return;
-
- switch(toupper(*lin2)) {
- case '1':
- output_clustal ^= TRUE;
- break;
- case '2':
- output_nbrf ^= TRUE;
- break;
- case '3':
- output_gcg ^= TRUE;
- break;
- case '4':
- output_phylip ^= TRUE;
- break;
- case '5':
- output_nexus ^= TRUE;
- break;
- case '6':
- output_gde ^= TRUE;
- break;
- case '7':
- lowercase ^= TRUE;
- break;
- case '8':
- cl_seq_numbers ^= TRUE;
- break;
- case '9':
- if (output_order == INPUT) output_order = ALIGNED;
- else output_order = INPUT;
- break;
- case 'F':
- output_fasta ^= TRUE;
- break;
- case 'R':
- seqRange ^= TRUE;
- break;
-
- case '0': /* DES */
- if(empty) {
- error("No sequences loaded");
- break;
- }
- get_path(seqname,path);
- if(!open_alignment_output(path)) break;
- create_alignment_output(1,nseqs);
- break;
- case 'T': save_parameters ^= TRUE;
- break;
- case '?':
- case 'H':
- get_help('5');
- break;
- default:
- fprintf(stdout,"\n\nUnrecognised Command\n\n");
- break;
- }
- }
-}
-
-
-
-
-
-
-
-
-
-
-
-
-static void pair_menu(void)
-{
- int catchint;
-
- catchint = signal(SIGINT, SIG_IGN) != SIG_IGN;
- if (catchint) {
- if (setjmp(jmpbuf) != 0)
- fprintf(stdout,"\n.. Interrupt\n");
-#ifdef UNIX
- if (signal(SIGINT,jumper) == BADSIG)
- fprintf(stdout,"Error: signal\n");
-#else
- if (signal(SIGINT,SIG_DFL) == BADSIG)
- fprintf(stdout,"Error: signal\n");
-#endif
- }
-
-
- if(dnaflag) {
- pw_go_penalty = dna_pw_go_penalty;
- pw_ge_penalty = dna_pw_ge_penalty;
- ktup = dna_ktup;
- window = dna_window;
- signif = dna_signif;
- wind_gap = dna_wind_gap;
-
- }
- else {
- pw_go_penalty = prot_pw_go_penalty;
- pw_ge_penalty = prot_pw_ge_penalty;
- ktup = prot_ktup;
- window = prot_window;
- signif = prot_signif;
- wind_gap = prot_wind_gap;
-
- }
-
- while(TRUE) {
-
- fprintf(stdout,"\n\n\n");
- fprintf(stdout," ********* PAIRWISE ALIGNMENT PARAMETERS *********\n");
- fprintf(stdout,"\n\n");
-
- fprintf(stdout," Slow/Accurate alignments:\n\n");
-
- fprintf(stdout," 1. Gap Open Penalty :%4.2f\n",pw_go_penalty);
- fprintf(stdout," 2. Gap Extension Penalty :%4.2f\n",pw_ge_penalty);
- fprintf(stdout," 3. Protein weight matrix :%s\n" ,
- matrix_menu.opt[pw_matnum-1].title);
- fprintf(stdout," 4. DNA weight matrix :%s\n" ,
- dnamatrix_menu.opt[pw_dnamatnum-1].title);
- fprintf(stdout,"\n");
-
- fprintf(stdout," Fast/Approximate alignments:\n\n");
-
- fprintf(stdout," 5. Gap penalty :%d\n",(pint)wind_gap);
- fprintf(stdout," 6. K-tuple (word) size :%d\n",(pint)ktup);
- fprintf(stdout," 7. No. of top diagonals :%d\n",(pint)signif);
- fprintf(stdout," 8. Window size :%d\n\n",(pint)window);
-
- fprintf(stdout," 9. Toggle Slow/Fast pairwise alignments ");
- if(quick_pairalign)
- fprintf(stdout,"= FAST\n\n");
- else
- fprintf(stdout,"= SLOW\n\n");
-
-
- fprintf(stdout," H. HELP\n\n\n");
-
- getstr("Enter number (or [RETURN] to exit)",MAXLINE+1,lin2);
- if( *lin2 == EOS) {
- if(dnaflag) {
- dna_pw_go_penalty = pw_go_penalty;
- dna_pw_ge_penalty = pw_ge_penalty;
- dna_ktup = ktup;
- dna_window = window;
- dna_signif = signif;
- dna_wind_gap = wind_gap;
-
- }
- else {
- prot_pw_go_penalty = pw_go_penalty;
- prot_pw_ge_penalty = pw_ge_penalty;
- prot_ktup = ktup;
- prot_window = window;
- prot_signif = signif;
- prot_wind_gap = wind_gap;
-
- }
-
- return;
- }
-
- switch(toupper(*lin2)) {
- case '1':
- fprintf(stdout,"Gap Open Penalty Currently: %4.2f\n",pw_go_penalty);
- pw_go_penalty=(float)getreal("Enter number",(double)0.0,(double)100.0,(double)pw_go_penalty);
- break;
- case '2':
- fprintf(stdout,"Gap Extension Penalty Currently: %4.2f\n",pw_ge_penalty);
- pw_ge_penalty=(float)getreal("Enter number",(double)0.0,(double)10.0,(double)pw_ge_penalty);
- break;
- case '3':
- pw_matnum = read_matrix("PROTEIN",pw_matrix_menu,pw_mtrxname,pw_matnum,pw_usermat,pw_aa_xref);
- break;
- case '4':
- pw_dnamatnum = read_matrix("DNA",dnamatrix_menu,pw_dnamtrxname,pw_dnamatnum,pw_userdnamat,pw_dna_xref);
- break;
- case '5':
- fprintf(stdout,"Gap Penalty Currently: %d\n",(pint)wind_gap);
- wind_gap=getint("Enter number",1,500,wind_gap);
- break;
- case '6':
- fprintf(stdout,"K-tuple Currently: %d\n",(pint)ktup);
- if(dnaflag)
- ktup=getint("Enter number",1,4,ktup);
- else
- ktup=getint("Enter number",1,2,ktup);
- break;
- case '7':
- fprintf(stdout,"Top diagonals Currently: %d\n",(pint)signif);
- signif=getint("Enter number",1,50,signif);
- break;
- case '8':
- fprintf(stdout,"Window size Currently: %d\n",(pint)window);
- window=getint("Enter number",1,50,window);
- break;
- case '9': quick_pairalign ^= TRUE;
- break;
- case '?':
- case 'H':
- get_help('3');
- break;
- default:
- fprintf(stdout,"\n\nUnrecognised Command\n\n");
- break;
- }
- }
-}
-
-
-
-
-
-static void multi_menu(void)
-{
- int catchint;
-
- catchint = signal(SIGINT, SIG_IGN) != SIG_IGN;
- if (catchint) {
- if (setjmp(jmpbuf) != 0)
- fprintf(stdout,"\n.. Interrupt\n");
-#ifdef UNIX
- if (signal(SIGINT,jumper) == BADSIG)
- fprintf(stdout,"Error: signal\n");
-#else
- if (signal(SIGINT,SIG_DFL) == BADSIG)
- fprintf(stdout,"Error: signal\n");
-#endif
- }
-
-
- if(dnaflag) {
- gap_open = dna_gap_open;
- gap_extend = dna_gap_extend;
- }
- else {
- gap_open = prot_gap_open;
- gap_extend = prot_gap_extend;
- }
-
- while(TRUE) {
-
- fprintf(stdout,"\n\n\n");
- fprintf(stdout," ********* MULTIPLE ALIGNMENT PARAMETERS *********\n");
- fprintf(stdout,"\n\n");
-
- fprintf(stdout," 1. Gap Opening Penalty :%4.2f\n",gap_open);
- fprintf(stdout," 2. Gap Extension Penalty :%4.2f\n",gap_extend);
-
- fprintf(stdout," 3. Delay divergent sequences :%d %%\n\n",(pint)divergence_cutoff);
-
- fprintf(stdout," 4. DNA Transitions Weight :%1.2f\n\n",transition_weight);
- fprintf(stdout," 5. Protein weight matrix :%s\n"
- ,matrix_menu.opt[matnum-1].title);
- fprintf(stdout," 6. DNA weight matrix :%s\n"
- ,dnamatrix_menu.opt[dnamatnum-1].title);
- fprintf(stdout," 7. Use negative matrix :%s\n\n",(!neg_matrix) ? "OFF" : "ON");
- fprintf(stdout," 8. Protein Gap Parameters\n\n");
- fprintf(stdout," H. HELP\n\n\n");
-
- getstr("Enter number (or [RETURN] to exit)",MAXLINE+1,lin2);
-
- if(*lin2 == EOS) {
- if(dnaflag) {
- dna_gap_open = gap_open;
- dna_gap_extend = gap_extend;
- }
- else {
- prot_gap_open = gap_open;
- prot_gap_extend = gap_extend;
- }
- return;
- }
-
- switch(toupper(*lin2)) {
- case '1':
- fprintf(stdout,"Gap Opening Penalty Currently: %4.2f\n",gap_open);
- gap_open=(float)getreal("Enter number",(double)0.0,(double)100.0,(double)gap_open);
- break;
- case '2':
- fprintf(stdout,"Gap Extension Penalty Currently: %4.2f\n",gap_extend);
- gap_extend=(float)getreal("Enter number",(double)0.0,(double)10.0,(double)gap_extend);
- break;
- case '3':
- fprintf(stdout,"Min Identity Currently: %d\n",(pint)divergence_cutoff);
- divergence_cutoff=getint("Enter number",0,100,divergence_cutoff);
- break;
- case '4':
- fprintf(stdout,"Transition Weight Currently: %1.2f\n",(pint)transition_weight);
- transition_weight=(float)getreal("Enter number",(double)0.0,(double)1.0,(double)transition_weight);
- break;
- case '5':
- matnum = read_matrix("PROTEIN",matrix_menu,mtrxname,matnum,usermat,aa_xref);
- break;
- case '6':
- dnamatnum = read_matrix("DNA",dnamatrix_menu,dnamtrxname,dnamatnum,userdnamat,dna_xref);
- break;
- case '7':
- neg_matrix ^= TRUE;
- break;
- case '8':
- gap_penalties_menu();
- break;
- case '?':
- case 'H':
- get_help('4');
- break;
- default:
- fprintf(stdout,"\n\nUnrecognised Command\n\n");
- break;
- }
- }
-}
-
-
-
-
-
-
-static void gap_penalties_menu(void)
-{
- char c;
- sint i;
- int catchint;
-
- catchint = signal(SIGINT, SIG_IGN) != SIG_IGN;
- if (catchint) {
- if (setjmp(jmpbuf) != 0)
- fprintf(stdout,"\n.. Interrupt\n");
-#ifdef UNIX
- if (signal(SIGINT,jumper) == BADSIG)
- fprintf(stdout,"Error: signal\n");
-#else
- if (signal(SIGINT,SIG_DFL) == BADSIG)
- fprintf(stdout,"Error: signal\n");
-#endif
- }
-
-
- while(TRUE) {
-
- fprintf(stdout,"\n\n\n");
- fprintf(stdout," ********* PROTEIN GAP PARAMETERS *********\n");
- fprintf(stdout,"\n\n\n");
-
- fprintf(stdout," 1. Toggle Residue-Specific Penalties :%s\n\n",(no_pref_penalties) ? "OFF" : "ON");
- fprintf(stdout," 2. Toggle Hydrophilic Penalties :%s\n",(no_hyd_penalties) ? "OFF" : "ON");
- fprintf(stdout," 3. Hydrophilic Residues :%s\n\n"
- ,hyd_residues);
- fprintf(stdout," 4. Gap Separation Distance :%d\n",(pint)gap_dist);
- fprintf(stdout," 5. Toggle End Gap Separation :%s\n\n",(!use_endgaps) ? "OFF" : "ON");
- fprintf(stdout," H. HELP\n\n\n");
-
- getstr("Enter number (or [RETURN] to exit)",MAXLINE+1,lin2);
-
- if(*lin2 == EOS) return;
-
- switch(toupper(*lin2)) {
- case '1':
- no_pref_penalties ^= TRUE;
- break;
- case '2':
- no_hyd_penalties ^= TRUE;
- break;
- case '3':
- fprintf(stdout,"Hydrophilic Residues Currently: %s\n",hyd_residues);
-
- getstr("Enter residues (or [RETURN] to quit)",MAXLINE+1,lin1);
- if (*lin1 != EOS) {
- for (i=0;i<strlen(hyd_residues) && i<26;i++) {
- c = lin1[i];
- if (isalpha(c))
- hyd_residues[i] = (char)toupper(c);
- else
- break;
- }
- hyd_residues[i] = EOS;
- }
- break;
- case '4':
- fprintf(stdout,"Gap Separation Distance Currently: %d\n",(pint)gap_dist);
- gap_dist=getint("Enter number",0,100,gap_dist);
- break;
- case '5':
- use_endgaps ^= TRUE;
- break;
- case '?':
- case 'H':
- get_help('A');
- break;
- default:
- fprintf(stdout,"\n\nUnrecognised Command\n\n");
- break;
- }
- }
-}
-
-
-
-
-static sint read_matrix(char *title,MatMenu menu, char *matnam, sint matn, short *mat, short *xref)
-{ static char userfile[FILENAMELEN+1];
- int i;
-
- while(TRUE)
- {
- fprintf(stdout,"\n\n\n");
- fprintf(stdout," ********* %s WEIGHT MATRIX MENU *********\n",title);
- fprintf(stdout,"\n\n");
-
- for(i=0;i<menu.noptions;i++)
- fprintf(stdout," %d. %s\n",i+1,menu.opt[i].title);
- fprintf(stdout," H. HELP\n\n");
- fprintf(stdout,
-" -- Current matrix is the %s ",menu.opt[matn-1].title);
- if(matn == menu.noptions) fprintf(stdout,"(file = %s)",userfile);
- fprintf(stdout,"--\n");
-
-
- getstr("\n\nEnter number (or [RETURN] to exit)",MAXLINE+1,lin2);
- if(*lin2 == EOS) return(matn);
-
- i=toupper(*lin2)-'0';
- if(i>0 && i<menu.noptions) {
- strcpy(matnam,menu.opt[i-1].string);
- matn=i;
- } else if (i==menu.noptions) {
- if(user_mat(userfile, mat, xref)) {
- strcpy(matnam,userfile);
- matn=i;
- }
- }
- else
- switch(toupper(*lin2)) {
- case '?':
- case 'H':
- get_help('8');
- break;
- default:
- fprintf(stdout,"\n\nUnrecognised Command\n\n");
- break;
- }
- }
-}
-
-
-char prompt_for_yes_no(char *title,char *prompt)
-{
- char line[80];
- char lin2[80];
-
- fprintf(stdout,"\n%s\n",title);
- strcpy(line,prompt);
- strcat(line, "(y/n) ? [y]");
- getstr(line,MAXLINE+1,lin2);
- if ((*lin2 != 'n') && (*lin2 != 'N'))
- return('y');
- else
- return('n');
-
-}
-
-
-/*
-* fatal()
-*
-* Prints error msg to stdout and exits.
-* Variadic parameter list can be passed.
-*
-* Return values:
-* none
-*/
-
-void fatal( char *msg,...)
-{
- va_list ap;
-
- va_start(ap,msg);
- fprintf(stdout,"\n\nFATAL ERROR: ");
- vfprintf(stdout,msg,ap);
- fprintf(stdout,"\n\n");
- va_end(ap);
- exit(1);
-}
-
-/*
-* error()
-*
-* Prints error msg to stdout.
-* Variadic parameter list can be passed.
-*
-* Return values:
-* none
-*/
-
-void error( char *msg,...)
-{
- va_list ap;
-
- va_start(ap,msg);
- fprintf(stdout,"\n\nERROR: ");
- vfprintf(stdout,msg,ap);
- fprintf(stdout,"\n\n");
- va_end(ap);
-}
-
-/*
-* warning()
-*
-* Prints warning msg to stdout.
-* Variadic parameter list can be passed.
-*
-* Return values:
-* none
-*/
-
-void warning( char *msg,...)
-{
- va_list ap;
-
- va_start(ap,msg);
- fprintf(stdout,"\n\nWARNING: ");
- vfprintf(stdout,msg,ap);
- fprintf(stdout,"\n\n");
- va_end(ap);
-}
-
-/*
-* info()
-*
-* Prints info msg to stdout.
-* Variadic parameter list can be passed.
-*
-* Return values:
-* none
-*/
-
-void info( char *msg,...)
-{
- va_list ap;
-
- va_start(ap,msg);
- fprintf(stdout,"\n");
- vfprintf(stdout,msg,ap);
- va_end(ap);
-}
Deleted: trunk/packages/clustalw/trunk/calcgapcoeff.c
===================================================================
--- trunk/packages/clustalw/trunk/calcgapcoeff.c 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/calcgapcoeff.c 2007-08-12 15:08:40 UTC (rev 399)
@@ -1,497 +0,0 @@
-#include <stdio.h>
-#include <ctype.h>
-#include <stdlib.h>
-#include <string.h>
-#include "clustalw.h"
-
-
-/*
- * Prototypes
- */
-void calc_p_penalties(char **aln, sint n, sint fs, sint ls, sint *weight);
-void calc_h_penalties(char **aln, sint n, sint fs, sint ls, sint *weight);
-void calc_v_penalties(char **aln, sint n, sint fs, sint ls, sint *weight);
-sint local_penalty(sint penalty, sint n, sint *pweight, sint *hweight, sint *vweight);
-float percentid(char *s1, char *s2,sint length);
-/*
- * Global variables
- */
-
-extern sint gap_dist;
-extern sint max_aa;
-extern sint debug;
-extern Boolean dnaflag;
-extern Boolean use_endgaps;
-extern Boolean endgappenalties;
-extern Boolean no_var_penalties, no_hyd_penalties, no_pref_penalties;
-extern char hyd_residues[];
-extern char *amino_acid_codes;
-
-/* vwindow is the number of residues used for a window for the variable zone penalties */
-/* vll is the lower limit for the variable zone penalties (vll < pen < 1.0) */
-int vll=50;
-int vwindow=5;
-
-sint vlut[26][26] = {
-/* A B C D E F G H I J K L M N O P Q R S T U V W X Y Z */
-/*A*/ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-/*B*/ 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-/*C*/ 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-/*D*/ 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-/*E*/ 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-/*F*/ 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-/*G*/ 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-/*H*/ 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-/*I*/ 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-/*J*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-/*K*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-/*L*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-/*M*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-/*N*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-/*O*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-/*P*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-/*Q*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-/*R*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
-/*S*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
-/*T*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
-/*U*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
-/*V*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
-/*W*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
-/*X*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
-/*Y*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
-/*Z*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
- };
-
-/* pascarella probabilities for opening a gap at specific residues */
-char pr[] = {'A' , 'C', 'D', 'E', 'F', 'G', 'H', 'K', 'I', 'L',
- 'M' , 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'Y', 'W'};
-sint pas_op[] = { 87, 87,104, 69, 80,139,100,104, 68, 79,
- 71,137,126, 93,128,124,111, 75,100, 77};
-sint pas_op2[] ={ 88, 57,111, 98, 75,126, 95, 97, 70, 90,
- 60,122,110,107, 91,125,124, 81,106, 88};
-sint pal_op[] = { 84, 69,128, 78, 88,176, 53, 95, 55, 49,
- 52,148,147,100, 91,129,105, 51,128, 88};
-
-/* factor applied in calc_gap_coeff() to the opening penalty at columns that already contain gaps */
-float reduced_gap = 1.0;
-/* working flags set by calc_gap_coeff() on every call (TRUE = that penalty type is switched off);
-   derived from no_var_penalties, no_hyd_penalties and no_pref_penalties */
-Boolean nvar_pen,nhyd_pen,npref_pen; /* local copies of ho_hyd_penalties, no_pref_penalties */
-/* set to 0 by calc_gap_coeff() when a gap penalty mask is in use */
-sint gdist; /* local copy of gap_dist */
-
-/*
-* calc_gap_coeff()
-*
-* Computes position-specific gap opening (GAPCOL) and gap extension
-* (LENCOL) penalties for the sequences first_seq..last_seq-1 of
-* alignment and stores them in profile[1..prf_length].
-*
-* gaps[j] receives the number of sequences with a gap at column j.
-* profile[0] (and profile[prf_length] when end gap penalties are off)
-* receive the penalties used at the ends of the profile.
-*
-* Return values:
-* none
-*/
-void calc_gap_coeff(char **alignment, sint *gaps, sint **profile, Boolean struct_penalties,
-        char *gap_penalty_mask, sint first_seq, sint last_seq,
-        sint prf_length, sint gapcoef, sint lencoef)
-{
-
-    char c;
-    sint i, j;
-    sint is, ie;
-    static sint numseq,val,pcid;
-    static sint *gap_pos;
-    static sint *v_weight, *p_weight, *h_weight;
-    static float scale;
-
-    numseq = last_seq - first_seq;
-    if(numseq == 2)
-    {
-        pcid=percentid(alignment[first_seq],alignment[first_seq+1],prf_length);
-    }
-    else pcid=0;
-
-    for (j=0; j<prf_length; j++)
-        gaps[j] = 0;
-/*
-   Check for a gap penalty mask
-*/
-    if (struct_penalties != NONE)
-    {
-        nvar_pen = nhyd_pen = npref_pen = TRUE;
-        gdist = 0;
-    }
-    else if (no_var_penalties == FALSE && pcid > 60)
-    {
-        /* NOTE(review): gdist is not assigned on this path and keeps the value
-           left by the previous call - confirm this is intended */
-        if(debug>0) fprintf(stderr,"Using variable zones to set gap penalties (pcid = %d)\n",(pint)pcid);
-        nhyd_pen = npref_pen = TRUE;
-        nvar_pen = FALSE;
-    }
-    else
-    {
-        nvar_pen = TRUE;
-        nhyd_pen = no_hyd_penalties;
-        npref_pen = no_pref_penalties;
-        gdist = gap_dist;
-    }
-
-    for (i=first_seq; i<last_seq; i++)
-    {
-/*
-   Include end gaps as gaps ?
-*/
-        is = 0;
-        ie = prf_length;
-        if (use_endgaps == FALSE && endgappenalties==FALSE)
-        {
-            for (j=0; j<prf_length; j++)
-            {
-                c = alignment[i][j];
-                if ((c < 0) || (c > max_aa))
-                    is++;
-                else
-                    break;
-            }
-            for (j=prf_length-1; j>=0; j--)
-            {
-                c = alignment[i][j];
-                if ((c < 0) || (c > max_aa))
-                    ie--;
-                else
-                    break;
-            }
-        }
-
-        for (j=is; j<ie; j++)
-        {
-            if ((alignment[i][j] < 0) || (alignment[i][j] > max_aa))
-                gaps[j]++;
-        }
-    }
-
-    if ((!dnaflag) && (nvar_pen == FALSE))
-    {
-        v_weight = (sint *) ckalloc( (prf_length+2) * sizeof (sint) );
-        calc_v_penalties(alignment, prf_length, first_seq, last_seq, v_weight);
-    }
-
-
-    if ((!dnaflag) && (npref_pen == FALSE))
-    {
-        p_weight = (sint *) ckalloc( (prf_length+2) * sizeof (sint) );
-        calc_p_penalties(alignment, prf_length, first_seq, last_seq, p_weight);
-    }
-
-    if ((!dnaflag) && (nhyd_pen == FALSE))
-    {
-        h_weight = (sint *) ckalloc( (prf_length+2) * sizeof (sint) );
-        calc_h_penalties(alignment, prf_length, first_seq, last_seq, h_weight);
-    }
-
-    gap_pos = (sint *) ckalloc( (prf_length+2) * sizeof (sint) );
-/*
-   mark the residues close to an existing gap (set gap_pos[i] = -ve)
-*/
-    if (dnaflag || (gdist <= 0))
-    {
-        for (i=0;i<prf_length;i++) gap_pos[i] = gaps[i];
-    }
-    else
-    {
-        i=0;
-        while (i<prf_length)
-        {
-            if (gaps[i] <= 0)
-            {
-                gap_pos[i] = gaps[i];
-                i++;
-            }
-            else
-            {
-                /* columns just before the gapped run */
-                for (j = -gdist+1; j<0; j++)
-                {
-                    if ((i+j>=0) && (i+j<prf_length) &&
-                        ((gaps[i+j] == 0) || (gaps[i+j] < j))) gap_pos[i+j] = j;
-                }
-                /* the gapped run itself; test the index before reading gaps[] */
-                while ((i<prf_length) && (gaps[i] > 0))
-                {
-                    gap_pos[i] = gaps[i];
-                    i++;
-                }
-                /* columns just after the gapped run; stop at the profile end
-                   before gaps[] is read */
-                for (j = 0; j<gdist; j++)
-                {
-                    if (i+j >= prf_length) break;
-                    if (gaps[i+j] > 0) break;
-                    if ((gaps[i+j] == 0) || (gaps[i+j] < -j)) gap_pos[i+j] = -j-1;
-                }
-                i += j;
-            }
-        }
-    }
-    if (debug>1)
-    {
-        fprintf(stdout,"gap open %d gap ext %d\n",(pint)gapcoef,(pint)lencoef);
-        fprintf(stdout,"gaps:\n");
-        for(i=0;i<prf_length;i++) fprintf(stdout,"%d ", (pint)gaps[i]);
-        fprintf(stdout,"\n");
-        fprintf(stdout,"gap_pos:\n");
-        for(i=0;i<prf_length;i++) fprintf(stdout,"%d ", (pint)gap_pos[i]);
-        fprintf(stdout,"\n");
-    }
-
-
-    for (j=0;j<prf_length; j++)
-    {
-
-        if (gap_pos[j] <= 0)
-        {
-/*
-   apply residue-specific and hydrophilic gap penalties.
-*/
-            if (!dnaflag) {
-                profile[j+1][GAPCOL] = local_penalty(gapcoef, j,
-                        p_weight, h_weight, v_weight);
-                profile[j+1][LENCOL] = lencoef;
-            }
-            else {
-                profile[j+1][GAPCOL] = gapcoef;
-                profile[j+1][LENCOL] = lencoef;
-            }
-
-/*
-   increase gap penalty near to existing gaps.
-   (gap_pos[j] < 0 is only produced above when gdist > 0)
-*/
-            if (gap_pos[j] < 0)
-            {
-                profile[j+1][GAPCOL] *= 2.0+2.0*(gdist+gap_pos[j])/gdist;
-            }
-
-
-        }
-        else
-        {
-            scale = ((float)(numseq-gaps[j])/(float)numseq) * reduced_gap;
-            profile[j+1][GAPCOL] = scale*gapcoef;
-            profile[j+1][LENCOL] = 0.5 * lencoef;
-        }
-/*
-   apply the gap penalty mask
-*/
-        if (struct_penalties != NONE)
-        {
-            val = gap_penalty_mask[j]-'0';
-            if (val > 0 && val < 10)
-            {
-                profile[j+1][GAPCOL] *= val;
-                profile[j+1][LENCOL] *= val;
-            }
-        }
-/*
-   make sure no penalty is zero - even for all-gap positions
-*/
-        if (profile[j+1][GAPCOL] <= 0) profile[j+1][GAPCOL] = 1;
-        if (profile[j+1][LENCOL] <= 0) profile[j+1][LENCOL] = 1;
-    }
-
-/* set the penalties at the beginning and end of the profile */
-    if(endgappenalties==TRUE)
-    {
-        profile[0][GAPCOL] = gapcoef;
-        profile[0][LENCOL] = lencoef;
-    }
-    else
-    {
-        profile[0][GAPCOL] = 0;
-        profile[0][LENCOL] = 0;
-        profile[prf_length][GAPCOL] = 0;
-        profile[prf_length][LENCOL] = 0;
-    }
-    if (debug>0)
-    {
-        fprintf(stdout,"Opening penalties:\n");
-        for(i=0;i<=prf_length;i++) fprintf(stdout," %d:%d ",i, (pint)profile[i][GAPCOL]);
-        fprintf(stdout,"\n");
-    }
-    if (debug>0)
-    {
-        fprintf(stdout,"Extension penalties:\n");
-        for(i=0;i<=prf_length;i++) fprintf(stdout,"%d:%d ",i, (pint)profile[i][LENCOL]);
-        fprintf(stdout,"\n");
-    }
-    if ((!dnaflag) && (nvar_pen == FALSE))
-        v_weight=ckfree((void *)v_weight);
-
-    if ((!dnaflag) && (npref_pen == FALSE))
-        p_weight=ckfree((void *)p_weight);
-
-    if ((!dnaflag) && (nhyd_pen == FALSE))
-        h_weight=ckfree((void *)h_weight);
-
-
-    gap_pos=ckfree((void *)gap_pos);
-}
-
-/*
-* calc_v_penalties()
-*
-* For every column i of the pair aln[fs], aln[fs+1], looks at the window
-* [i-vwindow, i+vwindow) and stores in weight[i] a percentage derived from
-* the vlut scores of the residue pairs in that window, never below vll.
-* Columns whose window holds no residue pair get 100. ls is not used.
-*/
-void calc_v_penalties(char **aln, sint n, sint fs, sint ls, sint *weight)
-{
-    char res_a, res_b;
-    sint col, pos, pairs, score;
-
-    for (col = 0; col < n; col++)
-    {
-        score = 0;
-        pairs = 0;
-        for (pos = col - vwindow; pos < col + vwindow; pos++)
-        {
-            if (pos < 0 || pos >= n) continue;
-
-            res_a = aln[fs][pos];
-            res_b = aln[fs+1][pos];
-            /* skip positions where either sequence has no residue */
-            if ((res_a < 0) || (res_a > max_aa) || (res_b < 0) || (res_b > max_aa)) continue;
-
-            score += vlut[amino_acid_codes[res_a]-'A'][amino_acid_codes[res_b]-'A'];
-            pairs++;
-        }
-
-        /* shift by the pair count, then rescale to a percentage */
-        score += pairs;
-        if (pairs > 0)
-            score = (score * 100) / (2 * pairs);
-        else
-            score = 100;
-
-        /* clamp to the lower limit */
-        if (score < vll) score = vll;
-        weight[col] = score;
-    }
-
-
-}
-
-/*
-* calc_p_penalties()
-*
-* For every column i, sums (180 - pas_op[]) over the residues found in
-* sequences fs..ls-1 that appear in the pr[] table, and stores the
-* average over all ls-fs sequences in weight[i].
-*/
-void calc_p_penalties(char **aln, sint n, sint fs, sint ls, sint *weight)
-{
-    char ix;
-    sint j,k,numseq;
-    sint i;
-    /* number of table entries; the old hard-coded bound of 22 read past
-       the end of pr[] and pas_op[] */
-    const sint npr = (sint)(sizeof(pr) / sizeof(pr[0]));
-
-    numseq = ls - fs;
-    for (i=0;i<n;i++)
-    {
-        weight[i] = 0;
-        for (k=fs;k<ls;k++)
-        {
-            ix = aln[k][i];
-            /* gaps and other non-residue codes contribute nothing */
-            if ((ix < 0) || (ix > max_aa)) continue;
-            for (j=0;j<npr;j++)
-            {
-                if (amino_acid_codes[ix] == pr[j])
-                {
-                    weight[i] += (180-pas_op[j]);
-                    break;
-                }
-            }
-        }
-        weight[i] /= numseq;
-    }
-
-}
-
-/*
-* calc_h_penalties()
-*
-* For each sequence fs..ls-1, finds runs of more than 3 consecutive
-* columns whose residue is listed in hyd_residues and adds 100 to
-* weight[] for every column of such a run. The totals are finally
-* divided by the number of sequences.
-*/
-void calc_h_penalties(char **aln, sint n, sint fs, sint ls, sint *weight)
-{
-
-    char ix;
-    sint nh,j,k;
-    sint i,e,s;
-    sint *hyd;
-    float scale;
-
-    hyd = (sint *)ckalloc((n+2) * sizeof(sint));
-    nh = (sint)strlen(hyd_residues);
-    for (i=0;i<n;i++)
-        weight[i] = 0;
-
-    for (k=fs;k<ls;k++)
-    {
-        /* flag the hydrophilic columns of this sequence */
-        for (i=0;i<n;i++)
-        {
-            hyd[i] = 0;
-            ix = aln[k][i];
-            if ((ix < 0) || (ix > max_aa)) continue;
-            for (j=0;j<nh;j++)
-            {
-                if (amino_acid_codes[ix] == hyd_residues[j])
-                {
-                    hyd[i] = 1;
-                    break;
-                }
-            }
-        }
-        /* credit every run of more than 3 flagged columns */
-        i = 0;
-        while (i < n)
-        {
-            if (hyd[i] == 0) i++;
-            else
-            {
-                s = i;
-                /* test the index first so hyd[n] is never read */
-                while ((i<n) && (hyd[i] != 0)) i++;
-                e = i;
-                if (e-s > 3)
-                    for (j=s; j<e; j++) weight[j] += 100;
-            }
-        }
-    }
-
-    scale = ls - fs;
-    for (i=0;i<n;i++)
-        weight[i] /= scale;
-
-    hyd=ckfree((void *)hyd);
-
-    if (debug>1)
-    {
-        for(i=0;i<n;i++) fprintf(stdout,"%d ", (pint)weight[i]);
-        fprintf(stdout,"\n");
-    }
-
-}
-
-/*
-* local_penalty()
-*
-* Scales penalty for column n by the enabled weightings: the variable
-* zone weight, then either a fixed 0.5 when hweight[n] is positive or,
-* failing that, the residue-specific weight. Returns 1 for DNA.
-*/
-sint local_penalty(sint penalty, sint n, sint *pweight, sint *hweight, sint *vweight)
-{
-    float factor = 1.0;
-    Boolean in_hyd_run = FALSE;
-
-    if (dnaflag) return(1);
-
-    if (nvar_pen == FALSE)
-        factor *= (float)vweight[n]/100.0;
-
-    if ((nhyd_pen == FALSE) && (hweight[n] > 0))
-    {
-        factor *= 0.5;
-        in_hyd_run = TRUE;
-    }
-
-    /* the residue-specific weight only applies outside hydrophilic runs */
-    if ((npref_pen == FALSE) && (in_hyd_run == FALSE))
-        factor *= ((float)pweight[n]/100.0);
-
-    factor *= penalty;
-    return((sint)factor);
-
-}
-
-/*
-* percentid()
-*
-* Returns the percentage of counted positions of s1 at which s2 holds
-* the same code. A position is counted when 0 <= s1[i] < max_aa.
-* Scanning stops after the first position where either sequence holds -3.
-* Returns 0 when no position was counted.
-*/
-float percentid(char *s1, char *s2,sint length)
-{
-    sint pos;
-    sint same = 0, counted = 0;
-    float score;
-
-    for (pos = 0; pos < length; pos++)
-    {
-        /* NOTE(review): other routines in this file accept codes up to and
-           including max_aa; here max_aa itself is excluded - confirm */
-        if ((s1[pos] >= 0) && (s1[pos] < max_aa))
-        {
-            counted++;
-            if (s1[pos] == s2[pos]) same++;
-        }
-        if (s1[pos] == (-3) || s2[pos] == (-3)) break;
-    }
-
-    if (counted == 0) return(0);
-
-    score = 100.0 * (float)same / (float)counted;
-    return(score);
-
-}
-
Deleted: trunk/packages/clustalw/trunk/calcprf1.c
===================================================================
--- trunk/packages/clustalw/trunk/calcprf1.c 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/calcprf1.c 2007-08-12 15:08:40 UTC (rev 399)
@@ -1,99 +0,0 @@
-#include <stdio.h>
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-#include "clustalw.h"
-
-
-/*
- * Prototypes
- */
-
-/*
- * Global variables
- */
-
-extern sint max_aa,gap_pos1,gap_pos2;
-
-/* Sum of the weights of sequences first_seq..last_seq-1 that hold code at column col. */
-static sint prf1_code_weight(char **alignment, sint *seq_weight,
-        sint first_seq, sint last_seq, sint col, sint code)
-{
-    sint s, total = 0;
-
-    for (s = first_seq; s < last_seq; s++)
-        if (code == alignment[s][col]) total += seq_weight[s];
-    return(total);
-}
-
-/* Weighted sum over all residue codes and both gap codes of matrix[code][target] at column pos. */
-static int prf1_weighted_sum(sint **weighting, sint matrix[NUMRES][NUMRES],
-        sint pos, sint target)
-{
-    sint d;
-    int f = 0;
-
-    for (d = 0; d <= max_aa; d++)
-        f += (weighting[d][pos] * matrix[d][target]);
-    f += (weighting[gap_pos1][pos] * matrix[gap_pos1][target]);
-    f += (weighting[gap_pos2][pos] * matrix[gap_pos2][target]);
-    return(f);
-}
-
-/*
-* calc_prf1()
-*
-* Builds profile[1..prf_length] for sequences first_seq..last_seq-1:
-* for each column and each residue/gap code, the sequence-weighted
-* average of the matrix scores against the codes present in that
-* column, scaled by the fraction of sequences without a gap there.
-* All-gap columns (gaps[pos] == number of sequences) take their scores
-* directly from the gap_pos1 column of matrix.
-*/
-void calc_prf1(sint **profile, char **alignment, sint *gaps,
-        sint matrix[NUMRES][NUMRES],
-        sint *seq_weight, sint prf_length, sint first_seq, sint last_seq)
-{
-
-    sint **weighting;
-    sint total_weight = 0;
-    sint numseq = last_seq-first_seq;
-    sint i, code, col;
-    float scale;
-
-    weighting = (sint **) ckalloc( (NUMRES+2) * sizeof (sint *) );
-    for (i=0;i<NUMRES+2;i++)
-        weighting[i] = (sint *) ckalloc( (prf_length+2) * sizeof (sint) );
-
-    for (i=first_seq; i<last_seq; i++)
-        total_weight += seq_weight[i];
-
-    /* per-column weight carried by every residue code and the two gap codes */
-    for (col=0; col<prf_length; col++)
-    {
-        for (code=0; code<=max_aa; code++)
-            weighting[code][col] = prf1_code_weight(alignment, seq_weight,
-                    first_seq, last_seq, col, code);
-        weighting[gap_pos1][col] = prf1_code_weight(alignment, seq_weight,
-                first_seq, last_seq, col, gap_pos1);
-        weighting[gap_pos2][col] = prf1_code_weight(alignment, seq_weight,
-                first_seq, last_seq, col, gap_pos2);
-    }
-
-    for (col=0; col<prf_length; col++)
-    {
-        if (gaps[col] == numseq)
-        {
-            for (code=0; code<=max_aa; code++)
-                profile[col+1][code] = matrix[code][gap_pos1];
-            profile[col+1][gap_pos1] = matrix[gap_pos1][gap_pos1];
-            profile[col+1][gap_pos2] = matrix[gap_pos2][gap_pos1];
-            continue;
-        }
-
-        scale = (float)(numseq-gaps[col]) / (float)numseq;
-        for (code=0; code<=max_aa; code++)
-            profile[col+1][code] = (sint )(((float)prf1_weighted_sum(weighting, matrix, col, code)
-                        / (float)total_weight)*scale);
-        profile[col+1][gap_pos1] = (sint )(((float)prf1_weighted_sum(weighting, matrix, col, gap_pos1)
-                    / (float)total_weight)*scale);
-        profile[col+1][gap_pos2] = (sint )(((float)prf1_weighted_sum(weighting, matrix, col, gap_pos2)
-                    / (float)total_weight)*scale);
-    }
-
-    for (i=0;i<NUMRES+2;i++)
-        weighting[i]=ckfree((void *)weighting[i]);
-    weighting=ckfree((void *)weighting);
-
-}
-
-
Deleted: trunk/packages/clustalw/trunk/calcprf2.c
===================================================================
--- trunk/packages/clustalw/trunk/calcprf2.c 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/calcprf2.c 2007-08-12 15:08:40 UTC (rev 399)
@@ -1,73 +0,0 @@
-#include <stdio.h>
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-#include "clustalw.h"
-
-/*
- * Prototypes
- */
-/*
- * Global variables
- */
-
-extern sint max_aa,gap_pos1,gap_pos2;
-
-/*
-* calc_prf2()
-*
-* Fills profile[1..prf_length] for sequences first_seq..last_seq-1 with,
-* for each residue code and both gap codes, 10 times the fraction of the
-* total sequence weight carried by sequences holding that code in the
-* column. All entries are 0 when the total weight is 0.
-*/
-void calc_prf2(sint **profile, char **alignment,
-        sint *seq_weight,sint prf_length, sint first_seq, sint last_seq)
-{
-
-    sint sum1, sum2;
-    sint i, d;
-    sint r;
-
-/*
-   sum2 = total weight of the sequences in the profile; it does not
-   depend on the column, so it is computed once
-*/
-    sum2 = 0;
-    for (i=first_seq; i<last_seq; i++)
-    {
-        sum2 += seq_weight[i];
-    }
-
-    for (r=0; r<prf_length; r++)
-    {
-        if (sum2 == 0)
-        {
-            for (d=0; d<=max_aa; d++)
-                profile[r+1][d] = 0;
-            profile[r+1][gap_pos1] = 0;
-            profile[r+1][gap_pos2] = 0;
-        }
-        else
-        {
-/*
-   weight fraction of each residue type found in this column
-*/
-            for (d=0; d<=max_aa; d++)
-            {
-                sum1 = 0;
-                for (i=first_seq; i<last_seq; i++)
-                {
-                    if (d == alignment[i][r]) sum1 += seq_weight[i];
-                }
-                profile[r+1][d] = (sint)(10 * (float)sum1 / (float)sum2);
-            }
-            sum1 = 0;
-            for (i=first_seq; i<last_seq; i++)
-            {
-                if (gap_pos1 == alignment[i][r]) sum1 += seq_weight[i];
-            }
-            profile[r+1][gap_pos1] = (sint)(10 * (float)sum1 / (float)sum2);
-            sum1 = 0;
-            for (i=first_seq; i<last_seq; i++)
-            {
-                if (gap_pos2 == alignment[i][r]) sum1 += seq_weight[i];
-            }
-            profile[r+1][gap_pos2] = (sint)(10 * (float)sum1 / (float)sum2);
-        }
-    }
-}
-
-
Deleted: trunk/packages/clustalw/trunk/calctree.c
===================================================================
--- trunk/packages/clustalw/trunk/calctree.c 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/calctree.c 2007-08-12 15:08:40 UTC (rev 399)
@@ -1,984 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-#include <stdarg.h>
-#include <ctype.h>
-#include "clustalw.h"
-
-#define MAXERRS 10
-
-/*
- * Prototypes
- */
-static void create_tree(treeptr ptree, treeptr parent);
-static void create_node(treeptr pptr, treeptr parent);
-static treeptr insert_node(treeptr pptr);
-static void skip_space(FILE *fd);
-static treeptr avail(void);
-static void set_info(treeptr p, treeptr parent, sint pleaf, char *pname, float pdist);
-static treeptr reroot(treeptr ptree, sint nseqs);
-static treeptr insert_root(treeptr p, float diff);
-static float calc_root_mean(treeptr root, float *maxdist);
-static float calc_mean(treeptr nptr, float *maxdist, sint nseqs);
-static void order_nodes(void);
-static sint calc_weight(sint leaf);
-static void group_seqs(treeptr p, sint *next_groups, sint nseqs);
-static void mark_group1(treeptr p, sint *groups, sint n);
-static void mark_group2(treeptr p, sint *groups, sint n);
-static void save_set(sint n, sint *groups);
-static void clear_tree_nodes(treeptr p);
-
-
-/*
- * Global variables
- */
-extern Boolean interactive;
-extern Boolean distance_tree;
-extern Boolean usemenu;
-extern sint debug;
-extern double **tmat;
-extern sint **sets;
-extern sint nsets;
-extern char **names;
-extern sint *seq_weight;
-extern Boolean no_weights;
-
-char ch;
-FILE *fd;
-treeptr *lptr;
-treeptr *olptr;
-treeptr *nptr;
-treeptr *ptrs;
-sint nnodes = 0;
-sint ntotal = 0;
-Boolean rooted_tree = TRUE;
-static treeptr seq_tree,root;
-static sint *groups, numseq;
-
-/*
-* calc_seq_weights()
-*
-* Stores a weight for each sequence first_seq..last_seq-1 in sweight[].
-* With at least two sequences, a distance tree and weighting enabled,
-* the weights come from calc_weight() and are normalised so that they
-* sum to (about) INT_SCALE_FACTOR, each being at least 1. Otherwise
-* every sequence gets INT_SCALE_FACTOR / number of sequences.
-*/
-void calc_seq_weights(sint first_seq, sint last_seq, sint *sweight)
-{
-    sint i, nseqs;
-    sint temp, sum, *weight;
-
-
-/*
-   If there are at least two sequences....
-*/
-    nseqs = last_seq-first_seq;
-    if ((nseqs >= 2) && (distance_tree == TRUE) && (no_weights == FALSE))
-    {
-/*
-   Calculate sequence weights based on Phylip tree.
-*/
-        weight = (sint *)ckalloc((last_seq+1) * sizeof(sint));
-
-        for (i=first_seq; i<last_seq; i++)
-            weight[i] = calc_weight(i);
-
-/*
-   Normalise the weights, such that the sum of the weights = INT_SCALE_FACTOR
-*/
-
-        sum = 0;
-        for (i=first_seq; i<last_seq; i++)
-            sum += weight[i];
-
-        if (sum == 0)
-        {
-            for (i=first_seq; i<last_seq; i++)
-                weight[i] = 1;
-            /* nseqs weights of 1 each; the loop index would only be right
-               when first_seq is 0 */
-            sum = nseqs;
-        }
-
-        for (i=first_seq; i<last_seq; i++)
-        {
-            sweight[i] = (weight[i] * INT_SCALE_FACTOR) / sum;
-            if (sweight[i] < 1) sweight[i] = 1;
-        }
-
-        weight=ckfree((void *)weight);
-
-    }
-
-    else if (nseqs > 0)
-    {
-/*
-   Otherwise, use identity weights.
-   (nothing to do, and nothing to divide by, for an empty range)
-*/
-        temp = INT_SCALE_FACTOR / nseqs;
-        for (i=first_seq; i<last_seq; i++)
-            sweight[i] = temp;
-    }
-
-}
-
-/*
-* create_sets()
-*
-* Resets nsets and rebuilds the alignment sets for the sequences
-* first_seq..last_seq-1. With two or more sequences the sets are
-* derived from the tree by group_seqs(); otherwise a simple chain of
-* sets is saved (which is empty for fewer than two sequences).
-*/
-void create_sets(sint first_seq, sint last_seq)
-{
-    sint upto, col, nseqs;
-
-    nsets = 0;
-    nseqs = last_seq-first_seq;
-
-    /* same scratch buffer is needed on both paths */
-    groups = (sint *)ckalloc((nseqs+1) * sizeof(sint));
-
-    if (nseqs >= 2)
-    {
-        group_seqs(root, groups, nseqs);
-    }
-    else
-    {
-        for (upto=0; upto<nseqs-1; upto++)
-        {
-            for (col=0; col<nseqs; col++)
-            {
-                if (col <= upto)
-                    groups[col] = 1;
-                else if (col == upto+1)
-                    groups[col] = 2;
-                else
-                    groups[col] = 0;
-            }
-            save_set(nseqs, groups);
-        }
-    }
-
-    groups=ckfree((void *)groups);
-
-}
-
-/*
-* read_tree()
-*
-* Reads the tree in treefile, checks that it has one leaf per sequence
-* first_seq..last_seq-1, re-roots it if it is unrooted, and matches the
-* leaves to the alignment sequences by (case-insensitive) name.
-*
-* Return values:
-* 1 on success, 0 on any error (an error message has been printed)
-*/
-sint read_tree(char *treefile, sint first_seq, sint last_seq)
-{
-
-    char c;
-    char name1[MAXNAMES+1], name2[MAXNAMES+1];
-    sint i, j, k;
-    size_t len;
-    Boolean found;
-
-    numseq = 0;
-    nnodes = 0;
-    ntotal = 0;
-    rooted_tree = TRUE;
-
-#ifdef VMS
-    if ((fd = fopen(treefile,"r","rat=cr","rfm=var")) == NULL)
-#else
-    if ((fd = fopen(treefile, "r")) == NULL)
-#endif
-    {
-        error("cannot open %s", treefile);
-        return((sint)0);
-    }
-
-    skip_space(fd);
-    ch = (char)getc(fd);
-    if (ch != '(')
-    {
-        error("Wrong format in tree file %s", treefile);
-        /* do not leak the open file on this error path */
-        fclose(fd);
-        return((sint)0);
-    }
-    rewind(fd);
-
-    distance_tree = TRUE;
-
-/*
-   Allocate memory for tree
-*/
-    nptr = (treeptr *)ckalloc(3*(last_seq-first_seq+1) * sizeof(treeptr));
-    ptrs = (treeptr *)ckalloc(3*(last_seq-first_seq+1) * sizeof(treeptr));
-    lptr = (treeptr *)ckalloc((last_seq-first_seq+1) * sizeof(treeptr));
-    olptr = (treeptr *)ckalloc((last_seq+1) * sizeof(treeptr));
-
-    seq_tree = avail();
-    set_info(seq_tree, NULL, 0, "", 0.0);
-
-    create_tree(seq_tree,NULL);
-    fclose(fd);
-
-
-    if (numseq != last_seq-first_seq)
-    {
-        /* cast the whole difference, not just last_seq, to match %d */
-        error("tree not compatible with alignment\n(%d sequences in alignment and %d in tree", (pint)(last_seq-first_seq),(pint)numseq);
-        return((sint)0);
-    }
-
-/*
-   If the tree is unrooted, reroot the tree - ie. minimise the difference
-   between the mean root->leaf distances for the left and right branches of
-   the tree.
-*/
-
-    if (distance_tree == FALSE)
-    {
-        if (rooted_tree == FALSE)
-        {
-            error("input tree is unrooted and has no distances.\nCannot align sequences");
-            return((sint)0);
-        }
-    }
-
-    if (rooted_tree == FALSE)
-    {
-        root = reroot(seq_tree, last_seq-first_seq+1);
-    }
-    else
-    {
-        root = seq_tree;
-    }
-
-/*
-   calculate the 'order' of each node.
-*/
-    order_nodes();
-
-    if (numseq >= 2)
-    {
-/*
-   If there are at least two sequences....
-*/
-/*
-   assign the sequence nodes (in the same order as in the alignment file)
-*/
-        for (i=first_seq; i<last_seq; i++)
-        {
-            len = strlen(names[i+1]);
-            if (len > MAXNAMES)
-                warning("name %s is too long for PHYLIP tree format (max %d chars)", names[i+1],MAXNAMES);
-
-            /* lower-cased, blank-free copy of the alignment name */
-            for (k=0; (size_t)k<len && k<MAXNAMES ; k++)
-            {
-                c = names[i+1][k];
-                if ((c>0x40) && (c<0x5b)) c=c | 0x20;
-                if (c == ' ') c = '_';
-                name2[k] = c;
-            }
-            name2[k]='\0';
-            found = FALSE;
-            for (j=0; j<numseq; j++)
-            {
-                /* lower-cased copy of the tree leaf name */
-                len = strlen(lptr[j]->name);
-                for (k=0; (size_t)k<len && k<MAXNAMES ; k++)
-                {
-                    c = lptr[j]->name[k];
-                    if ((c>0x40) && (c<0x5b)) c=c | 0x20;
-                    name1[k] = c;
-                }
-                name1[k]='\0';
-                if (strcmp(name1, name2) == 0)
-                {
-                    olptr[i] = lptr[j];
-                    found = TRUE;
-                }
-            }
-            if (found == FALSE)
-            {
-                error("tree not compatible with alignment:\n%s not found", name2);
-                return((sint)0);
-            }
-        }
-
-    }
-    return((sint)1);
-}
-
-/*
-* create_tree()
-*
-* Recursive descent parser for one subtree of the tree file open on the
-* global fd. ptree is the (already allocated) node to fill in, parent
-* its parent. Internal nodes are recorded in nptr[]/ptrs[], leaves in
-* lptr[]/ptrs[]; a third branch at a node marks the tree as unrooted.
-* The global ch holds the last character read.
-*
-* NOTE(review): nothing here limits nnodes/ntotal/numseq to the sizes
-* allocated in read_tree() - a tree with more leaves than sequences
-* would overrun those arrays; confirm how callers guard against this.
-*/
-static void create_tree(treeptr ptree, treeptr parent)
-{
-    treeptr p;
-
-    sint i, type;
-    int next;       /* raw getc() result, so EOF can be told apart from data */
-    float dist;
-    char name[MAXNAMES+1];
-
-/*
-   is this a node or a leaf ?
-*/
-    skip_space(fd);
-    ch = (char)getc(fd);
-    if (ch == '(')
-    {
-/*
-   this must be a node....
-*/
-        type = NODE;
-        name[0] = '\0';
-        ptrs[ntotal] = nptr[nnodes] = ptree;
-        nnodes++;
-        ntotal++;
-
-        create_node(ptree, parent);
-
-        p = ptree->left;
-        create_tree(p, ptree);
-
-        if ( ch == ',')
-        {
-            p = ptree->right;
-            create_tree(p, ptree);
-            if ( ch == ',')
-            {
-                /* third branch: wrap the node so it stays binary */
-                ptree = insert_node(ptree);
-                ptrs[ntotal] = nptr[nnodes] = ptree;
-                nnodes++;
-                ntotal++;
-                p = ptree->right;
-                create_tree(p, ptree);
-                rooted_tree = FALSE;
-            }
-        }
-
-        skip_space(fd);
-        ch = (char)getc(fd);
-    }
-/*
-   ...otherwise, this is a leaf
-*/
-    else
-    {
-        type = LEAF;
-        ptrs[ntotal++] = lptr[numseq++] = ptree;
-/*
-   get the sequence name
-*/
-        name[0] = ch;
-        next = getc(fd);
-        ch = (char)next;
-        i = 1;
-        /* also stop at end of file / read error, otherwise a truncated
-           tree file would loop here forever */
-        while ((next != EOF) && (ch != ':') && (ch != ',') && (ch != ')'))
-        {
-            if (i < MAXNAMES) name[i++] = ch;
-            next = getc(fd);
-            ch = (char)next;
-        }
-        name[i] = '\0';
-        if (ch != ':')
-        {
-            distance_tree = FALSE;
-        }
-    }
-
-/*
-   get the distance information
-*/
-    dist = 0.0;
-    if (ch == ':')
-    {
-        skip_space(fd);
-        fscanf(fd,"%f",&dist);
-        skip_space(fd);
-        ch = (char)getc(fd);
-    }
-    set_info(ptree, parent, type, name, dist);
-
-
-}
-
-/* Sets pptr's parent and gives it two freshly allocated, empty children. */
-static void create_node(treeptr pptr, treeptr parent)
-{
-    pptr->parent = parent;
-    pptr->left = avail();
-    pptr->right = avail();
-}
-
-/*
-* insert_node()
-*
-* Creates a new internal node above pptr: pptr becomes its left child
-* and a fresh empty node its right child. The new node takes over
-* pptr's former parent. Returns the new node.
-*
-* The former parent's own child pointers are not changed here.
-*/
-static treeptr insert_node(treeptr pptr)
-{
-
-    treeptr newnode;
-    /* remember the parent before pptr is re-linked; the old code read
-       pptr->parent afterwards and so made newnode its own parent */
-    treeptr old_parent = pptr->parent;
-
-    newnode = avail();
-    set_info(newnode, old_parent, NODE, "", 0.0);
-
-    /* only the right child needs allocating; allocating a left child as
-       well and then overwriting it leaked one node per call */
-    newnode->left = pptr;
-    newnode->right = avail();
-    pptr->parent = newnode;
-
-    return(newnode);
-}
-
-/* Consumes whitespace from fd; the first non-space character is pushed back. */
-static void skip_space(FILE *fd)
-{
-    int c = getc(fd);
-
-    while (isspace(c))
-        c = getc(fd);
-
-    ungetc(c, fd);
-}
-
-/* Allocates a tree node with no links, zero distance/order/leaf and an empty name. */
-static treeptr avail(void)
-{
-    treeptr node = ckalloc(sizeof(stree));
-
-    node->parent = NULL;
-    node->left = NULL;
-    node->right = NULL;
-    node->name[0] = '\0';
-    node->dist = 0.0;
-    node->order = 0;
-    node->leaf = 0;
-    return(node);
-}
-
-/* Frees the tree below p (the current root when p is NULL) and the node index arrays. */
-void clear_tree(treeptr p)
-{
-    clear_tree_nodes(p);
-
-    olptr=ckfree((void *)olptr);
-    lptr=ckfree((void *)lptr);
-    ptrs=ckfree((void *)ptrs);
-    nptr=ckfree((void *)nptr);
-}
-
-/* Recursively frees p and everything below it; a NULL p means the current root. */
-static void clear_tree_nodes(treeptr p)
-{
-    treeptr node = (p == NULL) ? root : p;
-
-    if (node->left != NULL)
-        clear_tree_nodes(node->left);
-    if (node->right != NULL)
-        clear_tree_nodes(node->right);
-
-    node->left = NULL;
-    node->right = NULL;
-    ckfree((void *)node);
-}
-
-/*
-* Stores parent, leaf flag, name and distance in p and resets its order.
-* When the leaf flag equals TRUE the child links are cleared as well.
-*/
-static void set_info(treeptr p, treeptr parent, sint pleaf, char *pname, float pdist)
-{
-    strcpy(p->name, pname);
-    p->dist = pdist;
-    p->order = 0;
-    p->parent = parent;
-    p->leaf = pleaf;
-
-    if (p->leaf == TRUE)
-        p->left = p->right = NULL;
-}
-
-/*
-* reroot()
-*
-* Chooses the branch on which to place the root of an unrooted tree:
-* among the nodes whose left/right mean leaf distances differ by 0, or
-* by a positive amount smaller than twice the node's branch length, the
-* one giving the smallest maximum leaf distance wins. A new root is
-* inserted above it (above the old top node's right child when the top
-* node itself was chosen) and returned.
-*/
-static treeptr reroot(treeptr ptree, sint nseqs)
-{
-
-    treeptr node, best, newroot;
-    float diff, bestdiff = 0.0, bestdepth = 1.0, maxdist;
-    sint idx;
-    Boolean none_yet = TRUE;
-
-    best = ptree;
-    for (idx = 0; idx < ntotal; idx++)
-    {
-        node = ptrs[idx];
-        diff = (node->parent == NULL)
-            ? calc_root_mean(node, &maxdist)
-            : calc_mean(node, &maxdist, nseqs);
-
-        /* not a candidate: the balance point is not on this branch */
-        if (!((diff == 0) || ((diff > 0) && (diff < 2 * node->dist))))
-            continue;
-        /* keep the candidate that yields the shallowest tree */
-        if (!((maxdist < bestdepth) || (none_yet == TRUE)))
-            continue;
-
-        none_yet = FALSE;
-        best = node;
-        bestdepth = maxdist;
-        bestdiff = diff;
-    }
-
-    if (best == ptree)
-    {
-        bestdiff = best->left->dist + best->right->dist;
-        best = best->right;
-    }
-    newroot = insert_root(best, bestdiff);
-
-    diff = calc_root_mean(newroot, &maxdist);
-
-    return(newroot);
-}
-
-/*
-* insert_root()
-*
-* Inserts a new root node on the branch between p and its parent and
-* returns it. p must not be the top node: p->parent is dereferenced
-* unconditionally. p keeps diff/2 of its branch length (limited to the
-* range 0..original length); the former parent gets the remainder.
-* The old top node is unlinked from the tree but not freed here.
-*/
-static treeptr insert_root(treeptr p, float diff)
-{
- treeptr newp, prev, q, t;
- float dist, prevdist,td;
-
- newp = avail();
-
- t = p->parent;
- prevdist = t->dist;
-
- p->parent = newp;
-
- dist = p->dist;
-
-/* split p's branch between p and its former parent t */
- p->dist = diff / 2;
- if (p->dist < 0.0) p->dist = 0.0;
- if (p->dist > dist) p->dist = dist;
-
- t->dist = dist - p->dist;
-
- newp->left = t;
- newp->right = p;
- newp->parent = NULL;
- newp->dist = 0.0;
- newp->leaf = NODE;
-
-/* the child slot of t that held p now points up to t's old parent */
- if (t->left == p) t->left = t->parent;
- else t->right = t->parent;
-
- prev = t;
- q = t->parent;
-
- t->parent = newp;
-
-/*
- walk up to the old top node, turning each node's link to the node we
- came from into a link to its old parent, and shifting the branch
- lengths one step along the path
-*/
- while (q != NULL)
- {
- if (q->left == prev)
- {
- q->left = q->parent;
- q->parent = prev;
- td = q->dist;
- q->dist = prevdist;
- prevdist = td;
- prev = q;
- q = q->left;
- }
- else
- {
- q->right = q->parent;
- q->parent = prev;
- td = q->dist;
- q->dist = prevdist;
- prevdist = td;
- prev = q;
- q = q->right;
- }
- }
-
-/*
- remove the old root node
- (prev is the last node visited; its remaining child is attached to
- prev's parent in its place, with prev's branch length added on)
-*/
- q = prev;
- if (q->left == NULL)
- {
- dist = q->dist;
- q = q->right;
- q->dist += dist;
- q->parent = prev->parent;
- if (prev->parent->left == prev)
- prev->parent->left = q;
- else
- prev->parent->right = q;
- prev->right = NULL;
- }
- else
- {
- dist = q->dist;
- q = q->left;
- q->dist += dist;
- q->parent = prev->parent;
- if (prev->parent->left == prev)
- prev->parent->left = q;
- else
- prev->parent->right = q;
- prev->left = NULL;
- }
-
- return(newp);
-}
-
-/*
-* calc_root_mean()
-*
-* For every leaf, sums the branch lengths from the leaf up to root and
-* notes whether the path arrives through root's left or right child.
-* Returns (mean distance of left leaves) - (mean distance of right
-* leaves); *maxdist receives the largest leaf distance.
-*/
-static float calc_root_mean(treeptr root, float *maxdist)
-{
-    float dist, lsum = 0.0, rsum = 0.0, lmean, rmean, diff;
-    treeptr node;
-    sint leafno;
-    sint nl = 0, nr = 0;
-
-    dist = (*maxdist) = 0;
-    for (leafno = 0; leafno < numseq; leafno++)
-    {
-        dist = 0.0;
-        /* climb until node is a direct child of root */
-        for (node = lptr[leafno]; node->parent != root; node = node->parent)
-            dist += node->dist;
-        dist += node->dist;
-
-        if (node == root->left)
-        {
-            lsum += dist;
-            nl++;
-        }
-        else
-        {
-            rsum += dist;
-            nr++;
-        }
-        if (dist > (*maxdist)) *maxdist = dist;
-    }
-
-    lmean = lsum / nl;
-    rmean = rsum / nr;
-
-    diff = lmean - rmean;
-    return(diff);
-}
-
-
-/*
-* calc_mean()
-*
-* Splits the leaves into those below nptr (RIGHT) and all others
-* (LEFT) and returns the LEFT mean distance minus the RIGHT mean
-* distance, both measured as seen from the top of nptr's branch.
-* *maxdist receives the largest per-leaf distance accumulated on the
-* way. nseqs sizes the scratch arrays holding the path to the root.
-*/
-static float calc_mean(treeptr nptr, float *maxdist, sint nseqs)
-{
-    float dist, lsum = 0.0, rsum = 0.0, lmean, rmean, diff;
-    treeptr node, *path2root;
-    float *dist2node;
-    sint depth = 0, leafno, k, hit = 0;
-    sint nl = 0, nr = 0;
-    sint side, found;
-
-    path2root = (treeptr *)ckalloc(nseqs * sizeof(treeptr));
-    dist2node = (float *)ckalloc(nseqs * sizeof(float));
-
-    /* record every node from nptr up to the root with its cumulative distance */
-    depth = (*maxdist) = dist = 0;
-    for (node = nptr; node != NULL; node = node->parent)
-    {
-        path2root[depth] = node;
-        dist += node->dist;
-        dist2node[depth] = dist;
-        depth++;
-    }
-
-    for (leafno = 0; leafno < numseq; leafno++)
-    {
-        node = lptr[leafno];
-        dist = 0.0;
-        if (node == nptr)
-        {
-            side = RIGHT;
-        }
-        else
-        {
-            side = LEFT;
-            /* climb from the leaf until its parent lies on the recorded path */
-            found = FALSE;
-            hit = 0;
-            while ((found == FALSE) && (node->parent != NULL))
-            {
-                for (k = 0; k < depth; k++)
-                {
-                    if (node->parent == path2root[k])
-                    {
-                        found = TRUE;
-                        hit = k;
-                    }
-                }
-                dist += node->dist;
-                node = node->parent;
-            }
-            if (node == nptr) side = RIGHT;
-        }
-
-        if (side == LEFT)
-        {
-            lsum += dist;
-            lsum += dist2node[hit-1];
-            nl++;
-        }
-        else
-        {
-            rsum += dist;
-            nr++;
-        }
-
-        if (dist > (*maxdist)) *maxdist = dist;
-    }
-
-    dist2node=ckfree((void *)dist2node);
-    path2root=ckfree((void *)path2root);
-
-    lmean = lsum / nl;
-    rmean = rsum / nr;
-
-    diff = lmean - rmean;
-    return(diff);
-}
-
-/* Adds 1 to the order of every node on the path from each leaf up to the top node. */
-static void order_nodes(void)
-{
-    sint leafno;
-    treeptr node;
-
-    for (leafno = 0; leafno < numseq; leafno++)
-        for (node = lptr[leafno]; node != NULL; node = node->parent)
-            node->order++;
-}
-
-
-/*
-* Weight of sequence number leaf: the sum of dist/order over the nodes
-* from its leaf up to (not including) the top node, times 100,
-* truncated to an integer.
-*/
-static sint calc_weight(sint leaf)
-{
-    treeptr node;
-    float total = 0.0;
-
-    for (node = olptr[leaf]; node->parent != NULL; node = node->parent)
-        total += node->dist / node->order;
-
-    total *= 100.0;
-
-    return((sint)total);
-
-}
-
-/*
-* group_seqs()
-*
-* Recursively walks the tree below p. For each node with a right child
-* a set is saved in which sequences under the left child are marked 1
-* and those under the right child 2. On return next_groups holds the
-* marks collected for p.
-*/
-static void group_seqs(treeptr p, sint *next_groups, sint nseqs)
-{
-    sint k;
-    sint *marks;
-
-    marks = (sint *)ckalloc((nseqs+1) * sizeof(sint));
-    for (k = 0; k < nseqs; k++)
-        marks[k] = 0;
-
-    if (p->left != NULL)
-    {
-        if (p->left->leaf != NODE)
-        {
-            mark_group1(p->left, marks, nseqs);
-        }
-        else
-        {
-            group_seqs(p->left, next_groups, nseqs);
-            for (k = 0; k < nseqs; k++)
-                if (next_groups[k] != 0) marks[k] = 1;
-        }
-    }
-
-    if (p->right != NULL)
-    {
-        if (p->right->leaf != NODE)
-        {
-            mark_group2(p->right, marks, nseqs);
-        }
-        else
-        {
-            group_seqs(p->right, next_groups, nseqs);
-            for (k = 0; k < nseqs; k++)
-                if (next_groups[k] != 0) marks[k] = 2;
-        }
-        save_set(nseqs, marks);
-    }
-
-    /* hand the marks for this subtree back to the caller */
-    for (k = 0; k < nseqs; k++)
-        next_groups[k] = marks[k];
-
-    marks=ckfree((void *)marks);
-
-}
-
-/* Sets groups[i] to 1 for the sequence whose leaf is p and to 0 for all others. */
-static void mark_group1(treeptr p, sint *groups, sint n)
-{
-    sint k;
-
-    for (k = 0; k < n; k++)
-        groups[k] = (olptr[k] == p) ? 1 : 0;
-}
-
-/* Marks the sequence whose leaf is p with 2; every other non-zero entry becomes 1. */
-static void mark_group2(treeptr p, sint *groups, sint n)
-{
-    sint k;
-
-    for (k = 0; k < n; k++)
-    {
-        if (olptr[k] == p)
-        {
-            groups[k] = 2;
-            continue;
-        }
-        if (groups[k] != 0)
-            groups[k] = 1;
-    }
-}
-
-/* Appends groups[0..n-1] as the next set; sets and their columns are numbered from 1. */
-static void save_set(sint n, sint *groups)
-{
-    sint k;
-
-    nsets++;
-    for (k = 0; k < n; k++)
-        sets[nsets][k+1] = groups[k];
-}
-
-
-
-/*
-* calc_similarities()
-*
-* Converts pairwise distances into percent similarities in tmat
-* (indexed from 1). With two or more sequences the distances are read
-* off the tree as the path length between the two leaves, limited to
-* the range 0.01..1.0; otherwise the values already in tmat are used.
-* Pairs further apart than 1.1 are reported, and in interactive mode
-* the user is asked whether to continue.
-*
-* Return values:
-* 1 on success, 0 if the user chose not to continue (tmat is then
-* left unchanged)
-*/
-sint calc_similarities(sint nseqs)
-{
-    sint depth = 0, i,j, k, n;
-    sint found;
-    sint nerrs, seq1[MAXERRS],seq2[MAXERRS];
-    sint proceed = 1;
-    treeptr p, *path2root;
-    float dist;
-    float *dist2node, bad_dist[MAXERRS];
-    double **dmat;
-    char err_mess[1024],err1[MAXLINE];
-    char reply;
-
-    path2root = (treeptr *)ckalloc((nseqs) * sizeof(treeptr));
-    dist2node = (float *)ckalloc((nseqs) * sizeof(float));
-    dmat = (double **)ckalloc((nseqs) * sizeof(double *));
-    for (i=0;i<nseqs;i++)
-        dmat[i] = (double *)ckalloc((nseqs) * sizeof(double));
-
-    if (nseqs >= 2)
-    {
-/*
-   for each leaf, determine all nodes between the leaf and the root;
-*/
-        for (i = 0;i<nseqs; i++)
-        {
-            depth = dist = 0;
-            p = olptr[i];
-            while (p != NULL)
-            {
-                path2root[depth] = p;
-                dist += p->dist;
-                dist2node[depth] = dist;
-                p = p->parent;
-                depth++;
-            }
-
-/*
-   for each pair....
-*/
-            for (j=0; j < i; j++)
-            {
-                p = olptr[j];
-                dist = 0.0;
-/*
-   find the common ancestor.
-*/
-                found = FALSE;
-                n = 0;
-                while ((found == FALSE) && (p->parent != NULL))
-                {
-                    for (k=0; k< depth; k++)
-                        if (p->parent == path2root[k])
-                        {
-                            found = TRUE;
-                            n = k;
-                        }
-                    dist += p->dist;
-                    p = p->parent;
-                }
-
-                dmat[i][j] = dist + dist2node[n-1];
-            }
-        }
-
-        /* clamp the distances and remember the first few that are far out of range */
-        nerrs = 0;
-        for (i=0;i<nseqs;i++)
-        {
-            dmat[i][i] = 0.0;
-            for (j=0;j<i;j++)
-            {
-                if (dmat[i][j] < 0.01) dmat[i][j] = 0.01;
-                if (dmat[i][j] > 1.0) {
-                    if (dmat[i][j] > 1.1 && nerrs<MAXERRS) {
-                        seq1[nerrs] = i;
-                        seq2[nerrs] = j;
-                        bad_dist[nerrs] = dmat[i][j];
-                        nerrs++;
-                    }
-                    dmat[i][j] = 1.0;
-                }
-            }
-        }
-        if (nerrs>0)
-        {
-            strcpy(err_mess,"The following sequences are too divergent to be aligned:\n");
-            for (i=0;i<nerrs && i<5;i++)
-            {
-                sprintf(err1," %s and %s (distance %1.3f)\n",
-                        names[seq1[i]+1],
-                        names[seq2[i]+1],bad_dist[i]);
-                strcat(err_mess,err1);
-            }
-            strcat(err_mess,"(All distances should be between 0.0 and 1.0)\n");
-            strcat(err_mess,"This may not be fatal but you have been warned!\n");
-            strcat(err_mess,"SUGGESTION: Remove one or more problem sequences and try again");
-            if(interactive)
-                reply=prompt_for_yes_no(err_mess,"Continue ");
-            else reply = 'y';
-            if ((reply != 'y') && (reply != 'Y'))
-                proceed = 0;
-        }
-    }
-    else
-    {
-        for (i=0;i<nseqs;i++)
-        {
-            for (j=0;j<i;j++)
-            {
-                dmat[i][j] = tmat[i+1][j+1];
-            }
-        }
-    }
-
-    if (proceed)
-    {
-        for (i=0;i<nseqs;i++)
-        {
-            tmat[i+1][i+1] = 0.0;
-            for (j=0;j<i;j++)
-            {
-                tmat[i+1][j+1] = 100.0 - (dmat[i][j]) * 100.0;
-                tmat[j+1][i+1] = tmat[i+1][j+1];
-            }
-        }
-    }
-
-    /* single release point: the scratch arrays are freed on the
-       "do not continue" path as well */
-    path2root=ckfree((void *)path2root);
-    dist2node=ckfree((void *)dist2node);
-    for (i=0;i<nseqs;i++) dmat[i]=ckfree((void *)dmat[i]);
-    dmat=ckfree((void *)dmat);
-
-    return(proceed);
-}
-
Deleted: trunk/packages/clustalw/trunk/clustalv.doc
===================================================================
--- trunk/packages/clustalw/trunk/clustalv.doc 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/clustalv.doc 2007-08-12 15:08:40 UTC (rev 399)
@@ -1,1978 +0,0 @@
-
-
-
- Clustal V Multiple Sequence Alignments.
-
- Documentation (Installation and Usage).
-
- Des Higgins
- European Molecular Biology Laboratory
- Postfach 10.2209
- D-6900 Heidelberg
- Germany.
-
- higgins at EMBL-Heidelberg.DE
-
-
-*******************************************************************
-
-
- Contents.
-
-
- 1 Overview
-
- 2 Installation
-
- 3 Interactive usage
-
- 4 Command-line interface
-
- 5 Algorithms and references
-
-
-*******************************************************************
-
- 1. Overview
-
-This document describes how to install and use ClustalV on various
-machines. ClustalV is a complete upgrade and rewrite of the Clustal
-package of multiple alignment programs (Higgins and Sharp, 1988 and
-1989). The original programs were written in Fortran for
-microcomputers running MSDOS. You carried out a complete alignment
-by running 3 programs in succession. Later, these were merged into
-a single menu driven program with on-line help, for VAX/VMS.
-ClustalV was written in C and has all of the features of the old
-programs plus many new ones. It has been compiled and tested using
-VAX/VMS C, Decstation ULTRIX C, Gnu C for Sun workstations, Turbo C
-for IBM PC's and Think C for Apple Mac's. The original Clustal was
-written by Des Higgins while he was a Post-Doc in the lab of Paul
-Sharp in the Genetics Department, Trinity College, Dublin 2,
-Ireland.
-
-The main feature of the old package was the ability to carry out
-reliable multiple alignments of many sequences. The sensitivity of
-the program is as good as from any other program we have tried, with
-the exception of the programs of Vingron and Argos (1991), while it
-works in reasonable time on a microcomputer. The programs of
-Vingron and Argos are specialised for finding distant similarities
-between proteins but require mainframes or workstations and are more
-difficult to use.
-
-The main new features are: profile alignments (alignments of old
-alignments); phylogenetic trees (Neighbor Joining trees calculated
-after multiple alignment with a bootstrapping option); better
-sequence input (automatically recognise and read NBRF/PIR, Pearson
-(Fasta) or EMBL/SwissProt formats); flexible alignment output
-(choose one of: old Clustal format, NBRF/PIR, GCG msf format or
-Phylip format); full command line interface (everything that you can
-do interactively can be specified on the command line).
-
-In version 7 of the GCG package, there is a program called PILEUP
-which uses a very similar algorithm to the one in ClustalV. There
-are 2 main differences between the programs: 1) the metric used to
-compare the sequences for the initial "guide tree" uses a full
-global, optimal alignment in PILEUP instead of the fast, approximate
-ones in ClustalV. This makes PILEUP much slower for the comparison
-of long sequences. In principle, the distances calculated from
-PILEUP will be more sensitive than ours, but in practice it will not
-make much difference, except in difficult cases. 2) During the
-multiple alignment, terminal gaps are penalised in ClustalV but not
-in PILEUP. This will make the PILEUP alignments better when the
-sequences are of very different lengths (has no effect if there are
-no large terminal gaps).
-
-
-This software may be distributed and used freely, provided that you
-do not modify it or this documentation in any way without the
-permission of the authors.
-
-If you wish to refer to ClustalV, please cite:
-Higgins,D.G. Bleasby,A.J. and Fuchs,R. (1991) CLUSTAL V: improved software
-for multiple sequence alignment. CABIOS, vol. 8, 189-191.
-
-The overall multiple alignment algorithm was described in:
-Higgins,D.G. and Sharp,P.M. (1989). Fast and sensitive multiple
-sequence alignments on a microcomputer. CABIOS, vol. 5, 151-153.
-
-
-ACKNOWLEDGEMENTS.
-
-D.H. would particularly like to thank Paul Sharp, in whose lab. this
-work originated. We also thank Manolo Gouy, Gene Myers, Peter Rice
-and Martin Vingron for suggestions, bug-fixes and help.
-
-Des Higgins and Rainer Fuchs,
-EMBL Data Library, Heidelberg, Germany.
-
-Alan Bleasby,
-Daresbury, UK.
-
-JUNE 1991
-*******************************************************************
-
- 2. Installation.
-
-
-
-As far as possible, we have tried to make ClustalV portable to any
-machine with a standard C compiler (proposed ANSI C standard). The
-source code, as supplied by us, has been compiled and tested using
-the following compilers:
-
-VAX/VMS C
-Ultrix C (on a Decstation 2100)
-Gnu C on a Sun 4 workstation
-Think C on an Apple Macintosh SE
-Turbo C on an IBM AT.
-
-In each case, one must make 1 change to 1 line of code in 1 header
-file. This is described below. The exact capacity of the program
-(how many sequences of what length can be aligned) will depend of
-course on available memory but can also be set in this header file.
-
-The package comes as 9 C source files; 3 header files; 1 file of on-
-line help; this documentation file; 3 make files:
-
-Source code: clustalv.c, amenu.c, gcgcheck.c, myers.c, sequence.c,
- showpair.c, trees.c, upgma.c, util.c
-
-Header files: clustalv.h, general.h, matrices.h
-
-On-Line help: clustalv.hlp (must be renamed or defined as
- clustalv_help except on PC's)
-
-Documentation: clustalv.doc (this file).
-
-Makefiles: makefile.sun (gnu c on Sun), vmslink.com (vax/vms),
- makefile.ult (ultrix).
-
-
-
-
-
-
-
-Before compiling ClustalV you must look at and possibly change
-clustalV.h, shown below.
-
-/*******************CLUSTALV.H********************************/
-
-/*
-Main header file for ClustalV. Uncomment ONE of the following lines
-depending on which compiler you wish to use.
-*/
-
-#define VMS 1 /* VAX VMS */
-
-/*#define MAC 1 Think_C for MacIntosh */
-
-/*#define MSDOS 1 Turbo C for PC's */
-
-/*#define UNIX 1 Ultrix for Decstations or Gnu C for Sun */
-
-/*************************************************************/
-
-#include "general.h"
-
-#define MAXNAMES 10
-#define MAXTITLES 60
-#define FILENAMELEN 256
-
-#define UNKNOWN 0
-#define EMBLSWISS 1
-#define PIR 2
-#define PEARSON 3
-
-#define PAGE_LEN 22
-
-#if VMS
-#define DIRDELIM ']'
-#define MAXLEN 3000
-#define MAXN 150
-#define FSIZE 15000
-#define LINELENGTH 60
-#define GCG_LINELENGTH 50
-
-#elif MAC
-#define DIRDELIM ':'
-#define MAXLEN 2600
-#define MAXN 30
-#define FSIZE 10000
-#define LINELENGTH 50
-#define GCG_LINELENGTH 50
-
-#elif MSDOS
-#define DIRDELIM '\\'
-#define MAXLEN 1300
-#define MAXN 30
-#define FSIZE 5000
-#define LINELENGTH 50
-#define GCG_LINELENGTH 50
-
-#elif UNIX
-#define DIRDELIM '/'
-#define MAXLEN 3000
-#define MAXN 50
-#define FSIZE 15000
-#define LINELENGTH 60
-#define GCG_LINELENGTH 50
-#endif
-/*****************end*of*CLUSTALV.H***************************/
-
-
-
-First, you must remove the comments from one of the first 10 lines.
-There are 4 'define' compiler directives here (e.g. #define VMS 1),
-and you should use one of these, depending on which system you wish
-to work. So choose one of these, remove its comments (if it is
-already commented out) and put comments around any of the others
-that are still active. If you wish to use a different system, you
-will need to insert a new line with a new keyword (which you must
-invent) to identify your system. Most of the rest of this header
-file is taken up with a block of 'define' statements for each system
-type; e.g. the VAX/VMS block is:
-
-#if VMS
-#define DIRDELIM ']'
-#define MAXLEN 3000
-#define MAXN 150
-#define FSIZE 15000
-#define LINELENGTH 60
-#define GCG_LINELENGTH 50
-
-In this block, you can specify the maximum number of sequences to be
-allowed (MAXN); the maximum sequence length, including gaps
-(MAXLEN); FSIZE declares the size of some workspace, used by the
-fast 2 sequence comparison routines and should be APPROXIMATELY 4
-times MAXLEN; LINELENGTH is the length of the blocks of alignment
-output in the output files; GCG_LINELENGTH is the same but for the
-GCG compatible output only. Finally, DIRDELIM is the character used
-to specify directories and subdirectories in file names. It should
-be the character used to separate the file name itself from the
-directory name (e.g. in VMS, file names are like:
-$drive:[dir1.dir2.dir3]filename.ext;2 so ']' is used as DIRDELIM).
-
-So, if you want to use a system, not covered in Clustalv.h, you will
-have to insert a new block, like the above one. To compile and link
-the program, we supply 3 makefiles: one each for VAX/VMS, Ultrix
-and GNU C for Sun workstations.
-
-
-
-VAX/VMS
-
-Compile and link the program with the
-supplied makefile for vms: vmslink.com .
-
-$ @vmslink
-
-This will produce clustalv.exe (and a lot of .obj files which you can delete).
-
-The on-line help file (clustalv.hlp) should be 'defined' as
-clustalv_help as follows:
-
-$ def clustalv_help $drive:[dir1.dir2]clustalv.hlp
-
-where $drive is the drive designation and [dir1.dir2] is the
-directory where clustalv.hlp is kept.
-
-To make use of the command-line interface, you must make clustalv a
-'foreign' command with:
-
-$ clustalv :== $$drive:[dir1.dir2]clustalv
-
-where $drive is the drive designation and [dir1.dir2] is the
-directory where clustalv.exe is kept.
-
-
-
-IBM PC/MSDOS/TURBO C
-
-Create a makefile (something.prj) with the names of the source files
-(clustalv.c, amenu.c etc.) and 'make' this using the HUGE memory
-model. You will get half a dozen warnings from the compiler about
-pieces of code that look suspicious to it but ignore these. The
-help file should remain as clustalv.hlp . To run the program using
-the default settings in Clustalv.h, you need approximately 500k of
-memory. To reduce this, the main influence on memory usage is the
-parameter MAXLEN; reduce MAXLEN to reduce memory usage.
-
-
-
-Apple Mac/THINK_C version 4.0.2
-
-This version of the program is not at all Mac like. It runs in a
-window, the inside of which looks just like a normal character based
-terminal. In the future we might put a proper Mac interface on it
-but do not have the time right now. With the default settings in
-the header file ClustalV.h, you need just over 800k of memory to run
-the program. To reduce this, reduce MAXLEN; this is easily the
-biggest influence on memory usage. To compile the program and save
-it as an application you need to 'set the application type'; here
-you specify how much memory (in kilobytes (k)) the application will
-need. You should set this to 900k to run the application as it is
-OR reduce MAXLEN in the header. To compile the program you have to
-create a 'project'; you 'add' the names of the 9 source files to the
-project AND the name of the ANSI library. The source code is too
-large to compile in one compilation unit. You will get a 'link
-error: code segment too big' if you try to compile and link as is.
-You should compile amenu.c (the biggest source file) as a separate
-unit ..... you will have to read the manual/ask someone/mail me to
-find out what this is.
-
-
-*******************************************************************
-
- 3. Interactive usage.
-
-
-
-Interactive usage of Clustal V is completely menu driven. On-line
-help is provided, defaults are offered for all parameters and file
-names. With a little effort it should be completely self
-explanatory. The main menu, which appears when you run the
-programs is shown below. Each item brings you to a sub menu.
-
-
-
-Main menu for Clustal V:
-
-
- 1. Sequence Input From Disc
- 2. Multiple Alignments
- 3. Profile Alignments
- 4. Phylogenetic trees
-
- S. Execute a system command
- H. HELP
- X. EXIT (leave program)
-
-
-Your choice:
-
-
-
-The options S and H appear on all the main menus. H will provide
-help and if you type S you will be asked to enter a command, such as
-DIR or LS, which will be sent to the system (does not work on
-Mac's). Before carrying out an alignment, you must use option 1
-(sequence input); the format for sequences is explained below.
-Under menu item 2 you will be able to automatically align your
-sequences to each other. Menu item 3 allows you to do profile
-alignments. These are alignments of old alignments. This allows
-you to build up a multiple alignment in stages or add a new sequence
-to an old alignment. You can calculate phylogenetic trees from
-alignments using menu item 4.
-
-
-
-
- ******************************
- * SEQUENCE INPUT. *
- ******************************
-
-
-All sequences should be in 1 file. Three formats are automatically
-recognised and used: NBRF/PIR, EMBL/SwissProt and FASTA (Pearson and
-Lipman (1988) format).
-
-***
-Users of the Wisconsin GCG package should use the command TONBRF
-(recently changed to TOPIR) to reformat their sequences before use.
-***
-
-Sequences can be in upper or lower case. For proteins, the only
-symbols recognised are: A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y and
-for DNA/RNA use: A,C,G and T (or U). Any other letters of the
-alphabet will be treated as X (proteins) or N (DNA/RNA) for unknown.
-All other symbols (blanks, digits etc.) will be ignored EXCEPT for
-the hyphen "-" which can be used to specify a gap. This last point
-is especially useful for 2 reasons: 1) you can fix the positions of
-some gaps in advance; 2) the alignment output from this program can
-be written out in NBRF format using "-"'s to specify gaps; these
-alignments can be used again as input, either for profile alignments
-or for phylogenetic trees.
-
-If you are using an editor to create sequence files, use the FASTA
-format as it is by far the simplest (see below). If you have access
-to utility programs for generating/converting the NBRF/PIR format
-then use it in preference.
-
-
-
-FASTA (PEARSON AND LIPMAN, 1988) FORMAT: The sequences are
-delimited by an angle bracket ">" in column 1. The text immediately
-after the ">" is used as a title. Everything on the following line
-until the next ">" or the end of the file is one sequence.
-
-e.g.
-
-> RABSTOUT rabbit Guinness receptor
- LKMHLMGHLKMGLKMGLKGMHLMHLKHMHLMTYTYTTYRRWPLWMWLPDFGHAS
- ADSCVCAHGFAVCACFAHFDVCFGAVCFHAVCFAHVCFAAAVCFAVCAC
-> MUSNOSE mouse nose drying factor
- mhkmmhkgmkhmhgmhmhglhmkmhlkmgkhmgkmkytytytryrwtqtqwtwyt
- fdgfdsgafdagfdgfsagdfavdfdvgavfsvfgvdfsvdgvagvfdv
-> HSHEAVEN human Guinness receptor repeat
- mhkmmhkgmkhmhgmhmhg lhmkmhlkmgkhmgkmk ytytytryrwtqtqwtwyt
- fdgfdsgafdagfdgfsag dfavdfdvgavfsvfgv dfsvdgvagvfdv
- mhkmmhkgmkhmhgmhmhg lhmkmhlkmgkhmgkmk ytytytryrwtqtqwtwyt
- fdgfdsgafdagfdgfsag dfavdfdvgavfsvfgv dfsvdgvagvfdv
-
-
-
-NBRF/PIR FORMAT is similar to FASTA format but immediately
-after the ">", you find the characters "P1;" if the sequences are
-protein or "DL;" if they are nucleic acid. Clustalv looks for the
-";" character as the third character after the ">". If it finds one
-it assumes that the format is NBRF if not, FASTA format is assumed.
-The text after the ";" is treated as a sequence name while the
-entire next line is treated as a title. The sequence is terminated
-by a star "*" and the next sequence can then begin (with a >P1; etc
-). This is just the basic format description (there are other
-variations and rules).
-
-ANY files/sequences in GCG format can be converted to this format
-using the TONBRF command (now TOPIR) of the Wisconsin GCG package.
-
-
-e.g.
-
->P1;RABSTOUT
-rabbit Guinness receptor
-LKMHLMGHLKMGLKMGLKGMHLMHLKHMHLMTYTYTTYRRWPLWMWLPDFGHAS
-ADSCVCAHGFAVCACFAHFDVCFGAVCFHAVCFAHVCFAAAVCFAVCAC*
->P1;MUSNOSE
-mouse nose drying factor
-mhkmmhkgmkhmhgmhmhglhmkmhlkmgkhmgkmkytytytryrwtqtqwtwyt
-fdgfdsgafdagfdgfsagdfavdfdvgavfsvfgvdfsvdgvagvfd
-*
->P1;HSHEAVEN
-human Guinness receptor repeat protein.
-mhkmmhkgmkhmhgmhmhg lhmkmhlkmgkhmgkmk ytytytryrwtqtqwtwyt
-fdgfdsgafdagfdgfsag dfavdfdvgavfsvfgv dfsvdgvagvfdv
-mhkmmhkgmkhmhgmhmhg lhmkmhlkmgkhmgkmk ytytytryrwtqtqwtwyt
-fdgfdsgafdagfdgfsag dfavdfdvgavfsvfgv dfsvdgvagvfdv*
-
-
-
-
-EMBL/SWISSPROT FORMAT: Do not try to create files with this
-format unless you have utilities to help. If you are just using an
-editor, use one of the above formats. If you do use this format,
-the program will ignore everything between the ID line (line
-beginning with the characters "ID") and the SQ line. The sequence
-is then read from between the SQ line and the "//" characters.
-
-
-
-It is critically important for the program to know whether or not it
-is aligning DNA or protein sequences. The input routines attempt to
-guess which type of sequence is being used by counting the number of
-A,C,G,T or U's in the sequences. If the total is more than 85% of
-the sequence length then DNA is assumed. If you use very bizarre
-sequences (proteins with really strange aa compositions or DNA
-sequences with loads of strange ambiguity codes) you might confuse
-the program. It is difficult to do but be careful.
-
-
-
-
-
- ******************************
- * MULTIPLE ALIGNMENT MENU. *
- ******************************
-
-The multiple alignment menu is shown below. Before explaining how
-to use it, you must be introduced briefly to the alignment strategy.
-If you do not follow this, try using option 1 anyway; the entire
-process will be carried out automatically.
-
-To do a complete multiple alignment, we need to know the approximate
-relationships of the sequences to each other (which ones are most
-similar to each other). We do this by calculating a crude
-phylogenetic tree which we call a dendrogram (to distinguish it from
-the more sensitive trees available under the phylogenetic tree
-menu). This dendrogram is used as a guide to align bigger and
-bigger groups of sequences during the multiple alignment. The
-dendrogram is calculated in 2 stages: 1) all pairs of sequence are
-compared using the fast/approximate method of Wilbur and Lipman
-(1983); the result of each comparison is a similarity score. 2) the
-similarity scores are used to construct the dendrogram using the
-UPGMA cluster analysis method of Sneath and Sokal (1973).
-
-The construction of the dendrogram can be very time consuming if you
-wish to align many sequences (e.g. for 100 sequences you need to
-carry out 100x99/2 sequence comparisons = 4950). During every
-multiple alignment, a dendrogram is constructed and saved to a file
-(something.dnd). These can be reused later.
-
-
-
-
-
-
-
-
-******Multiple*Alignment*Menu******
-
-
- 1. Do complete multiple alignment now
- 2. Produce dendrogram file only
- 3. Use old dendrogram file
- 4. Pairwise alignment parameters
- 5. Multiple alignment parameters
- 6. Output format options
-
- S. Execute a system command
- H. HELP
- or press [RETURN] to go back to main menu
-
-
-Your choice:
-
-
-So, if in doubt, and you have already loaded some sequences from the
-main menu, just try option 1 and press the <Return> key in response
-to any questions. You will be prompted for 2 file names e.g. if the
-sequence input file was called DRINK.PEP, you will be offered
-DRINK.ALN as the file to contain the alignment and DRINK.DND for the
-dendrogram.
-
-If you wish to repeat a multiple alignment (e.g. to experiment with
-different gap penalties) but do not wish to make a dendrogram all
-over again use menu item 3 (providing you are using the same
-sequences). Similarly, menu item 2 allows you to produce the
-dendrogram file only.
-
-
-
-
-PAIRWISE ALIGNMENT PARAMETERS:
-
-The parameters that control the initial fast/approximate comparisons
-can be set from menu item 4 which looks like:
-
-
- ********* WILBUR/LIPMAN PAIRWISE ALIGNMENT PARAMETERS *********
-
-
- 1. Toggle Scoring Method :Percentage
- 2. Gap Penalty :3
- 3. K-tuple :1
- 4. No. of top diagonals :5
- 5. Window size :5
-
- H. HELP
-
-
-Enter number (or [RETURN] to exit):
-
-
-
-The similarity scores are calculated from fast alignments generated
-by the method of Wilbur and Lipman (1983). These are 'hash' or
-'word' or 'k-tuple' alignments carried out in 3 stages.
-
-First you mark the positions of every fragment of sequence, K-tuple
-long (for proteins, the default length is 1 residue, for DNA it is 2
-bases) in both sequences. Then you locate all k-tuple matches
-between the 2 sequences. At this stage you have to imagine a dot-
-matrix plot between the 2 sequences with each k-tuple match as a
-dot. You find those diagonals in the plot with most matches (you
-take the "No. of top diagonals" best ones) and mark all diagonals
-within "Window size" of each top diagonal. This process will define
-diagonal bands in the plot where you hope the most likely regions of
-similarity will lie.
-
-The final alignment stage is to find that head to tail arrangement
-of k-tuple matches from these diagonal regions that will give the
-highest score. The score is calculated as the number of exactly
-matching residues in this alignment minus a "gap penalty" for every
-gap that was introduced. When you toggle "Scoring method" you
-choose between expressing these similarity scores as raw scores or
-expressed as a percentage of the shorter sequence length.
-
-K-TUPLE SIZE: Can be 1 or 2 for proteins; 1 to 4 for DNA.
-Increase this to increase speed; decrease to improve sensitivity.
-
-GAP PENALTY: The number of matching residues that must be found
-in order to introduce a gap. This should be larger than K-Tuple
-Size. This has little effect on speed or sensitivity.
-
-NO. OF TOP DIAGONALS: The number of best diagonals in the
-imaginary dot-matrix plot that are considered. Decrease (must be
-greater than zero) to increase speed; increase to improve
-sensitivity.
-
-WINDOW SIZE: The number of diagonals around each "top" diagonal
-that are considered. Decrease for speed; increase for greater
-sensitivity.
-
-SCORING METHOD: The similarity scores may be expressed as raw scores
-(number of identical residues minus a "gap penalty" for each gap) or
-as percentage scores. If the sequences are of very different
-lengths, percentage scores make more sense.
-
-
-
-CHANGING THE PAIRWISE ALIGNMENT PARAMETERS
-
-The main reason for wanting to change the above parameters is SPEED
-(especially on microcomputers), NOT SENSITIVITY. The dendrograms
-that are produced can only show the relationships between the
-sequences APPROXIMATELY because the similarity scores are calculated
-from separate pairwise alignments; not from a multiple alignment
-(that is what we eventually hope to produce). If the groupings of
-the sequences are "obvious", the above method should work well; if
-the relationships are obscure or weakly represented by the data, it
-will not make much difference playing with the parameters. The main
-factor influencing speed is the K-TUPLE SIZE followed by the WINDOW
-SIZE.
-
-The alignments are carried out in a small amount of memory.
-Occasionally (it is hard to predict), you will run out of memory
-while doing these alignments; when this happens, it will say on the
-screen: "Sequences (a,b) partially aligned" (instead of "Sequences
-(a,b) aligned"). This means that the alignment score for these
-sequences will be approximate; it is not a problem unless many of
-the alignments do this. It can be fixed by using less sensitive
-parameters or increasing parameter FSIZE in clustalv.h .
-
-
-THE DENDROGRAM ITSELF
-
-The similarity scores generated by the fast comparison of all the
-sequences are used to construct a dendrogram by the UPGMA method of
-Sneath and Sokal (1973). This is a form of cluster analysis and the
-end result produces something that looks like a tree. It represents
-the similarity of the sequences as a hierarchy. The dendrogram is
-written to a file in a machine readable format and is shown below
-for an example with 6 sequences.
-
-
- 91.0 0 0 2 012000 ! seq 2 joins seq 3 at 91% ID.
- 72.0 1 0 3 011200 ! seq 4 joins seqs 2,3 at 72%
- 71.1 0 0 2 000012 ! seq 5 joins seq 6 at 71%
- 35.5 0 2 4 122200 ! seq 1 joins seqs 2,3,4
- 21.7 4 3 6 111122 ! seqs 1,2,3,4 join seqs 5,6
-
-This LOOKS complicated but you do not normally need to care what is
-in here. Anyway, each row represents the joining together of 2 or
-more sequences. You progress from the top down, joining more and
-more sequences until all are joined together; for N sequences you
-have N-1 groupings hence there are 5 rows in the above file (there
-were 6 sequences). In each row, the first number is the similarity
-score of this grouping; ignore the next three columns for the
-moment; the last 6 digits in the line show which sequences are
-grouped; there is one digit for each sequence (the first digit is
-for the first sequence). The rule is: in each row, all of the "1"s
-join all of the "2"s; the zeros do nothing.
-
-Hence, in the first row, sequence 2 joins sequence 3 at a similarity
-level of 91% identity; next, sequence 4 joins the previous grouping
-of 2 plus 3 at a level of 72% etc. This is shown diagrammatically
-below. Before leaving the dendrogram format, the other 3 columns of
-numbers are: a pointer to the row from which the "1" sequences were
-last joined (or zero if only one of them); a pointer to the row in
-which the "2"s were last joined; the total number of sequences
-joined in this line.
-
-
-
-
- I------ 2
- I------I
- I I------ 3 Diagram of the sequence similarity
- I----I
- I I------------- 4 relationships shown in the above
- I--I
- I I------------------ 1 dendrogram file (branch lengths are
- ----I
- I I------------- 5 not to scale).
- I-------I
- I------------- 6
-
-
-
-
-
-
-
-
-
-MULTIPLE ALIGNMENT PARAMETERS:
-
-
-Having calculated a dendrogram between a set of sequences, the final
-multiple alignment is carried out by a series of alignments of
-larger and larger groups of sequences. The order is determined by
-the dendrogram so that the most similar sequences get aligned first.
-Any gaps that are introduced in the early alignments are fixed.
-When two groups of sequences are aligned against each other, a full
-protein weight matrix (such as a Dayhoff PAM 250) is used. Two gap
-penalties are offered: a "FIXED" penalty for opening up a gap and a
-"FLOATING" penalty for extending a gap.
-
-
- ********* MULTIPLE ALIGNMENT PARAMETERS *********
-
-
- 1. Fixed Gap Penalty :10
- 2. Floating Gap Penalty :10
- 3. Toggle Transitions (DNA):Weighted
- 4. Protein weight matrix :PAM 250
-
- H. HELP
-
-
-Enter number (or [RETURN] to exit):
-
-
-FIXED GAP PENALTY: Reduce this to encourage gaps of all sizes;
-increase it to discourage them. Terminal gaps are penalised same
-as all others. BEWARE of making this too small (approx 5 or so); if
-the penalty is too small, the program may prefer to align each
-sequence opposite one long gap.
-
-FLOATING GAP PENALTY: Reduce this to encourage longer gaps;
-increase it to shorten them. Terminal gaps are penalised same as
-all others. BEWARE of making this too small (approx 5 or so); if
-the penalty is too small, the program may prefer to align each
-sequence opposite one long gap.
-
-
-DNA TRANSITIONS = WEIGHTED or UNWEIGHTED: By default, transitions
-(A versus G; C versus T) are weighted more strongly than
-transversions (an A aligned with a G will be preferred to an A
-aligned with a C or a T). You can make all pairs of nucleotide
-equally weighted with this option.
-
-PROTEIN WEIGHT MATRIX: For protein comparisons, a weight matrix is
-used to differentially weight different pairs of aligned amino
-acids. The default is the well known Dayhoff PAM 250 matrix. We
-also offer a PAM 100 matrix, an identity matrix (all weights are the
-same for exact matches) or allow you to give the name of a file with
-your own matrix. The weight matrices used by Clustal V are shown in
-full in the Algorithms and References section of this documentation.
-
-If you input a matrix from a file, it must be in the following
-format. Use a 20x20 matrix only (entries for the 20 "normal" amino
-acids only; no ambiguity codes etc.). Input the lower left triangle
-of the matrix, INCLUDING the diagonal. The order of the amino acids
-(rows and columns) must be: CSTPAGNDEQHRKMILVFYW. The values can be
-in free format separated by spaces (not commas). The PAM 250 matrix
-is shown below in this format.
-
- 12
- 0 2
- -2 1 3
- -3 1 0 6
- -2 1 1 1 2
- -3 1 0 -1 1 5
- -4 1 0 -1 0 0 2
- -5 0 0 -1 0 1 2 4
- -5 0 0 -1 0 0 1 3 4
- -5 -1 -1 0 0 -1 1 2 2 4
- -3 -1 -1 0 -1 -2 2 1 1 3 6
- -4 0 -1 0 -2 -3 0 -1 -1 1 2 6
- -5 0 0 -1 -1 -2 1 0 0 1 0 3 5
- -5 -2 -1 -2 -1 -3 -2 -3 -2 -1 -2 0 0 6
- -2 -1 0 -2 -1 -3 -2 -2 -2 -2 -2 -2 -2 2 5
- -6 -3 -2 -3 -2 -4 -3 -4 -3 -2 -2 -3 -3 4 2 6
- -2 -1 0 -1 0 -1 -2 -2 -2 -2 -2 -2 -2 2 4 2 4
- -4 -3 -3 -5 -4 -5 -4 -6 -5 -5 -2 -4 -5 0 1 2 -1 9
- 0 -3 -3 -5 -3 -5 -2 -4 -4 -4 0 -4 -4 -2 -1 -1 -2 7 10
- -8 -2 -5 -6 -6 -7 -4 -7 -7 -5 -3 2 -3 -4 -5 -2 -6 0 0 17
-
-Values must be integers and can be all positive or positive and
-negative as above. These are SIMILARITY values.
-
-
-
-
-ALIGNMENT OUTPUT OPTIONS:
-
-By default, the alignment goes to a file in a self explanatory
-"blocked" alignment format. This format is fine for displaying the
-results but requires heavy editing if you wish to use the alignment
-with other software. To help, we provide 3 other formats which can
-be turned on or off. If you have a sequence data set or alignment
-in memory, you can also ask for output files in whatever formats are
-turned on, NOW. The menu you use to choose format is shown below.
-
-***
-We draw your attention to NBRF/PIR format in particular. This
-format is EXACTLY the same as one of the input formats. Therefore,
-alignments written in this format can be used again as input (to the
-profile alignments or phylogenetic trees).
-***
-
-
- ********* Format of Alignment Output *********
-
-
- 1. Toggle CLUSTAL format output = ON
- 2. Toggle NBRF/PIR format output = OFF
- 3. Toggle GCG format output = OFF
- 4. Toggle PHYLIP format output = OFF
-
- 5. Create alignment output file(s) now?
- H. HELP
-
-
-Enter number (or [RETURN] to exit):
-
-
-
-CLUSTAL FORMAT: This is a self explanatory alignment. The
-alignment is written out in blocks. Identities are highlighted and
-(if you use a PAM 250 matrix) positions in the alignment where all
-of the residues are "similar" to each other (PAM 250 score of 8 or
-more) are indicated.
-
-NBRF/PIR FORMAT: This is the usual NBRF/PIR format with gaps
-indicated by hyphens ("-"). As we have stressed before, this format
-is EXACTLY compatible with the sequence input format. Therefore you
-can read in these alignments again for profile alignments or for
-calculating phylogenetic trees.
-
-GCG FORMAT: In version 7 of the Wisconsin GCG package, a new
-multiple sequence format was introduced. This is the MSF (Multiple
-Sequence Format) format. It can be used as input to the GCG
-sequence editor or any of the GCG programs that make use of multiple
-alignments. THIS FORMAT IS ONLY SUPPORTED IN VERSION 7 OF THE GCG
-PACKAGE OR LATER.
-
-PHYLIP FORMAT: This format can be used by the Phylip package of
-Joe Felsenstein (see the references/algorithms section for details
-of how to get it). Phylip allows you to do a huge range of
-phylogenetic analyses (we just offer one method in this program) and
-is probably the most widely used set of programs for drawing trees.
-It also works on just about every computer you can think of,
-providing you have a decent Pascal compiler.
-
-
-
-
-
- ******************************
- * PROFILE ALIGNMENT MENU. *
- ******************************
-
-
-
-This menu is for taking two old alignments (or single sequences) and
-aligning them with each other. The result is one bigger alignment.
-The menu is very similar to the multiple alignment menu except that
-there is no mention of dendrograms here (they are not needed) and
-you need to input two sets of sequences. The menu looks like this:
-
-
-
-******Profile*Alignment*Menu******
-
-
- 1. Input 1st. profile/sequence
- 2. Input 2nd. profile/sequence
- 3. Do alignment now
- 4. Alignment parameters
- 5. Output format options
-
- S. Execute a system command
- H. HELP
- or press [RETURN] to go back to main menu
-
-
-Your choice:
-
-
-You must input profile number 1 first. When both profiles are
-loaded, use item 3 (Do alignment now) and the 2 profiles will be
-aligned. Items 4 and 5 (parameters and output options) are
-identical to the equivalent options on the multiple alignment menu.
-
-The same input routines that are used for general input are used
-here i.e. sequences must be in NBRF/PIR, EMBL/SwissProt or FASTA
-format, with gaps indicated by hyphens ("-"). This is why we have
-continually drawn your attention to the NBRF/PIR format as a useful
-output format.
-
-Either profile can consist of just one sequence. Therefore, if you
-have a favourite alignment of sequences that you are working on and
-wish to add a new sequence, you can use this menu, provided the
-alignment is in the correct format.
-
-The total number of sequences in the two profiles must be less
-than or equal to the MAXN parameter set in the clustalv.h header
-file.
-
-
-
-
-
-
-
-
-
-
-
- ******************************
- * PHYLOGENETIC TREE MENU. *
- ******************************
-
-
-This menu allows you to input an alignment and calculate a
-phylogenetic tree. You can also calculate a tree if you have just
-carried out a multiple alignment and the alignment is still in
-memory. THE SEQUENCES MUST BE ALIGNED ALREADY!!!!!! The tree will
-look strange if the sequences are not already aligned. You can also
-"BOOTSTRAP" the tree to show confidence levels for groupings. This
-is SLOW on microcomputers but works fine on workstations or
-mainframes.
-
-
-
-******Phylogenetic*tree*Menu******
-
-
- 1. Input an alignment
- 2. Exclude positions with gaps? = OFF
- 3. Correct for multiple substitutions? = OFF
- 4. Draw tree now
- 5. Bootstrap tree
-
- S. Execute a system command
- H. HELP
- or press [RETURN] to go back to main menu
-
-
-Your choice:
-
-
-
-
-The same input routine that is used for general input is used here
-i.e. sequences must be in NBRF/PIR, EMBL/SwissProt or FASTA format,
-with gaps indicated by hyphens ("-"). This is why we have
-continually drawn your attention to the NBRF/PIR format as a useful
-output format.
-
-If you have input an alignment, then just use item 4 to draw a tree.
-The method used is the Neighbor Joining method of Saitou and Nei
-(1987). This is a "distance method". First, percent divergence
-figures are calculated between all pairs of sequence. These
-divergence figures are then used by the NJ method to give the tree.
-Example trees will be shown below.
-
-There are two options which can be used to control the way the
-distances are calculated. These are set by options 2 and 3 in the
-menu.
-
-EXCLUDE POSITIONS WITH GAPS? This option allows you to ignore all
-alignment positions (columns) where there is a gap in ANY sequence.
-This guarantees that "like" is compared with "like" in all distances
-i.e. the same positions are used to calculate all distances. It
-also means that the distances will be "metric". The disadvantage of
-using this option is that you throw away much of the data if there
-are many gaps. If the total number of gaps is small, it has little
-effect.
-
-CORRECT FOR MULTIPLE SUBSTITUTIONS? As sequences diverge,
-substitutions accumulate. It becomes increasingly likely that more
-than one substitution (as a result of a mutation) will have happened
-at a site where you observe just one difference now. This option
-allows you to use formulae developed by Motoo Kimura to correct for
-this effect. It has the effect of stretching long branches in trees
-while leaving short ones relatively untouched. The desired effect
-is to try and make distances proportional to time since divergence.
-
-The tree is sent to a file called BLAH.NJ, where BLAH.SEQ is the
-name of the input, alignment file. An example is shown below for 6
-globin sequences.
-
-
-
- DIST = percentage divergence (/100)
- Length = number of sites used in comparison
-
- 1 vs. 2 DIST = 0.5683; length = 139
- 1 vs. 3 DIST = 0.5540; length = 139
- 1 vs. 4 DIST = 0.5315; length = 111
- 1 vs. 5 DIST = 0.7447; length = 141
- 1 vs. 6 DIST = 0.7571; length = 140
- 2 vs. 3 DIST = 0.0897; length = 145
- 2 vs. 4 DIST = 0.1391; length = 115
- 2 vs. 5 DIST = 0.7517; length = 145
- 2 vs. 6 DIST = 0.7431; length = 144
- 3 vs. 4 DIST = 0.0957; length = 115
- 3 vs. 5 DIST = 0.7379; length = 145
- 3 vs. 6 DIST = 0.7361; length = 144
- 4 vs. 5 DIST = 0.7304; length = 115
- 4 vs. 6 DIST = 0.7368; length = 114
- 5 vs. 6 DIST = 0.2697; length = 152
-
-
- Neighbor-joining Method
-
- Saitou, N. and Nei, M. (1987) The Neighbor-joining Method:
- A New Method for Reconstructing Phylogenetic Trees.
- Mol. Biol. Evol., 4(4), 406-425
-
-
- This is an UNROOTED tree
-
- Numbers in parentheses are branch lengths
-
-
- Cycle 1 = SEQ: 5 ( 0.13382) joins SEQ: 6 ( 0.13592)
-
- Cycle 2 = SEQ: 1 ( 0.28142) joins Node: 5 ( 0.33462)
-
- Cycle 3 = SEQ: 2 ( 0.05879) joins SEQ: 3 ( 0.03086)
-
- Cycle 4 (Last cycle, trichotomy):
-
- Node: 1 ( 0.20798) joins
- Node: 2 ( 0.02341) joins
- SEQ: 4 ( 0.04915)
-
-
-
-The output file first shows the percent divergence (distance)
-figures between each pair of sequence. Then a description of a NJ
-tree is given. This description shows which sequences (SEQ:) or
-which groups of sequences (NODE: , a node is numbered using the
-lowest sequence that belongs to it) join at each level of the tree.
-
-This is an unrooted tree!! This means that the direction of
-evolution through the tree is not shown. This can only be inferred
-in one of two ways:
-1) assume a degree of constancy in the molecular clock and place the
-root (bottom of the tree; the point where all the sequences radiate
-from) half way along the longest branch. **OR**
-2) use an "outgroup", a sequence from an organism that you "know"
-must be outside of the rest of the sequences i.e. root the tree
-manually, on biological grounds.
-
-The above tree can be represented diagrammatically as follows:
-
-
- SEQ 1 SEQ 4
- I I
- 13.6 I 28.1 I 4.9 5.9
- SEQ 6 ----------I I I I--------- SEQ 2
- I I I I
- I--------I-----------I----------I
- 13.4 I 33.5 20.8 2.3 I 3.1
- SEQ 5 ----------I I--------- SEQ 3
-
-
-The figures along each branch are percent divergences along that
-branch. If you root the tree by placing the root along the longest
-branch (33.5%) then you can draw it again as follows, this time
-rooted:
-
-
-
- 13.6
- I-------------------- SEQ 6
- I---------I 13.4
- I I-------------------- SEQ 5
- I 33.5
- -----I 28.1
- I I-------------------- SEQ 1
- I I
- I---------I 4.9
- I 20.8 I----------- SEQ 4
- I--------I
- I 5.9
- I 2.3 I----- SEQ 2
- I-----I 3.1
- I----- SEQ 3
-
-
-
-The longest branch (33.5% between 5,6 and 1,2,3,4) is split between
-the 2 bottom branches of the tree. As it happens in this particular
-case, sequences 5 and 6 are myoglobins while sequences 1,2,3 and 4
-are alpha and beta globins, so you could also justify the above
-rooting on biological grounds. If you do not have any particular
-need or evidence for the position of the root, then LEAVE THE TREE
-UNROOTED. Unrooted trees do not look as pretty as rooted ones but
-it is usual to leave them unrooted if you do not have any evidence
-for the position of the root.
-
-
-BOOTSTRAPPING: Different sets of sequences and different tree
-drawing methods may give different topologies (branching orders) for
-parts of a tree that are weakly supported by the data. It is useful
-to have an indication of the degree of error in the tree. There are
-several ways of doing this, some of them rather technical. We
-provide one general purpose method in this program, which makes use
-of a technique called bootstrapping (see Felsenstein, 1985).
-
-In the case of sequence alignments, bootstrapping involves taking
-random samples of positions from the alignment. If the alignment
-has N positions, each bootstrap sample consists of a random sample
-of N positions, taken WITH REPLACEMENT i.e. in any given sample,
-some sites may be sampled several times, others not at all. Then,
-with each sample of sites, you calculate a distance matrix as usual
-and draw a tree. If the data very strongly support just one tree
-then the sample trees will be very similar to each other and to the
-original tree, drawn without bootstrapping. However, if parts of
-the tree are not well supported, then the sample trees will vary
-considerably in how they represent these parts.
-
-In practice, you should use a very large number of bootstrap
-replicates (1000 is recommended, even if it means running the
-program for an hour on a slow microcomputer; on a workstation it
-will be MUCH faster). For each grouping on the tree, you record the
-number of times this grouping occurs in the sample trees. For a
-group to be considered "significant" at the 95% level (or P <= 0.05
-in statistical terms) you expect the grouping to show up in >= 95%
-of the sample trees. If this happens, then you can say that the
-grouping is significant, given the data set and the method used to
-draw the tree.
-
-So, when you use the bootstrap option, a NJ tree is drawn as before
-and then you are asked to say how many bootstrap samples you want
-(1000 is the default) and you are asked to give a seed number for
-the random number generator. If you give the same seed number in
-future, you will get the same results (we hope). Remember to give
-different seed numbers if you wish to carry out genuinely different
-bootstrap sampling experiments. Below is the output file from using
-the same data for the 6 globin sequences as used before. The output
-file has the same name as the input file with the extension ".njb".
-
-//
-STUFF DELETED .... same as for the ordinary NJ output
-//
- Bootstrap Confidence Limits
-
-
- Random number generator seed = 99
-
- Number of bootstrap trials = 1000
-
-
- Diagrammatic representation of the above tree:
-
- Each row represents 1 tree cycle; defining 2 groups.
-
- Each column is 1 sequence; the stars in each line show 1 group;
- the dots show the other
-
- Numbers show occurences in bootstrap samples.
-
-****.. 1000
-.***.. 1000 <- This is the answer!!
-*..*** 812
-122311
-
-
-For an unrooted tree with N sequences, there are actually only N-3
-genuinely different groupings that we can test (this is the number
-of "internal branches"; each internal branch splits the sequences
-into 2 groups). In this example, we have 6 sequences with 3
-internal branches in the reference tree. In the bootstrap
-resampling, we count how often each of these internal branches
-occur. Here, we find that the branch which splits 1,2,3 and 4
-versus 5 and 6 occurs in all 1000 samples; the branch which splits
-2,3 and 4 versus 1,5 and 6 occurs in 1000; the branch which splits 2
-and 3 versus 1,4,5 and 6 occurs in 812/1000 samples. We can put
-these figures on to the diagrammatic representation we made earlier
-of our unrooted NJ tree as follows:
-
-
-
- SEQ 1 SEQ 4
- I I
- I I
- SEQ 6 ----------I I I I--------- SEQ 2
- I 1000 I 1000 I 812 I
- I--------I-----------I----------I
- I I
- SEQ 5 ----------I I--------- SEQ 3
-
-
-
-You can equally put these confidence figures on the rooted tree (in
-fact the interpretation is simpler with rooted trees). With the
-unrooted tree, the grouping of sequence 5 with 6 is significant (as
-is the grouping of sequences 1,2,3 and 4). Equally the grouping of
-sequences 1,5 and 6 is significant (the same as saying that 2,3 and
-4 group significantly). However, the grouping of 2 and 3 is not
-significant, although it is relatively strongly supported.
-
-Unfortunately, there is a small complication in the interpretation
-of these results. In statistical hypothesis testing, it is not
-valid to make multiple simultaneous tests and to treat the result of
-each test completely independently. In the above case, if you have
-one particular test (grouping) that you wish to make in advance, it
-is valid to test IT ALONE and to simply show the other bootstrap
-figures for reference. If you do not have any particular test in
-mind before you do the bootstrapping, you can just show all of the
-figures and use the 95% level as an ARBITRARY cut off to show those
-groups that are very strongly supported; but not mention anything
-about SIGNIFICANCE testing. In the literature, it is common
-practice to simply show the figures with a tree; they frequently
-speak for themselves.
-
-
-
-*******************************************************************
-
- 4. Command Line Interface.
-
-
-
-You can do almost everything that can be done from the menus, using
-a command line interface. In this mode, the program will take all of
-its instructions as "switches" when you activate it; no questions
-will be asked; if there are no errors, the program just does an
-analysis and stops. It does not work so well on the MAC but is
-still possible. To get you started we will show you the 2 simplest
-uses of the command line as it looks on VAX/VMS. On all other
-machines (except the MAC) it works in the same way.
-
-$ clustalv /help **OR** $ clustalv /check
-
-Both of the above switches give you a one page summary of the
-command line on the screen and then the program stops.
-
-
-$ clustalv proteins.seq **OR** $ clustalv /infile=proteins.seq
-
-This will read the sequences from the file 'proteins.seq' and do a
-complete multiple alignment. Default parameters will be used, the
-program will try to tell whether or not the sequences are DNA or
-protein and the output will go to a file called 'proteins.aln' . A
-dendrogram file called 'proteins.dnd' will also be created. Thus
-the default action for the program, when it successfully reads in an
-input file is to do a full multiple alignment. Some further
-examples of command line usage will be given later.
-
-Command line switches can be abbreviated but MAKE SURE YOU DO NOT
-MAKE THEM AMBIGUOUS. No attempt will be made to detect ambiguity.
-Use enough characters to distinguish each switch uniquely.
-
-
-
-
-
-
-
-The full list of allowed switches is given below:
-
-
- DATA (sequences)
-
-/INFILE=file.ext :input sequences. If you give an input file and
- nothing else as a switch, the default action is
- to do a complete multiple alignment. The input
- file can also be specified by giving it as the
- first command line parameter with no "/" in
- front of it e.g $ clustalv file.ext .
-
-/PROFILE1=file.ext :You use these two switches to give the names of
-/PROFILE2=file.ext two profiles. The default action is to align
- the two. You must give the names of both profile
- files.
-
-
-
- VERBS (do things)
-
-/HELP :list the command line parameters on the screen.
-/CHECK
-
-/ALIGN :do full multiple alignment. This is the default
- action if no other switches except for input files
- are given.
-
-/TREE :calculate NJ tree. If this is the only action
- specified (e.g. $ clustalv proteins.seq/tree ) it IS
- ASSUMED THAT THE SEQUENCES ARE ALREADY ALIGNED. If
- the sequences are not already aligned, you should
- also give the /ALIGN switch. This will align the
- sequences first, output an alignment file and
- calculate the tree in memory.
-
-/BOOTSTRAP(=n) :bootstrap a NJ tree (n= number of bootstraps;
- default = 1000). If this is the only action
- specified (e.g. $ clustalv proteins.seq/bootstrap )
- it IS ASSUMED THAT THE SEQUENCES ARE ALREADY ALIGNED.
- If the sequences are not already aligned, you should
- also give the /ALIGN switch. This will align the
- sequences first, output an alignment file and
- calculate the bootstraps in memory. You can set the
- number of bootstrap trials here (e.g./bootstrap=500).
- You can set the seed number for the random number
- generator with /seed=n.
-
-
-
- PARAMETERS (set things)
-
-***Pairwise alignments:***
-
-/KTUP=n :word size
-
-/TOPDIAGS=n :number of best diagonals
-
-/WINDOW=n :window around best diagonals
-
-/PAIRGAP=n :gap penalty
-
-
-
-***Multiple alignments:***
-
-/FIXEDGAP=n :fixed length gap pen.
-
-/FLOATGAP=n :variable length gap pen.
-
-/MATRIX= :PAM100 or ID or file name. The default weight matrix
- for proteins is PAM 250.
-
-/TYPE=p or d :type is protein or DNA. This allows you to
- explicitly override the program's attempt at guessing
- the type of the sequence. It is only useful if you
- are using sequences with a VERY strange composition.
-
-/OUTPUT= :GCG or PHYLIP or PIR. The default output is
- Clustal format.
-
-/TRANSIT :transitions not weighted. The default is to weight
- transitions as more favourable than other mismatches
- in DNA alignments. This switch makes all nucleotide
- mismatches equally weighted.
-
-
-***Trees:***
-
-/KIMURA :use Kimura's correction on distances.
-
-/TOSSGAPS :ignore positions with a gap in ANY sequence.
-
-/SEED=n :seed number for bootstraps.
-
-
-
-
-EXAMPLES:
-
-These examples use the VAX/VMS $ prompt; otherwise, command-line
-usage is the same on all machines except the Macintosh.
-
-
-$ clustalv proteins.seq OR $ clustalv /infile=proteins.seq
-
-Read whatever sequences are in the file "proteins.seq" and do a full
-multiple alignment; output will go to the files: "proteins.dnd"
-(dendrogram) and "proteins.aln" (alignment).
-
-
-$ clustalv proteins.seq/ktup=2/matrix=pam100/output=pir
-
-Same as last example but use K-Tuple size of 2; use a PAM 100
-protein weight matrix; write the alignment out in NBRF/PIR format
-(goes to a file called "proteins.pir").
-
-
-$ clustalv /profile1=proteins.seq/profile2=more.seq/type=p/fixed=11
-
-Take the alignment in "proteins.seq" and align it with "more.seq"
-using default values for everything except the fixed gap penalty
-which is set to 11. The sequence type is explicitly set to
-PROTEIN.
-
-
-$ clustalv proteins.pir/tree/kimura
-
-Take the sequences in proteins.pir (they MUST BE ALIGNED ALREADY)
-and calculate a phylogenetic tree using Kimura's correction for
-distances.
-
-
-$ clustalv proteins.pir/align/tree/kimura
-
-Same as the previous example, EXCEPT THAT AN ALIGNMENT IS DONE
-FIRST.
-
-
-$ clustalv proteins.seq/align/boot=500/seed=99/tossgaps/type=p
-
-Take the sequences in proteins.seq; they are explicitly set to be
-protein; align them; bootstrap a tree using 500 samples and a seed
-number of 99.
-
-
-*******************************************************************
-
- 5. Algorithms and references.
-
-
-
-In this section, we will try to BRIEFLY describe the algorithms used
-in ClustalV and give references. The topics covered are:
-
-
- -Multiple alignments
-
- -Profile alignments
-
- -Protein weight matrices
-
- -Phylogenetic trees
-
- -distances
-
- -NJ method
-
- -Bootstrapping
-
- -Phylip
-
- -References
-
-
-
-
-
-
-MULTIPLE ALIGNMENTS.
-
-The approach used in ClustalV is a modified version of the method of
-Feng and Doolittle (1987) who aligned the sequences in larger and
-larger groups according to the branching order in an initial
-phylogenetic tree. This approach allows a very useful combination
-of computational tractability and sensitivity.
-
-The positions of gaps that are generated in early alignments remain
-through later stages. This can be justified because gaps that arise
-from the comparison of closely related sequences should not be moved
-because of later alignment with more distantly related sequences.
-At each alignment stage, you align two groups of already aligned
-sequences. This is done using a dynamic programming algorithm where
-one allows the residues that occur in every sequence at each
-alignment position to contribute to the alignment score. A Dayhoff
-(1978) PAM matrix is used in protein comparisons.
-
-The details of the algorithm used in ClustalV have been published in
-Higgins and Sharp (1989). This was an improved version of an
-earlier algorithm published in Higgins and Sharp (1988). First, you
-calculate a crude similarity measure between every pair of sequence.
-This is done using the fast, approximate alignment algorithm of
-Wilbur and Lipman (1983). Then, these scores are used to calculate
-a "guide tree" or dendrogram, which will tell the multiple alignment
-stage in which order to align the sequences for the final multiple
-alignment. This "guide tree" is calculated using the UPGMA method
-of Sneath and Sokal (1973). UPGMA is a fancy name for one type of
-average linkage cluster analysis, invented by Sokal and Michener
-(1958).
-
-Having calculated the dendrogram, the sequences are aligned in
-larger and larger groups. At each alignment stage, we use the
-algorithm of Myers and Miller (1988) for the optimal alignments.
-This algorithm is a very memory efficient variation of Gotoh's
-algorithm (Gotoh, 1982). It is because of this algorithm that
-ClustalV can work on microcomputers. Each of these alignments
-consists of aligning 2 alignments, using what we call "profile
-alignments".
-
-
-PROFILE ALIGNMENTS.
-
-We use the term "profile alignment" to describe the alignment of 2
-alignments. We use this term because the method is a simple
-extension of the profile method of Gribskov, et al. (1987) for
-aligning 1 sequence with an alignment. Normally, with a 2 sequence
-alignment, you use a weight matrix (e.g. a PAM 250 matrix) to give a
-score between the pairs of aligned residues. The alignment is
-considered "optimal" if it gives the best total score for aligned
-residues minus penalties for any gaps (insertions or deletions) that
-must be introduced.
-
-Profile alignments are a simple extension of 2 sequence alignments
-in that you can treat each of the two input alignments as single
-sequences but you calculate the score at aligned positions as the
-average weight matrix score of all the residues in one alignment
-versus all those in the other e.g. if you have 2 alignments with I
-and J sequences respectively; the score at any position is the
-average of all the I times J scores of the residues compared
-separately. Any gaps that are introduced are placed in all of the
-sequences of an alignment at the same position. The profile
-alignments offered in the "profile alignment menu" are also
-calculated in this way.
-
-
-PROTEIN WEIGHT MATRICES.
-
-There are 3 built-in weight matrices used by clustalV. These are
-the PAM 100 and PAM 250 matrices of Dayhoff (1978) and an identity
-matrix. Each matrix is given as the bottom left half, including the
-diagonal of a 20 by 20 matrix. The order of the rows and columns is
-CSTPAGNDEQHRKMILVFYW.
-
-
-PAM 250
-
-C 12
-S 0 2
-T -2 1 3
-P -3 1 0 6
-A -2 1 1 1 2
-G -3 1 0 -1 1 5
-N -4 1 0 -1 0 0 2
-D -5 0 0 -1 0 1 2 4
-E -5 0 0 -1 0 0 1 3 4
-Q -5 -1 -1 0 0 -1 1 2 2 4
-H -3 -1 -1 0 -1 -2 2 1 1 3 6
-R -4 0 -1 0 -2 -3 0 -1 -1 1 2 6
-K -5 0 0 -1 -1 -2 1 0 0 1 0 3 5
-M -5 -2 -1 -2 -1 -3 -2 -3 -2 -1 -2 0 0 6
-I -2 -1 0 -2 -1 -3 -2 -2 -2 -2 -2 -2 -2 2 5
-L -6 -3 -2 -3 -2 -4 -3 -4 -3 -2 -2 -3 -3 4 2 6
-V -2 -1 0 -1 0 -1 -2 -2 -2 -2 -2 -2 -2 2 4 2 4
-F -4 -3 -3 -5 -4 -5 -4 -6 -5 -5 -2 -4 -5 0 1 2 -1 9
-Y 0 -3 -3 -5 -3 -5 -2 -4 -4 -4 0 -4 -4 -2 -1 -1 -2 7 10
-W -8 -2 -5 -6 -6 -7 -4 -7 -7 -5 -3 2 -3 -4 -5 -2 -6 0 0 17
-----------------------------------------------------------------
- C S T P A G N D E Q H R K M I L V F Y W
-
-
-IDENTITY MATRIX
-
-10
- 0 10
- 0 0 10
- 0 0 0 10
- 0 0 0 0 10
- 0 0 0 0 0 10
- 0 0 0 0 0 0 10
- 0 0 0 0 0 0 0 10
- 0 0 0 0 0 0 0 0 10
- 0 0 0 0 0 0 0 0 0 10
- 0 0 0 0 0 0 0 0 0 0 10
- 0 0 0 0 0 0 0 0 0 0 0 10
- 0 0 0 0 0 0 0 0 0 0 0 0 10
- 0 0 0 0 0 0 0 0 0 0 0 0 0 10
- 0 0 0 0 0 0 0 0 0 0 0 0 0 0 10
- 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 10
- 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 10
- 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 10
- 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 10
- 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 10
-
-
-
-
-
-PAM 100
-
- 14
- -1 6
- -5 2 7
- -6 1 -1 10
- -5 2 2 1 6
- -8 1 -3 -3 1 8
- -8 2 0 -3 -1 -1 7
--11 -1 -2 -4 -1 -1 4 8
--11 -2 -3 -3 0 -2 1 5 8
--11 -3 -3 -1 -2 -5 -1 1 4 9
- -6 -4 -5 -2 -5 -7 2 -1 -2 4 11
- -6 -1 -4 -2 -5 -8 -3 -6 -5 1 1 10
--11 -2 -1 -4 -4 -5 1 -2 -2 -1 -3 3 8
--11 -4 -2 -6 -3 -8 -5 -8 -6 -2 -7 -2 1 13
- -5 -4 -1 -6 -3 -7 -4 -6 -5 -5 -7 -4 4 2 9
--12 -7 -5 -5 -5 -8 -6 -9 -7 -3 -5 -7 -6 4 2 9
- -4 -4 -1 -4 0 -4 -5 -6 -5 -5 -6 -6 -6 1 5 1 8
--10 -5 -6 -9 -7 -8 -6 -11 -11 -10 -4 -7-11 -2 0 0 -5 12
- -2 -6 -6 -11 -6 -11 -3 -9 -7 -9 -1-10-10 -8 -4 -5 -6 6 13
--13 -4 -10 -11 -11 -13 -8 -13 -14 -11 -7 1 -9-11-12 -7-14 -2 -2 19
-
-
-
-
-PHYLOGENETIC TREES.
-
-There are two COMMONLY used approaches for inferring phylogenetic
-trees from sequence data: parsimony and distance methods. There are
-other approaches which are probably superior in theory but which are
-yet to be used widely. This does not mean that they are no use; we
-(the authors of this program at any rate) simply do not know enough
-about them yet. You should see the documentation accompanying the
-Phylip package and some of the references there for an explanation
-of the different methods and what assumptions are implied when you
-use them.
-
-There is a constant debate in the literature as to the merits of
-different methods but unfortunately, a lot of what is said is
-incomprehensible or inaccurate. It is also a field that is prone to
-having highly opinionated schools of thought. This is a pity
-because it prevents rational discussion of the pros and cons of
-the different methods. The approach adopted in ClustalV is to
-supply just one method and to produce alignments in a format that
-can be used by Phylip. In simple cases, the trees produced will be
-as "good" (reliable, robust) as those from ANY other method. In
-more complicated cases, there is no single magic recipe that we can
-supply that will work well in even most situations.
-
-The method we provide is the Neighbor Joining method (NJ) of Saitou
-and Nei (1987) which is a distance method. We use this for three
-reasons: it is conceptually and computationally simple; it is fast;
-it gives "good" trees in simple cases. It is difficult to prove that
-one tree is "better" than another if you do not know the true
-phylogeny; the few systematic surveys of methods show it to work
-more or less as well as any other method ON AVERAGE. Another reason
-for using the NJ method is that it is very commonly used; THIS IS A
-BAD REASON SCIENTIFICALLY but at least you will not feel lonely if
-you use it.
-
-The NJ method works on a matrix of distances (the distance matrix)
-between all pairs of sequence to be analysed. These distances are
-related to the degree of divergence between the sequences. It is
-normal to calculate the distances from the sequences after they are
-multiply aligned. If you calculate them from separate alignments
-(as done for the dendrograms in another part of this program), you
-may increase the error considerably.
-
-
-DISTANCES
-
-The simplest measure of distance between sequences is percent
-divergence (100% minus percent identity). For two sequences, you
-count how many positions differ between them (ignoring all positions
-with a gap or an unknown residue) and divide by the number of
-positions considered. It is common practice to also ignore all
-positions in the alignment where there is a GAP in ANY of the
-sequences (Tossgaps ? option in the menu). Usually, you express the
-percent distance divided by 100 (gives distances between 0.0 and
-1.0).
-
-This measure of distance is perfectly adequate (with some further
-modification described below) for rRNA sequences. However it treats
-all residues identically e.g. all amino acid substitutions are
-equally weighted. It also treats all positions identically e.g. it
-does not take account of different rates of substitution in
-different positions of different codons in protein coding DNA
-sequences; see Li et al (1985) for a distance measure that does.
-Despite these shortcomings, these percent identity distances do work
-well in practice in a wide variety of situations.
-
-In a simple world, you would like a distance to be proportional to
-the time since the sequences diverged. If this were EXACTLY true,
-then the calculation of the tree would be a simple matter of algebra
-(UPGMA does this for you) and the branch lengths will be nice and
-meaningful (times). In practice this OBVIOUSLY depends on the
-existence and quality of the "molecular clock", a subject of on-
-going debate. However, even if there is a good clock, there is a
-further problem with estimating divergences. As sequences diverge,
-they become "saturated" with mutations. Sites can have
-substitutions more than once. Calculated distances will
-underestimate actual divergence times; the greater the divergence,
-the greater the discrepancy. There are various methods for dealing
-with this and we provide two commonly used ones, both due to Motoo
-Kimura; one for proteins and one for DNA.
-
-
-For distance K (percent divergence /100 ) ...
-
-Correction for Protein distances: (Kimura, 1983).
-
- Corrected K = -ln(1.0 - K - (K * K/5.0))
-
-
-
-Correction for nucleotide distances: Kimura's 2-parameter method
-(Kimura, 1980).
-
- Corrected K = 0.5*ln(a) + 0.25*ln(b)
-
- where a = 1/(1 - 2*P - Q)
- and b = 1/(1 - 2*Q)
-
- P and Q are the proportions of transitions (A<-->G, C<-->T)
- and transversions occurring between the sequences.
-
-
-One paradoxical effect of these corrections, is that distances can
-be corrected to have more than 100% divergence. That is because,
-for very highly diverged sequences of length N, you can estimate
-that more than N substitutions have occurred by correcting the
-observed distance in the above ways. Don't panic!
-
-
-
-NEIGHBOR JOINING TREES.
-
-VERY briefly, the NJ method works as follows. You start by placing
-the sequences in a star topology (no internal branches). You then
-find that internal branch (take 2 sequences; join them; connect them
-to the rest by the internal branch) which when added to the tree
-will minimise the total branch length. The two joined sequences
-(neighbours) are merged into a single sequence and the process is
-repeated. For an unrooted tree with N sequences, there are N-3
-internal branches. The above process is repeated N-3 times to give
-the final tree. The full details are given in Saitou and Nei
-(1987).
-
-As explained elsewhere in the documentation, you can only root the
-tree by one of two methods:
-
-1) assume a degree of constancy in the molecular clock and place the
-root along the longest branch (internal or external). Methods that
-appear to produce rooted trees automatically are often just doing
-this without letting you know; this is true of UPGMA.
-
-2) root the tree on biological grounds. The usual method is to
-include an "outgroup", a sequence that you are certain will branch
-to the outside of the tree.
-
-
-
-BOOTSTRAPPING.
-
-Bootstrapping is a general purpose technique that can be used for
-placing confidence limits on statistics that you estimate without
-any knowledge of the underlying distribution (e.g. a normal or
-poisson distribution). In the case of phylogenetic trees, there are
-several analytical methods for placing confidence limits on
-groupings (actually on the internal branches) but these are either
-restricted to particular tree drawing methods or only work on small
-trees of 4 or 5 sequences. Felsenstein (1985) showed how to use
-bootstrapping to calculate confidence limits on trees. His approach
-is completely general and can be applied to any tree drawing method.
-The main assumption of the method in this context is that the sites
-in the alignment are independent; this will be true of some sequence
-alignments (e.g. pseudogenes) but not others (e.g. rRNAs). What
-effect lack of independence will have on the results is not known.
-
-The method works by taking random samples of data from the complete
-data set. You compute the test statistic (tree in this case) on
-each sample. Variation in the statistic computed from the samples
-gives a measure of variation in the statistic which can be used to
-calculate confidence intervals. Each random sample is the same size
-as the complete data set and is taken WITH REPLACEMENT i.e. a data
-point can be selected more than once (or not at all) in any given
-sample.
-
-In the case of an alignment N residues long, each random sample is a
-random selection of N sites from the alignment. For each sample, we
-calculate a distance matrix and tree in the usual way. Variation in
-the sample trees compared to a tree calculated from the full data
-set gives an indication of how well supported the tree is by the
-data. If the sample trees are very similar to each other and to the
-full tree, then the tree is "strongly" supported; if the sample
-trees show great variation, then the tree will be weakly supported.
-In practice, you usually find some parts of a tree well supported,
-others weakly. This can be seen by counting how often each
-monophyletic group in the full tree occurs in the sample trees.
-
-For a particular grouping, one considers it to be significant at the
-95% level (P <= 0.05) if it occurs in 95% of the bootstrap samples.
-If a grouping is significant, it is significant with respect to the
-particular data set and method used for drawing the tree.
-Biological "significance" is another matter.
-
-
-PHYLIP.
-
-The Phylip package was written by Joe Felsenstein, University of
-Washington, USA. It provides Pascal source code for a large number
-of programs for doing most types of phylogenetic analyses. The
-Phylip format alignments produced by this program can be used by all
-of the Phylip programs, version 3.4 or later (March 1991). It is
-freely available from him as follows.
-
-
-
-================= PHYLIP information sheet =====================
-
- PHYLIP - Phylogeny Inference Package (version 3.3)
-
-This is a FREE package of programs for inferring phylogenies and
-carrying out certain related tasks. At present it contains 28
-programs, which carry out different algorithms on different kinds of
-data. The programs in the package are:
-
- ---------- Programs for molecular sequence data ----------
-PROTPARS Protein parsimony
-DNAPARS Parsimony method for DNA
-DNAMOVE Interactive DNA parsimony
-DNAPENNY Branch and bound for DNA
-DNABOOT Bootstraps DNA parsimony
-DNACOMP Compatibility for DNA
-DNAINVAR Phylogenetic invariants
-DNAML Maximum likelihood method
-DNAMLK DNAML with molecular clock
-DNADIST Distances from sequences
-RESTML ML for restriction sites
-
- ----------- Programs for distance matrix data ------------
-FITCH Fitch-Margoliash and least-squares methods
-KITSCH Fitch-Margoliash and least squares methods with
- evolutionary clock
-
- --- Programs for gene frequencies and continuous characters --
-CONTML Maximum likelihood method
-GENDIST Computes genetic distances
-
- ------------- Programs for discrete state data -----------
-MIX Wagner, Camin-Sokal, and mixed parsimony criteria
-MOVE Interactive Wagner, C-S, mixed parsimony program
-PENNY Finds all most parsimonious trees by branch-and-bound
-BOOT Bootstrap confidence interval on mixed parsimony methods
-DOLLOP, DOLMOVE, DOLPENNY, DOLBOOT same as preceding four
- programs, but for the Dollo and polymorphism parsimony
- criteria
-CLIQUE Compatibility method
-FACTOR recode multistate characters
-
- ---- Programs for plotting trees and consensus trees ----
-DRAWGRAM Draws cladograms and phenograms on screens, plotters and
- printers
-DRAWTREE Draws unrooted phylogenies on screens, plotters and
- printers
-CONSENSE Majority-rule and strict consensus trees
-
-The package includes extensive documentation files that provide the
-information necessary to use and modify the programs.
-
-COMPATIBILITY: The programs are written in a very standard subset of
-Pascal, a language that is available on most computers (including
-microcomputers). The programs require only trivial modifications to
-run on most machines: for example they work with only minor
-modifications with Turbo Pascal, and without modifications on VAX
-VMS Pascal. Pascal source code is distributed in the regular version
-of PHYLIP: compiled object code is not. To use that version, you
-must have a Pascal compiler.
-
-DISKETTE DISTRIBUTION: The package is distributed in a variety of
-microcomputer diskette formats. You should send FORMATTED
-diskettes, which I will return with the package written on them.
-Unfortunately, I cannot write any Apple formats. See below for how
-many diskettes to send. The programs on the magnetic tape or
-electronic network versions may of course also be moved to
-microcomputers using a terminal program.
-
-PRECOMPILED VERSIONS: Precompiled executable programs for PCDOS
-systems are available from me. Specify the "PCDOS executable
-version" and send the number of extra diskettes indicated below.
-An Apple Macintosh version with precompiled code is available from
-Willem Ellis, Instituut voor Taxonomische Zoologie, Zoologisch
-Museum, Universiteit van Amsterdam, Plantage Middenlaan 64, 1018DH
-Amsterdam, Netherlands, who asks that you send 5 800K diskettes.
-
-HOW MANY DISKETTES TO SEND: The following table shows for different
-PCDOS formats how many diskettes to send, and how many extra
-diskettes to send for the PCDOS executable version:
-
-Diskette size Density For source code For executables, send
- in addition
- 3.5 inch 1.44 Mb 2 1
- 5.25 inch 1.2 Mb 2 2
- 3.5 inch 720 Kb 4 2
- 5.25 inch 360 Kb 7 4
-
-Some other formats are also available. You MUST tell me EXACTLY
-which of these formats you need. The diskettes MUST be formatted by
-you before being sent to me. Sending an extra diskette may be
-helpful.
-
-NETWORK DISTRIBUTION: The package is also available by distribution
-of the files directly over electronic networks, and by anonymous ftp
-from evolution.genetics.washington.edu. Contact me by electronic
-mail for details.
-
-TAPE DISTRIBUTION: The programs are also distributed on a magnetic
-tape provided by you (which should be a small tape and need only be
-able to hold two megabytes) in the following format: 9-track, ASCII,
-odd parity, unlabelled, 6250 bpi (unless otherwise indicated).
-Logical record: 80 bytes, physical record: 3200 bytes (i.e. blocking
-factor 40). There are a total of 71 files. The first one describes
-the contents of the package.
-
-POLICIES: The package is distributed free. I do not make it
-available or support it in South Africa. The package will be
-written on the diskettes or tape, which will be mailed back. They
-can be sent to:
-
- Joe Felsenstein
-Electronic mail addresses: Department of Genetics SK-50
- Internet: joe at genetics.washington.edu University of Washington
- Bitnet/EARN: felsenst at uwavm Seattle, Washington 98195
- UUCP: uw-beaver!evolution.genetics!joe U.S.A.
-
-
-===================== End of Phylip Info. Sheet ====================
-
-
-
-
-REFERENCES.
-
-Dayhoff, M.O., Schwartz, R.M. and Orcutt, B.C. (1978) in Atlas of
-Protein Sequence and Structure, Vol. 5 supplement 3, Dayhoff, M.O.
-(ed.), NBRF, Washington, p. 345.
-
-Felsenstein, J. (1985) Confidence limits on phylogenies: an
-approach using the bootstrap. Evolution 39, 783-791.
-
-Feng, D.-F. and Doolittle, R.F. (1987) Progressive sequence
-alignment as a prerequisite to correct phylogenetic trees.
-J.Mol.Evol. 25, 351-360.
-
-Gotoh, O. (1982) An improved algorithm for matching biological
-sequences. J.Mol.Biol. 162, 705-708.
-
-Gribskov, M., McLachlan, A.D. and Eisenberg, D. (1987) Profile
-analysis: detection of distantly related proteins. PNAS USA 84,
-4355-4358.
-
-Higgins, D.G. and Sharp, P.M. (1988) CLUSTAL: a package for
-performing multiple sequence alignments on a microcomputer. Gene
-73, 237-244.
-
-Higgins, D.G. and Sharp, P.M. (1989) Fast and sensitive multiple
-sequence alignments on a microcomputer. CABIOS 5, 151-153.
-
-Kimura, M. (1980) A simple method for estimating evolutionary
-rates of base substitutions through comparative studies of
-nucleotide sequences. J. Mol. Evol. 16, 111-120.
-
-Kimura, M. (1983) The Neutral Theory of Molecular Evolution.
-Cambridge University Press, Cambridge, England.
-
-Li, W.-H., Wu, C.-I. and Luo, C.-C. (1985) A new method for
-estimating synonymous and nonsynonymous rates of nucleotide
-substitution considering the relative likelihood of nucleotide and
-codon changes. Mol.Biol.Evol. 2, 150-174.
-
-Myers, E.W. and Miller, W. (1988) Optimal alignments in linear
-space. CABIOS 4, 11-17.
-
-Pearson, W.R. and Lipman, D.J. (1988) Improved tools for biological
-sequence comparison. PNAS USA 85, 2444-2448.
-
-Saitou, N. and Nei, M. (1987) The neighbor-joining method: a new
-method for reconstructing phylogenetic trees. Mol.Biol.Evol. 4,
-406-425.
-
-Sneath, P.H.A. and Sokal, R.R. (1973) Numerical Taxonomy. Freeman,
-San Francisco.
-
-Sokal, R.R. and Michener, C.D. (1958) A statistical method for
-evaluating systematic relationships. Univ.Kansas Sci.Bull. 38,
-1409-1438.
-
-Vingron, M. and Argos, P. (1991) Motif recognition and alignment
-for many sequences by comparison of dot matrices. J.Mol.Biol. 218,
-33-43.
-
-Wilbur, W.J. and Lipman, D.J. (1983) Rapid similarity searches of
-nucleic acid and protein data banks. PNAS USA 80, 726-730.
-
Deleted: trunk/packages/clustalw/trunk/clustalw.c
===================================================================
--- trunk/packages/clustalw/trunk/clustalw.c 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/clustalw.c 2007-08-12 15:08:40 UTC (rev 399)
@@ -1,122 +0,0 @@
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-#ifdef MAC
-#include <console.h>
-#endif
-#include "clustalw.h"
-
-/*
-* Prototypes
-*/
-
-#ifdef MAC
-extern int ccommand(char ***);
-#endif
-
-extern void *ckalloc(size_t);
-extern void init_amenu(void);
-extern void init_interface(void);
-extern void init_matrix(void);
-extern void fill_chartab(void);
-extern void parse_params(Boolean);
-extern void main_menu(void);
-
-/*
-* Global variables
-*/
-double **tmat;
-
-char revision_level[] = "W (1.83)"; /* JULIE feb 2001*/
-
-Boolean interactive=FALSE;
-
-#ifdef MSDOS
- char *help_file_name = "clustalw.hlp";
-#else
- char *help_file_name = "/usr/share/clustalw/clustalw_help";
-#endif
-
-sint max_names; /* maximum length of names in current alignment file */
-
-float gap_open, gap_extend;
-float pw_go_penalty, pw_ge_penalty;
-
-FILE *tree;
-FILE *clustal_outfile, *gcg_outfile, *nbrf_outfile, *phylip_outfile,
- *gde_outfile, *nexus_outfile;
-FILE *fasta_outfile; /* Ramu */
-
-sint *seqlen_array;
-sint max_aln_length;
-short usermat[NUMRES][NUMRES], pw_usermat[NUMRES][NUMRES];
-short def_aa_xref[NUMRES+1], aa_xref[NUMRES+1], pw_aa_xref[NUMRES+1];
-short userdnamat[NUMRES][NUMRES], pw_userdnamat[NUMRES][NUMRES];
-short def_dna_xref[NUMRES+1], dna_xref[NUMRES+1], pw_dna_xref[NUMRES+1];
-sint nseqs;
-sint nsets;
-sint *output_index;
-sint **sets;
-sint *seq_weight;
-sint max_aa;
-sint gap_pos1;
-sint gap_pos2;
-sint mat_avscore;
-sint profile_no;
-
-Boolean usemenu;
-Boolean dnaflag;
-Boolean distance_tree;
-
-char **seq_array;
-char **names,**titles;
-char **args;
-char seqname[FILENAMELEN+1];
-
-char *gap_penalty_mask1 = NULL, *gap_penalty_mask2 = NULL;
-char *sec_struct_mask1 = NULL, *sec_struct_mask2 = NULL;
-sint struct_penalties;
-char *ss_name1 = NULL, *ss_name2 = NULL;
-
-Boolean user_series = FALSE;
-UserMatSeries matseries;
-short usermatseries[MAXMAT][NUMRES][NUMRES];
-short aa_xrefseries[MAXMAT][NUMRES+1];
-
-int main(int argc,char **argv)
-{
- sint i;
-
-#ifdef MAC
- argc=ccommand(&argv);
-#endif
-
- init_amenu();
- init_interface();
- init_matrix();
-
- fill_chartab();
-
- if(argc>1) {
- args = (char **)ckalloc(argc * sizeof(char *));
-
- for(i=1;i<argc;++i)
- {
- args[i-1]=(char *)ckalloc((strlen(argv[i])+1) * sizeof(char));
- strcpy(args[i-1],argv[i]);
- }
- usemenu=FALSE;
- parse_params(FALSE);
-
- for(i=0;i<argc-1;i++)
- ckfree(args[i]);
- ckfree(args);
- }
- usemenu=TRUE;
- interactive=TRUE;
-
- main_menu();
-
- exit(0);
-}
-
Deleted: trunk/packages/clustalw/trunk/clustalw.doc
===================================================================
--- trunk/packages/clustalw/trunk/clustalw.doc 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/clustalw.doc 2007-08-12 15:08:40 UTC (rev 399)
@@ -1,757 +0,0 @@
-README for Clustal W version 1.7 June 1997
-
- Clustal W version 1.7 Documentation
-
-This file provides some notes on the latest changes, installation and usage
-of the Clustal W multiple sequence alignment program.
-
-
-
-Julie Thompson (Thompson at EMBL-Heidelberg.DE)
-Toby Gibson (Gibson at EMBL-Heidelberg.DE)
-
-European Molecular Biology Laboratory
-Meyerhofstrasse 1
-D 69117 Heidelberg
-Germany
-
-
-Des Higgins (Higgins at ucc.ie)
-
-University of County Cork
-Cork
-Ireland
-
-
-Please e-mail bug reports/complaints/suggestions (polite if possible)
-to Toby Gibson or Des Higgins.
-
-
-
-Thompson, J.D., Higgins, D.G. and Gibson, T.J. (1994)
-CLUSTAL W: improving the sensitivity of progressive multiple sequence alignment
-through sequence weighting, positions-specific gap penalties and weight matrix
-choice. Nucleic Acids Research, 22:4673-4680.
-
---------------------------------------------------------------
-
-What's New (June 1997) in Version 1.7 (since version 1.6).
-
-
-1. The static arrays used by clustalw for storing the alignment data have been
-replaced by dynamically allocated memory. There is now no limit on the number
-or length of sequences which can be input.
-
-2. The alignment of DNA sequences now offers a new hard-coded matrix, as well
-as the identity matrix used previously. The new matrix is the default scoring
-matrix used by the BESTFIT program of the GCG package for the comparison of
-nucleic acid sequences. X's and N's are treated as matches to any IUB ambiguity
-symbol. All matches score 1.9; all mismatches for IUB symbols score 0.0.
-
-3. The transition weight option for aligning nucleotide sequences has been
-changed from an on/off toggle to a weight between 0 and 1. A weight of zero
-means that the transitions are scored as mismatches; a weight of 1 gives
-transitions the full match score. For distantly related DNA sequences, the
-weight should be near to zero; for closely related sequences it can be useful
-to assign a higher score.
-
-4. The RSF sequence alignment file format used by GCG Version 9 can now be
-read.
-
-5. The clustal sequence alignment file format has been changed to allow
-sequence names longer than 10 characters. The maximum length allowed is set in
-clustalw.h by the statement:
-#define MAXNAMES 10
-
-For the fasta format, the name is taken as the first string after the '>'
-character, stopping at the first white space. (Previously, the first 10
-characters were taken, replacing blanks by underscores).
-
-6. The bootstrap values written in the phylip tree file format can be assigned
-either to branches or nodes. The default is to write the values on the nodes,
-as this can be read by several commonly-used tree display programs. But note
-that this can lead to confusion if the tree is rooted and the bootstraps may
-be better attached to the internal branches: Software developers should ensure
-they can read the branch label format.
-
-7. The sequence weighting used during sequence to profile alignments has been
-changed. The tree weight is now multiplied by the percent identity of the
-new sequence compared with the most closely related sequence in the profile.
-
-8. The sequence weighting used during profile to profile alignments has been
-changed. A guide tree is now built for each profile separately and the
-sequence weights calculated from the two trees. The weights for each
-sequence are then multiplied by the percent identity of the sequence compared
-with the most closely related sequence in the opposite profile.
-
-9. The adjustment of the Gap Opening and Gap Extension Penalties for sequences
-of unequal length has been improved.
-
-10. The default order of the sequences in the output alignment file has been
-changed. Previously the default was to output the sequences in the same order
-as the input file. Now the default is to use the order in which the sequences
-were aligned (from the guide tree/dendrogram), thus automatically grouping
-closely related sequences.
-
-11. The option to 'Reset Gaps between alignments' has been switched off by
-default.
-
-12. The conservation line output in the clustal format alignment file has been
-changed. Three characters are now used:
-'*' indicates positions which have a single, fully conserved residue
-':' indicates that one of the following 'strong' groups is fully conserved:-
- STA
- NEQK
- NHQK
- NDEQ
- QHRK
- MILV
- MILF
- HY
- FYW
-
-'.' indicates that one of the following 'weaker' groups is fully conserved:-
- CSA
- ATV
- SAG
- STNK
- STPA
- SGND
- SNDEQK
- NDEQHK
- NEQHRK
- FVLIM
- HFY
-
-These are all the positively scoring groups that occur in the Gonnet Pam250
-matrix. The strong and weak groups are defined as strong score >0.5 and weak
-score <=0.5 respectively.
-
-13. A bug in the modification of the Myers and Miller alignment algorithm
-for residue-specific gap penalties has been fixed. This occasionally caused
-new gaps to be opened a few residues away from the optimal position.
-
-14. The GCG/MSF input format no longer needs the word PILEUP on the first
-line. Several versions can now be recognised:-
- 1. The word PILEUP as the first word in the file
- 2. The word !!AA_MULTIPLE_ALIGNMENT or !!NA_MULTIPLE_ALIGNMENT
- as the first word in the file
- 3. The characters MSF on the first line of the file, and the
- characters .. at the end of the line.
-
-15. The standard command line separator for UNIX systems has been changed from
-'/' to '-'. ie. to give options on the command line, you now type
-
- clustalw input.aln -gapopen=8.0
-
-instead of clustalw input.aln /gapopen=8.0
-
-
- ATTENTION SOFTWARE DEVELOPERS!!
- -------------------------------
-
-The CLUSTAL sequence alignment output format has been modified:
-
-1. Names longer than 10 chars are now allowed. (The maximum is specified in
-clustalw.h by '#define MAXNAMES'.)
-
-2. The consensus line now consists of three characters: '*',':' and '.'. (Only
-the '*' and '.' were previously used.)
-
-3. An option (not the default) has been added, allowing the user to print out
-sequence numbers at the end of each line of the alignment output.
-
-4. Both RNA bases (U) and base ambiguities are now supported in nucleic acid
-sequences. In the past, all characters (upper or lower case) other than
-a,c,g,t or u were converted to N. Now the following characters are recognised
-and retained in the alignment output: ABCDGHKMNRSTUVWXY (upper or lower case).
-
-5. A Blank line inadvertently added in the version 1.6 header has been taken
-out again.
-
-
---------------------------------------------------------------
-
-What's New (March 1996) in Version 1.6 (since version 1.5).
-
-
-1) Improved handling of sequences of unequal length. Previously, we
-increased the gap extension penalties for both sequences if the two sequences
-(or groups of previously aligned sequences) were of different lengths.
-Now, we increase the gap opening and extension penalties for the shorter
-sequence only. This helps prevent short sequences being stretched out
-along longer ones.
-
-2) Added the "Gonnet" series of weight matrices (from Gaston Gonnet and
-co-workers at the ETH in Zurich). Fixed a bug in the matrix
-choice menu; now PAM matrices can be selected ok.
-
-3) Added secondary structure/gap penalty masks. These allow you to
-include, in an alignment, a position specific set of gap penalties.
-You can either set a gap opening penalty at each position or specify
-the secondary structure (if protein; alpha helix, beta strand or loop)
-and have gap penalties set automatically. This, basically, is used to make
-gaps harder to open inside helices or strands.
-
-These masks are only used in the "profile alignment" menu. They may be read in
-as part of an alignment in a special format (see the on-line help for
-details) or associated with each sequence, if the sequences are in Swiss Prot
-format and secondary structure information is given. All of the mask
-parameters can be set from the profile alignment menu. Basically, the
-mask is made up of a series of numbers between 1 and 9, one per position.
-The gap opening penalty at a position is calculated as the starting penalty
-multiplied by the mask value at that site.
-
-4) Added command line options /profile and /sequences.
-These allow users to choose between normal profile alignment where the
-two profiles (pre-existing alignments specified in the files
-/profile1= and /profile2=) are merged/aligned with each other (/profile)
-and the case where the individual sequences in /profile2 are aligned
-sequentially with the alignment in /profile1 (/sequences).
-
-5) Fixed bug in modified Myers and Miller algorithm - gap penalty score
-was not always calculated properly for type 2 midpoints. This is the core
-alignment algorithm.
-
-6) Only allows one output file format to be selected from command line
-- ie. multiple output alignment files are not allowed.
-
-7) Fixed 'bad calls to ckfree' error during calculation of phylip distance
-matrix.
-
-8) Fixed command line options /gapopen /gapext /type=protein /negative.
-
-9) Allowed user to change command line separator on UNIX from '/' to '-'.
-This allows unix users to use the more conventional '-' symbol
-for separating command line options. "/" can then be used in unix
-file names on the command line. The symbol that is used,
-is specified in the file clustalw.h which must be edited if you
-wish to change it (and the program must then be recompiled). Find the
-block of code in clustalw.h that corresponds to the operating system you
-are using. These blocks are started by one of the following:
-
-#ifdef VMS
-#elif MAC
-#elif MSDOS
-#elif UNIX
-
-On the next line after each is the line:
-
-#define COMMANDSEP '/'
-
-Change this in the appropriate block of code (e.g. the UNIX block) to
-
-#define COMMANDSEP '-'
-
-if you wish to use the "-" character as command separator.
-
-
-
---------------------------------------------------------------
-
-What's New (April 1995) in Version 1.5 (since version 1.3).
-
-1) ported to MAC and PC. These versions are quite slow unless you
-have a nice beefy machine. On a Power Mac or a Pentium box
-it is nice and fast. Two precompiled versions are supplied for Macs
-(Power mac and old mac versions).
-Mac: 1500 residues by 100 sequences
-Power Mac 3000 " " " "
-PC 1500 " " " "
-
-2) alignment of new sequences to an alignment. Fixed a serious bug
-which assigned weights to the wrong sequences. Now also, weights
-sequences according to distance from the incoming sequence. The
-new weights are: tree weights * similarity to incoming sequence.
-The tree weights are the old weights that we derive from the tree
-connecting all the sequences in the existing alignment.
-
-3) for all platforms, output linelength = 60.
-
-4) Bootstrap files (*.phb): the "final" node (arbitrary trichotomy
-at the end of the neighbor-joining process) is labelled as
-TRICHOTOMY in the bootstrap output files. This is to help
-link bootstrap figures with nodes when you reroot the tree.
-
-5) Command line /bootstrap option now more robust.
-
---------------------------------------------------------------
-INTRODUCTION
-
-
-
-This document gives some BRIEF notes about usage of the Clustal W
-multiple alignment program for UNIX and VMS machines. Clustal W
-is a major update and rewrite of the Clustal V program which
-was described in:
-
-Higgins, D.G., Bleasby, A.J. and Fuchs, R. (1992)
-CLUSTAL V: improved software for multiple sequence alignment.
-Computer Applications in the Biosciences (CABIOS), 8(2):189-191.
-
-The main new features are a greatly improved (more sensitive)
-multiple alignment procedure for proteins and improved support
-for different file formats. This software was described in:
-
-Thompson, J.D., Higgins, D.G. and Gibson, T.J. (1994)
-CLUSTAL W: improving the sensitivity of progressive multiple
-sequence alignment through sequence weighting, position specific
-gap penalties and weight matrix choice.
-Nucleic Acids Research, 22(22):4673-4680.
-
-
-The usage of Clustal W is largely the same as for
-Clustal V details of which are described in clustalv.doc. Details of the
-new alignment algorithms are described in the manuscript by
-Thompson et al. above, an ascii/text version of which is included
-(clustalw.ms). This file lists some of the details not covered by either
-of the above documents.
-
-
-There are brief notes on the following topics:
-
-1) Installation for VMS and UNIX and MAC and PC
-2) File input
-3) file output
-4) changes to the alignment algorithms
-5) minor modifications to the phylogenetic tree and bootstrapping methods
-6) summary of the command line usage.
-
--------------------------------------------------------------------
-
-1) INSTALLATION (for Unix, VAX/VMS, PC and MAC)
-
-
-
-*****IMPORTANT*****
-If you wish to recompile the program (or compile it for the first
-time; you will have to do this with UNIX):
-first check the file CLUSTALW.H which needs to be changed if you
-move the code between unix and vms machines. At the top
-of the file are four lines which define one of VMS, MSDOS, MAC or
-UNIX to be 1. All of these EXCEPT one must be commented out
-using enclosed /* ... */.
-*******************
-
-
-Unix
------
-
-Make files are supplied for unix machines. The code was compiled and
-tested using Decstation (Ultrix), SUN (Gnu C compiler/gcc), Silicon
-Graphics (IRIX) and DEC/Alpha (OSF1). We have not tested the code on any other
-systems. Just use makefile to make on most systems. For Sun, you need to
-have the GNU C (gcc) compiler installed ... use the file makefile.sun in this
-case. You make the program with:
-make (or make -f makefile.sun)
-
-This produces the file clustalw which can be run by typing clustalw and
-pressing return. The help file is called clustalw_help
-
-
-VMS
-----
-
-There is a small DCL command file (VMSLINK.COM) to compile and link the
-code for VMS machines (vax or alpha). This procedure just compiles the
-source files and links using default settings. Run it using:
-$ @vmslink
-This produces Clustalw.exe which can be run using the run command:
-$ run clustalw
-
-The intermediate object files can be deleted with:
-$ del *.obj;
-
-There is an extensive command line facility. To use this, you must
-create a symbol to run the program (and put this in your login.com file).
-e.g.
-$ clustalw :== $$drive:[dir.dir]clustalw
-where $drive is the drive on which the executable file is stored (clustalw.exe)
-and [dir.dir] is the full directory specification. NOTE THE EXTRA DOLLAR SIGN.
-Then the program can be run using the command:
-$ clustalw
-
-
-PC
-__
-
-We supply an executable file (Clustalw.exe) which will run using MSDOS.
-It will also run under windows (as a DOS application)
-*** IF you have a maths coprocessor***. If you do not have a maths chip
-(e.g. 80387), the program can only be run under MSDOS. In the latter case,
-you must have the file EMU387.exe in the same directory as CLUSTALW.EXE.
-This file emulates a maths chip if you do not have one.
-
-
-We generated the executable file using gnu c for MSDOS.
-It will also compile (with about 10,000 warning messages)
-using Microsoft C but we have not tested it and there appear to be problems
-with the executable.
-
-You will need to use a "memory extender" to allow the program to get at more
-than 640kb of memory.
-
-
-
-MAC
----
-
-The code compiles for Power Mac and older macs using Metrowerks CodeWarrior
-C compiler. We supply 2 executable programs (one each for PowerMac and
-older mac): ClustalwPPC and Clustalw68k. These need up to
-10mb of memory to run which needs to be adjusted with the Get Info (%I)
-command from the Finder if you have problems. Just double click the
-executable file name or icon and off you go (we hope).
-
-As a special treat for Mac users, we supply an executable and brief readme
-file for NJPLOT. This is a really nice program by Manolo Gouy
-(University of Lyon, France) that allows you to import the trees
-made by Clustal W and display them/manipulate them. It will properly
-display the bootstrap figures from the *.phb files. It can export the
-trees in PICT format which can then be used by MacDraw for example.
-
-
--------------------------------------------------------------------------
-
-2) FILE INPUT (sequences to be aligned)
-
-
-
-The sequences must all be in one file (or two files for a "profile alignment")
-in ONE of the following formats:
-
-FASTA (Pearson), NBRF/PIR, EMBL/Swiss Prot, GDE, CLUSTAL, GCG/MSF, GCG9/RSF.
-
-The program tries to "guess" which format is being used and whether
-the sequences are nucleic acid (DNA/RNA) or amino acid (proteins). The
-format is recognised by the first characters in the file. This is kind
-of stupid/crude but works most of the time and it is difficult
-to do reliably, any other way.
-
-
-Format First non blank word or character in the file.
-...............................................................
-FASTA >
-NBRF >P1; or >D1;
-EMBL/SWISS ID
-GDE protein %
-GDE nucleotide #
-CLUSTAL CLUSTAL (blocked multiple alignments)
-GCG/MSF PILEUP or !!AA_MULTIPLE_ALIGNMENT or !!NA_MULTIPLE_ALIGNMENT
- or MSF on the first line, and '..' at the end of line
-GCG9/RSF !!RICH_SEQUENCE
-
-Note, that the only way of spotting that a file is MSF format is if
-the word PILEUP appears at the very beginning of the file. If you
-produce this format from software other than the GCG pileup program,
-then you will have to insert the word PILEUP at the start of the file.
-Similarly, if you use clustal format, the word CLUSTAL must appear first.
-
-All of these formats can be used to read in AN EXISTING FULL ALIGNMENT.
-With CLUSTAL format, this is just the same as the output format of this
-program and Clustal V. If you use PILEUP or CLUSTAL format, all sequences
-must be the same length, INCLUDING GAPS ("-" in clustal format; "." in MSF).
-With the other formats, sequences can be gapped with "-" characters. If you
-read in any gaps these are kept during any later alignments. You can use
-this facility to read in an alignment in order to calculate a phylogenetic
-tree OR to output the same alignment in a different format (from the
-output format options menu of the multiple alignment menu) e.g. read
-in a GCG/MSF format alignment and output a PHYLIP format alignment. This is
-also useful to read in one reference alignment and to add one or more new
-sequences to it using the "profile alignment" facilities.
-
-DNA vs. PROTEIN: the program will count the number of A,C,G,T,U and N
-characters. If 85% or more of the characters in a sequence are as above,
-then DNA/RNA is assumed, protein otherwise.
-
--------------------------------------------------------------------------
-
-
-3) FILE OUTPUT
-
-
-1) the alignments.
-
-In the multiple alignment and profile alignment menus, there is a menu
-item to control the output format(s).
-
-The alignment output format can be set to any (or all) of:
-CLUSTAL (a self explanatory blocked alignment)
-NBRF/PIR (same as input format but with "-" characters for gaps)
-MSF (the main GCG package multiple alignment format)
-PHYLIP (Joe Felsenstein's phylogeny inference package. Gaps are set to
- "-" characters. For some programs (e.g. PROTPARS/DNAPARS) these
- should be changed to "?" characters for unknown residues.)
-GDE (Used by Steven Smith's GDE package)
-
-You can also choose between having the sequences in the same order as in
-the input file or writing them out in an order that more closely matches the
-order used to carry out the multiple alignment.
-
-
-2) The trees.
-
-Believe it or not, we now use the New Hampshire (nested parentheses)
-format as default for our trees. This format is compatible with e.g. the
-PHYLIP package. If you want to view a tree, you can use the RETREE or
-DRAWGRAM/DRAWTREE programs of PHYLIP. This format is used for all our
-trees, even the initial guide trees for deciding the order of multiple
-alignment. The output trees from the phylogenetic tree menu can also be
-requested in our old verbose/cryptic format. This may be more useful
-if, for example, you wish to see the bootstrap figures. The bootstrap
-trees in the default New Hampshire format give the bootstrap figures
-as extra labels which can be viewed very easily using TREETOOL which is
-available as part of the GDE package. TREETOOL is available from the
-RDP project by ftp from rdp.life.uiuc.edu.
-
-The New Hampshire format is only useful if you have software to display or
-manipulate the trees. The PHYLIP package is highly recommended if you intend
-to do much work with trees and includes programs for doing this. If you do
-not have such software, request the trees in the older clustal format
-and see the documentation for Clustal V (clustalv.doc). WE DO NOT PROVIDE
-ANY DIRECT MEANS FOR VIEWING TREES GRAPHICALLY.
-
--------------------------------------------------------------------------
-
-4) THE ALIGNMENT ALGORITHMS
-
-
-The basic algorithm is the same as for Clustal V and is described in some
-detail in clustalv.doc. The new modifications are described in detail in
-clustalw.ms. Here we just list some notes to help answer some of the most
-obvious questions.
-
-
-Terminal Gaps
-
-In the original Clustal V program, terminal gaps were penalised the same
-as all other gaps. This caused some ugly side effects e.g.
-
-acgtacgtacgtacgt acgtacgtacgtacgt
-a----cgtacgtacgt gets the same score as ----acgtacgtacgt
-
-NOW, terminal gaps are free. This is better on average and stops silly
-effects like single residues jumping to the edge of the alignment. However,
-it is not perfect. It does mean that if there should be a gap near the end
-of the alignment, the program may be reluctant to insert it i.e.
-
-cccccgggccccc cccccgggccccc
-ccccc---ccccc may be considered worse (lower score) than cccccccccc---
-
-In the right hand case above, the terminal gap is free and may score higher
-than the left hand alignment. This can be prevented by lowering the gap
-opening and extension penalties. It is difficult to get this right all the
-time. Please watch the ends of your alignments.
-
-
-
-Speed of the initial (pairwise) alignments (fast approximate/slow accurate)
-
-By default, the initial pairwise alignments are now carried out using a full
-dynamic programming algorithm. This is more accurate than the older hash/
-k-tuple based alignments (Wilbur and Lipman) but is MUCH slower. On a fast
-workstation you may not notice but on a slow box, the difference is extreme.
-You can set the alignment method from the menus easily to the older, faster
-method.
-
-
-
-Delaying alignment of distant sequences
-
-The user can set a cut off to delay the alignment of the most divergent
-sequences in a data set until all other sequences have been aligned. By
-default, this is set to 40% which means that if a sequence is less than 40%
-identical to any other sequence, its alignment will be delayed.
-
-
-
-Iterative realignment/Reset gaps between alignments
-
-By default, if you align a set of sequences a second time (e.g. with changed
-gap penalties), the gaps from the first alignment are discarded. You can
-set this from the menus so that older gaps will be kept between alignments.
-This can sometimes give better alignments by keeping the gaps (do not reset
-them) and doing the full multiple alignment a second time. Sometimes, the
-alignment will converge on a better solution; sometimes the new alignment will
-be the same as the first. There can be a strange side effect: you can get
-columns of nothing but gaps introduced.
-
-Any gaps that are read in from the input file are always kept, regardless
-of the setting of this switch. If you read in a full multiple alignment, the "reset
-gaps" switch has no effect. The old gaps will remain and if you carry out
-a multiple alignment, any new gaps will be added in. If you wish to carry out
-a full new alignment of a set of sequences that are already aligned in a file
-you must input the sequences without gaps.
-
-
-
-Profile alignment
-
-By profile alignment, we simply mean the alignment of old alignments/sequences.
-In this context, a profile is just an existing alignment (or even a set of
-unaligned sequences; see below). This allows you to
-read in an old alignment (in any of the allowed input formats) and align
-one or more new sequences to it. From the profile alignment menu, you
-are allowed to read in 2 profiles. Either profile can be a full alignment
-OR a single sequence. In the simplest mode, you simply align the two profiles
-to each other. This is useful if you want to gradually build up a full
-multiple alignment.
-
-A second option is to align the sequences from the second profile, one at
-a time to the first profile. This is done, taking the underlying tree between
-the sequences into account. This is useful if you have a set of new sequences
-(not aligned) and you wish to add them all to an older alignment.
-
-----------------------------------------------------------------------------
-
-5) CHANGES TO THE PHYLOGENETIC TREE CALCULATIONS AND SOME HINTS.
-
-
-
-IMPROVED DISTANCE CALCULATIONS FOR PROTEIN TREES
-
-
-The phylogenetic trees in Clustal W (the real trees that you calculate
-AFTER alignment; not the guide trees used to decide the branching order
-for multiple alignment) use the Neighbor-Joining method of Saitou and
-Nei based on a matrix of "distances" between all sequences. These distances
-can be corrected for "multiple hits". This is normal practice when accurate
-trees are needed. This correction stretches distances (especially large ones)
-to try to correct for the fact that OBSERVED distances (mean number of
-differences per site) greatly underestimate the actual number that happened
-during evolution.
-
-In Clustal V we used a simple formula to convert an observed distance to one
-that is corrected for multiple hits. The observed distance is the mean number
-of differences per site in an alignment (ignoring sites with a gap) and is
-therefore always between 0.0 (for identical sequences) and 1.0 (no residues the
-same at any site). These distances can be multiplied by 100 to give percent
-difference values. 100 minus percent difference gives percent identity.
-The formula we use to correct for multiple hits is from Motoo Kimura
-(Kimura, M. The Neutral Theory of Molecular Evolution, Camb.Univ.Press, 1983,
-page 75) and is:
-
-K = -Ln(1 - D - (D.D)/5) where D is the observed distance and K is
- corrected distance.
-
-This formula gives mean number of estimated substitutions per site and, in
-contrast to D (the observed number), can be greater than 1 i.e. more than
-one substitution per site, on average. For example, if you observe 0.8
-differences per site (80% difference; 20% identity), then the above formula
-predicts that there have been 2.5 substitutions per site over the course
-of evolution since the 2 sequences diverged. This can also be expressed in
-PAM units by multiplying by 100 (mean number of substitutions per 100 residues).
-The PAM scale of evolution and its derivation/calculation comes from the
-work of Margaret Dayhoff and co workers (the famous Dayhoff PAM series
-of weight matrices also came from this work). Dayhoff et al constructed
-an elaborate model of protein evolution based on observed frequencies
-of substitution between very closely related proteins. Using this model,
-they derived a table relating observed distances to predicted PAM distances.
-Kimura's formula, above, is just a "curve fitting" approximation to this table.
-It is very accurate in the range 0.75 > D > 0.0 but becomes increasingly
-inaccurate at high D (>0.75) and fails completely at around D = 0.85.
-
-To circumvent this problem, we calculated all the values for K corresponding
-to D above 0.75 directly using the Dayhoff model and store these in an
-internal table, used by Clustal W. This table is declared in the file dayhoff.h and
-gives values of K for all D between 0.75 and 0.93 in intervals of 0.001 i.e.
-for D = 0.750, 0.751, 0.752 ...... 0.929, 0.930. For any observed D
-higher than 0.930, we arbitrarily set K to 10.0. This sounds drastic but
-with real sequences, distances of 0.93 (less than 7% identity) are rare.
-If your data set includes sequences with this degree of divergence, you
-will have great difficulty getting accurate trees by ANY method; the alignment
-itself will be very difficult (to construct and to evaluate).
-
-There are some important
-things to note. Firstly, this formula works well if your sequences are
-of average amino acid composition and if the amino acids substitute according
-to the original Dayhoff model. In other cases, it may be misleading. Secondly,
-it is based only on observed percent distance i.e. it does not DIRECTLY
-take conservative substitutions into account. Thirdly, the error on the
-estimated PAM distances may be VERY great for high distances; at very high
-distance (e.g. over 85%) it may give largely arbitrary corrected distances.
-In most cases, however, the correction is still worth using; the trees will
-be more accurate and the branch lengths will be more realistic.
-
-A far more sophisticated distance correction based on a full Dayhoff
-model which DOES take conservative substitutions and actual amino acid
-composition into account, may be found in the PROTDIST program of the
-PHYLIP package. For serious tree makers, this program is highly recommended.
-
-
-
-TWO NOTES ON BOOTSTRAPPING...
-
-When you use the BOOTSTRAP in Clustal W to estimate the reliability of parts
-of a tree, many of the uncorrected distances may randomly exceed the arbitrary cut
-off of 0.93 (sequences only 7% identical) if the sequences are distantly
-related. This will happen randomly i.e. even if none of the pairs of
-sequences are less than 7% identical, the bootstrap samples may contain pairs
-of sequences that do exceed this cut off.
-If this happens, you will be warned. In practice, this can
-happen with many data sets. It is not a serious problem if it happens rarely.
-If it does happen (you are warned when it happens and told how often the
-problem occurs), you should consider removing the most distantly
-related sequences and/or using the PHYLIP package instead.
-
-
-A further problem arises in almost exactly the opposite situation: when
-you bootstrap a data set which contains 3 or more sequences that are identical
-or almost identical. Here, the sets of identical sequences should be shown
-as a multifurcation (several sequences joining at the same part of the tree).
-Because the Neighbor-Joining method only gives strictly dichotomous trees
-(never more than 2 sequences join at one time), this cannot be exactly
-represented. In practice, this is NOT a problem as there will be some
-internal branches of zero length separating the sequences. If you
-display the tree with all branch lengths, you will still see a multifurcation.
-However, when you bootstrap
-the tree, only the branching orders are stored and counted. In the case
-of multifurcations, the exact branching order is arbitrary but the program
-will always get the same branching order, depending only on the input order
-of the sequences. In practice, this is only a problem in situations where
-you have a set of sequences where all of them are VERY similar. In this case,
-you can find very high support for some groupings which will disappear if you
-run the analysis with a different input order. Again, the PHYLIP package
-deals with this by offering a JUMBLE option to shuffle the input order
-of your sequences between each bootstrap sample.
-
-----------------------------------------------------------------------------
-
-6) SUMMARY OF THE COMMAND LINE USAGE
-
-Clustal W is designed to be run interactively. However, there are many
-situations where it is convenient to run it from the command line, especially
-if you wish to run it from another piece of software (e.g. SeqApp or GDE).
-All parameters can be set from the command line by giving options after the
-clustalw command. On UNIX options should be preceded by '-', all other systems
-use the '/' character.
-
-If anything is put on the command line, the program will (attempt to) carry
-out whatever is requested and will exit. If you wish to use the command
-line to set some parameters and then go into interactive mode, use the
-command line switch: interactive .... e.g.
-
-clustalw -quicktree -interactive on UNIX
-or
-clustalw /quicktree /interactive on VMS,MAC and PC
-
-will set the default initial alignment mode to fast/approximate and will then
-go to the main menu.
-
-
-To see a list of all the command line parameters, type:
-
-clustalw -options on UNIX
-or
-clustalw /options on VMS,MAC and PC
-
-and you will see a list with no explanation.
-
-
-To get (VERY BRIEF) help on command line usage, use the /HELP or /CHECK
-(-help or -check on UNIX systems) options. Otherwise, the command line
-usage is self explanatory or is explained in clustalv.doc. The defaults
-for all parameters are set in the file param.h which can be changed easily
-(remember to recompile the program afterwards :-).
-
-------------------------------------------------------------------------------
Deleted: trunk/packages/clustalw/trunk/clustalw.h
===================================================================
--- trunk/packages/clustalw/trunk/clustalw.h 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/clustalw.h 2007-08-12 15:08:40 UTC (rev 399)
@@ -1,250 +0,0 @@
-/*#include "/us1/user/julie/dmalloc/malloc.h"*/
-/*********************CLUSTALW.H*********************************************/
-/****************************************************************************/
-
- /*
- Main header file for ClustalW. Uncomment ONE of the following 4 lines
- depending on which compiler you wish to use.
- */
-
-/*#define VMS 1 VAX or ALPHA VMS */
-
-/*#define MAC 1 Think_C for Macintosh */
-
-/*#define MSDOS 1 Turbo C for PC's */
-
-#define UNIX 1 /*Ultrix/Decstation, Gnu C for
- Sun, IRIX/SGI, OSF1/ALPHA */
-
-/***************************************************************************/
-/***************************************************************************/
-
-
-#include "general.h"
-
-#define MAXNAMES 30 /* Max chars read for seq. names */
-#define MAXTITLES 60 /* Title length */
-#define FILENAMELEN 256 /* Max. file name length */
-
-#define UNKNOWN 0
-#define EMBLSWISS 1
-#define PIR 2
-#define PEARSON 3
-#define GDE 4
-#define CLUSTAL 5 /* DES */
-#define MSF 6 /* DES */
-#define RSF 7 /* JULIE */
-#define USER 8 /* DES */
-#define PHYLIP 9 /* DES */
-#define NEXUS 10/* DES */
-#define FASTA 11/* Ramu */
-
-#define NONE 0
-#define SECST 1
-#define GMASK 2
-
-#define PROFILE 0
-#define SEQUENCE 1
-
-#define BS_NODE_LABELS 2
-#define BS_BRANCH_LABELS 1
-
-#define PAGE_LEN 22 /* Number of lines of help sent to screen */
-
-#define PAGEWIDTH 80 /* maximum characters on output file page */
-#define LINELENGTH 60 /* Output file line length */
-#define GCG_LINELENGTH 50
-
-#ifdef VMS /* Defaults for VAX VMS */
-#define COMMANDSEP '/'
-#define DIRDELIM ']' /* Last character before file name in full file
- specs */
-#define INT_SCALE_FACTOR 1000 /* Scaling factor to convert float to integer for profile scores */
-
-#elif MAC
-#define COMMANDSEP '/'
-#define DIRDELIM ':'
-#define INT_SCALE_FACTOR 100 /* Scaling factor to convert float to integer for profile scores */
-
-#elif MSDOS
-#define COMMANDSEP '/'
-#define DIRDELIM '\\'
-#define INT_SCALE_FACTOR 100 /* Scaling factor to convert float to integer for profile scores */
-
-#elif UNIX
-#define COMMANDSEP '-'
-#define DIRDELIM '/'
-#define INT_SCALE_FACTOR 1000 /* Scaling factor to convert float to integer for profile scores */
-#endif
-
-#define NUMRES 32 /* max size of comparison matrix */
-
-#define INPUT 0
-#define ALIGNED 1
-
-#define LEFT 1
-#define RIGHT 2
-
-#define NODE 0
-#define LEAF 1
-
-#define GAPCOL 32 /* position of gap open penalty in profile */
-#define LENCOL 33 /* position of gap extension penalty in profile */
-
-typedef struct node { /* phylogenetic tree structure */
- struct node *left;
- struct node *right;
- struct node *parent;
- float dist;
- sint leaf;
- int order;
- char name[64];
-} stree, *treeptr;
-
-typedef struct {
- char title[30];
- char string[30];
-} MatMenuEntry;
-
-typedef struct {
- int noptions;
- MatMenuEntry opt[10];
-} MatMenu;
-
-#define MAXMAT 10
-
-typedef struct {
- int llimit;
- int ulimit;
- short *matptr;
- short *aa_xref;
-} SeriesMat;
-
-typedef struct {
- int nmat;
- SeriesMat mat[MAXMAT];
-} UserMatSeries;
-
-
-/*
- Prototypes
-*/
-
-/* alnscore.c */
-void aln_score(void);
-/* interface.c */
-void parse_params(Boolean);
-void init_amenu(void);
-void init_interface(void);
-void main_menu(void);
-FILE *open_output_file(char *, char *, char *, char *);
-FILE *open_explicit_file(char *);
-sint seq_input(Boolean);
-Boolean open_alignment_output(char *);
-void create_alignment_output(sint fseq,sint lseq);
-void align(char *phylip_name);
-void profile_align(char *p1_tree_name,char *p2_tree_name);/* Align 2 alignments */
-void make_tree(char *phylip_name);
-void get_tree(char *phylip_name);
-sint profile_input(void); /* read a profile */
-void new_sequence_align(char *phylip_name);
-Boolean user_mat(char *, short *, short *);
-Boolean user_mat_series(char *, short *, short *);
-void get_help(char);
-void clustal_out(FILE *, sint, sint, sint, sint);
-void nbrf_out(FILE *, sint, sint, sint, sint);
-void gcg_out(FILE *, sint, sint, sint, sint);
-void phylip_out(FILE *, sint, sint, sint, sint);
-void gde_out(FILE *, sint, sint, sint, sint);
-void nexus_out(FILE *, sint, sint, sint, sint);
-void fasta_out(FILE *, sint, sint, sint, sint);
-void print_sec_struct_mask(int prf_length,char *mask,char *struct_mask);
-void fix_gaps(void);
-
-
-/* calcgapcoeff.c */
-void calc_gap_coeff(char **alignment, sint *gaps, sint **profile, Boolean struct_penalties,
- char *gap_penalty_mask, sint first_seq, sint last_seq,
- sint prf_length, sint gapcoef, sint lencoef);
-/* calcprf1.c */
-void calc_prf1(sint **profile, char **alignment, sint *gaps, sint matrix[NUMRES ][NUMRES ],
- sint *seq_weight, sint prf_length, sint first_seq, sint last_seq);
-/* calcprf2.c */
-void calc_prf2(sint **profile, char **alignment, sint *seq_weight, sint prf_length,
- sint first_seq, sint last_seq);
-/* calctree.c */
-void calc_seq_weights(sint first_seq, sint last_seq,sint *seq_weight);
-void create_sets(sint first_seq, sint last_seq);
-sint read_tree(char *treefile, sint first_seq, sint last_seq);
-void clear_tree(treeptr p);
-sint calc_similarities(sint nseqs);
-/* clustalw.c */
-int main(int argc, char **argv);
-/* gcgcheck.c */
-int SeqGCGCheckSum(char *seq, sint len);
-/* malign.c */
-sint malign(sint istart,char *phylip_name);
-sint seqalign(sint istart,char *phylip_name);
-sint palign1(void);
-float countid(sint s1, sint s2);
-sint palign2(char *p1_tree_name,char *p2_tree_name);
-/* pairalign.c */
-sint pairalign(sint istart, sint iend, sint jstart, sint jend);
-/* prfalign.c */
-lint prfalign(sint *group, sint *aligned);
-/* random.c */
-unsigned long linrand(unsigned long r);
-unsigned long addrand(unsigned long r);
-void addrandinit(unsigned long s);
-/* readmat.c */
-void init_matrix(void);
-sint get_matrix(short *matptr, short *xref, sint matrix[NUMRES ][NUMRES ], Boolean neg_flag,
- sint scale);
-sint read_user_matrix(char *filename, short *usermat, short *xref);
-sint read_matrix_series(char *filename, short *usermat, short *xref);
-int getargs(char *inline1, char *args[], int max);
-/* sequence.c */
-void fill_chartab(void);
-sint readseqs(sint first_seq);
-/* showpair.c */
-void show_pair(sint istart, sint iend, sint jstart, sint jend);
-/* trees.c */
-void phylogenetic_tree(char *phylip_name,char *clustal_name,char *dist_name, char *nexus_name, char *pim_name);
-void bootstrap_tree(char *phylip_name,char *clustal_name, char *nexus_name);
-sint dna_distance_matrix(FILE *tree);
-sint prot_distance_matrix(FILE *tree);
-void guide_tree(FILE *tree,int first_seq,sint nseqs);
-
-void calc_percidentity(FILE *pfile);
-
-/* util.c */
-
-void alloc_aln(sint nseqs);
-void realloc_aln(sint first_seq,sint nseqs);
-void free_aln(sint nseqs);
-void alloc_seq(sint seq_no,sint length);
-void realloc_seq(sint seq_no,sint length);
-void free_seq(sint seq_no);
-
-void *ckalloc(size_t bytes);
-void *ckrealloc(void *ptr, size_t bytes);
-void *ckfree(void *ptr);
-char prompt_for_yes_no(char *title,char *prompt);
-void fatal(char *msg, ...);
-void error(char *msg, ...);
-void warning(char *msg, ...);
-void info(char *msg, ...);
-char *rtrim(char *str);
-char *blank_to_(char *str);
-char *upstr(char *str);
-char *lowstr(char *str);
-void getstr(char *instr, int n, char *outstr);
-double getreal(char *instr, double minx, double maxx, double def);
-int getint(char *instr, int minx, int maxx, int def);
-void do_system(void);
-Boolean linetype(char *line, char *code);
-Boolean keyword(char *line, char *code);
-Boolean blankline(char *line);
-void get_path(char *str, char *path);
-
-
Deleted: trunk/packages/clustalw/trunk/clustalw.ms
===================================================================
--- trunk/packages/clustalw/trunk/clustalw.ms 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/clustalw.ms 2007-08-12 15:08:40 UTC (rev 399)
@@ -1,794 +0,0 @@
-This is just an ASCII text version of the manuscript describing
-Clustal W, without the figures. It was published:
-
-Nucleic Acids Research, 22(22):4673-4680.
-
-
-
-CLUSTAL W: improving the sensitivity of progressive multiple
-sequence alignment through sequence weighting, position specific
-gap penalties and weight matrix choice.
-
-
-
-Julie D. Thompson, Desmond G. Higgins1 and Toby J. Gibson*
-
-European Molecular Biology Laboratory
-Postfach 102209
-Meyerhofstrasse 1
-D-69012 Heidelberg
-Germany
-
-
-Phone: +49-6221-387398
-Fax: +49-6221-387306
-E-mail: Gibson at EMBL-Heidelberg.DE
- Des.Higgins at EBI.AC.UK
- Thompson at EMBL-Heidelberg.DE
-
-
-Keywords: Multiple alignment, phylogenetic tree, weight matrix, gap
- penalty, dynamic programming, sequence weighting.
-
-
-1 Current address:
-European Bioinformatics Institute
-Hinxton Hall
-Hinxton
-Cambridge CB10 1RQ
-UK.
-
-* To whom correspondence should be addressed
-
-
-ABSTRACT
-
-The sensitivity of the commonly used progressive multiple sequence
-alignment method has been greatly improved for the alignment of divergent
-protein sequences. Firstly, individual weights are assigned to each sequence
-in a partial alignment in order to downweight near-duplicate sequences and
-upweight the most divergent ones. Secondly, amino acid substitution
-matrices are varied at different alignment stages according to the divergence
-of the sequences to be aligned. Thirdly, residue specific gap penalties and
-locally reduced gap penalties in hydrophilic regions encourage new gaps in
-potential loop regions rather than regular secondary structure. Fourthly,
-positions in early alignments where gaps have been opened receive locally
-reduced gap penalties to encourage the opening up of new gaps at these
-positions. These modifications are incorporated into a new program,
-CLUSTAL W which is freely available.
-
-
-INTRODUCTION
-
-The simultaneous alignment of many nucleotide or amino acid sequences is
-now an essential tool in molecular biology. Multiple alignments are used to
-find diagnostic patterns to characterise protein families; to detect or
-demonstrate homology between new sequences and existing families of
-sequences; to help predict the secondary and tertiary structures of new
-sequences; to suggest oligonucleotide primers for PCR; as an essential prelude
-to molecular evolutionary analysis. The rate of appearance of new sequence
-data is steadily increasing and the development of efficient and accurate
-automatic methods for multiple alignment is, therefore, of major
-importance. The majority of automatic multiple alignments are now carried
-out using the "progressive" approach of Feng and Doolittle (1). In this paper,
-we describe a number of improvements to the progressive multiple
-alignment method which greatly improve the sensitivity without sacrificing
-any of the speed and efficiency which makes this approach so practical. The
-new methods are made available in a program called CLUSTAL W which is
-freely available and portable to a wide variety of computers and operating
-systems.
-
-In order to align just two sequences, it is standard practice to use dynamic
-programming (2). This guarantees a mathematically optimal alignment,
-given a table of scores for matches and mismatches between all amino acids
-or nucleotides (e.g. the PAM250 matrix (3) or BLOSUM62 matrix (4)) and
-penalties for insertions or deletions of different lengths. Attempts at
-generalising dynamic programming to multiple alignments are limited to
-small numbers of short sequences (5). For much more than eight or so
-proteins of average length, the problem is uncomputable given current
-computer power. Therefore, all of the methods capable of handling larger
-problems in practical timescales, make use of heuristics. Currently, the most
-widely used approach is to exploit the fact that homologous sequences are
-evolutionarily related. One can build up a multiple alignment progressively
-by a series of pairwise alignments, following the branching order in a
-phylogenetic tree (1). One first aligns the most closely related sequences,
-gradually adding in the more distant ones. This approach is sufficiently fast
-to allow alignments of virtually any size. Further, in simple cases, the
-quality of the alignments is excellent, as judged by the ability to correctly align
-corresponding domains from sequences of known secondary or tertiary
-structure (6). In more difficult cases, the alignments give good starting points
-for further automatic or manual refinement.
-
-This approach works well when the data set consists of sequences of different
-degrees of divergence. Pairwise alignment of very closely related sequences
-can be carried out very accurately. The correct answer may often be obtained
-using a wide range of parameter values (gap penalties and weight matrix). By
-the time the most distantly related sequences are aligned, one already has a
-sample of aligned sequences which gives important information about the
-variability at each position. The positions of the gaps that were introduced
-during the early alignments of the closely related sequences are not changed
-as new sequences are added. This is justified because the placement of gaps
-in alignments between closely related sequences is much more accurate than
-between distantly related ones. When all of the sequences are highly
-divergent (e.g. less than approximately 25-30% identity between any pair of
-sequences), this progressive approach becomes much less reliable.
-
-There are two major problems with the progressive approach: the local
-minimum problem and the choice of alignment parameters. The local
-minimum problem stems from the "greedy" nature of the alignment strategy.
-The algorithm greedily adds sequences together, following the initial tree.
-There is no guarantee that the global optimal solution, as defined by some
-overall measure of multiple alignment quality (7,8), or anything close to it,
-will be found. More specifically, any mistakes (misaligned regions) made
-early in the alignment process cannot be corrected later as new information
-from other sequences is added. This problem is frequently thought of as
-mainly resulting from an incorrect branching order in the initial tree. The
-initial trees are derived from a matrix of distances between separately aligned
-pairs of sequences and are much less reliable than trees from complete
-multiple alignments. In our experience, however, the real problem is caused
-simply by errors in the initial alignments. Even if the topology of the guide
-tree is correct, each alignment step in the multiple alignment process may
-have some percentage of the residues misaligned. This percentage will be
-very low on average for very closely related sequences but will increase as
-sequences diverge. It is these misalignments which carry through from the
-early alignment steps that cause the local minimum problem. The only way
-to correct this is to use an iterative or stochastic sampling procedure (e.g.
-7,9,10). We do not directly address this problem in this paper.
-
-The alignment parameter choice problem is, in our view, at least as serious as
-the local minimum problem. Stochastic or iterative algorithms will be just
-as badly affected as progressive ones if the parameters are inappropriate: they
-will arrive at a false global minimum. Traditionally, one chooses one weight
-matrix and two gap penalties (one for opening a new gap and one for
-extending an existing gap) and hopes that these will work well over all parts of
-all the sequences in the data set. When the sequences are all closely related,
-this works. The first reason is that virtually all residue weight matrices give
-most weight to identities. When identities dominate an alignment, almost
-any weight matrix will find approximately the correct solution. With very
-divergent sequences, however, the scores given to non-identical residues will
-become critically important; there will be more mismatches than identities.
-Different weight matrices will be optimal at different evolutionary distances
-or for different classes of proteins.
-
-The second reason is that the range of gap penalty values that will find the
-correct or best possible solution can be very broad for highly similar sequences
-(11). As more and more divergent sequences are used, however, the exact
-values of the gap penalties become important for success. In each case, there
-may be a very narrow range of values which will deliver the best alignment.
-Further, in protein alignments, gaps do not occur randomly (i.e. with equal
-probability at all positions). They occur far more often between the major
-secondary structural elements of alpha helices and beta strands than within
-(12).
-
-The major improvements described in this paper attempt to address the
-alignment parameter choice problem. We dynamically vary the gap
-penalties in a position and residue specific manner. The observed relative
-frequencies of gaps adjacent to each of the 20 amino acids (12) are used to
-locally adjust the gap opening penalty after each residue. Short stretches of
-hydrophilic residues (e.g. 5 or more) usually indicate loop or random coil
-regions and the gap opening penalties are locally reduced in these stretches.
-In addition, the locations of the gaps found in the early alignments are also
-given reduced gap opening penalties. It has been observed in alignments
-between sequences of known structure that gaps tend not to be closer than
-roughly eight residues on average (12). We increase the gap opening penalty
-within eight residues of existing gaps. The two main series of amino acid
-weight matrices that are used today are the PAM series (3) and the BLOSUM
-series (4). In each case, there is a range of matrices to choose from. Some
-matrices are appropriate for aligning very closely related sequences where
-most weight by far is given to identities, with only the most frequent
-conservative substitutions receiving high scores. Other matrices work better
-at greater evolutionary distances where less importance is attached to
-identities (13). We choose different weight matrices, as the alignment
-proceeds, depending on the estimated divergence of the sequences to be
-aligned at each stage.
-
-Sequences are weighted to correct for unequal sampling across all
-evolutionary distances in the data set (14). This downweights sequences that
-are very similar to other sequences in the data set and upweights the most
-divergent ones. The weights are calculated directly from the branch lengths
-in the initial guide tree (15). Sequence weighting has already been shown to
-be effective in improving the sensitivity of profile searches (15,16). In the
-original CLUSTAL programs (17-19), the initial guide trees, used to guide the
-multiple alignment, were calculated using the UPGMA method (20). We
-now use the Neighbour-Joining method (21) which is more robust against the
-effects of unequal evolutionary rates in different lineages and which gives
-better estimates of individual branch lengths. This is useful because it is these
-branch lengths which are used to derive the sequence weights. We also allow
-users to choose between fast approximate alignments (22) or full dynamic
-programming for the distance calculations used to make the guide tree.
-
-The new improvements dramatically improve the sensitivity of the
-progressive alignment method for difficult alignments involving highly
-diverged sequences. We show one very demanding test case of over 60 SH3
-domains (23) which includes sequence pairs with as little as 12% identity and
-where there is only one exactly conserved residue across all of the sequences.
-Using default parameters, we can achieve an alignment that is almost exactly
-correct, according to available structural information (24). Using the program
-in a wide variety of situations, we find that it will normally find the correct
-alignment, in all but the most difficult and pathological of cases.
-
-
-MATERIAL AND METHODS
-
-
-The basic alignment method
-
-The basic multiple alignment algorithm consists of three main stages: 1) all
-pairs of sequences are aligned separately in order to calculate a distance matrix
-giving the divergence of each pair of sequences; 2) a guide tree is calculated
-from the distance matrix; 3) the sequences are progressively aligned according
-to the branching order in the guide tree. An example using 7 globin
-sequences of known tertiary structure (25) is given in figure 1.
-
-
-1) The distance matrix/pairwise alignments
-
-In the original CLUSTAL programs, the pairwise distances were calculated
-using a fast approximate method (22). This allows very large numbers of
-sequences to be aligned, even on a microcomputer. The scores are calculated
-as the number of k-tuple matches (runs of identical residues, typically 1 or 2
-long for proteins or 2 to 4 long for nucleotide sequences) in the best alignment
-between two sequences minus a fixed penalty for every gap. We now offer a
-choice between this method and the slower but more accurate scores from full
-dynamic programming alignments using two gap penalties (for opening or
-extending gaps) and a full amino acid weight matrix. These scores are
-calculated as the number of identities in the best alignment divided by the
-number of residues compared (gap positions are excluded). Both of these
-scores are initially calculated as percent identity scores and are converted to
-distances by dividing by 100 and subtracting from 1.0 to give number of
-differences per site. We do not correct for multiple substitutions in these
-initial distances. In figure 1 we give the 7x7 distance matrix between the 7
-globin sequences calculated using the full dynamic programming method.
-
-
-2) The guide tree
-
-The trees used to guide the final multiple alignment process are calculated
-from the distance matrix of step 1 using the Neighbour-Joining method (21).
-This produces unrooted trees with branch lengths proportional to estimated
-divergence along each branch. The root is placed by a "mid-point" method
-(15) at a position where the means of the branch lengths on either side of the
-root are equal. These trees are also used to derive a weight for each sequence
-(15). The weights are dependent upon the distance from the root of the tree
-but sequences which have a common branch with other sequences share the
-weight derived from the shared branch. In the example in figure 1, the
-leghaemoglobin (Lgb2_Luplu) gets a weight of 0.442 which is equal to the
-length of the branch from the root to it. The Human beta globin
-(Hbb_Human) gets a weight consisting of the length of the branch leading to
-it that is not shared with any other sequences (0.081) plus half the length of
-the branch shared with the horse beta globin (0.226/2) plus one quarter the
-length of the branch shared by all four haemoglobins (0.061/4) plus one fifth
-the branch shared between the haemoglobins and the myoglobin (0.015/5)
-plus one sixth the branch leading to all the vertebrate globins (0.062/6). This
-sums to a total of 0.221. By contrast, in the normal progressive alignment
-algorithm, all sequences would be equally weighted. The rooted tree with
-branch lengths and sequence weights for the 7 globins is given in figure 1.
-
-
-3) Progressive alignment
-
-The basic procedure at this stage is to use a series of pairwise alignments to
-align larger and larger groups of sequences, following the branching order in
-the guide tree. You proceed from the tips of the rooted tree towards the root.
-In the globin example in figure 1 you align the sequences in the following
-order: human vs. horse beta globin; human vs. horse alpha globin; the 2
-alpha globins vs. the 2 beta globins; the myoglobin vs. the haemoglobins; the
-cyanohaemoglobin vs the haemoglobins plus myoglobin; the leghaemoglobin
-vs. all the rest. At each stage a full dynamic programming (26,27) algorithm is
-used with a residue weight matrix and penalties for opening and extending
-gaps. Each step consists of aligning two existing alignments or sequences.
-Gaps that are present in older alignments remain fixed. In the basic
-algorithm, new gaps that are introduced at each stage get full gap opening and
-extension penalties, even if they are introduced inside old gap positions (see
-the section on gap penalties below for modifications to this rule). In order to
-calculate the score between a position from one sequence or alignment and
-one from another, the average of all the pairwise weight matrix scores from
-the amino acids in the two sets of sequences is used i.e. if you align 2
-alignments with 2 and 4 sequences respectively, the score at each position is
-the average of 8 (2x4) comparisons. This is illustrated in figure 2. If either set
-of sequences contains one or more gaps in one of the positions being
-considered, each gap versus a residue is scored as zero. The default amino
-acid weight matrices we use are rescored to have only positive values.
-Therefore, this treatment of gaps treats the score of a residue versus a gap as
-having the worst possible score. When sequences are weighted (see
-improvements to progressive alignment, below), each weight matrix value is
-multiplied by the weights from the 2 sequences, as illustrated in figure 2.
-
-
-Improvements to progressive alignment
-
-All of the remaining modifications apply only to the final progressive
-alignment stage. Sequence weighting is relatively straightforward and is
-already widely used in profile searches (15,16). The treatment of gap penalties
-is more complicated. Initial gap penalties are calculated depending on the
-weight matrix, the similarity of the sequences, and the length of the
-sequences. Then, an attempt is made to derive sensible local gap opening
-penalties at every position in each pre-aligned group of sequences that will
-vary as new sequences are added. The use of different weight matrices as the
-alignment progresses is novel and largely by-passes the problem of initial
-choice of weight matrix. The final modification allows us to delay the
-addition of very divergent sequences until the end of the alignment process
-when all of the more closely related sequences have already been aligned.
-
-
-Sequence weighting
-
-Sequence weights are calculated directly from the guide tree. The weights
-are normalised such that the biggest one is set to 1.0 and the rest are all less
-than one. Groups of closely related sequences receive lowered weights
-because they contain much duplicated information. Highly divergent
-sequences without any close relatives receive high weights. These weights
-are used as simple multiplication factors for scoring positions from different
-sequences or prealigned groups of sequences. The method is illustrated in
-figure 2. In the globin example in figure 1, the two alpha globins get
-downweighted because they are almost duplicate sequences (as do the two
-beta globins); they receive a combined weight of only slightly more than if a
-single alpha globin was used.
-
-
-Initial gap penalties
-
-Initially, two gap penalties are used: a gap opening penalty (GOP) which gives
-the cost of opening a new gap of any length and a gap extension penalty (GEP)
-which gives the cost of every item in a gap. Initial values can be set by the
-user from a menu. The software then automatically attempts to choose
-appropriate gap penalties for each sequence alignment, depending on the
-following factors.
-
-1) Dependence on the weight matrix
-
-It has been shown (16,28) that varying the gap penalties used with different
-weight matrices can improve the accuracy of sequence alignments. Here, we
-use the average score for two mismatched residues (i.e. off-diagonal values in
-the matrix) as a scaling factor for the GOP.
-
-2) Dependence on the similarity of the sequences
-
-The percent identity of the two (groups of) sequences to be aligned is used to
-increase the GOP for closely related sequences and decrease it for more
-divergent sequences on a linear scale.
-
-3) Dependence on the lengths of the sequences
-
-The scores for both true and false sequence alignments grow with the length
-of the sequences. We use the logarithm of the length of the shorter sequence
-to increase the GOP with sequence length.
-
-Using these three modifications, the initial GOP calculated by the program is:
-
-GOP->(GOP+log(MIN(N,M))) * (average residue mismatch score) *
- (percent identity scaling factor)
-where N, M are the lengths of the two sequences.
-
-4) Dependence on the difference in the lengths of the sequences
-
-The GEP is modified depending on the difference between the lengths of the
-two sequences to be aligned. If one sequence is much shorter than the other,
-the GEP is increased to inhibit too many long gaps in the shorter sequence.
-The initial GEP calculated by the program is:
-
-GEP -> GEP*(1.0+|log(N/M)|)
-where N, M are the lengths of the two sequences.
-
-
-Position-specific gap penalties
-
- In most dynamic programming applications, the initial gap opening and
-extension penalties are applied equally at every position in the sequence,
-regardless of the location of a gap, except for terminal gaps which are usually
-allowed at no cost. In CLUSTAL W, before any pair of sequences or
-prealigned groups of sequences are aligned, we generate a table of gap opening
-penalties for every position in the two (sets of) sequences. An example is
-shown in figure 3. We manipulate the initial gap opening penalty in a
-position specific manner, in order to make gaps more or less likely at different
-positions.
-
-The local gap penalty modification rules are applied in a hierarchical manner.
-The exact details of each rule are given below. Firstly, if there is a gap at a
-position, the gap opening and gap extension penalties are lowered; the other
-rules do not apply. This makes gaps more likely at positions where there are
-already gaps. If there is no gap at a position, then the gap opening penalty is
-increased if the position is within 8 residues of an existing gap. This
-discourages gaps that are too close together. Finally, at any position within a
-run of hydrophilic residues, the penalty is decreased. These runs usually
-indicate loop regions in protein structures. If there is no run of hydrophilic
-residues, the penalty is modified using a table of residue specific gap
-propensities (12). These propensities were derived by counting the frequency
-of each residue at either end of gaps in alignments of proteins of known
-structure. An illustration of the application of these rules from one part of
-the globin example, in figure 1, is given in figure 3.
-
-1) Lowered gap penalties at existing gaps
-
-If there are already gaps at a position, then the GOP is reduced in proportion
-to the number of sequences with a gap at this position and the GEP is lowered
-by a half. The new gap opening penalty is calculated as:
-
-GOP -> GOP*0.3*(no. of sequences without a gap/no. of sequences).
-
-2) Increased gap penalties near existing gaps
-
-If a position does not have any gaps but is within 8 residues of an existing gap,
-the GOP is increased by:
-
-GOP -> GOP*(2+((8-distance from gap)*2)/8)
-
-3) Reduced gap penalties in hydrophilic stretches
-
-Any run of 5 hydrophilic residues is considered to be a hydrophilic stretch.
-The residues that are to be considered hydrophilic may be set by the user but
-are conservatively set to D, E, G, K, N, Q, P, R or S by default. If, at any
-position, there are no gaps and any of the sequences has such a stretch, the
-GOP is reduced by one third.
-
-
-4) Residue specific penalties
-
-If there is no hydrophilic stretch and the position does not contain any gaps,
-then the GOP is multiplied by one of the 20 numbers in table 1, depending on
-the residue. If there is a mixture of residues at a position, the multiplication
-factor is the average of all the contributions from each sequence.
-
-
-Weight matrices
-
-Two main series of weight matrices are offered to the user: the Dayhoff PAM
-series (3) and the BLOSUM series (4). The default is the BLOSUM series. In
-each case, there is a choice of matrix ranging from strict ones, useful for
-comparing very closely related sequences to very "soft" ones that are useful
-for comparing very distantly related sequences. Depending on the distance
-between the two sequences or groups of sequences to be compared, we switch
-between 4 different matrices. The distances are measured directly from the
-guide tree. The ranges of distances and tables used with the PAM series of
-matrices are: 80-100%:PAM20, 60-80%:PAM60, 40-60%:PAM120, 0-40%:PAM350.
-The range used with the BLOSUM series is: 80-100%:BLOSUM80,
-60-80%:BLOSUM62, 30-60%:BLOSUM45, 0-30%:BLOSUM30.
-
-
-Divergent sequences
-
-The most divergent sequences (most different, on average from all of the
-other sequences) are usually the most difficult to align correctly. It is
-sometimes better to delay the incorporation of these sequences until all of the
-more easily aligned sequences are merged first. This may give a better chance
-of correctly placing the gaps and matching weakly conserved positions against
-the rest of the sequences. A choice is offered to set a cut off (default is 40%
-identity or less with any other sequence) that will delay the alignment of the
-divergent sequences until all of the rest have been aligned.
-
-
-Software and Algorithms
-
-
-Dynamic Programming
-
-The most demanding part of the multiple alignment strategy, in terms of
-computer processing and memory usage, is the alignment of two (groups of)
-sequences at each step in the final progressive alignment. To make it
-possible to align very long sequences (e.g. dynein heavy chains at ~ 5,000
-residues) in a reasonable amount of memory, we use the memory efficient
-dynamic programming algorithm of Myers and Miller (26). This sacrifices
-some processing time but makes very large alignments practical in very little
-memory. One disadvantage of this algorithm is that it does not allow
-different gap opening and extension penalties at each position. We have
-modified the algorithm so as to allow this and the details are described in a
-separate paper (27).
-
-
-
-Menus/file formats
-
-Six different sequence input formats are detected automatically and read by
-the program: EMBL/Swiss Prot, NBRF/PIR, Pearson/FASTA (29), GCG/MSF
-(30), GDE (Steven Smith, Harvard University Genome Center) and CLUSTAL
-format alignments. The last three formats allow users to read in complete
-alignments (e.g. for calculating phylogenetic trees or for addition of new
-sequences to an existing alignment). Alignment output may be requested in
-standard CLUSTAL format (self-explanatory blocked alignments) or in
-formats compatible with the GDE, PHYLIP (31) or GCG (30) packages. The
-program offers the user the ability to calculate Neighbour-Joining
-phylogenetic trees from existing alignments with options to correct for
-multiple hits (32,33) and to estimate confidence levels using a bootstrap
-resampling procedure (34). The trees may be output in the "New
-Hampshire" format that is compatible with the PHYLIP package (31).
-
-Alignment to an alignment
-
-Profile alignment is used to align two existing alignments (either of which
-may consist of just one sequence) or to add a series of new sequences to an
-existing alignment. This is useful because one may wish to build up a
-multiple alignment gradually, choosing different parameters manually, or
-correcting intermediate errors as the alignment proceeds. Often, just a few
-sequences cause misalignments in the progressive algorithm and these can be
-removed from the process and then added at the end by profile alignment. A
-second use is where one has a high quality reference alignment and wishes to
-keep it fixed while adding new sequences automatically.
-
-
-Portability/Availability
-
-The full source code of the package is provided free to academic users. The
-program will run on any machine with a full ANSI conforming C compiler.
-It has been tested on the following hardware/software combinations:
-Decstation/Ultrix, Vax or ALPHA/VMS, Silicon Graphics/IRIX. The source
-code and documentation are available by E-mail from the EMBL file server
-(send the words HELP and HELP SOFTWARE on two lines to the internet
-address:
-Netserv at EMBL-Heidelberg.DE) or by anonymous FTP from
-FTP.EMBL-Heidelberg.DE. Queries may be addressed by E-mail to
-Des.Higgins at EBI.AC.UK or Gibson at EMBL-Heidelberg.DE.
-
-
-RESULTS AND DISCUSSION
-
-
-Alignment of SH3 Domains
-
-The ~60 residue SH3 domain was chosen to illustrate the performance of
-CLUSTAL W, as there is a reference manual alignment (23) and the fold is
-known (24). SH3 domains, with a minimum similarity below 12% identity,
-are poorly aligned by progressive alignment programs such as CLUSTAL V
-and PILEUP: neither program can generate the correct blocks corresponding to
-the secondary structure elements.
-
-Figure 4 shows an alignment generated by CLUSTAL W of the example set of
-SH3 domains. The alignment was generated in two steps. After progressive
-alignment, five blocks were produced, corresponding to structural elements,
-with gaps inserted exclusively in the known loop regions. The beta strands in
-blocks 1, 4 and 5 were all correctly superposed. However, four sequences in
-block 2 and one sequence in block 3 were misaligned by 1-2 residues
-(underlined in figure 4). A second progressive alignment of the aligned
-sequences, including the gaps, improved this alignment: A single misaligned
-sequence, H_P55, remains in block 2 (boxed in figure 4), while block 3 is now
-completely aligned. This alignment corrects several errors (e.g. P85A, P85B
-and FUS1) in the manual alignment (23).
-
-The SH3 alignment illustrates several features of CLUSTAL W usage. Firstly,
-in a practical application involving divergent sequences, the initial
-progressive alignment is likely to be a good but not perfect approximation to
-the correct alignment. The alignment quality can be improved in a number of
-ways. If the block structure of the alignment appears to be correct, realignment
-of the alignment will usually improve most of the misaligned blocks: the
-existing gaps allow the blocks to "float" cheaply to a locally optimal position
-without disturbing the rest of the alignment. Remaining sequences which are
-doubtfully aligned can then be individually tested by profile alignment to the
-remainder: the misaligned H_P55 SH3 domain can be correctly aligned by
-profile (with GOP <= 8). The indel regions in the final alignment can then be
-manually cleaned up: Usually the exact alignment in the loop regions is not
-determinable, and may have no meaning in structural terms. It is then
-desirable to have a single gap per structural loop. CLUSTAL W achieved this
-for two of the four SH3 loop regions (figure 4).
-
-If the block structure of the alignment appears suspect, greater intervention by
-the user may be required. The most divergent sequences, especially if they
-have large insertions (which can be discerned with the aid of dot matrix
-plots), should be left out of the progressive alignment. If there are sets of
-closely related sequences that are deeply diverged from other sets, these can be
-separately aligned and then merged by profile alignment. Incorrectly
-determined sequences, containing frameshifts, can also confound regions of
-an alignment: these can be hard to detect but sometimes they have been
-grouped within the excluded divergent sequences: then they may be revealed
-when they are individually compared to the alignment as having apparently
-nonsense segments with respect to the other sequences.
-
-
-
-Finding the best alignment
-
-In cases where all of the sequences in a data set are very similar (e.g. no pair
-less than 35% identical), CLUSTAL W will find an alignment which is
-difficult to improve by eye. In this sense, the alignment is optimal with
-regard to the alternative of manual alignment. Mathematically, this is vague
-and can only be put on a more systematic footing by finding an objective
-function (a measure of multiple alignment quality) that exactly mirrors the
-information used by an "expert" to evaluate an alignment. Nonetheless, if an
-alignment is impossible to improve by eye, then the program has achieved a
-very useful result.
-
-In more difficult cases, as more divergent sequences are included, it becomes
-increasingly difficult to find good alignments and to evaluate them. What
-we find with CLUSTAL W is that the basic block-like structure of the
-alignment (corresponding to the major secondary structure elements) is
-usually recovered, with some of the most divergent sequences misaligned in
-small regions. This is a very useful starting point for manual refinement as it
-helps define the major blocks of similarity. The problem sequences can be
-removed from the analysis and realigned to the rest of the sequences
-automatically or with different parameter settings. An examination of the
-tree used to guide the alignment will usually show which sequences will be
-most unreliably placed (those that branch off closest to the root and/or those
-that align to other single sequences at a very low level of sequence identity
-rather than align to a group of pre-aligned sequences). Finally, one can
-simply iterate the multiple alignment process by feeding an output alignment
-back into CLUSTAL W and repeating the multiple alignment process (using
-the same or different parameters). The SH3 domain alignment in figure 4
-was derived in this way by 2 passes using default parameters. In the second
-pass, the local gap penalties are dominated by the placement of the initial
-major gap positions. The alignment will either remain unchanged or will
-converge rapidly (after 1 or 2 extra passes) on a better solution. If the
-placement of the initial gaps is approximately correct but some of the
-sequences are locally misaligned, this works well.
-
-
-Comparison with other methods
-
-Recently, several papers have addressed the problem of position specific
-parameters for multiple alignment. In one case (35), local gap penalties are
-increased in alpha helical and beta strand regions, when the 3-D structures of
-one or more of the sequences are known. In a second case (36), a hidden
-Markov model was used to estimate position specific gap penalties and
-residue substitution weight matrices when large numbers of examples of a
-protein domain were known. With CLUSTAL W, we attempt to derive the
-same information purely from the set of sequences to be aligned. Therefore,
-we can apply the method to any set of sequences. The success of this approach
-will depend on the number of available sequences and their evolutionary
-relationships. It will also depend on the decision making process during
-multiple alignment (e.g. when to change weight matrix) and the accuracy and
-appropriateness of our parameterisation. In the long term, this can only be
-evaluated by exhaustive testing of sets of sequences where the correct
-alignment (or parts of it) are known from structural information. What is
-clear, however, is that the modifications described here significantly improve
-the sensitivity of the progressive multiple alignment approach. This is
-achieved with almost no sacrifice in speed and efficiency.
-
-There are several areas where further improvements in sensitivity and
-accuracy can be made. Firstly, the residue weight matrices and gap settings
-can be made more accurate as more and more data accumulate, while
-matrices for specific sequence types can be derived (e.g. for transmembrane
-regions (37)). Secondly, stochastic or iterative optimisation methods can be
-used to refine initial alignments (7,9,10). CLUSTAL W could be run with
-several sets of starting parameters and in each case, the alignments refined
-according to an objective function. The search for a good objective function,
-that takes into account the sequence and position specific information used in
-CLUSTAL W is a key area of research. Finally, the average number of
-examples of each protein domain or family is growing steadily. It is not only
-important that programs can cope with the large volumes of data that are
-being generated, they should be able to exploit the new information to make
-the alignments more and more accurate. Globally optimal alignments
-(according to an objective function) may not always be possible but the
-problem may be avoided if sufficiently large volumes of data become
-available. CLUSTAL W is a step in this direction.
-
-ACKNOWLEDGEMENTS
-
-Numerous people have offered advice and suggestions for improvements to
-earlier versions of the CLUSTAL programs. D.H. wishes to apologise to all of
-the irate CLUSTAL V users who had to live with the bugs and lack of facilities
-for getting trees in the New Hampshire format. We wish to specifically thank
-Jeroen Coppieters who suggested using a series of weight matrices and Steven
-Henikoff for advice on using the BLOSUM matrices. We are grateful to Rein
-Aasland, Peer Bork, Ariel Blocker and Bertrand Seraphin for providing
-challenging alignment problems. T.G. and J.T. thank Kevin Leonard for
-support and encouragement. Finally, we thank all of the people who were
-involved with various CLUSTAL programs over the years, namely: Paul
-Sharp, Rainer Fuchs and Alan Bleasby.
-
-
-REFERENCES
-
- 1.Feng, D.-F. and Doolittle, R.F. (1987). J. Mol. Evol. 25, 351-360.
- 2.Needleman, S.B. and Wunsch, C.D. (1970). J. Mol. Biol. 48, 443-453.
- 3.Dayhoff, M.O., Schwartz, R.M. and Orcutt, B.C. (1978) in Atlas of Protein
-Sequence and Structure, vol. 5, suppl. 3 (Dayhoff, M.O., ed.), pp 345-352,
-NBRF, Washington.
- 4.Henikoff, S. and Henikoff, J.G. (1992). Proc. Natl. Acad. Sci. USA 89, 10915-
-10919.
- 5.Lipman, D.J., Altschul, S.F. and Kececioglu, J.D. (1989). Proc. Natl. Acad. Sci.
-USA 86, 4412-4415.
- 6.Barton, G.J. and Sternberg, M.J.E. (1987). J. Mol. Biol. 198, 327-337.
- 7.Gotoh, O. (1993). CABIOS 9, 361-370.
- 8.Altschul, S.F. (1989). J. Theor. Biol. 138, 297-309.
- 9.Lukashin, A.V., Engelbrecht, J. and Brunak, S. (1992). Nucl. Acids Res. 20,
-2511-2516.
-10.Lawrence, C.E., Altschul, S.F., Boguski, M.S., Liu, J.S., Neuwald, A.F. and
-Wootton, J.C. (1993). Science, 262, 208-214.
-11.Vingron, M. and Waterman, M.S. (1993). J. Mol. Biol. 234, 1-12.
-12.Pascarella, S. and Argos, P. (1992). J. Mol. Biol. 224, 461-471.
-13.Collins, J.F. and Coulson, A.F.W. (1987). In Nucleic acid and protein
-sequence analysis a practical approach, Bishop, M.J. and Rawlings, C.J. ed.,
-chapter 13, pp. 323-358.
-14.Vingron, M. and Sibbald, P.R. (1993). Proc. Natl. Acad. Sci. USA, 90, 8777-
-8781.
-15.Thompson, J.D., Higgins, D.G. and Gibson, T.J. (1994). CABIOS, 10, 19-29.
-16.Lüthy, R., Xenarios, I. and Bucher, P. (1994). Protein Science, 3, 139-146.
-17.Higgins, D.G. and Sharp, P.M. (1988). Gene, 73, 237-244.
-18.Higgins, D.G. and Sharp, P.M. (1989). CABIOS, 5, 151-153.
-19.Higgins, D.G., Bleasby, A.J. and Fuchs, R. (1992). CABIOS, 8, 189-191.
-20.Sneath, P.H.A. and Sokal, R.R. (1973). Numerical Taxonomy, W.H.
-Freeman, San Francisco.
-21.Saitou, N. and Nei, M. (1987). Mol. Biol. Evol. 4, 406-425.
-22.Wilbur, W.J. and Lipman, D.J. (1983). Proc. Natl. Acad. Sci. USA, 80, 726-
-730.
-23.Musacchio, A., Gibson, T., Lehto, V.-P. and Saraste, M. (1992). FEBS Lett.
-307, 55-61.
-24.Musacchio, A., Noble, M., Pauptit, R., Wierenga, R. and Saraste, M. (1992).
-Nature, 359, 851-855.
-25.Bashford, D., Chothia, C. and Lesk, A.M. (1987). J. Mol. Biol. 196, 199-216.
-26.Myers, E.W. and Miller, W. (1988). CABIOS, 4, 11-17.
-27.Thompson, J.D. (1994). CABIOS, (Submitted).
-28.Smith, T.F., Waterman, M.S. and Fitch, W.M. (1981). J. Mol. Evol. 18, 38-46.
-29.Pearson, W.R. and Lipman, D.J. (1988). Proc. Natl. Acad. Sci. USA. 85, 2444-
-2448.
-30.Devereux, J., Haeberli, P. and Smithies, O. (1984). Nucleic Acids Res. 12,
-387-395.
-31.Felsenstein, J. (1989). Cladistics 5, 164-166.
-32.Kimura, M. (1980). J. Mol. Evol. 16, 111-120.
-33.Kimura, M. (1983). The Neutral Theory of Molecular Evolution.
-Cambridge University Press, Cambridge.
-34.Felsenstein, J. (1985). Evolution 39, 783-791.
-35.Smith, R.F. and Smith, T.F. (1992) Protein Engineering 5, 35-41.
-36.Krogh, A., Brown, M., Mian, S., Sjölander, K. and Haussler, D. (1994) J. Mol.
-Biol. 235, 1501-1531.
-37.Jones, D.T., Taylor, W.R. and Thornton, J.M. (1994). FEBS Lett. 339, 269-275.
-38.Bairoch, A. and Boeckmann, B. (1992) Nucleic Acids Res., 20, 2019-2022.
-39.Noble, M.E.M., Musacchio, A., Saraste, M., Courtneidge, S.A. and
-Wierenga, R.K. (1993) EMBO J. 12, 2617-2624.
-40.Kabsch, W. and Sander, C. (1983) Biopolymers, 22, 2577-2637.
-
-FIGURE LEGENDS
-
-Figure 1. The basic progressive alignment procedure, illustrated using a set of
-7 globins of known tertiary structure. The sequence names are from Swiss
-Prot (38): Hba_Horse: horse alpha globin; Hba_Human: human alpha globin;
-Hbb_Horse: horse beta globin; Hbb_Human: human beta globin; Myg_Phyca:
-sperm whale myoglobin; Glb5_Petma: lamprey cyanohaemoglobin;
-Lgb2_Luplu: lupin leghaemoglobin. In the distance matrix, the mean
-number of differences per residue is given. The unrooted tree shows all
-branch lengths drawn to scale. In the rooted tree, all branch lengths (mean
-number of differences per residue along each branch) are given as well as
-weights for each sequence. In the multiple alignment, the approximate
-positions of the 7 alpha helices, common to all 7 proteins are shown. This
-alignment was derived using CLUSTAL W with default parameters and the
-PAM (3) series of weight matrices.
-
-Figure 2. The scoring scheme for comparing two positions from two
-alignments. Two sections of alignment with 4 and 2 sequences respectively
-are shown. The score of the position with amino acids T,L,K,K versus the
-position with amino acids V and I is given with and without sequence
-weights. M(X,Y) is the weight matrix entry for amino acid X versus amino
-acid Y. Wn is the weight for sequence n.
-
-Figure 3. The variation in local gap opening penalty is plotted for a section of
-alignment. The initial gap opening penalty is indicated by a dotted line. Two
-hydrophilic stretches are underlined. The lowest penalties correspond to the
-ends of the alignment, the hydrophilic stretches and the two positions with
-gaps. The highest values are within 8 residues of the two gap positions. The
-rest of the variation is caused by the residue specific gap penalties (12).
-
-Figure 4. CLUSTAL W Alignment of a set of SH3 domains taken from (23).
-Secondary structure assignments for the solved Spectrin (24) and Fyn (39)
-domains are according to DSSP (40). The alignment was generated in two
-steps using default parameters. After full multiple alignment, the aligned
-sequences were realigned. Segments which were correctly aligned in the
-second pass are underlined. The single misaligned segment in H_P55 and the
-misaligned residue in H_NCK/2 are boxed.
-
-The sequences are coloured to illustrate significant features. All G (orange)
-and P (yellow) are coloured. Other residues matching a frequent occurrence of
-a property in a column are coloured: hydrophobic = blue; hydrophobic
-tendency = light blue; basic = red; acidic = purple; hydrophilic = green; White
-= unconserved. The alignment figure was prepared with the GDE sequence
-editor (S. Smith, Harvard University) and COLORMASK (J. Thompson,
-EMBL).
-
-
-
-
-Table 1. Pascarella and Argos residue specific gap modification factors.
------------------------------------------------------------------------------------
-A 1.13 M 1.29
-C 1.13 N 0.63
-D 0.96 P 0.74
-E 1.31 Q 1.07
-F 1.20 R 0.72
-G 0.61 S 0.76
-H 1.00 T 0.89
-I 1.32 V 1.25
-K 0.96 Y 1.00
-L 1.21 W 1.23
------------------------------------------------------------------------------------
-The values are normalised around a mean value of 1.0 for H. The lower the
-value, the greater the chance of having an adjacent gap. These are derived
-from the original table of relative frequencies of gaps adjacent to each residue
-(12) by subtraction from 2.0.
-
-
Deleted: trunk/packages/clustalw/trunk/clustalw.new
===================================================================
(Binary files differ)
Deleted: trunk/packages/clustalw/trunk/clustalw_help
===================================================================
--- trunk/packages/clustalw/trunk/clustalw_help 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/clustalw_help 2007-08-12 15:08:40 UTC (rev 399)
@@ -1,697 +0,0 @@
-
-This is the on-line help file for CLUSTAL W ( version 1.83).
-
-It should be named or defined as: clustalw_help
-except with MSDOS in which case it should be named CLUSTALW.HLP
-
-For full details of usage and algorithms, please read the CLUSTALW.DOC file.
-
-
-Toby Gibson EMBL, Heidelberg, Germany.
-Des Higgins UCC, Cork, Ireland.
-Julie Thompson IGBMC, Strasbourg, France.
-
-
-
->>NEW <<
-
- Fasta output
- ===========
-
- Write/Read sequence with range specified. The command line syntax
- for range specification is flexible. You can use one of the following
- syntax.
-
- -range=n:m
- -range=n-m
- -range="n m"
-
- where n is the starting position and m is the length of the sequence.
-
- Range and range numbers.
- =======================
-
- Include range numbers in the output.
-
- -seqno_range=on/off
-
- The sequence range will be appended to the names of the sequences.
-
-
- PIM: Percentage Identity Matrix
- ===============================
-
-
-
->>HELP 1 << General help for CLUSTAL W (1.81)
-
-Clustal W is a general purpose multiple alignment program for DNA or proteins.
-
-SEQUENCE INPUT: all sequences must be in 1 file, one after another.
-7 formats are automatically recognised: NBRF-PIR, EMBL-SWISSPROT,
-Pearson (Fasta), Clustal (*.aln), GCG-MSF (Pileup), GCG9-RSF and GDE flat file.
-All non-alphabetic characters (spaces, digits, punctuation marks) are ignored
-except "-" which is used to indicate a GAP ("." in MSF-RSF).
-
-To do a MULTIPLE ALIGNMENT on a set of sequences, use item 1 from this menu to
-INPUT them; go to menu item 2 to do the multiple alignment.
-
-PROFILE ALIGNMENTS (menu item 3) are used to align 2 alignments. Use this to
-add a new sequence to an old alignment, or to use secondary structure to guide
-the alignment process. GAPS in the old alignments are indicated using the "-"
-character. PROFILES can be input in ANY of the allowed formats; just
-use "-" (or "." for MSF-RSF) for each gap position.
-
-PHYLOGENETIC TREES (menu item 4) can be calculated from old alignments (read in
-with "-" characters to indicate gaps) OR after a multiple alignment while the
-alignment is still in memory.
-
-
-The program tries to automatically recognise the different file formats used
-and to guess whether the sequences are amino acid or nucleotide. This is not
-always foolproof.
-
-FASTA and NBRF-PIR formats are recognised by having a ">" as the first
-character in the file.
-
-EMBL-Swiss Prot formats are recognised by the letters
-ID at the start of the file (the token for the entry name field).
-
-CLUSTAL format is recognised by the word CLUSTAL at the beginning of the file.
-
-GCG-MSF format is recognised by one of the following:
- - the word PileUp at the start of the file.
- - the word !!AA_MULTIPLE_ALIGNMENT or !!NA_MULTIPLE_ALIGNMENT
- at the start of the file.
- - the word MSF on the first line of the file, and the characters ..
- at the end of this line.
-
-GCG-RSF format is recognised by the word !!RICH_SEQUENCE at the beginning of
-the file.
-
-
-If 85% or more of the characters in the sequence are from A,C,G,T,U or N, the
-sequence will be assumed to be nucleotide. This works in 97.3% of cases
-but watch out!
-
->>HELP 2 << Help for multiple alignments
-
-If you have already loaded sequences, use menu item 1 to do the complete
-multiple alignment. You will be prompted for 2 output files: 1 for the
-alignment itself; another to store a dendrogram that describes the similarity
-of the sequences to each other.
-
-Multiple alignments are carried out in 3 stages (automatically done from menu
-item 1 ...Do complete multiple alignments now):
-
-1) all sequences are compared to each other (pairwise alignments);
-
-2) a dendrogram (like a phylogenetic tree) is constructed, describing the
-approximate groupings of the sequences by similarity (stored in a file).
-
-3) the final multiple alignment is carried out, using the dendrogram as a guide.
-
-
-PAIRWISE ALIGNMENT parameters control the speed-sensitivity of the initial
-alignments.
-
-MULTIPLE ALIGNMENT parameters control the gaps in the final multiple alignments.
-
-
-RESET GAPS (menu item 7) will remove any new gaps introduced into the sequences
-during multiple alignment if you wish to change the parameters and try again.
-This only takes effect just before you do a second multiple alignment. You
-can make phylogenetic trees after alignment whether or not this is ON.
-If you turn this OFF, the new gaps are kept even if you do a second multiple
-alignment. This allows you to iterate the alignment gradually. Sometimes, the
-alignment is improved by a second or third pass.
-
-SCREEN DISPLAY (menu item 8) can be used to send the output alignments to the
-screen as well as to the output file.
-
-You can skip the first stages (pairwise alignments; dendrogram) by using an
-old dendrogram file (menu item 3); or you can just produce the dendrogram
-with no final multiple alignment (menu item 2).
-
-
-OUTPUT FORMAT: Menu item 9 (format options) allows you to choose from 7
-different alignment formats (CLUSTAL, GCG, NBRF-PIR, PHYLIP, GDE, NEXUS, and FASTA).
-
-
->>HELP 3 << Help for pairwise alignment parameters
-A distance is calculated between every pair of sequences and these are used to
-construct the dendrogram which guides the final multiple alignment. The scores
-are calculated from separate pairwise alignments. These can be calculated using
-2 methods: dynamic programming (slow but accurate) or by the method of Wilbur
-and Lipman (extremely fast but approximate).
-
-You can choose between the 2 alignment methods using menu option 8. The
-slow-accurate method is fine for short sequences but will be VERY SLOW for
-many (e.g. >100) long (e.g. >1000 residue) sequences.
-
-SLOW-ACCURATE alignment parameters:
- These parameters do not have any effect on the speed of the alignments.
-They are used to give initial alignments which are then rescored to give percent
-identity scores. These % scores are the ones which are displayed on the
-screen. The scores are converted to distances for the trees.
-
-1) Gap Open Penalty: the penalty for opening a gap in the alignment.
-2) Gap extension penalty: the penalty for extending a gap by 1 residue.
-3) Protein weight matrix: the scoring table which describes the similarity
- of each amino acid to each other.
-4) DNA weight matrix: the scores assigned to matches and mismatches
- (including IUB ambiguity codes).
-
-
-FAST-APPROXIMATE alignment parameters:
-
-These similarity scores are calculated from fast, approximate, global align-
-ments, which are controlled by 4 parameters. 2 techniques are used to make
-these alignments very fast: 1) only exactly matching fragments (k-tuples) are
-considered; 2) only the 'best' diagonals (the ones with most k-tuple matches)
-are used.
-
-K-TUPLE SIZE: This is the size of exactly matching fragment that is used.
-INCREASE for speed (max= 2 for proteins; 4 for DNA), DECREASE for sensitivity.
-For longer sequences (e.g. >1000 residues) you may need to increase the default.
-
-GAP PENALTY: This is a penalty for each gap in the fast alignments. It has
-little effect on the speed or sensitivity except for extreme values.
-
-TOP DIAGONALS: The number of k-tuple matches on each diagonal (in an imaginary
-dot-matrix plot) is calculated. Only the best ones (with most matches) are
-used in the alignment. This parameter specifies how many. Decrease for speed;
-increase for sensitivity.
-
-WINDOW SIZE: This is the number of diagonals around each of the 'best'
-diagonals that will be used. Decrease for speed; increase for sensitivity.
-
-
->>HELP 4 << Help for multiple alignment parameters
-
-These parameters control the final multiple alignment. This is the core of the
-program and the details are complicated. To fully understand the use of the
-parameters and the scoring system, you will have to refer to the documentation.
-
-Each step in the final multiple alignment consists of aligning two alignments
-or sequences. This is done progressively, following the branching order in
-the GUIDE TREE. The basic parameters to control this are two gap penalties and
-the scores for various identical-non-identical residues.
-
-1) and 2) The GAP PENALTIES are set by menu items 1 and 2. These control the
-cost of opening up every new gap and the cost of every item in a gap.
-Increasing the gap opening penalty will make gaps less frequent. Increasing
-the gap extension penalty will make gaps shorter. Terminal gaps are not
-penalised.
-
-3) The DELAY DIVERGENT SEQUENCES switch delays the alignment of the most
-distantly related sequences until after the most closely related sequences have
-been aligned. The setting shows the percent identity level required to delay
-the addition of a sequence; sequences that are less identical than this level
-to any other sequences will be aligned later.
-
-
-
-4) The TRANSITION WEIGHT gives transitions (A <--> G or C <--> T
-i.e. purine-purine or pyrimidine-pyrimidine substitutions) a weight between 0
-and 1; a weight of zero means that the transitions are scored as mismatches,
-while a weight of 1 gives the transitions the match score. For distantly related
-DNA sequences, the weight should be near to zero; for closely related sequences
-it can be useful to assign a higher score.
-
-
-5) PROTEIN WEIGHT MATRIX leads to a new menu where you are offered a choice of
-weight matrices. The default for proteins in version 1.8 is the PAM series
-derived by Gonnet and colleagues. Note, a series is used! The actual matrix
-that is used depends on how similar the sequences to be aligned at this
-alignment step are. Different matrices work differently at each evolutionary
-distance.
-
-6) DNA WEIGHT MATRIX leads to a new menu where a single matrix (not a series)
-can be selected. The default is the matrix used by BESTFIT for comparison of
-nucleic acid sequences.
-
-Further help is offered in the weight matrix menu.
-
-
-7) In the weight matrices, you can use negative as well as positive values if
-you wish, although the matrix will be automatically adjusted to all positive
-scores, unless the NEGATIVE MATRIX option is selected.
-
-8) PROTEIN GAP PARAMETERS displays a menu allowing you to set some Gap Penalty
-options which are only used in protein alignments.
-
-
->>HELP A << Help for protein gap parameters.
-1) RESIDUE SPECIFIC PENALTIES are amino acid specific gap penalties that reduce
-or increase the gap opening penalties at each position in the alignment or
-sequence. See the documentation for details. As an example, positions that
-are rich in glycine are more likely to have an adjacent gap than positions that
-are rich in valine.
-
-2) 3) HYDROPHILIC GAP PENALTIES are used to increase the chances of a gap within
-a run (5 or more residues) of hydrophilic amino acids; these are likely to
-be loop or random coil regions where gaps are more common. The residues that
-are "considered" to be hydrophilic are set by menu item 3.
-
-4) GAP SEPARATION DISTANCE tries to decrease the chances of gaps being too
-close to each other. Gaps that are less than this distance apart are penalised
-more than other gaps. This does not prevent close gaps; it makes them less
-frequent, promoting a block-like appearance of the alignment.
-
-5) END GAP SEPARATION treats end gaps just like internal gaps for the purposes
-of avoiding gaps that are too close (set by GAP SEPARATION DISTANCE above).
-If you turn this off, end gaps will be ignored for this purpose. This is
-useful when you wish to align fragments where the end gaps are not biologically
-meaningful.
->>HELP 5 << Help for output format options.
-
-Six output formats are offered. You can choose any (or all 6 if you wish).
-
-CLUSTAL format output is a self explanatory alignment format. It shows the
-sequences aligned in blocks. It can be read in again at a later date to
-(for example) calculate a phylogenetic tree or add a new sequence with a
-profile alignment.
-
-GCG output can be used by any of the GCG programs that can work on multiple
-alignments (e.g. PRETTY, PROFILEMAKE, PLOTALIGN). It is the same as the GCG
-.msf format files (multiple sequence file); new in version 7 of GCG.
-
-PHYLIP format output can be used for input to the PHYLIP package of Joe
-Felsenstein. This is an extremely widely used package for doing every
-imaginable form of phylogenetic analysis (MUCH more than the modest intro-
-duction offered by this program).
-
-NBRF-PIR: this is the same as the standard PIR format with ONE ADDITION. Gap
-characters "-" are used to indicate the positions of gaps in the multiple
-alignment. These files can be re-used as input in any part of clustal that
-allows sequences (or alignments or profiles) to be read in.
-
-GDE: this is the flat file format used by the GDE package of Steven Smith.
-
-NEXUS: the format used by several phylogeny programs, including PAUP and
-MacClade.
-
-GDE OUTPUT CASE: sequences in GDE format may be written in either upper or
-lower case.
-
-CLUSTALW SEQUENCE NUMBERS: residue numbers may be added to the end of the
-alignment lines in clustalw format.
-
-OUTPUT ORDER is used to control the order of the sequences in the output
-alignments. By default, the order corresponds to the order in which the
-sequences were aligned (from the guide tree-dendrogram), thus automatically
-grouping closely related sequences. This switch can be used to set the order
-to the same as the input file.
-
-PARAMETER OUTPUT: This option allows you to save all your parameter settings
-in a parameter file. This file can be used subsequently to rerun Clustal W
-using the same parameters.
-
->>HELP 6 << Help for profile and structure alignments
-
-By PROFILE ALIGNMENT, we mean alignment using existing alignments. Profile
-alignments allow you to store alignments of your favourite sequences and add
-new sequences to them in small bunches at a time. A profile is simply an
-alignment of one or more sequences (e.g. an alignment output file from CLUSTAL
-W). Each input can be a single sequence. One or both sets of input sequences
-may include secondary structure assignments or gap penalty masks to guide the
-alignment.
-
-The profiles can be in any of the allowed input formats with "-" characters
-used to specify gaps (except for MSF-RSF where "." is used).
-
-You have to specify the 2 profiles by choosing menu items 1 and 2 and giving
-2 file names. Then Menu item 3 will align the 2 profiles to each other.
-Secondary structure masks in either profile can be used to guide the alignment.
-
-Menu item 4 will take the sequences in the second profile and align them to
-the first profile, 1 at a time. This is useful to add some new sequences to
-an existing alignment, or to align a set of sequences to a known structure.
-In this case, the second profile would not be pre-aligned.
-
-
-The alignment parameters can be set using menu items 5, 6 and 7. These are
-EXACTLY the same parameters as used by the general, automatic multiple
-alignment procedure. The general multiple alignment procedure is simply a
-series of profile alignments. Carrying out a series of profile alignments on
-larger and larger groups of sequences, allows you to manually build up a
-complete alignment, if necessary editing intermediate alignments.
-
-SECONDARY STRUCTURE OPTIONS. Menu Option 0 allows you to set 2D structure
-parameters. If a solved structure is available, it can be used to guide the
-alignment by raising gap penalties within secondary structure elements, so
-that gaps will preferentially be inserted into unstructured surface loops.
-Alternatively, a user-specified gap penalty mask can be supplied directly.
-
-A gap penalty mask is a series of numbers between 1 and 9, one per position in
-the alignment. Each number specifies how much the gap opening penalty is to be
-raised at that position (raised by multiplying the basic gap opening penalty
-by the number) i.e. a mask figure of 1 at a position means no change
-in gap opening penalty; a figure of 4 means that the gap opening penalty is
-four times greater at that position, making gaps 4 times harder to open.
-
-The format for gap penalty masks and secondary structure masks is explained
-in the help under option 0 (secondary structure options).
->>HELP B << Help for secondary structure - gap penalty masks
-
-The use of secondary structure-based penalties has been shown to improve the
-accuracy of multiple alignment. Therefore CLUSTAL W now allows gap penalty
-masks to be supplied with the input sequences. The masks work by raising gap
-penalties in specified regions (typically secondary structure elements) so that
-gaps are preferentially opened in the less well conserved regions (typically
-surface loops).
-
-Options 1 and 2 control whether the input secondary structure information or
-gap penalty masks will be used.
-
-Option 3 controls whether the secondary structure and gap penalty masks should
-be included in the output alignment.
-
-Options 4 and 5 provide the value for raising the gap penalty at core Alpha
-Helical (A) and Beta Strand (B) residues. In CLUSTAL format, capital residues
-denote the A and B core structure notation. The basic gap penalties are
-multiplied by the amount specified.
-
-Option 6 provides the value for the gap penalty in Loops. By default this
-penalty is not raised. In CLUSTAL format, loops are specified by "." in the
-secondary structure notation.
-
-Option 7 provides the value for setting the gap penalty at the ends of
-secondary structures. Ends of secondary structures are observed to grow
-and-or shrink in related structures. Therefore by default these are given
-intermediate values, lower than the core penalties. All secondary structure
-read in as lower case in CLUSTAL format gets the reduced terminal penalty.
-
-Options 8 and 9 specify the range of structure termini for the intermediate
-penalties. In the alignment output, these are indicated as lower case.
-For Alpha Helices, by default, the range spans the end helical turn. For
-Beta Strands, the default range spans the end residue and the adjacent loop
-residue, since sequence conservation often extends beyond the actual H-bonded
-Beta Strand.
-
-CLUSTAL W can read the masks from SWISS-PROT, CLUSTAL or GDE format input
-files. For many 3-D protein structures, secondary structure information is
-recorded in the feature tables of SWISS-PROT database entries. You should
-always check that the assignments are correct - some are quite inaccurate.
-CLUSTAL W looks for SWISS-PROT HELIX and STRAND assignments e.g.
-
-FT HELIX 100 115
-FT STRAND 118 119
-
-The structure and penalty masks can also be read from CLUSTAL alignment format
-as comment lines beginning "!SS_" or "!GM_" e.g.
-
-!SS_HBA_HUMA ..aaaAAAAAAAAAAaaa.aaaAAAAAAAAAAaaaaaaAaaa.........aaaAAAAAA
-!GM_HBA_HUMA 112224444444444222122244444444442222224222111111111222444444
-HBA_HUMA VLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHGK
-
-Note that the mask itself is a set of numbers between 1 and 9 each of which is
-assigned to the residue(s) in the same column below.
-
-In GDE flat file format, the masks are specified as text and the names must
-begin with "SS_ or "GM_.
-
-Either a structure or penalty mask or both may be used. If both are included in
-an alignment, the user will be asked which is to be used.
-
->>HELP C << Help for secondary structure - gap penalty mask output options
-
- The options in this menu let you choose whether or not to include the masks
-in the CLUSTAL W output alignments. Showing both is useful for understanding
-how the masks work. The secondary structure information is itself very useful
-in judging the alignment quality and in seeing how residue conservation
-patterns vary with secondary structure.
-
-
->>HELP 7 << Help for phylogenetic trees
-
-1) Before calculating a tree, you must have an ALIGNMENT in memory. This can be
-input in any format or you should have just carried out a full multiple
-alignment and the alignment is still in memory.
-
-
-*************** Remember YOU MUST ALIGN THE SEQUENCES FIRST!!!! ***************
-
-
-The method used is the NJ (Neighbour Joining) method of Saitou and Nei. First
-you calculate distances (percent divergence) between all pairs of sequences from
-a multiple alignment; second you apply the NJ method to the distance matrix.
-
-2) EXCLUDE POSITIONS WITH GAPS? With this option, any alignment positions where
-ANY of the sequences have a gap will be ignored. This means that 'like' will be
-compared to 'like' in all distances, which is highly desirable. It also
-automatically throws away the most ambiguous parts of the alignment, which are
-concentrated around gaps (usually). The disadvantage is that you may throw away
-much of the data if there are many gaps (which is why it is difficult for us to
-make it the default).
-
-
-
-3) CORRECT FOR MULTIPLE SUBSTITUTIONS? For small divergence (say <10%) this
-option makes no difference. For greater divergence, it corrects for the fact
-that observed distances underestimate actual evolutionary distances. This is
-because, as sequences diverge, more than one substitution will happen at many
-sites. However, you only see one difference when you look at the present day
-sequences. Therefore, this option has the effect of stretching branch lengths
-in trees (especially long branches). The corrections used here (for DNA or
-proteins) are both due to Motoo Kimura. See the documentation for details.
-
-Where possible, this option should be used. However, for VERY divergent
-sequences, the distances cannot be reliably corrected. You will be warned if
-this happens. Even if none of the distances in a data set exceed the reliable
-threshold, if you bootstrap the data, some of the bootstrap distances may
-randomly exceed the safe limit.
-
-4) To calculate a tree, use option 4 (DRAW TREE NOW). This gives an UNROOTED
-tree and all branch lengths. The root of the tree can only be inferred by
-using an outgroup (a sequence that you are certain branches at the outside
-of the tree .... certain on biological grounds) OR if you assume a degree
-of constancy in the 'molecular clock', you can place the root in the 'middle'
-of the tree (roughly equidistant from all tips).
-
-5) TOGGLE PHYLIP BOOTSTRAP POSITIONS
-By default, the bootstrap values are correctly placed on the tree branches of
-the phylip format output tree. The toggle allows them to be placed on the
-nodes, which is incorrect, but some display packages (e.g. TreeTool, TreeView
-and Phylowin) only support node labelling but not branch labelling. Care
-should be taken to note which branches and labels go together.
-
-6) OUTPUT FORMATS: four different formats are allowed. None of these displays
-the tree visually. Useful display programs accepting PHYLIP format include
-NJplot (from Manolo Gouy and supplied with Clustal W), TreeView (Mac-PC), and
-PHYLIP itself - OR get the PHYLIP package and use the tree drawing facilities
-there. (Get the PHYLIP package anyway if you are interested in trees). The
-NEXUS format can be read into PAUP or MacClade.
-
->>HELP 8 << Help for choosing a weight matrix
-
-For protein alignments, you use a weight matrix to determine the similarity of
-non-identical amino acids. For example, Tyr aligned with Phe is usually judged
-to be 'better' than Tyr aligned with Pro.
-
-There are three 'in-built' series of weight matrices offered. Each consists of
-several matrices which work differently at different evolutionary distances. To
-see the exact details, read the documentation. Crudely, we store several
-matrices in memory, spanning the full range of amino acid distance (from almost
-identical sequences to highly divergent ones). For very similar sequences, it
-is best to use a strict weight matrix which only gives a high score to
-identities and the most favoured conservative substitutions. For more divergent
-sequences, it is appropriate to use "softer" matrices which give a high score
-to many other frequent substitutions.
-
-1) BLOSUM (Henikoff). These matrices appear to be the best available for
-carrying out database similarity (homology searches). The matrices used are:
-Blosum 80, 62, 45 and 30. (BLOSUM was the default in earlier Clustal W
-versions)
-
-2) PAM (Dayhoff). These have been extremely widely used since the late '70s.
-We use the PAM 20, 60, 120 and 350 matrices.
-
-3) GONNET. These matrices were derived using almost the same procedure as the
-Dayhoff one (above) but are much more up to date and are based on a far larger
-data set. They appear to be more sensitive than the Dayhoff series. We use the
-GONNET 80, 120, 160, 250 and 350 matrices. This series is the default for
-Clustal W version 1.8.
-
-We also supply an identity matrix which gives a score of 1.0 to two identical
-amino acids and a score of zero otherwise. This matrix is not very useful.
-Alternatively, you can read in your own (just one matrix, not a series).
-
-A new matrix can be read from a file on disk, if the filename consists only
-of lower case characters. The values in the new weight matrix must be integers
-and the scores should be similarities. You can use negative as well as positive
-values if you wish, although the matrix will be automatically adjusted to all
-positive scores.
-
-
-
-For DNA, a single matrix (not a series) is used. Two hard-coded matrices are
-available:
-
-
-1) IUB. This is the default scoring matrix used by BESTFIT for the comparison
-of nucleic acid sequences. X's and N's are treated as matches to any IUB
-ambiguity symbol. All matches score 1.9; all mismatches for IUB symbols score 0.
-
-
-2) CLUSTALW(1.6). The previous system used by Clustal W, in which matches score
-1.0 and mismatches score 0. All matches for IUB symbols also score 0.
-
-INPUT FORMAT The format used for a new matrix is the same as the BLAST program.
-Any lines beginning with a # character are assumed to be comments. The first
-non-comment line should contain a list of amino acids in any order, using the
-1 letter code, followed by a * character. This should be followed by a square
-matrix of integer scores, with one row and one column for each amino acid. The
-last row and column of the matrix (corresponding to the * character) contain
-the minimum score over the whole matrix.
-
->>HELP 9 << Help for command line parameters
- DATA (sequences)
-
--INFILE=file.ext :input sequences.
--PROFILE1=file.ext and -PROFILE2=file.ext :profiles (old alignment).
-
-
- VERBS (do things)
-
--OPTIONS :list the command line parameters
--HELP or -CHECK :outline the command line params.
--ALIGN :do full multiple alignment
--TREE :calculate NJ tree.
--BOOTSTRAP(=n) :bootstrap a NJ tree (n= number of bootstraps; def. = 1000).
--CONVERT :output the input sequences in a different file format.
-
-
- PARAMETERS (set things)
-
-***General settings:****
--INTERACTIVE :read command line, then enter normal interactive menus
--QUICKTREE :use FAST algorithm for the alignment guide tree
--TYPE= :PROTEIN or DNA sequences
--NEGATIVE :protein alignment with negative values in matrix
--OUTFILE= :sequence alignment file name
--OUTPUT= :GCG, GDE, PHYLIP, PIR or NEXUS
--OUTORDER= :INPUT or ALIGNED
--CASE :LOWER or UPPER (for GDE output only)
--SEQNOS= :OFF or ON (for Clustal output only)
--SEQNO_RANGE=:OFF or ON (NEW: for all output formats)
--RANGE=m,n :sequence range to write starting m to m+n.
-
-***Fast Pairwise Alignments:***
--KTUPLE=n :word size
--TOPDIAGS=n :number of best diags.
--WINDOW=n :window around best diags.
--PAIRGAP=n :gap penalty
--SCORE :PERCENT or ABSOLUTE
-
-
-***Slow Pairwise Alignments:***
--PWMATRIX= :Protein weight matrix=BLOSUM, PAM, GONNET, ID or filename
--PWDNAMATRIX= :DNA weight matrix=IUB, CLUSTALW or filename
--PWGAPOPEN=f :gap opening penalty
--PWGAPEXT=f :gap extension penalty
-
-
-***Multiple Alignments:***
--NEWTREE= :file for new guide tree
--USETREE= :file for old guide tree
--MATRIX= :Protein weight matrix=BLOSUM, PAM, GONNET, ID or filename
--DNAMATRIX= :DNA weight matrix=IUB, CLUSTALW or filename
--GAPOPEN=f :gap opening penalty
--GAPEXT=f :gap extension penalty
--ENDGAPS :no end gap separation pen.
--GAPDIST=n :gap separation pen. range
--NOPGAP :residue-specific gaps off
--NOHGAP :hydrophilic gaps off
--HGAPRESIDUES= :list hydrophilic res.
--MAXDIV=n :% ident. for delay
--TYPE= :PROTEIN or DNA
--TRANSWEIGHT=f :transitions weighting
-
-
-***Profile Alignments:***
--PROFILE :Merge two alignments by profile alignment
--NEWTREE1= :file for new guide tree for profile1
--NEWTREE2= :file for new guide tree for profile2
--USETREE1= :file for old guide tree for profile1
--USETREE2= :file for old guide tree for profile2
-
-
-***Sequence to Profile Alignments:***
--SEQUENCES :Sequentially add profile2 sequences to profile1 alignment
--NEWTREE= :file for new guide tree
--USETREE= :file for old guide tree
-
-
-***Structure Alignments:***
--NOSECSTR1 :do not use secondary structure-gap penalty mask for profile 1
--NOSECSTR2 :do not use secondary structure-gap penalty mask for profile 2
--SECSTROUT=STRUCTURE or MASK or BOTH or NONE :output in alignment file
--HELIXGAP=n :gap penalty for helix core residues
--STRANDGAP=n :gap penalty for strand core residues
--LOOPGAP=n :gap penalty for loop regions
--TERMINALGAP=n :gap penalty for structure termini
--HELIXENDIN=n :number of residues inside helix to be treated as terminal
--HELIXENDOUT=n :number of residues outside helix to be treated as terminal
--STRANDENDIN=n :number of residues inside strand to be treated as terminal
--STRANDENDOUT=n:number of residues outside strand to be treated as terminal
-
-
-***Trees:***
--OUTPUTTREE=nj OR phylip OR dist OR nexus
--SEED=n :seed number for bootstraps.
--KIMURA :use Kimura's correction.
--TOSSGAPS :ignore positions with gaps.
--BOOTLABELS=node OR branch :position of bootstrap values in tree display
-
->>HELP 0 << Help for tree output format options
-
-Four output formats are offered: 1) Clustal, 2) Phylip, 3) Just the distances
-4) Nexus
-
-None of these formats displays the results graphically. Many packages can
-display trees in the PHYLIP format 2) below. It can also be imported into
-the PHYLIP programs RETREE, DRAWTREE and DRAWGRAM for graphical display.
-NEXUS format trees can be read by PAUP and MacClade.
-
-1) Clustal format output.
-This format is verbose and lists all of the distances between the sequences and
-the number of alignment positions used for each. The tree is described at the
-end of the file. It lists the sequences that are joined at each alignment step
-and the branch lengths. After two sequences are joined, it is referred to later
-as a NODE. The number of a NODE is the number of the lowest sequence in that
-NODE.
-
-2) Phylip format output.
-This format is the New Hampshire format, used by many phylogenetic analysis
-packages. It consists of a series of nested parentheses, describing the
-branching order, with the sequence names and branch lengths. It can be used by
-the RETREE, DRAWGRAM and DRAWTREE programs of the PHYLIP package to see the
-trees graphically. This is the same format used during multiple alignment for
-the guide trees.
-
-Use this format with NJplot (Manolo Gouy), supplied with Clustal W. Some other
-packages that can read and display New Hampshire format are TreeView (Mac/PC),
-TreeTool (UNIX), and Phylowin.
-
-3) The distances only.
-This format just outputs a matrix of all the pairwise distances in a format
-that can be used by the Phylip package. It used to be useful when one could not
-produce distances from protein sequences in the Phylip package but is now
-redundant (Protdist of Phylip 3.5 now does this).
-
-4) NEXUS FORMAT TREE. This format is used by several popular phylogeny programs,
-including PAUP and MacClade. The format is described fully in:
-Maddison, D. R., D. L. Swofford and W. P. Maddison. 1997.
-NEXUS: an extensible file format for systematic information.
-Systematic Biology 46:590-621.
-
-5) TOGGLE PHYLIP BOOTSTRAP POSITIONS
-By default, the bootstrap values are placed on the nodes of the phylip format
-output tree. This is inaccurate as the bootstrap values should be associated
-with the tree branches and not the nodes. However, this format can be read and
-displayed by TreeTool, TreeView and Phylowin. An option is available to
-correctly place the bootstrap values on the branches with which they are
-associated.
-
Deleted: trunk/packages/clustalw/trunk/clustalx.c
===================================================================
--- trunk/packages/clustalw/trunk/clustalx.c 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/clustalx.c 2007-08-12 15:08:40 UTC (rev 399)
@@ -1,129 +0,0 @@
-#include <string.h>
-#include <stdlib.h>
-#include <vibrant.h>
-
-#include "clustalw.h"
-
-/*
-* Prototypes
-*/
-
-extern void *ckalloc(size_t);
-extern void init_interface(void);
-extern void init_matrix(void);
-extern void fill_chartab(void);
-extern void parse_params(Boolean);
-extern void x_menu(void);
-
-/*
-* Global variables
-*/
-
-double **tmat;
-
-char revision_level[] = "X (1.83)"; /* JULIE feb 2001*/
-Boolean interactive=TRUE;
-#ifdef MSDOS
- char *help_file_name = "clustalx.hlp";
-#else
- char *help_file_name = "/usr/share/clustalw/clustalx_help";
-#endif
-
-sint max_names; /* maximum length of names in current alignment file */
-
-float gap_open, gap_extend;
-float pw_go_penalty, pw_ge_penalty;
-
-FILE *tree;
-FILE *clustal_outfile, *gcg_outfile, *nbrf_outfile, *phylip_outfile,
- *gde_outfile, *nexus_outfile;
-FILE *fasta_outfile; /* Ramu */
-sint *seqlen_array;
-sint max_aln_length;
-short usermat[NUMRES][NUMRES], pw_usermat[NUMRES][NUMRES];
-short score_matrix[NUMRES][NUMRES],score_dnamatrix[NUMRES][NUMRES];
-short segment_matrix[NUMRES][NUMRES],segment_dnamatrix[NUMRES][NUMRES];
-short def_aa_xref[NUMRES+1], aa_xref[NUMRES+1], pw_aa_xref[NUMRES+1];
-short userdnamat[NUMRES][NUMRES], pw_userdnamat[NUMRES][NUMRES];
-short def_dna_xref[NUMRES+1], dna_xref[NUMRES+1], pw_dna_xref[NUMRES+1];
-short score_aa_xref[NUMRES+1],score_dna_xref[NUMRES+1];
-short segment_aa_xref[NUMRES+1],segment_dna_xref[NUMRES+1];
-sint nseqs;
-sint nsets;
-sint *output_index;
-sint **sets;
-sint *seq_weight;
-sint max_aa;
-sint gap_pos1;
-sint gap_pos2;
-sint mat_avscore;
-sint profile_no;
-
-Boolean usemenu=FALSE;
-Boolean dnaflag;
-Boolean distance_tree;
-
-char **seq_array;
-char **names,**titles;
-char **args;
-char seqname[FILENAMELEN+1];
-
-char *gap_penalty_mask1 = NULL, *gap_penalty_mask2 = NULL;
-char *sec_struct_mask1 = NULL, *sec_struct_mask2 = NULL;
-sint struct_penalties;
-char *ss_name1 = NULL, *ss_name2 = NULL;
-
-Boolean user_series = FALSE;
-UserMatSeries matseries;
-short usermatseries[MAXMAT][NUMRES][NUMRES];
-short aa_xrefseries[MAXMAT][NUMRES+1];
-
-
-extern Int2 Main(void)
-
-{
- int i;
-
-#ifndef WIN_MAC
-#ifdef GetArgc
- int argc;
- char **argv;
-
- argc=GetArgc();
- argv=GetArgv();
-#else
- extern int argc;
- extern char **argv;
-#endif
-#endif
-
- init_interface();
- init_matrix();
-
- fill_chartab();
-
-#ifndef WIN_MAC
- if(argc>1) {
- args = (char **)ckalloc(argc * sizeof(char *));
-
- for(i=1;i<argc;++i)
- {
- args[i-1]=(char *)ckalloc((strlen(argv[i])+1) * sizeof(char));
- strcpy(args[i-1],argv[i]);
- }
- usemenu=FALSE;
- parse_params(TRUE);
-
- for(i=0;i<argc-1;i++)
- ckfree(args[i]);
- ckfree(args);
-
- }
-#endif
- interactive=TRUE;
- x_menu();
-
- return 0;
- /* exit(0); */
-}
-
Deleted: trunk/packages/clustalw/trunk/clustalx.html
===================================================================
--- trunk/packages/clustalw/trunk/clustalx.html 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/clustalx.html 2007-08-12 15:08:40 UTC (rev 399)
@@ -1,4224 +0,0 @@
-<HEAD>
-<TITLE>ClustalX Help</TITLE>
-</HEAD>
-<BODY BGCOLOR=white>
-<CENTER><H1>ClustalX Help</H1></CENTER>
-<P>
-You can get the latest version of the ClustalX program here:
-</P>
-<DL><DD>
-<A HREF="ftp://ftp-igbmc.u-strasbg.fr/pub/ClustalX/">
-ftp://ftp-igbmc.u-strasbg.fr/pub/ClustalX/</A>
-</DL>
-<P>For full details of usage and algorithms, please read the <A HREF="clustalw.doc"><EM>ClustalW.Doc</EM></A> file.</P>
-<PRE><EM>
-Toby Gibson EMBL, Heidelberg, Germany.
-Des Higgins UCC, Cork, Ireland.
-Julie Thompson/Francois Jeanmougin IGBMC, Strasbourg, France.
-</EM></PRE>
-<CENTER><H2><A NAME="Index">Index</A></H2></CENTER>
-<OL>
-<LI><A HREF="#G"> General help for CLUSTAL X (1.8)
-</A></LI>
-<LI><A HREF="#F"> Input / Output Files
-</A></LI>
-<LI><A HREF="#E"> Editing Alignments
-</A></LI>
-<LI><A HREF="#M"> Multiple Alignments
-</A></LI>
-<LI><A HREF="#P"> Profile and Structure Alignments
-</A></LI>
-<LI><A HREF="#B"> Secondary Structure / Gap Penalty Masks
-</A></LI>
-<LI><A HREF="#T"> Phylogenetic Trees
-</A></LI>
-<LI><A HREF="#C"> Colors
-</A></LI>
-<LI><A HREF="#Q"> Alignment Quality Analysis
-</A></LI>
-<LI><A HREF="#9"> Command Line Parameters
-</A></LI>
-<LI><A HREF="#R"> References
-</A></LI>
-</OL>
-<CENTER><H2><A NAME="G"> General help for CLUSTAL X (1.8)
-</A></H2></CENTER>
-<P>
-</P>
-<P>
-Clustal X is a windows interface for the ClustalW multiple sequence alignment
-program. It provides an integrated environment for performing multiple sequence
-and profile alignments and analysing the results. The sequence alignment is
-displayed in a window on the screen. A versatile coloring scheme has been
-incorporated allowing you to highlight conserved features in the alignment.
-The pull-down menus at the top of the window allow you to select all the
-options required for traditional multiple sequence and profile alignment.
-</P>
-<P>
-You can cut-and-paste sequences to change the order of the alignment; you can
-select a subset of sequences to be aligned; you can select a sub-range of the
-alignment to be realigned and inserted back into the original alignment.
-</P>
-<P>
-Alignment quality analysis can be performed and low-scoring segments or
-exceptional residues can be highlighted.
-</P>
-<P>
-ClustalX is available for a number of different platforms including: SUN
-Solaris, IRIX5.3 on Silicon Graphics, Digital UNIX on DECStations, Microsoft
-Windows (32 bit) for PC's, Linux ELF for x86 PC's and Macintosh PowerMac. (See
-the README file for Installation instructions.)
-</P>
-<P>
-</P>
-<P>
-<H4>
-SEQUENCE INPUT
-</H4>
-</P>
-<P>
-Sequences and profiles (a term for pre-existing alignments) are input using
-the FILE menu. Invalid options will be disabled. All sequences must be included
-into 1 file. 7 formats are automatically recognised: NBRF/PIR, EMBL/SWISSPROT,
-Pearson (Fasta), Clustal (*.aln), GCG/MSF (Pileup), GCG9 RSF and GDE flat file.
-All non-alphabetic characters (spaces, digits, punctuation marks) are ignored
-except "-" which is used to indicate a GAP ("." in MSF/RSF).
-</P>
-<P>
-<H4>
-SEQUENCE / PROFILE ALIGNMENTS
-</H4>
-</P>
-<P>
-Clustal X has two modes which can be selected using the switch directly above
-the sequence display: MULTIPLE ALIGNMENT MODE and PROFILE ALIGNMENT MODE.
-</P>
-<P>
-To do a MULTIPLE ALIGNMENT on a set of sequences, make sure MULTIPLE ALIGNMENT
-MODE is selected. A single sequence data area is then displayed. The ALIGNMENT
-menu then allows you to either produce a guide tree for the alignment, or to do
-a multiple alignment following the guide tree, or to do a full multiple
-alignment.
-</P>
-<P>
-In PROFILE ALIGNMENT MODE, two sequence data areas are displayed, allowing you
-to align 2 alignments (termed profiles). Profiles are also used to add a new
-sequence to an old alignment, or to use secondary structure to guide the
-alignment process. GAPS in the old alignments are indicated using the "-"
-character. PROFILES can be input in ANY of the allowed formats; just use "-"
-(or "." for MSF/RSF) for each gap position. In Profile Alignment Mode, a button
-"Lock Scroll" is displayed which allows you to scroll the two profiles together
-using a single scroll bar. When the Lock Scroll is turned off, the two profiles
-can be scrolled independently.
-</P>
-<P>
-<H4>
-PHYLOGENETIC TREES
-</H4>
-</P>
-<P>
-Phylogenetic trees can be calculated from old alignments (read in with "-"
-characters to indicate gaps) OR after a multiple alignment while the alignment
-is still displayed.
-</P>
-<P>
-<H4>
-ALIGNMENT DISPLAY
-</H4>
-</P>
-<P>
-The alignment is displayed on the screen with the sequence names on the left
-hand side. The sequence alignment is for display only, it cannot be edited here
-(except for changing the sequence order by cutting-and-pasting on the sequence
-names).
-</P>
-<P>
-A ruler is displayed below the sequences, starting at 1 for the first residue
-position (residue numbers in the sequence input file are ignored).
-</P>
-<P>
-A line above the alignment is used to mark strongly conserved positions. Three
-characters ('*', ':' and '.') are used:
-</P>
-<P>
-'*' indicates positions which have a single, fully conserved residue
-</P>
-<P>
-':' indicates that one of the following 'strong' groups is fully conserved:-
-<PRE>
- STA
- NEQK
- NHQK
- NDEQ
- QHRK
- MILV
- MILF
- HY
- FYW
-</PRE>
-</P>
-<P>
-'.' indicates that one of the following 'weaker' groups is fully conserved:-
-<PRE>
- CSA
- ATV
- SAG
- STNK
- STPA
- SGND
- SNDEQK
- NDEQHK
- NEQHRK
- FVLIM
- HFY
-</PRE>
-</P>
-<P>
-These are all the positively scoring groups that occur in the Gonnet Pam250
-matrix. The strong and weak groups are defined as strong score >0.5 and weak
-score =<0.5 respectively.
-</P>
-<P>
-For profile alignments, secondary structure and gap penalty masks are displayed
-above the sequences, if any data is found in the profile input file.
-</P>
-<P>
-</P>
-<P>
-</P>
-<A HREF="#Index"> <EM>Back to Index</EM> </A>
-<CENTER><H2><A NAME="F"> Input / Output Files
-</A></H2></CENTER>
-<P>
-</P>
-<P>
-LOAD SEQUENCES reads sequences from one of 7 file formats, replacing any
-sequences that are already loaded. All sequences must be in 1 file. The formats
-that are automatically recognised are: NBRF/PIR, EMBL/SWISSPROT, Pearson
-(Fasta), Clustal (*.aln), GCG/MSF (Pileup), GCG9/RSF and GDE flat file. All
-non-alphabetic characters (spaces, digits, punctuation marks) are ignored
-except "-" which is used to indicate a GAP ("." in MSF/RSF).
-</P>
-<P>
-The program tries to automatically recognise the different file formats used
-and to guess whether the sequences are amino acid or nucleotide. This is not
-always foolproof.
-</P>
-<P>
-FASTA and NBRF/PIR formats are recognised by having a ">" as the first
-character in the file.
-</P>
-<P>
-EMBL/Swiss Prot formats are recognised by the letters "ID" at the start of the
-file (the token for the entry name field).
-</P>
-<P>
-CLUSTAL format is recognised by the word CLUSTAL at the beginning of the file.
-</P>
-<P>
-GCG/MSF format is recognised by one of the following:
-<UL>
-<LI>
- - the word PileUp at the start of the file.
-</LI><LI>
- - the word !!AA_MULTIPLE_ALIGNMENT or !!NA_MULTIPLE_ALIGNMENT
- at the start of the file.
-</LI><LI>
- - the word MSF on the first line of the file, and the characters ..
- at the end of this line.
-</LI>
-</UL>
-</P>
-<P>
-GCG/RSF format is recognised by the word !!RICH_SEQUENCE at the beginning of
-the file.
-</P>
-<P>
-</P>
-<P>
-If 85% or more of the characters in the sequence are from A,C,G,T,U or N, the
-sequence will be assumed to be nucleotide. This works in 97.3% of cases but
-watch out!
-</P>
-<P>
-APPEND SEQUENCES is only valid in MULTIPLE ALIGNMENT MODE. The input sequences
-do not replace those already loaded, but are appended at the end of the
-alignment.
-</P>
-<P>
-SAVE SEQUENCES AS... offers the user a choice of one of six output formats:
-CLUSTAL, NBRF/PIR, GCG/MSF, PHYLIP, NEXUS or GDE. All sequences are written
-to a single file. Options are available to save a range of the alignment,
-switch between UPPER/LOWER case for GDE files, and to output SEQUENCE NUMBERING
-for CLUSTAL files.
-</P>
-<P>
-LOAD PROFILE 1 reads sequences in the same 7 file formats, replacing any
-sequences already loaded as Profile 1. This option will also remove any
-sequences which are loaded in Profile 2.
-</P>
-<P>
-LOAD PROFILE 2 reads sequences in the same 7 file formats, replacing any
-sequences already loaded as Profile 2.
-</P>
-<P>
-SAVE PROFILE 1 AS... is similar to the Save Sequences option except that only
-those sequences in Profile 1 will be written to the output file.
-</P>
-<P>
-SAVE PROFILE 2 AS... is similar to the Save Sequences option except that only
-those sequences in Profile 2 will be written to the output file.
-</P>
-<P>
-WRITE ALIGNMENT AS POSTSCRIPT will write the sequence display to a postscript
-format file. This will include any secondary structure / gap penalty mask
-information and the consensus and ruler lines which are displayed on the
-screen. The Alignment Quality curve can be optionally included in the output
-file.
-</P>
-<P>
-WRITE PROFILE 1 AS POSTSCRIPT is similar to WRITE ALIGNMENT AS POSTSCRIPT
-except that only the profile 1 display will be printed.
-</P>
-<P>
-WRITE PROFILE 2 AS POSTSCRIPT is similar to WRITE ALIGNMENT AS POSTSCRIPT
-except that only the profile 2 display will be printed.
-</P>
-<P>
-</P>
-<P>
-<H4>
-POSTSCRIPT PARAMETERS
-</H4>
-</P>
-<P>
-A number of options are available to allow you to configure your postscript
-output file.
-</P>
-<P>
-PS COLORS FILE:
-</P>
-<P>
-The exact RGB values required to reproduce the colors used in the alignment
-window will vary from printer to printer. A PS colors file can be specified
-that contains the RGB values for all the colors required by each of your
-postscript printers.
-</P>
-<P>
-By default, Clustal X looks for a file called 'colprint.par' in the current
-directory (if you are running under UNIX, it then looks in your home directory,
-and finally in the directories in your PATH environment variable). If no PS
-colors file is found or a color used on the screen is not defined here, the
-screen RGB values (from the Color Parameter File) are used.
-</P>
-<P>
-The PS colors file consists of one line for each color to be defined, with the
-color name followed by the RGB values (on a scale of 0 to 1). For example,
-</P>
-<P>
-RED 0.9 0.1 0.1
-</P>
-<P>
-Blank lines and comments (lines beginning with a '#' character) are ignored.
-</P>
-<P>
-</P>
-<P>
-PAGE SIZE: The alignment can be displayed on either A4, A3 or US Letter size
-pages.
-</P>
-<P>
-ORIENTATION: The alignment can be displayed on either a landscape or portrait
-page.
-</P>
-<P>
-PRINT HEADER: An optional header including the postscript filename, and
-creation date can be printed at the top of each page.
-</P>
-<P>
-PRINT QUALITY CURVE: The Alignment Quality curve which is displayed underneath
-the alignment on the screen can be included in the postscript output.
-</P>
-<P>
-PRINT RULER: The ruler which is displayed underneath the alignment on the
-screen can be included in the postscript output.
-</P>
-<P>
-PRINT RESIDUE NUMBERS: Sequence residue numbers can be printed at the right
-hand side of the alignment.
-</P>
-<P>
-RESIZE TO FIT PAGE: By default, the alignment is scaled to fit the page size
-selected. This option can be turned off, in which case a font size of 10 will
-be used for the sequences.
-</P>
-<P>
-PRINT FROM POSITION/TO: A range of the alignment can be printed. The default
-is to print the full alignment. The first and last residues to be printed are
-specified here.
-</P>
-<P>
-USE BLOCK LENGTH: The alignment can be divided into blocks of residues. The
-number of residues in a block is specified here. More than one block may then
-be printed on a single page. This is useful for long alignments of a small
-number of sequences. If the block length is set to 0, the alignment will not
-be divided into blocks, but printed across a number of pages.
-</P>
-<P>
-</P>
-<A HREF="#Index"> <EM>Back to Index</EM> </A>
-<CENTER><H2><A NAME="E"> Editing Alignments
-</A></H2></CENTER>
-<P>
-</P>
-<P>
-Clustal X allows you to change the order of the sequences in the alignment, by
-cutting-and-pasting the sequence names.
-</P>
-<P>
-To select a group of sequences to be moved, click on a sequence name and drag
-the cursor until all the required sequences are highlighted. Holding down the
-Shift key when clicking on the first name will add new sequences to those
-already selected.
-</P>
-<P>
-(Options are provided to Select All Sequences, Select Profile 1 or Select
-Profile 2.)
-</P>
-<P>
-The selected sequences can be removed from the alignment by using the EDIT
-menu, CUT option.
-</P>
-<P>
-To add the cut sequences back into an alignment, select a sequence by clicking
-on the sequence name. The cut sequences will be added to the alignment,
-immediately following the selected sequence, by the EDIT menu, PASTE option.
-</P>
-<P>
-To add the cut sequences to an empty alignment (eg. when cutting sequences from
-Profile 1 and pasting them to Profile 2), click on the empty sequence name
-display area, and select the EDIT menu, PASTE option as before.
-</P>
-<P>
-The sequence selection and sequence range selection can be cleared using the
-EDIT menu, CLEAR SEQUENCE SELECTION and CLEAR RANGE SELECTION options
-respectively.
-</P>
-<P>
-To search for a string of residues in the sequences, select the sequences to be
-searched by clicking on the sequence names. You can then enter the string to
-search for by selecting the SEARCH FOR STRING option. If the string is found in
-any of the sequences selected, the sequence name and column number are printed
-below the sequence display.
-</P>
-<P>
-In PROFILE ALIGNMENT MODE, the two profiles can be merged (normally done after
-alignment) by selecting ADD PROFILE 2 TO PROFILE 1. The sequences currently
-displayed as Profile 2 will be appended to Profile 1.
-</P>
-<P>
-The REMOVE ALL GAPS option will remove all gaps from the sequences currently
-selected.
-WARNING: This option removes ALL gaps, not only those introduced by ClustalX,
-but also those that were read from the input alignment file. Any secondary
-structure information associated with the alignment will NOT be automatically
-realigned.
-</P>
-<P>
-The REMOVE GAP-ONLY COLUMNS will remove those positions in the alignment which
-contain gaps in all sequences. This can occur as a result of removing divergent
-sequences from an alignment, or if an alignment has been realigned.
-</P>
-<P>
-</P>
-<A HREF="#Index"> <EM>Back to Index</EM> </A>
-<CENTER><H2><A NAME="M"> Multiple Alignments
-</A></H2></CENTER>
-<P>
-</P>
-<P>
-Make sure MULTIPLE ALIGNMENT MODE is selected, using the switch directly above
-the sequence display area. Then, use the ALIGNMENT menu to do multiple
-alignments.
-</P>
-<P>
-Multiple alignments are carried out in 3 stages:
-</P>
-<P>
-1) all sequences are compared to each other (pairwise alignments);
-</P>
-<P>
-2) a dendrogram (like a phylogenetic tree) is constructed, describing the
-approximate groupings of the sequences by similarity (stored in a file).
-</P>
-<P>
-3) the final multiple alignment is carried out, using the dendrogram as a guide.
-</P>
-<P>
-The 3 stages are carried out automatically by the DO COMPLETE ALIGNMENT option.
-You can skip the first stages (pairwise alignments; guide tree) by using an old
-guide tree file (DO ALIGNMENT FROM GUIDE TREE); or you can just produce the
-guide tree with no final multiple alignment (PRODUCE GUIDE TREE ONLY).
-</P>
-<P>
-</P>
-<P>
-REALIGN SELECTED SEQUENCES is used to realign badly aligned sequences in the
-alignment. Sequences can be selected by clicking on the sequence names - see
-Editing Alignments for more details. The unselected sequences are then 'fixed'
-and a profile is made including only the unselected sequences. Each of the
-selected sequences in turn is then realigned to this profile. The realigned
-sequences will be displayed as a group at the end of the alignment.
-</P>
-<P>
-</P>
-<P>
-REALIGN SELECTED SEQUENCE RANGE is used to realign a small region of the
-alignment. A residue range can be selected by clicking on the sequence display
-area. A multiple alignment is then performed, following the 3 stages described
-above, but only using the selected residue range. Finally the new alignment of
-the range is pasted back into the full sequence alignment.
-</P>
-<P>
-By default, gap penalties are used at each end of the subrange in order to
-penalise terminal gaps. If the REALIGN SEGMENT END GAP PENALTIES option is
-switched off, gaps can be introduced at the ends of the residue range at no
-cost.
-</P>
-<P>
-</P>
-<P>
-ALIGNMENT PARAMETERS displays a sub-menu with the following options:
-</P>
-<P>
-RESET NEW GAPS BEFORE ALIGNMENT will remove any new gaps introduced into the
-sequences during multiple alignment if you wish to change the parameters and
-try again. This only takes effect just before you do a second multiple
-alignment. You can make phylogenetic trees after alignment whether or not this
-is ON. If you turn this OFF, the new gaps are kept even if you do a second
-multiple alignment. This allows you to iterate the alignment gradually.
-Sometimes, the alignment is improved by a second or third pass.
-</P>
-<P>
-RESET ALL GAPS BEFORE ALIGNMENT will remove all gaps in the sequences including
-gaps which were read in from the sequence input file. This only takes effect
-just before you do a second multiple alignment. You can make phylogenetic
-trees after alignment whether or not this is ON. If you turn this OFF, all
-gaps are kept even if you do a second multiple alignment. This allows you to
-iterate the alignment gradually. Sometimes, the alignment is improved by a
-second or third pass.
-</P>
-<P>
-</P>
-<P>
-PAIRWISE ALIGNMENT PARAMETERS control the speed/sensitivity of the initial
-alignments.
-</P>
-<P>
-MULTIPLE ALIGNMENT PARAMETERS control the gaps in the final multiple
-alignments.
-</P>
-<P>
-PROTEIN GAP PARAMETERS displays a temporary window which allows you to set
-various parameters only used in the alignment of protein sequences.
-</P>
-<P>
-(SECONDARY STRUCTURE PARAMETERS, for use with the Profile Alignment Mode only,
-allows you to set various parameters only used with gap penalty masks.)
-</P>
-<P>
-SAVE LOG FILE will write the alignment calculation scores to a file. The log
-filename is the same as the input sequence filename, with an extension .log
-appended.
-</P>
-<P>
-</P>
-<P>
-<H4>
-OUTPUT FORMAT OPTIONS
-</H4>
-</P>
-<P>
-You can choose from 6 different alignment formats (CLUSTAL, GCG, NBRF/PIR,
-PHYLIP, GDE and NEXUS). You can choose more than one (or all 6 if you wish).
-</P>
-<P>
-CLUSTAL format output is a self explanatory alignment format. It shows the
-sequences aligned in blocks. It can be read in again at a later date to (for
-example) calculate a phylogenetic tree or add in new sequences by profile
-alignment.
-</P>
-<P>
-GCG output can be used by any of the GCG programs that can work on multiple
-alignments (e.g. PRETTY, PROFILEMAKE, PLOTALIGN). It is the same as the GCG
-.msf format files (multiple sequence file); new in version 7 of GCG.
-</P>
-<P>
-NEXUS format is used by several phylogeny programs, including PAUP and
-MacClade.
-</P>
-<P>
-PHYLIP format output can be used for input to the PHYLIP package of Joe
-Felsenstein. This is a very widely used package for doing every imaginable
-form of phylogenetic analysis (MUCH more than the modest introduction
-offered by this program).
-</P>
-<P>
-NBRF/PIR: this is the same as the standard PIR format with ONE ADDITION. Gap
-characters "-" are used to indicate the positions of gaps in the multiple
-alignment. These files can be re-used as input in any part of clustal that
-allows sequences (or alignments or profiles) to be read in.
-</P>
-<P>
-GDE: this format is used by the GDE package of Steven Smith and is understood
-by SEQLAB in GCG 9 or later.
-</P>
-<P>
-GDE OUTPUT CASE: sequences in GDE format may be written in either upper or
-lower case.
-</P>
-<P>
-CLUSTALW SEQUENCE NUMBERS: residue numbers may be added to the end of the
-alignment lines in clustalw format.
-</P>
-<P>
-OUTPUT ORDER is used to control the order of the sequences in the output
-alignments. By default, it uses the order in which the sequences were aligned
-(from the guide tree/dendrogram), thus automatically grouping closely related
-sequences. It can be switched to be the same as the original input order.
-</P>
-<P>
-PARAMETER OUTPUT: This option will save all your parameter settings in a
-parameter file (suffix .par) during alignment. The file can be subsequently
-used to rerun ClustalW using the same parameters.
-</P>
-<P>
-</P>
-<P>
-<H3>
-ALIGNMENT PARAMETERS
-</H3>
-</P>
-<P>
-<STRONG>
-PAIRWISE ALIGNMENT PARAMETERS
-</STRONG>
-</P>
-<P>
-A distance is calculated between every pair of sequences and these are used to
-construct the phylogenetic tree which guides the final multiple alignment. The
-scores are calculated from separate pairwise alignments. These can be
-calculated using 2 methods: dynamic programming (slow but accurate) or by the
-method of Wilbur and Lipman (extremely fast but approximate).
-</P>
-<P>
-You can choose between the 2 alignment methods using the PAIRWISE ALIGNMENTS
-option. The slow/accurate method is fast enough for short sequences but will be
-VERY SLOW for many (e.g. >100) long (e.g. >1000 residue) sequences.
-</P>
-<P>
-</P>
-<P>
-<STRONG>
-SLOW-ACCURATE alignment parameters:
-</STRONG>
-</P>
-<P>
-These parameters do not have any effect on the speed of the alignments. They
-are used to give initial alignments which are then rescored to give percent
-identity scores. These % scores are the ones which are displayed on the
-screen. The scores are converted to distances for the trees.
-</P>
-<P>
-Gap Open Penalty: the penalty for opening a gap in the alignment.
-</P>
-<P>
-Gap Extension Penalty: the penalty for extending a gap by 1 residue.
-</P>
-<P>
-Protein Weight Matrix: the scoring table which describes the similarity of
-each amino acid to each other.
-</P>
-<P>
-Load protein matrix: allows you to read in a comparison table from a file.
-</P>
-<P>
-DNA weight matrix: the scores assigned to matches and mismatches (including
-IUB ambiguity codes).
-</P>
-<P>
-Load DNA matrix: allows you to read in a comparison table from a file.
-</P>
-<P>
-See the Multiple alignment parameters, MATRIX option below for details of the
-matrix input format.
-</P>
-<P>
-</P>
-<P>
-<STRONG>
-FAST-APPROXIMATE alignment parameters:
-</STRONG>
-</P>
-<P>
-These similarity scores are calculated from fast, approximate, global align-
-ments, which are controlled by 4 parameters. 2 techniques are used to make
-these alignments very fast: 1) only exactly matching fragments (k-tuples) are
-considered; 2) only the 'best' diagonals (the ones with most k-tuple matches)
-are used.
-</P>
-<P>
-GAP PENALTY: This is a penalty for each gap in the fast alignments. It has
-little effect on the speed or sensitivity except for extreme values.
-</P>
-<P>
-K-TUPLE SIZE: This is the size of exactly matching fragment that is used.
-INCREASE for speed (max= 2 for proteins; 4 for DNA), DECREASE for sensitivity.
-For longer sequences (e.g. >1000 residues) you may wish to increase the
-default.
-</P>
-<P>
-TOP DIAGONALS: The number of k-tuple matches on each diagonal (in an imaginary
-dot-matrix plot) is calculated. Only the best ones (with most matches) are used
-in the alignment. This parameter specifies how many. Decrease for speed;
-increase for sensitivity.
-</P>
-<P>
-WINDOW SIZE: This is the number of diagonals around each of the 'best'
-diagonals that will be used. Decrease for speed; increase for sensitivity.
-</P>
-<P>
-</P>
-<P>
-<STRONG>
-MULTIPLE ALIGNMENT PARAMETERS
-</STRONG>
-</P>
-<P>
-These parameters control the final multiple alignment. This is the core of the
-program and the details are complicated. To fully understand the use of the
-parameters and the scoring system, you will have to refer to the documentation.
-</P>
-<P>
-Each step in the final multiple alignment consists of aligning two alignments
-or sequences. This is done progressively, following the branching order in the
-GUIDE TREE. The basic parameters to control this are two gap penalties and the
-scores for various identical/non-identical residues.
-</P>
-<P>
-The GAP OPENING and EXTENSION PENALTIES can be set here. These control the
-cost of opening up every new gap and the cost of every item in a gap.
-Increasing the gap opening penalty will make gaps less frequent. Increasing
-the gap extension penalty will make gaps shorter. Terminal gaps are not
-penalised.
-</P>
-<P>
-The DELAY DIVERGENT SEQUENCES switch delays the alignment of the most distantly
-related sequences until after the most closely related sequences have been
-aligned. The setting shows the percent identity level required to delay the
-addition of a sequence; sequences that are less identical than this level to
-any other sequences will be aligned later.
-</P>
-<P>
-The TRANSITION WEIGHT gives transitions (A<-->G or C<-->T i.e. purine-purine or
-pyrimidine-pyrimidine substitutions) a weight between 0 and 1; a weight of zero
-means that the transitions are scored as mismatches, while a weight of 1 gives
-the transitions the match score. For distantly related DNA sequences, the
-weight should be near to zero; for closely related sequences it can be useful
-to assign a higher score. The default is set to 0.5.
-</P>
-<P>
-</P>
-<P>
-The PROTEIN WEIGHT MATRIX option allows you to choose a series of weight
-matrices. For protein alignments, you use a weight matrix to determine the
-similarity of non-identical amino acids. For example, Tyr aligned with Phe is
-usually judged to be 'better' than Tyr aligned with Pro.
-</P>
-<P>
-There are three 'in-built' series of weight matrices offered. Each consists of
-several matrices which work differently at different evolutionary distances. To
-see the exact details, read the documentation. Crudely, we store several
-matrices in memory, spanning the full range of amino acid distance (from almost
-identical sequences to highly divergent ones). For very similar sequences, it
-is best to use a strict weight matrix which only gives a high score to
-identities and the most favoured conservative substitutions. For more divergent
-sequences, it is appropriate to use "softer" matrices which give a high score
-to many other frequent substitutions.
-</P>
-<P>
-1) BLOSUM (Henikoff). These matrices appear to be the best available for
-carrying out data base similarity (homology searches). The matrices currently
-used are: Blosum 80, 62, 45 and 30. BLOSUM was the default in earlier Clustal X
-versions.
-</P>
-<P>
-2) PAM (Dayhoff). These have been extremely widely used since the late '70s. We
-currently use the PAM 20, 60, 120, 350 matrices.
-</P>
-<P>
-3) GONNET. These matrices were derived using almost the same procedure as the
-Dayhoff one (above) but are much more up to date and are based on a far larger
-data set. They appear to be more sensitive than the Dayhoff series. We
-currently use the GONNET 80, 120, 160, 250 and 350 matrices. This series is the
-default for Clustal X version 1.8.
-</P>
-<P>
-We also supply an identity matrix which gives a score of 10 to two identical
-amino acids and a score of zero otherwise. This matrix is not very useful.
-</P>
-<P>
-Load protein matrix: allows you to read in a comparison matrix from a file.
-This can be either a single matrix or a series of matrices (see below for
-format).
-</P>
-<P>
-</P>
-<P>
-DNA WEIGHT MATRIX option allows you to select a single matrix (not a series)
-used for aligning nucleic acid sequences. Two hard-coded matrices are available:
-</P>
-<P>
-1) IUB. This is the default scoring matrix used by BESTFIT for the comparison
-of nucleic acid sequences. X's and N's are treated as matches to any IUB
-ambiguity symbol. All matches score 1.9; all mismatches for IUB symbols score 0.
-</P>
-<P>
-2) CLUSTALW(1.6). A previous system used by ClustalW, in which matches score
-1.0 and mismatches score 0. All matches for IUB symbols also score 0.
-</P>
-<P>
-Load DNA matrix: allows you to read in a nucleic acid comparison matrix from a
-file (just one matrix, not a series).
-</P>
-<P>
-</P>
-<P>
-SINGLE MATRIX INPUT FORMAT
-The format used for a single matrix is the same as the BLAST program. The
-scores in the new weight matrix should be similarities. You can use negative as
-well as positive values if you wish, although the matrix will be automatically
-adjusted to all positive scores, unless the NEGATIVE MATRIX option is selected.
-Any lines beginning with a # character are assumed to be comments. The first
-non-comment line should contain a list of amino acids in any order, using the 1
-letter code, followed by a * character. This should be followed by a square
-matrix of scores, with one row and one column for each amino acid. The last row
-and column of the matrix (corresponding to the * character) contain the minimum
-score over the whole matrix.
-</P>
-<P>
-MATRIX SERIES INPUT FORMAT
-ClustalX uses different matrices depending on the mean percent identity of the
-sequences to be aligned. You can specify a series of matrices and the range of
-the percent identity for each matrix in a matrix series file. The file is
-automatically recognised by the word CLUSTAL_SERIES at the beginning of the
-file. Each matrix in the series is then specified on one line which should
-start with the word MATRIX. This is followed by the lower and upper limits of
-the sequence percent identities for which you want to apply the matrix. The
-final entry on the matrix line is the filename of a Blast format matrix file
-(see above for details of the single matrix file format).
-</P>
-<P>
-Example.
-</P>
-<P>
-CLUSTAL_SERIES
-</P>
-<P>
-MATRIX 81 100 /us1/user/julie/matrices/blosum80
-MATRIX 61 80 /us1/user/julie/matrices/blosum62
-MATRIX 31 60 /us1/user/julie/matrices/blosum45
-MATRIX 0 30 /us1/user/julie/matrices/blosum30
-</P>
-<P>
-</P>
-<P>
-<STRONG>
-PROTEIN GAP PARAMETERS
-</STRONG>
-</P>
-<P>
-RESIDUE SPECIFIC PENALTIES are amino acid specific gap penalties that reduce or
-increase the gap opening penalties at each position in the alignment or
-sequence. See the documentation for details. As an example, positions that are
-rich in glycine are more likely to have an adjacent gap than positions that are
-rich in valine.
-</P>
-<P>
-HYDROPHILIC GAP PENALTIES are used to increase the chances of a gap within a
-run (5 or more residues) of hydrophilic amino acids; these are likely to be
-loop or random coil regions where gaps are more common. The residues that are
-"considered" to be hydrophilic can be entered in HYDROPHILIC RESIDUES.
-</P>
-<P>
-GAP SEPARATION DISTANCE tries to decrease the chances of gaps being too close
-to each other. Gaps that are less than this distance apart are penalised more
-than other gaps. This does not prevent close gaps; it makes them less frequent,
-promoting a block-like appearance of the alignment.
-</P>
-<P>
-END GAP SEPARATION treats end gaps just like internal gaps for the purposes of
-avoiding gaps that are too close (set by GAP SEPARATION DISTANCE above). If you
-turn this off, end gaps will be ignored for this purpose. This is useful when
-you wish to align fragments where the end gaps are not biologically meaningful.
-</P>
-<P>
-</P>
-<P>
-</P>
-<A HREF="#INDEX"> <EM>Back to Index</EM> </A>
-<CENTER><H2><A NAME="P"> Profile and Structure Alignments
-</A></H2></CENTER>
-<P>
-</P>
-<P>
-By PROFILE ALIGNMENT, we mean alignment using existing alignments. Profile
-alignments allow you to store alignments of your favourite sequences and add
-new sequences to them in small bunches at a time. A profile is simply an
-alignment of one or more sequences (e.g. an alignment output file from Clustal
-X). Each input can be a single sequence. One or both sets of input sequences
-may include secondary structure assignments or gap penalty masks to guide the
-alignment.
-</P>
-<P>
-Make sure PROFILE ALIGNMENT MODE is selected, using the switch directly above
-the sequence display area. Then, use the ALIGNMENT menu to do profile and
-secondary structure alignments.
-</P>
-<P>
-The profiles can be in any of the allowed input formats with "-" characters
-used to specify gaps (except for GCG/MSF where "." is used).
-</P>
-<P>
-You have to load the 2 profiles by choosing FILE, LOAD PROFILE 1 and LOAD
-PROFILE 2. Then ALIGNMENT, ALIGN PROFILE 2 TO PROFILE 1 will align the 2
-profiles to each other. Secondary structure masks in either profile can be used
-to guide the alignment. This option compares all the sequences in profile 1
-with all the sequences in profile 2 in order to build guide trees which will be
-used to calculate sequence weights, and select appropriate alignment parameters
-for the final profile alignment.
-</P>
-<P>
-You can skip the first stage (pairwise alignments; guide trees) by using old
-guide tree files (ALIGN PROFILES FROM GUIDE TREES).
-</P>
-<P>
-The ALIGN SEQUENCES TO PROFILE 1 option will take the sequences in the second
-profile and align them to the first profile, 1 at a time. This is useful to
-add some new sequences to an existing alignment, or to align a set of sequences
-to a known structure. In this case, the second profile set need not be
-pre-aligned.
-</P>
-<P>
-You can skip the first stage (pairwise alignments; guide tree) by using an old
-guide tree file (ALIGN SEQUENCES TO PROFILE 1 FROM TREE).
-</P>
-<P>
-SAVE LOG FILE will write the alignment calculation scores to a file. The log
-filename is the same as the input sequence filename, with an extension .log
-appended.
-</P>
-<P>
-The alignment parameters can be set using the ALIGNMENT PARAMETERS menu,
-Pairwise Parameters, Multiple Parameters and Protein Gap Parameters options.
-These are EXACTLY the same parameters as used by the general, automatic
-multiple alignment procedure. The general multiple alignment procedure is
-simply a series of profile alignments. Carrying out a series of profile
-alignments on larger and larger groups of sequences, allows you to manually
-build up a complete alignment, if necessary editing intermediate alignments.
-</P>
-<P>
-<STRONG>
-SECONDARY STRUCTURE PARAMETERS
-</STRONG>
-</P>
-<P>
-Use this menu to set secondary structure options. If a solved structure is
-known, it can be used to guide the alignment by raising gap penalties within
-secondary structure elements, so that gaps will preferentially be inserted into
-unstructured surface loop regions. Alternatively, a user-specified gap penalty
-mask can be supplied for a similar purpose.
-</P>
-<P>
-A gap penalty mask is a series of numbers between 1 and 9, one per position in
-the alignment. Each number specifies how much the gap opening penalty is to be
-raised at that position (raised by multiplying the basic gap opening penalty
-by the number) i.e. a mask figure of 1 at a position means no change
-in gap opening penalty; a figure of 4 means that the gap opening penalty is
-four times greater at that position, making gaps 4 times harder to open.
-</P>
-<P>
-The format for gap penalty masks and secondary structure masks is explained in
-a separate help section.
-</P>
-<P>
-</P>
-<A HREF="#INDEX"> <EM>Back to Index</EM> </A>
-<CENTER><H2><A NAME="B"> Secondary Structure / Gap Penalty Masks
-</A></H2></CENTER>
-<P>
-</P>
-<P>
-The use of secondary structure-based penalties has been shown to improve the
-accuracy of sequence alignment. Clustal X now allows secondary structure/ gap
-penalty masks to be supplied with the input sequences used during profile
-alignment. (NB. The secondary structure information is NOT used during multiple
-sequence alignment). The masks work by raising gap penalties in specified
-regions (typically secondary structure elements) so that gaps are
-preferentially opened in the less well conserved regions (typically surface
-loops).
-</P>
-<P>
-The USE PROFILE 1(2) SECONDARY STRUCTURE / GAP PENALTY MASK options control
-whether the input 2D-structure information or gap penalty masks will be used
-during the profile alignment.
-</P>
-<P>
-The OUTPUT options control whether the secondary structure and gap penalty
-masks should be included in the Clustal X output alignments. Showing both is
-useful for understanding how the masks work. The 2D-structure information is
-itself useful in judging the alignment quality and in seeing how residue
-conservation patterns vary with secondary structure.
-</P>
-<P>
-The HELIX and STRAND GAP PENALTY options provide the value for raising the gap
-penalty at core Alpha Helical (A) and Beta Strand (B) residues. In CLUSTAL
-format, capital residues denote the A and B core structure notation. Basic gap
-penalties are multiplied by the amount specified.
-</P>
-<P>
-The LOOP GAP PENALTY option provides the value for the gap penalty in Loops.
-By default this penalty is not raised. In CLUSTAL format, loops are specified
-by "." in the secondary structure notation.
-</P>
-<P>
-The SECONDARY STRUCTURE TERMINAL PENALTY provides the value for setting the gap
-penalty at the ends of secondary structures. Ends of secondary structures are
-known to grow or shrink, comparing related structures. Therefore by default
-these are given intermediate values, lower than the core penalties. All
-secondary structure read in as lower case in CLUSTAL format gets the reduced
-terminal penalty.
-</P>
-<P>
-The HELIX and STRAND TERMINAL POSITIONS options specify the range of structure
-termini for the intermediate penalties. In the alignment output, these are
-indicated as lower case. For Alpha Helices, by default, the range spans the
-end-helical turn (3 residues). For Beta Strands, the default range spans the
-end residue and the adjacent loop residue, since sequence conservation often
-extends beyond the actual H-bonded Beta Strand.
-</P>
-<P>
-Clustal X can read the masks from SWISS-PROT, CLUSTAL or GDE format input
-files. For many 3-D protein structures, secondary structure information is
-recorded in the feature tables of SWISS-PROT database entries. You should
-always check that the assignments are correct - some are quite inaccurate.
-Clustal X looks for SWISS-PROT HELIX and STRAND assignments e.g.
-</P>
-<P>
-</P>
-<P>
-<PRE>
-FT HELIX 100 115
-FT STRAND 118 119
-</PRE>
-</P>
-<P>
-The structure and penalty masks can also be read from CLUSTAL alignment format
-as comment lines beginning "!SS_" or "!GM_" e.g.
-</P>
-<P>
-<PRE>
-!SS_HBA_HUMA ..aaaAAAAAAAAAAaaa.aaaAAAAAAAAAAaaaaaaAaaa.........aaaAAAAAA
-!GM_HBA_HUMA 112224444444444222122244444444442222224222111111111222444444
-HBA_HUMA VLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHGK
-</PRE>
-</P>
-<P>
-Note that the mask itself is a set of numbers between 1 and 9 each of which is
-assigned to the residue(s) in the same column below.
-</P>
-<P>
-In GDE flat file format, the masks are specified as text and the names must
-begin with "SS_" or "GM_".
-</P>
-<P>
-Either a structure or penalty mask or both may be used. If both are included
-in an alignment, the user will be asked which is to be used.
-</P>
-<P>
-</P>
-<P>
-</P>
-<A HREF="#INDEX"> <EM>Back to Index</EM> </A>
-<CENTER><H2><A NAME="T"> Phylogenetic Trees
-</A></H2></CENTER>
-<P>
-</P>
-<P>
-Before calculating a tree, you must have an ALIGNMENT in memory. This can be
-input using the FILE menu, LOAD SEQUENCES option or you should have just
-carried out a full multiple alignment and the alignment is still in memory.
-Remember YOU MUST ALIGN THE SEQUENCES FIRST!!!!
-</P>
-<P>
-The method used is the NJ (Neighbour Joining) method of Saitou and Nei. First
-you calculate distances (percent divergence) between all pairs of sequences from
-a multiple alignment; second you apply the NJ method to the distance matrix.
-</P>
-<P>
-To calculate a tree, use the DRAW N-J TREE option. This gives an UNROOTED tree
-and all branch lengths. The root of the tree can only be inferred by using an
-outgroup (a sequence that you are certain branches at the outside of the tree
-.... certain on biological grounds) OR if you assume a degree of constancy in
-the 'molecular clock', you can place the root in the 'middle' of the tree
-(roughly equidistant from all tips).
-</P>
-<P>
-BOOTSTRAP N-J TREE uses a method for deriving confidence values for the
-groupings in a tree (first adapted for trees by Joe Felsenstein). It involves
-making N random samples of sites from the alignment (N should be LARGE, e.g.
-500 - 1000); drawing N trees (1 from each sample) and counting how many times
-each grouping from the original tree occurs in the sample trees. You can set N
-using the NUMBER OF BOOTSTRAP TRIALS option in the BOOTSTRAP TREE window. In
-practice, you should use a large number of bootstrap replicates (1000 is
-recommended, even if it means running the program for an hour on a slow
-computer). You can also supply a seed number for the random number generator
-here. Different runs with the same seed will give the same answer. See the
-documentation for more details.
-</P>
-<P>
-EXCLUDE POSITIONS WITH GAPS? With this option, any alignment positions where
-ANY of the sequences have a gap will be ignored. This means that 'like' will
-be compared to 'like' in all distances, which is highly desirable. It also
-automatically throws away the most ambiguous parts of the alignment, which are
-concentrated around gaps (usually). The disadvantage is that you may throw away
-much of the data if there are many gaps (which is why it is difficult for us to
-make it the default).
-</P>
-<P>
-CORRECT FOR MULTIPLE SUBSTITUTIONS? For small divergence (say <10%) this option
-makes no difference. For greater divergence, this option corrects for the fact
-that observed distances underestimate actual evolutionary distances. This is
-because, as sequences diverge, more than one substitution will happen at many
-sites. However, you only see one difference when you look at the present day
-sequences. Therefore, this option has the effect of stretching branch lengths
-in trees (especially long branches). The corrections used here (for DNA or
-proteins) are both due to Motoo Kimura. See the documentation for details.
-</P>
-<P>
-Where possible, this option should be used. However, for VERY divergent
-sequences, the distances cannot be reliably corrected. You will be warned if
-this happens. Even if none of the distances in a data set exceed the reliable
-threshold, if you bootstrap the data, some of the bootstrap distances may
-randomly exceed the safe limit.
-</P>
-<P>
-SAVE LOG FILE will write the tree calculation scores to a file. The log
-filename is the same as the input sequence filename, with an extension .log
-appended.
-</P>
-<P>
-<H4>
-OUTPUT FORMAT OPTIONS
-</H4>
-</P>
-<P>
-Four different formats are allowed. None of these displays the tree visually.
-You can display the tree using the NJPLOT program distributed with Clustal X
-OR get the PHYLIP package and use the tree drawing facilities there.
-</P>
-<P>
-1) CLUSTAL FORMAT TREE. This format is verbose and lists all of the distances
-between the sequences and the number of alignment positions used for each. The
-tree is described at the end of the file. It lists the sequences that are
-joined at each alignment step and the branch lengths. After two sequences are
-joined, it is referred to later as a NODE. The number of a NODE is the number
-of the lowest sequence in that NODE.
-</P>
-<P>
-2) PHYLIP FORMAT TREE. This format is the New Hampshire format, used by many
-phylogenetic analysis packages. It consists of a series of nested parentheses,
-describing the branching order, with the sequence names and branch lengths. It
-can be read by the NJPLOT program distributed with ClustalX. It can also be
-used by the RETREE, DRAWGRAM and DRAWTREE programs of the PHYLIP package to see
-the trees graphically. This is the same format used during multiple alignment
-for the guide trees. Some other packages that can read and display New
-Hampshire format are TreeTool, TreeView, and Phylowin.
-</P>
-<P>
-3) PHYLIP DISTANCE MATRIX. This format just outputs a matrix of all the
-pairwise distances in a format that can be used by the PHYLIP package. It used
-to be useful when one could not produce distances from protein sequences in the
-Phylip package but is now redundant (PROTDIST of Phylip 3.5 now does this).
-</P>
-<P>
-4) NEXUS FORMAT TREE. This format is used by several popular phylogeny programs,
-including PAUP and MacClade. The format is described fully in:
-Maddison, D. R., D. L. Swofford and W. P. Maddison. 1997.
-NEXUS: an extensible file format for systematic information.
-Systematic Biology 46:590-621.
-</P>
-<P>
-BOOTSTRAP LABELS ON: By default, the bootstrap values are correctly placed on
-the tree branches of the phylip format output tree. The toggle allows them to
-be placed on the nodes, which is incorrect, but some display packages (e.g.
-TreeTool, TreeView and Phylowin) only support node labelling but not branch
-labelling. Care should be taken to note which branches and labels go together.
-</P>
-<P>
-</P>
-<P>
-</P>
-<A HREF="#INDEX"> <EM>Back to Index</EM> </A>
-<CENTER><H2><A NAME="C"> Colors
-</A></H2></CENTER>
-<P>
-</P>
-<P>
-Clustal X provides a versatile coloring scheme for the sequence alignment
-display. The sequences (or profiles) are colored automatically, when they are
-loaded. Sequences can be colored either by assigning a color to specific
-residues, or on the basis of an alignment consensus. In the latter case, the
-alignment consensus is calculated automatically, and the residues in each
-column are colored according to the consensus character assigned to that
-column. In this way, you can choose to highlight, for example, conserved
-hydrophilic or hydrophobic positions in the alignment.
-</P>
-<P>
-The 'rules' used to color the alignment are specified in a COLOR PARAMETER
-FILE. Clustal X automatically looks for a file called 'colprot.par' for protein
-sequences or 'coldna.par' for DNA, in the current directory. (If you are running
-under UNIX, it then looks in your home directory, and finally in the
-directories in your PATH environment variable).
-</P>
-<P>
-By default, if no color parameter file is found, protein sequences are colored
-by residue as follows:
-</P>
-<P>
-<PRE>
- Color Residue Code
-</P>
-<P>
- ORANGE GPST
- RED HKR
- BLUE FWY
- GREEN ILMV
-</PRE>
-</P>
-<P>
-In the case of DNA sequences, the default colors are as follows:
-</P>
-<P>
-<PRE>
- Color Residue Code
-</P>
-<P>
- ORANGE A
- RED C
- BLUE T
- GREEN G
-</PRE>
-</P>
-<P>
-</P>
-<P>
-The default BACKGROUND COLORING option shows the sequence residues using a
-black character on a colored background. It can be switched off to show
-residues as a colored character on a white background.
-</P>
-<P>
-Either BLACK AND WHITE or DEFAULT COLOR options can be selected. The Color
-option looks first for the color parameter file (as described above) and, if no
-file is found, uses the default residue-specific colors.
-</P>
-<P>
-You can specify your own coloring scheme by using the LOAD COLOR PARAMETER FILE
-option. The format of the color parameter file is described below.
-</P>
-<P>
-<H4>
-COLOR PARAMETER FILE
-</H4>
-</P>
-<P>
-This file is divided into 3 sections:
-</P>
-<P>
-1) the names and rgb values of the colors
-2) the rules for calculating the consensus
-3) the rules for assigning colors to the residues
-</P>
-<P>
-An example file is given here.
-</P>
-<P>
-<PRE>
- --------------------------------------------------------------------
-@rgbindex
-RED 0.9 0.1 0.1
-BLUE 0.1 0.1 0.9
-GREEN 0.1 0.9 0.1
-YELLOW 0.9 0.9 0.0
-</P>
-<P>
-@consensus
-% = 60% w:l:v:i:m:a:f:c:y:h:p
-# = 80% w:l:v:i:m:a:f:c:y:h:p
-- = 50% e:d
-+ = 60% k:r
-q = 50% q:e
-p = 50% p
-n = 50% n
-t = 50% t:s
-</P>
-<P>
-@color
-g = RED
-p = YELLOW
-t = GREEN if t:%:#
-n = GREEN if n
-w = BLUE if %:#:p
-k = RED if +
- --------------------------------------------------------------------
-</PRE>
-</P>
-<P>
-The first section is optional and is identified by the header @rgbindex. If
-this section exists, each color used in the file must be named and the rgb
-values specified (on a scale from 0 to 1). If the rgb index section is not
-found, the following set of hard-coded colors will be used.
-</P>
-<P>
-<PRE>
-RED 0.9 0.1 0.1
-BLUE 0.1 0.1 0.9
-GREEN 0.1 0.9 0.1
-ORANGE 0.9 0.7 0.3
-CYAN 0.1 0.9 0.9
-PINK 0.9 0.5 0.5
-MAGENTA 0.9 0.1 0.9
-YELLOW 0.9 0.9 0.0
-</PRE>
-</P>
-<P>
-The second section is optional and is identified by the header @consensus. It
-defines how the consensus is calculated.
-</P>
-<P>
-The format of each consensus parameter is:-
-</P>
-<P>
-<PRE>
-c = n% residue_list
-</P>
-<P>
- where
- c is a character used to identify the parameter.
- n is an integer value used as the percentage cutoff
- point.
- residue_list is a list of residues denoted by a single
- character, delimited by a colon (:).
-</PRE>
-</P>
-<P>
-For example: # = 60% w:l:v:i
-</P>
-<P>
-will assign a consensus character # to any column in the alignment which
-contains more than 60% of the residues w,l,v and i.
-</P>
-<P>
-</P>
-<P>
-The third section is identified by the header @color, and defines how colors
-are assigned to each residue in the alignment.
-</P>
-<P>
-The color parameters can take one of two formats:
-</P>
-<P>
-<PRE>
-1) r = color
-2) r = color if consensus_list
-</P>
-<P>
- where
- r is a character used to denote a residue.
- color is one of the colors in the GDE color lookup table.
- consensus_list is a list of consensus characters, each denoted by a
- single character, delimited by a colon (:).
-</PRE>
-</P>
-<P>
-Examples:
-1) g = ORANGE
-</P>
-<P>
-will color all glycines ORANGE, regardless of the consensus.
-</P>
-<P>
-2) w = BLUE if w:%:#
-</P>
-<P>
-will color BLUE any tryptophan which is found in a column with a consensus of
-w, % or #.
-</P>
-<P>
-</P>
-<P>
-</P>
-<A HREF="#INDEX"> <EM>Back to Index</EM> </A>
-<CENTER><H2><A NAME="Q"> Alignment Quality Analysis
-</A></H2></CENTER>
-<P>
-</P>
-<P>
-<H3>
-QUALITY SCORES
-</H3>
-</P>
-<P>
-Clustal X provides an indication of the quality of an alignment by plotting
-a 'conservation score' for each column of the alignment. A high score indicates
-a well-conserved column; a low score indicates low conservation. The quality
-curve is drawn below the alignment.
-</P>
-<P>
-Two methods are also provided to indicate single residues or sequence segments
-which score badly in the alignment.
-</P>
-<P>
-Low-scoring residues are expected to occur at a moderate frequency in all the
-sequences because of their steady divergence due to the natural processes of
-evolution. The most divergent sequences are likely to have the most outliers.
-However, the highlighted residues are especially useful in pointing to
-sequence misalignments. Note that clustering of highlighted residues is a
-strong indication of misalignment. This can arise due to various reasons, for
-example:
-</P>
-<P>
- 1. Partial or total misalignments caused by a failure in the
- alignment algorithm. Usually only in difficult alignment cases.
-</P>
-<P>
- 2. Partial or total misalignments because at least one of the
- sequences in the given set is partly or completely unrelated to the
- other sequences. It is up to the user to check that the set of
- sequences is alignable.
-</P>
-<P>
- 3. Frameshift translation errors in a protein sequence causing local
- mismatched regions to be heavily highlighted. These are surprisingly
- common in database entries. If suspected, a 3-frame translation of
- the source DNA needs to be examined.
-</P>
-<P>
-Occasionally, highlighted residues may point to regions of some biological
-significance. This might happen for example if a protein alignment contains a
-sequence which has acquired new functions relative to the main sequence set. It
-is important to exclude other explanations, such as error or the natural
-divergence of sequences, before invoking a biological explanation.
-</P>
-<P>
-</P>
-<P>
-<H3>
-LOW-SCORING SEGMENTS
-</H3>
-</P>
-<P>
-Unreliable regions in the alignment can be highlighted using the Low-Scoring
-Segments option. A sequence-weighted profile is used to indicate any segments
-in the sequences which score badly. Because the profile calculation may take
-some time, an option is provided to calculate LOW-SCORING SEGMENTS. The
-segment display can then be toggled on or off without having to repeat the
-time-consuming calculations.
-</P>
-<P>
-For details of the low-scoring segment calculation, see the CALCULATION section
-below.
-</P>
-<P>
-</P>
-<P>
-<H4>
-LOW-SCORING SEGMENT PARAMETERS
-</H4>
-</P>
-<P>
-MINIMUM LENGTH OF SEGMENTS: short segments (or even single residues) can be
-hidden by increasing the minimum length of segments which will be displayed.
-</P>
-<P>
-DNA MARKING SCALE is used to remove less significant segments from the
-highlighted display. Increase the scale to display more segments; decrease the
-scale to remove the least significant.
-</P>
-<P>
-</P>
-<P>
-PROTEIN WEIGHT MATRIX: the scoring table which describes the similarity of each
-amino acid to each other. The matrix is used to calculate the sequence-
-weighted profile scores. There are four 'in-built' Log-Odds matrices offered:
-the Gonnet PAM 80, 120, 250, 350 matrices. A more stringent matrix which only
-gives a high score to identities and the most favoured conservative
-substitutions, may be more suitable when the sequences are closely related. For
-more divergent sequences, it is appropriate to use "softer" matrices which give
-a high score to many other frequent substitutions. This option automatically
-recalculates the low-scoring segments.
-</P>
-<P>
-</P>
-<P>
-DNA WEIGHT MATRIX: Two hard-coded matrices are available:
-</P>
-<P>
-1) IUB. This is the default scoring matrix used by BESTFIT for the comparison
-of nucleic acid sequences. X's and N's are treated as matches to any IUB
-ambiguity symbol. All matches score 1.0; all mismatches for IUB symbols score
-0.9.
-</P>
-<P>
-2) CLUSTALW(1.6). The previous system used by ClustalW, in which matches score
-1.0 and mismatches score 0. All matches for IUB symbols also score 0.
-</P>
-<P>
-A new matrix can be read from a file on disk, if the filename consists only
-of lower case characters. The values in the new weight matrix should be
-similarities and should be NEGATIVE for infrequent substitutions.
-</P>
-<P>
-INPUT FORMAT. The format used for a new matrix is the same as the BLAST
-program. Any lines beginning with a # character are assumed to be comments. The
-first non-comment line should contain a list of amino acids in any order, using
-the 1 letter code, followed by a * character. This should be followed by a
-square matrix of scores, with one row and one column for each amino acid. The
-last row and column of the matrix (corresponding to the * character) contain
-the minimum score over the whole matrix.
-</P>
-<P>
-<H4>
-QUALITY SCORE PARAMETERS
-</H4>
-</P>
-<P>
-You can customise the column 'quality scores' plotted underneath the alignment
-display using the following options.
-</P>
-<P>
-SCORE PLOT SCALE: this is a scalar value from 1 to 10, which can be used to
-change the scale of the quality score plot.
-</P>
-<P>
-RESIDUE EXCEPTION CUTOFF: this is a scalar value from 1 to 10, which can be
-used to change the number of residue exceptions which are highlighted in the
-alignment display. (For an explanation of this cutoff, see the CALCULATION OF
-RESIDUE EXCEPTIONS section below.)
-</P>
-<P>
-PROTEIN WEIGHT MATRIX: the scoring table which describes the similarity of
-each amino acid to each other.
-</P>
-<P>
-DNA WEIGHT MATRIX: two hard-coded matrices are available: IUB and CLUSTALW(1.6).
-</P>
-<P>
-For more information about the weight matrices, see the help above for
-the Low-scoring Segments Weight Matrix.
-</P>
-<P>
-For details of the quality score calculations, see the CALCULATION section
-below.
-</P>
-<P>
-</P>
-<P>
-<STRONG>
-SHOW LOW-SCORING SEGMENTS
-</STRONG>
-</P>
-<P>
-The low-scoring segment display can be toggled on or off. This option does not
-recalculate the profile scores.
-</P>
-<P>
-</P>
-<P>
-<STRONG>
-SHOW EXCEPTIONAL RESIDUES
-</STRONG>
-</P>
-<P>
-This option highlights individual residues which score badly in the alignment
-quality calculations. Residues which score exceptionally low are highlighted by
-using a white character on a grey background.
-</P>
-<P>
-<STRONG>
-SAVE QUALITY SCORES TO FILE
-</STRONG>
-</P>
-<P>
-The quality scores that are plotted underneath the alignment display can also
-be saved in a text file. Each column in the alignment is written on one line in
-the output file, with the value of the quality score at the end of the line.
-Only the sequences currently selected in the display are written to the file.
-One use for quality scores is to color residues in a protein structure by
-sequence conservation. In this way conserved surface residues can be
-highlighted to locate functional regions such as ligand-binding sites.
-</P>
-<P>
-</P>
-<P>
-<H3>
-CALCULATION OF QUALITY SCORES
-</H3>
-</P>
-<P>
-Suppose we have an alignment of m sequences of length n. Then, the alignment
-can be written as:
-</P>
-<P>
-<PRE>
- A11 A12 A13 .......... A1n
- A21 A22 A23 .......... A2n
- .
- .
- Am1 Am2 Am3 .......... Amn
-</PRE>
-</P>
-<P>
-We also have a residue comparison matrix of size R where C(i,j) is the score
-for aligning residue i with residue j.
-</P>
-<P>
-We want to calculate a score for the conservation of the jth position in the
-alignment.
-</P>
-<P>
-To do this, we define an R-dimensional sequence space. For the jth position in
-the alignment, each sequence consists of a single residue which is assigned a
-point S in the space. S has R dimensions, and for sequence i, the rth dimension
-is defined as:
-</P>
-<P>
-<PRE>
- Sr = C(r,Aij)
-</PRE>
-</P>
-<P>
-We then calculate a consensus value for the jth position in the alignment. This
-value X also has R dimensions, and the rth dimension is defined as:
-</P>
-<P>
-<PRE>
- Xr = ( SUM (Fij * C(i,r)) ) / m
- 1<=i<=R
-</PRE>
-</P>
-<P>
-where Fij is the count of residues i at position j in the alignment.
-</P>
-<P>
-Now we can calculate the distance Di between each sequence i and the consensus
-position X in the R-dimensional space.
-</P>
-<P>
-<PRE>
- Di = SQRT ( SUM (Xr - Sr)(Xr - Sr) )
- 1<=r<=R
-</P>
-<P>
-</PRE>
-</P>
-<P>
-The quality score for the jth position in the alignment is defined as the mean
-of the sequence distances Di.
-</P>
-<P>
-The score is normalised by multiplying by the percentage of sequences which
-have residues (and not gaps) at this position.
-</P>
-<P>
-<H3>
-CALCULATION OF RESIDUE EXCEPTIONS
-</H3>
-</P>
-<P>
-The jth residue of the ith sequence is considered as an exception if the
-distance Di of the sequence from the consensus value X is greater than (Upper
-Quartile + Inter Quartile Range * Cutoff). The value used as a cutoff for
-displaying exceptions can be set from the SCORE PARAMETERS menu. A high cutoff
-value will only display very significant exceptions; a low value will allow
-more, less significant, exceptions to be highlighted.
-</P>
-<P>
-(NB. Sequences which contain gaps at this position are not included in the
-exception calculation.)
-</P>
-<P>
-</P>
-<P>
-<H3>
-CALCULATION OF LOW-SCORING SEGMENTS
-</H3>
-</P>
-<P>
-Suppose we have an alignment of m sequences of length n. Then, the alignment
-can be written as:
-</P>
-<P>
-<PRE>
- A11 A12 A13 .......... A1n
- A21 A22 A23 .......... A2n
- .
- .
- Am1 Am2 Am3 .......... Amn
-</PRE>
-</P>
-<P>
-We also have a residue comparison matrix of size R where C(i,j) is the score
-for aligning residue i with residue j.
-</P>
-<P>
-We calculate sequence weights by building a neighbour-joining tree, in which
-branch lengths are proportional to divergence. Summing the branches by branch
-ownership provides the weights. See Thompson et al., CABIOS, 10, 19 (1994) and
-Henikoff et al., JMB, 243, 574 (1994).
-</P>
-<P>
-To find the low-scoring segments in a sequence Si, we build a weighted profile
-of the remaining sequences in the alignment. Suppose we find residue r at
-position j in the sequence; then the score for the jth position in the sequence
-is defined as
-</P>
-<P>
-<PRE>
- Score(Si,j) = Profile(j,r) where Profile(j,r) is the profile score
- for residue r at position j in the
- alignment.
-</PRE>
-</P>
-<P>
-These residue scores are summed along the sequence in both forward and backward
-directions. If the sum of the scores is positive, then it is reset to zero.
-Segments which score negatively in both directions are considered as
-'low-scoring' and will be highlighted in the alignment display.
-</P>
-<P>
-</P>
-<P>
-</P>
-<A HREF="#INDEX"> <EM>Back to Index</EM> </A>
-<CENTER><H2><A NAME="9"> Command Line Parameters
-</A></H2></CENTER>
-<CENTER><H3> DATA (sequences)
-</H3></CENTER>
-<CENTER><TABLE ALIGN=ABSCENTER BORDER=1 CELLSPACING=1 CELLPADDING=5>
-<TR>
-<TD><STRONG>Parameter</STRONG></TD>
-<TD><STRONG><EM>Description</EM></STRONG></TD>
-</TR>
-<TR>
-<TD><TT>-PROFILE1=file.ext and -PROFILE2=file.ext </TT></TD>
-<TD><EM>profiles (aligned sequences)</EM></TD>
-</TR>
-</TABLE></CENTER>
-<CENTER><H3> VERBS (do things)
-</H3></CENTER>
-<CENTER><TABLE ALIGN=ABSCENTER BORDER=1 CELLSPACING=1 CELLPADDING=5>
-<TR>
-<TD><STRONG>Parameter</STRONG></TD>
-<TD><STRONG><EM>Description</EM></STRONG></TD>
-</TR>
-<TR>
-<TD><TT>-HELP or -CHECK </TT></TD>
-<TD><EM>outline the command line parameters</EM></TD>
-</TR>
-<TR>
-<TD><TT>-ALIGN </TT></TD>
-<TD><EM>do full multiple alignment </EM></TD>
-</TR>
-<TR>
-<TD><TT>-TREE </TT></TD>
-<TD><EM>calculate NJ tree</EM></TD>
-</TR>
-<TR>
-<TD><TT>-BOOTSTRAP(=n) </TT></TD>
-<TD><EM>bootstrap a NJ tree (n= number of bootstraps; def. = 1000)</EM></TD>
-</TR>
-<TR>
-<TD><TT>-CONVERT </TT></TD>
-<TD><EM>output the input sequences in a different file format</EM></TD>
-</TR>
-</TABLE></CENTER>
-<CENTER><H3> PARAMETERS (set things)
-</H3></CENTER>
-<CENTER><P><STRONG>***General settings:****
-</STRONG></P></CENTER>
-<CENTER><TABLE ALIGN=ABSCENTER BORDER=1 CELLSPACING=1 CELLPADDING=5>
-<TR>
-<TD><STRONG>Parameter</STRONG></TD>
-<TD><STRONG><EM>Description</EM></STRONG></TD>
-</TR>
-<TR>
-<TD><TT>-INTERACTIVE </TT></TD>
-<TD><EM>read command line, then enter normal interactive menus</EM></TD>
-</TR>
-<TR>
-<TD><TT>-QUICKTREE </TT></TD>
-<TD><EM>use FAST algorithm for the alignment guide tree</EM></TD>
-</TR>
-<TR>
-<TD><TT>-TYPE= </TT></TD>
-<TD><EM>PROTEIN or DNA sequences</EM></TD>
-</TR>
-<TR>
-<TD><TT>-NEGATIVE </TT></TD>
-<TD><EM>protein alignment with negative values in matrix</EM></TD>
-</TR>
-<TR>
-<TD><TT>-OUTFILE= </TT></TD>
-<TD><EM>sequence alignment file name</EM></TD>
-</TR>
-<TR>
-<TD><TT>-OUTPUT= </TT></TD>
-<TD><EM>GCG, GDE, PHYLIP, PIR or NEXUS</EM></TD>
-</TR>
-<TR>
-<TD><TT>-OUTORDER= </TT></TD>
-<TD><EM>INPUT or ALIGNED</EM></TD>
-</TR>
-<TR>
-<TD><TT>-CASE= </TT></TD>
-<TD><EM>LOWER or UPPER (for GDE output only)</EM></TD>
-</TR>
-<TR>
-<TD><TT>-SEQNOS= </TT></TD>
-<TD><EM>OFF or ON (for Clustal output only)</EM></TD>
-</TR>
-</TABLE></CENTER>
-<CENTER><H3>***Fast Pairwise Alignments:***
-</H3></CENTER>
-<CENTER><TABLE ALIGN=ABSCENTER BORDER=1 CELLSPACING=1 CELLPADDING=5>
-<TR>
-<TD><STRONG>Parameter</STRONG></TD>
-<TD><STRONG><EM>Description</EM></STRONG></TD>
-</TR>
-<TR>
-<TD><TT>-TOPDIAGS=n </TT></TD>
-<TD><EM>number of best diags.</EM></TD>
-</TR>
-<TR>
-<TD><TT>-WINDOW=n </TT></TD>
-<TD><EM>window around best diags.</EM></TD>
-</TR>
-<TR>
-<TD><TT>-PAIRGAP=n </TT></TD>
-<TD><EM>gap penalty</EM></TD>
-</TR>
-<TR>
-<TD><TT>-SCORE= </TT></TD>
-<TD><EM>PERCENT or ABSOLUTE</EM></TD>
-</TR>
-</TABLE></CENTER>
-<CENTER><H3>***Slow Pairwise Alignments:***
-</H3></CENTER>
-<CENTER><TABLE ALIGN=ABSCENTER BORDER=1 CELLSPACING=1 CELLPADDING=5>
-<TR>
-<TD><STRONG>Parameter</STRONG></TD>
-<TD><STRONG><EM>Description</EM></STRONG></TD>
-</TR>
-<TR>
-<TD><TT>-PWDNAMATRIX= </TT></TD>
-<TD><EM>DNA weight matrix=IUB, CLUSTALW or filename</EM></TD>
-</TR>
-<TR>
-<TD><TT>-PWGAPOPEN=f </TT></TD>
-<TD><EM>gap opening penalty</EM></TD>
-</TR>
-<TR>
-<TD><TT>-PWGAPEXT=f </TT></TD>
-<TD><EM>gap extension penalty</EM></TD>
-</TR>
-</TABLE></CENTER>
-<CENTER><H3>***Multiple Alignments:***
-</H3></CENTER>
-<CENTER><TABLE ALIGN=ABSCENTER BORDER=1 CELLSPACING=1 CELLPADDING=5>
-<TR>
-<TD><STRONG>Parameter</STRONG></TD>
-<TD><STRONG><EM>Description</EM></STRONG></TD>
-</TR>
-<TR>
-<TD><TT>-USETREE= </TT></TD>
-<TD><EM>file for old guide tree</EM></TD>
-</TR>
-<TR>
-<TD><TT>-MATRIX= </TT></TD>
-<TD><EM>Protein weight matrix=BLOSUM, PAM, GONNET, ID or filename</EM></TD>
-</TR>
-<TR>
-<TD><TT>-DNAMATRIX= </TT></TD>
-<TD><EM>DNA weight matrix=IUB, CLUSTALW or filename</EM></TD>
-</TR>
-<TR>
-<TD><TT>-GAPOPEN=f </TT></TD>
-<TD><EM>gap opening penalty</EM></TD>
-</TR>
-<TR>
-<TD><TT>-GAPEXT=f </TT></TD>
-<TD><EM>gap extension penalty</EM></TD>
-</TR>
-<TR>
-<TD><TT>-ENDGAPS </TT></TD>
-<TD><EM>no end gap separation pen.</EM></TD>
-</TR>
-<TR>
-<TD><TT>-GAPDIST=n </TT></TD>
-<TD><EM>gap separation pen. range</EM></TD>
-</TR>
-<TR>
-<TD><TT>-NOPGAP </TT></TD>
-<TD><EM>residue-specific gaps off</EM></TD>
-</TR>
-<TR>
-<TD><TT>-NOHGAP </TT></TD>
-<TD><EM>hydrophilic gaps off</EM></TD>
-</TR>
-<TR>
-<TD><TT>-HGAPRESIDUES= </TT></TD>
-<TD><EM>list hydrophilic res.</EM></TD>
-</TR>
-<TR>
-<TD><TT>-MAXDIV=n </TT></TD>
-<TD><EM>% ident. for delay</EM></TD>
-</TR>
-<TR>
-<TD><TT>-TYPE= </TT></TD>
-<TD><EM>PROTEIN or DNA</EM></TD>
-</TR>
-<TR>
-<TD><TT>-TRANSWEIGHT=f </TT></TD>
-<TD><EM>transitions weighting</EM></TD>
-</TR>
-</TABLE></CENTER>
-<CENTER><H3>***Profile Alignments:***
-</H3></CENTER>
-<CENTER><TABLE ALIGN=ABSCENTER BORDER=1 CELLSPACING=1 CELLPADDING=5>
-<TR>
-<TD><STRONG>Parameter</STRONG></TD>
-<TD><STRONG><EM>Description</EM></STRONG></TD>
-</TR>
-<TR>
-<TD><TT>-NEWTREE1= </TT></TD>
-<TD><EM>file for new guide tree for profile1</EM></TD>
-</TR>
-<TR>
-<TD><TT>-NEWTREE2= </TT></TD>
-<TD><EM>file for new guide tree for profile2</EM></TD>
-</TR>
-<TR>
-<TD><TT>-USETREE1= </TT></TD>
-<TD><EM>file for old guide tree for profile1</EM></TD>
-</TR>
-<TR>
-<TD><TT>-USETREE2= </TT></TD>
-<TD><EM>file for old guide tree for profile2</EM></TD>
-</TR>
-</TABLE></CENTER>
-<CENTER><H3>***Sequence to Profile Alignments:***
-</H3></CENTER>
-<CENTER><TABLE ALIGN=ABSCENTER BORDER=1 CELLSPACING=1 CELLPADDING=5>
-<TR>
-<TD><STRONG>Parameter</STRONG></TD>
-<TD><STRONG><EM>Description</EM></STRONG></TD>
-</TR>
-<TR>
-<TD><TT>-NEWTREE= </TT></TD>
-<TD><EM>file for new guide tree</EM></TD>
-</TR>
-<TR>
-<TD><TT>-USETREE= </TT></TD>
-<TD><EM>file for old guide tree</EM></TD>
-</TR>
-</TABLE></CENTER>
-<CENTER><H3>***Structure Alignments:***
-</H3></CENTER>
-<CENTER><TABLE ALIGN=ABSCENTER BORDER=1 CELLSPACING=1 CELLPADDING=5>
-<TR>
-<TD><STRONG>Parameter</STRONG></TD>
-<TD><STRONG><EM>Description</EM></STRONG></TD>
-</TR>
-<TR>
-<TD><TT>-NOSECSTR2 </TT></TD>
-<TD><EM>do not use secondary structure/gap penalty mask for profile 2</EM></TD>
-</TR>
-<TR>
-<TD><TT>-SECSTROUT=STRUCTURE or MASK or BOTH or NONE </TT></TD>
-<TD><EM>output in alignment file</EM></TD>
-</TR>
-<TR>
-<TD><TT>-HELIXGAP=n </TT></TD>
-<TD><EM>gap penalty for helix core residues </EM></TD>
-</TR>
-<TR>
-<TD><TT>-STRANDGAP=n </TT></TD>
-<TD><EM>gap penalty for strand core residues</EM></TD>
-</TR>
-<TR>
-<TD><TT>-LOOPGAP=n </TT></TD>
-<TD><EM>gap penalty for loop regions</EM></TD>
-</TR>
-<TR>
-<TD><TT>-TERMINALGAP=n </TT></TD>
-<TD><EM>gap penalty for structure termini</EM></TD>
-</TR>
-<TR>
-<TD><TT>-HELIXENDIN=n </TT></TD>
-<TD><EM>number of residues inside helix to be treated as terminal</EM></TD>
-</TR>
-<TR>
-<TD><TT>-HELIXENDOUT=n </TT></TD>
-<TD><EM>number of residues outside helix to be treated as terminal</EM></TD>
-</TR>
-<TR>
-<TD><TT>-STRANDENDIN=n </TT></TD>
-<TD><EM>number of residues inside strand to be treated as terminal</EM></TD>
-</TR>
-<TR>
-<TD><TT>-STRANDENDOUT=n</TT></TD>
-<TD><EM>number of residues outside strand to be treated as terminal </EM></TD>
-</TR>
-</TABLE></CENTER>
-<CENTER><H3>***Trees:***
-</H3></CENTER>
-<CENTER><TABLE ALIGN=ABSCENTER BORDER=1 CELLSPACING=1 CELLPADDING=5>
-<TR>
-<TD><STRONG>Parameter</STRONG></TD>
-<TD><STRONG><EM>Description</EM></STRONG></TD>
-</TR>
-<TR>
-<TD><TT>-SEED=n </TT></TD>
-<TD><EM>seed number for bootstraps</EM></TD>
-</TR>
-<TR>
-<TD><TT>-KIMURA </TT></TD>
-<TD><EM>use Kimura's correction</EM></TD>
-</TR>
-<TR>
-<TD><TT>-TOSSGAPS </TT></TD>
-<TD><EM>ignore positions with gaps</EM></TD>
-</TR>
-<TR>
-<TD><TT>-BOOTLABELS=node OR branch </TT></TD>
-<TD><EM>position of bootstrap values in tree display</EM></TD>
-</TR>
-</TABLE></CENTER>
-</P>
-<A HREF="#INDEX"> <EM>Back to Index</EM> </A>
-<CENTER><H2><A NAME="R"> References
-</A></H2></CENTER>
-<P>
-</P>
-<P>
-<STRONG>
-The ClustalX program is described in the manuscript:
-</STRONG>
-</P>
-<P>
-Thompson,J.D., Gibson,T.J., Plewniak,F., Jeanmougin,F. and Higgins,D.G. (1997)
-The ClustalX windows interface: flexible strategies for multiple sequence
-alignment aided by quality analysis tools. Nucleic Acids Research, 25:4876-4882.
-</P>
-<P>
-</P>
-<P>
-<STRONG>
-The ClustalW program is described in the manuscript:
-</STRONG>
-</P>
-<P>
-Thompson, J.D., Higgins, D.G. and Gibson, T.J. (1994) CLUSTAL W: improving the
-sensitivity of progressive multiple sequence alignment through sequence
-weighting, position-specific gap penalties and weight matrix choice. Nucleic
-Acids Research, 22:4673-4680.
-</P>
-<P>
-</P>
-<P>
-<STRONG>
-The ClustalV program is described in the manuscript:
-</STRONG>
-</P>
-<P>
-Higgins,D.G., Bleasby,A.J. and Fuchs,R. (1992) CLUSTAL V: improved software for
-multiple sequence alignment. CABIOS 8,189-191.
-</P>
-<P>
-</P>
-<P>
-<STRONG>
-The original Clustal program is described in the manuscripts:
-</STRONG>
-</P>
-<P>
-Higgins,D.G. and Sharp,P.M. (1989) Fast and sensitive multiple sequence
-alignments on a microcomputer.
-CABIOS 5,151-153.
-</P>
-<P>
-Higgins,D.G. and Sharp,P.M. (1988) CLUSTAL: a package for performing multiple
-sequence alignment on a microcomputer. Gene 73,237-244.
-</P>
-<P>
-<STRONG>
-Some tips on using Clustal X:
-</STRONG>
-</P>
-<P>
-Jeanmougin,F., Thompson,J.D., Gouy,M., Higgins,D.G. and Gibson,T.J. (1998)
-Multiple sequence alignment with Clustal X. Trends Biochem Sci, 23, 403-5.
-</P>
-<P>
-<STRONG>
-Some tips on using Clustal W:
-</STRONG>
-</P>
-<P>
-Higgins, D. G., Thompson, J. D. and Gibson, T. J. (1996) Using CLUSTAL for
-multiple sequence alignments. Methods Enzymol., 266, 383-402.
-</P>
-<P>
-<STRONG>
-You can get the latest version of the ClustalX program by anonymous ftp to:
-</STRONG>
-</P>
-<P>
-ftp-igbmc.u-strasbg.fr
-ftp.embl-heidelberg.de
-ftp.ebi.ac.uk
-</P>
-<P>
-<STRONG>
-Or, have a look at the following WWW site:
-</STRONG>
-</P>
-<P>
-http://www-igbmc.u-strasbg.fr/BioInfo/
-</P>
-<P>
-</P>
-<A HREF="#INDEX"> <EM>Back to Index</EM> </A>
-<HEAD>
-<TITLE>ClustalX Help</TITLE>
-</HEAD>
-<BODY BGCOLOR=white>
-<CENTER><H1>ClustalX Help</H1></CENTER>
-<P>
-You can get the latest version of the ClustalX program here:
-</P>
-<DL><DD>
-<A HREF="ftp://ftp-igbmc.u-strasbg.fr/pub/ClustalX/">
-ftp://ftp-igbmc.u-strasbg.fr/pub/ClustalX/</A>
-</DL>
-<P>For full details of usage and algorithms, please read the <A HREF="clustalw.doc"><EM>ClustalW.Doc</EM></A> file.</P>
-<PRE><EM>
-Toby Gibson EMBL, Heidelberg, Germany.
-Des Higgins UCC, Cork, Ireland.
-Julie Thompson/Francois Jeanmougin IGBMC, Strasbourg, France.
-</EM></PRE>
-<CENTER><H2><A NAME="INDEX">Index</A></H2></CENTER>
-<OL>
-<LI><A HREF="#G"> General help for CLUSTAL X (1.8)
-</A></LI>
-<LI><A HREF="#F"> Input / Output Files
-</A></LI>
-<LI><A HREF="#E"> Editing Alignments
-</A></LI>
-<LI><A HREF="#M"> Multiple Alignments
-</A></LI>
-<LI><A HREF="#P"> Profile and Structure Alignments
-</A></LI>
-<LI><A HREF="#B"> Secondary Structure / Gap Penalty Masks
-</A></LI>
-<LI><A HREF="#T"> Phylogenetic Trees
-</A></LI>
-<LI><A HREF="#C"> Colors
-</A></LI>
-<LI><A HREF="#Q"> Alignment Quality Analysis
-</A></LI>
-<LI><A HREF="#9"> Command Line Parameters
-</A></LI>
-<LI><A HREF="#R"> References
-</A></LI>
-</OL>
-<CENTER><H2><A NAME="G"> General help for CLUSTAL X (1.8)
-</A></H2></CENTER>
-<P>
-</P>
-<P>
-Clustal X is a windows interface for the ClustalW multiple sequence alignment
-program. It provides an integrated environment for performing multiple sequence
-and profile alignments and analysing the results. The sequence alignment is
-displayed in a window on the screen. A versatile coloring scheme has been
-incorporated allowing you to highlight conserved features in the alignment.
-The pull-down menus at the top of the window allow you to select all the
-options required for traditional multiple sequence and profile alignment.
-</P>
-<P>
-You can cut-and-paste sequences to change the order of the alignment; you can
-select a subset of sequences to be aligned; you can select a sub-range of the
-alignment to be realigned and inserted back into the original alignment.
-</P>
-<P>
-Alignment quality analysis can be performed and low-scoring segments or
-exceptional residues can be highlighted.
-</P>
-<P>
-ClustalX is available for a number of different platforms including: SUN
-Solaris, IRIX5.3 on Silicon Graphics, Digital UNIX on DECStations, Microsoft
-Windows (32 bit) for PC's, Linux ELF for x86 PC's and Macintosh PowerMac. (See
-the README file for Installation instructions.)
-</P>
-<P>
-</P>
-<P>
-<H4>
-SEQUENCE INPUT
-</H4>
-</P>
-<P>
-Sequences and profiles (a term for pre-existing alignments) are input using
-the FILE menu. Invalid options will be disabled. All sequences must be included
-in 1 file. 7 formats are automatically recognised: NBRF/PIR, EMBL/SWISSPROT,
-Pearson (Fasta), Clustal (*.aln), GCG/MSF (Pileup), GCG9 RSF and GDE flat file.
-All non-alphabetic characters (spaces, digits, punctuation marks) are ignored
-except "-" which is used to indicate a GAP ("." in MSF/RSF).
-</P>
-<P>
-<H4>
-SEQUENCE / PROFILE ALIGNMENTS
-</H4>
-</P>
-<P>
-Clustal X has two modes which can be selected using the switch directly above
-the sequence display: MULTIPLE ALIGNMENT MODE and PROFILE ALIGNMENT MODE.
-</P>
-<P>
-To do a MULTIPLE ALIGNMENT on a set of sequences, make sure MULTIPLE ALIGNMENT
-MODE is selected. A single sequence data area is then displayed. The ALIGNMENT
-menu then allows you to either produce a guide tree for the alignment, or to do
-a multiple alignment following the guide tree, or to do a full multiple
-alignment.
-</P>
-<P>
-In PROFILE ALIGNMENT MODE, two sequence data areas are displayed, allowing you
-to align 2 alignments (termed profiles). Profiles are also used to add a new
-sequence to an old alignment, or to use secondary structure to guide the
-alignment process. GAPS in the old alignments are indicated using the "-"
-character. PROFILES can be input in ANY of the allowed formats; just use "-"
-(or "." for MSF/RSF) for each gap position. In Profile Alignment Mode, a button
-"Lock Scroll" is displayed which allows you to scroll the two profiles together
-using a single scroll bar. When the Lock Scroll is turned off, the two profiles
-can be scrolled independently.
-</P>
-<P>
-<H4>
-PHYLOGENETIC TREES
-</H4>
-</P>
-<P>
-Phylogenetic trees can be calculated from old alignments (read in with "-"
-characters to indicate gaps) OR after a multiple alignment while the alignment
-is still displayed.
-</P>
-<P>
-<H4>
-ALIGNMENT DISPLAY
-</H4>
-</P>
-<P>
-The alignment is displayed on the screen with the sequence names on the left
-hand side. The sequence alignment is for display only; it cannot be edited here
-(except for changing the sequence order by cutting-and-pasting on the sequence
-names).
-</P>
-<P>
-A ruler is displayed below the sequences, starting at 1 for the first residue
-position (residue numbers in the sequence input file are ignored).
-</P>
-<P>
-A line above the alignment is used to mark strongly conserved positions. Three
-characters ('*', ':' and '.') are used:
-</P>
-<P>
-'*' indicates positions which have a single, fully conserved residue
-</P>
-<P>
-':' indicates that one of the following 'strong' groups is fully conserved:-
-<PRE>
- STA
- NEQK
- NHQK
- NDEQ
- QHRK
- MILV
- MILF
- HY
- FYW
-</PRE>
-</P>
-<P>
-'.' indicates that one of the following 'weaker' groups is fully conserved:-
-<PRE>
- CSA
- ATV
- SAG
- STNK
- STPA
- SGND
- SNDEQK
- NDEQHK
- NEQHRK
- FVLIM
- HFY
-</PRE>
-</P>
-<P>
-These are all the positively scoring groups that occur in the Gonnet Pam250
-matrix. The strong and weak groups are defined as strong score >0.5 and weak
-score =<0.5 respectively.
-</P>
-<P>
-For profile alignments, secondary structure and gap penalty masks are displayed
-above the sequences, if any data is found in the profile input file.
-</P>
-<P>
-</P>
-<P>
-</P>
-<A HREF="#INDEX"> <EM>Back to Index</EM> </A>
-<CENTER><H2><A NAME="F"> Input / Output Files
-</A></H2></CENTER>
-<P>
-</P>
-<P>
-LOAD SEQUENCES reads sequences from one of 7 file formats, replacing any
-sequences that are already loaded. All sequences must be in 1 file. The formats
-that are automatically recognised are: NBRF/PIR, EMBL/SWISSPROT, Pearson
-(Fasta), Clustal (*.aln), GCG/MSF (Pileup), GCG9/RSF and GDE flat file. All
-non-alphabetic characters (spaces, digits, punctuation marks) are ignored
-except "-" which is used to indicate a GAP ("." in MSF/RSF).
-</P>
-<P>
-The program tries to automatically recognise the different file formats used
-and to guess whether the sequences are amino acid or nucleotide. This is not
-always foolproof.
-</P>
-<P>
-FASTA and NBRF/PIR formats are recognised by having a ">" as the first
-character in the file.
-</P>
-<P>
-EMBL/Swiss Prot formats are recognised by the letters "ID" at the start of the
-file (the token for the entry name field).
-</P>
-<P>
-CLUSTAL format is recognised by the word CLUSTAL at the beginning of the file.
-</P>
-<P>
-GCG/MSF format is recognised by one of the following:
-<UL>
-<LI>
- - the word PileUp at the start of the file.
-</LI><LI>
- - the word !!AA_MULTIPLE_ALIGNMENT or !!NA_MULTIPLE_ALIGNMENT
- at the start of the file.
-</LI><LI>
- - the word MSF on the first line of the file, and the characters ..
- at the end of this line.
-</LI>
-</UL>
-</P>
-<P>
-GCG/RSF format is recognised by the word !!RICH_SEQUENCE at the beginning of
-the file.
-</P>
-<P>
-</P>
-<P>
-If 85% or more of the characters in the sequence are from A,C,G,T,U or N, the
-sequence will be assumed to be nucleotide. This works in 97.3% of cases but
-watch out!
-</P>
-<P>
-APPEND SEQUENCES is only valid in MULTIPLE ALIGNMENT MODE. The input sequences
-do not replace those already loaded, but are appended at the end of the
-alignment.
-</P>
-<P>
-SAVE SEQUENCES AS... offers the user a choice of one of six output formats:
-CLUSTAL, NBRF/PIR, GCG/MSF, PHYLIP, NEXUS or GDE. All sequences are written
-to a single file. Options are available to save a range of the alignment,
-switch between UPPER/LOWER case for GDE files, and to output SEQUENCE NUMBERING
-for CLUSTAL files.
-</P>
-<P>
-LOAD PROFILE 1 reads sequences in the same 7 file formats, replacing any
-sequences already loaded as Profile 1. This option will also remove any
-sequences which are loaded in Profile 2.
-</P>
-<P>
-LOAD PROFILE 2 reads sequences in the same 7 file formats, replacing any
-sequences already loaded as Profile 2.
-</P>
-<P>
-SAVE PROFILE 1 AS... is similar to the Save Sequences option except that only
-those sequences in Profile 1 will be written to the output file.
-</P>
-<P>
-SAVE PROFILE 2 AS... is similar to the Save Sequences option except that only
-those sequences in Profile 2 will be written to the output file.
-</P>
-<P>
-WRITE ALIGNMENT AS POSTSCRIPT will write the sequence display to a postscript
-format file. This will include any secondary structure / gap penalty mask
-information and the consensus and ruler lines which are displayed on the
-screen. The Alignment Quality curve can be optionally included in the output
-file.
-</P>
-<P>
-WRITE PROFILE 1 AS POSTSCRIPT is similar to WRITE ALIGNMENT AS POSTSCRIPT
-except that only the profile 1 display will be printed.
-</P>
-<P>
-WRITE PROFILE 2 AS POSTSCRIPT is similar to WRITE ALIGNMENT AS POSTSCRIPT
-except that only the profile 2 display will be printed.
-</P>
-<P>
-</P>
-<P>
-<H4>
-POSTSCRIPT PARAMETERS
-</H4>
-</P>
-<P>
-A number of options are available to allow you to configure your postscript
-output file.
-</P>
-<P>
-PS COLORS FILE:
-</P>
-<P>
-The exact RGB values required to reproduce the colors used in the alignment
-window will vary from printer to printer. A PS colors file can be specified
-that contains the RGB values for all the colors required by each of your
-postscript printers.
-</P>
-<P>
-By default, Clustal X looks for a file called 'colprint.par' in the current
-directory (if you're running under UNIX, it then looks in your home directory,
-and finally in the directories in your PATH environment variable). If no PS
-colors file is found or a color used on the screen is not defined here, the
-screen RGB values (from the Color Parameter File) are used.
-</P>
-<P>
-The PS colors file consists of one line for each color to be defined, with the
-color name followed by the RGB values (on a scale of 0 to 1). For example,
-</P>
-<P>
-RED 0.9 0.1 0.1
-</P>
-<P>
-Blank lines and comments (lines beginning with a '#' character) are ignored.
-</P>
-<P>
-</P>
-<P>
-PAGE SIZE: The alignment can be displayed on either A4, A3 or US Letter size
-pages.
-</P>
-<P>
-ORIENTATION: The alignment can be displayed on either a landscape or portrait
-page.
-</P>
-<P>
-PRINT HEADER: An optional header including the postscript filename, and
-creation date can be printed at the top of each page.
-</P>
-<P>
-PRINT QUALITY CURVE: The Alignment Quality curve which is displayed underneath
-the alignment on the screen can be included in the postscript output.
-</P>
-<P>
-PRINT RULER: The ruler which is displayed underneath the alignment on the
-screen can be included in the postscript output.
-</P>
-<P>
-PRINT RESIDUE NUMBERS: Sequence residue numbers can be printed at the right
-hand side of the alignment.
-</P>
-<P>
-RESIZE TO FIT PAGE: By default, the alignment is scaled to fit the page size
-selected. This option can be turned off, in which case a font size of 10 will
-be used for the sequences.
-</P>
-<P>
-PRINT FROM POSITION/TO: A range of the alignment can be printed. The default
-is to print the full alignment. The first and last residues to be printed are
-specified here.
-</P>
-<P>
-USE BLOCK LENGTH: The alignment can be divided into blocks of residues. The
-number of residues in a block is specified here. More than one block may then
-be printed on a single page. This is useful for long alignments of a small
-number of sequences. If the block length is set to 0, the alignment will not
-be divided into blocks, but printed across a number of pages.
-</P>
-<P>
-</P>
-<A HREF="#INDEX"> <EM>Back to Index</EM> </A>
-<CENTER><H2><A NAME="E"> Editing Alignments
-</A></H2></CENTER>
-<P>
-</P>
-<P>
-Clustal X allows you to change the order of the sequences in the alignment, by
-cutting-and-pasting the sequence names.
-</P>
-<P>
-To select a group of sequences to be moved, click on a sequence name and drag
-the cursor until all the required sequences are highlighted. Holding down the
-Shift key when clicking on the first name will add new sequences to those
-already selected.
-</P>
-<P>
-(Options are provided to Select All Sequences, Select Profile 1 or Select
-Profile 2.)
-</P>
-<P>
-The selected sequences can be removed from the alignment by using the EDIT
-menu, CUT option.
-</P>
-<P>
-To add the cut sequences back into an alignment, select a sequence by clicking
-on the sequence name. The cut sequences will be added to the alignment,
-immediately following the selected sequence, by the EDIT menu, PASTE option.
-</P>
-<P>
-To add the cut sequences to an empty alignment (e.g. when cutting sequences from
-Profile 1 and pasting them to Profile 2), click on the empty sequence name
-display area, and select the EDIT menu, PASTE option as before.
-</P>
-<P>
-The sequence selection and sequence range selection can be cleared using the
-EDIT menu, CLEAR SEQUENCE SELECTION and CLEAR RANGE SELECTION options
-respectively.
-</P>
-<P>
-To search for a string of residues in the sequences, select the sequences to be
-searched by clicking on the sequence names. You can then enter the string to
-search for by selecting the SEARCH FOR STRING option. If the string is found in
-any of the sequences selected, the sequence name and column number are printed
-below the sequence display.
-</P>
-<P>
-In PROFILE ALIGNMENT MODE, the two profiles can be merged (normally done after
-alignment) by selecting ADD PROFILE 2 TO PROFILE 1. The sequences currently
-displayed as Profile 2 will be appended to Profile 1.
-</P>
-<P>
-The REMOVE ALL GAPS option will remove all gaps from the sequences currently
-selected.
-WARNING: This option removes ALL gaps, not only those introduced by ClustalX,
-but also those that were read from the input alignment file. Any secondary
-structure information associated with the alignment will NOT be automatically
-realigned.
-</P>
-<P>
-The REMOVE GAP-ONLY COLUMNS will remove those positions in the alignment which
-contain gaps in all sequences. This can occur as a result of removing divergent
-sequences from an alignment, or if an alignment has been realigned.
-</P>
-<P>
-</P>
-<A HREF="#INDEX"> <EM>Back to Index</EM> </A>
-<CENTER><H2><A NAME="M"> Multiple Alignments
-</A></H2></CENTER>
-<P>
-</P>
-<P>
-Make sure MULTIPLE ALIGNMENT MODE is selected, using the switch directly above
-the sequence display area. Then, use the ALIGNMENT menu to do multiple
-alignments.
-</P>
-<P>
-Multiple alignments are carried out in 3 stages:
-</P>
-<P>
-1) all sequences are compared to each other (pairwise alignments);
-</P>
-<P>
-2) a dendrogram (like a phylogenetic tree) is constructed, describing the
-approximate groupings of the sequences by similarity (stored in a file).
-</P>
-<P>
-3) the final multiple alignment is carried out, using the dendrogram as a guide.
-</P>
-<P>
-The 3 stages are carried out automatically by the DO COMPLETE ALIGNMENT option.
-You can skip the first stages (pairwise alignments; guide tree) by using an old
-guide tree file (DO ALIGNMENT FROM GUIDE TREE); or you can just produce the
-guide tree with no final multiple alignment (PRODUCE GUIDE TREE ONLY).
-</P>
-<P>
-</P>
-<P>
-REALIGN SELECTED SEQUENCES is used to realign badly aligned sequences in the
-alignment. Sequences can be selected by clicking on the sequence names - see
-Editing Alignments for more details. The unselected sequences are then 'fixed'
-and a profile is made including only the unselected sequences. Each of the
-selected sequences in turn is then realigned to this profile. The realigned
-sequences will be displayed as a group at the end of the alignment.
-</P>
-<P>
-</P>
-<P>
-REALIGN SELECTED SEQUENCE RANGE is used to realign a small region of the
-alignment. A residue range can be selected by clicking on the sequence display
-area. A multiple alignment is then performed, following the 3 stages described
-above, but only using the selected residue range. Finally the new alignment of
-the range is pasted back into the full sequence alignment.
-</P>
-<P>
-By default, gap penalties are used at each end of the subrange in order to
-penalise terminal gaps. If the REALIGN SEGMENT END GAP PENALTIES option is
-switched off, gaps can be introduced at the ends of the residue range at no
-cost.
-</P>
-<P>
-</P>
-<P>
-ALIGNMENT PARAMETERS displays a sub-menu with the following options:
-</P>
-<P>
-RESET NEW GAPS BEFORE ALIGNMENT will remove any new gaps introduced into the
-sequences during multiple alignment if you wish to change the parameters and
-try again. This only takes effect just before you do a second multiple
-alignment. You can make phylogenetic trees after alignment whether or not this
-is ON. If you turn this OFF, the new gaps are kept even if you do a second
-multiple alignment. This allows you to iterate the alignment gradually.
-Sometimes, the alignment is improved by a second or third pass.
-</P>
-<P>
-RESET ALL GAPS BEFORE ALIGNMENT will remove all gaps in the sequences including
-gaps which were read in from the sequence input file. This only takes effect
-just before you do a second multiple alignment. You can make phylogenetic
-trees after alignment whether or not this is ON. If you turn this OFF, all
-gaps are kept even if you do a second multiple alignment. This allows you to
-iterate the alignment gradually. Sometimes, the alignment is improved by a
-second or third pass.
-</P>
-<P>
-</P>
-<P>
-PAIRWISE ALIGNMENT PARAMETERS control the speed/sensitivity of the initial
-alignments.
-</P>
-<P>
-MULTIPLE ALIGNMENT PARAMETERS control the gaps in the final multiple
-alignments.
-</P>
-<P>
-PROTEIN GAP PARAMETERS displays a temporary window which allows you to set
-various parameters only used in the alignment of protein sequences.
-</P>
-<P>
-(SECONDARY STRUCTURE PARAMETERS, for use with the Profile Alignment Mode only,
-allows you to set various parameters only used with gap penalty masks.)
-</P>
-<P>
-SAVE LOG FILE will write the alignment calculation scores to a file. The log
-filename is the same as the input sequence filename, with an extension .log
-appended.
-</P>
-<P>
-</P>
-<P>
-<H4>
-OUTPUT FORMAT OPTIONS
-</H4>
-</P>
-<P>
-You can choose from 6 different alignment formats (CLUSTAL, GCG, NBRF/PIR,
-PHYLIP, GDE and NEXUS). You can choose more than one (or all 6 if you wish).
-</P>
-<P>
-CLUSTAL format output is a self explanatory alignment format. It shows the
-sequences aligned in blocks. It can be read in again at a later date to (for
-example) calculate a phylogenetic tree or add in new sequences by profile
-alignment.
-</P>
-<P>
-GCG output can be used by any of the GCG programs that can work on multiple
-alignments (e.g. PRETTY, PROFILEMAKE, PLOTALIGN). It is the same as the GCG
-.msf format files (multiple sequence file); new in version 7 of GCG.
-</P>
-<P>
-NEXUS format is used by several phylogeny programs, including PAUP and
-MacClade.
-</P>
-<P>
-PHYLIP format output can be used for input to the PHYLIP package of Joe
-Felsenstein. This is a very widely used package for doing every imaginable
-form of phylogenetic analysis (MUCH more than the modest introduction
-offered by this program).
-</P>
-<P>
-NBRF/PIR: this is the same as the standard PIR format with ONE ADDITION. Gap
-characters "-" are used to indicate the positions of gaps in the multiple
-alignment. These files can be re-used as input in any part of clustal that
-allows sequences (or alignments or profiles) to be read in.
-</P>
-<P>
-GDE: this format is used by the GDE package of Steven Smith and is understood
-by SEQLAB in GCG 9 or later.
-</P>
-<P>
-GDE OUTPUT CASE: sequences in GDE format may be written in either upper or
-lower case.
-</P>
-<P>
-CLUSTALW SEQUENCE NUMBERS: residue numbers may be added to the end of the
-alignment lines in clustalw format.
-</P>
-<P>
-OUTPUT ORDER is used to control the order of the sequences in the output
-alignments. By default, it uses the order in which the sequences were aligned
-(from the guide tree/dendrogram), thus automatically grouping closely related
-sequences. It can be switched to be the same as the original input order.
-</P>
-<P>
-PARAMETER OUTPUT: This option will save all your parameter settings in a
-parameter file (suffix .par) during alignment. The file can be subsequently
-used to rerun ClustalW using the same parameters.
-</P>
-<P>
-</P>
-<P>
-<H3>
-ALIGNMENT PARAMETERS
-</H3>
-</P>
-<P>
-<STRONG>
-PAIRWISE ALIGNMENT PARAMETERS
-</STRONG>
-</P>
-<P>
-A distance is calculated between every pair of sequences and these are used to
-construct the phylogenetic tree which guides the final multiple alignment. The
-scores are calculated from separate pairwise alignments. These can be
-calculated using 2 methods: dynamic programming (slow but accurate) or by the
-method of Wilbur and Lipman (extremely fast but approximate).
-</P>
-<P>
-You can choose between the 2 alignment methods using the PAIRWISE ALIGNMENTS
-option. The slow/accurate method is fast enough for short sequences but will be
-VERY SLOW for many (e.g. >100) long (e.g. >1000 residue) sequences.
-</P>
-<P>
-</P>
-<P>
-<STRONG>
-SLOW-ACCURATE alignment parameters:
-</STRONG>
-</P>
-<P>
-These parameters do not have any effect on the speed of the alignments. They
-are used to give initial alignments which are then rescored to give percent
-identity scores. These % scores are the ones which are displayed on the
-screen. The scores are converted to distances for the trees.
-</P>
-<P>
-Gap Open Penalty: the penalty for opening a gap in the alignment.
-</P>
-<P>
-Gap Extension Penalty: the penalty for extending a gap by 1 residue.
-</P>
-<P>
-Protein Weight Matrix: the scoring table which describes the similarity of
-each amino acid to each other.
-</P>
-<P>
-Load protein matrix: allows you to read in a comparison table from a file.
-</P>
-<P>
-DNA weight matrix: the scores assigned to matches and mismatches (including
-IUB ambiguity codes).
-</P>
-<P>
-Load DNA matrix: allows you to read in a comparison table from a file.
-</P>
-<P>
-See the Multiple alignment parameters, MATRIX option below for details of the
-matrix input format.
-</P>
-<P>
-</P>
-<P>
-<STRONG>
-FAST-APPROXIMATE alignment parameters:
-</STRONG>
-</P>
-<P>
-These similarity scores are calculated from fast, approximate, global
-alignments, which are controlled by 4 parameters. 2 techniques are used to make
-these alignments very fast: 1) only exactly matching fragments (k-tuples) are
-considered; 2) only the 'best' diagonals (the ones with most k-tuple matches)
-are used.
-</P>
-<P>
-GAP PENALTY: This is a penalty for each gap in the fast alignments. It has
-little effect on the speed or sensitivity except for extreme values.
-</P>
-<P>
-K-TUPLE SIZE: This is the size of exactly matching fragment that is used.
-INCREASE for speed (max= 2 for proteins; 4 for DNA), DECREASE for sensitivity.
-For longer sequences (e.g. >1000 residues) you may wish to increase the
-default.
-</P>
-<P>
-TOP DIAGONALS: The number of k-tuple matches on each diagonal (in an imaginary
-dot-matrix plot) is calculated. Only the best ones (with most matches) are used
-in the alignment. This parameter specifies how many. Decrease for speed;
-increase for sensitivity.
-</P>
-<P>
-WINDOW SIZE: This is the number of diagonals around each of the 'best'
-diagonals that will be used. Decrease for speed; increase for sensitivity.
-</P>
-<P>
-</P>
-<P>
-<STRONG>
-MULTIPLE ALIGNMENT PARAMETERS
-</STRONG>
-</P>
-<P>
-These parameters control the final multiple alignment. This is the core of the
-program and the details are complicated. To fully understand the use of the
-parameters and the scoring system, you will have to refer to the documentation.
-</P>
-<P>
-Each step in the final multiple alignment consists of aligning two alignments
-or sequences. This is done progressively, following the branching order in the
-GUIDE TREE. The basic parameters to control this are two gap penalties and the
-scores for various identical/non-identical residues.
-</P>
-<P>
-The GAP OPENING and EXTENSION PENALTIES can be set here. These control the
-cost of opening up every new gap and the cost of every item in a gap.
-Increasing the gap opening penalty will make gaps less frequent. Increasing
-the gap extension penalty will make gaps shorter. Terminal gaps are not
-penalised.
-</P>
-<P>
-The DELAY DIVERGENT SEQUENCES switch delays the alignment of the most distantly
-related sequences until after the most closely related sequences have been
-aligned. The setting shows the percent identity level required to delay the
-addition of a sequence; sequences that are less identical than this level to
-any other sequences will be aligned later.
-</P>
-<P>
-The TRANSITION WEIGHT gives transitions (A<-->G or C<-->T i.e. purine-purine or
-pyrimidine-pyrimidine substitutions) a weight between 0 and 1; a weight of zero
-means that the transitions are scored as mismatches, while a weight of 1 gives
-the transitions the match score. For distantly related DNA sequences, the
-weight should be near to zero; for closely related sequences it can be useful
-to assign a higher score. The default is set to 0.5.
-</P>
-<P>
-</P>
-<P>
-The PROTEIN WEIGHT MATRIX option allows you to choose a series of weight
-matrices. For protein alignments, you use a weight matrix to determine the
-similarity of non-identical amino acids. For example, Tyr aligned with Phe is
-usually judged to be 'better' than Tyr aligned with Pro.
-</P>
-<P>
-There are three 'in-built' series of weight matrices offered. Each consists of
-several matrices which work differently at different evolutionary distances. To
-see the exact details, read the documentation. Crudely, we store several
-matrices in memory, spanning the full range of amino acid distance (from almost
-identical sequences to highly divergent ones). For very similar sequences, it
-is best to use a strict weight matrix which only gives a high score to
-identities and the most favoured conservative substitutions. For more divergent
-sequences, it is appropriate to use "softer" matrices which give a high score
-to many other frequent substitutions.
-</P>
-<P>
-1) BLOSUM (Henikoff). These matrices appear to be the best available for
-carrying out data base similarity (homology searches). The matrices currently
-used are: Blosum 80, 62, 45 and 30. BLOSUM was the default in earlier Clustal X
-versions.
-</P>
-<P>
-2) PAM (Dayhoff). These have been extremely widely used since the late '70s. We
-currently use the PAM 20, 60, 120, 350 matrices.
-</P>
-<P>
-3) GONNET. These matrices were derived using almost the same procedure as the
-Dayhoff one (above) but are much more up to date and are based on a far larger
-data set. They appear to be more sensitive than the Dayhoff series. We
-currently use the GONNET 80, 120, 160, 250 and 350 matrices. This series is the
-default for Clustal X version 1.8.
-</P>
-<P>
-We also supply an identity matrix which gives a score of 10 to two identical
-amino acids and a score of zero otherwise. This matrix is not very useful.
-</P>
-<P>
-Load protein matrix: allows you to read in a comparison matrix from a file.
-This can be either a single matrix or a series of matrices (see below for
-format).
-</P>
-<P>
-</P>
-<P>
-DNA WEIGHT MATRIX option allows you to select a single matrix (not a series)
-used for aligning nucleic acid sequences. Two hard-coded matrices are available:
-</P>
-<P>
-1) IUB. This is the default scoring matrix used by BESTFIT for the comparison
-of nucleic acid sequences. X's and N's are treated as matches to any IUB
-ambiguity symbol. All matches score 1.9; all mismatches for IUB symbols score 0.
-</P>
-<P>
-2) CLUSTALW(1.6). A previous system used by ClustalW, in which matches score
-1.0 and mismatches score 0. All matches for IUB symbols also score 0.
-</P>
-<P>
-Load DNA matrix: allows you to read in a nucleic acid comparison matrix from a
-file (just one matrix, not a series).
-</P>
-<P>
-</P>
-<P>
-SINGLE MATRIX INPUT FORMAT
-The format used for a single matrix is the same as the BLAST program. The
-scores in the new weight matrix should be similarities. You can use negative as
-well as positive values if you wish, although the matrix will be automatically
-adjusted to all positive scores, unless the NEGATIVE MATRIX option is selected.
-Any lines beginning with a # character are assumed to be comments. The first
-non-comment line should contain a list of amino acids in any order, using the 1
-letter code, followed by a * character. This should be followed by a square
-matrix of scores, with one row and one column for each amino acid. The last row
-and column of the matrix (corresponding to the * character) contain the minimum
-score over the whole matrix.
-</P>
-<P>
-MATRIX SERIES INPUT FORMAT
-ClustalX uses different matrices depending on the mean percent identity of the
-sequences to be aligned. You can specify a series of matrices and the range of
-the percent identity for each matrix in a matrix series file. The file is
-automatically recognised by the word CLUSTAL_SERIES at the beginning of the
-file. Each matrix in the series is then specified on one line which should
-start with the word MATRIX. This is followed by the lower and upper limits of
-the sequence percent identities for which you want to apply the matrix. The
-final entry on the matrix line is the filename of a Blast format matrix file
-(see above for details of the single matrix file format).
-</P>
-<P>
-Example.
-</P>
-<P>
-CLUSTAL_SERIES
-</P>
-<P>
-MATRIX 81 100 /us1/user/julie/matrices/blosum80
-MATRIX 61 80 /us1/user/julie/matrices/blosum62
-MATRIX 31 60 /us1/user/julie/matrices/blosum45
-MATRIX 0 30 /us1/user/julie/matrices/blosum30
-</P>
-<P>
-</P>
-<P>
-<STRONG>
-PROTEIN GAP PARAMETERS
-</STRONG>
-</P>
-<P>
-RESIDUE SPECIFIC PENALTIES are amino acid specific gap penalties that reduce or
-increase the gap opening penalties at each position in the alignment or
-sequence. See the documentation for details. As an example, positions that are
-rich in glycine are more likely to have an adjacent gap than positions that are
-rich in valine.
-</P>
-<P>
-HYDROPHILIC GAP PENALTIES are used to increase the chances of a gap within a
-run (5 or more residues) of hydrophilic amino acids; these are likely to be
-loop or random coil regions where gaps are more common. The residues that are
-"considered" to be hydrophilic can be entered in HYDROPHILIC RESIDUES.
-</P>
-<P>
-GAP SEPARATION DISTANCE tries to decrease the chances of gaps being too close
-to each other. Gaps that are less than this distance apart are penalised more
-than other gaps. This does not prevent close gaps; it makes them less frequent,
-promoting a block-like appearance of the alignment.
-</P>
-<P>
-END GAP SEPARATION treats end gaps just like internal gaps for the purposes of
-avoiding gaps that are too close (set by GAP SEPARATION DISTANCE above). If you
-turn this off, end gaps will be ignored for this purpose. This is useful when
-you wish to align fragments where the end gaps are not biologically meaningful.
-</P>
-<P>
-</P>
-<P>
-</P>
-<A HREF="#INDEX"> <EM>Back to Index</EM> </A>
-<CENTER><H2><A NAME="P"> Profile and Structure Alignments
-</A></H2></CENTER>
-<P>
-</P>
-<P>
-By PROFILE ALIGNMENT, we mean alignment using existing alignments. Profile
-alignments allow you to store alignments of your favourite sequences and add
-new sequences to them in small bunches at a time. A profile is simply an
-alignment of one or more sequences (e.g. an alignment output file from Clustal
-X). Each input can be a single sequence. One or both sets of input sequences
-may include secondary structure assignments or gap penalty masks to guide the
-alignment.
-</P>
-<P>
-Make sure PROFILE ALIGNMENT MODE is selected, using the switch directly above
-the sequence display area. Then, use the ALIGNMENT menu to do profile and
-secondary structure alignments.
-</P>
-<P>
-The profiles can be in any of the allowed input formats with "-" characters
-used to specify gaps (except for GCG/MSF where "." is used).
-</P>
-<P>
-You have to load the 2 profiles by choosing FILE, LOAD PROFILE 1 and LOAD
-PROFILE 2. Then ALIGNMENT, ALIGN PROFILE 2 TO PROFILE 1 will align the 2
-profiles to each other. Secondary structure masks in either profile can be used
-to guide the alignment. This option compares all the sequences in profile 1
-with all the sequences in profile 2 in order to build guide trees which will be
-used to calculate sequence weights, and select appropriate alignment parameters
-for the final profile alignment.
-</P>
-<P>
-You can skip the first stage (pairwise alignments; guide trees) by using old
-guide tree files (ALIGN PROFILES FROM GUIDE TREES).
-</P>
-<P>
-The ALIGN SEQUENCES TO PROFILE 1 option will take the sequences in the second
-profile and align them to the first profile, 1 at a time. This is useful to
-add some new sequences to an existing alignment, or to align a set of sequences
-to a known structure. In this case, the second profile set need not be
-pre-aligned.
-</P>
-<P>
-You can skip the first stage (pairwise alignments; guide tree) by using an old
-guide tree file (ALIGN SEQUENCES TO PROFILE 1 FROM TREE).
-</P>
-<P>
-SAVE LOG FILE will write the alignment calculation scores to a file. The log
-filename is the same as the input sequence filename, with an extension .log
-appended.
-</P>
-<P>
-The alignment parameters can be set using the ALIGNMENT PARAMETERS menu,
-Pairwise Parameters, Multiple Parameters and Protein Gap Parameters options.
-These are EXACTLY the same parameters as used by the general, automatic
-multiple alignment procedure. The general multiple alignment procedure is
-simply a series of profile alignments. Carrying out a series of profile
-alignments on larger and larger groups of sequences, allows you to manually
-build up a complete alignment, if necessary editing intermediate alignments.
-</P>
-<P>
-<STRONG>
-SECONDARY STRUCTURE PARAMETERS
-</STRONG>
-</P>
-<P>
-Use this menu to set secondary structure options. If a solved structure is
-known, it can be used to guide the alignment by raising gap penalties within
-secondary structure elements, so that gaps will preferentially be inserted into
-unstructured surface loop regions. Alternatively, a user-specified gap penalty
-mask can be supplied for a similar purpose.
-</P>
-<P>
-A gap penalty mask is a series of numbers between 1 and 9, one per position in
-the alignment. Each number specifies how much the gap opening penalty is to be
-raised at that position (raised by multiplying the basic gap opening penalty
-by the number) i.e. a mask figure of 1 at a position means no change
-in gap opening penalty; a figure of 4 means that the gap opening penalty is
-four times greater at that position, making gaps 4 times harder to open.
-</P>
-<P>
-The format for gap penalty masks and secondary structure masks is explained in
-a separate help section.
-</P>
-<P>
-</P>
-<A HREF="#INDEX"> <EM>Back to Index</EM> </A>
-<CENTER><H2><A NAME="B"> Secondary Structure / Gap Penalty Masks
-</A></H2></CENTER>
-<P>
-</P>
-<P>
-The use of secondary structure-based penalties has been shown to improve the
-accuracy of sequence alignment. Clustal X now allows secondary structure/ gap
-penalty masks to be supplied with the input sequences used during profile
-alignment. (NB. The secondary structure information is NOT used during multiple
-sequence alignment). The masks work by raising gap penalties in specified
-regions (typically secondary structure elements) so that gaps are
-preferentially opened in the less well conserved regions (typically surface
-loops).
-</P>
-<P>
-The USE PROFILE 1(2) SECONDARY STRUCTURE / GAP PENALTY MASK options control
-whether the input 2D-structure information or gap penalty masks will be used
-during the profile alignment.
-</P>
-<P>
-The OUTPUT options control whether the secondary structure and gap penalty
-masks should be included in the Clustal X output alignments. Showing both is
-useful for understanding how the masks work. The 2D-structure information is
-itself useful in judging the alignment quality and in seeing how residue
-conservation patterns vary with secondary structure.
-</P>
-<P>
-The HELIX and STRAND GAP PENALTY options provide the value for raising the gap
-penalty at core Alpha Helical (A) and Beta Strand (B) residues. In CLUSTAL
-format, capital residues denote the A and B core structure notation. Basic gap
-penalties are multiplied by the amount specified.
-</P>
-<P>
-The LOOP GAP PENALTY option provides the value for the gap penalty in Loops.
-By default this penalty is not raised. In CLUSTAL format, loops are specified
-by "." in the secondary structure notation.
-</P>
-<P>
-The SECONDARY STRUCTURE TERMINAL PENALTY provides the value for setting the gap
-penalty at the ends of secondary structures. Ends of secondary structures are
-known to grow or shrink, comparing related structures. Therefore by default
-these are given intermediate values, lower than the core penalties. All
-secondary structure read in as lower case in CLUSTAL format gets the reduced
-terminal penalty.
-</P>
-<P>
-The HELIX and STRAND TERMINAL POSITIONS options specify the range of structure
-termini for the intermediate penalties. In the alignment output, these are
-indicated as lower case. For Alpha Helices, by default, the range spans the
-end-helical turn (3 residues). For Beta Strands, the default range spans the
-end residue and the adjacent loop residue, since sequence conservation often
-extends beyond the actual H-bonded Beta Strand.
-</P>
-<P>
-Clustal X can read the masks from SWISS-PROT, CLUSTAL or GDE format input
-files. For many 3-D protein structures, secondary structure information is
-recorded in the feature tables of SWISS-PROT database entries. You should
-always check that the assignments are correct - some are quite inaccurate.
-Clustal X looks for SWISS-PROT HELIX and STRAND assignments e.g.
-</P>
-<P>
-</P>
-<P>
-<PRE>
-FT HELIX 100 115
-FT STRAND 118 119
-</PRE>
-</P>
-<P>
-The structure and penalty masks can also be read from CLUSTAL alignment format
-as comment lines beginning "!SS_" or "!GM_" e.g.
-</P>
-<P>
-<PRE>
-!SS_HBA_HUMA ..aaaAAAAAAAAAAaaa.aaaAAAAAAAAAAaaaaaaAaaa.........aaaAAAAAA
-!GM_HBA_HUMA 112224444444444222122244444444442222224222111111111222444444
-HBA_HUMA VLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHGK
-</PRE>
-</P>
-<P>
-Note that the mask itself is a set of numbers between 1 and 9 each of which is
-assigned to the residue(s) in the same column below.
-</P>
-<P>
-In GDE flat file format, the masks are specified as text and the names must
-begin with "SS_ or "GM_.
-</P>
-<P>
-Either a structure or penalty mask or both may be used. If both are included
-in an alignment, the user will be asked which is to be used.
-</P>
-<P>
-</P>
-<P>
-</P>
-<A HREF="#INDEX"> <EM>Back to Index</EM> </A>
-<CENTER><H2><A NAME="T"> Phylogenetic Trees
-</A></H2></CENTER>
-<P>
-</P>
-<P>
-Before calculating a tree, you must have an ALIGNMENT in memory. This can be
-input using the FILE menu, LOAD SEQUENCES option or you should have just
-carried out a full multiple alignment and the alignment is still in memory.
-Remember YOU MUST ALIGN THE SEQUENCES FIRST!!!!
-</P>
-<P>
-The method used is the NJ (Neighbour Joining) method of Saitou and Nei. First
-you calculate distances (percent divergence) between all pairs of sequences from
-a multiple alignment; second you apply the NJ method to the distance matrix.
-</P>
-<P>
-To calculate a tree, use the DRAW N-J TREE option. This gives an UNROOTED tree
-and all branch lengths. The root of the tree can only be inferred by using an
-outgroup (a sequence that you are certain branches at the outside of the tree
-.... certain on biological grounds) OR if you assume a degree of constancy in
-the 'molecular clock', you can place the root in the 'middle' of the tree
-(roughly equidistant from all tips).
-</P>
-<P>
-BOOTSTRAP N-J TREE uses a method for deriving confidence values for the
-groupings in a tree (first adapted for trees by Joe Felsenstein). It involves
-making N random samples of sites from the alignment (N should be LARGE, e.g.
-500 - 1000); drawing N trees (1 from each sample) and counting how many times
-each grouping from the original tree occurs in the sample trees. You can set N
-using the NUMBER OF BOOTSTRAP TRIALS option in the BOOTSTRAP TREE window. In
-practice, you should use a large number of bootstrap replicates (1000 is
-recommended, even if it means running the program for an hour on a slow
-computer). You can also supply a seed number for the random number generator
-here. Different runs with the same seed will give the same answer. See the
-documentation for more details.
-</P>
-<P>
-EXCLUDE POSITIONS WITH GAPS? With this option, any alignment positions where
-ANY of the sequences have a gap will be ignored. This means that 'like' will
-be compared to 'like' in all distances, which is highly desirable. It also
-automatically throws away the most ambiguous parts of the alignment, which are
-concentrated around gaps (usually). The disadvantage is that you may throw away
-much of the data if there are many gaps (which is why it is difficult for us to
-make it the default).
-</P>
-<P>
-CORRECT FOR MULTIPLE SUBSTITUTIONS? For small divergence (say <10%) this option
-makes no difference. For greater divergence, this option corrects for the fact
-that observed distances underestimate actual evolutionary distances. This is
-because, as sequences diverge, more than one substitution will happen at many
-sites. However, you only see one difference when you look at the present day
-sequences. Therefore, this option has the effect of stretching branch lengths
-in trees (especially long branches). The corrections used here (for DNA or
-proteins) are both due to Motoo Kimura. See the documentation for details.
-</P>
-<P>
-Where possible, this option should be used. However, for VERY divergent
-sequences, the distances cannot be reliably corrected. You will be warned if
-this happens. Even if none of the distances in a data set exceed the reliable
-threshold, if you bootstrap the data, some of the bootstrap distances may
-randomly exceed the safe limit.
-</P>
-<P>
-SAVE LOG FILE will write the tree calculation scores to a file. The log
-filename is the same as the input sequence filename, with an extension .log
-appended.
-</P>
-<P>
-<H4>
-OUTPUT FORMAT OPTIONS
-</H4>
-</P>
-<P>
-Four different formats are allowed. None of these displays the tree visually.
-You can display the tree using the NJPLOT program distributed with Clustal X
-OR get the PHYLIP package and use the tree drawing facilities there.
-</P>
-<P>
-1) CLUSTAL FORMAT TREE. This format is verbose and lists all of the distances
-between the sequences and the number of alignment positions used for each. The
-tree is described at the end of the file. It lists the sequences that are
-joined at each alignment step and the branch lengths. After two sequences are
-joined, it is referred to later as a NODE. The number of a NODE is the number
-of the lowest sequence in that NODE.
-</P>
-<P>
-2) PHYLIP FORMAT TREE. This format is the New Hampshire format, used by many
-phylogenetic analysis packages. It consists of a series of nested parentheses,
-describing the branching order, with the sequence names and branch lengths. It
-can be read by the NJPLOT program distributed with ClustalX. It can also be
-used by the RETREE, DRAWGRAM and DRAWTREE programs of the PHYLIP package to see
-the trees graphically. This is the same format used during multiple alignment
-for the guide trees. Some other packages that can read and display New
-Hampshire format are TreeTool, TreeView, and Phylowin.
-</P>
-<P>
-3) PHYLIP DISTANCE MATRIX. This format just outputs a matrix of all the
-pairwise distances in a format that can be used by the PHYLIP package. It used
-to be useful when one could not produce distances from protein sequences in the
-Phylip package but is now redundant (PROTDIST of Phylip 3.5 now does this).
-</P>
-<P>
-4) NEXUS FORMAT TREE. This format is used by several popular phylogeny programs,
-including PAUP and MacClade. The format is described fully in:
-Maddison, D. R., D. L. Swofford and W. P. Maddison. 1997.
-NEXUS: an extensible file format for systematic information.
-Systematic Biology 46:590-621.
-</P>
-<P>
-BOOTSTRAP LABELS ON: By default, the bootstrap values are correctly placed on
-the tree branches of the phylip format output tree. The toggle allows them to
-be placed on the nodes, which is incorrect, but some display packages (e.g.
-TreeTool, TreeView and Phylowin) only support node labelling but not branch
-labelling. Care should be taken to note which branches and labels go together.
-</P>
-<P>
-</P>
-<P>
-</P>
-<A HREF="#INDEX"> <EM>Back to Index</EM> </A>
-<CENTER><H2><A NAME="C"> Colors
-</A></H2></CENTER>
-<P>
-</P>
-<P>
-Clustal X provides a versatile coloring scheme for the sequence alignment
-display. The sequences (or profiles) are colored automatically, when they are
-loaded. Sequences can be colored either by assigning a color to specific
-residues, or on the basis of an alignment consensus. In the latter case, the
-alignment consensus is calculated automatically, and the residues in each
-column are colored according to the consensus character assigned to that
-column. In this way, you can choose to highlight, for example, conserved
-hydrophilic or hydrophobic positions in the alignment.
-</P>
-<P>
-The 'rules' used to color the alignment are specified in a COLOR PARAMETER
-FILE. Clustal X automatically looks for a file called 'colprot.par' for protein
-sequences or 'coldna.par' for DNA, in the current directory. (If you are running
-under UNIX, it then looks in your home directory, and finally in the
-directories in your PATH environment variable).
-</P>
-<P>
-By default, if no color parameter file is found, protein sequences are colored
-by residue as follows:
-</P>
-<P>
-<PRE>
- Color Residue Code
-</P>
-<P>
- ORANGE GPST
- RED HKR
- BLUE FWY
- GREEN ILMV
-</PRE>
-</P>
-<P>
-In the case of DNA sequences, the default colors are as follows:
-</P>
-<P>
-<PRE>
- Color Residue Code
-</P>
-<P>
- ORANGE A
- RED C
- BLUE T
- GREEN G
-</PRE>
-</P>
-<P>
-</P>
-<P>
-The default BACKGROUND COLORING option shows the sequence residues using a
-black character on a colored background. It can be switched off to show
-residues as a colored character on a white background.
-</P>
-<P>
-Either BLACK AND WHITE or DEFAULT COLOR options can be selected. The Color
-option looks first for the color parameter file (as described above) and, if no
-file is found, uses the default residue-specific colors.
-</P>
-<P>
-You can specify your own coloring scheme by using the LOAD COLOR PARAMETER FILE
-option. The format of the color parameter file is described below.
-</P>
-<P>
-<H4>
-COLOR PARAMETER FILE
-</H4>
-</P>
-<P>
-This file is divided into 3 sections:
-</P>
-<P>
-1) the names and rgb values of the colors
-2) the rules for calculating the consensus
-3) the rules for assigning colors to the residues
-</P>
-<P>
-An example file is given here.
-</P>
-<P>
-<PRE>
- --------------------------------------------------------------------
-@rgbindex
-RED 0.9 0.1 0.1
-BLUE 0.1 0.1 0.9
-GREEN 0.1 0.9 0.1
-YELLOW 0.9 0.9 0.0
-</P>
-<P>
-@consensus
-% = 60% w:l:v:i:m:a:f:c:y:h:p
-# = 80% w:l:v:i:m:a:f:c:y:h:p
-- = 50% e:d
-+ = 60% k:r
-q = 50% q:e
-p = 50% p
-n = 50% n
-t = 50% t:s
-</P>
-<P>
-@color
-g = RED
-p = YELLOW
-t = GREEN if t:%:#
-n = GREEN if n
-w = BLUE if %:#:p
-k = RED if +
- --------------------------------------------------------------------
-</PRE>
-</P>
-<P>
-The first section is optional and is identified by the header @rgbindex. If
-this section exists, each color used in the file must be named and the rgb
-values specified (on a scale from 0 to 1). If the rgb index section is not
-found, the following set of hard-coded colors will be used.
-</P>
-<P>
-<PRE>
-RED 0.9 0.1 0.1
-BLUE 0.1 0.1 0.9
-GREEN 0.1 0.9 0.1
-ORANGE 0.9 0.7 0.3
-CYAN 0.1 0.9 0.9
-PINK 0.9 0.5 0.5
-MAGENTA 0.9 0.1 0.9
-YELLOW 0.9 0.9 0.0
-</PRE>
-</P>
-<P>
-The second section is optional and is identified by the header @consensus. It
-defines how the consensus is calculated.
-</P>
-<P>
-The format of each consensus parameter is:-
-</P>
-<P>
-<PRE>
-c = n% residue_list
-</P>
-<P>
- where
- c is a character used to identify the parameter.
- n is an integer value used as the percentage cutoff
- point.
- residue_list is a list of residues denoted by a single
- character, delimited by a colon (:).
-</PRE>
-</P>
-<P>
-For example: # = 60% w:l:v:i
-</P>
-<P>
-will assign a consensus character # to any column in the alignment which
-contains more than 60% of the residues w,l,v and i.
-</P>
-<P>
-</P>
-<P>
-The third section is identified by the header @color, and defines how colors
-are assigned to each residue in the alignment.
-</P>
-<P>
-The color parameters can take one of two formats:
-</P>
-<P>
-<PRE>
-1) r = color
-2) r = color if consensus_list
-</P>
-<P>
- where
- r is a character used to denote a residue.
- color is one of the colors in the GDE color lookup table.
- consensus_list is a list of consensus characters, each denoted by a single
- character, delimited by a colon (:).
-</PRE>
-</P>
-<P>
-Examples:
-1) g = ORANGE
-</P>
-<P>
-will color all glycines ORANGE, regardless of the consensus.
-</P>
-<P>
-2) w = BLUE if w:%:#
-</P>
-<P>
-will color BLUE any tryptophan which is found in a column with a consensus of
-w, % or #.
-</P>
-<P>
-</P>
-<P>
-</P>
-<A HREF="#INDEX"> <EM>Back to Index</EM> </A>
-<CENTER><H2><A NAME="Q"> Alignment Quality Analysis
-</A></H2></CENTER>
-<P>
-</P>
-<P>
-<H3>
-QUALITY SCORES
-</H3>
-</P>
-<P>
-Clustal X provides an indication of the quality of an alignment by plotting
-a 'conservation score' for each column of the alignment. A high score indicates
-a well-conserved column; a low score indicates low conservation. The quality
-curve is drawn below the alignment.
-</P>
-<P>
-Two methods are also provided to indicate single residues or sequence segments
-which score badly in the alignment.
-</P>
-<P>
-Low-scoring residues are expected to occur at a moderate frequency in all the
-sequences because of their steady divergence due to the natural processes of
-evolution. The most divergent sequences are likely to have the most outliers.
-However, the highlighted residues are especially useful in pointing to
-sequence misalignments. Note that clustering of highlighted residues is a
-strong indication of misalignment. This can arise due to various reasons, for
-example:
-</P>
-<P>
- 1. Partial or total misalignments caused by a failure in the
- alignment algorithm. Usually only in difficult alignment cases.
-</P>
-<P>
- 2. Partial or total misalignments because at least one of the
- sequences in the given set is partly or completely unrelated to the
- other sequences. It is up to the user to check that the set of
- sequences are alignable.
-</P>
-<P>
- 3. Frameshift translation errors in a protein sequence causing local
- mismatched regions to be heavily highlighted. These are surprisingly
- common in database entries. If suspected, a 3-frame translation of
- the source DNA needs to be examined.
-</P>
-<P>
-Occasionally, highlighted residues may point to regions of some biological
-significance. This might happen for example if a protein alignment contains a
-sequence which has acquired new functions relative to the main sequence set. It
-is important to exclude other explanations, such as error or the natural
-divergence of sequences, before invoking a biological explanation.
-</P>
-<P>
-</P>
-<P>
-<H3>
-LOW-SCORING SEGMENTS
-</H3>
-</P>
-<P>
-Unreliable regions in the alignment can be highlighted using the Low-Scoring
-Segments option. A sequence-weighted profile is used to indicate any segments
-in the sequences which score badly. Because the profile calculation may take
-some time, an option is provided to calculate LOW-SCORING SEGMENTS. The
-segment display can then be toggled on or off without having to repeat the
-time-consuming calculations.
-</P>
-<P>
-For details of the low-scoring segment calculation, see the CALCULATION section
-below.
-</P>
-<P>
-</P>
-<P>
-<H4>
-LOW-SCORING SEGMENT PARAMETERS
-</H4>
-</P>
-<P>
-MINIMUM LENGTH OF SEGMENTS: short segments (or even single residues) can be
-hidden by increasing the minimum length of segments which will be displayed.
-</P>
-<P>
-DNA MARKING SCALE is used to remove less significant segments from the
-highlighted display. Increase the scale to display more segments; decrease the
-scale to remove the least significant.
-</P>
-<P>
-</P>
-<P>
-PROTEIN WEIGHT MATRIX: the scoring table which describes the similarity of each
-amino acid to each other. The matrix is used to calculate the sequence-
-weighted profile scores. There are four 'in-built' Log-Odds matrices offered:
-the Gonnet PAM 80, 120, 250, 350 matrices. A more stringent matrix which only
-gives a high score to identities and the most favoured conservative
-substitutions, may be more suitable when the sequences are closely related. For
-more divergent sequences, it is appropriate to use "softer" matrices which give
-a high score to many other frequent substitutions. This option automatically
-recalculates the low-scoring segments.
-</P>
-<P>
-</P>
-<P>
-DNA WEIGHT MATRIX: Two hard-coded matrices are available:
-</P>
-<P>
-1) IUB. This is the default scoring matrix used by BESTFIT for the comparison
-of nucleic acid sequences. X's and N's are treated as matches to any IUB
-ambiguity symbol. All matches score 1.0; all mismatches for IUB symbols score
-0.9.
-</P>
-<P>
-2) CLUSTALW(1.6). The previous system used by ClustalW, in which matches score
-1.0 and mismatches score 0. All matches for IUB symbols also score 0.
-</P>
-<P>
-A new matrix can be read from a file on disk, if the filename consists only
-of lower case characters. The values in the new weight matrix should be
-similarities and should be NEGATIVE for infrequent substitutions.
-</P>
-<P>
-INPUT FORMAT. The format used for a new matrix is the same as the BLAST
-program. Any lines beginning with a # character are assumed to be comments. The
-first non-comment line should contain a list of amino acids in any order, using
-the 1 letter code, followed by a * character. This should be followed by a
-square matrix of scores, with one row and one column for each amino acid. The
-last row and column of the matrix (corresponding to the * character) contain
-the minimum score over the whole matrix.
-</P>
-<P>
-<H4>
-QUALITY SCORE PARAMETERS
-</H4>
-</P>
-<P>
-You can customise the column 'quality scores' plotted underneath the alignment
-display using the following options.
-</P>
-<P>
-SCORE PLOT SCALE: this is a scalar value from 1 to 10, which can be used to
-change the scale of the quality score plot.
-</P>
-<P>
-RESIDUE EXCEPTION CUTOFF: this is a scalar value from 1 to 10, which can be
-used to change the number of residue exceptions which are highlighted in the
-alignment display. (For an explanation of this cutoff, see the CALCULATION OF
-RESIDUE EXCEPTIONS section below.)
-</P>
-<P>
-PROTEIN WEIGHT MATRIX: the scoring table which describes the similarity of
-each amino acid to each other.
-</P>
-<P>
-DNA WEIGHT MATRIX: two hard-coded matrices are available: IUB and CLUSTALW(1.6).
-</P>
-<P>
-For more information about the weight matrices, see the help above for
-the Low-scoring Segments Weight Matrix.
-</P>
-<P>
-For details of the quality score calculations, see the CALCULATION section
-below.
-</P>
-<P>
-</P>
-<P>
-<STRONG>
-SHOW LOW-SCORING SEGMENTS
-</STRONG>
-</P>
-<P>
-The low-scoring segment display can be toggled on or off. This option does not
-recalculate the profile scores.
-</P>
-<P>
-</P>
-<P>
-<STRONG>
-SHOW EXCEPTIONAL RESIDUES
-</STRONG>
-</P>
-<P>
-This option highlights individual residues which score badly in the alignment
-quality calculations. Residues which score exceptionally low are highlighted by
-using a white character on a grey background.
-</P>
-<P>
-<STRONG>
-SAVE QUALITY SCORES TO FILE
-</STRONG>
-</P>
-<P>
-The quality scores that are plotted underneath the alignment display can also
-be saved in a text file. Each column in the alignment is written on one line in
-the output file, with the value of the quality score at the end of the line.
-Only the sequences currently selected in the display are written to the file.
-One use for quality scores is to color residues in a protein structure by
-sequence conservation. In this way conserved surface residues can be
-highlighted to locate functional regions such as ligand-binding sites.
-</P>
-<P>
-</P>
-<P>
-<H3>
-CALCULATION OF QUALITY SCORES
-</H3>
-</P>
-<P>
-Suppose we have an alignment of m sequences of length n. Then, the alignment
-can be written as:
-</P>
-<P>
-<PRE>
- A11 A12 A13 .......... A1n
- A21 A22 A23 .......... A2n
- .
- .
- Am1 Am2 Am3 .......... Amn
-</PRE>
-</P>
-<P>
-We also have a residue comparison matrix of size R where C(i,j) is the score
-for aligning residue i with residue j.
-</P>
-<P>
-We want to calculate a score for the conservation of the jth position in the
-alignment.
-</P>
-<P>
-To do this, we define an R-dimensional sequence space. For the jth position in
-the alignment, each sequence consists of a single residue which is assigned a
-point S in the space. S has R dimensions, and for sequence i, the rth dimension
-is defined as:
-</P>
-<P>
-<PRE>
- Sr = C(r,Aij)
-</PRE>
-</P>
-<P>
-We then calculate a consensus value for the jth position in the alignment. This
-value X also has R dimensions, and the rth dimension is defined as:
-</P>
-<P>
-<PRE>
- Xr = ( SUM (Fij * C(i,r)) ) / m
- 1<=i<=R
-</PRE>
-</P>
-<P>
-where Fij is the count of residues i at position j in the alignment.
-</P>
-<P>
-Now we can calculate the distance Di between each sequence i and the consensus
-position X in the R-dimensional space.
-</P>
-<P>
-<PRE>
- Di = SQRT ( SUM (Xr - Sr)(Xr - Sr) )
- 1<=r<=R
-</P>
-<P>
-</PRE>
-</P>
-<P>
-The quality score for the jth position in the alignment is defined as the mean
-of the sequence distances Di.
-</P>
-<P>
-The score is normalised by multiplying by the percentage of sequences which
-have residues (and not gaps) at this position.
-</P>
-<P>
-<H3>
-CALCULATION OF RESIDUE EXCEPTIONS
-</H3>
-</P>
-<P>
-The jth residue of the ith sequence is considered as an exception if the
-distance Di of the sequence from the consensus value X is greater than (Upper
-Quartile + Inter Quartile Range * Cutoff). The value used as a cutoff for
-displaying exceptions can be set from the SCORE PARAMETERS menu. A high cutoff
-value will only display very significant exceptions; a low value will allow
-more, less significant, exceptions to be highlighted.
-</P>
-<P>
-(NB. Sequences which contain gaps at this position are not included in the
-exception calculation.)
-</P>
-<P>
-</P>
-<P>
-<H3>
-CALCULATION OF LOW-SCORING SEGMENTS
-</H3>
-</P>
-<P>
-Suppose we have an alignment of m sequences of length n. Then, the alignment
-can be written as:
-</P>
-<P>
-<PRE>
- A11 A12 A13 .......... A1n
- A21 A22 A23 .......... A2n
- .
- .
- Am1 Am2 Am3 .......... Amn
-</PRE>
-</P>
-<P>
-We also have a residue comparison matrix of size R where C(i,j) is the score
-for aligning residue i with residue j.
-</P>
-<P>
-We calculate sequence weights by building a neighbour-joining tree, in which
-branch lengths are proportional to divergence. Summing the branches by branch
-ownership provides the weights. See Thompson et al., CABIOS, 10, 19 (1994) and
-Henikoff et al., JMB, 243, 574 (1994).
-</P>
-<P>
-To find the low-scoring segments in a sequence Si, we build a weighted profile
-of the remaining sequences in the alignment. Suppose we find residue r at
-position j in the sequence; then the score for the jth position in the sequence
-is defined as
-</P>
-<P>
-<PRE>
- Score(Si,j) = Profile(j,r) where Profile(j,r) is the profile score
- for residue r at position j in the
- alignment.
-</PRE>
-</P>
-<P>
-These residue scores are summed along the sequence in both forward and backward
-directions. If the sum of the scores is positive, then it is reset to zero.
-Segments which score negatively in both directions are considered as
-'low-scoring' and will be highlighted in the alignment display.
-</P>
-<P>
-</P>
-<P>
-</P>
-<A HREF="#INDEX"> <EM>Back to Index</EM> </A>
-<CENTER><H2><A NAME="9"> Command Line Parameters
-</A></H2></CENTER>
-<CENTER><H3> DATA (sequences)
-</H3></CENTER>
-<CENTER><TABLE ALIGN=ABSCENTER BORDER=1 CELLSPACING=1 CELLPADDING=5>
-<TR>
-<TD><STRONG>Parameter</STRONG></TD>
-<TD><STRONG><EM>Description</EM></STRONG></TD>
-</TR>
-<TR>
-<TD><TT>-PROFILE1=file.ext and -PROFILE2=file.ext </TT></TD>
-<TD><EM>profiles (aligned sequences)</EM></TD>
-</TR>
-</TABLE></CENTER>
-<CENTER><H3> VERBS (do things)
-</H3></CENTER>
-<CENTER><TABLE ALIGN=ABSCENTER BORDER=1 CELLSPACING=1 CELLPADDING=5>
-<TR>
-<TD><STRONG>Parameter</STRONG></TD>
-<TD><STRONG><EM>Description</EM></STRONG></TD>
-</TR>
-<TR>
-<TD><TT>-HELP or -CHECK </TT></TD>
-<TD><EM>outline the command line parameters</EM></TD>
-</TR>
-<TR>
-<TD><TT>-ALIGN </TT></TD>
-<TD><EM>do full multiple alignment </EM></TD>
-</TR>
-<TR>
-<TD><TT>-TREE </TT></TD>
-<TD><EM>calculate NJ tree</EM></TD>
-</TR>
-<TR>
-<TD><TT>-BOOTSTRAP(=n) </TT></TD>
-<TD><EM>bootstrap a NJ tree (n= number of bootstraps; def. = 1000)</EM></TD>
-</TR>
-<TR>
-<TD><TT>-CONVERT </TT></TD>
-<TD><EM>output the input sequences in a different file format</EM></TD>
-</TR>
-</TABLE></CENTER>
-<CENTER><H3> PARAMETERS (set things)
-</H3></CENTER>
-<CENTER><P><STRONG>***General settings:***
-</STRONG></P></CENTER>
-<CENTER><TABLE ALIGN=ABSCENTER BORDER=1 CELLSPACING=1 CELLPADDING=5>
-<TR>
-<TD><STRONG>Parameter</STRONG></TD>
-<TD><STRONG><EM>Description</EM></STRONG></TD>
-</TR>
-<TR>
-<TD><TT>-INTERACTIVE </TT></TD>
-<TD><EM>read command line, then enter normal interactive menus</EM></TD>
-</TR>
-<TR>
-<TD><TT>-QUICKTREE </TT></TD>
-<TD><EM>use FAST algorithm for the alignment guide tree</EM></TD>
-</TR>
-<TR>
-<TD><TT>-TYPE= </TT></TD>
-<TD><EM>PROTEIN or DNA sequences</EM></TD>
-</TR>
-<TR>
-<TD><TT>-NEGATIVE </TT></TD>
-<TD><EM>protein alignment with negative values in matrix</EM></TD>
-</TR>
-<TR>
-<TD><TT>-OUTFILE= </TT></TD>
-<TD><EM>sequence alignment file name</EM></TD>
-</TR>
-<TR>
-<TD><TT>-OUTPUT= </TT></TD>
-<TD><EM>GCG, GDE, PHYLIP, PIR or NEXUS</EM></TD>
-</TR>
-<TR>
-<TD><TT>-OUTORDER= </TT></TD>
-<TD><EM>INPUT or ALIGNED</EM></TD>
-</TR>
-<TR>
-<TD><TT>-CASE= </TT></TD>
-<TD><EM>LOWER or UPPER (for GDE output only)</EM></TD>
-</TR>
-<TR>
-<TD><TT>-SEQNOS= </TT></TD>
-<TD><EM>OFF or ON (for Clustal output only)</EM></TD>
-</TR>
-</TABLE></CENTER>
-<CENTER><H3>***Fast Pairwise Alignments:***
-</H3></CENTER>
-<CENTER><TABLE ALIGN=ABSCENTER BORDER=1 CELLSPACING=1 CELLPADDING=5>
-<TR>
-<TD><STRONG>Parameter</STRONG></TD>
-<TD><STRONG><EM>Description</EM></STRONG></TD>
-</TR>
-<TR>
-<TD><TT>-TOPDIAGS=n </TT></TD>
-<TD><EM>number of best diags.</EM></TD>
-</TR>
-<TR>
-<TD><TT>-WINDOW=n </TT></TD>
-<TD><EM>window around best diags.</EM></TD>
-</TR>
-<TR>
-<TD><TT>-PAIRGAP=n </TT></TD>
-<TD><EM>gap penalty</EM></TD>
-</TR>
-<TR>
-<TD><TT>-SCORE= </TT></TD>
-<TD><EM>PERCENT or ABSOLUTE</EM></TD>
-</TR>
-</TABLE></CENTER>
-<CENTER><H3>***Slow Pairwise Alignments:***
-</H3></CENTER>
-<CENTER><TABLE ALIGN=ABSCENTER BORDER=1 CELLSPACING=1 CELLPADDING=5>
-<TR>
-<TD><STRONG>Parameter</STRONG></TD>
-<TD><STRONG><EM>Description</EM></STRONG></TD>
-</TR>
-<TR>
-<TD><TT>-PWDNAMATRIX= </TT></TD>
-<TD><EM>DNA weight matrix=IUB, CLUSTALW or filename</EM></TD>
-</TR>
-<TR>
-<TD><TT>-PWGAPOPEN=f </TT></TD>
-<TD><EM>gap opening penalty</EM></TD>
-</TR>
-<TR>
-<TD><TT>-PWGAPEXT=f </TT></TD>
-<TD><EM>gap extension penalty</EM></TD>
-</TR>
-</TABLE></CENTER>
-<CENTER><H3>***Multiple Alignments:***
-</H3></CENTER>
-<CENTER><TABLE ALIGN=ABSCENTER BORDER=1 CELLSPACING=1 CELLPADDING=5>
-<TR>
-<TD><STRONG>Parameter</STRONG></TD>
-<TD><STRONG><EM>Description</EM></STRONG></TD>
-</TR>
-<TR>
-<TD><TT>-USETREE= </TT></TD>
-<TD><EM>file for old guide tree</EM></TD>
-</TR>
-<TR>
-<TD><TT>-MATRIX= </TT></TD>
-<TD><EM>Protein weight matrix=BLOSUM, PAM, GONNET, ID or filename</EM></TD>
-</TR>
-<TR>
-<TD><TT>-DNAMATRIX= </TT></TD>
-<TD><EM>DNA weight matrix=IUB, CLUSTALW or filename</EM></TD>
-</TR>
-<TR>
-<TD><TT>-GAPOPEN=f </TT></TD>
-<TD><EM>gap opening penalty</EM></TD>
-</TR>
-<TR>
-<TD><TT>-GAPEXT=f </TT></TD>
-<TD><EM>gap extension penalty</EM></TD>
-</TR>
-<TR>
-<TD><TT>-ENDGAPS </TT></TD>
-<TD><EM>no end gap separation pen.</EM></TD>
-</TR>
-<TR>
-<TD><TT>-GAPDIST=n </TT></TD>
-<TD><EM>gap separation pen. range</EM></TD>
-</TR>
-<TR>
-<TD><TT>-NOPGAP </TT></TD>
-<TD><EM>residue-specific gaps off</EM></TD>
-</TR>
-<TR>
-<TD><TT>-NOHGAP </TT></TD>
-<TD><EM>hydrophilic gaps off</EM></TD>
-</TR>
-<TR>
-<TD><TT>-HGAPRESIDUES= </TT></TD>
-<TD><EM>list hydrophilic res.</EM></TD>
-</TR>
-<TR>
-<TD><TT>-MAXDIV=n </TT></TD>
-<TD><EM>% ident. for delay</EM></TD>
-</TR>
-<TR>
-<TD><TT>-TYPE= </TT></TD>
-<TD><EM>PROTEIN or DNA</EM></TD>
-</TR>
-<TR>
-<TD><TT>-TRANSWEIGHT=f </TT></TD>
-<TD><EM>transitions weighting</EM></TD>
-</TR>
-</TABLE></CENTER>
-<CENTER><H3>***Profile Alignments:***
-</H3></CENTER>
-<CENTER><TABLE ALIGN=ABSCENTER BORDER=1 CELLSPACING=1 CELLPADDING=5>
-<TR>
-<TD><STRONG>Parameter</STRONG></TD>
-<TD><STRONG><EM>Description</EM></STRONG></TD>
-</TR>
-<TR>
-<TD><TT>-NEWTREE1= </TT></TD>
-<TD><EM>file for new guide tree for profile1</EM></TD>
-</TR>
-<TR>
-<TD><TT>-NEWTREE2= </TT></TD>
-<TD><EM>file for new guide tree for profile2</EM></TD>
-</TR>
-<TR>
-<TD><TT>-USETREE1= </TT></TD>
-<TD><EM>file for old guide tree for profile1</EM></TD>
-</TR>
-<TR>
-<TD><TT>-USETREE2= </TT></TD>
-<TD><EM>file for old guide tree for profile2</EM></TD>
-</TR>
-</TABLE></CENTER>
-<CENTER><H3>***Sequence to Profile Alignments:***
-</H3></CENTER>
-<CENTER><TABLE ALIGN=ABSCENTER BORDER=1 CELLSPACING=1 CELLPADDING=5>
-<TR>
-<TD><STRONG>Parameter</STRONG></TD>
-<TD><STRONG><EM>Description</EM></STRONG></TD>
-</TR>
-<TR>
-<TD><TT>-NEWTREE= </TT></TD>
-<TD><EM>file for new guide tree</EM></TD>
-</TR>
-<TR>
-<TD><TT>-USETREE= </TT></TD>
-<TD><EM>file for old guide tree</EM></TD>
-</TR>
-</TABLE></CENTER>
-<CENTER><H3>***Structure Alignments:***
-</H3></CENTER>
-<CENTER><TABLE ALIGN=ABSCENTER BORDER=1 CELLSPACING=1 CELLPADDING=5>
-<TR>
-<TD><STRONG>Parameter</STRONG></TD>
-<TD><STRONG><EM>Description</EM></STRONG></TD>
-</TR>
-<TR>
-<TD><TT>-NOSECSTR2 </TT></TD>
-<TD><EM>do not use secondary structure/gap penalty mask for profile 2</EM></TD>
-</TR>
-<TR>
-<TD><TT>-SECSTROUT=STRUCTURE or MASK or BOTH or NONE </TT></TD>
-<TD><EM>output in alignment file</EM></TD>
-</TR>
-<TR>
-<TD><TT>-HELIXGAP=n </TT></TD>
-<TD><EM>gap penalty for helix core residues </EM></TD>
-</TR>
-<TR>
-<TD><TT>-STRANDGAP=n </TT></TD>
-<TD><EM>gap penalty for strand core residues</EM></TD>
-</TR>
-<TR>
-<TD><TT>-LOOPGAP=n </TT></TD>
-<TD><EM>gap penalty for loop regions</EM></TD>
-</TR>
-<TR>
-<TD><TT>-TERMINALGAP=n </TT></TD>
-<TD><EM>gap penalty for structure termini</EM></TD>
-</TR>
-<TR>
-<TD><TT>-HELIXENDIN=n </TT></TD>
-<TD><EM>number of residues inside helix to be treated as terminal</EM></TD>
-</TR>
-<TR>
-<TD><TT>-HELIXENDOUT=n </TT></TD>
-<TD><EM>number of residues outside helix to be treated as terminal</EM></TD>
-</TR>
-<TR>
-<TD><TT>-STRANDENDIN=n </TT></TD>
-<TD><EM>number of residues inside strand to be treated as terminal</EM></TD>
-</TR>
-<TR>
-<TD><TT>-STRANDENDOUT=n</TT></TD>
-<TD><EM>number of residues outside strand to be treated as terminal </EM></TD>
-</TR>
-</TABLE></CENTER>
-<CENTER><H3>***Trees:***
-</H3></CENTER>
-<CENTER><TABLE ALIGN=ABSCENTER BORDER=1 CELLSPACING=1 CELLPADDING=5>
-<TR>
-<TD><STRONG>Parameter</STRONG></TD>
-<TD><STRONG><EM>Description</EM></STRONG></TD>
-</TR>
-<TR>
-<TD><TT>-SEED=n </TT></TD>
-<TD><EM>seed number for bootstraps</EM></TD>
-</TR>
-<TR>
-<TD><TT>-KIMURA </TT></TD>
-<TD><EM>use Kimura's correction</EM></TD>
-</TR>
-<TR>
-<TD><TT>-TOSSGAPS </TT></TD>
-<TD><EM>ignore positions with gaps</EM></TD>
-</TR>
-<TR>
-<TD><TT>-BOOTLABELS=node OR branch </TT></TD>
-<TD><EM>position of bootstrap values in tree display</EM></TD>
-</TR>
-</TABLE></CENTER>
-</P>
-<A HREF="#INDEX"> <EM>Back to Index</EM> </A>
-<CENTER><H2><A NAME="R"> References
-</A></H2></CENTER>
-<P>
-</P>
-<P>
-<STRONG>
-The ClustalX program is described in the manuscript:
-</STRONG>
-</P>
-<P>
-Thompson,J.D., Gibson,T.J., Plewniak,F., Jeanmougin,F. and Higgins,D.G. (1997)
-The ClustalX windows interface: flexible strategies for multiple sequence
-alignment aided by quality analysis tools. Nucleic Acids Research, 25:4876-4882.
-</P>
-<P>
-</P>
-<P>
-<STRONG>
-The ClustalW program is described in the manuscript:
-</STRONG>
-</P>
-<P>
-Thompson, J.D., Higgins, D.G. and Gibson, T.J. (1994) CLUSTAL W: improving the
-sensitivity of progressive multiple sequence alignment through sequence
-weighting, position-specific gap penalties and weight matrix choice. Nucleic
-Acids Research, 22:4673-4680.
-</P>
-<P>
-</P>
-<P>
-<STRONG>
-The ClustalV program is described in the manuscript:
-</STRONG>
-</P>
-<P>
-Higgins,D.G., Bleasby,A.J. and Fuchs,R. (1992) CLUSTAL V: improved software for
-multiple sequence alignment. CABIOS 8,189-191.
-</P>
-<P>
-</P>
-<P>
-<STRONG>
-The original Clustal program is described in the manuscripts:
-</STRONG>
-</P>
-<P>
-Higgins,D.G. and Sharp,P.M. (1989) Fast and sensitive multiple sequence
-alignments on a microcomputer.
-CABIOS 5,151-153.
-</P>
-<P>
-Higgins,D.G. and Sharp,P.M. (1988) CLUSTAL: a package for performing multiple
-sequence alignment on a microcomputer. Gene 73,237-244.
-</P>
-<P>
-<STRONG>
-Some tips on using Clustal X:
-</STRONG>
-</P>
-<P>
-Jeanmougin,F., Thompson,J.D., Gouy,M., Higgins,D.G. and Gibson,T.J. (1998)
-Multiple sequence alignment with Clustal X. Trends Biochem Sci, 23, 403-5.
-</P>
-<P>
-<STRONG>
-Some tips on using Clustal W:
-</STRONG>
-</P>
-<P>
-Higgins, D. G., Thompson, J. D. and Gibson, T. J. (1996) Using CLUSTAL for
-multiple sequence alignments. Methods Enzymol., 266, 383-402.
-</P>
-<P>
-<STRONG>
-You can get the latest version of the ClustalX program by anonymous ftp to:
-</STRONG>
-</P>
-<P>
-ftp-igbmc.u-strasbg.fr
-ftp.embl-heidelberg.de
-ftp.ebi.ac.uk
-</P>
-<P>
-<STRONG>
-Or, have a look at the following WWW site:
-</STRONG>
-</P>
-<P>
-http://www-igbmc.u-strasbg.fr/BioInfo/
-</P>
-<P>
-</P>
-<A HREF="#INDEX"> <EM>Back to Index</EM> </A>
Deleted: trunk/packages/clustalw/trunk/clustalx_help
===================================================================
--- trunk/packages/clustalw/trunk/clustalx_help 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/clustalx_help 2007-08-12 15:08:40 UTC (rev 399)
@@ -1,3043 +0,0 @@
-
-This is the on-line help file for Clustal X (version 1.81), using the NCBI
-Vibrant Toolkit.
-
-It should be named or defined as: clustalx_help
-except with MSDOS in which case it should be named ClustalX.HLP
-
-For full details of usage and algorithms, please read the CLUSTALW.DOC file.
-
-
-Toby Gibson EMBL, Heidelberg, Germany.
-Des Higgins UCC, Cork, Ireland.
-Julie Thompson/Francois Jeanmougin IGBMC, Strasbourg, France.
-
-
-
-
->>HELP G <<
- General help for CLUSTAL X (1.8)
-
-Clustal X is a windows interface for the ClustalW multiple sequence alignment
-program. It provides an integrated environment for performing multiple sequence
-and profile alignments and analysing the results. The sequence alignment is
-displayed in a window on the screen. A versatile coloring scheme has been
-incorporated allowing you to highlight conserved features in the alignment.
-The pull-down menus at the top of the window allow you to select all the
-options required for traditional multiple sequence and profile alignment.
-
-You can cut-and-paste sequences to change the order of the alignment; you can
-select a subset of sequences to be aligned; you can select a sub-range of the
-alignment to be realigned and inserted back into the original alignment.
-
-Alignment quality analysis can be performed and low-scoring segments or
-exceptional residues can be highlighted.
-
-ClustalX is available for a number of different platforms including: SUN
-Solaris, IRIX5.3 on Silicon Graphics, Digital UNIX on DECStations, Microsoft
-Windows (32 bit) for PC's, Linux ELF for x86 PC's and Macintosh PowerMac. (See
-the README file for Installation instructions.)
-
-
-<H4>
-SEQUENCE INPUT
-</H4>
-
-Sequences and profiles (a term for pre-existing alignments) are input using
-the FILE menu. Invalid options will be disabled. All sequences must be included
-into 1 file. 7 formats are automatically recognised: NBRF/PIR, EMBL/SWISSPROT,
-Pearson (Fasta), Clustal (*.aln), GCG/MSF (Pileup), GCG9 RSF and GDE flat file.
-All non-alphabetic characters (spaces, digits, punctuation marks) are ignored
-except "-" which is used to indicate a GAP ("." in MSF/RSF).
-
-<H4>
-SEQUENCE / PROFILE ALIGNMENTS
-</H4>
-
-Clustal X has two modes which can be selected using the switch directly above
-the sequence display: MULTIPLE ALIGNMENT MODE and PROFILE ALIGNMENT MODE.
-
-To do a MULTIPLE ALIGNMENT on a set of sequences, make sure MULTIPLE ALIGNMENT
-MODE is selected. A single sequence data area is then displayed. The ALIGNMENT
-menu then allows you to either produce a guide tree for the alignment, or to do
-a multiple alignment following the guide tree, or to do a full multiple
-alignment.
-
-In PROFILE ALIGNMENT MODE, two sequence data areas are displayed, allowing you
-to align 2 alignments (termed profiles). Profiles are also used to add a new
-sequence to an old alignment, or to use secondary structure to guide the
-alignment process. GAPS in the old alignments are indicated using the "-"
-character. PROFILES can be input in ANY of the allowed formats; just use "-"
-(or "." for MSF/RSF) for each gap position. In Profile Alignment Mode, a button
-"Lock Scroll" is displayed which allows you to scroll the two profiles together
-using a single scroll bar. When the Lock Scroll is turned off, the two profiles
-can be scrolled independently.
-
-<H4>
-PHYLOGENETIC TREES
-</H4>
-
-Phylogenetic trees can be calculated from old alignments (read in with "-"
-characters to indicate gaps) OR after a multiple alignment while the alignment
-is still displayed.
-
-<H4>
-ALIGNMENT DISPLAY
-</H4>
-
-The alignment is displayed on the screen with the sequence names on the left
-hand side. The sequence alignment is for display only, it cannot be edited here
-(except for changing the sequence order by cutting-and-pasting on the sequence
-names).
-
-A ruler is displayed below the sequences, starting at 1 for the first residue
-position (residue numbers in the sequence input file are ignored).
-
-A line above the alignment is used to mark strongly conserved positions. Three
-characters ('*', ':' and '.') are used:
-
-'*' indicates positions which have a single, fully conserved residue
-
-':' indicates that one of the following 'strong' groups is fully conserved:-
-<PRE>
- STA
- NEQK
- NHQK
- NDEQ
- QHRK
- MILV
- MILF
- HY
- FYW
-</PRE>
-
-'.' indicates that one of the following 'weaker' groups is fully conserved:-
-<PRE>
- CSA
- ATV
- SAG
- STNK
- STPA
- SGND
- SNDEQK
- NDEQHK
- NEQHRK
- FVLIM
- HFY
-</PRE>
-
-These are all the positively scoring groups that occur in the Gonnet Pam250
-matrix. The strong and weak groups are defined as strong score >0.5 and weak
-score =<0.5 respectively.
-
-For profile alignments, secondary structure and gap penalty masks are displayed
-above the sequences, if any data is found in the profile input file.
-
-
->>HELP F <<
- Input / Output Files
-
-LOAD SEQUENCES reads sequences from one of 7 file formats, replacing any
-sequences that are already loaded. All sequences must be in 1 file. The formats
-that are automatically recognised are: NBRF/PIR, EMBL/SWISSPROT, Pearson
-(Fasta), Clustal (*.aln), GCG/MSF (Pileup), GCG9/RSF and GDE flat file. All
-non-alphabetic characters (spaces, digits, punctuation marks) are ignored
-except "-" which is used to indicate a GAP ("." in MSF/RSF).
-
-The program tries to automatically recognise the different file formats used
-and to guess whether the sequences are amino acid or nucleotide. This is not
-always foolproof.
-
-FASTA and NBRF/PIR formats are recognised by having a ">" as the first
-character in the file.
-
-EMBL/Swiss Prot formats are recognised by the letters "ID" at the start of the
-file (the token for the entry name field).
-
-CLUSTAL format is recognised by the word CLUSTAL at the beginning of the file.
-
-GCG/MSF format is recognised by one of the following:
-<UL>
-<LI>
- - the word PileUp at the start of the file.
-</LI><LI>
- - the word !!AA_MULTIPLE_ALIGNMENT or !!NA_MULTIPLE_ALIGNMENT
- at the start of the file.
-</LI><LI>
- - the word MSF on the first line of the file, and the characters ..
- at the end of this line.
-</LI>
-</UL>
-
-GCG/RSF format is recognised by the word !!RICH_SEQUENCE at the beginning of
-the file.
-
-
-If 85% or more of the characters in the sequence are from A,C,G,T,U or N, the
-sequence will be assumed to be nucleotide. This works in 97.3% of cases but
-watch out!
-
-APPEND SEQUENCES is only valid in MULTIPLE ALIGNMENT MODE. The input sequences
-do not replace those already loaded, but are appended at the end of the
-alignment.
-
-SAVE SEQUENCES AS... offers the user a choice of one of six output formats:
-CLUSTAL, NBRF/PIR, GCG/MSF, PHYLIP, NEXUS or GDE. All sequences are written
-to a single file. Options are available to save a range of the alignment,
-switch between UPPER/LOWER case for GDE files, and to output SEQUENCE NUMBERING
-for CLUSTAL files.
-
-LOAD PROFILE 1 reads sequences in the same 7 file formats, replacing any
-sequences already loaded as Profile 1. This option will also remove any
-sequences which are loaded in Profile 2.
-
-LOAD PROFILE 2 reads sequences in the same 7 file formats, replacing any
-sequences already loaded as Profile 2.
-
-SAVE PROFILE 1 AS... is similar to the Save Sequences option except that only
-those sequences in Profile 1 will be written to the output file.
-
-SAVE PROFILE 2 AS... is similar to the Save Sequences option except that only
-those sequences in Profile 2 will be written to the output file.
-
-WRITE ALIGNMENT AS POSTSCRIPT will write the sequence display to a postscript
-format file. This will include any secondary structure / gap penalty mask
-information and the consensus and ruler lines which are displayed on the
-screen. The Alignment Quality curve can be optionally included in the output
-file.
-
-WRITE PROFILE 1 AS POSTSCRIPT is similar to WRITE ALIGNMENT AS POSTSCRIPT
-except that only the profile 1 display will be printed.
-
-WRITE PROFILE 2 AS POSTSCRIPT is similar to WRITE ALIGNMENT AS POSTSCRIPT
-except that only the profile 2 display will be printed.
-
-
-<H4>
-POSTSCRIPT PARAMETERS
-</H4>
-
-A number of options are available to allow you to configure your postscript
-output file.
-
-PS COLORS FILE:
-
-The exact RGB values required to reproduce the colors used in the alignment
-window will vary from printer to printer. A PS colors file can be specified
-that contains the RGB values for all the colors required by each of your
-postscript printers.
-
-By default, Clustal X looks for a file called 'colprint.par' in the current
-directory (if you're running under UNIX, it then looks in your home directory,
-and finally in the directories in your PATH environment variable). If no PS
-colors file is found or a color used on the screen is not defined here, the
-screen RGB values (from the Color Parameter File) are used.
-
-The PS colors file consists of one line for each color to be defined, with the
-color name followed by the RGB values (on a scale of 0 to 1). For example,
-
-RED 0.9 0.1 0.1
-
-Blank lines and comments (lines beginning with a '#' character) are ignored.
-
-
-PAGE SIZE: The alignment can be displayed on either A4, A3 or US Letter size
-pages.
-
-ORIENTATION: The alignment can be displayed on either a landscape or portrait
-page.
-
-PRINT HEADER: An optional header including the postscript filename, and
-creation date can be printed at the top of each page.
-
-PRINT QUALITY CURVE: The Alignment Quality curve which is displayed underneath
-the alignment on the screen can be included in the postscript output.
-
-PRINT RULER: The ruler which is displayed underneath the alignment on the
-screen can be included in the postscript output.
-
-PRINT RESIDUE NUMBERS: Sequence residue numbers can be printed at the right
-hand side of the alignment.
-
-RESIZE TO FIT PAGE: By default, the alignment is scaled to fit the page size
-selected. This option can be turned off, in which case a font size of 10 will
-be used for the sequences.
-
-PRINT FROM POSITION/TO: A range of the alignment can be printed. The default
-is to print the full alignment. The first and last residues to be printed are
-specified here.
-
-USE BLOCK LENGTH: The alignment can be divided into blocks of residues. The
-number of residues in a block is specified here. More than one block may then
-be printed on a single page. This is useful for long alignments of a small
-number of sequences. If the block length is set to 0, the alignment will not
-be divided into blocks, but printed across a number of pages.
-
->>HELP E <<
- Editing Alignments
-
-Clustal X allows you to change the order of the sequences in the alignment, by
-cutting-and-pasting the sequence names.
-
-To select a group of sequences to be moved, click on a sequence name and drag
-the cursor until all the required sequences are highlighted. Holding down the
-Shift key when clicking on the first name will add new sequences to those
-already selected.
-
-(Options are provided to Select All Sequences, Select Profile 1 or Select
-Profile 2.)
-
-The selected sequences can be removed from the alignment by using the EDIT
-menu, CUT option.
-
-To add the cut sequences back into an alignment, select a sequence by clicking
-on the sequence name. The cut sequences will be added to the alignment,
-immediately following the selected sequence, by the EDIT menu, PASTE option.
-
-To add the cut sequences to an empty alignment (eg. when cutting sequences from
-Profile 1 and pasting them to Profile 2), click on the empty sequence name
-display area, and select the EDIT menu, PASTE option as before.
-
-The sequence selection and sequence range selection can be cleared using the
-EDIT menu, CLEAR SEQUENCE SELECTION and CLEAR RANGE SELECTION options
-respectively.
-
-To search for a string of residues in the sequences, select the sequences to be
-searched by clicking on the sequence names. You can then enter the string to
-search for by selecting the SEARCH FOR STRING option. If the string is found in
-any of the sequences selected, the sequence name and column number are printed
-below the sequence display.
-
-In PROFILE ALIGNMENT MODE, the two profiles can be merged (normally done after
-alignment) by selecting ADD PROFILE 2 TO PROFILE 1. The sequences currently
-displayed as Profile 2 will be appended to Profile 1.
-
-The REMOVE ALL GAPS option will remove all gaps from the sequences currently
-selected.
-WARNING: This option removes ALL gaps, not only those introduced by ClustalX,
-but also those that were read from the input alignment file. Any secondary
-structure information associated with the alignment will NOT be automatically
-realigned.
-
-The REMOVE GAP-ONLY COLUMNS will remove those positions in the alignment which
-contain gaps in all sequences. This can occur as a result of removing divergent
-sequences from an alignment, or if an alignment has been realigned.
-
->>HELP M <<
- Multiple Alignments
-
-Make sure MULTIPLE ALIGNMENT MODE is selected, using the switch directly above
-the sequence display area. Then, use the ALIGNMENT menu to do multiple
-alignments.
-
-Multiple alignments are carried out in 3 stages:
-
-1) all sequences are compared to each other (pairwise alignments);
-
-2) a dendrogram (like a phylogenetic tree) is constructed, describing the
-approximate groupings of the sequences by similarity (stored in a file).
-
-3) the final multiple alignment is carried out, using the dendrogram as a guide.
-
-The 3 stages are carried out automatically by the DO COMPLETE ALIGNMENT option.
-You can skip the first stages (pairwise alignments; guide tree) by using an old
-guide tree file (DO ALIGNMENT FROM GUIDE TREE); or you can just produce the
-guide tree with no final multiple alignment (PRODUCE GUIDE TREE ONLY).
-
-
-REALIGN SELECTED SEQUENCES is used to realign badly aligned sequences in the
-alignment. Sequences can be selected by clicking on the sequence names - see
-Editing Alignments for more details. The unselected sequences are then 'fixed'
-and a profile is made including only the unselected sequences. Each of the
-selected sequences in turn is then realigned to this profile. The realigned
-sequences will be displayed as a group at the end of the alignment.
-
-
-REALIGN SELECTED SEQUENCE RANGE is used to realign a small region of the
-alignment. A residue range can be selected by clicking on the sequence display
-area. A multiple alignment is then performed, following the 3 stages described
-above, but only using the selected residue range. Finally the new alignment of
-the range is pasted back into the full sequence alignment.
-
-By default, gap penalties are used at each end of the subrange in order to
-penalise terminal gaps. If the REALIGN SEGMENT END GAP PENALTIES option is
-switched off, gaps can be introduced at the ends of the residue range at no
-cost.
-
-
-ALIGNMENT PARAMETERS displays a sub-menu with the following options:
-
-RESET NEW GAPS BEFORE ALIGNMENT will remove any new gaps introduced into the
-sequences during multiple alignment if you wish to change the parameters and
-try again. This only takes effect just before you do a second multiple
-alignment. You can make phylogenetic trees after alignment whether or not this
-is ON. If you turn this OFF, the new gaps are kept even if you do a second
-multiple alignment. This allows you to iterate the alignment gradually.
-Sometimes, the alignment is improved by a second or third pass.
-
-RESET ALL GAPS BEFORE ALIGNMENT will remove all gaps in the sequences including
-gaps which were read in from the sequence input file. This only takes effect
-just before you do a second multiple alignment. You can make phylogenetic
-trees after alignment whether or not this is ON. If you turn this OFF, all
-gaps are kept even if you do a second multiple alignment. This allows you to
-iterate the alignment gradually. Sometimes, the alignment is improved by a
-second or third pass.
-
-
-PAIRWISE ALIGNMENT PARAMETERS control the speed/sensitivity of the initial
-alignments.
-
-MULTIPLE ALIGNMENT PARAMETERS control the gaps in the final multiple
-alignments.
-
-PROTEIN GAP PARAMETERS displays a temporary window which allows you to set
-various parameters only used in the alignment of protein sequences.
-
-(SECONDARY STRUCTURE PARAMETERS, for use with the Profile Alignment Mode only,
-allows you to set various parameters only used with gap penalty masks.)
-
-SAVE LOG FILE will write the alignment calculation scores to a file. The log
-filename is the same as the input sequence filename, with an extension .log
-appended.
-
-
-<H4>
-OUTPUT FORMAT OPTIONS
-</H4>
-
-You can choose from 6 different alignment formats (CLUSTAL, GCG, NBRF/PIR,
-PHYLIP, GDE and NEXUS). You can choose more than one (or all 6 if you wish).
-
-CLUSTAL format output is a self explanatory alignment format. It shows the
-sequences aligned in blocks. It can be read in again at a later date to (for
-example) calculate a phylogenetic tree or add in new sequences by profile
-alignment.
-
-GCG output can be used by any of the GCG programs that can work on multiple
-alignments (e.g. PRETTY, PROFILEMAKE, PLOTALIGN). It is the same as the GCG
-.msf format files (multiple sequence file); new in version 7 of GCG.
-
-NEXUS format is used by several phylogeny programs, including PAUP and
-MacClade.
-
-PHYLIP format output can be used for input to the PHYLIP package of Joe
-Felsenstein. This is a very widely used package for doing every imaginable
-form of phylogenetic analysis (MUCH more than the modest introduction
-offered by this program).
-
-NBRF/PIR: this is the same as the standard PIR format with ONE ADDITION. Gap
-characters "-" are used to indicate the positions of gaps in the multiple
-alignment. These files can be re-used as input in any part of clustal that
-allows sequences (or alignments or profiles) to be read in.
-
-GDE: this format is used by the GDE package of Steven Smith and is understood
-by SEQLAB in GCG 9 or later.
-
-GDE OUTPUT CASE: sequences in GDE format may be written in either upper or
-lower case.
-
-CLUSTALW SEQUENCE NUMBERS: residue numbers may be added to the end of the
-alignment lines in clustalw format.
-
-OUTPUT ORDER is used to control the order of the sequences in the output
-alignments. By default, it uses the order in which the sequences were aligned
-(from the guide tree/dendrogram), thus automatically grouping closely related
-sequences. It can be switched to be the same as the original input order.
-
-PARAMETER OUTPUT: This option will save all your parameter settings in a
-parameter file (suffix .par) during alignment. The file can be subsequently
-used to rerun ClustalW using the same parameters.
-
-
-<H3>
-ALIGNMENT PARAMETERS
-</H3>
---------------------
-
-<STRONG>
-PAIRWISE ALIGNMENT PARAMETERS
-</STRONG>
-
-A distance is calculated between every pair of sequences and these are used to
-construct the phylogenetic tree which guides the final multiple alignment. The
-scores are calculated from separate pairwise alignments. These can be
-calculated using 2 methods: dynamic programming (slow but accurate) or by the
-method of Wilbur and Lipman (extremely fast but approximate).
-
-You can choose between the 2 alignment methods using the PAIRWISE ALIGNMENTS
-option. The slow/accurate method is fast enough for short sequences but will be
-VERY SLOW for many (e.g. >100) long (e.g. >1000 residue) sequences.
-
-
-<STRONG>
-SLOW-ACCURATE alignment parameters:
-</STRONG>
-
-These parameters do not have any effect on the speed of the alignments. They
-are used to give initial alignments which are then rescored to give percent
-identity scores. These % scores are the ones which are displayed on the
-screen. The scores are converted to distances for the trees.
-
-Gap Open Penalty: the penalty for opening a gap in the alignment.
-
-Gap Extension Penalty: the penalty for extending a gap by 1 residue.
-
-Protein Weight Matrix: the scoring table which describes the similarity of
-each amino acid to each other.
-
-Load protein matrix: allows you to read in a comparison table from a file.
-
-DNA weight matrix: the scores assigned to matches and mismatches (including
-IUB ambiguity codes).
-
-Load DNA matrix: allows you to read in a comparison table from a file.
-
-See the Multiple alignment parameters, MATRIX option below for details of the
-matrix input format.
-
-
-<STRONG>
-FAST-APPROXIMATE alignment parameters:
-</STRONG>
-
-These similarity scores are calculated from fast, approximate, global align-
-ments, which are controlled by 4 parameters. 2 techniques are used to make
-these alignments very fast: 1) only exactly matching fragments (k-tuples) are
-considered; 2) only the 'best' diagonals (the ones with most k-tuple matches)
-are used.
-
-GAP PENALTY: This is a penalty for each gap in the fast alignments. It has
-little effect on the speed or sensitivity except for extreme values.
-
-K-TUPLE SIZE: This is the size of exactly matching fragment that is used.
-INCREASE for speed (max= 2 for proteins; 4 for DNA), DECREASE for sensitivity.
-For longer sequences (e.g. >1000 residues) you may wish to increase the
-default.
-
-TOP DIAGONALS: The number of k-tuple matches on each diagonal (in an imaginary
-dot-matrix plot) is calculated. Only the best ones (with most matches) are used
-in the alignment. This parameter specifies how many. Decrease for speed;
-increase for sensitivity.
-
-WINDOW SIZE: This is the number of diagonals around each of the 'best'
-diagonals that will be used. Decrease for speed; increase for sensitivity.
-
-
-<STRONG>
-MULTIPLE ALIGNMENT PARAMETERS
-</STRONG>
-
-These parameters control the final multiple alignment. This is the core of the
-program and the details are complicated. To fully understand the use of the
-parameters and the scoring system, you will have to refer to the documentation.
-
-Each step in the final multiple alignment consists of aligning two alignments
-or sequences. This is done progressively, following the branching order in the
-GUIDE TREE. The basic parameters to control this are two gap penalties and the
-scores for various identical/non-identical residues.
-
-The GAP OPENING and EXTENSION PENALTIES can be set here. These control the
-cost of opening up every new gap and the cost of every item in a gap.
-Increasing the gap opening penalty will make gaps less frequent. Increasing
-the gap extension penalty will make gaps shorter. Terminal gaps are not
-penalised.
-
-The DELAY DIVERGENT SEQUENCES switch delays the alignment of the most distantly
-related sequences until after the most closely related sequences have been
-aligned. The setting shows the percent identity level required to delay the
-addition of a sequence; sequences that are less identical than this level to
-any other sequences will be aligned later.
-
-The TRANSITION WEIGHT gives transitions (A<-->G or C<-->T i.e. purine-purine or
-pyrimidine-pyrimidine substitutions) a weight between 0 and 1; a weight of zero
-means that the transitions are scored as mismatches, while a weight of 1 gives
-the transitions the match score. For distantly related DNA sequences, the
-weight should be near to zero; for closely related sequences it can be useful
-to assign a higher score. The default is set to 0.5.
-
-
-The PROTEIN WEIGHT MATRIX option allows you to choose a series of weight
-matrices. For protein alignments, you use a weight matrix to determine the
-similarity of non-identical amino acids. For example, Tyr aligned with Phe is
-usually judged to be 'better' than Tyr aligned with Pro.
-
-There are three 'in-built' series of weight matrices offered. Each consists of
-several matrices which work differently at different evolutionary distances. To
-see the exact details, read the documentation. Crudely, we store several
-matrices in memory, spanning the full range of amino acid distance (from almost
-identical sequences to highly divergent ones). For very similar sequences, it
-is best to use a strict weight matrix which only gives a high score to
-identities and the most favoured conservative substitutions. For more divergent
-sequences, it is appropriate to use "softer" matrices which give a high score
-to many other frequent substitutions.
-
-1) BLOSUM (Henikoff). These matrices appear to be the best available for
-carrying out data base similarity (homology searches). The matrices currently
-used are: Blosum 80, 62, 45 and 30. BLOSUM was the default in earlier Clustal X
-versions.
-
-2) PAM (Dayhoff). These have been extremely widely used since the late '70s. We
-currently use the PAM 20, 60, 120, 350 matrices.
-
-3) GONNET. These matrices were derived using almost the same procedure as the
-Dayhoff one (above) but are much more up to date and are based on a far larger
-data set. They appear to be more sensitive than the Dayhoff series. We
-currently use the GONNET 80, 120, 160, 250 and 350 matrices. This series is the
-default for Clustal X version 1.8.
-
-We also supply an identity matrix which gives a score of 10 to two identical
-amino acids and a score of zero otherwise. This matrix is not very useful.
-
-Load protein matrix: allows you to read in a comparison matrix from a file.
-This can be either a single matrix or a series of matrices (see below for
-format).
-
-
-DNA WEIGHT MATRIX option allows you to select a single matrix (not a series)
-used for aligning nucleic acid sequences. Two hard-coded matrices are available:
-
-1) IUB. This is the default scoring matrix used by BESTFIT for the comparison
-of nucleic acid sequences. X's and N's are treated as matches to any IUB
-ambiguity symbol. All matches score 1.9; all mismatches for IUB symbols score 0.
-
-2) CLUSTALW(1.6). A previous system used by ClustalW, in which matches score
-1.0 and mismatches score 0. All matches for IUB symbols also score 0.
-
-Load DNA matrix: allows you to read in a nucleic acid comparison matrix from a
-file (just one matrix, not a series).
-
-
-SINGLE MATRIX INPUT FORMAT
-The format used for a single matrix is the same as the BLAST program. The
-scores in the new weight matrix should be similarities. You can use negative as
-well as positive values if you wish, although the matrix will be automatically
-adjusted to all positive scores, unless the NEGATIVE MATRIX option is selected.
-Any lines beginning with a # character are assumed to be comments. The first
-non-comment line should contain a list of amino acids in any order, using the 1
-letter code, followed by a * character. This should be followed by a square
-matrix of scores, with one row and one column for each amino acid. The last row
-and column of the matrix (corresponding to the * character) contain the minimum
-score over the whole matrix.
-
-MATRIX SERIES INPUT FORMAT
-ClustalX uses different matrices depending on the mean percent identity of the
-sequences to be aligned. You can specify a series of matrices and the range of
-the percent identity for each matrix in a matrix series file. The file is
-automatically recognised by the word CLUSTAL_SERIES at the beginning of the
-file. Each matrix in the series is then specified on one line which should
-start with the word MATRIX. This is followed by the lower and upper limits of
-the sequence percent identities for which you want to apply the matrix. The
-final entry on the matrix line is the filename of a Blast format matrix file
-(see above for details of the single matrix file format).
-
-Example.
-
-CLUSTAL_SERIES
-
-MATRIX 81 100 /us1/user/julie/matrices/blosum80
-MATRIX 61 80 /us1/user/julie/matrices/blosum62
-MATRIX 31 60 /us1/user/julie/matrices/blosum45
-MATRIX 0 30 /us1/user/julie/matrices/blosum30
-
-
-<STRONG>
-PROTEIN GAP PARAMETERS
-</STRONG>
-
-RESIDUE SPECIFIC PENALTIES are amino acid specific gap penalties that reduce or
-increase the gap opening penalties at each position in the alignment or
-sequence. See the documentation for details. As an example, positions that are
-rich in glycine are more likely to have an adjacent gap than positions that are
-rich in valine.
-
-HYDROPHILIC GAP PENALTIES are used to increase the chances of a gap within a
-run (5 or more residues) of hydrophilic amino acids; these are likely to be
-loop or random coil regions where gaps are more common. The residues that are
-"considered" to be hydrophilic can be entered in HYDROPHILIC RESIDUES.
-
-GAP SEPARATION DISTANCE tries to decrease the chances of gaps being too close
-to each other. Gaps that are less than this distance apart are penalised more
-than other gaps. This does not prevent close gaps; it makes them less frequent,
-promoting a block-like appearance of the alignment.
-
-END GAP SEPARATION treats end gaps just like internal gaps for the purposes of
-avoiding gaps that are too close (set by GAP SEPARATION DISTANCE above). If you
-turn this off, end gaps will be ignored for this purpose. This is useful when
-you wish to align fragments where the end gaps are not biologically meaningful.
-
-
->>HELP P <<
- Profile and Structure Alignments
-
-By PROFILE ALIGNMENT, we mean alignment using existing alignments. Profile
-alignments allow you to store alignments of your favourite sequences and add
-new sequences to them in small bunches at a time. A profile is simply an
-alignment of one or more sequences (e.g. an alignment output file from Clustal
-X). Each input can be a single sequence. One or both sets of input sequences
-may include secondary structure assignments or gap penalty masks to guide the
-alignment.
-
-Make sure PROFILE ALIGNMENT MODE is selected, using the switch directly above
-the sequence display area. Then, use the ALIGNMENT menu to do profile and
-secondary structure alignments.
-
-The profiles can be in any of the allowed input formats with "-" characters
-used to specify gaps (except for GCG/MSF where "." is used).
-
-You have to load the 2 profiles by choosing FILE, LOAD PROFILE 1 and LOAD
-PROFILE 2. Then ALIGNMENT, ALIGN PROFILE 2 TO PROFILE 1 will align the 2
-profiles to each other. Secondary structure masks in either profile can be used
-to guide the alignment. This option compares all the sequences in profile 1
-with all the sequences in profile 2 in order to build guide trees which will be
-used to calculate sequence weights, and select appropriate alignment parameters
-for the final profile alignment.
-
-You can skip the first stage (pairwise alignments; guide trees) by using old
-guide tree files (ALIGN PROFILES FROM GUIDE TREES).
-
-The ALIGN SEQUENCES TO PROFILE 1 option will take the sequences in the second
-profile and align them to the first profile, 1 at a time. This is useful to
-add some new sequences to an existing alignment, or to align a set of sequences
-to a known structure. In this case, the second profile set need not be
-pre-aligned.
-
-You can skip the first stage (pairwise alignments; guide tree) by using an old
-guide tree file (ALIGN SEQUENCES TO PROFILE 1 FROM TREE).
-
-SAVE LOG FILE will write the alignment calculation scores to a file. The log
-filename is the same as the input sequence filename, with an extension .log
-appended.
-
-The alignment parameters can be set using the ALIGNMENT PARAMETERS menu,
-Pairwise Parameters, Multiple Parameters and Protein Gap Parameters options.
-These are EXACTLY the same parameters as used by the general, automatic
-multiple alignment procedure. The general multiple alignment procedure is
-simply a series of profile alignments. Carrying out a series of profile
-alignments on larger and larger groups of sequences, allows you to manually
-build up a complete alignment, if necessary editing intermediate alignments.
-
-<STRONG>
-SECONDARY STRUCTURE PARAMETERS
-</STRONG>
-
-Use this menu to set secondary structure options. If a solved structure is
-known, it can be used to guide the alignment by raising gap penalties within
-secondary structure elements, so that gaps will preferentially be inserted into
-unstructured surface loop regions. Alternatively, a user-specified gap penalty
-mask can be supplied for a similar purpose.
-
-A gap penalty mask is a series of numbers between 1 and 9, one per position in
-the alignment. Each number specifies how much the gap opening penalty is to be
-raised at that position (raised by multiplying the basic gap opening penalty
-by the number) i.e. a mask figure of 1 at a position means no change
-in gap opening penalty; a figure of 4 means that the gap opening penalty is
-four times greater at that position, making gaps 4 times harder to open.
-
-The format for gap penalty masks and secondary structure masks is explained in
-a separate help section.
-
->>HELP B <<
- Secondary Structure / Gap Penalty Masks
-
-The use of secondary structure-based penalties has been shown to improve the
-accuracy of sequence alignment. Clustal X now allows secondary structure/ gap
-penalty masks to be supplied with the input sequences used during profile
-alignment. (NB. The secondary structure information is NOT used during multiple
-sequence alignment). The masks work by raising gap penalties in specified
-regions (typically secondary structure elements) so that gaps are
-preferentially opened in the less well conserved regions (typically surface
-loops).
-
-The USE PROFILE 1(2) SECONDARY STRUCTURE / GAP PENALTY MASK options control
-whether the input 2D-structure information or gap penalty masks will be used
-during the profile alignment.
-
-The OUTPUT options control whether the secondary structure and gap penalty
-masks should be included in the Clustal X output alignments. Showing both is
-useful for understanding how the masks work. The 2D-structure information is
-itself useful in judging the alignment quality and in seeing how residue
-conservation patterns vary with secondary structure.
-
-The HELIX and STRAND GAP PENALTY options provide the value for raising the gap
-penalty at core Alpha Helical (A) and Beta Strand (B) residues. In CLUSTAL
-format, capital residues denote the A and B core structure notation. Basic gap
-penalties are multiplied by the amount specified.
-
-The LOOP GAP PENALTY option provides the value for the gap penalty in Loops.
-By default this penalty is not raised. In CLUSTAL format, loops are specified
-by "." in the secondary structure notation.
-
-The SECONDARY STRUCTURE TERMINAL PENALTY provides the value for setting the gap
-penalty at the ends of secondary structures. Ends of secondary structures are
-known to grow or shrink, comparing related structures. Therefore by default
-these are given intermediate values, lower than the core penalties. All
-secondary structure read in as lower case in CLUSTAL format gets the reduced
-terminal penalty.
-
-The HELIX and STRAND TERMINAL POSITIONS options specify the range of structure
-termini for the intermediate penalties. In the alignment output, these are
-indicated as lower case. For Alpha Helices, by default, the range spans the
-end-helical turn (3 residues). For Beta Strands, the default range spans the
-end residue and the adjacent loop residue, since sequence conservation often
-extends beyond the actual H-bonded Beta Strand.
-
-Clustal X can read the masks from SWISS-PROT, CLUSTAL or GDE format input
-files. For many 3-D protein structures, secondary structure information is
-recorded in the feature tables of SWISS-PROT database entries. You should
-always check that the assignments are correct - some are quite inaccurate.
-Clustal X looks for SWISS-PROT HELIX and STRAND assignments e.g.
-
-
-<PRE>
-FT HELIX 100 115
-FT STRAND 118 119
-</PRE>
-
-The structure and penalty masks can also be read from CLUSTAL alignment format
-as comment lines beginning "!SS_" or "!GM_" e.g.
-
-<PRE>
-!SS_HBA_HUMA ..aaaAAAAAAAAAAaaa.aaaAAAAAAAAAAaaaaaaAaaa.........aaaAAAAAA
-!GM_HBA_HUMA 112224444444444222122244444444442222224222111111111222444444
-HBA_HUMA VLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHGK
-</PRE>
-
-Note that the mask itself is a set of numbers between 1 and 9 each of which is
-assigned to the residue(s) in the same column below.
-
-In GDE flat file format, the masks are specified as text and the names must
-begin with "SS_" or "GM_".
-
-Either a structure or penalty mask or both may be used. If both are included
-in an alignment, the user will be asked which is to be used.
-
-
->>HELP T <<
- Phylogenetic Trees
-
-Before calculating a tree, you must have an ALIGNMENT in memory. This can be
-input using the FILE menu, LOAD SEQUENCES option or you should have just
-carried out a full multiple alignment and the alignment is still in memory.
-Remember YOU MUST ALIGN THE SEQUENCES FIRST!!!!
-
-The method used is the NJ (Neighbour Joining) method of Saitou and Nei. First
-you calculate distances (percent divergence) between all pairs of sequences from
-a multiple alignment; second you apply the NJ method to the distance matrix.
-
-To calculate a tree, use the DRAW N-J TREE option. This gives an UNROOTED tree
-and all branch lengths. The root of the tree can only be inferred by using an
-outgroup (a sequence that you are certain branches at the outside of the tree
-.... certain on biological grounds) OR if you assume a degree of constancy in
-the 'molecular clock', you can place the root in the 'middle' of the tree
-(roughly equidistant from all tips).
-
-BOOTSTRAP N-J TREE uses a method for deriving confidence values for the
-groupings in a tree (first adapted for trees by Joe Felsenstein). It involves
-making N random samples of sites from the alignment (N should be LARGE, e.g.
-500 - 1000); drawing N trees (1 from each sample) and counting how many times
-each grouping from the original tree occurs in the sample trees. You can set N
-using the NUMBER OF BOOTSTRAP TRIALS option in the BOOTSTRAP TREE window. In
-practice, you should use a large number of bootstrap replicates (1000 is
-recommended, even if it means running the program for an hour on a slow
-computer). You can also supply a seed number for the random number generator
-here. Different runs with the same seed will give the same answer. See the
-documentation for more details.
-
-EXCLUDE POSITIONS WITH GAPS? With this option, any alignment positions where
-ANY of the sequences have a gap will be ignored. This means that 'like' will
-be compared to 'like' in all distances, which is highly desirable. It also
-automatically throws away the most ambiguous parts of the alignment, which are
-concentrated around gaps (usually). The disadvantage is that you may throw away
-much of the data if there are many gaps (which is why it is difficult for us to
-make it the default).
-
-CORRECT FOR MULTIPLE SUBSTITUTIONS? For small divergence (say <10%) this option
-makes no difference. For greater divergence, this option corrects for the fact
-that observed distances underestimate actual evolutionary distances. This is
-because, as sequences diverge, more than one substitution will happen at many
-sites. However, you only see one difference when you look at the present day
-sequences. Therefore, this option has the effect of stretching branch lengths
-in trees (especially long branches). The corrections used here (for DNA or
-proteins) are both due to Motoo Kimura. See the documentation for details.
-
-Where possible, this option should be used. However, for VERY divergent
-sequences, the distances cannot be reliably corrected. You will be warned if
-this happens. Even if none of the distances in a data set exceed the reliable
-threshold, if you bootstrap the data, some of the bootstrap distances may
-randomly exceed the safe limit.
-
-SAVE LOG FILE will write the tree calculation scores to a file. The log
-filename is the same as the input sequence filename, with an extension .log
-appended.
-
-<H4>
-OUTPUT FORMAT OPTIONS
-</H4>
-
-Four different formats are allowed. None of these displays the tree visually.
-You can display the tree using the NJPLOT program distributed with Clustal X
-OR get the PHYLIP package and use the tree drawing facilities there.
-
-1) CLUSTAL FORMAT TREE. This format is verbose and lists all of the distances
-between the sequences and the number of alignment positions used for each. The
-tree is described at the end of the file. It lists the sequences that are
-joined at each alignment step and the branch lengths. After two sequences are
-joined, it is referred to later as a NODE. The number of a NODE is the number
-of the lowest sequence in that NODE.
-
-2) PHYLIP FORMAT TREE. This format is the New Hampshire format, used by many
-phylogenetic analysis packages. It consists of a series of nested parentheses,
-describing the branching order, with the sequence names and branch lengths. It
-can be read by the NJPLOT program distributed with ClustalX. It can also be
-used by the RETREE, DRAWGRAM and DRAWTREE programs of the PHYLIP package to see
-the trees graphically. This is the same format used during multiple alignment
-for the guide trees. Some other packages that can read and display New
-Hampshire format are TreeTool, TreeView, and Phylowin.
-
-3) PHYLIP DISTANCE MATRIX. This format just outputs a matrix of all the
-pairwise distances in a format that can be used by the PHYLIP package. It used
-to be useful when one could not produce distances from protein sequences in the
-Phylip package but is now redundant (PROTDIST of Phylip 3.5 now does this).
-
-4) NEXUS FORMAT TREE. This format is used by several popular phylogeny programs,
-including PAUP and MacClade. The format is described fully in:
-Maddison, D. R., D. L. Swofford and W. P. Maddison. 1997.
-NEXUS: an extensible file format for systematic information.
-Systematic Biology 46:590-621.
-
-BOOTSTRAP LABELS ON: By default, the bootstrap values are correctly placed on
-the tree branches of the phylip format output tree. The toggle allows them to
-be placed on the nodes, which is incorrect, but some display packages (e.g.
-TreeTool, TreeView and Phylowin) only support node labelling but not branch
-labelling. Care should be taken to note which branches and labels go together.
-
-
->>HELP C <<
- Colors
-
-Clustal X provides a versatile coloring scheme for the sequence alignment
-display. The sequences (or profiles) are colored automatically, when they are
-loaded. Sequences can be colored either by assigning a color to specific
-residues, or on the basis of an alignment consensus. In the latter case, the
-alignment consensus is calculated automatically, and the residues in each
-column are colored according to the consensus character assigned to that
-column. In this way, you can choose to highlight, for example, conserved
-hydrophilic or hydrophobic positions in the alignment.
-
-The 'rules' used to color the alignment are specified in a COLOR PARAMETER
-FILE. Clustal X automatically looks for a file called 'colprot.par' for protein
-sequences or 'coldna.par' for DNA, in the current directory. (If you are running
-under UNIX, it then looks in your home directory, and finally in the
-directories in your PATH environment variable).
-
-By default, if no color parameter file is found, protein sequences are colored
-by residue as follows:
-
-<PRE>
- Color Residue Code
-
- ORANGE GPST
- RED HKR
- BLUE FWY
- GREEN ILMV
-</PRE>
-
-In the case of DNA sequences, the default colors are as follows:
-
-<PRE>
- Color Residue Code
-
- ORANGE A
- RED C
- BLUE T
- GREEN G
-</PRE>
-
-
-The default BACKGROUND COLORING option shows the sequence residues using a
-black character on a colored background. It can be switched off to show
-residues as a colored character on a white background.
-
-Either BLACK AND WHITE or DEFAULT COLOR options can be selected. The Color
-option looks first for the color parameter file (as described above) and, if no
-file is found, uses the default residue-specific colors.
-
-You can specify your own coloring scheme by using the LOAD COLOR PARAMETER FILE
-option. The format of the color parameter file is described below.
-
-<H4>
-COLOR PARAMETER FILE
-</H4>
-
-This file is divided into 3 sections:
-
-1) the names and rgb values of the colors
-2) the rules for calculating the consensus
-3) the rules for assigning colors to the residues
-
-An example file is given here.
-
-<PRE>
- --------------------------------------------------------------------
-@rgbindex
-RED 0.9 0.1 0.1
-BLUE 0.1 0.1 0.9
-GREEN 0.1 0.9 0.1
-YELLOW 0.9 0.9 0.0
-
-@consensus
-% = 60% w:l:v:i:m:a:f:c:y:h:p
-# = 80% w:l:v:i:m:a:f:c:y:h:p
-- = 50% e:d
-+ = 60% k:r
-q = 50% q:e
-p = 50% p
-n = 50% n
-t = 50% t:s
-
-@color
-g = RED
-p = YELLOW
-t = GREEN if t:%:#
-n = GREEN if n
-w = BLUE if %:#:p
-k = RED if +
- --------------------------------------------------------------------
-</PRE>
-
-The first section is optional and is identified by the header @rgbindex. If
-this section exists, each color used in the file must be named and the rgb
-values specified (on a scale from 0 to 1). If the rgb index section is not
-found, the following set of hard-coded colors will be used.
-
-<PRE>
-RED 0.9 0.1 0.1
-BLUE 0.1 0.1 0.9
-GREEN 0.1 0.9 0.1
-ORANGE 0.9 0.7 0.3
-CYAN 0.1 0.9 0.9
-PINK 0.9 0.5 0.5
-MAGENTA 0.9 0.1 0.9
-YELLOW 0.9 0.9 0.0
-</PRE>
-
-The second section is optional and is identified by the header @consensus. It
-defines how the consensus is calculated.
-
-The format of each consensus parameter is:-
-
-<PRE>
-c = n% residue_list
-
- where
- c is a character used to identify the parameter.
- n is an integer value used as the percentage cutoff
- point.
- residue_list is a list of residues denoted by a single
- character, delimited by a colon (:).
-</PRE>
-
-For example: # = 60% w:l:v:i
-
-will assign a consensus character # to any column in the alignment which
-contains more than 60% of the residues w,l,v and i.
-
-
-The third section is identified by the header @color, and defines how colors
-are assigned to each residue in the alignment.
-
-The color parameters can take one of two formats:
-
-<PRE>
-1) r = color
-2) r = color if consensus_list
-
- where
- r is a character used to denote a residue.
- color is one of the colors in the GDE color lookup table.
- consensus_list is a list of consensus characters, each denoted by a
- single character, delimited by a colon (:).
-</PRE>
-
-Examples:
-1) g = ORANGE
-
-will color all glycines ORANGE, regardless of the consensus.
-
-2) w = BLUE if w:%:#
-
-will color BLUE any tryptophan which is found in a column with a consensus of
-w, % or #.
-
-
->>HELP Q <<
- Alignment Quality Analysis
-
-<H3>
-QUALITY SCORES
-</H3>
---------------
-
-Clustal X provides an indication of the quality of an alignment by plotting
-a 'conservation score' for each column of the alignment. A high score indicates
-a well-conserved column; a low score indicates low conservation. The quality
-curve is drawn below the alignment.
-
-Two methods are also provided to indicate single residues or sequence segments
-which score badly in the alignment.
-
-Low-scoring residues are expected to occur at a moderate frequency in all the
-sequences because of their steady divergence due to the natural processes of
-evolution. The most divergent sequences are likely to have the most outliers.
-However, the highlighted residues are especially useful in pointing to
-sequence misalignments. Note that clustering of highlighted residues is a
-strong indication of misalignment. This can arise due to various reasons, for
-example:
-
- 1. Partial or total misalignments caused by a failure in the
- alignment algorithm. Usually only in difficult alignment cases.
-
- 2. Partial or total misalignments because at least one of the
- sequences in the given set is partly or completely unrelated to the
- other sequences. It is up to the user to check that the set of
- sequences are alignable.
-
- 3. Frameshift translation errors in a protein sequence causing local
- mismatched regions to be heavily highlighted. These are surprisingly
- common in database entries. If suspected, a 3-frame translation of
- the source DNA needs to be examined.
-
-Occasionally, highlighted residues may point to regions of some biological
-significance. This might happen for example if a protein alignment contains a
-sequence which has acquired new functions relative to the main sequence set. It
-is important to exclude other explanations, such as error or the natural
-divergence of sequences, before invoking a biological explanation.
-
-
-<H3>
-LOW-SCORING SEGMENTS
-</H3>
---------------------
-
-Unreliable regions in the alignment can be highlighted using the Low-Scoring
-Segments option. A sequence-weighted profile is used to indicate any segments
-in the sequences which score badly. Because the profile calculation may take
-some time, an option is provided to calculate LOW-SCORING SEGMENTS. The
-segment display can then be toggled on or off without having to repeat the
-time-consuming calculations.
-
-For details of the low-scoring segment calculation, see the CALCULATION section
-below.
-
-
-<H4>
-LOW-SCORING SEGMENT PARAMETERS
-</H4>
-------------------------------
-
-MINIMUM LENGTH OF SEGMENTS: short segments (or even single residues) can be
-hidden by increasing the minimum length of segments which will be displayed.
-
-DNA MARKING SCALE is used to remove less significant segments from the
-highlighted display. Increase the scale to display more segments; decrease the
-scale to remove the least significant.
-
-
-PROTEIN WEIGHT MATRIX: the scoring table which describes the similarity of each
-amino acid to each other. The matrix is used to calculate the sequence-
-weighted profile scores. There are four 'in-built' Log-Odds matrices offered:
-the Gonnet PAM 80, 120, 250, 350 matrices. A more stringent matrix which only
-gives a high score to identities and the most favoured conservative
-substitutions, may be more suitable when the sequences are closely related. For
-more divergent sequences, it is appropriate to use "softer" matrices which give
-a high score to many other frequent substitutions. This option automatically
-recalculates the low-scoring segments.
-
-
-DNA WEIGHT MATRIX: Two hard-coded matrices are available:
-
-1) IUB. This is the default scoring matrix used by BESTFIT for the comparison
-of nucleic acid sequences. X's and N's are treated as matches to any IUB
-ambiguity symbol. All matches score 1.0; all mismatches for IUB symbols score
-0.9.
-
-2) CLUSTALW(1.6). The previous system used by ClustalW, in which matches score
-1.0 and mismatches score 0. All matches for IUB symbols also score 0.
-
-A new matrix can be read from a file on disk, if the filename consists only
-of lower case characters. The values in the new weight matrix should be
-similarities and should be NEGATIVE for infrequent substitutions.
-
-INPUT FORMAT. The format used for a new matrix is the same as the BLAST
-program. Any lines beginning with a # character are assumed to be comments. The
-first non-comment line should contain a list of amino acids in any order, using
-the 1 letter code, followed by a * character. This should be followed by a
-square matrix of scores, with one row and one column for each amino acid. The
-last row and column of the matrix (corresponding to the * character) contain
-the minimum score over the whole matrix.
-
-<H4>
-QUALITY SCORE PARAMETERS
-</H4>
-------------------------
-
-You can customise the column 'quality scores' plotted underneath the alignment
-display using the following options.
-
-SCORE PLOT SCALE: this is a scalar value from 1 to 10, which can be used to
-change the scale of the quality score plot.
-
-RESIDUE EXCEPTION CUTOFF: this is a scalar value from 1 to 10, which can be
-used to change the number of residue exceptions which are highlighted in the
-alignment display. (For an explanation of this cutoff, see the CALCULATION OF
-RESIDUE EXCEPTIONS section below.)
-
-PROTEIN WEIGHT MATRIX: the scoring table which describes the similarity of
-each amino acid to each other.
-
-DNA WEIGHT MATRIX: two hard-coded matrices are available: IUB and CLUSTALW(1.6).
-
-For more information about the weight matrices, see the help above for
-the Low-scoring Segments Weight Matrix.
-
-For details of the quality score calculations, see the CALCULATION section
-below.
-
-
-<STRONG>
-SHOW LOW-SCORING SEGMENTS
-</STRONG>
-
-The low-scoring segment display can be toggled on or off. This option does not
-recalculate the profile scores.
-
-
-<STRONG>
-SHOW EXCEPTIONAL RESIDUES
-</STRONG>
-
-This option highlights individual residues which score badly in the alignment
-quality calculations. Residues which score exceptionally low are highlighted by
-using a white character on a grey background.
-
-<STRONG>
-SAVE QUALITY SCORES TO FILE
-</STRONG>
-
-The quality scores that are plotted underneath the alignment display can also
-be saved in a text file. Each column in the alignment is written on one line in
-the output file, with the value of the quality score at the end of the line.
-Only the sequences currently selected in the display are written to the file.
-One use for quality scores is to color residues in a protein structure by
-sequence conservation. In this way conserved surface residues can be
-highlighted to locate functional regions such as ligand-binding sites.
-
-
-<H3>
-CALCULATION OF QUALITY SCORES
-</H3>
------------------------------
-
-Suppose we have an alignment of m sequences of length n. Then, the alignment
-can be written as:
-
-<PRE>
- A11 A12 A13 .......... A1n
- A21 A22 A23 .......... A2n
- .
- .
- Am1 Am2 Am3 .......... Amn
-</PRE>
-
-We also have a residue comparison matrix of size R where C(i,j) is the score
-for aligning residue i with residue j.
-
-We want to calculate a score for the conservation of the jth position in the
-alignment.
-
-To do this, we define an R-dimensional sequence space. For the jth position in
-the alignment, each sequence consists of a single residue which is assigned a
-point S in the space. S has R dimensions, and for sequence i, the rth dimension
-is defined as:
-
-<PRE>
- Sr = C(r,Aij)
-</PRE>
-
-We then calculate a consensus value for the jth position in the alignment. This
-value X also has R dimensions, and the rth dimension is defined as:
-
-<PRE>
- Xr = ( SUM (Fij * C(i,r)) ) / m
- 1<=i<=R
-</PRE>
-
-where Fij is the count of residues i at position j in the alignment.
-
-Now we can calculate the distance Di between each sequence i and the consensus
-position X in the R-dimensional space.
-
-<PRE>
- Di = SQRT ( SUM (Xr - Sr)(Xr - Sr) )
- 1<=r<=R
-
-</PRE>
-
-The quality score for the jth position in the alignment is defined as the mean
-of the sequence distances Di.
-
-The score is normalised by multiplying by the percentage of sequences which
-have residues (and not gaps) at this position.
-
-<H3>
-CALCULATION OF RESIDUE EXCEPTIONS
-</H3>
----------------------------------
-
-The jth residue of the ith sequence is considered as an exception if the
-distance Di of the sequence from the consensus value X is greater than (Upper
-Quartile + Inter Quartile Range * Cutoff). The value used as a cutoff for
-displaying exceptions can be set from the SCORE PARAMETERS menu. A high cutoff
-value will only display very significant exceptions; a low value will allow
-more, less significant, exceptions to be highlighted.
-
-(NB. Sequences which contain gaps at this position are not included in the
-exception calculation.)
-
-
-<H3>
-CALCULATION OF LOW-SCORING SEGMENTS
-</H3>
------------------------------------
-
-Suppose we have an alignment of m sequences of length n. Then, the alignment
-can be written as:
-
-<PRE>
- A11 A12 A13 .......... A1n
- A21 A22 A23 .......... A2n
- .
- .
- Am1 Am2 Am3 .......... Amn
-</PRE>
-
-We also have a residue comparison matrix of size R where C(i,j) is the score
-for aligning residue i with residue j.
-
-We calculate sequence weights by building a neighbour-joining tree, in which
-branch lengths are proportional to divergence. Summing the branches by branch
-ownership provides the weights. See Thompson et al., CABIOS, 10, 19 (1994) and
-Henikoff et al., JMB, 243, 574 (1994).
-
-To find the low-scoring segments in a sequence Si, we build a weighted profile
-of the remaining sequences in the alignment. Suppose we find residue r at
-position j in the sequence; then the score for the jth position in the sequence
-is defined as
-
-<PRE>
- Score(Si,j) = Profile(j,r) where Profile(j,r) is the profile score
- for residue r at position j in the
- alignment.
-</PRE>
-
-These residue scores are summed along the sequence in both forward and backward
-directions. If the sum of the scores is positive, then it is reset to zero.
-Segments which score negatively in both directions are considered as
-'low-scoring' and will be highlighted in the alignment display.
-
-
->>HELP 9 <<
- Command Line Parameters
-
- DATA (sequences)
-
--INFILE=file.ext :input sequences
--PROFILE1=file.ext and -PROFILE2=file.ext :profiles (aligned sequences)
-
-
- VERBS (do things)
-
--OPTIONS :list the command line parameters
--HELP or -CHECK :outline the command line parameters
--ALIGN :do full multiple alignment
--TREE :calculate NJ tree
--BOOTSTRAP(=n) :bootstrap a NJ tree (n= number of bootstraps; def. = 1000)
--CONVERT :output the input sequences in a different file format
-
-
- PARAMETERS (set things)
-
-***General settings:****
--INTERACTIVE :read command line, then enter normal interactive menus
--QUICKTREE :use FAST algorithm for the alignment guide tree
--TYPE= :PROTEIN or DNA sequences
--NEGATIVE :protein alignment with negative values in matrix
--OUTFILE= :sequence alignment file name
--OUTPUT= :GCG, GDE, PHYLIP, PIR or NEXUS
--OUTORDER= :INPUT or ALIGNED
--CASE= :LOWER or UPPER (for GDE output only)
--SEQNOS= :OFF or ON (for Clustal output only)
-
-
-***Fast Pairwise Alignments:***
--KTUPLE=n :word size
--TOPDIAGS=n :number of best diags.
--WINDOW=n :window around best diags.
--PAIRGAP=n :gap penalty
--SCORE= :PERCENT or ABSOLUTE
-
-
-***Slow Pairwise Alignments:***
--PWMATRIX= :Protein weight matrix=BLOSUM, PAM, GONNET, ID or filename
--PWDNAMATRIX= :DNA weight matrix=IUB, CLUSTALW or filename
--PWGAPOPEN=f :gap opening penalty
--PWGAPEXT=f :gap extension penalty
-
-
-***Multiple Alignments:***
--NEWTREE= :file for new guide tree
--USETREE= :file for old guide tree
--MATRIX= :Protein weight matrix=BLOSUM, PAM, GONNET, ID or filename
--DNAMATRIX= :DNA weight matrix=IUB, CLUSTALW or filename
--GAPOPEN=f :gap opening penalty
--GAPEXT=f :gap extension penalty
--ENDGAPS :no end gap separation pen.
--GAPDIST=n :gap separation pen. range
--NOPGAP :residue-specific gaps off
--NOHGAP :hydrophilic gaps off
--HGAPRESIDUES= :list hydrophilic res.
--MAXDIV=n :% ident. for delay
--TYPE= :PROTEIN or DNA
--TRANSWEIGHT=f :transitions weighting
-
-
-***Profile Alignments:***
--PROFILE :Merge two alignments by profile alignment
--NEWTREE1= :file for new guide tree for profile1
--NEWTREE2= :file for new guide tree for profile2
--USETREE1= :file for old guide tree for profile1
--USETREE2= :file for old guide tree for profile2
-
-
-***Sequence to Profile Alignments:***
--SEQUENCES :Sequentially add profile2 sequences to profile1 alignment
--NEWTREE= :file for new guide tree
--USETREE= :file for old guide tree
-
-
-***Structure Alignments:***
--NOSECSTR1 :do not use secondary structure/gap penalty mask for profile 1
--NOSECSTR2 :do not use secondary structure/gap penalty mask for profile 2
--SECSTROUT=STRUCTURE or MASK or BOTH or NONE :output in alignment file
--HELIXGAP=n :gap penalty for helix core residues
--STRANDGAP=n :gap penalty for strand core residues
--LOOPGAP=n :gap penalty for loop regions
--TERMINALGAP=n :gap penalty for structure termini
--HELIXENDIN=n :number of residues inside helix to be treated as terminal
--HELIXENDOUT=n :number of residues outside helix to be treated as terminal
--STRANDENDIN=n :number of residues inside strand to be treated as terminal
--STRANDENDOUT=n:number of residues outside strand to be treated as terminal
-
-
-***Trees:***
--OUTPUTTREE=nj OR phylip OR dist OR nexus
--SEED=n :seed number for bootstraps
--KIMURA :use Kimura's correction
--TOSSGAPS :ignore positions with gaps
--BOOTLABELS=node OR branch :position of bootstrap values in tree display
-
-
->>HELP R <<
- References
-
-<STRONG>
-The ClustalX program is described in the manuscript:
-</STRONG>
-
-Thompson,J.D., Gibson,T.J., Plewniak,F., Jeanmougin,F. and Higgins,D.G. (1997)
-The ClustalX windows interface: flexible strategies for multiple sequence
-alignment aided by quality analysis tools. Nucleic Acids Research, 25:4876-4882.
-
-
-<STRONG>
-The ClustalW program is described in the manuscript:
-</STRONG>
-
-Thompson, J.D., Higgins, D.G. and Gibson, T.J. (1994) CLUSTAL W: improving the
-sensitivity of progressive multiple sequence alignment through sequence
-weighting, position-specific gap penalties and weight matrix choice. Nucleic
-Acids Research, 22:4673-4680.
-
-
-<STRONG>
-The ClustalV program is described in the manuscript:
-</STRONG>
-
-Higgins,D.G., Bleasby,A.J. and Fuchs,R. (1992) CLUSTAL V: improved software for
-multiple sequence alignment. CABIOS 8,189-191.
-
-
-<STRONG>
-The original Clustal program is described in the manuscripts:
-</STRONG>
-
-Higgins,D.G. and Sharp,P.M. (1989) Fast and sensitive multiple sequence
-alignments on a microcomputer.
-CABIOS 5,151-153.
-
-Higgins,D.G. and Sharp,P.M. (1988) CLUSTAL: a package for performing multiple
-sequence alignment on a microcomputer. Gene 73,237-244.
-
--------------------------------------------------------------------------------
-<STRONG>
-Some tips on using Clustal X:
-</STRONG>
-
-Jeanmougin,F., Thompson,J.D., Gouy,M., Higgins,D.G. and Gibson,T.J. (1998)
-Multiple sequence alignment with Clustal X. Trends Biochem Sci, 23, 403-5.
-
-<STRONG>
-Some tips on using Clustal W:
-</STRONG>
-
-Higgins, D. G., Thompson, J. D. and Gibson, T. J. (1996) Using CLUSTAL for
-multiple sequence alignments. Methods Enzymol., 266, 383-402.
-
--------------------------------------------------------------------------------
-<STRONG>
-You can get the latest version of the ClustalX program by anonymous ftp to:
-</STRONG>
-
-ftp-igbmc.u-strasbg.fr
-ftp.embl-heidelberg.de
-ftp.ebi.ac.uk
-
-<STRONG>
-Or, have a look at the following WWW site:
-</STRONG>
-
-http://www-igbmc.u-strasbg.fr/BioInfo/
-
-
-This is the on-line help file for Clustal X (version 1.83), using the NCBI
-Vibrant Toolkit.
-
-It should be named or defined as: clustalx_help
-except with MSDOS in which case it should be named ClustalX.HLP
-
-For full details of usage and algorithms, please read the CLUSTALW.DOC file.
-
-
-Toby Gibson EMBL, Heidelberg, Germany.
-Des Higgins UCC, Cork, Ireland.
-Julie Thompson/Francois Jeanmougin IGBMC, Strasbourg, France.
-
-
-
-
->>HELP G <<
- General help for CLUSTAL X (1.83)
-
-Clustal X is a windows interface for the ClustalW multiple sequence alignment
-program. It provides an integrated environment for performing multiple sequence
-and profile alignments and analysing the results. The sequence alignment is
-displayed in a window on the screen. A versatile coloring scheme has been
-incorporated allowing you to highlight conserved features in the alignment.
-The pull-down menus at the top of the window allow you to select all the
-options required for traditional multiple sequence and profile alignment.
-
-You can cut-and-paste sequences to change the order of the alignment; you can
-select a subset of sequences to be aligned; you can select a sub-range of the
-alignment to be realigned and inserted back into the original alignment.
-
-Alignment quality analysis can be performed and low-scoring segments or
-exceptional residues can be highlighted.
-
-ClustalX is available for a number of different platforms including: SUN
-Solaris, IRIX5.3 on Silicon Graphics, Digital UNIX on DECStations, Microsoft
-Windows (32 bit) for PC's, Linux ELF for x86 PC's and Macintosh PowerMac. (See
-the README file for Installation instructions.)
-
-
-<H4>
-SEQUENCE INPUT
-</H4>
-
-Sequences and profiles (a term for pre-existing alignments) are input using
-the FILE menu. Invalid options will be disabled. All sequences must be included
-into 1 file. 7 formats are automatically recognised: NBRF/PIR, EMBL/SWISSPROT,
-Pearson (Fasta), Clustal (*.aln), GCG/MSF (Pileup), GCG9 RSF and GDE flat file.
-All non-alphabetic characters (spaces, digits, punctuation marks) are ignored
-except "-" which is used to indicate a GAP ("." in MSF/RSF).
-
-<H4>
-SEQUENCE / PROFILE ALIGNMENTS
-</H4>
-
-Clustal X has two modes which can be selected using the switch directly above
-the sequence display: MULTIPLE ALIGNMENT MODE and PROFILE ALIGNMENT MODE.
-
-To do a MULTIPLE ALIGNMENT on a set of sequences, make sure MULTIPLE ALIGNMENT
-MODE is selected. A single sequence data area is then displayed. The ALIGNMENT
-menu then allows you to either produce a guide tree for the alignment, or to do
-a multiple alignment following the guide tree, or to do a full multiple
-alignment.
-
-In PROFILE ALIGNMENT MODE, two sequence data areas are displayed, allowing you
-to align 2 alignments (termed profiles). Profiles are also used to add a new
-sequence to an old alignment, or to use secondary structure to guide the
-alignment process. GAPS in the old alignments are indicated using the "-"
-character. PROFILES can be input in ANY of the allowed formats; just use "-"
-(or "." for MSF/RSF) for each gap position. In Profile Alignment Mode, a button
-"Lock Scroll" is displayed which allows you to scroll the two profiles together
-using a single scroll bar. When the Lock Scroll is turned off, the two profiles
-can be scrolled independently.
-
-<H4>
-PHYLOGENETIC TREES
-</H4>
-
-Phylogenetic trees can be calculated from old alignments (read in with "-"
-characters to indicate gaps) OR after a multiple alignment while the alignment
-is still displayed.
-
-<H4>
-ALIGNMENT DISPLAY
-</H4>
-
-The alignment is displayed on the screen with the sequence names on the left
-hand side. The sequence alignment is for display only, it cannot be edited here
-(except for changing the sequence order by cutting-and-pasting on the sequence
-names).
-
-A ruler is displayed below the sequences, starting at 1 for the first residue
-position (residue numbers in the sequence input file are ignored).
-
-A line above the alignment is used to mark strongly conserved positions. Three
-characters ('*', ':' and '.') are used:
-
-'*' indicates positions which have a single, fully conserved residue
-
-':' indicates that one of the following 'strong' groups is fully conserved:-
-<PRE>
- STA
- NEQK
- NHQK
- NDEQ
- QHRK
- MILV
- MILF
- HY
- FYW
-</PRE>
-
-'.' indicates that one of the following 'weaker' groups is fully conserved:-
-<PRE>
- CSA
- ATV
- SAG
- STNK
- STPA
- SGND
- SNDEQK
- NDEQHK
- NEQHRK
- FVLIM
- HFY
-</PRE>
-
-These are all the positively scoring groups that occur in the Gonnet Pam250
-matrix. The strong and weak groups are defined as strong score >0.5 and weak
-score <=0.5 respectively.
-
-For profile alignments, secondary structure and gap penalty masks are displayed
-above the sequences, if any data is found in the profile input file.
-
-
->>HELP F <<
- Input / Output Files
-
-LOAD SEQUENCES reads sequences from one of 7 file formats, replacing any
-sequences that are already loaded. All sequences must be in 1 file. The formats
-that are automatically recognised are: NBRF/PIR, EMBL/SWISSPROT, Pearson
-(Fasta), Clustal (*.aln), GCG/MSF (Pileup), GCG9/RSF and GDE flat file. All
-non-alphabetic characters (spaces, digits, punctuation marks) are ignored
-except "-" which is used to indicate a GAP ("." in MSF/RSF).
-
-The program tries to automatically recognise the different file formats used
-and to guess whether the sequences are amino acid or nucleotide. This is not
-always foolproof.
-
-FASTA and NBRF/PIR formats are recognised by having a ">" as the first
-character in the file.
-
-EMBL/Swiss Prot formats are recognised by the letters "ID" at the start of the
-file (the token for the entry name field).
-
-CLUSTAL format is recognised by the word CLUSTAL at the beginning of the file.
-
-GCG/MSF format is recognised by one of the following:
-<UL>
-<LI>
- - the word PileUp at the start of the file.
-</LI><LI>
- - the word !!AA_MULTIPLE_ALIGNMENT or !!NA_MULTIPLE_ALIGNMENT
- at the start of the file.
-</LI><LI>
- - the word MSF on the first line of the file, and the characters ..
- at the end of this line.
-</LI>
-</UL>
-
-GCG/RSF format is recognised by the word !!RICH_SEQUENCE at the beginning of
-the file.
-
-
-If 85% or more of the characters in the sequence are from A,C,G,T,U or N, the
-sequence will be assumed to be nucleotide. This works in 97.3% of cases but
-watch out!
-
-APPEND SEQUENCES is only valid in MULTIPLE ALIGNMENT MODE. The input sequences
-do not replace those already loaded, but are appended at the end of the
-alignment.
-
-SAVE SEQUENCES AS... offers the user a choice of one of seven output formats:
-CLUSTAL, NBRF/PIR, GCG/MSF, PHYLIP, NEXUS, GDE or FASTA. All sequences are written
-to a single file. Options are available to save a range of the alignment,
-switch between UPPER/LOWER case for GDE files, and to output SEQUENCE NUMBERING
-for CLUSTAL files. Users can also choose to include the residue range numbers
-by appending them to the sequence names.
-
-LOAD PROFILE 1 reads sequences in the same 7 file formats, replacing any
-sequences already loaded as Profile 1. This option will also remove any
-sequences which are loaded in Profile 2.
-
-LOAD PROFILE 2 reads sequences in the same 7 file formats, replacing any
-sequences already loaded as Profile 2.
-
-SAVE PROFILE 1 AS... is similar to the Save Sequences option except that only
-those sequences in Profile 1 will be written to the output file.
-
-SAVE PROFILE 2 AS... is similar to the Save Sequences option except that only
-those sequences in Profile 2 will be written to the output file.
-
-WRITE ALIGNMENT AS POSTSCRIPT will write the sequence display to a postscript
-format file. This will include any secondary structure / gap penalty mask
-information and the consensus and ruler lines which are displayed on the
-screen. The Alignment Quality curve can be optionally included in the output
-file.
-
-WRITE PROFILE 1 AS POSTSCRIPT is similar to WRITE ALIGNMENT AS POSTSCRIPT
-except that only the profile 1 display will be printed.
-
-WRITE PROFILE 2 AS POSTSCRIPT is similar to WRITE ALIGNMENT AS POSTSCRIPT
-except that only the profile 2 display will be printed.
-
-
-<H4>
-POSTSCRIPT PARAMETERS
-</H4>
-
-A number of options are available to allow you to configure your postscript
-output file.
-
-PS COLORS FILE:
-
-The exact RGB values required to reproduce the colors used in the alignment
-window will vary from printer to printer. A PS colors file can be specified
-that contains the RGB values for all the colors required by each of your
-postscript printers.
-
-By default, Clustal X looks for a file called 'colprint.par' in the current
-directory (if you're running under UNIX, it then looks in your home directory,
-and finally in the directories in your PATH environment variable). If no PS
-colors file is found or a color used on the screen is not defined here, the
-screen RGB values (from the Color Parameter File) are used.
-
-The PS colors file consists of one line for each color to be defined, with the
-color name followed by the RGB values (on a scale of 0 to 1). For example,
-
-RED 0.9 0.1 0.1
-
-Blank lines and comments (lines beginning with a '#' character) are ignored.
-
-
-PAGE SIZE: The alignment can be displayed on either A4, A3 or US Letter size
-pages.
-
-ORIENTATION: The alignment can be displayed on either a landscape or portrait
-page.
-
-PRINT HEADER: An optional header including the postscript filename, and
-creation date can be printed at the top of each page.
-
-PRINT QUALITY CURVE: The Alignment Quality curve which is displayed underneath
-the alignment on the screen can be included in the postscript output.
-
-PRINT RULER: The ruler which is displayed underneath the alignment on the
-screen can be included in the postscript output.
-
-PRINT RESIDUE NUMBERS: Sequence residue numbers can be printed at the right
-hand side of the alignment.
-
-RESIZE TO FIT PAGE: By default, the alignment is scaled to fit the page size
-selected. This option can be turned off, in which case a font size of 10 will
-be used for the sequences.
-
-PRINT FROM POSITION/TO: A range of the alignment can be printed. The default
-is to print the full alignment. The first and last residues to be printed are
-specified here.
-
-USE BLOCK LENGTH: The alignment can be divided into blocks of residues. The
-number of residues in a block is specified here. More than one block may then
-be printed on a single page. This is useful for long alignments of a small
-number of sequences. If the block length is set to 0, the alignment will not
-be divided into blocks, but printed across a number of pages.
-
->>HELP E <<
- Editing Alignments
-
-Clustal X allows you to change the order of the sequences in the alignment, by
-cutting-and-pasting the sequence names.
-
-To select a group of sequences to be moved, click on a sequence name and drag
-the cursor until all the required sequences are highlighted. Holding down the
-Shift key when clicking on the first name will add new sequences to those
-already selected.
-
-(Options are provided to Select All Sequences, Select Profile 1 or Select
-Profile 2.)
-
-The selected sequences can be removed from the alignment by using the EDIT
-menu, CUT option.
-
-To add the cut sequences back into an alignment, select a sequence by clicking
-on the sequence name. The cut sequences will be added to the alignment,
-immediately following the selected sequence, by the EDIT menu, PASTE option.
-
-To add the cut sequences to an empty alignment (eg. when cutting sequences from
-Profile 1 and pasting them to Profile 2), click on the empty sequence name
-display area, and select the EDIT menu, PASTE option as before.
-
-The sequence selection and sequence range selection can be cleared using the
-EDIT menu, CLEAR SEQUENCE SELECTION and CLEAR RANGE SELECTION options
-respectively.
-
-To search for a string of residues in the sequences, select the sequences to be
-searched by clicking on the sequence names. You can then enter the string to
-search for by selecting the SEARCH FOR STRING option. If the string is found in
-any of the sequences selected, the sequence name and column number are printed
-below the sequence display.
-
-In PROFILE ALIGNMENT MODE, the two profiles can be merged (normally done after
-alignment) by selecting ADD PROFILE 2 TO PROFILE 1. The sequences currently
-displayed as Profile 2 will be appended to Profile 1.
-
-The REMOVE ALL GAPS option will remove all gaps from the sequences currently
-selected.
-WARNING: This option removes ALL gaps, not only those introduced by ClustalX,
-but also those that were read from the input alignment file. Any secondary
-structure information associated with the alignment will NOT be automatically
-realigned.
-
-The REMOVE GAP-ONLY COLUMNS option will remove those positions in the alignment which
-contain gaps in all sequences. This can occur as a result of removing divergent
-sequences from an alignment, or if an alignment has been realigned.
-
->>HELP M <<
- Multiple Alignments
-
-Make sure MULTIPLE ALIGNMENT MODE is selected, using the switch directly above
-the sequence display area. Then, use the ALIGNMENT menu to do multiple
-alignments.
-
-Multiple alignments are carried out in 3 stages:
-
-1) all sequences are compared to each other (pairwise alignments);
-
-2) a dendrogram (like a phylogenetic tree) is constructed, describing the
-approximate groupings of the sequences by similarity (stored in a file).
-
-3) the final multiple alignment is carried out, using the dendrogram as a guide.
-
-The 3 stages are carried out automatically by the DO COMPLETE ALIGNMENT option.
-You can skip the first stages (pairwise alignments; guide tree) by using an old
-guide tree file (DO ALIGNMENT FROM GUIDE TREE); or you can just produce the
-guide tree with no final multiple alignment (PRODUCE GUIDE TREE ONLY).
-
-
-REALIGN SELECTED SEQUENCES is used to realign badly aligned sequences in the
-alignment. Sequences can be selected by clicking on the sequence names - see
-Editing Alignments for more details. The unselected sequences are then 'fixed'
-and a profile is made including only the unselected sequences. Each of the
-selected sequences in turn is then realigned to this profile. The realigned
-sequences will be displayed as a group at the end of the alignment.
-
-
-REALIGN SELECTED SEQUENCE RANGE is used to realign a small region of the
-alignment. A residue range can be selected by clicking on the sequence display
-area. A multiple alignment is then performed, following the 3 stages described
-above, but only using the selected residue range. Finally the new alignment of
-the range is pasted back into the full sequence alignment.
-
-By default, gap penalties are used at each end of the subrange in order to
-penalise terminal gaps. If the REALIGN SEGMENT END GAP PENALTIES option is
-switched off, gaps can be introduced at the ends of the residue range at no
-cost.
-
-
-ALIGNMENT PARAMETERS displays a sub-menu with the following options:
-
-RESET NEW GAPS BEFORE ALIGNMENT will remove any new gaps introduced into the
-sequences during multiple alignment if you wish to change the parameters and
-try again. This only takes effect just before you do a second multiple
-alignment. You can make phylogenetic trees after alignment whether or not this
-is ON. If you turn this OFF, the new gaps are kept even if you do a second
-multiple alignment. This allows you to iterate the alignment gradually.
-Sometimes, the alignment is improved by a second or third pass.
-
-RESET ALL GAPS BEFORE ALIGNMENT will remove all gaps in the sequences including
-gaps which were read in from the sequence input file. This only takes effect
-just before you do a second multiple alignment. You can make phylogenetic
-trees after alignment whether or not this is ON. If you turn this OFF, all
-gaps are kept even if you do a second multiple alignment. This allows you to
-iterate the alignment gradually. Sometimes, the alignment is improved by a
-second or third pass.
-
-
-PAIRWISE ALIGNMENT PARAMETERS control the speed/sensitivity of the initial
-alignments.
-
-MULTIPLE ALIGNMENT PARAMETERS control the gaps in the final multiple
-alignments.
-
-PROTEIN GAP PARAMETERS displays a temporary window which allows you to set
-various parameters only used in the alignment of protein sequences.
-
-(SECONDARY STRUCTURE PARAMETERS, for use with the Profile Alignment Mode only,
-allows you to set various parameters only used with gap penalty masks.)
-
-SAVE LOG FILE will write the alignment calculation scores to a file. The log
-filename is the same as the input sequence filename, with an extension .log
-appended.
-
-
-<H4>
-OUTPUT FORMAT OPTIONS
-</H4>
-
-You can choose from 7 different alignment formats (CLUSTAL, GCG, NBRF/PIR,
-PHYLIP, GDE, NEXUS, FASTA). You can choose more than one (or all 7 if you wish).
-
-CLUSTAL format output is a self explanatory alignment format. It shows the
-sequences aligned in blocks. It can be read in again at a later date to (for
-example) calculate a phylogenetic tree or add in new sequences by profile
-alignment.
-
-GCG output can be used by any of the GCG programs that can work on multiple
-alignments (e.g. PRETTY, PROFILEMAKE, PLOTALIGN). It is the same as the GCG
-.msf format files (multiple sequence file); new in version 7 of GCG.
-
-NEXUS format is used by several phylogeny programs, including PAUP and
-MacClade.
-
-PHYLIP format output can be used for input to the PHYLIP package of Joe
-Felsenstein. This is a very widely used package for doing every imaginable
-form of phylogenetic analysis (MUCH more than the modest introduction
-offered by this program).
-
-NBRF/PIR: this is the same as the standard PIR format with ONE ADDITION. Gap
-characters "-" are used to indicate the positions of gaps in the multiple
-alignment. These files can be re-used as input in any part of clustal that
-allows sequences (or alignments or profiles) to be read in.
-
-FASTA: this is included for compatibility with numerous sequence analysis programs.
-
-GDE: this format is used by the GDE package of Steven Smith and is understood
-by SEQLAB in GCG 9 or later.
-
-GDE OUTPUT CASE: sequences in GDE format may be written in either upper or
-lower case.
-
-CLUSTALW SEQUENCE NUMBERS: residue numbers may be added to the end of the
-alignment lines in clustalw format.
-
-OUTPUT ORDER is used to control the order of the sequences in the output
-alignments. By default, it uses the order in which the sequences were aligned
-(from the guide tree/dendrogram), thus automatically grouping closely related
-sequences. It can be switched to be the same as the original input order.
-
-PARAMETER OUTPUT: This option will save all your parameter settings in a
-parameter file (suffix .par) during alignment. The file can be subsequently
-used to rerun ClustalW using the same parameters.
-
-
-<H3>
-ALIGNMENT PARAMETERS
-</H3>
---------------------
-
-<STRONG>
-PAIRWISE ALIGNMENT PARAMETERS
-</STRONG>
-
-A distance is calculated between every pair of sequences and these are used to
-construct the phylogenetic tree which guides the final multiple alignment. The
-scores are calculated from separate pairwise alignments. These can be
-calculated using 2 methods: dynamic programming (slow but accurate) or by the
-method of Wilbur and Lipman (extremely fast but approximate).
-
-You can choose between the 2 alignment methods using the PAIRWISE ALIGNMENTS
-option. The slow/accurate method is fast enough for short sequences but will be
-VERY SLOW for many (e.g. >100) long (e.g. >1000 residue) sequences.
-
-
-<STRONG>
-SLOW-ACCURATE alignment parameters:
-</STRONG>
-
-These parameters do not have any effect on the speed of the alignments. They
-are used to give initial alignments which are then rescored to give percent
-identity scores. These % scores are the ones which are displayed on the
-screen. The scores are converted to distances for the trees.
-
-Gap Open Penalty: the penalty for opening a gap in the alignment.
-
-Gap Extension Penalty: the penalty for extending a gap by 1 residue.
-
-Protein Weight Matrix: the scoring table which describes the similarity of
-each amino acid to each other.
-
-Load protein matrix: allows you to read in a comparison table from a file.
-
-DNA weight matrix: the scores assigned to matches and mismatches (including
-IUB ambiguity codes).
-
-Load DNA matrix: allows you to read in a comparison table from a file.
-
-See the Multiple alignment parameters, MATRIX option below for details of the
-matrix input format.
-
-
-<STRONG>
-FAST-APPROXIMATE alignment parameters:
-</STRONG>
-
-These similarity scores are calculated from fast, approximate, global align-
-ments, which are controlled by 4 parameters. 2 techniques are used to make
-these alignments very fast: 1) only exactly matching fragments (k-tuples) are
-considered; 2) only the 'best' diagonals (the ones with most k-tuple matches)
-are used.
-
-GAP PENALTY: This is a penalty for each gap in the fast alignments. It has
-little effect on the speed or sensitivity except for extreme values.
-
-K-TUPLE SIZE: This is the size of exactly matching fragment that is used.
-INCREASE for speed (max= 2 for proteins; 4 for DNA), DECREASE for sensitivity.
-For longer sequences (e.g. >1000 residues) you may wish to increase the
-default.
-
-TOP DIAGONALS: The number of k-tuple matches on each diagonal (in an imaginary
-dot-matrix plot) is calculated. Only the best ones (with most matches) are used
-in the alignment. This parameter specifies how many. Decrease for speed;
-increase for sensitivity.
-
-WINDOW SIZE: This is the number of diagonals around each of the 'best'
-diagonals that will be used. Decrease for speed; increase for sensitivity.
-
-
-<STRONG>
-MULTIPLE ALIGNMENT PARAMETERS
-</STRONG>
-
-These parameters control the final multiple alignment. This is the core of the
-program and the details are complicated. To fully understand the use of the
-parameters and the scoring system, you will have to refer to the documentation.
-
-Each step in the final multiple alignment consists of aligning two alignments
-or sequences. This is done progressively, following the branching order in the
-GUIDE TREE. The basic parameters to control this are two gap penalties and the
-scores for various identical/non-identical residues.
-
-The GAP OPENING and EXTENSION PENALTIES can be set here. These control the
-cost of opening up every new gap and the cost of every item in a gap.
-Increasing the gap opening penalty will make gaps less frequent. Increasing
-the gap extension penalty will make gaps shorter. Terminal gaps are not
-penalised.
-
-The DELAY DIVERGENT SEQUENCES switch delays the alignment of the most distantly
-related sequences until after the most closely related sequences have been
-aligned. The setting shows the percent identity level required to delay the
-addition of a sequence; sequences that are less identical than this level to
-any other sequences will be aligned later.
-
-The TRANSITION WEIGHT gives transitions (A<-->G or C<-->T i.e. purine-purine or
-pyrimidine-pyrimidine substitutions) a weight between 0 and 1; a weight of zero
-means that the transitions are scored as mismatches, while a weight of 1 gives
-the transitions the match score. For distantly related DNA sequences, the
-weight should be near to zero; for closely related sequences it can be useful
-to assign a higher score. The default is set to 0.5.
-
-
-The PROTEIN WEIGHT MATRIX option allows you to choose a series of weight
-matrices. For protein alignments, you use a weight matrix to determine the
-similarity of non-identical amino acids. For example, Tyr aligned with Phe is
-usually judged to be 'better' than Tyr aligned with Pro.
-
-There are three 'in-built' series of weight matrices offered. Each consists of
-several matrices which work differently at different evolutionary distances. To
-see the exact details, read the documentation. Crudely, we store several
-matrices in memory, spanning the full range of amino acid distance (from almost
-identical sequences to highly divergent ones). For very similar sequences, it
-is best to use a strict weight matrix which only gives a high score to
-identities and the most favoured conservative substitutions. For more divergent
-sequences, it is appropriate to use "softer" matrices which give a high score
-to many other frequent substitutions.
-
-1) BLOSUM (Henikoff). These matrices appear to be the best available for
-carrying out data base similarity (homology searches). The matrices currently
-used are: Blosum 80, 62, 45 and 30. BLOSUM was the default in earlier Clustal X
-versions.
-
-2) PAM (Dayhoff). These have been extremely widely used since the late '70s. We
-currently use the PAM 20, 60, 120, 350 matrices.
-
-3) GONNET. These matrices were derived using almost the same procedure as the
-Dayhoff one (above) but are much more up to date and are based on a far larger
-data set. They appear to be more sensitive than the Dayhoff series. We
-currently use the GONNET 80, 120, 160, 250 and 350 matrices. This series is the
-default for Clustal X version 1.8.
-
-We also supply an identity matrix which gives a score of 10 to two identical
-amino acids and a score of zero otherwise. This matrix is not very useful.
-
-Load protein matrix: allows you to read in a comparison matrix from a file.
-This can be either a single matrix or a series of matrices (see below for
-format).
-
-
-DNA WEIGHT MATRIX option allows you to select a single matrix (not a series)
-used for aligning nucleic acid sequences. Two hard-coded matrices are available:
-
-1) IUB. This is the default scoring matrix used by BESTFIT for the comparison
-of nucleic acid sequences. X's and N's are treated as matches to any IUB
-ambiguity symbol. All matches score 1.9; all mismatches for IUB symbols score 0.
-
-2) CLUSTALW(1.6). A previous system used by ClustalW, in which matches score
-1.0 and mismatches score 0. All matches for IUB symbols also score 0.
-
-Load DNA matrix: allows you to read in a nucleic acid comparison matrix from a
-file (just one matrix, not a series).
-
-
-SINGLE MATRIX INPUT FORMAT
-The format used for a single matrix is the same as the BLAST program. The
-scores in the new weight matrix should be similarities. You can use negative as
-well as positive values if you wish, although the matrix will be automatically
-adjusted to all positive scores, unless the NEGATIVE MATRIX option is selected.
-Any lines beginning with a # character are assumed to be comments. The first
-non-comment line should contain a list of amino acids in any order, using the 1
-letter code, followed by a * character. This should be followed by a square
-matrix of scores, with one row and one column for each amino acid. The last row
-and column of the matrix (corresponding to the * character) contain the minimum
-score over the whole matrix.
-
-MATRIX SERIES INPUT FORMAT
-ClustalX uses different matrices depending on the mean percent identity of the
-sequences to be aligned. You can specify a series of matrices and the range of
-the percent identity for each matrix in a matrix series file. The file is
-automatically recognised by the word CLUSTAL_SERIES at the beginning of the
-file. Each matrix in the series is then specified on one line which should
-start with the word MATRIX. This is followed by the lower and upper limits of
-the sequence percent identities for which you want to apply the matrix. The
-final entry on the matrix line is the filename of a Blast format matrix file
-(see above for details of the single matrix file format).
-
-Example.
-
-CLUSTAL_SERIES
-
-MATRIX 81 100 /us1/user/julie/matrices/blosum80
-MATRIX 61 80 /us1/user/julie/matrices/blosum62
-MATRIX 31 60 /us1/user/julie/matrices/blosum45
-MATRIX 0 30 /us1/user/julie/matrices/blosum30
-
-
-<STRONG>
-PROTEIN GAP PARAMETERS
-</STRONG>
-
-RESIDUE SPECIFIC PENALTIES are amino acid specific gap penalties that reduce or
-increase the gap opening penalties at each position in the alignment or
-sequence. See the documentation for details. As an example, positions that are
-rich in glycine are more likely to have an adjacent gap than positions that are
-rich in valine.
-
-HYDROPHILIC GAP PENALTIES are used to increase the chances of a gap within a
-run (5 or more residues) of hydrophilic amino acids; these are likely to be
-loop or random coil regions where gaps are more common. The residues that are
-"considered" to be hydrophilic can be entered in HYDROPHILIC RESIDUES.
-
-GAP SEPARATION DISTANCE tries to decrease the chances of gaps being too close
-to each other. Gaps that are less than this distance apart are penalised more
-than other gaps. This does not prevent close gaps; it makes them less frequent,
-promoting a block-like appearance of the alignment.
-
-END GAP SEPARATION treats end gaps just like internal gaps for the purposes of
-avoiding gaps that are too close (set by GAP SEPARATION DISTANCE above). If you
-turn this off, end gaps will be ignored for this purpose. This is useful when
-you wish to align fragments where the end gaps are not biologically meaningful.
-
-
->>HELP P <<
- Profile and Structure Alignments
-
-By PROFILE ALIGNMENT, we mean alignment using existing alignments. Profile
-alignments allow you to store alignments of your favourite sequences and add
-new sequences to them in small bunches at a time. A profile is simply an
-alignment of one or more sequences (e.g. an alignment output file from Clustal
-X). Each input can be a single sequence. One or both sets of input sequences
-may include secondary structure assignments or gap penalty masks to guide the
-alignment.
-
-Make sure PROFILE ALIGNMENT MODE is selected, using the switch directly above
-the sequence display area. Then, use the ALIGNMENT menu to do profile and
-secondary structure alignments.
-
-The profiles can be in any of the allowed input formats with "-" characters
-used to specify gaps (except for GCG/MSF where "." is used).
-
-You have to load the 2 profiles by choosing FILE, LOAD PROFILE 1 and LOAD
-PROFILE 2. Then ALIGNMENT, ALIGN PROFILE 2 TO PROFILE 1 will align the 2
-profiles to each other. Secondary structure masks in either profile can be used
-to guide the alignment. This option compares all the sequences in profile 1
-with all the sequences in profile 2 in order to build guide trees which will be
-used to calculate sequence weights, and select appropriate alignment parameters
-for the final profile alignment.
-
-You can skip the first stage (pairwise alignments; guide trees) by using old
-guide tree files (ALIGN PROFILES FROM GUIDE TREES).
-
-The ALIGN SEQUENCES TO PROFILE 1 option will take the sequences in the second
-profile and align them to the first profile, 1 at a time. This is useful to
-add some new sequences to an existing alignment, or to align a set of sequences
-to a known structure. In this case, the second profile set need not be
-pre-aligned.
-
-You can skip the first stage (pairwise alignments; guide tree) by using an old
-guide tree file (ALIGN SEQUENCES TO PROFILE 1 FROM TREE).
-
-SAVE LOG FILE will write the alignment calculation scores to a file. The log
-filename is the same as the input sequence filename, with an extension .log
-appended.
-
-The alignment parameters can be set using the ALIGNMENT PARAMETERS menu,
-Pairwise Parameters, Multiple Parameters and Protein Gap Parameters options.
-These are EXACTLY the same parameters as used by the general, automatic
-multiple alignment procedure. The general multiple alignment procedure is
-simply a series of profile alignments. Carrying out a series of profile
-alignments on larger and larger groups of sequences, allows you to manually
-build up a complete alignment, if necessary editing intermediate alignments.
-
-<STRONG>
-SECONDARY STRUCTURE PARAMETERS
-</STRONG>
-
-Use this menu to set secondary structure options. If a solved structure is
-known, it can be used to guide the alignment by raising gap penalties within
-secondary structure elements, so that gaps will preferentially be inserted into
-unstructured surface loop regions. Alternatively, a user-specified gap penalty
-mask can be supplied for a similar purpose.
-
-A gap penalty mask is a series of numbers between 1 and 9, one per position in
-the alignment. Each number specifies how much the gap opening penalty is to be
-raised at that position (raised by multiplying the basic gap opening penalty
-by the number) i.e. a mask figure of 1 at a position means no change
-in gap opening penalty; a figure of 4 means that the gap opening penalty is
-four times greater at that position, making gaps 4 times harder to open.
-
-The format for gap penalty masks and secondary structure masks is explained in
-a separate help section.
-
->>HELP B <<
- Secondary Structure / Gap Penalty Masks
-
-The use of secondary structure-based penalties has been shown to improve the
-accuracy of sequence alignment. Clustal X now allows secondary structure/ gap
-penalty masks to be supplied with the input sequences used during profile
-alignment. (NB. The secondary structure information is NOT used during multiple
-sequence alignment). The masks work by raising gap penalties in specified
-regions (typically secondary structure elements) so that gaps are
-preferentially opened in the less well conserved regions (typically surface
-loops).
-
-The USE PROFILE 1(2) SECONDARY STRUCTURE / GAP PENALTY MASK options control
-whether the input 2D-structure information or gap penalty masks will be used
-during the profile alignment.
-
-The OUTPUT options control whether the secondary structure and gap penalty
-masks should be included in the Clustal X output alignments. Showing both is
-useful for understanding how the masks work. The 2D-structure information is
-itself useful in judging the alignment quality and in seeing how residue
-conservation patterns vary with secondary structure.
-
-The HELIX and STRAND GAP PENALTY options provide the value for raising the gap
-penalty at core Alpha Helical (A) and Beta Strand (B) residues. In CLUSTAL
-format, capital residues denote the A and B core structure notation. Basic gap
-penalties are multiplied by the amount specified.
-
-The LOOP GAP PENALTY option provides the value for the gap penalty in Loops.
-By default this penalty is not raised. In CLUSTAL format, loops are specified
-by "." in the secondary structure notation.
-
-The SECONDARY STRUCTURE TERMINAL PENALTY provides the value for setting the gap
-penalty at the ends of secondary structures. Ends of secondary structures are
-known to grow or shrink, comparing related structures. Therefore by default
-these are given intermediate values, lower than the core penalties. All
-secondary structure read in as lower case in CLUSTAL format gets the reduced
-terminal penalty.
-
-The HELIX and STRAND TERMINAL POSITIONS options specify the range of structure
-termini for the intermediate penalties. In the alignment output, these are
-indicated as lower case. For Alpha Helices, by default, the range spans the
-end-helical turn (3 residues). For Beta Strands, the default range spans the
-end residue and the adjacent loop residue, since sequence conservation often
-extends beyond the actual H-bonded Beta Strand.
-
-Clustal X can read the masks from SWISS-PROT, CLUSTAL or GDE format input
-files. For many 3-D protein structures, secondary structure information is
-recorded in the feature tables of SWISS-PROT database entries. You should
-always check that the assignments are correct - some are quite inaccurate.
-Clustal X looks for SWISS-PROT HELIX and STRAND assignments e.g.
-
-
-<PRE>
-FT HELIX 100 115
-FT STRAND 118 119
-</PRE>
-
-The structure and penalty masks can also be read from CLUSTAL alignment format
-as comment lines beginning "!SS_" or "!GM_" e.g.
-
-<PRE>
-!SS_HBA_HUMA ..aaaAAAAAAAAAAaaa.aaaAAAAAAAAAAaaaaaaAaaa.........aaaAAAAAA
-!GM_HBA_HUMA 112224444444444222122244444444442222224222111111111222444444
-HBA_HUMA VLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHGK
-</PRE>
-
-Note that the mask itself is a set of numbers between 1 and 9 each of which is
-assigned to the residue(s) in the same column below.
-
-In GDE flat file format, the masks are specified as text and the names must
-begin with "SS_" or "GM_".
-
-Either a structure or penalty mask or both may be used. If both are included
-in an alignment, the user will be asked which is to be used.
-
-
->>HELP T <<
- Phylogenetic Trees
-
-Before calculating a tree, you must have an ALIGNMENT in memory. This can be
-input using the FILE menu, LOAD SEQUENCES option or you should have just
-carried out a full multiple alignment and the alignment is still in memory.
-Remember YOU MUST ALIGN THE SEQUENCES FIRST!!!!
-
-The method used is the NJ (Neighbour Joining) method of Saitou and Nei. First
-you calculate distances (percent divergence) between all pairs of sequence from
-a multiple alignment; second you apply the NJ method to the distance matrix.
-
-To calculate a tree, use the DRAW N-J TREE option. This gives an UNROOTED tree
-and all branch lengths. The root of the tree can only be inferred by using an
-outgroup (a sequence that you are certain branches at the outside of the tree
-.... certain on biological grounds) OR if you assume a degree of constancy in
-the 'molecular clock', you can place the root in the 'middle' of the tree
-(roughly equidistant from all tips).
-
-BOOTSTRAP N-J TREE uses a method for deriving confidence values for the
-groupings in a tree (first adapted for trees by Joe Felsenstein). It involves
-making N random samples of sites from the alignment (N should be LARGE, e.g.
-500 - 1000); drawing N trees (1 from each sample) and counting how many times
-each grouping from the original tree occurs in the sample trees. You can set N
-using the NUMBER OF BOOTSTRAP TRIALS option in the BOOTSTRAP TREE window. In
-practice, you should use a large number of bootstrap replicates (1000 is
-recommended, even if it means running the program for an hour on a slow
-computer). You can also supply a seed number for the random number generator
-here. Different runs with the same seed will give the same answer. See the
-documentation for more details.
-
-EXCLUDE POSITIONS WITH GAPS? With this option, any alignment positions where
-ANY of the sequences have a gap will be ignored. This means that 'like' will
-be compared to 'like' in all distances, which is highly desirable. It also
-automatically throws away the most ambiguous parts of the alignment, which are
-concentrated around gaps (usually). The disadvantage is that you may throw away
-much of the data if there are many gaps (which is why it is difficult for us to
-make it the default).
-
-CORRECT FOR MULTIPLE SUBSTITUTIONS? For small divergence (say <10%) this option
-makes no difference. For greater divergence, this option corrects for the fact
-that observed distances underestimate actual evolutionary distances. This is
-because, as sequences diverge, more than one substitution will happen at many
-sites. However, you only see one difference when you look at the present day
-sequences. Therefore, this option has the effect of stretching branch lengths
-in trees (especially long branches). The corrections used here (for DNA or
-proteins) are both due to Motoo Kimura. See the documentation for details.
-
-Where possible, this option should be used. However, for VERY divergent
-sequences, the distances cannot be reliably corrected. You will be warned if
-this happens. Even if none of the distances in a data set exceed the reliable
-threshold, if you bootstrap the data, some of the bootstrap distances may
-randomly exceed the safe limit.
-
-SAVE LOG FILE will write the tree calculation scores to a file. The log
-filename is the same as the input sequence filename, with an extension .log
-appended.
-
-<H4>
-OUTPUT FORMAT OPTIONS
-</H4>
-
-Four different formats are allowed. None of these displays the tree visually.
-You can display the tree using the NJPLOT program distributed with Clustal X
-OR get the PHYLIP package and use the tree drawing facilities there.
-
-1) CLUSTAL FORMAT TREE. This format is verbose and lists all of the distances
-between the sequences and the number of alignment positions used for each. The
-tree is described at the end of the file. It lists the sequences that are
-joined at each alignment step and the branch lengths. After two sequences are
-joined, it is referred to later as a NODE. The number of a NODE is the number
-of the lowest sequence in that NODE.
-
-2) PHYLIP FORMAT TREE. This format is the New Hampshire format, used by many
-phylogenetic analysis packages. It consists of a series of nested parentheses,
-describing the branching order, with the sequence names and branch lengths. It
-can be read by the NJPLOT program distributed with ClustalX. It can also be
-used by the RETREE, DRAWGRAM and DRAWTREE programs of the PHYLIP package to see
-the trees graphically. This is the same format used during multiple alignment
-for the guide trees. Some other packages that can read and display New
-Hampshire format are TreeTool, TreeView, and Phylowin.
-
-3) PHYLIP DISTANCE MATRIX. This format just outputs a matrix of all the
-pairwise distances in a format that can be used by the PHYLIP package. It used
-to be useful when one could not produce distances from protein sequences in the
-Phylip package but is now redundant (PROTDIST of Phylip 3.5 now does this).
-
-4) NEXUS FORMAT TREE. This format is used by several popular phylogeny programs,
-including PAUP and MacClade. The format is described fully in:
-Maddison, D. R., D. L. Swofford and W. P. Maddison. 1997.
-NEXUS: an extensible file format for systematic information.
-Systematic Biology 46:590-621.
-
-BOOTSTRAP LABELS ON: By default, the bootstrap values are correctly placed on
-the tree branches of the phylip format output tree. The toggle allows them to
-be placed on the nodes, which is incorrect, but some display packages (e.g.
-TreeTool, TreeView and Phylowin) only support node labelling but not branch
-labelling. Care should be taken to note which branches and labels go together.
-
-
->>HELP C <<
- Colors
-
-Clustal X provides a versatile coloring scheme for the sequence alignment
-display. The sequences (or profiles) are colored automatically, when they are
-loaded. Sequences can be colored either by assigning a color to specific
-residues, or on the basis of an alignment consensus. In the latter case, the
-alignment consensus is calculated automatically, and the residues in each
-column are colored according to the consensus character assigned to that
-column. In this way, you can choose to highlight, for example, conserved
-hydrophilic or hydrophobic positions in the alignment.
-
-The 'rules' used to color the alignment are specified in a COLOR PARAMETER
-FILE. Clustal X automatically looks for a file called 'colprot.par' for protein
-sequences or 'coldna.par' for DNA, in the current directory. (If you are running
-under UNIX, it then looks in your home directory, and finally in the
-directories in your PATH environment variable).
-
-By default, if no color parameter file is found, protein sequences are colored
-by residue as follows:
-
-<PRE>
- Color Residue Code
-
- ORANGE GPST
- RED HKR
- BLUE FWY
- GREEN ILMV
-</PRE>
-
-In the case of DNA sequences, the default colors are as follows:
-
-<PRE>
- Color Residue Code
-
- ORANGE A
- RED C
- BLUE T
- GREEN G
-</PRE>
-
-
-The default BACKGROUND COLORING option shows the sequence residues using a
-black character on a colored background. It can be switched off to show
-residues as a colored character on a white background.
-
-Either BLACK AND WHITE or DEFAULT COLOR options can be selected. The Color
-option looks first for the color parameter file (as described above) and, if no
-file is found, uses the default residue-specific colors.
-
-You can specify your own coloring scheme by using the LOAD COLOR PARAMETER FILE
-option. The format of the color parameter file is described below.
-
-<H4>
-COLOR PARAMETER FILE
-</H4>
-
-This file is divided into 3 sections:
-
-1) the names and rgb values of the colors
-2) the rules for calculating the consensus
-3) the rules for assigning colors to the residues
-
-An example file is given here.
-
-<PRE>
- --------------------------------------------------------------------
-@rgbindex
-RED 0.9 0.1 0.1
-BLUE 0.1 0.1 0.9
-GREEN 0.1 0.9 0.1
-YELLOW 0.9 0.9 0.0
-
-@consensus
-% = 60% w:l:v:i:m:a:f:c:y:h:p
-# = 80% w:l:v:i:m:a:f:c:y:h:p
-- = 50% e:d
-+ = 60% k:r
-q = 50% q:e
-p = 50% p
-n = 50% n
-t = 50% t:s
-
-@color
-g = RED
-p = YELLOW
-t = GREEN if t:%:#
-n = GREEN if n
-w = BLUE if %:#:p
-k = RED if +
- --------------------------------------------------------------------
-</PRE>
-
-The first section is optional and is identified by the header @rgbindex. If
-this section exists, each color used in the file must be named and the rgb
-values specified (on a scale from 0 to 1). If the rgb index section is not
-found, the following set of hard-coded colors will be used.
-
-<PRE>
-RED 0.9 0.1 0.1
-BLUE 0.1 0.1 0.9
-GREEN 0.1 0.9 0.1
-ORANGE 0.9 0.7 0.3
-CYAN 0.1 0.9 0.9
-PINK 0.9 0.5 0.5
-MAGENTA 0.9 0.1 0.9
-YELLOW 0.9 0.9 0.0
-</PRE>
-
-The second section is optional and is identified by the header @consensus. It
-defines how the consensus is calculated.
-
-The format of each consensus parameter is:-
-
-<PRE>
-c = n% residue_list
-
- where
- c is a character used to identify the parameter.
- n is an integer value used as the percentage cutoff
- point.
- residue_list is a list of residues denoted by a single
- character, delimited by a colon (:).
-</PRE>
-
-For example: # = 60% w:l:v:i
-
-will assign a consensus character # to any column in the alignment which
-contains more than 60% of the residues w,l,v and i.
-
-
-The third section is identified by the header @color, and defines how colors
-are assigned to each residue in the alignment.
-
-The color parameters can take one of two formats:
-
-<PRE>
-1) r = color
-2) r = color if consensus_list
-
- where
- r is a character used to denote a residue.
- color is one of the colors in the GDE color lookup table.
- consensus_list is a list of consensus characters, each denoted by a
- single character, delimited by a colon (:).
-</PRE>
-
-Examples:
-1) g = ORANGE
-
-will color all glycines ORANGE, regardless of the consensus.
-
-2) w = BLUE if w:%:#
-
-will color BLUE any tryptophan which is found in a column with a consensus of
-w, % or #.
-
-
->>HELP Q <<
- Alignment Quality Analysis
-
-<H3>
-QUALITY SCORES
-</H3>
---------------
-
-Clustal X provides an indication of the quality of an alignment by plotting
-a 'conservation score' for each column of the alignment. A high score indicates
-a well-conserved column; a low score indicates low conservation. The quality
-curve is drawn below the alignment.
-
-Two methods are also provided to indicate single residues or sequence segments
-which score badly in the alignment.
-
-Low-scoring residues are expected to occur at a moderate frequency in all the
-sequences because of their steady divergence due to the natural processes of
-evolution. The most divergent sequences are likely to have the most outliers.
-However, the highlighted residues are especially useful in pointing to
-sequence misalignments. Note that clustering of highlighted residues is a
-strong indication of misalignment. This can arise due to various reasons, for
-example:
-
- 1. Partial or total misalignments caused by a failure in the
- alignment algorithm. Usually only in difficult alignment cases.
-
- 2. Partial or total misalignments because at least one of the
- sequences in the given set is partly or completely unrelated to the
- other sequences. It is up to the user to check that the set of
- sequences are alignable.
-
- 3. Frameshift translation errors in a protein sequence causing local
- mismatched regions to be heavily highlighted. These are surprisingly
- common in database entries. If suspected, a 3-frame translation of
- the source DNA needs to be examined.
-
-Occasionally, highlighted residues may point to regions of some biological
-significance. This might happen for example if a protein alignment contains a
-sequence which has acquired new functions relative to the main sequence set. It
-is important to exclude other explanations, such as error or the natural
-divergence of sequences, before invoking a biological explanation.
-
-
-<H3>
-LOW-SCORING SEGMENTS
-</H3>
---------------------
-
-Unreliable regions in the alignment can be highlighted using the Low-Scoring
-Segments option. A sequence-weighted profile is used to indicate any segments
-in the sequences which score badly. Because the profile calculation may take
-some time, an option is provided to calculate LOW-SCORING SEGMENTS. The
-segment display can then be toggled on or off without having to repeat the
-time-consuming calculations.
-
-For details of the low-scoring segment calculation, see the CALCULATION section
-below.
-
-
-<H4>
-LOW-SCORING SEGMENT PARAMETERS
-</H4>
-------------------------------
-
-MINIMUM LENGTH OF SEGMENTS: short segments (or even single residues) can be
-hidden by increasing the minimum length of segments which will be displayed.
-
-DNA MARKING SCALE is used to remove less significant segments from the
-highlighted display. Increase the scale to display more segments; decrease the
-scale to remove the least significant.
-
-
-PROTEIN WEIGHT MATRIX: the scoring table which describes the similarity of each
-amino acid to each other. The matrix is used to calculate the sequence-
-weighted profile scores. There are four 'in-built' Log-Odds matrices offered:
-the Gonnet PAM 80, 120, 250, 350 matrices. A more stringent matrix which only
-gives a high score to identities and the most favoured conservative
-substitutions, may be more suitable when the sequences are closely related. For
-more divergent sequences, it is appropriate to use "softer" matrices which give
-a high score to many other frequent substitutions. This option automatically
-recalculates the low-scoring segments.
-
-
-DNA WEIGHT MATRIX: Two hard-coded matrices are available:
-
-1) IUB. This is the default scoring matrix used by BESTFIT for the comparison
-of nucleic acid sequences. X's and N's are treated as matches to any IUB
-ambiguity symbol. All matches score 1.0; all mismatches for IUB symbols score
-0.9.
-
-2) CLUSTALW(1.6). The previous system used by ClustalW, in which matches score
-1.0 and mismatches score 0. All matches for IUB symbols also score 0.
-
-A new matrix can be read from a file on disk, if the filename consists only
-of lower case characters. The values in the new weight matrix should be
-similarities and should be NEGATIVE for infrequent substitutions.
-
-INPUT FORMAT. The format used for a new matrix is the same as the BLAST
-program. Any lines beginning with a # character are assumed to be comments. The
-first non-comment line should contain a list of amino acids in any order, using
-the 1 letter code, followed by a * character. This should be followed by a
-square matrix of scores, with one row and one column for each amino acid. The
-last row and column of the matrix (corresponding to the * character) contain
-the minimum score over the whole matrix.
-
-<H4>
-QUALITY SCORE PARAMETERS
-</H4>
-------------------------
-
-You can customise the column 'quality scores' plotted underneath the alignment
-display using the following options.
-
-SCORE PLOT SCALE: this is a scalar value from 1 to 10, which can be used to
-change the scale of the quality score plot.
-
-RESIDUE EXCEPTION CUTOFF: this is a scalar value from 1 to 10, which can be
-used to change the number of residue exceptions which are highlighted in the
-alignment display. (For an explanation of this cutoff, see the CALCULATION OF
-RESIDUE EXCEPTIONS section below.)
-
-PROTEIN WEIGHT MATRIX: the scoring table which describes the similarity of
-each amino acid to each other.
-
-DNA WEIGHT MATRIX: two hard-coded matrices are available: IUB and CLUSTALW(1.6).
-
-For more information about the weight matrices, see the help above for
-the Low-scoring Segments Weight Matrix.
-
-For details of the quality score calculations, see the CALCULATION section
-below.
-
-
-<STRONG>
-SHOW LOW-SCORING SEGMENTS
-</STRONG>
-
-The low-scoring segment display can be toggled on or off. This option does not
-recalculate the profile scores.
-
-
-<STRONG>
-SHOW EXCEPTIONAL RESIDUES
-</STRONG>
-
-This option highlights individual residues which score badly in the alignment
-quality calculations. Residues which score exceptionally low are highlighted by
-using a white character on a grey background.
-
-<STRONG>
-SAVE QUALITY SCORES TO FILE
-</STRONG>
-
-The quality scores that are plotted underneath the alignment display can also
-be saved in a text file. Each column in the alignment is written on one line in
-the output file, with the value of the quality score at the end of the line.
-Only the sequences currently selected in the display are written to the file.
-One use for quality scores is to color residues in a protein structure by
-sequence conservation. In this way conserved surface residues can be
-highlighted to locate functional regions such as ligand-binding sites.
-
-
-<H3>
-CALCULATION OF QUALITY SCORES
-</H3>
------------------------------
-
-Suppose we have an alignment of m sequences of length n. Then, the alignment
-can be written as:
-
-<PRE>
- A11 A12 A13 .......... A1n
- A21 A22 A23 .......... A2n
- .
- .
- Am1 Am2 Am3 .......... Amn
-</PRE>
-
-We also have a residue comparison matrix of size R where C(i,j) is the score
-for aligning residue i with residue j.
-
-We want to calculate a score for the conservation of the jth position in the
-alignment.
-
-To do this, we define an R-dimensional sequence space. For the jth position in
-the alignment, each sequence consists of a single residue which is assigned a
-point S in the space. S has R dimensions, and for sequence i, the rth dimension
-is defined as:
-
-<PRE>
- Sr = C(r,Aij)
-</PRE>
-
-We then calculate a consensus value for the jth position in the alignment. This
-value X also has R dimensions, and the rth dimension is defined as:
-
-<PRE>
- Xr = ( SUM (Fij * C(i,r)) ) / m
- 1<=i<=R
-</PRE>
-
-where Fij is the count of residues i at position j in the alignment.
-
-Now we can calculate the distance Di between each sequence i and the consensus
-position X in the R-dimensional space.
-
-<PRE>
- Di = SQRT ( SUM (Xr - Sr)(Xr - Sr) )
- 1<=r<=R
-
-</PRE>
-
-The quality score for the jth position in the alignment is defined as the mean
-of the sequence distances Di.
-
-The score is normalised by multiplying by the percentage of sequences which
-have residues (and not gaps) at this position.
-
-<H3>
-CALCULATION OF RESIDUE EXCEPTIONS
-</H3>
----------------------------------
-
-The jth residue of the ith sequence is considered as an exception if the
-distance Di of the sequence from the consensus value X is greater than (Upper
-Quartile + Inter Quartile Range * Cutoff). The value used as a cutoff for
-displaying exceptions can be set from the SCORE PARAMETERS menu. A high cutoff
-value will only display very significant exceptions; a low value will allow
-more, less significant, exceptions to be highlighted.
-
-(NB. Sequences which contain gaps at this position are not included in the
-exception calculation.)
-
-
-<H3>
-CALCULATION OF LOW-SCORING SEGMENTS
-</H3>
------------------------------------
-
-Suppose we have an alignment of m sequences of length n. Then, the alignment
-can be written as:
-
-<PRE>
- A11 A12 A13 .......... A1n
- A21 A22 A23 .......... A2n
- .
- .
- Am1 Am2 Am3 .......... Amn
-</PRE>
-
-We also have a residue comparison matrix of size R where C(i,j) is the score
-for aligning residue i with residue j.
-
-We calculate sequence weights by building a neighbour-joining tree, in which
-branch lengths are proportional to divergence. Summing the branches by branch
-ownership provides the weights. See Thompson et al., CABIOS, 10, 19 (1994) and
-Henikoff et al., JMB, 243, 574 (1994).
-
-To find the low-scoring segments in a sequence Si, we build a weighted profile
-of the remaining sequences in the alignment. Suppose we find residue r at
-position j in the sequence; then the score for the jth position in the sequence
-is defined as
-
-<PRE>
- Score(Si,j) = Profile(j,r) where Profile(j,r) is the profile score
- for residue r at position j in the
- alignment.
-</PRE>
-
-These residue scores are summed along the sequence in both forward and backward
-directions. If the sum of the scores is positive, then it is reset to zero.
-Segments which score negatively in both directions are considered as
-'low-scoring' and will be highlighted in the alignment display.
-
-
->>HELP 9 <<
- Command Line Parameters
-
- DATA (sequences)
-
--INFILE=file.ext :input sequences
--PROFILE1=file.ext and -PROFILE2=file.ext :profiles (aligned sequences)
-
-
- VERBS (do things)
-
--OPTIONS :list the command line parameters
--HELP or -CHECK :outline the command line parameters
--ALIGN :do full multiple alignment
--TREE :calculate NJ tree
--BOOTSTRAP(=n) :bootstrap a NJ tree (n= number of bootstraps; def. = 1000)
--CONVERT :output the input sequences in a different file format
-
-
- PARAMETERS (set things)
-
-***General settings:****
--INTERACTIVE :read command line, then enter normal interactive menus
--QUICKTREE :use FAST algorithm for the alignment guide tree
--TYPE= :PROTEIN or DNA sequences
--NEGATIVE :protein alignment with negative values in matrix
--OUTFILE= :sequence alignment file name
--OUTPUT= :CLUSTAL, GCG, GDE, PHYLIP, PIR, NEXUS, FASTA
--OUTORDER= :INPUT or ALIGNED
--CASE= :LOWER or UPPER (for GDE output only)
--SEQNOS= :OFF or ON (for Clustal output only)
-
-
-***Fast Pairwise Alignments:***
--KTUPLE=n :word size
--TOPDIAGS=n :number of best diags.
--WINDOW=n :window around best diags.
--PAIRGAP=n :gap penalty
--SCORE= :PERCENT or ABSOLUTE
-
-
-***Slow Pairwise Alignments:***
--PWMATRIX= :Protein weight matrix=BLOSUM, PAM, GONNET, ID or filename
--PWDNAMATRIX= :DNA weight matrix=IUB, CLUSTALW or filename
--PWGAPOPEN=f :gap opening penalty
--PWGAPEXT=f :gap extension penalty
-
-
-***Multiple Alignments:***
--NEWTREE= :file for new guide tree
--USETREE= :file for old guide tree
--MATRIX= :Protein weight matrix=BLOSUM, PAM, GONNET, ID or filename
--DNAMATRIX= :DNA weight matrix=IUB, CLUSTALW or filename
--GAPOPEN=f :gap opening penalty
--GAPEXT=f :gap extension penalty
--ENDGAPS :no end gap separation pen.
--GAPDIST=n :gap separation pen. range
--NOPGAP :residue-specific gaps off
--NOHGAP :hydrophilic gaps off
--HGAPRESIDUES= :list hydrophilic res.
--MAXDIV=n :% ident. for delay
--TYPE= :PROTEIN or DNA
--TRANSWEIGHT=f :transitions weighting
-
-
-***Profile Alignments:***
--PROFILE :Merge two alignments by profile alignment
--NEWTREE1= :file for new guide tree for profile1
--NEWTREE2= :file for new guide tree for profile2
--USETREE1= :file for old guide tree for profile1
--USETREE2= :file for old guide tree for profile2
-
-
-***Sequence to Profile Alignments:***
--SEQUENCES :Sequentially add profile2 sequences to profile1 alignment
--NEWTREE= :file for new guide tree
--USETREE= :file for old guide tree
-
-
-***Structure Alignments:***
--NOSECSTR1 :do not use secondary structure/gap penalty mask for profile 1
--NOSECSTR2 :do not use secondary structure/gap penalty mask for profile 2
--SECSTROUT=STRUCTURE or MASK or BOTH or NONE :output in alignment file
--HELIXGAP=n :gap penalty for helix core residues
--STRANDGAP=n :gap penalty for strand core residues
--LOOPGAP=n :gap penalty for loop regions
--TERMINALGAP=n :gap penalty for structure termini
--HELIXENDIN=n :number of residues inside helix to be treated as terminal
--HELIXENDOUT=n :number of residues outside helix to be treated as terminal
--STRANDENDIN=n :number of residues inside strand to be treated as terminal
--STRANDENDOUT=n:number of residues outside strand to be treated as terminal
-
-
-***Trees:***
--OUTPUTTREE=nj OR phylip OR dist OR nexus
--SEED=n :seed number for bootstraps
--KIMURA :use Kimura's correction
--TOSSGAPS :ignore positions with gaps
--BOOTLABELS=node OR branch :position of bootstrap values in tree display
-
-
->>HELP R <<
- References
-
-<STRONG>
-The ClustalX program is described in the manuscript:
-</STRONG>
-
-Thompson,J.D., Gibson,T.J., Plewniak,F., Jeanmougin,F. and Higgins,D.G. (1997)
-The ClustalX windows interface: flexible strategies for multiple sequence
-alignment aided by quality analysis tools. Nucleic Acids Research, 25:4876-4882.
-
-
-<STRONG>
-The ClustalW program is described in the manuscript:
-</STRONG>
-
-Thompson, J.D., Higgins, D.G. and Gibson, T.J. (1994) CLUSTAL W: improving the
-sensitivity of progressive multiple sequence alignment through sequence
-weighting, positions-specific gap penalties and weight matrix choice. Nucleic
-Acids Research, 22:4673-4680.
-
-
-<STRONG>
-The ClustalV program is described in the manuscript:
-</STRONG>
-
-Higgins,D.G., Bleasby,A.J. and Fuchs,R. (1992) CLUSTAL V: improved software for
-multiple sequence alignment. CABIOS 8,189-191.
-
-
-<STRONG>
-The original Clustal program is described in the manuscripts:
-</STRONG>
-
-Higgins,D.G. and Sharp,P.M. (1989) Fast and sensitive multiple sequence
-alignments on a microcomputer.
-CABIOS 5,151-153.
-
-Higgins,D.G. and Sharp,P.M. (1988) CLUSTAL: a package for performing multiple
-sequence alignment on a microcomputer. Gene 73,237-244.
-
--------------------------------------------------------------------------------
-<STRONG>
-Some tips on using Clustal X:
-</STRONG>
-
-Jeanmougin,F., Thompson,J.D., Gouy,M., Higgins,D.G. and Gibson,T.J. (1998)
-Multiple sequence alignment with Clustal X. Trends Biochem Sci, 23, 403-5.
-
-<STRONG>
-Some tips on using Clustal W:
-</STRONG>
-
-Higgins, D. G., Thompson, J. D. and Gibson, T. J. (1996) Using CLUSTAL for
-multiple sequence alignments. Methods Enzymol., 266, 383-402.
-
--------------------------------------------------------------------------------
-<STRONG>
-You can get the latest version of the ClustalX program by anonymous ftp to:
-</STRONG>
-
-ftp-igbmc.u-strasbg.fr
-ftp.embl-heidelberg.de
-ftp.ebi.ac.uk
-
-<STRONG>
-Or, have a look at the following WWW site:
-</STRONG>
-
-http://www-igbmc.u-strasbg.fr/BioInfo/
-
Deleted: trunk/packages/clustalw/trunk/coldna.par
===================================================================
--- trunk/packages/clustalw/trunk/coldna.par 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/coldna.par 2007-08-12 15:08:40 UTC (rev 399)
@@ -1,16 +0,0 @@
-# color lookup table - this is optional, if no rgbindex is specified, 8
-# hardcoded colors will be used.
-# A maximum of 16 colors can be specified - any more will be ignored!
-@rgbindex
-RED 0.9 0.2 0.1
-BLUE 0.1 0.5 0.9
-GREEN 0.1 0.8 0.1
-ORANGE 0.9 0.6 0.3
-
-
-@color
-a = RED
-c = BLUE
-g = ORANGE
-t = GREEN
-u = GREEN
Deleted: trunk/packages/clustalw/trunk/colprint.par
===================================================================
--- trunk/packages/clustalw/trunk/colprint.par 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/colprint.par 2007-08-12 15:08:40 UTC (rev 399)
@@ -1,15 +0,0 @@
-WHITE 1.0 1.0 1.0
-YELLOW 1.0 1.0 0.0
-VIOLET 0.4 0.1 0.9
-RED 0.9 0.5 0.4
-BLUE 0.4 0.9 0.9
-PURPLE 0.7 0.6 0.9
-BLACK 0.0 0.0 0.0
-GREY 0.6 0.7 0.7
-PINK 0.8 0.3 0.8
-ORANGE 0.9 0.7 0.3
-CYAN 0.1 0.7 0.7
-PINK 0.9 0.5 0.5
-MAGENTA 0.8 0.3 0.8
-ORANGE 0.9 0.6 0.3
-
Deleted: trunk/packages/clustalw/trunk/colprot.par
===================================================================
--- trunk/packages/clustalw/trunk/colprot.par 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/colprot.par 2007-08-12 15:08:40 UTC (rev 399)
@@ -1,66 +0,0 @@
-# color lookup table - this is optional, if no rgbindex is specified, 8
-# hardcoded colors will be used.
-# A maximum of 16 colors can be specified - any more will be ignored!
-@rgbindex
-RED 0.9 0.2 0.1
-BLUE 0.1 0.5 0.9
-GREEN 0.1 0.8 0.1
-CYAN 0.1 0.7 0.7
-PINK 0.9 0.5 0.5
-MAGENTA 0.8 0.3 0.8
-YELLOW 0.8 0.8 0.0
-ORANGE 0.9 0.6 0.3
-
-@consensus
-% = 60% w:l:v:i:m:a:f:c:y:h:p
-# = 80% w:l:v:i:m:a:f:c:y:h:p
-- = 50% e:d
-+ = 60% k:r
-g = 50% g
-n = 50% n
-q = 50% q:e
-p = 50% p
-t = 50% t:s
-A = 85% a
-C = 85% c
-D = 85% d
-E = 85% e
-F = 85% f
-G = 85% g
-H = 85% h
-I = 85% i
-K = 85% k
-L = 85% l
-M = 85% m
-N = 85% n
-P = 85% p
-Q = 85% q
-R = 85% r
-S = 85% s
-T = 85% t
-V = 85% v
-W = 85% w
-Y = 85% y
-
-@color
-g = ORANGE
-p = YELLOW
-t = GREEN if t:S:T:%:#
-s = GREEN if t:S:T:#
-n = GREEN if n:N:D
-q = GREEN if q:Q:E:+:K:R
-w = BLUE if %:#:A:C:F:H:I:L:M:V:W:Y:P:p
-l = BLUE if %:#:A:C:F:H:I:L:M:V:W:Y:P:p
-v = BLUE if %:#:A:C:F:H:I:L:M:V:W:Y:P:p
-i = BLUE if %:#:A:C:F:H:I:L:M:V:W:Y:P:p
-m = BLUE if %:#:A:C:F:H:I:L:M:V:W:Y:P:p
-a = BLUE if %:#:A:C:F:H:I:L:M:V:W:Y:P:p:T:S:s:G
-f = BLUE if %:#:A:C:F:H:I:L:M:V:W:Y:P:p
-c = BLUE if %:#:A:F:H:I:L:M:V:W:Y:S:P:p
-c = PINK if C
-h = CYAN if %:#:A:C:F:H:I:L:M:V:W:Y:P:p
-y = CYAN if %:#:A:C:F:H:I:L:M:V:W:Y:P:p
-e = MAGENTA if -:D:E:q:Q
-d = MAGENTA if -:D:E:n:N
-k = RED if +:K:R:Q
-r = RED if +:K:R:Q
Deleted: trunk/packages/clustalw/trunk/dayhoff.h
===================================================================
--- trunk/packages/clustalw/trunk/dayhoff.h 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/dayhoff.h 2007-08-12 15:08:40 UTC (rev 399)
@@ -1,45 +0,0 @@
-/* DAYHOFF.H
-
- Table of estimated PAMS (actual no. of substitutions per 100 residues)
- for a range of observed amino acid distances from 75.0% (the first entry
- in the array), in 0.1% increments, up to 93.0%.
-
- These values are used to correct for multiple hits in protein alignments.
- The values below are for observed distances above 74.9%. For values above
- 93%, an arbitrary value of 1000 PAMS (1000% substitution) is used.
-
- These values are derived from a Dayhoff model (1978) of amino acid
- substitution and assume average amino acid composition and that amino
- acids replace each other at the same rate as in the original Dayhoff model.
-
- Up to 75% observed distance, use Kimura's emprical formula to derive
- the correction. For 75% or greater, use this table. Kimura's formula
- is accurate up to about 75% and fails completely above 85%.
-*/
-
-int dayhoff_pams[]={
- 195, /* 75.0% observed d; 195 PAMs estimated = 195% estimated d */
- 196, /* 75.1% observed d; 196 PAMs estimated */
- 197, 198, 199, 200, 200, 201, 202, 203,
- 204, 205, 206, 207, 208, 209, 209, 210, 211, 212,
- 213, 214, 215, 216, 217, 218, 219, 220, 221, 222,
- 223, 224, 226, 227, 228, 229, 230, 231, 232, 233,
- 234, 236, 237, 238, 239, 240, 241, 243, 244, 245,
- 246, 248, 249, 250, /* 250 PAMs = 80.3% observed d */
- 252, 253, 254, 255, 257, 258,
- 260, 261, 262, 264, 265, 267, 268, 270, 271, 273,
- 274, 276, 277, 279, 281, 282, 284, 285, 287, 289,
- 291, 292, 294, 296, 298, 299, 301, 303, 305, 307,
- 309, 311, 313, 315, 317, 319, 321, 323, 325, 328,
- 330, 332, 335, 337, 339, 342, 344, 347, 349, 352,
- 354, 357, 360, 362, 365, 368, 371, 374, 377, 380,
- 383, 386, 389, 393, 396, 399, 403, 407, 410, 414,
- 418, 422, 426, 430, 434, 438, 442, 447, 451, 456,
- 461, 466, 471, 476, 482, 487, 493, 498, 504, 511,
- 517, 524, 531, 538, 545, 553, 560, 569, 577, 586,
- 595, 605, 615, 626, 637, 649, 661, 675, 688, 703,
- 719, 736, 754, 775, 796, 819, 845, 874, 907, 945,
- /* 92.9% observed; 945 PAMs */
- 988 /* 93.0% observed; 988 PAMs */
-};
-
Property changes on: trunk/packages/clustalw/trunk/debian
___________________________________________________________________
Name: mergeWithUpstream
+ 1
Modified: trunk/packages/clustalw/trunk/debian/changelog
===================================================================
--- trunk/packages/clustalw/trunk/debian/changelog 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/debian/changelog 2007-08-12 15:08:40 UTC (rev 399)
@@ -19,8 +19,9 @@
* Debian Menu transition: Apps/Science becomes Applications/Science/Biology.
* Fixed a typo in clustalw.menu (Closes: #428518)
* Updated Steffen's email adress.
+ * Using quilt to manage the changes to the sources.
- -- Charles Plessy <charles-debian-nospam at plessy.org> Sun, 12 Aug 2007 22:19:19 +0900
+ -- Charles Plessy <charles-debian-nospam at plessy.org> Sun, 12 Aug 2007 23:06:39 +0900
clustalw (1.83-1.2) unstable; urgency=high
Modified: trunk/packages/clustalw/trunk/debian/control
===================================================================
--- trunk/packages/clustalw/trunk/debian/control 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/debian/control 2007-08-12 15:08:40 UTC (rev 399)
@@ -3,7 +3,7 @@
Priority: optional
Maintainer: Debian-Med Packaging Team <debian-med-packaging at lists.alioth.debian.org>
Uploaders: Steffen Moeller <moeller at debian.org>, Charles Plessy <charles-debian-nospam at plessy.org>
-Build-Depends: debhelper (>= 5), ncbi-tools6-dev, libvibrant6-dev, lesstif2-dev
+Build-Depends: debhelper (>= 5), ncbi-tools6-dev, libvibrant6-dev, lesstif2-dev, quilt
Standards-Version: 3.7.2
XS-Vcs-Browser: http://svn.debian.org/wsvn/debian-med/trunk/packages/clustalw/trunk/
XS-Vcs-Svn: svn://svn.debian.org/svn/debian-med/trunk/packages/clustalw
Added: trunk/packages/clustalw/trunk/debian/patches/amenu.c.patch
===================================================================
--- trunk/packages/clustalw/trunk/debian/patches/amenu.c.patch (rev 0)
+++ trunk/packages/clustalw/trunk/debian/patches/amenu.c.patch 2007-08-12 15:08:40 UTC (rev 399)
@@ -0,0 +1,130 @@
+Index: clustalw-1.83/amenu.c
+===================================================================
+--- clustalw-1.83.orig/amenu.c
++++ clustalw-1.83/amenu.c
+@@ -184,7 +184,7 @@
+ fprintf(stdout," H. HELP\n");
+ fprintf(stdout," X. EXIT (leave program)\n\n\n");
+
+- getstr("Your choice",lin1);
++ getstr("Your choice",MAXLINE+1,lin1);
+
+ switch(toupper(*lin1)) {
+ case '1': seq_input(FALSE);
+@@ -268,7 +268,7 @@
+ fprintf(stdout," H. HELP\n");
+ fprintf(stdout," or press [RETURN] to go back to main menu\n\n\n");
+
+- getstr("Your choice",lin1);
++ getstr("Your choice",MAXLINE+1,lin1);
+ if(*lin1 == EOS) return;
+
+ switch(toupper(*lin1))
+@@ -361,7 +361,7 @@
+ fprintf(stdout," H. HELP\n");
+ fprintf(stdout," or press [RETURN] to go back to main menu\n\n\n");
+
+- getstr("Your choice",lin1);
++ getstr("Your choice",MAXLINE+1,lin1);
+ if(*lin1 == EOS) return;
+
+ switch(toupper(*lin1))
+@@ -457,7 +457,7 @@
+ fprintf(stdout,"\n\n");
+ fprintf(stdout," H. HELP\n\n\n");
+
+- getstr("Enter number (or [RETURN] to exit)",lin2);
++ getstr("Enter number (or [RETURN] to exit)",MAXLINE+1,lin2);
+ if( *lin2 == EOS) {
+ return;
+ }
+@@ -533,7 +533,7 @@
+ fprintf(stdout,"--\n");
+
+
+- getstr("\n\nEnter number (or [RETURN] to exit)",lin2);
++ getstr("\n\nEnter number (or [RETURN] to exit)",MAXLINE+1,lin2);
+ if(*lin2 == EOS) return(output_struct_penalties);
+
+ switch(toupper(*lin2))
+@@ -602,7 +602,7 @@
+ fprintf(stdout," H. HELP\n");
+ fprintf(stdout," or press [RETURN] to go back to main menu\n\n\n");
+
+- getstr("Your choice",lin1);
++ getstr("Your choice",MAXLINE+1,lin1);
+ if(*lin1 == EOS) return;
+
+ switch(toupper(*lin1))
+@@ -677,7 +677,7 @@
+ fprintf(stdout,"\n");
+ fprintf(stdout," H. HELP\n\n\n");
+
+- getstr("Enter number (or [RETURN] to exit)",lin2);
++ getstr("Enter number (or [RETURN] to exit)",MAXLINE+1,lin2);
+ if(*lin2 == EOS) return;
+
+ switch(toupper(*lin2)) {
+@@ -766,7 +766,7 @@
+ fprintf(stdout,"\n");
+ fprintf(stdout," H. HELP\n\n\n");
+
+- getstr("Enter number (or [RETURN] to exit)",lin2);
++ getstr("Enter number (or [RETURN] to exit)",MAXLINE+1,lin2);
+ if(*lin2 == EOS) return;
+
+ switch(toupper(*lin2)) {
+@@ -907,7 +907,7 @@
+
+ fprintf(stdout," H. HELP\n\n\n");
+
+- getstr("Enter number (or [RETURN] to exit)",lin2);
++ getstr("Enter number (or [RETURN] to exit)",MAXLINE+1,lin2);
+ if( *lin2 == EOS) {
+ if(dnaflag) {
+ dna_pw_go_penalty = pw_go_penalty;
+@@ -1029,7 +1029,7 @@
+ fprintf(stdout," 8. Protein Gap Parameters\n\n");
+ fprintf(stdout," H. HELP\n\n\n");
+
+- getstr("Enter number (or [RETURN] to exit)",lin2);
++ getstr("Enter number (or [RETURN] to exit)",MAXLINE+1,lin2);
+
+ if(*lin2 == EOS) {
+ if(dnaflag) {
+@@ -1122,7 +1122,7 @@
+ fprintf(stdout," 5. Toggle End Gap Separation :%s\n\n",(!use_endgaps) ? "OFF" : "ON");
+ fprintf(stdout," H. HELP\n\n\n");
+
+- getstr("Enter number (or [RETURN] to exit)",lin2);
++ getstr("Enter number (or [RETURN] to exit)",MAXLINE+1,lin2);
+
+ if(*lin2 == EOS) return;
+
+@@ -1136,7 +1136,7 @@
+ case '3':
+ fprintf(stdout,"Hydrophilic Residues Currently: %s\n",hyd_residues);
+
+- getstr("Enter residues (or [RETURN] to quit)",lin1);
++ getstr("Enter residues (or [RETURN] to quit)",MAXLINE+1,lin1);
+ if (*lin1 != EOS) {
+ for (i=0;i<strlen(hyd_residues) && i<26;i++) {
+ c = lin1[i];
+@@ -1188,7 +1188,7 @@
+ fprintf(stdout,"--\n");
+
+
+- getstr("\n\nEnter number (or [RETURN] to exit)",lin2);
++ getstr("\n\nEnter number (or [RETURN] to exit)",MAXLINE+1,lin2);
+ if(*lin2 == EOS) return(matn);
+
+ i=toupper(*lin2)-'0';
+@@ -1223,7 +1223,7 @@
+ fprintf(stdout,"\n%s\n",title);
+ strcpy(line,prompt);
+ strcat(line, "(y/n) ? [y]");
+- getstr(line,lin2);
++ getstr(line,MAXLINE+1,lin2);
+ if ((*lin2 != 'n') && (*lin2 != 'N'))
+ return('y');
+ else
Added: trunk/packages/clustalw/trunk/debian/patches/clustal-help.patch
===================================================================
--- trunk/packages/clustalw/trunk/debian/patches/clustal-help.patch (rev 0)
+++ trunk/packages/clustalw/trunk/debian/patches/clustal-help.patch 2007-08-12 15:08:40 UTC (rev 399)
@@ -0,0 +1,26 @@
+Index: clustalw-1.83/clustalw.c
+===================================================================
+--- clustalw-1.83.orig/clustalw.c
++++ clustalw-1.83/clustalw.c
+@@ -34,7 +34,7 @@
+ #ifdef MSDOS
+ char *help_file_name = "clustalw.hlp";
+ #else
+- char *help_file_name = "clustalw_help";
++ char *help_file_name = "/usr/share/clustalw/clustalw_help";
+ #endif
+
+ sint max_names; /* maximum length of names in current alignment file */
+Index: clustalw-1.83/clustalx.c
+===================================================================
+--- clustalw-1.83.orig/clustalx.c
++++ clustalw-1.83/clustalx.c
+@@ -26,7 +26,7 @@
+ #ifdef MSDOS
+ char *help_file_name = "clustalx.hlp";
+ #else
+- char *help_file_name = "clustalx_help";
++ char *help_file_name = "/usr/share/clustalw/clustalx_help";
+ #endif
+
+ sint max_names; /* maximum length of names in current alignment file */
Added: trunk/packages/clustalw/trunk/debian/patches/clustalw.h.patch
===================================================================
--- trunk/packages/clustalw/trunk/debian/patches/clustalw.h.patch (rev 0)
+++ trunk/packages/clustalw/trunk/debian/patches/clustalw.h.patch 2007-08-12 15:08:40 UTC (rev 399)
@@ -0,0 +1,13 @@
+Index: clustalw-1.83/clustalw.h
+===================================================================
+--- clustalw-1.83.orig/clustalw.h
++++ clustalw-1.83/clustalw.h
+@@ -238,7 +238,7 @@
+ char *blank_to_(char *str);
+ char *upstr(char *str);
+ char *lowstr(char *str);
+-void getstr(char *instr, char *outstr);
++void getstr(char *instr, int n, char *outstr);
+ double getreal(char *instr, double minx, double maxx, double def);
+ int getint(char *instr, int minx, int maxx, int def);
+ void do_system(void);
Added: trunk/packages/clustalw/trunk/debian/patches/clustalx.html.patch
===================================================================
--- trunk/packages/clustalw/trunk/debian/patches/clustalx.html.patch (rev 0)
+++ trunk/packages/clustalw/trunk/debian/patches/clustalx.html.patch 2007-08-12 15:08:40 UTC (rev 399)
@@ -0,0 +1,2123 @@
+Index: clustalw-1.83/clustalx.html
+===================================================================
+--- clustalw-1.83.orig/clustalx.html
++++ clustalw-1.83/clustalx.html
+@@ -2029,6 +2029,2118 @@
+ <P>
+ Thompson,J.D., Gibson,T.J., Plewniak,F., Jeanmougin,F. and Higgins,D.G. (1997)
+ The ClustalX windows interface: flexible strategies for multiple sequence
++alignment aided by quality analysis tools. Nucleic Acids Research, 25:4876-4882.
++</P>
++<P>
++</P>
++<P>
++<STRONG>
++The ClustalW program is described in the manuscript:
++</STRONG>
++</P>
++<P>
++Thompson, J.D., Higgins, D.G. and Gibson, T.J. (1994) CLUSTAL W: improving the
++sensitivity of progressive multiple sequence alignment through sequence
++weighting, positions-specific gap penalties and weight matrix choice. Nucleic
++Acids Research, 22:4673-4680.
++</P>
++<P>
++</P>
++<P>
++<STRONG>
++The ClustalV program is described in the manuscript:
++</STRONG>
++</P>
++<P>
++Higgins,D.G., Bleasby,A.J. and Fuchs,R. (1992) CLUSTAL V: improved software for
++multiple sequence alignment. CABIOS 8,189-191.
++</P>
++<P>
++</P>
++<P>
++<STRONG>
++The original Clustal program is described in the manuscripts:
++</STRONG>
++</P>
++<P>
++Higgins,D.G. and Sharp,P.M. (1989) Fast and sensitive multiple sequence
++alignments on a microcomputer.
++CABIOS 5,151-153.
++</P>
++<P>
++Higgins,D.G. and Sharp,P.M. (1988) CLUSTAL: a package for performing multiple
++sequence alignment on a microcomputer. Gene 73,237-244.
++</P>
++<P>
++<STRONG>
++Some tips on using Clustal X:
++</STRONG>
++</P>
++<P>
++Jeanmougin,F., Thompson,J.D., Gouy,M., Higgins,D.G. and Gibson,T.J. (1998)
++Multiple sequence alignment with Clustal X. Trends Biochem Sci, 23, 403-5.
++</P>
++<P>
++<STRONG>
++Some tips on using Clustal W:
++</STRONG>
++</P>
++<P>
++Higgins, D. G., Thompson, J. D. and Gibson, T. J. (1996) Using CLUSTAL for
++multiple sequence alignments. Methods Enzymol., 266, 383-402.
++</P>
++<P>
++<STRONG>
++You can get the latest version of the ClustalX program by anonymous ftp to:
++</STRONG>
++</P>
++<P>
++ftp-igbmc.u-strasbg.fr
++ftp.embl-heidelberg.de
++ftp.ebi.ac.uk
++</P>
++<P>
++<STRONG>
++Or, have a look at the following WWW site:
++</STRONG>
++</P>
++<P>
++http://www-igbmc.u-strasbg.fr/BioInfo/
++</P>
++<P>
++</P>
++<A HREF="#INDEX"> <EM>Back to Index</EM> </A>
++<HEAD>
++<TITLE>ClustalX Help</TITLE>
++</HEAD>
++<BODY BGCOLOR=white>
++<CENTER><H1>ClustalX Help</H1></CENTER>
++<P>
++You can get the latest version of the ClustalX program here:
++</P>
++<DL><DD>
++<A HREF="ftp://ftp-igbmc.u-strasbg.fr/pub/ClustalX/">
++ftp://ftp-igbmc.u-strasbg.fr/pub/ClustalX/</A>
++</DL>
++<P>For full details of usage and algorithms, please read the <A HREF="clustalw.doc"><EM>ClustalW.Doc</EM></A> file.</P>
++<PRE><EM>
++Toby Gibson EMBL, Heidelberg, Germany.
++Des Higgins UCC, Cork, Ireland.
++Julie Thompson/Francois Jeanmougin IGBMC, Strasbourg, France.
++</EM></PRE>
++<CENTER><H2><A NAME="INDEX">Index</A></H2></CENTER>
++<OL>
++<LI><A HREF="#G"> General help for CLUSTAL X (1.8)
++</A></LI>
++<LI><A HREF="#F"> Input / Output Files
++</A></LI>
++<LI><A HREF="#E"> Editing Alignments
++</A></LI>
++<LI><A HREF="#M"> Multiple Alignments
++</A></LI>
++<LI><A HREF="#P"> Profile and Structure Alignments
++</A></LI>
++<LI><A HREF="#B"> Secondary Structure / Gap Penalty Masks
++</A></LI>
++<LI><A HREF="#T"> Phylogenetic Trees
++</A></LI>
++<LI><A HREF="#C"> Colors
++</A></LI>
++<LI><A HREF="#Q"> Alignment Quality Analysis
++</A></LI>
++<LI><A HREF="#9"> Command Line Parameters
++</A></LI>
++<LI><A HREF="#R"> References
++</A></LI>
++</OL>
++<CENTER><H2><A NAME="G"> General help for CLUSTAL X (1.8)
++</A></H2></CENTER>
++<P>
++</P>
++<P>
++Clustal X is a windows interface for the ClustalW multiple sequence alignment
++program. It provides an integrated environment for performing multiple sequence
++and profile alignments and analysing the results. The sequence alignment is
++displayed in a window on the screen. A versatile coloring scheme has been
++incorporated allowing you to highlight conserved features in the alignment.
++The pull-down menus at the top of the window allow you to select all the
++options required for traditional multiple sequence and profile alignment.
++</P>
++<P>
++You can cut-and-paste sequences to change the order of the alignment; you can
++select a subset of sequences to be aligned; you can select a sub-range of the
++alignment to be realigned and inserted back into the original alignment.
++</P>
++<P>
++Alignment quality analysis can be performed and low-scoring segments or
++exceptional residues can be highlighted.
++</P>
++<P>
++ClustalX is available for a number of different platforms including: SUN
++Solaris, IRIX5.3 on Silicon Graphics, Digital UNIX on DECStations, Microsoft
++Windows (32 bit) for PC's, Linux ELF for x86 PC's and Macintosh PowerMac. (See
++the README file for Installation instructions.)
++</P>
++<P>
++</P>
++<P>
++<H4>
++SEQUENCE INPUT
++</H4>
++</P>
++<P>
++Sequences and profiles (a term for pre-existing alignments) are input using
++the FILE menu. Invalid options will be disabled. All sequences must be included
++into 1 file. 7 formats are automatically recognised: NBRF/PIR, EMBL/SWISSPROT,
++Pearson (Fasta), Clustal (*.aln), GCG/MSF (Pileup), GCG9 RSF and GDE flat file.
++All non-alphabetic characters (spaces, digits, punctuation marks) are ignored
++except "-" which is used to indicate a GAP ("." in MSF/RSF).
++</P>
++<P>
++<H4>
++SEQUENCE / PROFILE ALIGNMENTS
++</H4>
++</P>
++<P>
++Clustal X has two modes which can be selected using the switch directly above
++the sequence display: MULTIPLE ALIGNMENT MODE and PROFILE ALIGNMENT MODE.
++</P>
++<P>
++To do a MULTIPLE ALIGNMENT on a set of sequences, make sure MULTIPLE ALIGNMENT
++MODE is selected. A single sequence data area is then displayed. The ALIGNMENT
++menu then allows you to either produce a guide tree for the alignment, or to do
++a multiple alignment following the guide tree, or to do a full multiple
++alignment.
++</P>
++<P>
++In PROFILE ALIGNMENT MODE, two sequence data areas are displayed, allowing you
++to align 2 alignments (termed profiles). Profiles are also used to add a new
++sequence to an old alignment, or to use secondary structure to guide the
++alignment process. GAPS in the old alignments are indicated using the "-"
++character. PROFILES can be input in ANY of the allowed formats; just use "-"
++(or "." for MSF/RSF) for each gap position. In Profile Alignment Mode, a button
++"Lock Scroll" is displayed which allows you to scroll the two profiles together
++using a single scroll bar. When the Lock Scroll is turned off, the two profiles
++can be scrolled independently.
++</P>
++<P>
++<H4>
++PHYLOGENETIC TREES
++</H4>
++</P>
++<P>
++Phylogenetic trees can be calculated from old alignments (read in with "-"
++characters to indicate gaps) OR after a multiple alignment while the alignment
++is still displayed.
++</P>
++<P>
++<H4>
++ALIGNMENT DISPLAY
++</H4>
++</P>
++<P>
++The alignment is displayed on the screen with the sequence names on the left
++hand side. The sequence alignment is for display only, it cannot be edited here
++(except for changing the sequence order by cutting-and-pasting on the sequence
++names).
++</P>
++<P>
++A ruler is displayed below the sequences, starting at 1 for the first residue
++position (residue numbers in the sequence input file are ignored).
++</P>
++<P>
++A line above the alignment is used to mark strongly conserved positions. Three
++characters ('*', ':' and '.') are used:
++</P>
++<P>
++'*' indicates positions which have a single, fully conserved residue
++</P>
++<P>
++':' indicates that one of the following 'strong' groups is fully conserved:-
++<PRE>
++ STA
++ NEQK
++ NHQK
++ NDEQ
++ QHRK
++ MILV
++ MILF
++ HY
++ FYW
++</PRE>
++</P>
++<P>
++'.' indicates that one of the following 'weaker' groups is fully conserved:-
++<PRE>
++ CSA
++ ATV
++ SAG
++ STNK
++ STPA
++ SGND
++ SNDEQK
++ NDEQHK
++ NEQHRK
++ FVLIM
++ HFY
++</PRE>
++</P>
++<P>
++These are all the positively scoring groups that occur in the Gonnet Pam250
++matrix. The strong and weak groups are defined as strong score >0.5 and weak
++score =<0.5 respectively.
++</P>
++<P>
++For profile alignments, secondary structure and gap penalty masks are displayed
++above the sequences, if any data is found in the profile input file.
++</P>
++<P>
++</P>
++<P>
++</P>
++<A HREF="#INDEX"> <EM>Back to Index</EM> </A>
++<CENTER><H2><A NAME="F"> Input / Output Files
++</A></H2></CENTER>
++<P>
++</P>
++<P>
++LOAD SEQUENCES reads sequences from one of 7 file formats, replacing any
++sequences that are already loaded. All sequences must be in 1 file. The formats
++that are automatically recognised are: NBRF/PIR, EMBL/SWISSPROT, Pearson
++(Fasta), Clustal (*.aln), GCG/MSF (Pileup), GCG9/RSF and GDE flat file. All
++non-alphabetic characters (spaces, digits, punctuation marks) are ignored
++except "-" which is used to indicate a GAP ("." in MSF/RSF).
++</P>
++<P>
++The program tries to automatically recognise the different file formats used
++and to guess whether the sequences are amino acid or nucleotide. This is not
++always foolproof.
++</P>
++<P>
++FASTA and NBRF/PIR formats are recognised by having a ">" as the first
++character in the file.
++</P>
++<P>
++EMBL/Swiss Prot formats are recognised by the letters "ID" at the start of the
++file (the token for the entry name field).
++</P>
++<P>
++CLUSTAL format is recognised by the word CLUSTAL at the beginning of the file.
++</P>
++<P>
++GCG/MSF format is recognised by one of the following:
++<UL>
++<LI>
++ - the word PileUp at the start of the file.
++</LI><LI>
++ - the word !!AA_MULTIPLE_ALIGNMENT or !!NA_MULTIPLE_ALIGNMENT
++ at the start of the file.
++</LI><LI>
++ - the word MSF on the first line of the file, and the characters ..
++ at the end of this line.
++</LI>
++</UL>
++</P>
++<P>
++GCG/RSF format is recognised by the word !!RICH_SEQUENCE at the beginning of
++the file.
++</P>
++<P>
++</P>
++<P>
++If 85% or more of the characters in the sequence are from A,C,G,T,U or N, the
++sequence will be assumed to be nucleotide. This works in 97.3% of cases but
++watch out!
++</P>
++<P>
++APPEND SEQUENCES is only valid in MULTIPLE ALIGNMENT MODE. The input sequences
++do not replace those already loaded, but are appended at the end of the
++alignment.
++</P>
++<P>
++SAVE SEQUENCES AS... offers the user a choice of one of six output formats:
++CLUSTAL, NBRF/PIR, GCG/MSF, PHYLIP, NEXUS or GDE. All sequences are written
++to a single file. Options are available to save a range of the alignment,
++switch between UPPER/LOWER case for GDE files, and to output SEQUENCE NUMBERING
++for CLUSTAL files.
++</P>
++<P>
++LOAD PROFILE 1 reads sequences in the same 7 file formats, replacing any
++sequences already loaded as Profile 1. This option will also remove any
++sequences which are loaded in Profile 2.
++</P>
++<P>
++LOAD PROFILE 2 reads sequences in the same 7 file formats, replacing any
++sequences already loaded as Profile 2.
++</P>
++<P>
++SAVE PROFILE 1 AS... is similar to the Save Sequences option except that only
++those sequences in Profile 1 will be written to the output file.
++</P>
++<P>
++SAVE PROFILE 2 AS... is similar to the Save Sequences option except that only
++those sequences in Profile 2 will be written to the output file.
++</P>
++<P>
++WRITE ALIGNMENT AS POSTSCRIPT will write the sequence display to a postscript
++format file. This will include any secondary structure / gap penalty mask
++information and the consensus and ruler lines which are displayed on the
++screen. The Alignment Quality curve can be optionally included in the output
++file.
++</P>
++<P>
++WRITE PROFILE 1 AS POSTSCRIPT is similar to WRITE ALIGNMENT AS POSTSCRIPT
++except that only the profile 1 display will be printed.
++</P>
++<P>
++WRITE PROFILE 2 AS POSTSCRIPT is similar to WRITE ALIGNMENT AS POSTSCRIPT
++except that only the profile 2 display will be printed.
++</P>
++<P>
++</P>
++<P>
++<H4>
++POSTSCRIPT PARAMETERS
++</H4>
++</P>
++<P>
++A number of options are available to allow you to configure your postscript
++output file.
++</P>
++<P>
++PS COLORS FILE:
++</P>
++<P>
++The exact RGB values required to reproduce the colors used in the alignment
++window will vary from printer to printer. A PS colors file can be specified
++that contains the RGB values for all the colors required by each of your
++postscript printers.
++</P>
++<P>
++By default, Clustal X looks for a file called 'colprint.par' in the current
++directory (if you're running under UNIX, it then looks in your home directory,
++and finally in the directories in your PATH environment variable). If no PS
++colors file is found or a color used on the screen is not defined here, the
++screen RGB values (from the Color Parameter File) are used.
++</P>
++<P>
++The PS colors file consists of one line for each color to be defined, with the
++color name followed by the RGB values (on a scale of 0 to 1). For example,
++</P>
++<P>
++RED 0.9 0.1 0.1
++</P>
++<P>
++Blank lines and comments (lines beginning with a '#' character) are ignored.
++</P>
++<P>
++</P>
++<P>
++PAGE SIZE: The alignment can be displayed on either A4, A3 or US Letter size
++pages.
++</P>
++<P>
++ORIENTATION: The alignment can be displayed on either a landscape or portrait
++page.
++</P>
++<P>
++PRINT HEADER: An optional header including the postscript filename, and
++creation date can be printed at the top of each page.
++</P>
++<P>
++PRINT QUALITY CURVE: The Alignment Quality curve which is displayed underneath
++the alignment on the screen can be included in the postscript output.
++</P>
++<P>
++PRINT RULER: The ruler which is displayed underneath the alignment on the
++screen can be included in the postscript output.
++</P>
++<P>
++PRINT RESIDUE NUMBERS: Sequence residue numbers can be printed at the right
++hand side of the alignment.
++</P>
++<P>
++RESIZE TO FIT PAGE: By default, the alignment is scaled to fit the page size
++selected. This option can be turned off, in which case a font size of 10 will
++be used for the sequences.
++</P>
++<P>
++PRINT FROM POSITION/TO: A range of the alignment can be printed. The default
++is to print the full alignment. The first and last residues to be printed are
++specified here.
++</P>
++<P>
++USE BLOCK LENGTH: The alignment can be divided into blocks of residues. The
++number of residues in a block is specified here. More than one block may then
++be printed on a single page. This is useful for long alignments of a small
++number of sequences. If the block length is set to 0, the alignment will not
++be divided into blocks, but printed across a number of pages.
++</P>
++<P>
++</P>
++<A HREF="#INDEX"> <EM>Back to Index</EM> </A>
++<CENTER><H2><A NAME="E"> Editing Alignments
++</A></H2></CENTER>
++<P>
++</P>
++<P>
++Clustal X allows you to change the order of the sequences in the alignment, by
++cutting-and-pasting the sequence names.
++</P>
++<P>
++To select a group of sequences to be moved, click on a sequence name and drag
++the cursor until all the required sequences are highlighted. Holding down the
++Shift key when clicking on the first name will add new sequences to those
++already selected.
++</P>
++<P>
++(Options are provided to Select All Sequences, Select Profile 1 or Select
++Profile 2.)
++</P>
++<P>
++The selected sequences can be removed from the alignment by using the EDIT
++menu, CUT option.
++</P>
++<P>
++To add the cut sequences back into an alignment, select a sequence by clicking
++on the sequence name. The cut sequences will be added to the alignment,
++immediately following the selected sequence, by the EDIT menu, PASTE option.
++</P>
++<P>
++To add the cut sequences to an empty alignment (eg. when cutting sequences from
++Profile 1 and pasting them to Profile 2), click on the empty sequence name
++display area, and select the EDIT menu, PASTE option as before.
++</P>
++<P>
++The sequence selection and sequence range selection can be cleared using the
++EDIT menu, CLEAR SEQUENCE SELECTION and CLEAR RANGE SELECTION options
++respectively.
++</P>
++<P>
++To search for a string of residues in the sequences, select the sequences to be
++searched by clicking on the sequence names. You can then enter the string to
++search for by selecting the SEARCH FOR STRING option. If the string is found in
++any of the sequences selected, the sequence name and column number are printed
++below the sequence display.
++</P>
++<P>
++In PROFILE ALIGNMENT MODE, the two profiles can be merged (normally done after
++alignment) by selecting ADD PROFILE 2 TO PROFILE 1. The sequences currently
++displayed as Profile 2 will be appended to Profile 1.
++</P>
++<P>
++The REMOVE ALL GAPS option will remove all gaps from the sequences currently
++selected.
++WARNING: This option removes ALL gaps, not only those introduced by ClustalX,
++but also those that were read from the input alignment file. Any secondary
++structure information associated with the alignment will NOT be automatically
++realigned.
++</P>
++<P>
++The REMOVE GAP-ONLY COLUMNS will remove those positions in the alignment which
++contain gaps in all sequences. This can occur as a result of removing divergent
++sequences from an alignment, or if an alignment has been realigned.
++</P>
++<P>
++</P>
++<A HREF="#INDEX"> <EM>Back to Index</EM> </A>
++<CENTER><H2><A NAME="M"> Multiple Alignments
++</A></H2></CENTER>
++<P>
++</P>
++<P>
++Make sure MULTIPLE ALIGNMENT MODE is selected, using the switch directly above
++the sequence display area. Then, use the ALIGNMENT menu to do multiple
++alignments.
++</P>
++<P>
++Multiple alignments are carried out in 3 stages:
++</P>
++<P>
++1) all sequences are compared to each other (pairwise alignments);
++</P>
++<P>
++2) a dendrogram (like a phylogenetic tree) is constructed, describing the
++approximate groupings of the sequences by similarity (stored in a file).
++</P>
++<P>
++3) the final multiple alignment is carried out, using the dendrogram as a guide.
++</P>
++<P>
++The 3 stages are carried out automatically by the DO COMPLETE ALIGNMENT option.
++You can skip the first stages (pairwise alignments; guide tree) by using an old
++guide tree file (DO ALIGNMENT FROM GUIDE TREE); or you can just produce the
++guide tree with no final multiple alignment (PRODUCE GUIDE TREE ONLY).
++</P>
++<P>
++</P>
++<P>
++REALIGN SELECTED SEQUENCES is used to realign badly aligned sequences in the
++alignment. Sequences can be selected by clicking on the sequence names - see
++Editing Alignments for more details. The unselected sequences are then 'fixed'
++and a profile is made including only the unselected sequences. Each of the
++selected sequences in turn is then realigned to this profile. The realigned
++sequences will be displayed as a group at the end of the alignment.
++</P>
++<P>
++</P>
++<P>
++REALIGN SELECTED SEQUENCE RANGE is used to realign a small region of the
++alignment. A residue range can be selected by clicking on the sequence display
++area. A multiple alignment is then performed, following the 3 stages described
++above, but only using the selected residue range. Finally the new alignment of
++the range is pasted back into the full sequence alignment.
++</P>
++<P>
++By default, gap penalties are used at each end of the subrange in order to
++penalise terminal gaps. If the REALIGN SEGMENT END GAP PENALTIES option is
++switched off, gaps can be introduced at the ends of the residue range at no
++cost.
++</P>
++<P>
++</P>
++<P>
++ALIGNMENT PARAMETERS displays a sub-menu with the following options:
++</P>
++<P>
++RESET NEW GAPS BEFORE ALIGNMENT will remove any new gaps introduced into the
++sequences during multiple alignment if you wish to change the parameters and
++try again. This only takes effect just before you do a second multiple
++alignment. You can make phylogenetic trees after alignment whether or not this
++is ON. If you turn this OFF, the new gaps are kept even if you do a second
++multiple alignment. This allows you to iterate the alignment gradually.
++Sometimes, the alignment is improved by a second or third pass.
++</P>
++<P>
++RESET ALL GAPS BEFORE ALIGNMENT will remove all gaps in the sequences including
++gaps which were read in from the sequence input file. This only takes effect
++just before you do a second multiple alignment. You can make phylogenetic
++trees after alignment whether or not this is ON. If you turn this OFF, all
++gaps are kept even if you do a second multiple alignment. This allows you to
++iterate the alignment gradually. Sometimes, the alignment is improved by a
++second or third pass.
++</P>
++<P>
++</P>
++<P>
++PAIRWISE ALIGNMENT PARAMETERS control the speed/sensitivity of the initial
++alignments.
++</P>
++<P>
++MULTIPLE ALIGNMENT PARAMETERS control the gaps in the final multiple
++alignments.
++</P>
++<P>
++PROTEIN GAP PARAMETERS displays a temporary window which allows you to set
++various parameters only used in the alignment of protein sequences.
++</P>
++<P>
++(SECONDARY STRUCTURE PARAMETERS, for use with the Profile Alignment Mode only,
++allows you to set various parameters only used with gap penalty masks.)
++</P>
++<P>
++SAVE LOG FILE will write the alignment calculation scores to a file. The log
++filename is the same as the input sequence filename, with an extension .log
++appended.
++</P>
++<P>
++</P>
++<P>
++<H4>
++OUTPUT FORMAT OPTIONS
++</H4>
++</P>
++<P>
++You can choose from 6 different alignment formats (CLUSTAL, GCG, NBRF/PIR,
++PHYLIP, GDE and NEXUS). You can choose more than one (or all 6 if you wish).
++</P>
++<P>
++CLUSTAL format output is a self explanatory alignment format. It shows the
++sequences aligned in blocks. It can be read in again at a later date to (for
++example) calculate a phylogenetic tree or add in new sequences by profile
++alignment.
++</P>
++<P>
++GCG output can be used by any of the GCG programs that can work on multiple
++alignments (e.g. PRETTY, PROFILEMAKE, PLOTALIGN). It is the same as the GCG
++.msf format files (multiple sequence file); new in version 7 of GCG.
++</P>
++<P>
++NEXUS format is used by several phylogeny programs, including PAUP and
++MacClade.
++</P>
++<P>
++PHYLIP format output can be used for input to the PHYLIP package of Joe
++Felsenstein. This is a very widely used package for doing every imaginable
++form of phylogenetic analysis (MUCH more than the modest introduction
++offered by this program).
++</P>
++<P>
++NBRF/PIR: this is the same as the standard PIR format with ONE ADDITION. Gap
++characters "-" are used to indicate the positions of gaps in the multiple
++alignment. These files can be re-used as input in any part of clustal that
++allows sequences (or alignments or profiles) to be read in.
++</P>
++<P>
++GDE: this format is used by the GDE package of Steven Smith and is understood
++by SEQLAB in GCG 9 or later.
++</P>
++<P>
++GDE OUTPUT CASE: sequences in GDE format may be written in either upper or
++lower case.
++</P>
++<P>
++CLUSTALW SEQUENCE NUMBERS: residue numbers may be added to the end of the
++alignment lines in clustalw format.
++</P>
++<P>
++OUTPUT ORDER is used to control the order of the sequences in the output
++alignments. By default, it uses the order in which the sequences were aligned
++(from the guide tree/dendrogram), thus automatically grouping closely related
++sequences. It can be switched to be the same as the original input order.
++</P>
++<P>
++PARAMETER OUTPUT: This option will save all your parameter settings in a
++parameter file (suffix .par) during alignment. The file can be subsequently
++used to rerun ClustalW using the same parameters.
++</P>
++<P>
++</P>
++<P>
++<H3>
++ALIGNMENT PARAMETERS
++</H3>
++</P>
++<P>
++<STRONG>
++PAIRWISE ALIGNMENT PARAMETERS
++</STRONG>
++</P>
++<P>
++A distance is calculated between every pair of sequences and these are used to
++construct the phylogenetic tree which guides the final multiple alignment. The
++scores are calculated from separate pairwise alignments. These can be
++calculated using 2 methods: dynamic programming (slow but accurate) or by the
++method of Wilbur and Lipman (extremely fast but approximate).
++</P>
++<P>
++You can choose between the 2 alignment methods using the PAIRWISE ALIGNMENTS
++option. The slow/accurate method is fast enough for short sequences but will be
++VERY SLOW for many (e.g. >100) long (e.g. >1000 residue) sequences.
++</P>
++<P>
++</P>
++<P>
++<STRONG>
++SLOW-ACCURATE alignment parameters:
++</STRONG>
++</P>
++<P>
++These parameters do not have any effect on the speed of the alignments. They
++are used to give initial alignments which are then rescored to give percent
++identity scores. These % scores are the ones which are displayed on the
++screen. The scores are converted to distances for the trees.
++</P>
++<P>
++Gap Open Penalty: the penalty for opening a gap in the alignment.
++</P>
++<P>
++Gap Extension Penalty: the penalty for extending a gap by 1 residue.
++</P>
++<P>
++Protein Weight Matrix: the scoring table which describes the similarity of
++each amino acid to each other.
++</P>
++<P>
++Load protein matrix: allows you to read in a comparison table from a file.
++</P>
++<P>
++DNA weight matrix: the scores assigned to matches and mismatches (including
++IUB ambiguity codes).
++</P>
++<P>
++Load DNA matrix: allows you to read in a comparison table from a file.
++</P>
++<P>
++See the Multiple alignment parameters, MATRIX option below for details of the
++matrix input format.
++</P>
++<P>
++</P>
++<P>
++<STRONG>
++FAST-APPROXIMATE alignment parameters:
++</STRONG>
++</P>
++<P>
++These similarity scores are calculated from fast, approximate, global alignments,
++which are controlled by 4 parameters. 2 techniques are used to make
++these alignments very fast: 1) only exactly matching fragments (k-tuples) are
++considered; 2) only the 'best' diagonals (the ones with most k-tuple matches)
++are used.
++</P>
++<P>
++GAP PENALTY: This is a penalty for each gap in the fast alignments. It has
++little effect on the speed or sensitivity except for extreme values.
++</P>
++<P>
++K-TUPLE SIZE: This is the size of exactly matching fragment that is used.
++INCREASE for speed (max= 2 for proteins; 4 for DNA), DECREASE for sensitivity.
++For longer sequences (e.g. >1000 residues) you may wish to increase the
++default.
++</P>
++<P>
++TOP DIAGONALS: The number of k-tuple matches on each diagonal (in an imaginary
++dot-matrix plot) is calculated. Only the best ones (with most matches) are used
++in the alignment. This parameter specifies how many. Decrease for speed;
++increase for sensitivity.
++</P>
++<P>
++WINDOW SIZE: This is the number of diagonals around each of the 'best'
++diagonals that will be used. Decrease for speed; increase for sensitivity.
++</P>
++<P>
++</P>
++<P>
++<STRONG>
++MULTIPLE ALIGNMENT PARAMETERS
++</STRONG>
++</P>
++<P>
++These parameters control the final multiple alignment. This is the core of the
++program and the details are complicated. To fully understand the use of the
++parameters and the scoring system, you will have to refer to the documentation.
++</P>
++<P>
++Each step in the final multiple alignment consists of aligning two alignments
++or sequences. This is done progressively, following the branching order in the
++GUIDE TREE. The basic parameters to control this are two gap penalties and the
++scores for various identical/non-identical residues.
++</P>
++<P>
++The GAP OPENING and EXTENSION PENALTIES can be set here. These control the
++cost of opening up every new gap and the cost of every item in a gap.
++Increasing the gap opening penalty will make gaps less frequent. Increasing
++the gap extension penalty will make gaps shorter. Terminal gaps are not
++penalised.
++</P>
++<P>
++The DELAY DIVERGENT SEQUENCES switch delays the alignment of the most distantly
++related sequences until after the most closely related sequences have been
++aligned. The setting shows the percent identity level required to delay the
++addition of a sequence; sequences that are less identical than this level to
++any other sequences will be aligned later.
++</P>
++<P>
++The TRANSITION WEIGHT gives transitions (A<-->G or C<-->T i.e. purine-purine or
++pyrimidine-pyrimidine substitutions) a weight between 0 and 1; a weight of zero
++means that the transitions are scored as mismatches, while a weight of 1 gives
++the transitions the match score. For distantly related DNA sequences, the
++weight should be near to zero; for closely related sequences it can be useful
++to assign a higher score. The default is set to 0.5.
++</P>
++<P>
++</P>
++<P>
++The PROTEIN WEIGHT MATRIX option allows you to choose a series of weight
++matrices. For protein alignments, you use a weight matrix to determine the
++similarity of non-identical amino acids. For example, Tyr aligned with Phe is
++usually judged to be 'better' than Tyr aligned with Pro.
++</P>
++<P>
++There are three 'in-built' series of weight matrices offered. Each consists of
++several matrices which work differently at different evolutionary distances. To
++see the exact details, read the documentation. Crudely, we store several
++matrices in memory, spanning the full range of amino acid distance (from almost
++identical sequences to highly divergent ones). For very similar sequences, it
++is best to use a strict weight matrix which only gives a high score to
++identities and the most favoured conservative substitutions. For more divergent
++sequences, it is appropriate to use "softer" matrices which give a high score
++to many other frequent substitutions.
++</P>
++<P>
++1) BLOSUM (Henikoff). These matrices appear to be the best available for
++carrying out data base similarity (homology searches). The matrices currently
++used are: Blosum 80, 62, 45 and 30. BLOSUM was the default in earlier Clustal X
++versions.
++</P>
++<P>
++2) PAM (Dayhoff). These have been extremely widely used since the late '70s. We
++currently use the PAM 20, 60, 120, 350 matrices.
++</P>
++<P>
++3) GONNET. These matrices were derived using almost the same procedure as the
++Dayhoff one (above) but are much more up to date and are based on a far larger
++data set. They appear to be more sensitive than the Dayhoff series. We
++currently use the GONNET 80, 120, 160, 250 and 350 matrices. This series is the
++default for Clustal X version 1.8.
++</P>
++<P>
++We also supply an identity matrix which gives a score of 10 to two identical
++amino acids and a score of zero otherwise. This matrix is not very useful.
++</P>
++<P>
++Load protein matrix: allows you to read in a comparison matrix from a file.
++This can be either a single matrix or a series of matrices (see below for
++format).
++</P>
++<P>
++</P>
++<P>
++DNA WEIGHT MATRIX option allows you to select a single matrix (not a series)
++used for aligning nucleic acid sequences. Two hard-coded matrices are available:
++</P>
++<P>
++1) IUB. This is the default scoring matrix used by BESTFIT for the comparison
++of nucleic acid sequences. X's and N's are treated as matches to any IUB
++ambiguity symbol. All matches score 1.9; all mismatches for IUB symbols score 0.
++</P>
++<P>
++2) CLUSTALW(1.6). A previous system used by ClustalW, in which matches score
++1.0 and mismatches score 0. All matches for IUB symbols also score 0.
++</P>
++<P>
++Load DNA matrix: allows you to read in a nucleic acid comparison matrix from a
++file (just one matrix, not a series).
++</P>
++<P>
++</P>
++<P>
++SINGLE MATRIX INPUT FORMAT
++The format used for a single matrix is the same as the BLAST program. The
++scores in the new weight matrix should be similarities. You can use negative as
++well as positive values if you wish, although the matrix will be automatically
++adjusted to all positive scores, unless the NEGATIVE MATRIX option is selected.
++Any lines beginning with a # character are assumed to be comments. The first
++non-comment line should contain a list of amino acids in any order, using the 1
++letter code, followed by a * character. This should be followed by a square
++matrix of scores, with one row and one column for each amino acid. The last row
++and column of the matrix (corresponding to the * character) contain the minimum
++score over the whole matrix.
++</P>
++<P>
++MATRIX SERIES INPUT FORMAT
++ClustalX uses different matrices depending on the mean percent identity of the
++sequences to be aligned. You can specify a series of matrices and the range of
++the percent identity for each matrix in a matrix series file. The file is
++automatically recognised by the word CLUSTAL_SERIES at the beginning of the
++file. Each matrix in the series is then specified on one line which should
++start with the word MATRIX. This is followed by the lower and upper limits of
++the sequence percent identities for which you want to apply the matrix. The
++final entry on the matrix line is the filename of a Blast format matrix file
++(see above for details of the single matrix file format).
++</P>
++<P>
++Example.
++</P>
++<P>
++CLUSTAL_SERIES
++</P>
++<P>
++MATRIX 81 100 /us1/user/julie/matrices/blosum80
++MATRIX 61 80 /us1/user/julie/matrices/blosum62
++MATRIX 31 60 /us1/user/julie/matrices/blosum45
++MATRIX 0 30 /us1/user/julie/matrices/blosum30
++</P>
++<P>
++</P>
++<P>
++<STRONG>
++PROTEIN GAP PARAMETERS
++</STRONG>
++</P>
++<P>
++RESIDUE SPECIFIC PENALTIES are amino acid specific gap penalties that reduce or
++increase the gap opening penalties at each position in the alignment or
++sequence. See the documentation for details. As an example, positions that are
++rich in glycine are more likely to have an adjacent gap than positions that are
++rich in valine.
++</P>
++<P>
++HYDROPHILIC GAP PENALTIES are used to increase the chances of a gap within a
++run (5 or more residues) of hydrophilic amino acids; these are likely to be
++loop or random coil regions where gaps are more common. The residues that are
++"considered" to be hydrophilic can be entered in HYDROPHILIC RESIDUES.
++</P>
++<P>
++GAP SEPARATION DISTANCE tries to decrease the chances of gaps being too close
++to each other. Gaps that are less than this distance apart are penalised more
++than other gaps. This does not prevent close gaps; it makes them less frequent,
++promoting a block-like appearance of the alignment.
++</P>
++<P>
++END GAP SEPARATION treats end gaps just like internal gaps for the purposes of
++avoiding gaps that are too close (set by GAP SEPARATION DISTANCE above). If you
++turn this off, end gaps will be ignored for this purpose. This is useful when
++you wish to align fragments where the end gaps are not biologically meaningful.
++</P>
++<P>
++</P>
++<P>
++</P>
++<A HREF="#INDEX"> <EM>Back to Index</EM> </A>
++<CENTER><H2><A NAME="P"> Profile and Structure Alignments
++</A></H2></CENTER>
++<P>
++</P>
++<P>
++By PROFILE ALIGNMENT, we mean alignment using existing alignments. Profile
++alignments allow you to store alignments of your favourite sequences and add
++new sequences to them in small bunches at a time. A profile is simply an
++alignment of one or more sequences (e.g. an alignment output file from Clustal
++X). Each input can be a single sequence. One or both sets of input sequences
++may include secondary structure assignments or gap penalty masks to guide the
++alignment.
++</P>
++<P>
++Make sure PROFILE ALIGNMENT MODE is selected, using the switch directly above
++the sequence display area. Then, use the ALIGNMENT menu to do profile and
++secondary structure alignments.
++</P>
++<P>
++The profiles can be in any of the allowed input formats with "-" characters
++used to specify gaps (except for GCG/MSF where "." is used).
++</P>
++<P>
++You have to load the 2 profiles by choosing FILE, LOAD PROFILE 1 and LOAD
++PROFILE 2. Then ALIGNMENT, ALIGN PROFILE 2 TO PROFILE 1 will align the 2
++profiles to each other. Secondary structure masks in either profile can be used
++to guide the alignment. This option compares all the sequences in profile 1
++with all the sequences in profile 2 in order to build guide trees which will be
++used to calculate sequence weights, and select appropriate alignment parameters
++for the final profile alignment.
++</P>
++<P>
++You can skip the first stage (pairwise alignments; guide trees) by using old
++guide tree files (ALIGN PROFILES FROM GUIDE TREES).
++</P>
++<P>
++The ALIGN SEQUENCES TO PROFILE 1 option will take the sequences in the second
++profile and align them to the first profile, 1 at a time. This is useful to
++add some new sequences to an existing alignment, or to align a set of sequences
++to a known structure. In this case, the second profile set need not be
++pre-aligned.
++</P>
++<P>
++You can skip the first stage (pairwise alignments; guide tree) by using an old
++guide tree file (ALIGN SEQUENCES TO PROFILE 1 FROM TREE).
++</P>
++<P>
++SAVE LOG FILE will write the alignment calculation scores to a file. The log
++filename is the same as the input sequence filename, with an extension .log
++appended.
++</P>
++<P>
++The alignment parameters can be set using the ALIGNMENT PARAMETERS menu,
++Pairwise Parameters, Multiple Parameters and Protein Gap Parameters options.
++These are EXACTLY the same parameters as used by the general, automatic
++multiple alignment procedure. The general multiple alignment procedure is
++simply a series of profile alignments. Carrying out a series of profile
++alignments on larger and larger groups of sequences, allows you to manually
++build up a complete alignment, if necessary editing intermediate alignments.
++</P>
++<P>
++<STRONG>
++SECONDARY STRUCTURE PARAMETERS
++</STRONG>
++</P>
++<P>
++Use this menu to set secondary structure options. If a solved structure is
++known, it can be used to guide the alignment by raising gap penalties within
++secondary structure elements, so that gaps will preferentially be inserted into
++unstructured surface loop regions. Alternatively, a user-specified gap penalty
++mask can be supplied for a similar purpose.
++</P>
++<P>
++A gap penalty mask is a series of numbers between 1 and 9, one per position in
++the alignment. Each number specifies how much the gap opening penalty is to be
++raised at that position (raised by multiplying the basic gap opening penalty
++by the number) i.e. a mask figure of 1 at a position means no change
++in gap opening penalty; a figure of 4 means that the gap opening penalty is
++four times greater at that position, making gaps 4 times harder to open.
++</P>
++<P>
++The format for gap penalty masks and secondary structure masks is explained in
++a separate help section.
++</P>
++<P>
++</P>
++<A HREF="#INDEX"> <EM>Back to Index</EM> </A>
++<CENTER><H2><A NAME="B"> Secondary Structure / Gap Penalty Masks
++</A></H2></CENTER>
++<P>
++</P>
++<P>
++The use of secondary structure-based penalties has been shown to improve the
++accuracy of sequence alignment. Clustal X now allows secondary structure/ gap
++penalty masks to be supplied with the input sequences used during profile
++alignment. (NB. The secondary structure information is NOT used during multiple
++sequence alignment). The masks work by raising gap penalties in specified
++regions (typically secondary structure elements) so that gaps are
++preferentially opened in the less well conserved regions (typically surface
++loops).
++</P>
++<P>
++The USE PROFILE 1(2) SECONDARY STRUCTURE / GAP PENALTY MASK options control
++whether the input 2D-structure information or gap penalty masks will be used
++during the profile alignment.
++</P>
++<P>
++The OUTPUT options control whether the secondary structure and gap penalty
++masks should be included in the Clustal X output alignments. Showing both is
++useful for understanding how the masks work. The 2D-structure information is
++itself useful in judging the alignment quality and in seeing how residue
++conservation patterns vary with secondary structure.
++</P>
++<P>
++The HELIX and STRAND GAP PENALTY options provide the value for raising the gap
++penalty at core Alpha Helical (A) and Beta Strand (B) residues. In CLUSTAL
++format, capital residues denote the A and B core structure notation. Basic gap
++penalties are multiplied by the amount specified.
++</P>
++<P>
++The LOOP GAP PENALTY option provides the value for the gap penalty in Loops.
++By default this penalty is not raised. In CLUSTAL format, loops are specified
++by "." in the secondary structure notation.
++</P>
++<P>
++The SECONDARY STRUCTURE TERMINAL PENALTY provides the value for setting the gap
++penalty at the ends of secondary structures. Ends of secondary structures are
++known to grow or shrink, comparing related structures. Therefore by default
++these are given intermediate values, lower than the core penalties. All
++secondary structure read in as lower case in CLUSTAL format gets the reduced
++terminal penalty.
++</P>
++<P>
++The HELIX and STRAND TERMINAL POSITIONS options specify the range of structure
++termini for the intermediate penalties. In the alignment output, these are
++indicated as lower case. For Alpha Helices, by default, the range spans the
++end-helical turn (3 residues). For Beta Strands, the default range spans the
++end residue and the adjacent loop residue, since sequence conservation often
++extends beyond the actual H-bonded Beta Strand.
++</P>
++<P>
++Clustal X can read the masks from SWISS-PROT, CLUSTAL or GDE format input
++files. For many 3-D protein structures, secondary structure information is
++recorded in the feature tables of SWISS-PROT database entries. You should
++always check that the assignments are correct - some are quite inaccurate.
++Clustal X looks for SWISS-PROT HELIX and STRAND assignments e.g.
++</P>
++<P>
++</P>
++<P>
++<PRE>
++FT HELIX 100 115
++FT STRAND 118 119
++</PRE>
++</P>
++<P>
++The structure and penalty masks can also be read from CLUSTAL alignment format
++as comment lines beginning "!SS_" or "!GM_" e.g.
++</P>
++<P>
++<PRE>
++!SS_HBA_HUMA ..aaaAAAAAAAAAAaaa.aaaAAAAAAAAAAaaaaaaAaaa.........aaaAAAAAA
++!GM_HBA_HUMA 112224444444444222122244444444442222224222111111111222444444
++HBA_HUMA VLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHGK
++</PRE>
++</P>
++<P>
++Note that the mask itself is a set of numbers between 1 and 9 each of which is
++assigned to the residue(s) in the same column below.
++</P>
++<P>
++In GDE flat file format, the masks are specified as text and the names must
++begin with "SS_" or "GM_".
++</P>
++<P>
++Either a structure or penalty mask or both may be used. If both are included
++in an alignment, the user will be asked which is to be used.
++</P>
++<P>
++</P>
++<P>
++</P>
++<A HREF="#INDEX"> <EM>Back to Index</EM> </A>
++<CENTER><H2><A NAME="T"> Phylogenetic Trees
++</A></H2></CENTER>
++<P>
++</P>
++<P>
++Before calculating a tree, you must have an ALIGNMENT in memory. This can be
++input using the FILE menu, LOAD SEQUENCES option or you should have just
++carried out a full multiple alignment and the alignment is still in memory.
++Remember YOU MUST ALIGN THE SEQUENCES FIRST!!!!
++</P>
++<P>
++The method used is the NJ (Neighbour Joining) method of Saitou and Nei. First
++you calculate distances (percent divergence) between all pairs of sequences from
++a multiple alignment; second you apply the NJ method to the distance matrix.
++</P>
++<P>
++To calculate a tree, use the DRAW N-J TREE option. This gives an UNROOTED tree
++and all branch lengths. The root of the tree can only be inferred by using an
++outgroup (a sequence that you are certain branches at the outside of the tree
++.... certain on biological grounds) OR if you assume a degree of constancy in
++the 'molecular clock', you can place the root in the 'middle' of the tree
++(roughly equidistant from all tips).
++</P>
++<P>
++BOOTSTRAP N-J TREE uses a method for deriving confidence values for the
++groupings in a tree (first adapted for trees by Joe Felsenstein). It involves
++making N random samples of sites from the alignment (N should be LARGE, e.g.
++500 - 1000); drawing N trees (1 from each sample) and counting how many times
++each grouping from the original tree occurs in the sample trees. You can set N
++using the NUMBER OF BOOTSTRAP TRIALS option in the BOOTSTRAP TREE window. In
++practice, you should use a large number of bootstrap replicates (1000 is
++recommended, even if it means running the program for an hour on a slow
++computer). You can also supply a seed number for the random number generator
++here. Different runs with the same seed will give the same answer. See the
++documentation for more details.
++</P>
++<P>
++EXCLUDE POSITIONS WITH GAPS? With this option, any alignment positions where
++ANY of the sequences have a gap will be ignored. This means that 'like' will
++be compared to 'like' in all distances, which is highly desirable. It also
++automatically throws away the most ambiguous parts of the alignment, which are
++concentrated around gaps (usually). The disadvantage is that you may throw away
++much of the data if there are many gaps (which is why it is difficult for us to
++make it the default).
++</P>
++<P>
++CORRECT FOR MULTIPLE SUBSTITUTIONS? For small divergence (say <10%) this option
++makes no difference. For greater divergence, this option corrects for the fact
++that observed distances underestimate actual evolutionary distances. This is
++because, as sequences diverge, more than one substitution will happen at many
++sites. However, you only see one difference when you look at the present day
++sequences. Therefore, this option has the effect of stretching branch lengths
++in trees (especially long branches). The corrections used here (for DNA or
++proteins) are both due to Motoo Kimura. See the documentation for details.
++</P>
++<P>
++Where possible, this option should be used. However, for VERY divergent
++sequences, the distances cannot be reliably corrected. You will be warned if
++this happens. Even if none of the distances in a data set exceed the reliable
++threshold, if you bootstrap the data, some of the bootstrap distances may
++randomly exceed the safe limit.
++</P>
++<P>
++SAVE LOG FILE will write the tree calculation scores to a file. The log
++filename is the same as the input sequence filename, with an extension .log
++appended.
++</P>
++<P>
++<H4>
++OUTPUT FORMAT OPTIONS
++</H4>
++</P>
++<P>
++Four different formats are allowed. None of these displays the tree visually.
++You can display the tree using the NJPLOT program distributed with Clustal X
++OR get the PHYLIP package and use the tree drawing facilities there.
++</P>
++<P>
++1) CLUSTAL FORMAT TREE. This format is verbose and lists all of the distances
++between the sequences and the number of alignment positions used for each. The
++tree is described at the end of the file. It lists the sequences that are
++joined at each alignment step and the branch lengths. After two sequences are
++joined, the pair is referred to later as a NODE. The number of a NODE is the number
++of the lowest sequence in that NODE.
++</P>
++<P>
++2) PHYLIP FORMAT TREE. This format is the New Hampshire format, used by many
++phylogenetic analysis packages. It consists of a series of nested parentheses,
++describing the branching order, with the sequence names and branch lengths. It
++can be read by the NJPLOT program distributed with ClustalX. It can also be
++used by the RETREE, DRAWGRAM and DRAWTREE programs of the PHYLIP package to see
++the trees graphically. This is the same format used during multiple alignment
++for the guide trees. Some other packages that can read and display New
++Hampshire format are TreeTool, TreeView, and Phylowin.
++</P>
++<P>
++3) PHYLIP DISTANCE MATRIX. This format just outputs a matrix of all the
++pairwise distances in a format that can be used by the PHYLIP package. It used
++to be useful when one could not produce distances from protein sequences in the
++Phylip package but is now redundant (PROTDIST of Phylip 3.5 now does this).
++</P>
++<P>
++4) NEXUS FORMAT TREE. This format is used by several popular phylogeny programs,
++including PAUP and MacClade. The format is described fully in:
++Maddison, D. R., D. L. Swofford and W. P. Maddison. 1997.
++NEXUS: an extensible file format for systematic information.
++Systematic Biology 46:590-621.
++</P>
++<P>
++BOOTSTRAP LABELS ON: By default, the bootstrap values are correctly placed on
++the tree branches of the phylip format output tree. The toggle allows them to
++be placed on the nodes, which is incorrect, but some display packages (e.g.
++TreeTool, TreeView and Phylowin) only support node labelling but not branch
++labelling. Care should be taken to note which branches and labels go together.
++</P>
++<P>
++</P>
++<P>
++</P>
++<A HREF="#INDEX"> <EM>Back to Index</EM> </A>
++<CENTER><H2><A NAME="C"> Colors
++</A></H2></CENTER>
++<P>
++</P>
++<P>
++Clustal X provides a versatile coloring scheme for the sequence alignment
++display. The sequences (or profiles) are colored automatically, when they are
++loaded. Sequences can be colored either by assigning a color to specific
++residues, or on the basis of an alignment consensus. In the latter case, the
++alignment consensus is calculated automatically, and the residues in each
++column are colored according to the consensus character assigned to that
++column. In this way, you can choose to highlight, for example, conserved
++hydrophilic or hydrophobic positions in the alignment.
++</P>
++<P>
++The 'rules' used to color the alignment are specified in a COLOR PARAMETER
++FILE. Clustal X automatically looks for a file called 'colprot.par' for protein
++sequences or 'coldna.par' for DNA, in the current directory. (If you are running
++under UNIX, it then looks in your home directory, and finally in the
++directories in your PATH environment variable).
++</P>
++<P>
++By default, if no color parameter file is found, protein sequences are colored
++by residue as follows:
++</P>
++<P>
++<PRE>
++ Color Residue Code
++</P>
++<P>
++ ORANGE GPST
++ RED HKR
++ BLUE FWY
++ GREEN ILMV
++</PRE>
++</P>
++<P>
++In the case of DNA sequences, the default colors are as follows:
++</P>
++<P>
++<PRE>
++ Color Residue Code
++</P>
++<P>
++ ORANGE A
++ RED C
++ BLUE T
++ GREEN G
++</PRE>
++</P>
++<P>
++</P>
++<P>
++The default BACKGROUND COLORING option shows the sequence residues using a
++black character on a colored background. It can be switched off to show
++residues as a colored character on a white background.
++</P>
++<P>
++Either BLACK AND WHITE or DEFAULT COLOR options can be selected. The Color
++option looks first for the color parameter file (as described above) and, if no
++file is found, uses the default residue-specific colors.
++</P>
++<P>
++You can specify your own coloring scheme by using the LOAD COLOR PARAMETER FILE
++option. The format of the color parameter file is described below.
++</P>
++<P>
++<H4>
++COLOR PARAMETER FILE
++</H4>
++</P>
++<P>
++This file is divided into 3 sections:
++</P>
++<P>
++1) the names and rgb values of the colors
++2) the rules for calculating the consensus
++3) the rules for assigning colors to the residues
++</P>
++<P>
++An example file is given here.
++</P>
++<P>
++<PRE>
++ --------------------------------------------------------------------
++@rgbindex
++RED 0.9 0.1 0.1
++BLUE 0.1 0.1 0.9
++GREEN 0.1 0.9 0.1
++YELLOW 0.9 0.9 0.0
++</P>
++<P>
++@consensus
++% = 60% w:l:v:i:m:a:f:c:y:h:p
++# = 80% w:l:v:i:m:a:f:c:y:h:p
++- = 50% e:d
+++ = 60% k:r
++q = 50% q:e
++p = 50% p
++n = 50% n
++t = 50% t:s
++</P>
++<P>
++@color
++g = RED
++p = YELLOW
++t = GREEN if t:%:#
++n = GREEN if n
++w = BLUE if %:#:p
++k = RED if +
++ --------------------------------------------------------------------
++</PRE>
++</P>
++<P>
++The first section is optional and is identified by the header @rgbindex. If
++this section exists, each color used in the file must be named and the rgb
++values specified (on a scale from 0 to 1). If the rgb index section is not
++found, the following set of hard-coded colors will be used.
++</P>
++<P>
++<PRE>
++RED 0.9 0.1 0.1
++BLUE 0.1 0.1 0.9
++GREEN 0.1 0.9 0.1
++ORANGE 0.9 0.7 0.3
++CYAN 0.1 0.9 0.9
++PINK 0.9 0.5 0.5
++MAGENTA 0.9 0.1 0.9
++YELLOW 0.9 0.9 0.0
++</PRE>
++</P>
++<P>
++The second section is optional and is identified by the header @consensus. It
++defines how the consensus is calculated.
++</P>
++<P>
++The format of each consensus parameter is:-
++</P>
++<P>
++<PRE>
++c = n% residue_list
++</P>
++<P>
++ where
++ c is a character used to identify the parameter.
++ n is an integer value used as the percentage cutoff
++ point.
++ residue_list is a list of residues denoted by a single
++ character, delimited by a colon (:).
++</PRE>
++</P>
++<P>
++For example: # = 60% w:l:v:i
++</P>
++<P>
++will assign a consensus character # to any column in the alignment which
++contains more than 60% of the residues w,l,v and i.
++</P>
++<P>
++</P>
++<P>
++The third section is identified by the header @color, and defines how colors
++are assigned to each residue in the alignment.
++</P>
++<P>
++The color parameters can take one of two formats:
++</P>
++<P>
++<PRE>
++1) r = color
++2) r = color if consensus_list
++</P>
++<P>
++ where
++ r is a character used to denote a residue.
++ color is one of the colors in the GDE color lookup table.
++ consensus_list is a list of consensus characters, each a single
++ character, delimited by a colon (:).
++</PRE>
++</P>
++<P>
++Examples:
++1) g = ORANGE
++</P>
++<P>
++will color all glycines ORANGE, regardless of the consensus.
++</P>
++<P>
++2) w = BLUE if w:%:#
++</P>
++<P>
++will color BLUE any tryptophan which is found in a column with a consensus of
++w, % or #.
++</P>
++<P>
++</P>
++<P>
++</P>
++<A HREF="#INDEX"> <EM>Back to Index</EM> </A>
++<CENTER><H2><A NAME="Q"> Alignment Quality Analysis
++</A></H2></CENTER>
++<P>
++</P>
++<P>
++<H3>
++QUALITY SCORES
++</H3>
++</P>
++<P>
++Clustal X provides an indication of the quality of an alignment by plotting
++a 'conservation score' for each column of the alignment. A high score indicates
++a well-conserved column; a low score indicates low conservation. The quality
++curve is drawn below the alignment.
++</P>
++<P>
++Two methods are also provided to indicate single residues or sequence segments
++which score badly in the alignment.
++</P>
++<P>
++Low-scoring residues are expected to occur at a moderate frequency in all the
++sequences because of their steady divergence due to the natural processes of
++evolution. The most divergent sequences are likely to have the most outliers.
++However, the highlighted residues are especially useful in pointing to
++sequence misalignments. Note that clustering of highlighted residues is a
++strong indication of misalignment. This can arise due to various reasons, for
++example:
++</P>
++<P>
++ 1. Partial or total misalignments caused by a failure in the
++ alignment algorithm. Usually only in difficult alignment cases.
++</P>
++<P>
++ 2. Partial or total misalignments because at least one of the
++ sequences in the given set is partly or completely unrelated to the
++ other sequences. It is up to the user to check that the set of
++ sequences are alignable.
++</P>
++<P>
++ 3. Frameshift translation errors in a protein sequence causing local
++ mismatched regions to be heavily highlighted. These are surprisingly
++ common in database entries. If suspected, a 3-frame translation of
++ the source DNA needs to be examined.
++</P>
++<P>
++Occasionally, highlighted residues may point to regions of some biological
++significance. This might happen for example if a protein alignment contains a
++sequence which has acquired new functions relative to the main sequence set. It
++is important to exclude other explanations, such as error or the natural
++divergence of sequences, before invoking a biological explanation.
++</P>
++<P>
++</P>
++<P>
++<H3>
++LOW-SCORING SEGMENTS
++</H3>
++</P>
++<P>
++Unreliable regions in the alignment can be highlighted using the Low-Scoring
++Segments option. A sequence-weighted profile is used to indicate any segments
++in the sequences which score badly. Because the profile calculation may take
++some time, an option is provided to calculate LOW-SCORING SEGMENTS. The
++segment display can then be toggled on or off without having to repeat the
++time-consuming calculations.
++</P>
++<P>
++For details of the low-scoring segment calculation, see the CALCULATION section
++below.
++</P>
++<P>
++</P>
++<P>
++<H4>
++LOW-SCORING SEGMENT PARAMETERS
++</H4>
++</P>
++<P>
++MINIMUM LENGTH OF SEGMENTS: short segments (or even single residues) can be
++hidden by increasing the minimum length of segments which will be displayed.
++</P>
++<P>
++DNA MARKING SCALE is used to remove less significant segments from the
++highlighted display. Increase the scale to display more segments; decrease the
++scale to remove the least significant.
++</P>
++<P>
++</P>
++<P>
++PROTEIN WEIGHT MATRIX: the scoring table which describes the similarity of each
++amino acid to each other. The matrix is used to calculate the sequence-
++weighted profile scores. There are four 'in-built' Log-Odds matrices offered:
++the Gonnet PAM 80, 120, 250, 350 matrices. A more stringent matrix which only
++gives a high score to identities and the most favoured conservative
++substitutions, may be more suitable when the sequences are closely related. For
++more divergent sequences, it is appropriate to use "softer" matrices which give
++a high score to many other frequent substitutions. This option automatically
++recalculates the low-scoring segments.
++</P>
++<P>
++</P>
++<P>
++DNA WEIGHT MATRIX: Two hard-coded matrices are available:
++</P>
++<P>
++1) IUB. This is the default scoring matrix used by BESTFIT for the comparison
++of nucleic acid sequences. X's and N's are treated as matches to any IUB
++ambiguity symbol. All matches score 1.0; all mismatches for IUB symbols score
++0.9.
++</P>
++<P>
++2) CLUSTALW(1.6). The previous system used by ClustalW, in which matches score
++1.0 and mismatches score 0. All matches for IUB symbols also score 0.
++</P>
++<P>
++A new matrix can be read from a file on disk, if the filename consists only
++of lower case characters. The values in the new weight matrix should be
++similarities and should be NEGATIVE for infrequent substitutions.
++</P>
++<P>
++INPUT FORMAT. The format used for a new matrix is the same as the BLAST
++program. Any lines beginning with a # character are assumed to be comments. The
++first non-comment line should contain a list of amino acids in any order, using
++the 1 letter code, followed by a * character. This should be followed by a
++square matrix of scores, with one row and one column for each amino acid. The
++last row and column of the matrix (corresponding to the * character) contain
++the minimum score over the whole matrix.
++</P>
++<P>
++<H4>
++QUALITY SCORE PARAMETERS
++</H4>
++</P>
++<P>
++You can customise the column 'quality scores' plotted underneath the alignment
++display using the following options.
++</P>
++<P>
++SCORE PLOT SCALE: this is a scalar value from 1 to 10, which can be used to
++change the scale of the quality score plot.
++</P>
++<P>
++RESIDUE EXCEPTION CUTOFF: this is a scalar value from 1 to 10, which can be
++used to change the number of residue exceptions which are highlighted in the
++alignment display. (For an explanation of this cutoff, see the CALCULATION OF
++RESIDUE EXCEPTIONS section below.)
++</P>
++<P>
++PROTEIN WEIGHT MATRIX: the scoring table which describes the similarity of
++each amino acid to each other.
++</P>
++<P>
++DNA WEIGHT MATRIX: two hard-coded matrices are available: IUB and CLUSTALW(1.6).
++</P>
++<P>
++For more information about the weight matrices, see the help above for
++the Low-scoring Segments Weight Matrix.
++</P>
++<P>
++For details of the quality score calculations, see the CALCULATION section
++below.
++</P>
++<P>
++</P>
++<P>
++<STRONG>
++SHOW LOW-SCORING SEGMENTS
++</STRONG>
++</P>
++<P>
++The low-scoring segment display can be toggled on or off. This option does not
++recalculate the profile scores.
++</P>
++<P>
++</P>
++<P>
++<STRONG>
++SHOW EXCEPTIONAL RESIDUES
++</STRONG>
++</P>
++<P>
++This option highlights individual residues which score badly in the alignment
++quality calculations. Residues which score exceptionally low are highlighted by
++using a white character on a grey background.
++</P>
++<P>
++<STRONG>
++SAVE QUALITY SCORES TO FILE
++</STRONG>
++</P>
++<P>
++The quality scores that are plotted underneath the alignment display can also
++be saved in a text file. Each column in the alignment is written on one line in
++the output file, with the value of the quality score at the end of the line.
++Only the sequences currently selected in the display are written to the file.
++One use for quality scores is to color residues in a protein structure by
++sequence conservation. In this way conserved surface residues can be
++highlighted to locate functional regions such as ligand-binding sites.
++</P>
++<P>
++</P>
++<P>
++<H3>
++CALCULATION OF QUALITY SCORES
++</H3>
++</P>
++<P>
++Suppose we have an alignment of m sequences of length n. Then, the alignment
++can be written as:
++</P>
++<P>
++<PRE>
++ A11 A12 A13 .......... A1n
++ A21 A22 A23 .......... A2n
++ .
++ .
++ Am1 Am2 Am3 .......... Amn
++</PRE>
++</P>
++<P>
++We also have a residue comparison matrix of size R where C(i,j) is the score
++for aligning residue i with residue j.
++</P>
++<P>
++We want to calculate a score for the conservation of the jth position in the
++alignment.
++</P>
++<P>
++To do this, we define an R-dimensional sequence space. For the jth position in
++the alignment, each sequence consists of a single residue which is assigned a
++point S in the space. S has R dimensions, and for sequence i, the rth dimension
++is defined as:
++</P>
++<P>
++<PRE>
++ Sr = C(r,Aij)
++</PRE>
++</P>
++<P>
++We then calculate a consensus value for the jth position in the alignment. This
++value X also has R dimensions, and the rth dimension is defined as:
++</P>
++<P>
++<PRE>
++ Xr = ( SUM (Fij * C(i,r)) ) / m
++ 1<=i<=R
++</PRE>
++</P>
++<P>
++where Fij is the count of residues i at position j in the alignment.
++</P>
++<P>
++Now we can calculate the distance Di between each sequence i and the consensus
++position X in the R-dimensional space.
++</P>
++<P>
++<PRE>
++ Di = SQRT ( SUM (Xr - Sr)(Xr - Sr) )
++ 1<=r<=R
++</P>
++<P>
++</PRE>
++</P>
++<P>
++The quality score for the jth position in the alignment is defined as the mean
++of the sequence distances Di.
++</P>
++<P>
++The score is normalised by multiplying by the percentage of sequences which
++have residues (and not gaps) at this position.
++</P>
++<P>
++<H3>
++CALCULATION OF RESIDUE EXCEPTIONS
++</H3>
++</P>
++<P>
++The jth residue of the ith sequence is considered as an exception if the
++distance Di of the sequence from the consensus value X is greater than (Upper
++Quartile + Inter Quartile Range * Cutoff). The value used as a cutoff for
++displaying exceptions can be set from the SCORE PARAMETERS menu. A high cutoff
++value will only display very significant exceptions; a low value will allow
++more, less significant, exceptions to be highlighted.
++</P>
++<P>
++(NB. Sequences which contain gaps at this position are not included in the
++exception calculation.)
++</P>
++<P>
++</P>
++<P>
++<H3>
++CALCULATION OF LOW-SCORING SEGMENTS
++</H3>
++</P>
++<P>
++Suppose we have an alignment of m sequences of length n. Then, the alignment
++can be written as:
++</P>
++<P>
++<PRE>
++ A11 A12 A13 .......... A1n
++ A21 A22 A23 .......... A2n
++ .
++ .
++ Am1 Am2 Am3 .......... Amn
++</PRE>
++</P>
++<P>
++We also have a residue comparison matrix of size R where C(i,j) is the score
++for aligning residue i with residue j.
++</P>
++<P>
++We calculate sequence weights by building a neighbour-joining tree, in which
++branch lengths are proportional to divergence. Summing the branches by branch
++ownership provides the weights. See Thompson et al., CABIOS, 10, 19 (1994) and
++Henikoff et al., JMB, 243, 574 (1994).
++</P>
++<P>
++To find the low-scoring segments in a sequence Si, we build a weighted profile
++of the remaining sequences in the alignment. Suppose we find residue r at
++position j in the sequence; then the score for the jth position in the sequence
++is defined as
++</P>
++<P>
++<PRE>
++ Score(Si,j) = Profile(j,r) where Profile(j,r) is the profile score
++ for residue r at position j in the
++ alignment.
++</PRE>
++</P>
++<P>
++These residue scores are summed along the sequence in both forward and backward
++directions. If the sum of the scores is positive, then it is reset to zero.
++Segments which score negatively in both directions are considered as
++'low-scoring' and will be highlighted in the alignment display.
++</P>
++<P>
++</P>
++<P>
++</P>
++<A HREF="#INDEX"> <EM>Back to Index</EM> </A>
++<CENTER><H2><A NAME="9"> Command Line Parameters
++</A></H2></CENTER>
++<CENTER><H3> DATA (sequences)
++</H3></CENTER>
++<CENTER><TABLE ALIGN=ABSCENTER BORDER=1 CELLSPACING=1 CELLPADDING=5>
++<TR>
++<TD><STRONG>Parameter</STRONG></TD>
++<TD><STRONG><EM>Description</EM></STRONG></TD>
++</TR>
++<TR>
++<TD><TT>-PROFILE1=file.ext and -PROFILE2=file.ext </TT></TD>
++<TD><EM>profiles (aligned sequences)</EM></TD>
++</TR>
++</TABLE></CENTER>
++<CENTER><H3> VERBS (do things)
++</H3></CENTER>
++<CENTER><TABLE ALIGN=ABSCENTER BORDER=1 CELLSPACING=1 CELLPADDING=5>
++<TR>
++<TD><STRONG>Parameter</STRONG></TD>
++<TD><STRONG><EM>Description</EM></STRONG></TD>
++</TR>
++<TR>
++<TD><TT>-HELP or -CHECK </TT></TD>
++<TD><EM>outline the command line parameters</EM></TD>
++</TR>
++<TR>
++<TD><TT>-ALIGN </TT></TD>
++<TD><EM>do full multiple alignment </EM></TD>
++</TR>
++<TR>
++<TD><TT>-TREE </TT></TD>
++<TD><EM>calculate NJ tree</EM></TD>
++</TR>
++<TR>
++<TD><TT>-BOOTSTRAP(=n) </TT></TD>
++<TD><EM>bootstrap a NJ tree (n= number of bootstraps; def. = 1000)</EM></TD>
++</TR>
++<TR>
++<TD><TT>-CONVERT </TT></TD>
++<TD><EM>output the input sequences in a different file format</EM></TD>
++</TR>
++</TABLE></CENTER>
++<CENTER><H3> PARAMETERS (set things)
++</H3></CENTER>
++<CENTER><P><STRONG>***General settings:***
++</STRONG></P></CENTER>
++<CENTER><TABLE ALIGN=ABSCENTER BORDER=1 CELLSPACING=1 CELLPADDING=5>
++<TR>
++<TD><STRONG>Parameter</STRONG></TD>
++<TD><STRONG><EM>Description</EM></STRONG></TD>
++</TR>
++<TR>
++<TD><TT>-INTERACTIVE </TT></TD>
++<TD><EM>read command line, then enter normal interactive menus</EM></TD>
++</TR>
++<TR>
++<TD><TT>-QUICKTREE </TT></TD>
++<TD><EM>use FAST algorithm for the alignment guide tree</EM></TD>
++</TR>
++<TR>
++<TD><TT>-TYPE= </TT></TD>
++<TD><EM>PROTEIN or DNA sequences</EM></TD>
++</TR>
++<TR>
++<TD><TT>-NEGATIVE </TT></TD>
++<TD><EM>protein alignment with negative values in matrix</EM></TD>
++</TR>
++<TR>
++<TD><TT>-OUTFILE= </TT></TD>
++<TD><EM>sequence alignment file name</EM></TD>
++</TR>
++<TR>
++<TD><TT>-OUTPUT= </TT></TD>
++<TD><EM>GCG, GDE, PHYLIP, PIR or NEXUS</EM></TD>
++</TR>
++<TR>
++<TD><TT>-OUTORDER= </TT></TD>
++<TD><EM>INPUT or ALIGNED</EM></TD>
++</TR>
++<TR>
++<TD><TT>-CASE= </TT></TD>
++<TD><EM>LOWER or UPPER (for GDE output only)</EM></TD>
++</TR>
++<TR>
++<TD><TT>-SEQNOS= </TT></TD>
++<TD><EM>OFF or ON (for Clustal output only)</EM></TD>
++</TR>
++</TABLE></CENTER>
++<CENTER><H3>***Fast Pairwise Alignments:***
++</H3></CENTER>
++<CENTER><TABLE ALIGN=ABSCENTER BORDER=1 CELLSPACING=1 CELLPADDING=5>
++<TR>
++<TD><STRONG>Parameter</STRONG></TD>
++<TD><STRONG><EM>Description</EM></STRONG></TD>
++</TR>
++<TR>
++<TD><TT>-TOPDIAGS=n </TT></TD>
++<TD><EM>number of best diags.</EM></TD>
++</TR>
++<TR>
++<TD><TT>-WINDOW=n </TT></TD>
++<TD><EM>window around best diags.</EM></TD>
++</TR>
++<TR>
++<TD><TT>-PAIRGAP=n </TT></TD>
++<TD><EM>gap penalty</EM></TD>
++</TR>
++<TR>
++<TD><TT>-SCORE= </TT></TD>
++<TD><EM>PERCENT or ABSOLUTE</EM></TD>
++</TR>
++</TABLE></CENTER>
++<CENTER><H3>***Slow Pairwise Alignments:***
++</H3></CENTER>
++<CENTER><TABLE ALIGN=ABSCENTER BORDER=1 CELLSPACING=1 CELLPADDING=5>
++<TR>
++<TD><STRONG>Parameter</STRONG></TD>
++<TD><STRONG><EM>Description</EM></STRONG></TD>
++</TR>
++<TR>
++<TD><TT>-PWDNAMATRIX= </TT></TD>
++<TD><EM>DNA weight matrix=IUB, CLUSTALW or filename</EM></TD>
++</TR>
++<TR>
++<TD><TT>-PWGAPOPEN=f </TT></TD>
++<TD><EM>gap opening penalty</EM></TD>
++</TR>
++<TR>
++<TD><TT>-PWGAPEXT=f </TT></TD>
++<TD><EM>gap extension penalty</EM></TD>
++</TR>
++</TABLE></CENTER>
++<CENTER><H3>***Multiple Alignments:***
++</H3></CENTER>
++<CENTER><TABLE ALIGN=ABSCENTER BORDER=1 CELLSPACING=1 CELLPADDING=5>
++<TR>
++<TD><STRONG>Parameter</STRONG></TD>
++<TD><STRONG><EM>Description</EM></STRONG></TD>
++</TR>
++<TR>
++<TD><TT>-USETREE= </TT></TD>
++<TD><EM>file for old guide tree</EM></TD>
++</TR>
++<TR>
++<TD><TT>-MATRIX= </TT></TD>
++<TD><EM>Protein weight matrix=BLOSUM, PAM, GONNET, ID or filename</EM></TD>
++</TR>
++<TR>
++<TD><TT>-DNAMATRIX= </TT></TD>
++<TD><EM>DNA weight matrix=IUB, CLUSTALW or filename</EM></TD>
++</TR>
++<TR>
++<TD><TT>-GAPOPEN=f </TT></TD>
++<TD><EM>gap opening penalty</EM></TD>
++</TR>
++<TR>
++<TD><TT>-GAPEXT=f </TT></TD>
++<TD><EM>gap extension penalty</EM></TD>
++</TR>
++<TR>
++<TD><TT>-ENDGAPS </TT></TD>
++<TD><EM>no end gap separation pen.</EM></TD>
++</TR>
++<TR>
++<TD><TT>-GAPDIST=n </TT></TD>
++<TD><EM>gap separation pen. range</EM></TD>
++</TR>
++<TR>
++<TD><TT>-NOPGAP </TT></TD>
++<TD><EM>residue-specific gaps off</EM></TD>
++</TR>
++<TR>
++<TD><TT>-NOHGAP </TT></TD>
++<TD><EM>hydrophilic gaps off</EM></TD>
++</TR>
++<TR>
++<TD><TT>-HGAPRESIDUES= </TT></TD>
++<TD><EM>list hydrophilic res.</EM></TD>
++</TR>
++<TR>
++<TD><TT>-MAXDIV=n </TT></TD>
++<TD><EM>% ident. for delay</EM></TD>
++</TR>
++<TR>
++<TD><TT>-TYPE= </TT></TD>
++<TD><EM>PROTEIN or DNA</EM></TD>
++</TR>
++<TR>
++<TD><TT>-TRANSWEIGHT=f </TT></TD>
++<TD><EM>transitions weighting</EM></TD>
++</TR>
++</TABLE></CENTER>
++<CENTER><H3>***Profile Alignments:***
++</H3></CENTER>
++<CENTER><TABLE ALIGN=ABSCENTER BORDER=1 CELLSPACING=1 CELLPADDING=5>
++<TR>
++<TD><STRONG>Parameter</STRONG></TD>
++<TD><STRONG><EM>Description</EM></STRONG></TD>
++</TR>
++<TR>
++<TD><TT>-NEWTREE1= </TT></TD>
++<TD><EM>file for new guide tree for profile1</EM></TD>
++</TR>
++<TR>
++<TD><TT>-NEWTREE2= </TT></TD>
++<TD><EM>file for new guide tree for profile2</EM></TD>
++</TR>
++<TR>
++<TD><TT>-USETREE1= </TT></TD>
++<TD><EM>file for old guide tree for profile1</EM></TD>
++</TR>
++<TR>
++<TD><TT>-USETREE2= </TT></TD>
++<TD><EM>file for old guide tree for profile2</EM></TD>
++</TR>
++</TABLE></CENTER>
++<CENTER><H3>***Sequence to Profile Alignments:***
++</H3></CENTER>
++<CENTER><TABLE ALIGN=ABSCENTER BORDER=1 CELLSPACING=1 CELLPADDING=5>
++<TR>
++<TD><STRONG>Parameter</STRONG></TD>
++<TD><STRONG><EM>Description</EM></STRONG></TD>
++</TR>
++<TR>
++<TD><TT>-NEWTREE= </TT></TD>
++<TD><EM>file for new guide tree</EM></TD>
++</TR>
++<TR>
++<TD><TT>-USETREE= </TT></TD>
++<TD><EM>file for old guide tree</EM></TD>
++</TR>
++</TABLE></CENTER>
++<CENTER><H3>***Structure Alignments:***
++</H3></CENTER>
++<CENTER><TABLE ALIGN=ABSCENTER BORDER=1 CELLSPACING=1 CELLPADDING=5>
++<TR>
++<TD><STRONG>Parameter</STRONG></TD>
++<TD><STRONG><EM>Description</EM></STRONG></TD>
++</TR>
++<TR>
++<TD><TT>-NOSECSTR2 </TT></TD>
++<TD><EM>do not use secondary structure/gap penalty mask for profile 2</EM></TD>
++</TR>
++<TR>
++<TD><TT>-SECSTROUT=STRUCTURE or MASK or BOTH or NONE </TT></TD>
++<TD><EM>output in alignment file</EM></TD>
++</TR>
++<TR>
++<TD><TT>-HELIXGAP=n </TT></TD>
++<TD><EM>gap penalty for helix core residues </EM></TD>
++</TR>
++<TR>
++<TD><TT>-STRANDGAP=n </TT></TD>
++<TD><EM>gap penalty for strand core residues</EM></TD>
++</TR>
++<TR>
++<TD><TT>-LOOPGAP=n </TT></TD>
++<TD><EM>gap penalty for loop regions</EM></TD>
++</TR>
++<TR>
++<TD><TT>-TERMINALGAP=n </TT></TD>
++<TD><EM>gap penalty for structure termini</EM></TD>
++</TR>
++<TR>
++<TD><TT>-HELIXENDIN=n </TT></TD>
++<TD><EM>number of residues inside helix to be treated as terminal</EM></TD>
++</TR>
++<TR>
++<TD><TT>-HELIXENDOUT=n </TT></TD>
++<TD><EM>number of residues outside helix to be treated as terminal</EM></TD>
++</TR>
++<TR>
++<TD><TT>-STRANDENDIN=n </TT></TD>
++<TD><EM>number of residues inside strand to be treated as terminal</EM></TD>
++</TR>
++<TR>
++<TD><TT>-STRANDENDOUT=n</TT></TD>
++<TD><EM>number of residues outside strand to be treated as terminal </EM></TD>
++</TR>
++</TABLE></CENTER>
++<CENTER><H3>***Trees:***
++</H3></CENTER>
++<CENTER><TABLE ALIGN=ABSCENTER BORDER=1 CELLSPACING=1 CELLPADDING=5>
++<TR>
++<TD><STRONG>Parameter</STRONG></TD>
++<TD><STRONG><EM>Description</EM></STRONG></TD>
++</TR>
++<TR>
++<TD><TT>-SEED=n </TT></TD>
++<TD><EM>seed number for bootstraps</EM></TD>
++</TR>
++<TR>
++<TD><TT>-KIMURA </TT></TD>
++<TD><EM>use Kimura's correction</EM></TD>
++</TR>
++<TR>
++<TD><TT>-TOSSGAPS </TT></TD>
++<TD><EM>ignore positions with gaps</EM></TD>
++</TR>
++<TR>
++<TD><TT>-BOOTLABELS=node OR branch </TT></TD>
++<TD><EM>position of bootstrap values in tree display</EM></TD>
++</TR>
++</TABLE></CENTER>
++</P>
++<A HREF="#INDEX"> <EM>Back to Index</EM> </A>
++<CENTER><H2><A NAME="R"> References
++</A></H2></CENTER>
++<P>
++</P>
++<P>
++<STRONG>
++The ClustalX program is described in the manuscript:
++</STRONG>
++</P>
++<P>
++Thompson,J.D., Gibson,T.J., Plewniak,F., Jeanmougin,F. and Higgins,D.G. (1997)
++The ClustalX windows interface: flexible strategies for multiple sequence
+ alignment aided by quality analysis tools. Nucleic Acids Research, 25:4876-4882.
+ </P>
+ <P>
Added: trunk/packages/clustalw/trunk/debian/patches/clustalx_help.patch
===================================================================
--- trunk/packages/clustalw/trunk/debian/patches/clustalx_help.patch (rev 0)
+++ trunk/packages/clustalw/trunk/debian/patches/clustalx_help.patch 2007-08-12 15:08:40 UTC (rev 399)
@@ -0,0 +1,1529 @@
+Index: clustalw-1.83/clustalx_help
+===================================================================
+--- clustalw-1.83.orig/clustalx_help
++++ clustalw-1.83/clustalx_help
+@@ -1,4 +1,1524 @@
+
++This is the on-line help file for Clustal X (version 1.81), using the NCBI
++Vibrant Toolkit.
++
++It should be named or defined as: clustalx_help
++except with MSDOS in which case it should be named ClustalX.HLP
++
++For full details of usage and algorithms, please read the CLUSTALW.DOC file.
++
++
++Toby Gibson EMBL, Heidelberg, Germany.
++Des Higgins UCC, Cork, Ireland.
++Julie Thompson/Francois Jeanmougin IGBMC, Strasbourg, France.
++
++
++
++
++>>HELP G <<
++ General help for CLUSTAL X (1.8)
++
++Clustal X is a windows interface for the ClustalW multiple sequence alignment
++program. It provides an integrated environment for performing multiple sequence
++and profile alignments and analysing the results. The sequence alignment is
++displayed in a window on the screen. A versatile coloring scheme has been
++incorporated allowing you to highlight conserved features in the alignment.
++The pull-down menus at the top of the window allow you to select all the
++options required for traditional multiple sequence and profile alignment.
++
++You can cut-and-paste sequences to change the order of the alignment; you can
++select a subset of sequences to be aligned; you can select a sub-range of the
++alignment to be realigned and inserted back into the original alignment.
++
++Alignment quality analysis can be performed and low-scoring segments or
++exceptional residues can be highlighted.
++
++ClustalX is available for a number of different platforms including: SUN
++Solaris, IRIX5.3 on Silicon Graphics, Digital UNIX on DECStations, Microsoft
++Windows (32 bit) for PC's, Linux ELF for x86 PC's and Macintosh PowerMac. (See
++the README file for Installation instructions.)
++
++
++<H4>
++SEQUENCE INPUT
++</H4>
++
++Sequences and profiles (a term for pre-existing alignments) are input using
++the FILE menu. Invalid options will be disabled. All sequences must be included
++in 1 file. 7 formats are automatically recognised: NBRF/PIR, EMBL/SWISSPROT,
++Pearson (Fasta), Clustal (*.aln), GCG/MSF (Pileup), GCG9 RSF and GDE flat file.
++All non-alphabetic characters (spaces, digits, punctuation marks) are ignored
++except "-" which is used to indicate a GAP ("." in MSF/RSF).
++
++<H4>
++SEQUENCE / PROFILE ALIGNMENTS
++</H4>
++
++Clustal X has two modes which can be selected using the switch directly above
++the sequence display: MULTIPLE ALIGNMENT MODE and PROFILE ALIGNMENT MODE.
++
++To do a MULTIPLE ALIGNMENT on a set of sequences, make sure MULTIPLE ALIGNMENT
++MODE is selected. A single sequence data area is then displayed. The ALIGNMENT
++menu then allows you to either produce a guide tree for the alignment, or to do
++a multiple alignment following the guide tree, or to do a full multiple
++alignment.
++
++In PROFILE ALIGNMENT MODE, two sequence data areas are displayed, allowing you
++to align 2 alignments (termed profiles). Profiles are also used to add a new
++sequence to an old alignment, or to use secondary structure to guide the
++alignment process. GAPS in the old alignments are indicated using the "-"
++character. PROFILES can be input in ANY of the allowed formats; just use "-"
++(or "." for MSF/RSF) for each gap position. In Profile Alignment Mode, a button
++"Lock Scroll" is displayed which allows you to scroll the two profiles together
++using a single scroll bar. When the Lock Scroll is turned off, the two profiles
++can be scrolled independently.
++
++<H4>
++PHYLOGENETIC TREES
++</H4>
++
++Phylogenetic trees can be calculated from old alignments (read in with "-"
++characters to indicate gaps) OR after a multiple alignment while the alignment
++is still displayed.
++
++<H4>
++ALIGNMENT DISPLAY
++</H4>
++
++The alignment is displayed on the screen with the sequence names on the left
++hand side. The sequence alignment is for display only; it cannot be edited here
++(except for changing the sequence order by cutting-and-pasting on the sequence
++names).
++
++A ruler is displayed below the sequences, starting at 1 for the first residue
++position (residue numbers in the sequence input file are ignored).
++
++A line above the alignment is used to mark strongly conserved positions. Three
++characters ('*', ':' and '.') are used:
++
++'*' indicates positions which have a single, fully conserved residue
++
++':' indicates that one of the following 'strong' groups is fully conserved:-
++<PRE>
++ STA
++ NEQK
++ NHQK
++ NDEQ
++ QHRK
++ MILV
++ MILF
++ HY
++ FYW
++</PRE>
++
++'.' indicates that one of the following 'weaker' groups is fully conserved:-
++<PRE>
++ CSA
++ ATV
++ SAG
++ STNK
++ STPA
++ SGND
++ SNDEQK
++ NDEQHK
++ NEQHRK
++ FVLIM
++ HFY
++</PRE>
++
++These are all the positively scoring groups that occur in the Gonnet Pam250
++matrix. The strong and weak groups are defined as strong score >0.5 and weak
++score <=0.5 respectively.
++
++For profile alignments, secondary structure and gap penalty masks are displayed
++above the sequences, if any data is found in the profile input file.
++
++
++>>HELP F <<
++ Input / Output Files
++
++LOAD SEQUENCES reads sequences from one of 7 file formats, replacing any
++sequences that are already loaded. All sequences must be in 1 file. The formats
++that are automatically recognised are: NBRF/PIR, EMBL/SWISSPROT, Pearson
++(Fasta), Clustal (*.aln), GCG/MSF (Pileup), GCG9/RSF and GDE flat file. All
++non-alphabetic characters (spaces, digits, punctuation marks) are ignored
++except "-" which is used to indicate a GAP ("." in MSF/RSF).
++
++The program tries to automatically recognise the different file formats used
++and to guess whether the sequences are amino acid or nucleotide. This is not
++always foolproof.
++
++FASTA and NBRF/PIR formats are recognised by having a ">" as the first
++character in the file.
++
++EMBL/Swiss Prot formats are recognised by the letters "ID" at the start of the
++file (the token for the entry name field).
++
++CLUSTAL format is recognised by the word CLUSTAL at the beginning of the file.
++
++GCG/MSF format is recognised by one of the following:
++<UL>
++<LI>
++ - the word PileUp at the start of the file.
++</LI><LI>
++ - the word !!AA_MULTIPLE_ALIGNMENT or !!NA_MULTIPLE_ALIGNMENT
++ at the start of the file.
++</LI><LI>
++ - the word MSF on the first line of the file, and the characters ..
++ at the end of this line.
++</LI>
++</UL>
++
++GCG/RSF format is recognised by the word !!RICH_SEQUENCE at the beginning of
++the file.
++
++
++If 85% or more of the characters in the sequence are from A,C,G,T,U or N, the
++sequence will be assumed to be nucleotide. This works in 97.3% of cases but
++watch out!
++
++APPEND SEQUENCES is only valid in MULTIPLE ALIGNMENT MODE. The input sequences
++do not replace those already loaded, but are appended at the end of the
++alignment.
++
++SAVE SEQUENCES AS... offers the user a choice of one of six output formats:
++CLUSTAL, NBRF/PIR, GCG/MSF, PHYLIP, NEXUS or GDE. All sequences are written
++to a single file. Options are available to save a range of the alignment,
++switch between UPPER/LOWER case for GDE files, and to output SEQUENCE NUMBERING
++for CLUSTAL files.
++
++LOAD PROFILE 1 reads sequences in the same 7 file formats, replacing any
++sequences already loaded as Profile 1. This option will also remove any
++sequences which are loaded in Profile 2.
++
++LOAD PROFILE 2 reads sequences in the same 7 file formats, replacing any
++sequences already loaded as Profile 2.
++
++SAVE PROFILE 1 AS... is similar to the Save Sequences option except that only
++those sequences in Profile 1 will be written to the output file.
++
++SAVE PROFILE 2 AS... is similar to the Save Sequences option except that only
++those sequences in Profile 2 will be written to the output file.
++
++WRITE ALIGNMENT AS POSTSCRIPT will write the sequence display to a postscript
++format file. This will include any secondary structure / gap penalty mask
++information and the consensus and ruler lines which are displayed on the
++screen. The Alignment Quality curve can be optionally included in the output
++file.
++
++WRITE PROFILE 1 AS POSTSCRIPT is similar to WRITE ALIGNMENT AS POSTSCRIPT
++except that only the profile 1 display will be printed.
++
++WRITE PROFILE 2 AS POSTSCRIPT is similar to WRITE ALIGNMENT AS POSTSCRIPT
++except that only the profile 2 display will be printed.
++
++
++<H4>
++POSTSCRIPT PARAMETERS
++</H4>
++
++A number of options are available to allow you to configure your postscript
++output file.
++
++PS COLORS FILE:
++
++The exact RGB values required to reproduce the colors used in the alignment
++window will vary from printer to printer. A PS colors file can be specified
++that contains the RGB values for all the colors required by each of your
++postscript printers.
++
++By default, Clustal X looks for a file called 'colprint.par' in the current
++directory (if you're running under UNIX, it then looks in your home directory,
++and finally in the directories in your PATH environment variable). If no PS
++colors file is found or a color used on the screen is not defined here, the
++screen RGB values (from the Color Parameter File) are used.
++
++The PS colors file consists of one line for each color to be defined, with the
++color name followed by the RGB values (on a scale of 0 to 1). For example,
++
++RED 0.9 0.1 0.1
++
++Blank lines and comments (lines beginning with a '#' character) are ignored.
++
++
++PAGE SIZE: The alignment can be displayed on either A4, A3 or US Letter size
++pages.
++
++ORIENTATION: The alignment can be displayed on either a landscape or portrait
++page.
++
++PRINT HEADER: An optional header including the postscript filename, and
++creation date can be printed at the top of each page.
++
++PRINT QUALITY CURVE: The Alignment Quality curve which is displayed underneath
++the alignment on the screen can be included in the postscript output.
++
++PRINT RULER: The ruler which is displayed underneath the alignment on the
++screen can be included in the postscript output.
++
++PRINT RESIDUE NUMBERS: Sequence residue numbers can be printed at the right
++hand side of the alignment.
++
++RESIZE TO FIT PAGE: By default, the alignment is scaled to fit the page size
++selected. This option can be turned off, in which case a font size of 10 will
++be used for the sequences.
++
++PRINT FROM POSITION/TO: A range of the alignment can be printed. The default
++is to print the full alignment. The first and last residues to be printed are
++specified here.
++
++USE BLOCK LENGTH: The alignment can be divided into blocks of residues. The
++number of residues in a block is specified here. More than one block may then
++be printed on a single page. This is useful for long alignments of a small
++number of sequences. If the block length is set to 0, the alignment will not
++be divided into blocks, but printed across a number of pages.
++
++>>HELP E <<
++ Editing Alignments
++
++Clustal X allows you to change the order of the sequences in the alignment, by
++cutting-and-pasting the sequence names.
++
++To select a group of sequences to be moved, click on a sequence name and drag
++the cursor until all the required sequences are highlighted. Holding down the
++Shift key when clicking on the first name will add new sequences to those
++already selected.
++
++(Options are provided to Select All Sequences, Select Profile 1 or Select
++Profile 2.)
++
++The selected sequences can be removed from the alignment by using the EDIT
++menu, CUT option.
++
++To add the cut sequences back into an alignment, select a sequence by clicking
++on the sequence name. The cut sequences will be added to the alignment,
++immediately following the selected sequence, by the EDIT menu, PASTE option.
++
++To add the cut sequences to an empty alignment (e.g. when cutting sequences from
++Profile 1 and pasting them to Profile 2), click on the empty sequence name
++display area, and select the EDIT menu, PASTE option as before.
++
++The sequence selection and sequence range selection can be cleared using the
++EDIT menu, CLEAR SEQUENCE SELECTION and CLEAR RANGE SELECTION options
++respectively.
++
++To search for a string of residues in the sequences, select the sequences to be
++searched by clicking on the sequence names. You can then enter the string to
++search for by selecting the SEARCH FOR STRING option. If the string is found in
++any of the sequences selected, the sequence name and column number are printed
++below the sequence display.
++
++In PROFILE ALIGNMENT MODE, the two profiles can be merged (normally done after
++alignment) by selecting ADD PROFILE 2 TO PROFILE 1. The sequences currently
++displayed as Profile 2 will be appended to Profile 1.
++
++The REMOVE ALL GAPS option will remove all gaps from the sequences currently
++selected.
++WARNING: This option removes ALL gaps, not only those introduced by ClustalX,
++but also those that were read from the input alignment file. Any secondary
++structure information associated with the alignment will NOT be automatically
++realigned.
++
++The REMOVE GAP-ONLY COLUMNS will remove those positions in the alignment which
++contain gaps in all sequences. This can occur as a result of removing divergent
++sequences from an alignment, or if an alignment has been realigned.
++
++>>HELP M <<
++ Multiple Alignments
++
++Make sure MULTIPLE ALIGNMENT MODE is selected, using the switch directly above
++the sequence display area. Then, use the ALIGNMENT menu to do multiple
++alignments.
++
++Multiple alignments are carried out in 3 stages:
++
++1) all sequences are compared to each other (pairwise alignments);
++
++2) a dendrogram (like a phylogenetic tree) is constructed, describing the
++approximate groupings of the sequences by similarity (stored in a file).
++
++3) the final multiple alignment is carried out, using the dendrogram as a guide.
++
++The 3 stages are carried out automatically by the DO COMPLETE ALIGNMENT option.
++You can skip the first stages (pairwise alignments; guide tree) by using an old
++guide tree file (DO ALIGNMENT FROM GUIDE TREE); or you can just produce the
++guide tree with no final multiple alignment (PRODUCE GUIDE TREE ONLY).
++
++
++REALIGN SELECTED SEQUENCES is used to realign badly aligned sequences in the
++alignment. Sequences can be selected by clicking on the sequence names - see
++Editing Alignments for more details. The unselected sequences are then 'fixed'
++and a profile is made including only the unselected sequences. Each of the
++selected sequences in turn is then realigned to this profile. The realigned
++sequences will be displayed as a group at the end of the alignment.
++
++
++REALIGN SELECTED SEQUENCE RANGE is used to realign a small region of the
++alignment. A residue range can be selected by clicking on the sequence display
++area. A multiple alignment is then performed, following the 3 stages described
++above, but only using the selected residue range. Finally the new alignment of
++the range is pasted back into the full sequence alignment.
++
++By default, gap penalties are used at each end of the subrange in order to
++penalise terminal gaps. If the REALIGN SEGMENT END GAP PENALTIES option is
++switched off, gaps can be introduced at the ends of the residue range at no
++cost.
++
++
++ALIGNMENT PARAMETERS displays a sub-menu with the following options:
++
++RESET NEW GAPS BEFORE ALIGNMENT will remove any new gaps introduced into the
++sequences during multiple alignment if you wish to change the parameters and
++try again. This only takes effect just before you do a second multiple
++alignment. You can make phylogenetic trees after alignment whether or not this
++is ON. If you turn this OFF, the new gaps are kept even if you do a second
++multiple alignment. This allows you to iterate the alignment gradually.
++Sometimes, the alignment is improved by a second or third pass.
++
++RESET ALL GAPS BEFORE ALIGNMENT will remove all gaps in the sequences including
++gaps which were read in from the sequence input file. This only takes effect
++just before you do a second multiple alignment. You can make phylogenetic
++trees after alignment whether or not this is ON. If you turn this OFF, all
++gaps are kept even if you do a second multiple alignment. This allows you to
++iterate the alignment gradually. Sometimes, the alignment is improved by a
++second or third pass.
++
++
++PAIRWISE ALIGNMENT PARAMETERS control the speed/sensitivity of the initial
++alignments.
++
++MULTIPLE ALIGNMENT PARAMETERS control the gaps in the final multiple
++alignments.
++
++PROTEIN GAP PARAMETERS displays a temporary window which allows you to set
++various parameters only used in the alignment of protein sequences.
++
++(SECONDARY STRUCTURE PARAMETERS, for use with the Profile Alignment Mode only,
++allows you to set various parameters only used with gap penalty masks.)
++
++SAVE LOG FILE will write the alignment calculation scores to a file. The log
++filename is the same as the input sequence filename, with an extension .log
++appended.
++
++
++<H4>
++OUTPUT FORMAT OPTIONS
++</H4>
++
++You can choose from 6 different alignment formats (CLUSTAL, GCG, NBRF/PIR,
++PHYLIP, GDE and NEXUS). You can choose more than one (or all 6 if you wish).
++
++CLUSTAL format output is a self explanatory alignment format. It shows the
++sequences aligned in blocks. It can be read in again at a later date to (for
++example) calculate a phylogenetic tree or add in new sequences by profile
++alignment.
++
++GCG output can be used by any of the GCG programs that can work on multiple
++alignments (e.g. PRETTY, PROFILEMAKE, PLOTALIGN). It is the same as the GCG
++.msf format files (multiple sequence file); new in version 7 of GCG.
++
++NEXUS format is used by several phylogeny programs, including PAUP and
++MacClade.
++
++PHYLIP format output can be used for input to the PHYLIP package of Joe
++Felsenstein. This is a very widely used package for doing every imaginable
++form of phylogenetic analysis (MUCH more than the modest introduction
++offered by this program).
++
++NBRF/PIR: this is the same as the standard PIR format with ONE ADDITION. Gap
++characters "-" are used to indicate the positions of gaps in the multiple
++alignment. These files can be re-used as input in any part of clustal that
++allows sequences (or alignments or profiles) to be read in.
++
++GDE: this format is used by the GDE package of Steven Smith and is understood
++by SEQLAB in GCG 9 or later.
++
++GDE OUTPUT CASE: sequences in GDE format may be written in either upper or
++lower case.
++
++CLUSTALW SEQUENCE NUMBERS: residue numbers may be added to the end of the
++alignment lines in clustalw format.
++
++OUTPUT ORDER is used to control the order of the sequences in the output
++alignments. By default, it uses the order in which the sequences were aligned
++(from the guide tree/dendrogram), thus automatically grouping closely related
++sequences. It can be switched to be the same as the original input order.
++
++PARAMETER OUTPUT: This option will save all your parameter settings in a
++parameter file (suffix .par) during alignment. The file can be subsequently
++used to rerun ClustalW using the same parameters.
++
++
++<H3>
++ALIGNMENT PARAMETERS
++</H3>
++--------------------
++
++<STRONG>
++PAIRWISE ALIGNMENT PARAMETERS
++</STRONG>
++
++A distance is calculated between every pair of sequences and these are used to
++construct the phylogenetic tree which guides the final multiple alignment. The
++scores are calculated from separate pairwise alignments. These can be
++calculated using 2 methods: dynamic programming (slow but accurate) or by the
++method of Wilbur and Lipman (extremely fast but approximate).
++
++You can choose between the 2 alignment methods using the PAIRWISE ALIGNMENTS
++option. The slow/accurate method is fast enough for short sequences but will be
++VERY SLOW for many (e.g. >100) long (e.g. >1000 residue) sequences.
++
++
++<STRONG>
++SLOW-ACCURATE alignment parameters:
++</STRONG>
++
++These parameters do not have any effect on the speed of the alignments. They
++are used to give initial alignments which are then rescored to give percent
++identity scores. These % scores are the ones which are displayed on the
++screen. The scores are converted to distances for the trees.
++
++Gap Open Penalty: the penalty for opening a gap in the alignment.
++
++Gap Extension Penalty: the penalty for extending a gap by 1 residue.
++
++Protein Weight Matrix: the scoring table which describes the similarity of
++each amino acid to each other.
++
++Load protein matrix: allows you to read in a comparison table from a file.
++
++DNA weight matrix: the scores assigned to matches and mismatches (including
++IUB ambiguity codes).
++
++Load DNA matrix: allows you to read in a comparison table from a file.
++
++See the Multiple alignment parameters, MATRIX option below for details of the
++matrix input format.
++
++
++<STRONG>
++FAST-APPROXIMATE alignment parameters:
++</STRONG>
++
++These similarity scores are calculated from fast, approximate, global align-
++ments, which are controlled by 4 parameters. 2 techniques are used to make
++these alignments very fast: 1) only exactly matching fragments (k-tuples) are
++considered; 2) only the 'best' diagonals (the ones with most k-tuple matches)
++are used.
++
++GAP PENALTY: This is a penalty for each gap in the fast alignments. It has
++little effect on the speed or sensitivity except for extreme values.
++
++K-TUPLE SIZE: This is the size of exactly matching fragment that is used.
++INCREASE for speed (max= 2 for proteins; 4 for DNA), DECREASE for sensitivity.
++For longer sequences (e.g. >1000 residues) you may wish to increase the
++default.
++
++TOP DIAGONALS: The number of k-tuple matches on each diagonal (in an imaginary
++dot-matrix plot) is calculated. Only the best ones (with most matches) are used
++in the alignment. This parameter specifies how many. Decrease for speed;
++increase for sensitivity.
++
++WINDOW SIZE: This is the number of diagonals around each of the 'best'
++diagonals that will be used. Decrease for speed; increase for sensitivity.
++
++
++<STRONG>
++MULTIPLE ALIGNMENT PARAMETERS
++</STRONG>
++
++These parameters control the final multiple alignment. This is the core of the
++program and the details are complicated. To fully understand the use of the
++parameters and the scoring system, you will have to refer to the documentation.
++
++Each step in the final multiple alignment consists of aligning two alignments
++or sequences. This is done progressively, following the branching order in the
++GUIDE TREE. The basic parameters to control this are two gap penalties and the
++scores for various identical/non-identical residues.
++
++The GAP OPENING and EXTENSION PENALTIES can be set here. These control the
++cost of opening up every new gap and the cost of every item in a gap.
++Increasing the gap opening penalty will make gaps less frequent. Increasing
++the gap extension penalty will make gaps shorter. Terminal gaps are not
++penalised.
++
++The DELAY DIVERGENT SEQUENCES switch delays the alignment of the most distantly
++related sequences until after the most closely related sequences have been
++aligned. The setting shows the percent identity level required to delay the
++addition of a sequence; sequences that are less identical than this level to
++any other sequences will be aligned later.
++
++The TRANSITION WEIGHT gives transitions (A<-->G or C<-->T i.e. purine-purine or
++pyrimidine-pyrimidine substitutions) a weight between 0 and 1; a weight of zero
++means that the transitions are scored as mismatches, while a weight of 1 gives
++the transitions the match score. For distantly related DNA sequences, the
++weight should be near to zero; for closely related sequences it can be useful
++to assign a higher score. The default is set to 0.5.
++
++
++The PROTEIN WEIGHT MATRIX option allows you to choose a series of weight
++matrices. For protein alignments, you use a weight matrix to determine the
++similarity of non-identical amino acids. For example, Tyr aligned with Phe is
++usually judged to be 'better' than Tyr aligned with Pro.
++
++There are three 'in-built' series of weight matrices offered. Each consists of
++several matrices which work differently at different evolutionary distances. To
++see the exact details, read the documentation. Crudely, we store several
++matrices in memory, spanning the full range of amino acid distance (from almost
++identical sequences to highly divergent ones). For very similar sequences, it
++is best to use a strict weight matrix which only gives a high score to
++identities and the most favoured conservative substitutions. For more divergent
++sequences, it is appropriate to use "softer" matrices which give a high score
++to many other frequent substitutions.
++
++1) BLOSUM (Henikoff). These matrices appear to be the best available for
++carrying out data base similarity (homology searches). The matrices currently
++used are: Blosum 80, 62, 45 and 30. BLOSUM was the default in earlier Clustal X
++versions.
++
++2) PAM (Dayhoff). These have been extremely widely used since the late '70s. We
++currently use the PAM 20, 60, 120, 350 matrices.
++
++3) GONNET. These matrices were derived using almost the same procedure as the
++Dayhoff one (above) but are much more up to date and are based on a far larger
++data set. They appear to be more sensitive than the Dayhoff series. We
++currently use the GONNET 80, 120, 160, 250 and 350 matrices. This series is the
++default for Clustal X version 1.8.
++
++We also supply an identity matrix which gives a score of 10 to two identical
++amino acids and a score of zero otherwise. This matrix is not very useful.
++
++Load protein matrix: allows you to read in a comparison matrix from a file.
++This can be either a single matrix or a series of matrices (see below for
++format).
++
++
++DNA WEIGHT MATRIX option allows you to select a single matrix (not a series)
++used for aligning nucleic acid sequences. Two hard-coded matrices are available:
++
++1) IUB. This is the default scoring matrix used by BESTFIT for the comparison
++of nucleic acid sequences. X's and N's are treated as matches to any IUB
++ambiguity symbol. All matches score 1.9; all mismatches for IUB symbols score 0.
++
++2) CLUSTALW(1.6). A previous system used by ClustalW, in which matches score
++1.0 and mismatches score 0. All matches for IUB symbols also score 0.
++
++Load DNA matrix: allows you to read in a nucleic acid comparison matrix from a
++file (just one matrix, not a series).
++
++
++SINGLE MATRIX INPUT FORMAT
++The format used for a single matrix is the same as the BLAST program. The
++scores in the new weight matrix should be similarities. You can use negative as
++well as positive values if you wish, although the matrix will be automatically
++adjusted to all positive scores, unless the NEGATIVE MATRIX option is selected.
++Any lines beginning with a # character are assumed to be comments. The first
++non-comment line should contain a list of amino acids in any order, using the 1
++letter code, followed by a * character. This should be followed by a square
++matrix of scores, with one row and one column for each amino acid. The last row
++and column of the matrix (corresponding to the * character) contain the minimum
++score over the whole matrix.
++
++MATRIX SERIES INPUT FORMAT
++ClustalX uses different matrices depending on the mean percent identity of the
++sequences to be aligned. You can specify a series of matrices and the range of
++the percent identity for each matrix in a matrix series file. The file is
++automatically recognised by the word CLUSTAL_SERIES at the beginning of the
++file. Each matrix in the series is then specified on one line which should
++start with the word MATRIX. This is followed by the lower and upper limits of
++the sequence percent identities for which you want to apply the matrix. The
++final entry on the matrix line is the filename of a Blast format matrix file
++(see above for details of the single matrix file format).
++
++Example.
++
++CLUSTAL_SERIES
++
++MATRIX 81 100 /us1/user/julie/matrices/blosum80
++MATRIX 61 80 /us1/user/julie/matrices/blosum62
++MATRIX 31 60 /us1/user/julie/matrices/blosum45
++MATRIX 0 30 /us1/user/julie/matrices/blosum30
++
++
++<STRONG>
++PROTEIN GAP PARAMETERS
++</STRONG>
++
++RESIDUE SPECIFIC PENALTIES are amino acid specific gap penalties that reduce or
++increase the gap opening penalties at each position in the alignment or
++sequence. See the documentation for details. As an example, positions that are
++rich in glycine are more likely to have an adjacent gap than positions that are
++rich in valine.
++
++HYDROPHILIC GAP PENALTIES are used to increase the chances of a gap within a
++run (5 or more residues) of hydrophilic amino acids; these are likely to be
++loop or random coil regions where gaps are more common. The residues that are
++"considered" to be hydrophilic can be entered in HYDROPHILIC RESIDUES.
++
++GAP SEPARATION DISTANCE tries to decrease the chances of gaps being too close
++to each other. Gaps that are less than this distance apart are penalised more
++than other gaps. This does not prevent close gaps; it makes them less frequent,
++promoting a block-like appearance of the alignment.
++
++END GAP SEPARATION treats end gaps just like internal gaps for the purposes of
++avoiding gaps that are too close (set by GAP SEPARATION DISTANCE above). If you
++turn this off, end gaps will be ignored for this purpose. This is useful when
++you wish to align fragments where the end gaps are not biologically meaningful.
++
++
++>>HELP P <<
++ Profile and Structure Alignments
++
++By PROFILE ALIGNMENT, we mean alignment using existing alignments. Profile
++alignments allow you to store alignments of your favourite sequences and add
++new sequences to them in small bunches at a time. A profile is simply an
++alignment of one or more sequences (e.g. an alignment output file from Clustal
++X). Each input can be a single sequence. One or both sets of input sequences
++may include secondary structure assignments or gap penalty masks to guide the
++alignment.
++
++Make sure PROFILE ALIGNMENT MODE is selected, using the switch directly above
++the sequence display area. Then, use the ALIGNMENT menu to do profile and
++secondary structure alignments.
++
++The profiles can be in any of the allowed input formats with "-" characters
++used to specify gaps (except for GCG/MSF where "." is used).
++
++You have to load the 2 profiles by choosing FILE, LOAD PROFILE 1 and LOAD
++PROFILE 2. Then ALIGNMENT, ALIGN PROFILE 2 TO PROFILE 1 will align the 2
++profiles to each other. Secondary structure masks in either profile can be used
++to guide the alignment. This option compares all the sequences in profile 1
++with all the sequences in profile 2 in order to build guide trees which will be
++used to calculate sequence weights, and select appropriate alignment parameters
++for the final profile alignment.
++
++You can skip the first stage (pairwise alignments; guide trees) by using old
++guide tree files (ALIGN PROFILES FROM GUIDE TREES).
++
++The ALIGN SEQUENCES TO PROFILE 1 option will take the sequences in the second
++profile and align them to the first profile, 1 at a time. This is useful to
++add some new sequences to an existing alignment, or to align a set of sequences
++to a known structure. In this case, the second profile set need not be
++pre-aligned.
++
++You can skip the first stage (pairwise alignments; guide tree) by using an old
++guide tree file (ALIGN SEQUENCES TO PROFILE 1 FROM TREE).
++
++SAVE LOG FILE will write the alignment calculation scores to a file. The log
++filename is the same as the input sequence filename, with an extension .log
++appended.
++
++The alignment parameters can be set using the ALIGNMENT PARAMETERS menu,
++Pairwise Parameters, Multiple Parameters and Protein Gap Parameters options.
++These are EXACTLY the same parameters as used by the general, automatic
++multiple alignment procedure. The general multiple alignment procedure is
++simply a series of profile alignments. Carrying out a series of profile
++alignments on larger and larger groups of sequences, allows you to manually
++build up a complete alignment, if necessary editing intermediate alignments.
++
++<STRONG>
++SECONDARY STRUCTURE PARAMETERS
++</STRONG>
++
++Use this menu to set secondary structure options. If a solved structure is
++known, it can be used to guide the alignment by raising gap penalties within
++secondary structure elements, so that gaps will preferentially be inserted into
++unstructured surface loop regions. Alternatively, a user-specified gap penalty
++mask can be supplied for a similar purpose.
++
++A gap penalty mask is a series of numbers between 1 and 9, one per position in
++the alignment. Each number specifies how much the gap opening penalty is to be
++raised at that position (raised by multiplying the basic gap opening penalty
++by the number) i.e. a mask figure of 1 at a position means no change
++in gap opening penalty; a figure of 4 means that the gap opening penalty is
++four times greater at that position, making gaps 4 times harder to open.
++
++The format for gap penalty masks and secondary structure masks is explained in
++a separate help section.
++
++>>HELP B <<
++ Secondary Structure / Gap Penalty Masks
++
++The use of secondary structure-based penalties has been shown to improve the
++accuracy of sequence alignment. Clustal X now allows secondary structure/ gap
++penalty masks to be supplied with the input sequences used during profile
++alignment. (NB. The secondary structure information is NOT used during multiple
++sequence alignment). The masks work by raising gap penalties in specified
++regions (typically secondary structure elements) so that gaps are
++preferentially opened in the less well conserved regions (typically surface
++loops).
++
++The USE PROFILE 1(2) SECONDARY STRUCTURE / GAP PENALTY MASK options control
++whether the input 2D-structure information or gap penalty masks will be used
++during the profile alignment.
++
++The OUTPUT options control whether the secondary structure and gap penalty
++masks should be included in the Clustal X output alignments. Showing both is
++useful for understanding how the masks work. The 2D-structure information is
++itself useful in judging the alignment quality and in seeing how residue
++conservation patterns vary with secondary structure.
++
++The HELIX and STRAND GAP PENALTY options provide the value for raising the gap
++penalty at core Alpha Helical (A) and Beta Strand (B) residues. In CLUSTAL
++format, capital residues denote the A and B core structure notation. Basic gap
++penalties are multiplied by the amount specified.
++
++The LOOP GAP PENALTY option provides the value for the gap penalty in Loops.
++By default this penalty is not raised. In CLUSTAL format, loops are specified
++by "." in the secondary structure notation.
++
++The SECONDARY STRUCTURE TERMINAL PENALTY provides the value for setting the gap
++penalty at the ends of secondary structures. Ends of secondary structures are
++known to grow or shrink, comparing related structures. Therefore by default
++these are given intermediate values, lower than the core penalties. All
++secondary structure read in as lower case in CLUSTAL format gets the reduced
++terminal penalty.
++
++The HELIX and STRAND TERMINAL POSITIONS options specify the range of structure
++termini for the intermediate penalties. In the alignment output, these are
++indicated as lower case. For Alpha Helices, by default, the range spans the
++end-helical turn (3 residues). For Beta Strands, the default range spans the
++end residue and the adjacent loop residue, since sequence conservation often
++extends beyond the actual H-bonded Beta Strand.
++
++Clustal X can read the masks from SWISS-PROT, CLUSTAL or GDE format input
++files. For many 3-D protein structures, secondary structure information is
++recorded in the feature tables of SWISS-PROT database entries. You should
++always check that the assignments are correct - some are quite inaccurate.
++Clustal X looks for SWISS-PROT HELIX and STRAND assignments e.g.
++
++
++<PRE>
++FT HELIX 100 115
++FT STRAND 118 119
++</PRE>
++
++The structure and penalty masks can also be read from CLUSTAL alignment format
++as comment lines beginning "!SS_" or "!GM_" e.g.
++
++<PRE>
++!SS_HBA_HUMA ..aaaAAAAAAAAAAaaa.aaaAAAAAAAAAAaaaaaaAaaa.........aaaAAAAAA
++!GM_HBA_HUMA 112224444444444222122244444444442222224222111111111222444444
++HBA_HUMA VLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHGK
++</PRE>
++
++Note that the mask itself is a set of numbers between 1 and 9 each of which is
++assigned to the residue(s) in the same column below.
++
++In GDE flat file format, the masks are specified as text and the names must
++begin with "SS_" or "GM_".
++
++Either a structure or penalty mask or both may be used. If both are included
++in an alignment, the user will be asked which is to be used.
++
++
++>>HELP T <<
++ Phylogenetic Trees
++
++Before calculating a tree, you must have an ALIGNMENT in memory. This can be
++input using the FILE menu, LOAD SEQUENCES option or you should have just
++carried out a full multiple alignment and the alignment is still in memory.
++Remember YOU MUST ALIGN THE SEQUENCES FIRST!!!!
++
++The method used is the NJ (Neighbour Joining) method of Saitou and Nei. First
++you calculate distances (percent divergence) between all pairs of sequences from
++a multiple alignment; second you apply the NJ method to the distance matrix.
++
++To calculate a tree, use the DRAW N-J TREE option. This gives an UNROOTED tree
++and all branch lengths. The root of the tree can only be inferred by using an
++outgroup (a sequence that you are certain branches at the outside of the tree
++.... certain on biological grounds) OR if you assume a degree of constancy in
++the 'molecular clock', you can place the root in the 'middle' of the tree
++(roughly equidistant from all tips).
++
++BOOTSTRAP N-J TREE uses a method for deriving confidence values for the
++groupings in a tree (first adapted for trees by Joe Felsenstein). It involves
++making N random samples of sites from the alignment (N should be LARGE, e.g.
++500 - 1000); drawing N trees (1 from each sample) and counting how many times
++each grouping from the original tree occurs in the sample trees. You can set N
++using the NUMBER OF BOOTSTRAP TRIALS option in the BOOTSTRAP TREE window. In
++practice, you should use a large number of bootstrap replicates (1000 is
++recommended, even if it means running the program for an hour on a slow
++computer). You can also supply a seed number for the random number generator
++here. Different runs with the same seed will give the same answer. See the
++documentation for more details.
++
++EXCLUDE POSITIONS WITH GAPS? With this option, any alignment positions where
++ANY of the sequences have a gap will be ignored. This means that 'like' will
++be compared to 'like' in all distances, which is highly desirable. It also
++automatically throws away the most ambiguous parts of the alignment, which are
++concentrated around gaps (usually). The disadvantage is that you may throw away
++much of the data if there are many gaps (which is why it is difficult for us to
++make it the default).
++
++CORRECT FOR MULTIPLE SUBSTITUTIONS? For small divergence (say <10%) this option
++makes no difference. For greater divergence, this option corrects for the fact
++that observed distances underestimate actual evolutionary distances. This is
++because, as sequences diverge, more than one substitution will happen at many
++sites. However, you only see one difference when you look at the present day
++sequences. Therefore, this option has the effect of stretching branch lengths
++in trees (especially long branches). The corrections used here (for DNA or
++proteins) are both due to Motoo Kimura. See the documentation for details.
++
++Where possible, this option should be used. However, for VERY divergent
++sequences, the distances cannot be reliably corrected. You will be warned if
++this happens. Even if none of the distances in a data set exceed the reliable
++threshold, if you bootstrap the data, some of the bootstrap distances may
++randomly exceed the safe limit.
++
++SAVE LOG FILE will write the tree calculation scores to a file. The log
++filename is the same as the input sequence filename, with an extension .log
++appended.
++
++<H4>
++OUTPUT FORMAT OPTIONS
++</H4>
++
++Four different formats are allowed. None of these displays the tree visually.
++You can display the tree using the NJPLOT program distributed with Clustal X
++OR get the PHYLIP package and use the tree drawing facilities there.
++
++1) CLUSTAL FORMAT TREE. This format is verbose and lists all of the distances
++between the sequences and the number of alignment positions used for each. The
++tree is described at the end of the file. It lists the sequences that are
++joined at each alignment step and the branch lengths. After two sequences are
++joined, it is referred to later as a NODE. The number of a NODE is the number
++of the lowest sequence in that NODE.
++
++2) PHYLIP FORMAT TREE. This format is the New Hampshire format, used by many
++phylogenetic analysis packages. It consists of a series of nested parentheses,
++describing the branching order, with the sequence names and branch lengths. It
++can be read by the NJPLOT program distributed with ClustalX. It can also be
++used by the RETREE, DRAWGRAM and DRAWTREE programs of the PHYLIP package to see
++the trees graphically. This is the same format used during multiple alignment
++for the guide trees. Some other packages that can read and display New
++Hampshire format are TreeTool, TreeView, and Phylowin.
++
++3) PHYLIP DISTANCE MATRIX. This format just outputs a matrix of all the
++pairwise distances in a format that can be used by the PHYLIP package. It used
++to be useful when one could not produce distances from protein sequences in the
++Phylip package but is now redundant (PROTDIST of Phylip 3.5 now does this).
++
++4) NEXUS FORMAT TREE. This format is used by several popular phylogeny programs,
++including PAUP and MacClade. The format is described fully in:
++Maddison, D. R., D. L. Swofford and W. P. Maddison. 1997.
++NEXUS: an extensible file format for systematic information.
++Systematic Biology 46:590-621.
++
++BOOTSTRAP LABELS ON: By default, the bootstrap values are correctly placed on
++the tree branches of the phylip format output tree. The toggle allows them to
++be placed on the nodes, which is incorrect, but some display packages (e.g.
++TreeTool, TreeView and Phylowin) only support node labelling but not branch
++labelling. Care should be taken to note which branches and labels go together.
++
++
++>>HELP C <<
++ Colors
++
++Clustal X provides a versatile coloring scheme for the sequence alignment
++display. The sequences (or profiles) are colored automatically, when they are
++loaded. Sequences can be colored either by assigning a color to specific
++residues, or on the basis of an alignment consensus. In the latter case, the
++alignment consensus is calculated automatically, and the residues in each
++column are colored according to the consensus character assigned to that
++column. In this way, you can choose to highlight, for example, conserved
++hydrophilic or hydrophobic positions in the alignment.
++
++The 'rules' used to color the alignment are specified in a COLOR PARAMETER
++FILE. Clustal X automatically looks for a file called 'colprot.par' for protein
++sequences or 'coldna.par' for DNA, in the current directory. (If you are running
++under UNIX, it then looks in your home directory, and finally in the
++directories in your PATH environment variable).
++
++By default, if no color parameter file is found, protein sequences are colored
++by residue as follows:
++
++<PRE>
++ Color Residue Code
++
++ ORANGE GPST
++ RED HKR
++ BLUE FWY
++ GREEN ILMV
++</PRE>
++
++In the case of DNA sequences, the default colors are as follows:
++
++<PRE>
++ Color Residue Code
++
++ ORANGE A
++ RED C
++ BLUE T
++ GREEN G
++</PRE>
++
++
++The default BACKGROUND COLORING option shows the sequence residues using a
++black character on a colored background. It can be switched off to show
++residues as a colored character on a white background.
++
++Either BLACK AND WHITE or DEFAULT COLOR options can be selected. The Color
++option looks first for the color parameter file (as described above) and, if no
++file is found, uses the default residue-specific colors.
++
++You can specify your own coloring scheme by using the LOAD COLOR PARAMETER FILE
++option. The format of the color parameter file is described below.
++
++<H4>
++COLOR PARAMETER FILE
++</H4>
++
++This file is divided into 3 sections:
++
++1) the names and rgb values of the colors
++2) the rules for calculating the consensus
++3) the rules for assigning colors to the residues
++
++An example file is given here.
++
++<PRE>
++ --------------------------------------------------------------------
++@rgbindex
++RED 0.9 0.1 0.1
++BLUE 0.1 0.1 0.9
++GREEN 0.1 0.9 0.1
++YELLOW 0.9 0.9 0.0
++
++@consensus
++% = 60% w:l:v:i:m:a:f:c:y:h:p
++# = 80% w:l:v:i:m:a:f:c:y:h:p
++- = 50% e:d
+++ = 60% k:r
++q = 50% q:e
++p = 50% p
++n = 50% n
++t = 50% t:s
++
++@color
++g = RED
++p = YELLOW
++t = GREEN if t:%:#
++n = GREEN if n
++w = BLUE if %:#:p
++k = RED if +
++ --------------------------------------------------------------------
++</PRE>
++
++The first section is optional and is identified by the header @rgbindex. If
++this section exists, each color used in the file must be named and the rgb
++values specified (on a scale from 0 to 1). If the rgb index section is not
++found, the following set of hard-coded colors will be used.
++
++<PRE>
++RED 0.9 0.1 0.1
++BLUE 0.1 0.1 0.9
++GREEN 0.1 0.9 0.1
++ORANGE 0.9 0.7 0.3
++CYAN 0.1 0.9 0.9
++PINK 0.9 0.5 0.5
++MAGENTA 0.9 0.1 0.9
++YELLOW 0.9 0.9 0.0
++</PRE>
++
++The second section is optional and is identified by the header @consensus. It
++defines how the consensus is calculated.
++
++The format of each consensus parameter is:-
++
++<PRE>
++c = n% residue_list
++
++ where
++ c is a character used to identify the parameter.
++ n is an integer value used as the percentage cutoff
++ point.
++ residue_list is a list of residues denoted by a single
++ character, delimited by a colon (:).
++</PRE>
++
++For example: # = 60% w:l:v:i
++
++will assign a consensus character # to any column in the alignment which
++contains more than 60% of the residues w,l,v and i.
++
++
++The third section is identified by the header @color, and defines how colors
++are assigned to each residue in the alignment.
++
++The color parameters can take one of two formats:
++
++<PRE>
++1) r = color
++2) r = color if consensus_list
++
++ where
++ r is a character used to denote a residue.
++ color is one of the colors in the GDE color lookup table.
++ consensus_list is a list of consensus characters, each denoted by a
++ single character, delimited by a colon (:).
++</PRE>
++
++Examples:
++1) g = ORANGE
++
++will color all glycines ORANGE, regardless of the consensus.
++
++2) w = BLUE if w:%:#
++
++will color BLUE any tryptophan which is found in a column with a consensus of
++w, % or #.
++
++
++>>HELP Q <<
++ Alignment Quality Analysis
++
++<H3>
++QUALITY SCORES
++</H3>
++--------------
++
++Clustal X provides an indication of the quality of an alignment by plotting
++a 'conservation score' for each column of the alignment. A high score indicates
++a well-conserved column; a low score indicates low conservation. The quality
++curve is drawn below the alignment.
++
++Two methods are also provided to indicate single residues or sequence segments
++which score badly in the alignment.
++
++Low-scoring residues are expected to occur at a moderate frequency in all the
++sequences because of their steady divergence due to the natural processes of
++evolution. The most divergent sequences are likely to have the most outliers.
++However, the highlighted residues are especially useful in pointing to
++sequence misalignments. Note that clustering of highlighted residues is a
++strong indication of misalignment. This can arise due to various reasons, for
++example:
++
++ 1. Partial or total misalignments caused by a failure in the
++ alignment algorithm. Usually only in difficult alignment cases.
++
++ 2. Partial or total misalignments because at least one of the
++ sequences in the given set is partly or completely unrelated to the
++ other sequences. It is up to the user to check that the set of
++ sequences are alignable.
++
++ 3. Frameshift translation errors in a protein sequence causing local
++ mismatched regions to be heavily highlighted. These are surprisingly
++ common in database entries. If suspected, a 3-frame translation of
++ the source DNA needs to be examined.
++
++Occasionally, highlighted residues may point to regions of some biological
++significance. This might happen for example if a protein alignment contains a
++sequence which has acquired new functions relative to the main sequence set. It
++is important to exclude other explanations, such as error or the natural
++divergence of sequences, before invoking a biological explanation.
++
++
++<H3>
++LOW-SCORING SEGMENTS
++</H3>
++--------------------
++
++Unreliable regions in the alignment can be highlighted using the Low-Scoring
++Segments option. A sequence-weighted profile is used to indicate any segments
++in the sequences which score badly. Because the profile calculation may take
++some time, an option is provided to calculate LOW-SCORING SEGMENTS. The
++segment display can then be toggled on or off without having to repeat the
++time-consuming calculations.
++
++For details of the low-scoring segment calculation, see the CALCULATION section
++below.
++
++
++<H4>
++LOW-SCORING SEGMENT PARAMETERS
++</H4>
++------------------------------
++
++MINIMUM LENGTH OF SEGMENTS: short segments (or even single residues) can be
++hidden by increasing the minimum length of segments which will be displayed.
++
++DNA MARKING SCALE is used to remove less significant segments from the
++highlighted display. Increase the scale to display more segments; decrease the
++scale to remove the least significant.
++
++
++PROTEIN WEIGHT MATRIX: the scoring table which describes the similarity of each
++amino acid to each other. The matrix is used to calculate the sequence-
++weighted profile scores. There are four 'in-built' Log-Odds matrices offered:
++the Gonnet PAM 80, 120, 250, 350 matrices. A more stringent matrix which only
++gives a high score to identities and the most favoured conservative
++substitutions, may be more suitable when the sequences are closely related. For
++more divergent sequences, it is appropriate to use "softer" matrices which give
++a high score to many other frequent substitutions. This option automatically
++recalculates the low-scoring segments.
++
++
++DNA WEIGHT MATRIX: Two hard-coded matrices are available:
++
++1) IUB. This is the default scoring matrix used by BESTFIT for the comparison
++of nucleic acid sequences. X's and N's are treated as matches to any IUB
++ambiguity symbol. All matches score 1.0; all mismatches for IUB symbols score
++0.9.
++
++2) CLUSTALW(1.6). The previous system used by ClustalW, in which matches score
++1.0 and mismatches score 0. All matches for IUB symbols also score 0.
++
++A new matrix can be read from a file on disk, if the filename consists only
++of lower case characters. The values in the new weight matrix should be
++similarities and should be NEGATIVE for infrequent substitutions.
++
++INPUT FORMAT. The format used for a new matrix is the same as the BLAST
++program. Any lines beginning with a # character are assumed to be comments. The
++first non-comment line should contain a list of amino acids in any order, using
++the 1 letter code, followed by a * character. This should be followed by a
++square matrix of scores, with one row and one column for each amino acid. The
++last row and column of the matrix (corresponding to the * character) contain
++the minimum score over the whole matrix.
++
++<H4>
++QUALITY SCORE PARAMETERS
++</H4>
++------------------------
++
++You can customise the column 'quality scores' plotted underneath the alignment
++display using the following options.
++
++SCORE PLOT SCALE: this is a scalar value from 1 to 10, which can be used to
++change the scale of the quality score plot.
++
++RESIDUE EXCEPTION CUTOFF: this is a scalar value from 1 to 10, which can be
++used to change the number of residue exceptions which are highlighted in the
++alignment display. (For an explanation of this cutoff, see the CALCULATION OF
++RESIDUE EXCEPTIONS section below.)
++
++PROTEIN WEIGHT MATRIX: the scoring table which describes the similarity of
++each amino acid to each other.
++
++DNA WEIGHT MATRIX: two hard-coded matrices are available: IUB and CLUSTALW(1.6).
++
++For more information about the weight matrices, see the help above for
++the Low-scoring Segments Weight Matrix.
++
++For details of the quality score calculations, see the CALCULATION section
++below.
++
++
++<STRONG>
++SHOW LOW-SCORING SEGMENTS
++</STRONG>
++
++The low-scoring segment display can be toggled on or off. This option does not
++recalculate the profile scores.
++
++
++<STRONG>
++SHOW EXCEPTIONAL RESIDUES
++</STRONG>
++
++This option highlights individual residues which score badly in the alignment
++quality calculations. Residues which score exceptionally low are highlighted by
++using a white character on a grey background.
++
++<STRONG>
++SAVE QUALITY SCORES TO FILE
++</STRONG>
++
++The quality scores that are plotted underneath the alignment display can also
++be saved in a text file. Each column in the alignment is written on one line in
++the output file, with the value of the quality score at the end of the line.
++Only the sequences currently selected in the display are written to the file.
++One use for quality scores is to color residues in a protein structure by
++sequence conservation. In this way conserved surface residues can be
++highlighted to locate functional regions such as ligand-binding sites.
++
++
++<H3>
++CALCULATION OF QUALITY SCORES
++</H3>
++-----------------------------
++
++Suppose we have an alignment of m sequences of length n. Then, the alignment
++can be written as:
++
++<PRE>
++ A11 A12 A13 .......... A1n
++ A21 A22 A23 .......... A2n
++ .
++ .
++ Am1 Am2 Am3 .......... Amn
++</PRE>
++
++We also have a residue comparison matrix of size R where C(i,j) is the score
++for aligning residue i with residue j.
++
++We want to calculate a score for the conservation of the jth position in the
++alignment.
++
++To do this, we define an R-dimensional sequence space. For the jth position in
++the alignment, each sequence consists of a single residue which is assigned a
++point S in the space. S has R dimensions, and for sequence i, the rth dimension
++is defined as:
++
++<PRE>
++ Sr = C(r,Aij)
++</PRE>
++
++We then calculate a consensus value for the jth position in the alignment. This
++value X also has R dimensions, and the rth dimension is defined as:
++
++<PRE>
++ Xr = ( SUM (Fij * C(i,r)) ) / m
++ 1<=i<=R
++</PRE>
++
++where Fij is the count of residues i at position j in the alignment.
++
++Now we can calculate the distance Di between each sequence i and the consensus
++position X in the R-dimensional space.
++
++<PRE>
++ Di = SQRT ( SUM (Xr - Sr)(Xr - Sr) )
++ 1<=r<=R
++
++</PRE>
++
++The quality score for the jth position in the alignment is defined as the mean
++of the sequence distances Di.
++
++The score is normalised by multiplying by the percentage of sequences which
++have residues (and not gaps) at this position.
++
++<H3>
++CALCULATION OF RESIDUE EXCEPTIONS
++</H3>
++---------------------------------
++
++The jth residue of the ith sequence is considered as an exception if the
++distance Di of the sequence from the consensus value X is greater than (Upper
++Quartile + Inter Quartile Range * Cutoff). The value used as a cutoff for
++displaying exceptions can be set from the SCORE PARAMETERS menu. A high cutoff
++value will only display very significant exceptions; a low value will allow
++more, less significant, exceptions to be highlighted.
++
++(NB. Sequences which contain gaps at this position are not included in the
++exception calculation.)
++
++
++<H3>
++CALCULATION OF LOW-SCORING SEGMENTS
++</H3>
++-----------------------------------
++
++Suppose we have an alignment of m sequences of length n. Then, the alignment
++can be written as:
++
++<PRE>
++ A11 A12 A13 .......... A1n
++ A21 A22 A23 .......... A2n
++ .
++ .
++ Am1 Am2 Am3 .......... Amn
++</PRE>
++
++We also have a residue comparison matrix of size R where C(i,j) is the score
++for aligning residue i with residue j.
++
++We calculate sequence weights by building a neighbour-joining tree, in which
++branch lengths are proportional to divergence. Summing the branches by branch
++ownership provides the weights. See Thompson et al., CABIOS, 10, 19 (1994) and
++Henikoff et al., JMB, 243, 574 (1994).
++
++To find the low-scoring segments in a sequence Si, we build a weighted profile
++of the remaining sequences in the alignment. Suppose we find residue r at
++position j in the sequence; then the score for the jth position in the sequence
++is defined as
++
++<PRE>
++ Score(Si,j) = Profile(j,r) where Profile(j,r) is the profile score
++ for residue r at position j in the
++ alignment.
++</PRE>
++
++These residue scores are summed along the sequence in both forward and backward
++directions. If the sum of the scores is positive, then it is reset to zero.
++Segments which score negatively in both directions are considered as
++'low-scoring' and will be highlighted in the alignment display.
++
++
++>>HELP 9 <<
++ Command Line Parameters
++
++ DATA (sequences)
++
++-INFILE=file.ext :input sequences
++-PROFILE1=file.ext and -PROFILE2=file.ext :profiles (aligned sequences)
++
++
++ VERBS (do things)
++
++-OPTIONS :list the command line parameters
++-HELP or -CHECK :outline the command line parameters
++-ALIGN :do full multiple alignment
++-TREE :calculate NJ tree
++-BOOTSTRAP(=n) :bootstrap a NJ tree (n= number of bootstraps; def. = 1000)
++-CONVERT :output the input sequences in a different file format
++
++
++ PARAMETERS (set things)
++
++***General settings:****
++-INTERACTIVE :read command line, then enter normal interactive menus
++-QUICKTREE :use FAST algorithm for the alignment guide tree
++-TYPE= :PROTEIN or DNA sequences
++-NEGATIVE :protein alignment with negative values in matrix
++-OUTFILE= :sequence alignment file name
++-OUTPUT= :GCG, GDE, PHYLIP, PIR or NEXUS
++-OUTORDER= :INPUT or ALIGNED
++-CASE= :LOWER or UPPER (for GDE output only)
++-SEQNOS= :OFF or ON (for Clustal output only)
++
++
++***Fast Pairwise Alignments:***
++-KTUPLE=n :word size
++-TOPDIAGS=n :number of best diags.
++-WINDOW=n :window around best diags.
++-PAIRGAP=n :gap penalty
++-SCORE= :PERCENT or ABSOLUTE
++
++
++***Slow Pairwise Alignments:***
++-PWMATRIX= :Protein weight matrix=BLOSUM, PAM, GONNET, ID or filename
++-PWDNAMATRIX= :DNA weight matrix=IUB, CLUSTALW or filename
++-PWGAPOPEN=f :gap opening penalty
++-PWGAPEXT=f :gap extension penalty
++
++
++***Multiple Alignments:***
++-NEWTREE= :file for new guide tree
++-USETREE= :file for old guide tree
++-MATRIX= :Protein weight matrix=BLOSUM, PAM, GONNET, ID or filename
++-DNAMATRIX= :DNA weight matrix=IUB, CLUSTALW or filename
++-GAPOPEN=f :gap opening penalty
++-GAPEXT=f :gap extension penalty
++-ENDGAPS :no end gap separation pen.
++-GAPDIST=n :gap separation pen. range
++-NOPGAP :residue-specific gaps off
++-NOHGAP :hydrophilic gaps off
++-HGAPRESIDUES= :list hydrophilic res.
++-MAXDIV=n :% ident. for delay
++-TYPE= :PROTEIN or DNA
++-TRANSWEIGHT=f :transitions weighting
++
++
++***Profile Alignments:***
++-PROFILE :Merge two alignments by profile alignment
++-NEWTREE1= :file for new guide tree for profile1
++-NEWTREE2= :file for new guide tree for profile2
++-USETREE1= :file for old guide tree for profile1
++-USETREE2= :file for old guide tree for profile2
++
++
++***Sequence to Profile Alignments:***
++-SEQUENCES :Sequentially add profile2 sequences to profile1 alignment
++-NEWTREE= :file for new guide tree
++-USETREE= :file for old guide tree
++
++
++***Structure Alignments:***
++-NOSECSTR1 :do not use secondary structure/gap penalty mask for profile 1
++-NOSECSTR2 :do not use secondary structure/gap penalty mask for profile 2
++-SECSTROUT=STRUCTURE or MASK or BOTH or NONE :output in alignment file
++-HELIXGAP=n :gap penalty for helix core residues
++-STRANDGAP=n :gap penalty for strand core residues
++-LOOPGAP=n :gap penalty for loop regions
++-TERMINALGAP=n :gap penalty for structure termini
++-HELIXENDIN=n :number of residues inside helix to be treated as terminal
++-HELIXENDOUT=n :number of residues outside helix to be treated as terminal
++-STRANDENDIN=n :number of residues inside strand to be treated as terminal
++-STRANDENDOUT=n:number of residues outside strand to be treated as terminal
++
++
++***Trees:***
++-OUTPUTTREE=nj OR phylip OR dist OR nexus
++-SEED=n :seed number for bootstraps
++-KIMURA :use Kimura's correction
++-TOSSGAPS :ignore positions with gaps
++-BOOTLABELS=node OR branch :position of bootstrap values in tree display
++
++
++>>HELP R <<
++ References
++
++<STRONG>
++The ClustalX program is described in the manuscript:
++</STRONG>
++
++Thompson,J.D., Gibson,T.J., Plewniak,F., Jeanmougin,F. and Higgins,D.G. (1997)
++The ClustalX windows interface: flexible strategies for multiple sequence
++alignment aided by quality analysis tools. Nucleic Acids Research, 25:4876-4882.
++
++
++<STRONG>
++The ClustalW program is described in the manuscript:
++</STRONG>
++
++Thompson, J.D., Higgins, D.G. and Gibson, T.J. (1994) CLUSTAL W: improving the
++sensitivity of progressive multiple sequence alignment through sequence
++weighting, positions-specific gap penalties and weight matrix choice. Nucleic
++Acids Research, 22:4673-4680.
++
++
++<STRONG>
++The ClustalV program is described in the manuscript:
++</STRONG>
++
++Higgins,D.G., Bleasby,A.J. and Fuchs,R. (1992) CLUSTAL V: improved software for
++multiple sequence alignment. CABIOS 8,189-191.
++
++
++<STRONG>
++The original Clustal program is described in the manuscripts:
++</STRONG>
++
++Higgins,D.G. and Sharp,P.M. (1989) Fast and sensitive multiple sequence
++alignments on a microcomputer.
++CABIOS 5,151-153.
++
++Higgins,D.G. and Sharp,P.M. (1988) CLUSTAL: a package for performing multiple
++sequence alignment on a microcomputer. Gene 73,237-244.
++
++-------------------------------------------------------------------------------
++<STRONG>
++Some tips on using Clustal X:
++</STRONG>
++
++Jeanmougin,F., Thompson,J.D., Gouy,M., Higgins,D.G. and Gibson,T.J. (1998)
++Multiple sequence alignment with Clustal X. Trends Biochem Sci, 23, 403-5.
++
++<STRONG>
++Some tips on using Clustal W:
++</STRONG>
++
++Higgins, D. G., Thompson, J. D. and Gibson, T. J. (1996) Using CLUSTAL for
++multiple sequence alignments. Methods Enzymol., 266, 383-402.
++
++-------------------------------------------------------------------------------
++<STRONG>
++You can get the latest version of the ClustalX program by anonymous ftp to:
++</STRONG>
++
++ftp-igbmc.u-strasbg.fr
++ftp.embl-heidelberg.de
++ftp.ebi.ac.uk
++
++<STRONG>
++Or, have a look at the following WWW site:
++</STRONG>
++
++http://www-igbmc.u-strasbg.fr/BioInfo/
++
++
+ This is the on-line help file for Clustal X (version 1.83), using the NCBI
+ Vibrant Toolkit.
+
Added: trunk/packages/clustalw/trunk/debian/patches/interface.c.patch
===================================================================
--- trunk/packages/clustalw/trunk/debian/patches/interface.c.patch (rev 0)
+++ trunk/packages/clustalw/trunk/debian/patches/interface.c.patch 2007-08-12 15:08:40 UTC (rev 399)
@@ -0,0 +1,226 @@
+Index: clustalw-1.83/interface.c
+===================================================================
+--- clustalw-1.83.orig/interface.c
++++ clustalw-1.83/interface.c
+@@ -1223,8 +1223,7 @@
+ while(fgets(temp,MAXLINE+1,help_file)) {
+ if(strstr(temp, help_marker)){
+ if(usemenu) {
+- fprintf(stdout,"\n");
+- getstr("Press [RETURN] to continue",lin2);
++ getstr("\nPress [RETURN] to continue",MAXLINE+1,lin2);
+ }
+ fclose(help_file);
+ return;
+@@ -1235,8 +1234,7 @@
+ }
+ if(usemenu) {
+ if(nlines >= PAGE_LEN) {
+- fprintf(stdout,"\n");
+- getstr("Press [RETURN] to continue or X to stop",lin2);
++ getstr("\nPress [RETURN] to continue or X to stop",MAXLINE+1,lin2);
+ if(toupper(*lin2) == 'X') {
+ fclose(help_file);
+ return;
+@@ -1247,8 +1245,7 @@
+ }
+ }
+ if(usemenu) {
+- fprintf(stdout,"\n");
+- getstr("Press [RETURN] to continue",lin2);
++ getstr("\nPress [RETURN] to continue",MAXLINE+1,lin2);
+ }
+ fclose(help_file);
+ }
+@@ -1286,8 +1283,7 @@
+ fputs(temp,stdout);
+ ++nlines;
+ if(nlines >= PAGE_LEN) {
+- fprintf(stdout,"\n");
+- getstr("Press [RETURN] to continue or X to stop",lin2);
++ getstr("\nPress [RETURN] to continue or X to stop",MAXLINE+1,lin2);
+ if(toupper(*lin2) == 'X') {
+ fclose(file);
+ return;
+@@ -1297,8 +1293,7 @@
+ }
+ }
+ fclose(file);
+- fprintf(stdout,"\n");
+- getstr("Press [RETURN] to continue",lin2);
++ getstr("\nPress [RETURN] to continue",MAXLINE+1,lin2);
+ }
+
+
+@@ -1747,7 +1742,7 @@
+ FILE *infile;
+
+ if(usemenu)
+- getstr("Enter name of the matrix file",lin2);
++ getstr("Enter name of the matrix file",MAXLINE+1,lin2);
+ else
+ strcpy(lin2,str);
+
+@@ -1773,7 +1768,7 @@
+ FILE *infile;
+
+ if(usemenu)
+- getstr("Enter name of the matrix file",lin2);
++ getstr("Enter name of the matrix file",MAXLINE+1,lin2);
+ else
+ strcpy(lin2,str);
+
+@@ -2163,6 +2158,7 @@
+
+ { static char temp[FILENAMELEN+1];
+ static char local_prompt[MAXLINE];
++ static char local_prompt_tmp[MAXLINE+FILENAMELEN+1];
+ FILE * file_handle;
+
+ /* if (*file_name == EOS) {
+@@ -2174,17 +2170,17 @@
+ warning("Output file name is the same as input file.");
+ if (usemenu) {
+ strcpy(local_prompt,"\n\nEnter new name to avoid overwriting ");
+- strcat(local_prompt," [%s]: ");
+- fprintf(stdout,local_prompt,file_name);
+- gets(temp);
++ strcat(local_prompt," [%s]");
++ sprintf(local_prompt_tmp,local_prompt,file_name);
++ getstr(local_prompt_tmp,FILENAMELEN+1,temp);
+ if(*temp != EOS) strcpy(file_name,temp);
+ }
+ }
+ else if (usemenu) {
+ strcpy(local_prompt,prompt);
+- strcat(local_prompt," [%s]: ");
+- fprintf(stdout,local_prompt,file_name);
+- gets(temp);
++ strcat(local_prompt," [%s]");
++ sprintf(local_prompt_tmp,local_prompt,file_name);
++ getstr(local_prompt_tmp,FILENAMELEN+1,temp);
+ if(*temp != EOS) strcpy(file_name,temp);
+ }
+
+@@ -2260,7 +2256,7 @@
+ }
+ else {
+ if((tree = open_output_file(
+- "\nEnter name for new GUIDE TREE file ",path,
++ "\nEnter name for new GUIDE TREE file ",path,
+ phylip_name,"dnd")) == NULL) return;
+ }
+ }
+@@ -2327,6 +2323,7 @@
+ {
+ char path[FILENAMELEN+1];
+ char tree_name[FILENAMELEN+1],temp[MAXLINE+1];
++ char tmp_msg[MAXLINE+1+300];
+ Boolean use_tree;
+ FILE *tree;
+ sint i,j,count;
+@@ -2383,9 +2380,9 @@
+ if((tree=fopen(tree_name,"r"))!=NULL) {
+ #endif
+ if (usemenu)
+- fprintf(stdout,"\nUse the existing GUIDE TREE file, %s (y/n) ? [y]: ",
++ sprintf(tmp_msg,"\nUse the existing GUIDE TREE file, %s (y/n) ? [y]",
+ tree_name);
+- gets(temp);
++ getstr(tmp_msg,MAXLINE+1,temp);
+ if(*temp != 'n' && *temp != 'N') {
+ strcpy(phylip_name,tree_name);
+ use_tree = TRUE;
+@@ -2584,6 +2581,7 @@
+ void get_tree(char *phylip_name)
+ {
+ char path[FILENAMELEN+1],temp[MAXLINE+1];
++ char tmp_msg[FILENAMELEN+300];
+ sint count;
+
+ if(empty) {
+@@ -2615,9 +2613,9 @@
+ strcpy(phylip_name,path);
+ strcat(phylip_name,"dnd");
+
+- fprintf(stdout,"\nEnter a name for the guide tree file [%s]: ",
++ sprintf(tmp_msg,"\nEnter a name for the guide tree file [%s]",
+ phylip_name);
+- gets(temp);
++ getstr(tmp_msg,MAXLINE+1,temp);
+ if(*temp != EOS)
+ strcpy(phylip_name,temp);
+ }
+@@ -2685,6 +2683,8 @@
+ char path[FILENAMELEN+1];
+ char tree_name[FILENAMELEN+1];
+ char temp[MAXLINE+1];
++ char tmp_msg[FILENAMELEN+300];
++
+ Boolean use_tree1,use_tree2;
+ FILE *tree;
+ sint count,i,j,dscore;
+@@ -2717,9 +2717,9 @@
+ #else
+ if((tree=fopen(tree_name,"r"))!=NULL) {
+ #endif
+- fprintf(stdout,"\nUse the existing GUIDE TREE file for Profile 1, %s (y/n) ? [y]: ",
++ sprintf(tmp_msg,"\nUse the existing GUIDE TREE file for Profile 1, %s (y/n) ? [y]",
+ tree_name);
+- gets(temp);
++ getstr(tmp_msg,MAXLINE+1,temp);
+ if(*temp != 'n' && *temp != 'N') {
+ strcpy(p1_tree_name,tree_name);
+ use_tree1 = TRUE;
+@@ -2739,19 +2739,20 @@
+ strcpy(tree_name,path);
+ strcat(tree_name,"dnd");
+ #ifdef VMS
+- if((tree=fopen(tree_name,"r","rat=cr","rfm=var"))!=NULL) {
++ if((tree=fopen(tree_name,"r","rat=cr","rfm=var"))!=NULL)
+ #else
+- if((tree=fopen(tree_name,"r"))!=NULL) {
++ if((tree=fopen(tree_name,"r"))!=NULL)
+ #endif
+- fprintf(stdout,"\nUse the existing GUIDE TREE file for Profile 2, %s (y/n) ? [y]: ",
+- tree_name);
+- gets(temp);
+- if(*temp != 'n' && *temp != 'N') {
+- strcpy(p2_tree_name,tree_name);
+- use_tree2 = TRUE;
+- }
+- fclose(tree);
+- }
++ {
++ sprintf(tmp_msg,"\nUse the existing GUIDE TREE file for Profile 2, %s (y/n) ? [y]",
++ tree_name);
++ getstr(tmp_msg,MAXLINE+1,temp);
++ if(*temp != 'n' && *temp != 'N') {
++ strcpy(p2_tree_name,tree_name);
++ use_tree2 = TRUE;
++ }
++ fclose(tree);
++ }
+ }
+ else if (!usemenu && use_tree2_file) {
+ use_tree2 = TRUE;
+@@ -4194,6 +4195,7 @@
+ {
+ char parname[FILENAMELEN+1], temp[FILENAMELEN+1];
+ char path[FILENAMELEN+1];
++ char tmp_msg[FILENAMELEN+300];
+ FILE *parout;
+
+ get_path(seqname,path);
+@@ -4201,9 +4203,9 @@
+ strcat(parname,"par");
+
+ if(usemenu) {
+- fprintf(stdout,"\nEnter a name for the parameter output file [%s]: ",
++ sprintf(tmp_msg,"\nEnter a name for the parameter output file [%s]",
+ parname);
+- gets(temp);
++ getstr(tmp_msg,FILENAMELEN+1,temp);
+ if(*temp != EOS)
+ strcpy(parname,temp);
+ }
Added: trunk/packages/clustalw/trunk/debian/patches/makefile.patch
===================================================================
--- trunk/packages/clustalw/trunk/debian/patches/makefile.patch (rev 0)
+++ trunk/packages/clustalw/trunk/debian/patches/makefile.patch 2007-08-12 15:08:40 UTC (rev 399)
@@ -0,0 +1,101 @@
+Index: clustalw-1.83/makefile
+===================================================================
+--- clustalw-1.83.orig/makefile
++++ clustalw-1.83/makefile
+@@ -1,7 +1,15 @@
+-install: clustalx clustalw
+
+-clean:
+- rm *.o
++RM=/bin/rm -f
++
++BINDIR=$(DESTDIR)/usr/bin
++XBINDIR=$(DESTDIR)/usr/X11R6/bin
++DOCDIR=$(DESTDIR)/usr/share/doc/clustalw
++XDOCDIR=$(DESTDIR)/usr/share/doc/clustalx
++LIBDIR=$(DESTDIR)/usr/share/clustalw
++MANDIR=$(DESTDIR)/usr/share/man/man1
++XMANDIR=$(DESTDIR)/usr/X11R6/man/man1
++DOCS=clustalv.doc clustalw.doc clustalw.ms README_W
++XDOCS=README_X clustalx.html
+
+ OBJECTS = interface.o sequence.o showpair.o malign.o \
+ util.o trees.o gcgcheck.o prfalign.o pairalign.o \
+@@ -12,25 +20,36 @@
+
+ HEADERS = general.h clustalw.h
+
+-CC = cc
+-CFLAGS = -c -O
++CC = gcc
++CFLAGS = -c -O2
++
++MACHINE=$(shell uname -m)
++ifeq ("$(MACHINE)","alpha")
++ # -mieee is for the Alpha only: ClustalW divides by zero (yes, I know it's bad)
+ # and expects the processor to carry on. -mieee tells the Alpha to comply with
++ # the IEEE standard and to shut up about divisions by zero.
++ CFLAGS += -mieee
++endif
++
+ LFLAGS = -O -lm
+-NCBI_INC = /dec/biolo/ncbi/include
+-NCBI_LIB = /dec/biolo/ncbi/lib
+-CXFLAGS = -DWIN_MOTIF -I$(NCBI_INC)
+-LXFLAGS = -L$(NCBI_LIB) -lvibrant -lncbi -lpthread -lXm -lXmu -lXt -lX11 -lm
++NCBI_INC= /usr/include/ncbi
++NCBI_LIB= /usr/lib
++CXFLAGS = -DWIN_MOTIF -I$(NCBI_INC)
++LXFLAGS = -L/usr/X11R6/lib -lvibrant -lncbi -lpthread -lXm -lXmu -lXt -lX11 -lm
+
+-clustalw : $(OBJECTS) amenu.o clustalw.o
+- $(CC) -o $@ $(OBJECTS) amenu.o clustalw.o $(LFLAGS)
++all: clustalx clustalw
+
+-interface.o : interface.c $(HEADERS) param.h
+- $(CC) $(CFLAGS) $*.c
++machine:
++ echo $(MACHINE)
+
+-amenu.o : amenu.c $(HEADERS) param.h
+- $(CC) $(CFLAGS) $*.c
++clustalw : $(OBJECTS) $(XOBJECTS) amenu.o clustalw.o
++ $(CC) -o $@ -I$(NCBI_INC) $(OBJECTS) amenu.o clustalw.o $(LFLAGS)
+
+ clustalx : $(OBJECTS) $(XOBJECTS) clustalx.o
+- $(CC) -o $@ $(OBJECTS) $(XOBJECTS) clustalx.o $(LFLAGS) $(LXFLAGS)
++ $(CC) -o $@ -I$(NCBI_INC) $(OBJECTS) $(XOBJECTS) clustalx.o $(LFLAGS) $(LXFLAGS)
++
++clustalw.o : clustalw.c $(HEADERS)
++ $(CC) $(CFLAGS) $*.c
+
+ clustalx.o : clustalx.c $(HEADERS)
+ $(CC) $(CFLAGS) $(CXFLAGS) $*.c
+@@ -56,6 +75,25 @@
+ trees.o : trees.c $(HEADERS) dayhoff.h
+ $(CC) $(CFLAGS) $*.c
+
+-.c.o :
+- $(CC) $(CFLAGS) $?
++
++
++install: all
++ install -d $(BINDIR) $(XBINDIR) $(LIBDIR) $(DOCDIR)/examples $(MANDIR) $(XMANDIR) $(XDOCDIR)
++ install -m 0755 clustalw $(BINDIR)
++ install -m 0755 clustalx $(XBINDIR)
++ install -m 0644 clustalw_help clustalx_help $(LIBDIR)
++ install -m 0644 clustalw.1 $(MANDIR)
++ install -m 0644 clustalx.1 $(MANDIR)
++ install -m 0644 $(DOCS) $(DOCDIR)
++ install -m 0644 $(XDOCS) $(XDOCDIR)
++ cp -a -R tests.clustalw $(DOCDIR)/examples/tests
++
++.PHONY: clean distclean
++
++clean:
++ $(RM) *.o
++
++distclean: clean
++ $(RM) clustalw clustalx
++ cd tests.clustalw; make clean
+
Added: trunk/packages/clustalw/trunk/debian/patches/sequence.c.patch
===================================================================
--- trunk/packages/clustalw/trunk/debian/patches/sequence.c.patch (rev 0)
+++ trunk/packages/clustalw/trunk/debian/patches/sequence.c.patch 2007-08-12 15:08:40 UTC (rev 399)
@@ -0,0 +1,13 @@
+Index: clustalw-1.83/sequence.c
+===================================================================
+--- clustalw-1.83.orig/sequence.c
++++ clustalw-1.83/sequence.c
+@@ -924,7 +924,7 @@
+ static Boolean dnaflag1;
+
+ if(usemenu)
+- getstr("Enter the name of the sequence file",line);
++ getstr("Enter the name of the sequence file",FILENAMELEN+1,line);
+ else
+ strcpy(line,seqname);
+ if(*line == EOS) return -1;
Added: trunk/packages/clustalw/trunk/debian/patches/series
===================================================================
--- trunk/packages/clustalw/trunk/debian/patches/series (rev 0)
+++ trunk/packages/clustalw/trunk/debian/patches/series 2007-08-12 15:08:40 UTC (rev 399)
@@ -0,0 +1,11 @@
+amenu.c.patch
+clustal-help.patch
+clustalw.h.patch
+clustalx.html.patch
+interface.c.patch
+sequence.c.patch
+trees.c.patch
+util.c.patch
+makefile.patch
+clustalx_help.patch
+xmenu.c.patch
Added: trunk/packages/clustalw/trunk/debian/patches/trees.c.patch
===================================================================
--- trunk/packages/clustalw/trunk/debian/patches/trees.c.patch (rev 0)
+++ trunk/packages/clustalw/trunk/debian/patches/trees.c.patch 2007-08-12 15:08:40 UTC (rev 399)
@@ -0,0 +1,13 @@
+Index: clustalw-1.83/trees.c
+===================================================================
+--- clustalw-1.83.orig/trees.c
++++ clustalw-1.83/trees.c
+@@ -1497,7 +1497,7 @@
+ fprintf(stdout,"\n or 3) use the PHYLIP package.");
+ fprintf(stdout,"\n\n");
+ if (usemenu)
+- getstr("Press [RETURN] to continue",dummy);
++ getstr("Press [RETURN] to continue",10,dummy);
+ }
+
+
Added: trunk/packages/clustalw/trunk/debian/patches/util.c.patch
===================================================================
--- trunk/packages/clustalw/trunk/debian/patches/util.c.patch (rev 0)
+++ trunk/packages/clustalw/trunk/debian/patches/util.c.patch 2007-08-12 15:08:40 UTC (rev 399)
@@ -0,0 +1,52 @@
+Index: clustalw-1.83/util.c
+===================================================================
+--- clustalw-1.83.orig/util.c
++++ clustalw-1.83/util.c
+@@ -171,10 +171,18 @@
+ return str;
+ }
+
+-void getstr(char *instr,char *outstr)
++void getstr(char *instr, int n, char *outstr)
+ {
++ int sl;
+ fprintf(stdout,"%s: ",instr);
+- gets(outstr);
++ fgets(outstr,n,stdin);
++ /*
++ * modify outstr for compatibility with prior used (insecure) gets()
++ */
++ sl=strlen(outstr);
++ if(sl>0 && '\n'==outstr[sl-1]) {
++ outstr[sl-1]=0;
++ }
+ }
+
+ double getreal(char *instr,double minx,double maxx,double def)
+@@ -185,7 +193,7 @@
+
+ while(TRUE) {
+ fprintf(stdout,"%s (%.1f-%.1f) [%.1f]: ",instr,minx,maxx,def);
+- gets(line);
++ fgets(line,MAXLINE,stdin);
+ status=sscanf(line,"%f",&ret);
+ if(status == EOF) return def;
+ if(ret>maxx) {
+@@ -210,7 +218,7 @@
+ while(TRUE) {
+ fprintf(stdout,"%s (%d..%d) [%d]: ",
+ instr,(pint)minx,(pint)maxx,(pint)def);
+- gets(line);
++ fgets(line,MAXLINE,stdin);
+ status=sscanf(line,"%d",&ret);
+ if(status == EOF) return def;
+ if(ret>maxx) {
+@@ -230,7 +238,7 @@
+ {
+ char line[MAXLINE];
+
+- getstr("\n\nEnter system command",line);
++ getstr("\n\nEnter system command",MAXLINE,line);
+ if(*line != EOS)
+ system(line);
+ fprintf(stdout,"\n\n");
Added: trunk/packages/clustalw/trunk/debian/patches/xmenu.c.patch
===================================================================
--- trunk/packages/clustalw/trunk/debian/patches/xmenu.c.patch (rev 0)
+++ trunk/packages/clustalw/trunk/debian/patches/xmenu.c.patch 2007-08-12 15:08:40 UTC (rev 399)
@@ -0,0 +1,13 @@
+Index: xmenu.c
+===================================================================
+--- ./xmenu.c (révision 173)
++++ ./xmenu.c (révision 174)
+@@ -4411,7 +4411,7 @@
+ while(TRUE) {
+ if(fgets(temp,MAXLINE+1,fd) == NULL) {
+ if(!found_help)
+- error("No help found in help file");
++ error("No help found in help file [%s]",help_file);
+ fclose(fd);
+ return;
+ }
Modified: trunk/packages/clustalw/trunk/debian/rules
===================================================================
--- trunk/packages/clustalw/trunk/debian/rules 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/debian/rules 2007-08-12 15:08:40 UTC (rev 399)
@@ -3,7 +3,10 @@
# Uncomment this to turn on verbose mode.
#export DH_VERBOSE=1
-build: build-stamp
+include /usr/share/quilt/quilt.make
+
+
+build: patch build-stamp
build-stamp:
dh_testdir
@@ -11,7 +14,7 @@
touch build-stamp
-clean:
+clean: unpatch
dh_testdir
dh_testroot
rm -f build-stamp
Deleted: trunk/packages/clustalw/trunk/gcgcheck.c
===================================================================
--- trunk/packages/clustalw/trunk/gcgcheck.c 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/gcgcheck.c 2007-08-12 15:08:40 UTC (rev 399)
@@ -1,15 +0,0 @@
-#include <ctype.h> /* because of toupper() */
-int SeqGCGCheckSum(char *seq, int len);
-
-int SeqGCGCheckSum(char *seq, int len)
-{
- int i;
- long check;
-
- for( i=0, check=0; i< len; i++,seq++)
- check += ((i % 57)+1) * toupper(*seq);
-
- return(check % 10000);
-}
-
-
Deleted: trunk/packages/clustalw/trunk/general.h
===================================================================
--- trunk/packages/clustalw/trunk/general.h 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/general.h 2007-08-12 15:08:40 UTC (rev 399)
@@ -1,50 +0,0 @@
-/* General purpose header file - rf 12/90 */
-
-#ifndef _H_general
-#define _H_general
-
-
-
-/* Macintosh specific */
-#ifdef MAC /* rf 12/9/94 */
-
-#define const /* THINK C doesn't know about these identifiers */
-#define signed
-#define volatile
-#define int long
-#ifndef Boolean
-#define Boolean char
-#endif
-#define pint short /* cast ints in printf statements as pint */
-#define sint int /* cast ints for sequence lengths */
-#define lint int /* cast ints for profile scores */
-
-#else /* not Macintoshs */
-
-#define pint int /* cast ints in printf statements as pint */
-#define sint int /* cast ints for sequence lengths */
-#define lint int /* cast ints for profile scores */
-#ifndef Boolean
-#define Boolean char
-#endif
-
-#endif /* ifdef MAC */
-
-/* definitions for all machines */
-
-#undef TRUE /* Boolean values; first undef them, just in case */
-#undef FALSE
-#define TRUE 1
-#define FALSE 0
-
-#define EOS '\0' /* End-Of-String */
-#define MAXLINE 512 /* Max. line length */
-
-
-#ifdef VMS
-#define signed
-#endif
-
-
-#endif /* ifndef _H_general */
-
Deleted: trunk/packages/clustalw/trunk/globin.pep
===================================================================
--- trunk/packages/clustalw/trunk/globin.pep 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/globin.pep 2007-08-12 15:08:40 UTC (rev 399)
@@ -1,86 +0,0 @@
->P1;HBB_HUMAN
-Sw:Hbb_Human => HBB_HUMAN
- VHLTPEEKSA VTALWGKVNV DEVGGEALGR LLVVYPWTQR FFESFGDLST
- PDAVMGNPKV KAHGKKVLGA FSDGLAHLDN LKGTFATLSE LHCDKLHVDP
- ENFRLLGNVL VCVLAHHFGK EFTPPVQAAY QKVVAGVANA LAHKYH*
-C;ID HBB_HUMAN STANDARD; PRT; 146 AA.
-C;AC P02023;
-C;DT 21-JUL-1986 (REL. 01, CREATED)
-C;DT 21-JUL-1986 (REL. 01, LAST SEQUENCE UPDATE)
-C;DT 01-APR-1993 (REL. 25, LAST ANNOTATION UPDATE)
-C;DE HEMOGLOBIN BETA CHAIN. . . .
-
->P1;HBB_HORSE
-Sw:Hbb_Horse => HBB_HORSE
- VQLSGEEKAA VLALWDKVNE EEVGGEALGR LLVVYPWTQR FFDSFGDLSN
- PGAVMGNPKV KAHGKKVLHS FGEGVHHLDN LKGTFAALSE LHCDKLHVDP
- ENFRLLGNVL VVVLARHFGK DFTPELQASY QKVVAGVANA LAHKYH*
-C;ID HBB_HORSE STANDARD; PRT; 146 AA.
-C;AC P02062;
-C;DT 21-JUL-1986 (REL. 01, CREATED)
-C;DT 21-JUL-1986 (REL. 01, LAST SEQUENCE UPDATE)
-C;DT 01-MAR-1992 (REL. 21, LAST ANNOTATION UPDATE)
-C;DE HEMOGLOBIN BETA CHAIN. . . .
-
->P1;HBA_HUMAN
-Sw:Hba_Human => HBA_HUMAN
- VLSPADKTNV KAAWGKVGAH AGEYGAEALE RMFLSFPTTK TYFPHFDLSH
- GSAQVKGHGK KVADALTNAV AHVDDMPNAL SALSDLHAHK LRVDPVNFKL
- LSHCLLVTLA AHLPAEFTPA VHASLDKFLA SVSTVLTSKY R*
-C;ID HBA_HUMAN STANDARD; PRT; 141 AA.
-C;AC P01922;
-C;DT 21-JUL-1986 (REL. 01, CREATED)
-C;DT 21-JUL-1986 (REL. 01, LAST SEQUENCE UPDATE)
-C;DT 01-FEB-1994 (REL. 28, LAST ANNOTATION UPDATE)
-C;DE HEMOGLOBIN ALPHA CHAIN. . . .
-
->P1;HBA_HORSE
-Sw:Hba_Horse => HBA_HORSE
- VLSAADKTNV KAAWSKVGGH AGEYGAEALE RMFLGFPTTK TYFPHFDLSH
- GSAQVKAHGK KVGDALTLAV GHLDDLPGAL SNLSDLHAHK LRVDPVNFKL
- LSHCLLSTLA VHLPNDFTPA VHASLDKFLS SVSTVLTSKY R*
-C;ID HBA_HORSE STANDARD; PRT; 141 AA.
-C;AC P01958;
-C;DT 21-JUL-1986 (REL. 01, CREATED)
-C;DT 21-JUL-1986 (REL. 01, LAST SEQUENCE UPDATE)
-C;DT 01-MAR-1992 (REL. 21, LAST ANNOTATION UPDATE)
-C;DE HEMOGLOBIN ALPHA CHAINS (SLOW AND FAST). . . .
-
->P1;MYG_PHYCA
-Sw:Myg_Phyca => MYG_PHYCA
- VLSEGEWQLV LHVWAKVEAD VAGHGQDILI RLFKSHPETL EKFDRFKHLK
- TEAEMKASED LKKHGVTVLT ALGAILKKKG HHEAELKPLA QSHATKHKIP
- IKYLEFISEA IIHVLHSRHP GDFGADAQGA MNKALELFRK DIAAKYKELG
- YQG*
-C;ID MYG_PHYCA STANDARD; PRT; 153 AA.
-C;AC P02185;
-C;DT 21-JUL-1986 (REL. 01, CREATED)
-C;DT 21-JUL-1986 (REL. 01, LAST SEQUENCE UPDATE)
-C;DT 01-MAY-1992 (REL. 22, LAST ANNOTATION UPDATE)
-C;DE MYOGLOBIN. . . .
-
->P1;GLB5_PETMA
-Sw:Glb5_Petma => GLB5_PETMA
- PIVDTGSVAP LSAAEKTKIR SAWAPVYSTY ETSGVDILVK FFTSTPAAQE
- FFPKFKGLTT ADQLKKSADV RWHAERIINA VNDAVASMDD TEKMSMKLRD
- LSGKHAKSFQ VDPQYFKVLA AVIADTVAAG DAGFEKLMSM ICILLRSAY*
-C;ID GLB5_PETMA STANDARD; PRT; 149 AA.
-C;AC P02208;
-C;DT 21-JUL-1986 (REL. 01, CREATED)
-C;DT 21-JUL-1986 (REL. 01, LAST SEQUENCE UPDATE)
-C;DT 01-MAR-1992 (REL. 21, LAST ANNOTATION UPDATE)
-C;DE GLOBIN V. . . .
-
->P1;LGB2_LUPLU
-Sw:Lgb2_Luplu => LGB2_LUPLU
- GALTESQAAL VKSSWEEFNA NIPKHTHRFF ILVLEIAPAA KDLFSFLKGT
- SEVPQNNPEL QAHAGKVFKL VYEAAIQLQV TGVVVTDATL KNLGSVHVSK
- GVADAHFPVV KEAILKTIKE VVGAKWSEEL NSAWTIAYDE LAIVIKKEMN
- DAA*
-C;ID LGB2_LUPLU STANDARD; PRT; 153 AA.
-C;AC P02240;
-C;DT 21-JUL-1986 (REL. 01, CREATED)
-C;DT 01-NOV-1988 (REL. 09, LAST SEQUENCE UPDATE)
-C;DT 01-MAR-1992 (REL. 21, LAST ANNOTATION UPDATE)
-C;DE LEGHEMOGLOBIN II. . . .
-
Deleted: trunk/packages/clustalw/trunk/gon90.bla
===================================================================
--- trunk/packages/clustalw/trunk/gon90.bla 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/gon90.bla 2007-08-12 15:08:40 UTC (rev 399)
@@ -1,24 +0,0 @@
-#
-#
- C S T P A G N D E Q H R K M I L V F Y W *
- 15.10 -1.20-3.00 -8.50 -0.70 -5.60 -5.10 -8.60 -8.60 -7.00-3.90 -5.50 -8.10 -3.50 -4.80 -5.10 -1.80 -3.50 -2.80 -3.90 0.0
- -1.20 7.302.60 -1.10 1.50 -1.20 0.70 -1.10 -1.60 -1.20-2.10 -2.30 -1.70 -3.70 -5.70 -5.90 -3.90 -7.10 -4.30 -6.80 0.0
- -3.00 2.60 7.70 -1.70 -0.30 -4.70 -0.50 -2.00 -2.30 -1.50-2.10 -2.40 -1.20 -2.20 -2.40 -4.40 -0.70 -5.80 -5.30 -8.10 0.0
- -8.50 -1.10-1.70 11.20 -1.30 -5.30 -4.30 -3.80 -3.10 -2.10-4.00 -4.10 -3.30 -7.10 -6.90 -5.40 -5.30 -8.70 -7.10 -10.50 0.0
- -0.70 1.50 -0.30 -1.30 7.10 -0.80 -2.90 -2.70 -1.40 -1.90-3.50 -3.20 -2.70 -2.30 -3.80 -3.90 -0.40 -5.90 -6.00 -8.20 0.0
- -5.60 -1.20-4.70 -5.30 -0.80 9.50 -1.40 -2.30 -4.20 -4.00-4.80 -3.90 -4.50 -7.70 -10.60 -9.80 -8.00 -10.80-8.80 -7.60 0.0
- -5.10 0.70 -0.50 -4.30 -2.90 -1.40 9.30 2.30 -0.90 -0.401.10 -1.90 0.10 -5.70 -6.70 -7.40 -6.30 -7.00 -3.60 -8.10 0.0
- -8.60 -1.10-2.00 -3.80 -2.70 -2.30 2.30 9.30 3.30 -0.60-1.40 -3.90 -1.70 -8.10 -9.80 -10.10 -8.10 -10.40-6.40 -11.60 0.0
- -8.60 -1.60-2.30 -3.10 -1.40 -4.20 -0.90 3.30 8.40 2.40 -1.40 -1.90 0.70 -4.90 -6.80 -7.00 -4.70 -9.50 -6.90 -9.20 0.0
- -7.00 -1.20-1.50 -2.10 -1.90 -4.00 -0.40 -0.60 2.40 8.901.60 1.30 1.80 -1.80 -5.50 -3.60 -4.70 -6.50 -4.90 -5.80 0.0
- -3.90 -2.10-2.10 -4.00 -3.50 -4.80 1.10 -1.40 -1.40 1.60 12.30 -0.70 -0.90 -3.50 -5.90 -5.30 -6.00 -2.20 2.30 -3.80 0.0
- -5.50 -2.30-2.40 -4.10 -3.20 -3.90 -1.90 -3.90 -1.90 1.30 -0.70 9.30 3.50 -4.80 -6.50 -5.40 -5.50 -8.40 -4.60 -3.80 0.0
- -8.10 -1.70-1.20 -3.30 -2.70 -4.50 0.10 -1.70 0.70 1.80 -0.90 3.50 8.10 -3.40 -5.50 -5.40 -4.90 -8.30 -5.50 -8.30 0.0
- -3.50 -3.70-2.20 -7.10 -2.30 -7.70 -5.70 -8.10 -4.90 -1.80-3.50 -4.80 -3.40 11.10 2.70 3.20 0.40 0.60 -3.30 -4.10 0.0
- -4.80 -5.70-2.40 -6.90 -3.80 -10.60 -6.70 -9.80 -6.80 -5.50-5.90 -6.50 -5.50 2.70 8.20 2.40 4.20 -1.10 -4.20 -5.80 0.0
- -5.10 -5.90-4.40 -5.40 -3.90 -9.80 -7.40 -10.10 -7.00 -3.60-5.30 -5.40 -5.40 3.20 2.40 7.40 0.60 1.00 -3.10 -3.90 0.0
- -1.80 -3.90-0.70 -5.30 -0.40 -8.00 -6.30 -8.10 -4.70 -4.70-6.00 -5.50 -4.90 0.40 4.20 0.60 7.60 -2.70 -4.30 -7.30 0.0
- -3.50 -7.10-5.80 -8.70 -5.90 -10.80 -7.00 -10.40 -9.50 -6.50-2.20 -8.40 -8.30 0.60 -1.10 1.00 -2.70 11.105.10 2.00 0.0
- -2.80 -4.30-5.30 -7.10 -6.00 -8.80 -3.60 -6.40 -6.90 -4.902.30 -4.60 -5.50 -3.30 -4.20 -3.10 -4.30 5.10 12.00 2.60 0.0
- -3.90 -6.80-8.10 -10.50 -8.20 -7.60 -8.10 -11.60 -9.20 -5.80-3.80 -3.80 -8.30 -4.10 -5.80 -3.90 -7.30 2.00 2.60 17.10 0.0
-0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Deleted: trunk/packages/clustalw/trunk/interface.c
===================================================================
--- trunk/packages/clustalw/trunk/interface.c 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/interface.c 2007-08-12 15:08:40 UTC (rev 399)
@@ -1,4393 +0,0 @@
-/* command line interface for Clustal W */
-/* DES was here MARCH. 1994 */
-/* DES was here SEPT. 1994 */
-/* Fixed memory allocation bug in check_param() . Alan Bleasby Dec 2002 */
-
-#include <stdio.h>
-#include <string.h>
-#include <ctype.h>
-#include <stdlib.h>
-#include <signal.h>
-#include <setjmp.h>
-#include "clustalw.h"
-#include "param.h"
-
-/*
-* Prototypes
-*/
-
-#ifdef UNIX
-FILE *open_path(char *);
-#endif
-
-
-char *nameonly(char *s) ;
-
-static sint check_param(char **args,char *params[], char *param_arg[]);
-static void set_optional_param(void);
-static sint find_match(char *probe, char *list[], sint n);
-static void show_aln(void);
-static void create_parameter_output(void);
-static void reset_align(void);
-static void reset_prf1(void);
-static void reset_prf2(void);
-static void calc_gap_penalty_mask(int prf_length,char *struct_mask,char *gap_mask);
-void print_sec_struct_mask(int prf_length,char *mask,char *struct_mask);
-/*
-* Global variables
-*/
-
-extern sint max_names;
-
-extern Boolean interactive;
-
-extern double **tmat;
-extern float gap_open, gap_extend;
-extern float dna_gap_open, dna_gap_extend;
-extern float prot_gap_open, prot_gap_extend;
-extern float pw_go_penalty, pw_ge_penalty;
-extern float dna_pw_go_penalty, dna_pw_ge_penalty;
-extern float prot_pw_go_penalty, prot_pw_ge_penalty;
-extern char revision_level[];
-extern sint wind_gap,ktup,window,signif;
-extern sint dna_wind_gap, dna_ktup, dna_window, dna_signif;
-extern sint prot_wind_gap,prot_ktup,prot_window,prot_signif;
-extern sint boot_ntrials; /* number of bootstrap trials */
-extern sint nseqs;
-extern sint new_seq;
-extern sint *seqlen_array;
-extern sint divergence_cutoff;
-extern sint debug;
-extern Boolean no_weights;
-extern Boolean neg_matrix;
-extern Boolean quick_pairalign;
-extern Boolean reset_alignments_new; /* DES */
-extern Boolean reset_alignments_all; /* DES */
-extern sint gap_dist;
-extern Boolean no_hyd_penalties, no_pref_penalties;
-extern sint max_aa;
-extern sint gap_pos1, gap_pos2;
-extern sint max_aln_length;
-extern sint *output_index, output_order;
-extern sint profile_no;
-extern short usermat[], pw_usermat[];
-extern short aa_xref[], pw_aa_xref[];
-extern short userdnamat[], pw_userdnamat[];
-extern short dna_xref[], pw_dna_xref[];
-extern sint *seq_weight;
-
-extern Boolean lowercase; /* Flag for GDE output - set on comm. line*/
-extern Boolean cl_seq_numbers;
-
-extern Boolean seqRange; /*Ramu */
-
-extern Boolean output_clustal, output_nbrf, output_phylip, output_gcg, output_gde, output_nexus, output_fasta;
-extern Boolean output_tree_clustal, output_tree_phylip, output_tree_distances, output_tree_nexus;
-extern sint bootstrap_format;
-extern Boolean tossgaps, kimura;
-extern Boolean percent;
-extern Boolean explicit_dnaflag; /* Explicit setting of sequence type on comm.line*/
-extern Boolean usemenu;
-extern Boolean showaln, save_parameters;
-extern Boolean dnaflag;
-extern float transition_weight;
-extern unsigned sint boot_ran_seed;
-
-
-extern FILE *tree;
-extern FILE *clustal_outfile, *gcg_outfile, *nbrf_outfile, *phylip_outfile, *nexus_outfile;
-extern FILE *fasta_outfile; /* Ramu */
-extern FILE *gde_outfile;
-
-extern char hyd_residues[];
-extern char *amino_acid_codes;
-extern char **args;
-extern char seqname[];
-
-extern char **seq_array;
-extern char **names, **titles;
-
-extern char *gap_penalty_mask1,*gap_penalty_mask2;
-extern char *sec_struct_mask1,*sec_struct_mask2;
-extern sint struct_penalties,struct_penalties1,struct_penalties2;
-extern sint output_struct_penalties;
-extern Boolean use_ss1, use_ss2;
-extern char *ss_name1,*ss_name2;
-
-
-char *ss_name = NULL;
-char *sec_struct_mask = NULL;
-char *gap_penalty_mask = NULL;
-
-char profile1_name[FILENAMELEN+1];
-char profile2_name[FILENAMELEN+1];
-
-Boolean empty;
-Boolean profile1_empty, profile2_empty; /* whether or not profiles */
-
-char outfile_name[FILENAMELEN+1]="";
-
-static char clustal_outname[FILENAMELEN+1], gcg_outname[FILENAMELEN+1];
-static char phylip_outname[FILENAMELEN+1],nbrf_outname[FILENAMELEN+1];
-static char gde_outname[FILENAMELEN+1], nexus_outname[FILENAMELEN+1];
-static char fasta_outname[FILENAMELEN+1]; /* Ramu */
-char clustal_tree_name[FILENAMELEN+1]="";
-char dist_tree_name[FILENAMELEN+1]="";
-char phylip_tree_name[FILENAMELEN+1]="";
-char nexus_tree_name[FILENAMELEN+1]="";
-char p1_tree_name[FILENAMELEN+1]="";
-char p2_tree_name[FILENAMELEN+1]="";
-
-char pim_name[FILENAMELEN+1]=""; /* Ramu */
-
-static char *params[MAXARGS];
-static char *param_arg[MAXARGS];
-
-static char *cmd_line_type[] = {
- " ",
- "=n ",
- "=f ",
- "=string ",
- "=filename ",
- ""};
-
-static sint numparams;
-static Boolean check_tree = TRUE;
-
-sint profile1_nseqs; /* have been filled; the no. of seqs in prof 1*/
-Boolean use_tree_file = FALSE,new_tree_file = FALSE;
-Boolean use_tree1_file = FALSE, use_tree2_file = FALSE;
-Boolean new_tree1_file = FALSE, new_tree2_file = FALSE;
-
-static char *lin2;
-
-MatMenu dnamatrix_menu = {3,
- "IUB","iub",
- "CLUSTALW(1.6)","clustalw",
- "User defined",""
- };
-
-MatMenu matrix_menu = {5,
- "BLOSUM series","blosum",
- "PAM series","pam",
- "Gonnet series","gonnet",
- "Identity matrix","id",
- "User defined",""
- };
-
-MatMenu pw_matrix_menu = {5,
- "BLOSUM 30","blosum",
- "PAM 350","pam",
- "Gonnet 250","gonnet",
- "Identity matrix","id",
- "User defined",""
- };
-
-
-void init_interface(void)
-{
- empty=TRUE;
-
- profile1_empty = TRUE; /* */
- profile2_empty = TRUE; /* */
-
- lin2 = (char *)ckalloc( (MAXLINE+1) * sizeof (char) );
-
-}
-
-
-
-
-static sint check_param(char **args,char *params[], char *param_arg[])
-{
-
-/*
-#ifndef MAC
- char *strtok(char *s1, const char *s2);
-#endif
-*/
- sint len,i,j,k,s,n,match[MAXARGS];
- Boolean name1 = FALSE;
- sint ajb;
-
- if(args[0]==NULL) return;
-
-
-
- params[0]=(char *)ckalloc((strlen(args[0])+1)*sizeof(char));
- if (args[0][0]!=COMMANDSEP)
- {
- name1 = TRUE;
- strcpy(params[0],args[0]);
- }
- else
- strcpy(params[0],&args[0][1]);
-
- for (i=1;i<MAXARGS;i++) {
- if(args[i]==NULL) break;
- params[i]=(char *)ckalloc((strlen(args[i])+1)*sizeof(char));
- ajb=0;
- for(j=0;j<strlen(args[i])-1;j++)
- if(isprint(args[i][j+1])) params[i][ajb++]=args[i][j+1];
- params[i][ajb]='\0';
- }
-
- if (i==MAXARGS) {
- fprintf(stdout,"Error: too many command line arguments\n");
- return(-1);
- }
-/*
- special case - first parameter is input filename
- */
- s = 0;
- if(name1 == TRUE) {
- strcpy(seqname, params[0]);
- /* JULIE
- convert to lower case now
- */
-#ifndef UNIX
- for(k=0;k<(sint)strlen(params[0]);++k) seqname[k]=tolower(params[0][k]);
-#else
- for(k=0;k<(sint)strlen(params[0]);++k) seqname[k]=params[0][k];
-#endif
- s++;
- }
-
- n = i;
- for (i=s;i<n;i++) {
- param_arg[i] = NULL;
- len = (sint)strlen(params[i]);
- for(j=0; j<len; j++)
- if(params[i][j] == '=') {
- param_arg[i] = (char *)ckalloc((len-j) * sizeof(char));
- strncpy(param_arg[i],&params[i][j+1],len-j-1);
- params[i][j] = EOS;
- /* JULIE
- convert keywords to lower case now
- */
- for(k=0;k<j;++k) params[i][k]=tolower(params[i][k]);
- param_arg[i][len-j-1] = EOS;
- break;
- }
- }
-
- /*
- for each parameter given on the command line, first search the list of recognised optional
- parameters....
- */
-
- for (i=0;i<n;i++) {
- if ((i==0) && (name1 == TRUE)) continue;
- j = 0;
- match[i] = -1;
- for(;;) {
- if (cmd_line_para[j].str[0] == '\0') break;
- if (!strcmp(params[i],cmd_line_para[j].str)) {
- match[i] = j;
- *cmd_line_para[match[i]].flag = i;
- if ((cmd_line_para[match[i]].type != NOARG) &&
- (param_arg[i] == NULL)) {
- fprintf(stdout,
- "Error: parameter required for /%s\n",params[i]);
- exit(1);
- }
- /* JULIE
- convert parameters to lower case now, unless the parameter is a filename
- */
-#ifdef UNIX
- else if (cmd_line_para[match[i]].type != FILARG
- && param_arg[i] != NULL)
-#endif
- if (param_arg[i]!=0)
- {
- for(k=0;k<strlen(param_arg[i]);++k)
- param_arg[i][k]=tolower(param_arg[i][k]);
- }
- break;
- }
- j++;
- }
- }
- /*
- ....then the list of recognised input files,....
-*/
- for (i=0;i<n;i++) {
- if ((i==0) && (name1 == TRUE)) continue;
- if (match[i] != -1) continue;
- j = 0;
- for(;;) {
- if (cmd_line_file[j].str[0] == '\0') break;
- if (!strcmp(params[i],cmd_line_file[j].str)) {
- match[i] = j;
- *cmd_line_file[match[i]].flag = i;
- if ((cmd_line_file[match[i]].type != NOARG) &&
- (param_arg[i] == NULL)) {
- fprintf(stdout,
- "Error: parameter required for /%s\n",params[i]);
- exit(1);
- }
- break;
- }
- j++;
- }
- }
-/*
- ....and finally the recognised verbs.
-*/
- for (i=0;i<n;i++) {
- if ((i==0) && (name1 == TRUE)) continue;
- if (match[i] != -1) continue;
- j = 0;
- for(;;) {
- if (cmd_line_verb[j].str[0] == '\0') break;
- if (!strcmp(params[i],cmd_line_verb[j].str)) {
- match[i] = j;
- *cmd_line_verb[match[i]].flag = i;
- if ((cmd_line_verb[match[i]].type != NOARG) &&
- (param_arg[i] == NULL)) {
- fprintf(stdout,
- "Error: parameter required for /%s\n",params[i]);
- exit(1);
- }
- break;
- }
- j++;
- }
- }
-
-/*
- check for any unrecognised parameters.
-*/
- for (i=0;i<n;i++) {
- if (match[i] == -1) {
- fprintf(stdout,
- "Error: unknown option %c%s\n",COMMANDSEP,params[i]);
- exit(1);
- }
- }
- return(n);
-}
-
-static void set_optional_param(void)
-{
- int i,temp;
- int c;
- float ftemp;
- char tstr[100];
-
- /****************************************************************************/
- /* look for parameters on command line e.g. gap penalties, k-tuple etc. */
- /****************************************************************************/
-
- /*** ? /score=percent or /score=absolute */
- if(setscore != -1)
- if(strlen(param_arg[setscore]) > 0) {
- temp = find_match(param_arg[setscore],score_arg,2);
- if(temp == 0)
- percent = TRUE;
- else if(temp == 1)
- percent = FALSE;
- else
- fprintf(stdout,"\nUnknown SCORE type: %s\n",
- param_arg[setscore]);
- }
-
- /*** ? /seed=n */
- if(setseed != -1) {
- temp = 0;
- if(strlen(param_arg[setseed]) > 0)
- if (sscanf(param_arg[setseed],"%d",&temp)!=1) {
- fprintf(stdout,"Bad option for /seed (must be integer)\n");
- temp = 0;
- }
- if(temp > 0) boot_ran_seed = temp;
- fprintf(stdout,"\ntemp = %d; seed = %u;\n",(pint)temp,boot_ran_seed);
- }
-
-
-/*** ? /output=PIR, GCG, GDE or PHYLIP */
- if(setoutput != -1)
- if(strlen(param_arg[setoutput]) > 0) {
- temp = find_match(param_arg[setoutput],output_arg,6);
- if (temp >= 0 && temp <= 5) {
- output_clustal = FALSE;
- output_gcg = FALSE;
- output_phylip = FALSE;
- output_nbrf = FALSE;
- output_gde = FALSE;
- output_nexus = FALSE;
- output_fasta = FALSE;
- }
- switch (temp) {
- case 0: /* GCG */
- output_gcg = TRUE;
- break;
- case 1: /* GDE */
- output_gde = TRUE;
- break;
- case 2: /* PIR */
- output_nbrf = TRUE;
- break;
- case 3: /* PHYLIP */
- output_phylip = TRUE;
- break;
- case 4: /* NEXUS */
- output_nexus = TRUE;
- break;
- case 5: /* NEXUS */
- output_fasta = TRUE;
- break;
- default:
- fprintf(stdout,"\nUnknown OUTPUT type: %s\n",
- param_arg[setoutput]);
- }
- }
-
-/*** ? /outputtree=NJ or PHYLIP or DIST or NEXUS */
- if(setoutputtree != -1)
- if(strlen(param_arg[setoutputtree]) > 0) {
- temp = find_match(param_arg[setoutputtree],outputtree_arg,4);
- switch (temp) {
- case 0: /* NJ */
- output_tree_clustal = TRUE;
- break;
- case 1: /* PHYLIP */
- output_tree_phylip = TRUE;
- break;
- case 2: /* DIST */
- output_tree_distances = TRUE;
- break;
- case 3: /* NEXUS */
- output_tree_nexus = TRUE;
- break;
- default:
- fprintf(stdout,"\nUnknown OUTPUT TREE type: %s\n",
- param_arg[setoutputtree]);
- }
- }
-
-/*** ? /profile (sets type of second input file to profile) */
- if(setprofile != -1)
- profile_type = PROFILE;
-
- /*** ? /sequences (sets type of second input file to list of sequences) */
- if(setsequences != -1)
- profile_type = SEQUENCE;
-
-
-
- /*** ? /ktuple=n */
- if(setktuple != -1) {
- temp = 0;
- if(strlen(param_arg[setktuple]) > 0)
- if (sscanf(param_arg[setktuple],"%d",&temp)!=1) {
- fprintf(stdout,"Bad option for /ktuple (must be integer)\n");
- temp = 0;
- }
- if(temp > 0) {
- if(dnaflag) {
- if(temp <= 4) {
- ktup = temp;
- dna_ktup = ktup;
- wind_gap = ktup + 4;
- dna_wind_gap = wind_gap;
- }
- }
- else {
- if(temp <= 2) {
- ktup = temp;
- prot_ktup = ktup;
- wind_gap = ktup + 3;
- prot_wind_gap = wind_gap;
- }
- }
- }
- }
-
- /*** ? /pairgap=n */
- if(setpairgap != -1) {
- temp = 0;
- if(strlen(param_arg[setpairgap]) > 0)
- if (sscanf(param_arg[setpairgap],"%d",&temp)!=1) {
- fprintf(stdout,"Bad option for /pairgap (must be integer)\n");
- temp = 0;
- }
- if(temp > 0)
- if(dnaflag) {
- if(temp > ktup) {
- wind_gap = temp;
- dna_wind_gap = wind_gap;
- }
- }
- else {
- if(temp > ktup) {
- wind_gap = temp;
- prot_wind_gap = wind_gap;
- }
- }
- }
-
-
-/*** ? /topdiags=n */
- if(settopdiags != -1) {
- temp = 0;
- if(strlen(param_arg[settopdiags]) > 0)
- if (sscanf(param_arg[settopdiags],"%d",&temp)!=1) {
- fprintf(stdout,"Bad option for /topdiags (must be integer)\n");
- temp = 0;
- }
- if(temp > 0)
- if(dnaflag) {
- if(temp > ktup) {
- signif = temp;
- dna_signif = signif;
- }
- }
- else {
- if(temp > ktup) {
- signif = temp;
- prot_signif = signif;
- }
- }
- }
-
-
-/*** ? /window=n */
- if(setwindow != -1) {
- temp = 0;
- if(strlen(param_arg[setwindow]) > 0)
- if (sscanf(param_arg[setwindow],"%d",&temp)!=1) {
- fprintf(stdout,"Bad option for /window (must be integer)\n");
- temp = 0;
- }
- if(temp > 0)
- if(dnaflag) {
- if(temp > ktup) {
- window = temp;
- dna_window = window;
- }
- }
- else {
- if(temp > ktup) {
- window = temp;
- prot_window = window;
- }
- }
- }
-
-/*** ? /kimura */
- if(setkimura != -1)
- kimura = TRUE;
-
- /*** ? /tossgaps */
- if(settossgaps != -1)
- tossgaps = TRUE;
-
-
- /*** ? /negative */
- if(setnegative != -1)
- neg_matrix = TRUE;
-
- /*** ? /noweights */
- if(setnoweights!= -1)
- no_weights = TRUE;
-
-
- /*** ? /pwmatrix=ID (user's file) */
- if(setpwmatrix != -1)
- {
- temp=strlen(param_arg[setpwmatrix]);
- if(temp > 0) {
- for(i=0;i<temp;i++)
- if (isupper(param_arg[setpwmatrix][i]))
- tstr[i]=tolower(param_arg[setpwmatrix][i]);
- else
- tstr[i]=param_arg[setpwmatrix][i];
- tstr[i]='\0';
- if (strcmp(tstr,"blosum")==0) {
- strcpy(pw_mtrxname, tstr);
- pw_matnum = 1;
- }
- else if (strcmp(tstr,"pam")==0) {
- strcpy(pw_mtrxname, tstr);
- pw_matnum = 2;
- }
- else if (strcmp(tstr,"gonnet")==0) {
- strcpy(pw_mtrxname, tstr);
- pw_matnum = 3;
- }
- else if (strcmp(tstr,"id")==0) {
- strcpy(pw_mtrxname, tstr);
- pw_matnum = 4;
- }
- else {
- if(user_mat(param_arg[setpwmatrix], pw_usermat, pw_aa_xref))
- {
- strcpy(pw_mtrxname,param_arg[setpwmatrix]);
- strcpy(pw_usermtrxname,param_arg[setpwmatrix]);
- pw_matnum=5;
- }
- else exit(1);
- }
-
- }
- }
-
-/*** ? /matrix=ID (user's file) */
- if(setmatrix != -1)
- {
- temp=strlen(param_arg[setmatrix]);
- if(temp > 0) {
- for(i=0;i<temp;i++)
- if (isupper(param_arg[setmatrix][i]))
- tstr[i]=tolower(param_arg[setmatrix][i]);
- else
- tstr[i]=param_arg[setmatrix][i];
- tstr[i]='\0';
- if (strcmp(tstr,"blosum")==0) {
- strcpy(mtrxname, tstr);
- matnum = 1;
- }
- else if (strcmp(tstr,"pam")==0) {
- strcpy(mtrxname, tstr);
- matnum = 2;
- }
- else if (strcmp(tstr,"gonnet")==0) {
- strcpy(mtrxname, tstr);
- matnum = 3;
- }
- else if (strcmp(tstr,"id")==0) {
- strcpy(mtrxname, tstr);
- matnum = 4;
- }
- else {
- if(user_mat_series(param_arg[setmatrix], usermat, aa_xref))
- {
- strcpy(mtrxname,param_arg[setmatrix]);
- strcpy(usermtrxname,param_arg[setmatrix]);
- matnum=5;
- }
- else exit(1);
- }
-
- }
- }
-
-/*** ? /pwdnamatrix=ID (user's file) */
- if(setpwdnamatrix != -1)
- {
- temp=strlen(param_arg[setpwdnamatrix]);
- if(temp > 0) {
- for(i=0;i<temp;i++)
- if (isupper(param_arg[setpwdnamatrix][i]))
- tstr[i]=tolower(param_arg[setpwdnamatrix][i]);
- else
- tstr[i]=param_arg[setpwdnamatrix][i];
- tstr[i]='\0';
- if (strcmp(tstr,"iub")==0) {
- strcpy(pw_dnamtrxname, tstr);
- pw_dnamatnum = 1;
- }
- else if (strcmp(tstr,"clustalw")==0) {
- strcpy(pw_dnamtrxname, tstr);
- pw_dnamatnum = 2;
- }
- else {
- if(user_mat(param_arg[setpwdnamatrix], pw_userdnamat, pw_dna_xref))
- {
- strcpy(pw_dnamtrxname,param_arg[setpwdnamatrix]);
- strcpy(pw_dnausermtrxname,param_arg[setpwdnamatrix]);
- pw_dnamatnum=3;
- }
- else exit(1);
- }
-
- }
- }
-
-/*** ? /matrix=ID (user's file) */
- if(setdnamatrix != -1)
- {
- temp=strlen(param_arg[setdnamatrix]);
- if(temp > 0) {
- for(i=0;i<temp;i++)
- if (isupper(param_arg[setdnamatrix][i]))
- tstr[i]=tolower(param_arg[setdnamatrix][i]);
- else
- tstr[i]=param_arg[setdnamatrix][i];
- tstr[i]='\0';
- if (strcmp(tstr,"iub")==0) {
- strcpy(dnamtrxname, tstr);
- dnamatnum = 1;
- }
- else if (strcmp(tstr,"clustalw")==0) {
- strcpy(dnamtrxname, tstr);
- dnamatnum = 2;
- }
- else {
- if(user_mat(param_arg[setdnamatrix], userdnamat, dna_xref))
- {
- strcpy(dnamtrxname,param_arg[setdnamatrix]);
- strcpy(dnausermtrxname,param_arg[setdnamatrix]);
- dnamatnum=3;
- }
- else exit(1);
- }
-
- }
- }
-/*** ? /maxdiv= n */
- if(setmaxdiv != -1) {
- temp = 0;
- if(strlen(param_arg[setmaxdiv]) > 0)
- if (sscanf(param_arg[setmaxdiv],"%d",&temp)!=1) {
- fprintf(stdout,"Bad option for /maxdiv (must be integer)\n");
- temp = 0;
- }
- if (temp >= 0)
- divergence_cutoff = temp;
- }
-
-/*** ? /gapdist= n */
- if(setgapdist != -1) {
- temp = 0;
- if(strlen(param_arg[setgapdist]) > 0)
- if (sscanf(param_arg[setgapdist],"%d",&temp)!=1) {
- fprintf(stdout,"Bad option for /gapdist (must be integer)\n");
- temp = 0;
- }
- if (temp >= 0)
- gap_dist = temp;
- }
-
-/*** ? /debug= n */
- if(setdebug != -1) {
- temp = 0;
- if(strlen(param_arg[setdebug]) > 0)
- if (sscanf(param_arg[setdebug],"%d",&temp)!=1) {
- fprintf(stdout,"Bad option for /debug (must be integer)\n");
- temp = 0;
- }
- if (temp >= 0)
- debug = temp;
- }
-
-/*** ? /outfile= (user's file) */
- if(setoutfile != -1)
- if(strlen(param_arg[setoutfile]) > 0) {
- strcpy(outfile_name, param_arg[setoutfile]);
- }
-
-/*** ? /case= lower/upper */
- if(setcase != -1)
- if(strlen(param_arg[setcase]) > 0) {
- temp = find_match(param_arg[setcase],case_arg,2);
- if(temp == 0) {
- lowercase = TRUE;
- }
- else if(temp == 1) {
- lowercase = FALSE;
- }
- else
- fprintf(stdout,"\nUnknown case %s\n",
- param_arg[setcase]);
- }
-
-/*** ? /seqnos=off/on */
- if(setseqno != -1)
- if(strlen(param_arg[setseqno]) > 0) {
- temp = find_match(param_arg[setseqno],seqno_arg,2);
- if(temp == 0) {
- cl_seq_numbers = FALSE;
- }
- else if(temp == 1) {
- cl_seq_numbers = TRUE;
- }
- else
- fprintf(stdout,"\nUnknown SEQNO option %s\n",
- param_arg[setseqno]);
- }
-
-
-
- if(setseqno_range != -1)
- if(strlen(param_arg[setseqno_range]) > 0) {
- temp = find_match(param_arg[setseqno_range],seqno_range_arg,2);
- printf("\n comparing ");
- printf("\nparam_arg[setseqno_range]= %s", param_arg[setseqno_range]);
- /* printf("\nseqno_range_arg = %s ",seqno_range_arg); */
- printf("\n comparing \n ");
-
- if(temp == 0) {
- seqRange = FALSE;
- }
- else if(temp == 1) {
- seqRange = TRUE;
-
- }
- else
- fprintf(stdout,"\nUnknown Sequence range option %s\n",
- param_arg[setseqno_range]);
- }
-
-
-/*** ? /range=n:m */
- if(setrange != -1) {
- temp = 0;
- if(strlen(param_arg[setrange]) > 0)
- if (sscanf(param_arg[setrange],"%d:%d",&temp,&temp)!=2) {
- fprintf(stdout,"setrange: Syntax Error: Cannot set range, should be from:to \n");
- temp = 0;
- }
- }
-
-/*** ? /range=n:m */
-
-
-
-/*** ? /gapopen=n */
- if(setgapopen != -1) {
- ftemp = 0.0;
- if(strlen(param_arg[setgapopen]) > 0)
- if (sscanf(param_arg[setgapopen],"%f",&ftemp)!=1) {
- fprintf(stdout,"Bad option for /gapopen (must be real number)\n");
- ftemp = 0.0;
- }
- if(ftemp >= 0.0)
- if(dnaflag) {
- gap_open = ftemp;
- dna_gap_open = gap_open;
- }
- else {
- gap_open = ftemp;
- prot_gap_open = gap_open;
- }
- }
-
-
-/*** ? /gapext=n */
- if(setgapext != -1) {
- ftemp = 0.0;
- if(strlen(param_arg[setgapext]) > 0)
- if (sscanf(param_arg[setgapext],"%f",&ftemp)!=1) {
- fprintf(stdout,"Bad option for /gapext (must be real number)\n");
- ftemp = 0.0;
- }
- if(ftemp >= 0)
- if(dnaflag) {
- gap_extend = ftemp;
- dna_gap_extend = gap_extend;
- }
- else {
- gap_extend = ftemp;
- prot_gap_extend = gap_extend;
- }
- }
-
-/*** ? /transweight=n*/
- if(settransweight != -1) {
- ftemp = 0.0;
- if(strlen(param_arg[settransweight]) > 0)
- if (sscanf(param_arg[settransweight],"%f",&ftemp)!=1) {
- fprintf(stdout,"Bad option for /transweight (must be real number)\n");
- ftemp = 0.0;
- }
- transition_weight=ftemp;
- }
-
-/*** ? /pwgapopen=n */
- if(setpwgapopen != -1) {
- ftemp = 0.0;
- if(strlen(param_arg[setpwgapopen]) > 0)
- if (sscanf(param_arg[setpwgapopen],"%f",&ftemp)!=1) {
- fprintf(stdout,"Bad option for /pwgapopen (must be real number)\n");
- ftemp = 0.0;
- }
- if(ftemp >= 0.0)
- if(dnaflag) {
- pw_go_penalty = ftemp;
- dna_pw_go_penalty = pw_go_penalty;
- }
- else {
- pw_go_penalty = ftemp;
- prot_pw_go_penalty = pw_go_penalty;
- }
- }
-
-
-/*** ? /gapext=n */
- if(setpwgapext != -1) {
- ftemp = 0.0;
- if(strlen(param_arg[setpwgapext]) > 0)
- if (sscanf(param_arg[setpwgapext],"%f",&ftemp)!=1) {
- fprintf(stdout,"Bad option for /pwgapext (must be real number)\n");
- ftemp = 0.0;
- }
- if(ftemp >= 0)
- if(dnaflag) {
- pw_ge_penalty = ftemp;
- dna_pw_ge_penalty = pw_ge_penalty;
- }
- else {
- pw_ge_penalty = ftemp;
- prot_pw_ge_penalty = pw_ge_penalty;
- }
- }
-
-
-
-/*** ? /outorder=n */
- if(setoutorder != -1) {
- if(strlen(param_arg[setoutorder]) > 0)
- temp = find_match(param_arg[setoutorder],outorder_arg,2);
- if(temp == 0) {
- output_order = INPUT;
- }
- else if(temp == 1) {
- output_order = ALIGNED;
- }
- else
- fprintf(stdout,"\nUnknown OUTPUT ORDER type %s\n",
- param_arg[setoutorder]);
- }
-
-/*** ? /bootlabels=n */
- if(setbootlabels != -1) {
- if(strlen(param_arg[setbootlabels]) > 0)
- temp = find_match(param_arg[setbootlabels],bootlabels_arg,2);
- if(temp == 0) {
- bootstrap_format = BS_NODE_LABELS;
- }
- else if(temp == 1) {
- bootstrap_format = BS_BRANCH_LABELS;
- }
- else
- fprintf(stdout,"\nUnknown bootlabels type %s\n",
- param_arg[setoutorder]);
- }
-
-/*** ? /endgaps */
- if(setuseendgaps != -1)
- use_endgaps = FALSE;
-
-/*** ? /nopgap */
- if(setnopgap != -1)
- no_pref_penalties = TRUE;
-
-/*** ? /nohgap */
- if(setnohgap != -1)
- no_hyd_penalties = TRUE;
-
-/*** ? /novgap */
- if(setnovgap != -1)
- no_var_penalties = FALSE;
-
-/*** ? /hgapresidues="string" */
- if(sethgapres != -1)
- if(strlen(param_arg[sethgapres]) > 0) {
- for (i=0;i<strlen(hyd_residues) && i<26;i++) {
- c = param_arg[sethgapres][i];
- if (isalpha(c))
- hyd_residues[i] = (char)toupper(c);
- else
- break;
- }
- }
-
-
-/*** ? /nosecstr1 */
- if(setsecstr1 != -1)
- use_ss1 = FALSE;
-
-/*** ? /nosecstr2 */
- if(setsecstr2 != -1)
- use_ss2 = FALSE;
-
-/*** ? /secstroutput */
- if(setsecstroutput != -1)
- if(strlen(param_arg[setsecstroutput]) > 0) {
- temp = find_match(param_arg[setsecstroutput],outputsecstr_arg,4);
- if(temp >= 0 && temp <= 3)
- output_struct_penalties = temp;
- else
- fprintf(stdout,"\nUnknown case %s\n",
- param_arg[setsecstroutput]);
- }
-
-
-/*** ? /helixgap= n */
- if(sethelixgap != -1) {
- temp = 0;
- if(strlen(param_arg[sethelixgap]) > 0)
- if (sscanf(param_arg[sethelixgap],"%d",&temp)!=1) {
- fprintf(stdout,"Bad option for /helixgap (must be integer)\n");
- temp = 0;
- }
- if (temp >= 1 && temp <= 9)
- helix_penalty = temp;
- }
-
-/*** ? /strandgap= n */
- if(setstrandgap != -1) {
- temp = 0;
- if(strlen(param_arg[setstrandgap]) > 0)
- if (sscanf(param_arg[setstrandgap],"%d",&temp)!=1) {
- fprintf(stdout,"Bad option for /strandgap (must be integer)\n");
- temp = 0;
- }
- if (temp >= 1 && temp <= 9)
- strand_penalty = temp;
- }
-
-/*** ? /loopgap= n */
- if(setloopgap != -1) {
- temp = 0;
- if(strlen(param_arg[setloopgap]) > 0)
- if (sscanf(param_arg[setloopgap],"%d",&temp)!=1) {
- fprintf(stdout,"Bad option for /loopgap (must be integer)\n");
- temp = 0;
- }
- if (temp >= 1 && temp <= 9)
- loop_penalty = temp;
- }
-
-/*** ? /terminalgap= n */
- if(setterminalgap != -1) {
- temp = 0;
- if(strlen(param_arg[setterminalgap]) > 0)
- if (sscanf(param_arg[setterminalgap],"%d",&temp)!=1) {
- fprintf(stdout,"Bad option for /terminalgap (must be integer)\n");
- temp = 0;
- }
- if (temp >= 1 && temp <= 9) {
- helix_end_penalty = temp;
- strand_end_penalty = temp;
- }
- }
-
-/*** ? /helixendin= n */
- if(sethelixendin != -1) {
- temp = 0;
- if(strlen(param_arg[sethelixendin]) > 0)
- if (sscanf(param_arg[sethelixendin],"%d",&temp)!=1) {
- fprintf(stdout,"Bad option for /helixendin (must be integer)\n");
- temp = 0;
- }
- if (temp >= 0 && temp <= 3)
- helix_end_minus = temp;
- }
-
-/*** ? /helixendout= n */
- if(sethelixendout != -1) {
- temp = 0;
- if(strlen(param_arg[sethelixendout]) > 0)
- if (sscanf(param_arg[sethelixendout],"%d",&temp)!=1) {
- fprintf(stdout,"Bad option for /helixendout (must be integer)\n");
- temp = 0;
- }
- if (temp >= 0 && temp <= 3)
- helix_end_plus = temp;
- }
-
-/*** ? /strandendin= n */
- if(setstrandendin != -1) {
- temp = 0;
- if(strlen(param_arg[setstrandendin]) > 0)
- if (sscanf(param_arg[setstrandendin],"%d",&temp)!=1) {
- fprintf(stdout,"Bad option for /strandendin (must be integer)\n");
- temp = 0;
- }
- if (temp >= 0 && temp <= 3)
- strand_end_minus = temp;
- }
-
-/*** ? /strandendout= n */
- if(setstrandendout != -1) {
- temp = 0;
- if(strlen(param_arg[setstrandendout]) > 0)
- if (sscanf(param_arg[setstrandendout],"%d",&temp)!=1) {
- fprintf(stdout,"Bad option for /strandendout (must be integer)\n");
- temp = 0;
- }
- if (temp >= 0 && temp <= 3)
- strand_end_plus = temp;
- }
-
-}
-
-#ifdef UNIX
-/*
- * open_path(): open `fname` read-only by searching a list of directories.
- *
- * The search list is the PATH environment variable followed by the two
- * shared-data directories /usr/share/clustalx and /usr/local/share/clustalx.
- * Each candidate is built as <directory>/<fname>; a candidate that would not
- * fit in Mxdir characters is skipped.
- *
- * Returns the stream of the first candidate fopen() accepts, or NULL when
- * no candidate could be opened.  The caller owns the returned stream.
- */
-FILE *open_path(char *fname)
-{
-#define Mxdir 70
-    static char *share_dirs = "/usr/share/clustalx:/usr/local/share/clustalx";
-    char dir[Mxdir+1], *path, *deb, *fin;
-    FILE *fich = NULL;
-    sint lf, ltot;
-    char *path1;
-
-    /* PATH may be unset: use an empty list rather than handing NULL to strlen() */
-    path = getenv("PATH");
-    if (path == NULL) path = "";
-
-    /* added for File System Standards - Francois */
-    path1 = (char *)ckalloc((strlen(path)+strlen(share_dirs)+2)*sizeof(char));
-    strcpy(path1, path);
-    /* the separator keeps the last PATH entry distinct from the first share dir */
-    if (*path1 != '\0') strcat(path1, ":");
-    strcat(path1, share_dirs);
-
-    lf = (sint)strlen(fname);
-    deb = path1;
-    do
-    {
-        fin = strchr(deb, ':');
-        ltot = (fin != NULL) ? (sint)(fin - deb) : (sint)strlen(deb);
-
-        /* check the length BEFORE copying so dir[] can never overflow */
-        if (ltot + lf + 1 <= Mxdir)
-        {
-            memcpy(dir, deb, (size_t)ltot);
-            dir[ltot] = '/';
-            strcpy(dir+ltot+1, fname);  /* dir now holds <directory>/<fname> */
-            if ((fich = fopen(dir, "r")) != NULL) break;
-        }
-        if (fin != NULL) deb = fin + 1;
-    }
-    while (fin != NULL);
-
-    path1 = ckfree(path1);
-    return fich;
-}
-#endif
-
-
-/*
- * get_help(): print one section of the help file on stdout.
- *
- * help_pointer selects the section.  A section starts at a line containing
- * ">>HELP"; its key is the first character at offset 6 or 7 of that line
- * that is a digit or an upper-case letter (the offsets are taken from the
- * start of the line, so the marker is presumably expected in column 0).
- * The section ends at the next marker line or at end of file.  Lines that
- * begin with '<' are not printed.  When usemenu is set the output is paged
- * every PAGE_LEN lines and the user may type X to stop.
- *
- * The help file is located via help_file_name; on UNIX the directories
- * searched by open_path() are tried first.  The file is always closed
- * before returning.
- */
-void get_help(char help_pointer) /* Help procedure */
-{
-    FILE *help_file;
-    sint i, nlines;
-    char temp[MAXLINE+1];
-    char token = '\0';
-    char *digits = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
-    char *help_marker = ">>HELP";
-
-    extern char *help_file_name;
-
-#ifdef VMS
-    if((help_file=fopen(help_file_name,"r","rat=cr","rfm=var"))==NULL) {
-        error("Cannot open help file [%s]",help_file_name);
-        return;
-    }
-#else
-
-#ifdef UNIX
-    if((help_file=open_path(help_file_name))==NULL) {
-        if((help_file=fopen(help_file_name,"r"))==NULL) {
-            error("Cannot open help file [%s]",help_file_name);
-            return;
-        }
-    }
-
-#else
-    if((help_file=fopen(help_file_name,"r"))==NULL) {
-        error("Cannot open help file [%s]",help_file_name);
-        return;
-    }
-#endif
-
-#endif
-    nlines = 0;
-
-    while(TRUE) {
-        if(fgets(temp,MAXLINE+1,help_file) == NULL) {
-            /* end of file reached without ever entering the requested section */
-            error("No help found in help file");
-            fclose(help_file);
-            return;
-        }
-        if(strstr(temp,help_marker)) {
-            token = ' ';
-            for(i=(sint)strlen(help_marker); i<8; i++) {
-                /* strchr() also matches the terminator, so stop at end of line */
-                if(temp[i] == '\0') break;
-                if(strchr(digits, temp[i])) {
-                    token = temp[i];
-                    break;
-                }
-            }
-        }
-        if(token == help_pointer) {
-            while(fgets(temp,MAXLINE+1,help_file)) {
-                if(strstr(temp, help_marker)){
-                    /* next section starts: the requested one is complete */
-                    if(usemenu) {
-                        getstr("\nPress [RETURN] to continue",MAXLINE+1,lin2);
-                    }
-                    fclose(help_file);
-                    return;
-                }
-                if(temp[0]!='<') {
-                    fputs(temp,stdout);
-                    ++nlines;
-                }
-                if(usemenu) {
-                    if(nlines >= PAGE_LEN) {
-                        getstr("\nPress [RETURN] to continue or X to stop",MAXLINE+1,lin2);
-                        if(toupper(*lin2) == 'X') {
-                            fclose(help_file);
-                            return;
-                        }
-                        else
-                            nlines = 0;
-                    }
-                }
-            }
-            /* the section ran to end of file */
-            if(usemenu) {
-                getstr("\nPress [RETURN] to continue",MAXLINE+1,lin2);
-            }
-            fclose(help_file);
-            /* must return here: looping again would read from, and then
-               close a second time, a stream that is already closed */
-            return;
-        }
-    }
-}
-
-/*
- * show_aln(): page the most recently written alignment file to stdout.
- *
- * The file shown is the first one whose output flag is set, tested in the
- * order clustal, nbrf, gcg, phylip, gde, nexus, fasta.  Output pauses every
- * PAGE_LEN lines; typing X at the prompt stops the listing.  If no output
- * flag is set, or the file cannot be opened, an error is reported and
- * nothing is shown.
- */
-static void show_aln(void) /* Alignment screen display procedure */
-{
-    FILE *file;
-    sint nlines;
-    char temp[MAXLINE+1];
-    char file_name[FILENAMELEN+1];
-
-    /* start from an empty name so that "no format selected" is detectable
-       instead of handing an uninitialised buffer to fopen() */
-    file_name[0] = '\0';
-
-    if(output_clustal) strcpy(file_name,clustal_outname);
-    else if(output_nbrf) strcpy(file_name,nbrf_outname);
-    else if(output_gcg) strcpy(file_name,gcg_outname);
-    else if(output_phylip) strcpy(file_name,phylip_outname);
-    else if(output_gde) strcpy(file_name,gde_outname);
-    else if(output_nexus) strcpy(file_name,nexus_outname);
-    else if(output_fasta) strcpy(file_name,fasta_outname);
-
-    file = NULL;
-    if(file_name[0] != '\0') {
-#ifdef VMS
-        file = fopen(file_name,"r","rat=cr","rfm=var");
-#else
-        file = fopen(file_name,"r");
-#endif
-    }
-    if(file == NULL) {
-        error("Cannot open file [%s]",file_name);
-        return;
-    }
-
-    fprintf(stdout,"\n\n");
-    nlines = 0;
-
-    while(fgets(temp,MAXLINE+1,file)) {
-        fputs(temp,stdout);
-        ++nlines;
-        if(nlines >= PAGE_LEN) {
-            getstr("\nPress [RETURN] to continue or X to stop",MAXLINE+1,lin2);
-            if(toupper(*lin2) == 'X') {
-                fclose(file);
-                return;
-            }
-            else
-                nlines = 0;
-        }
-    }
-    fclose(file);
-    getstr("\nPress [RETURN] to continue",MAXLINE+1,lin2);
-}
-
-
-void parse_params(Boolean xmenus) /* Command-line driver.
- *
- * Prints the banner (unless xmenus is set), lets check_param() split the
- * command line, then in this order: handles the help and option-list
- * requests, applies an explicit sequence type, reads the input file, calls
- * set_optional_param(), reads profile 1 / profile 2, turns the action
- * switches into do_* flags, reloads the DNA or protein parameter set and
- * finally runs exactly one action.
- *
- * In interactive mode the function returns before running any action.
- * Otherwise it never returns: every path ends in exit().
- */
-{
- sint i,j,len,temp; /* NOTE(review): len is only referenced inside the commented-out JULIE block below */
- static sint cl_error_code=0;
- char path[FILENAMELEN];
-
-
- Boolean do_align, do_convert, do_align_only, do_tree_only, do_tree, do_boot, do_profile, do_something;
-
- if (!xmenus)
- {
- fprintf(stdout,"\n\n\n");
- fprintf(stdout," CLUSTAL %s Multiple Sequence Alignments\n\n\n",revision_level);
- }
-
- do_align = do_convert = do_align_only = do_tree_only = do_tree = do_boot = do_profile = do_something = FALSE;
-
- *seqname=EOS;
-
-/* JULIE
- len=(sint)strlen(paramstr);
- Stop converting command line to lower case - unix, mac, pc are case sensitive
- for(i=0;i<len;++i) paramstr[i]=tolower(paramstr[i]);
-*/
-
- numparams = check_param(args, params, param_arg);
- if (numparams <0) exit(1);
-
- if(sethelp != -1) {
- get_help('9');
- exit(1);
- }
-
- if(setoptions != -1) { /* list every known switch from the three command tables, then quit */
- fprintf(stdout,"clustalw option list:-\n");
- for (i=0;cmd_line_verb[i].str[0] != '\0';i++) {
- fprintf(stdout,"\t\t%c%s%s",COMMANDSEP,cmd_line_verb[i].str,cmd_line_type[cmd_line_verb[i].type]);
- if (cmd_line_verb[i].type == OPTARG) {
- if (cmd_line_verb[i].arg[0][0] != '\0')
- fprintf(stdout,"=%s",cmd_line_verb[i].arg[0]);
- for (j=1;cmd_line_verb[i].arg[j][0] != '\0';j++)
- fprintf(stdout," OR %s",cmd_line_verb[i].arg[j]);
- }
- fprintf(stdout,"\n");
- }
- for (i=0;cmd_line_file[i].str[0] != '\0';i++) {
- fprintf(stdout,"\t\t%c%s%s",COMMANDSEP,cmd_line_file[i].str,cmd_line_type[cmd_line_file[i].type]);
- if (cmd_line_file[i].type == OPTARG) {
- if (cmd_line_file[i].arg[0][0] != '\0')
- fprintf(stdout,"=%s",cmd_line_file[i].arg[0]);
- for (j=1;cmd_line_file[i].arg[j][0] != '\0';j++)
- fprintf(stdout," OR %s",cmd_line_file[i].arg[j]);
- }
- fprintf(stdout,"\n");
- }
- for (i=0;cmd_line_para[i].str[0] != '\0';i++) {
- fprintf(stdout,"\t\t%c%s%s",COMMANDSEP,cmd_line_para[i].str,cmd_line_type[cmd_line_para[i].type]);
- if (cmd_line_para[i].type == OPTARG) {
- if (cmd_line_para[i].arg[0][0] != '\0')
- fprintf(stdout,"=%s",cmd_line_para[i].arg[0]);
- for (j=1;cmd_line_para[i].arg[j][0] != '\0';j++)
- fprintf(stdout," OR %s",cmd_line_para[i].arg[j]);
- }
- fprintf(stdout,"\n");
- }
- exit(1);
- }
-
-
-/*****************************************************************************/
-/* Check to see if sequence type is explicitely stated..override ************/
-/* the automatic checking (DNA or Protein). /type=d or /type=p *************/
-/*****************************************************************************/
- if(settype != -1)
- if(strlen(param_arg[settype])>0) {
- temp = find_match(param_arg[settype],type_arg,2);
- if(temp == 0) {
- dnaflag = FALSE;
- explicit_dnaflag = TRUE;
- info("Sequence type explicitly set to Protein");
- }
- else if(temp == 1) {
- info("Sequence type explicitly set to DNA");
- dnaflag = TRUE;
- explicit_dnaflag = TRUE;
- }
- else
- fprintf(stdout,"\nUnknown sequence type %s\n",
- param_arg[settype]);
- }
-
-
-/***************************************************************************
-* check to see if 1st parameter does not start with '/' i.e. look for an *
-* input file as first parameter. The input file can also be specified *
-* by /infile=fname. *
-****************************************************************************/
-/* JULIE - moved to check_param()
- if(paramstr[0] != '/') {
- strcpy(seqname, params[0]);
- }
-*/
-
-/**************************************************/
-/* Look for /infile=file.ext on the command line */
-/**************************************************/
-
- if(setinfile != -1) {
- if(strlen(param_arg[setinfile]) <= 0) {
- error("Bad sequence file name");
- exit(1);
- }
- strcpy(seqname, param_arg[setinfile]);
- }
-
- if(*seqname != EOS) { /* an input file was named: load it; fewer than 2 sequences is fatal */
- profile_no = 0;
- nseqs = readseqs((sint)1);
- if(nseqs < 2) {
- if(nseqs < 0) cl_error_code = 2;
- else if(nseqs == 0) cl_error_code = 3;
- else cl_error_code = 4;
- fprintf(stdout,
- "\nNo. of seqs. read = %d. No alignment!\n",(pint)nseqs);
- exit(cl_error_code);
- }
- for(i = 1; i<=nseqs; i++)
- info("Sequence %d: %-*s %6.d %s",
- (pint)i,max_names,names[i],(pint)seqlen_array[i],dnaflag?"bp":"aa");
- empty = FALSE;
- do_something = TRUE;
- }
-
- set_optional_param();
-
-/*********************************************************/
-/* Look for /profile1=file.ext AND /profile2=file2.ext */
-/* You must give both file names OR neither. */
-/*********************************************************/
-
- if(setprofile1 != -1) {
- if(strlen(param_arg[setprofile1]) <= 0) {
- error("Bad profile 1 file name");
- exit(1);
- }
- strcpy(seqname, param_arg[setprofile1]);
- profile_no = 1;
- profile_input();
- if(nseqs <= 0) {
- if(nseqs<0) cl_error_code=2;
- else if(nseqs==0) cl_error_code=3;
- exit(cl_error_code);
- }
- strcpy(profile1_name,seqname);
- }
-
- if(setprofile2 != -1) {
- if(strlen(param_arg[setprofile2]) <= 0) {
- error("Bad profile 2 file name");
- exit(1);
- }
- if(profile1_empty) {
- error("Only 1 profile file (profile 2) specified.");
- exit(1);
- }
- strcpy(seqname, param_arg[setprofile2]);
- profile_no = 2;
- profile_input();
- if(nseqs > profile1_nseqs)
- do_something = do_profile = TRUE;
- else { /* NOTE(review): for 0 < nseqs <= profile1_nseqs cl_error_code is not assigned here, so exit() gets its earlier value (initially 0) */
- if(nseqs<0) cl_error_code=2;
- else if(nseqs==0) cl_error_code=3;
- error("No sequences read from profile 2");
- exit(cl_error_code);
- }
- strcpy(profile2_name,seqname);
- }
-
-/*************************************************************************/
-/* Look for /tree or /bootstrap or /align or /usetree ******************/
-/*************************************************************************/
-
- if (setbatch != -1)
- interactive=FALSE;
-
- if (setinteractive != -1)
- interactive=TRUE;
-
- if (interactive) { /* cancel the action switches so the tests below skip them; NOTE(review): setnewtree1/setnewtree2 are not reset here */
- settree = -1;
- setbootstrap = -1;
- setalign = -1;
- setusetree = -1;
- setusetree1 = -1;
- setusetree2 = -1;
- setnewtree = -1;
- setconvert = -1;
- }
-
- if(settree != -1 ) /* each "else" below binds to the inner if(empty)/if(profileN_empty) */
- if(empty) {
- error("Cannot draw tree. No input alignment file");
- exit(1);
- }
- else
- do_tree = TRUE;
-
- if(setbootstrap != -1)
- if(empty) {
- error("Cannot bootstrap tree. No input alignment file");
- exit(1);
- }
- else {
- temp = 0;
- if(param_arg[setbootstrap] != NULL)
- if (sscanf(param_arg[setbootstrap],"%d",&temp)!=1) {
- fprintf(stdout,"Bad option for /bootstrap (must be integer)\n");
- temp = 0;
- };
- if(temp > 0) boot_ntrials = temp;
- do_boot = TRUE;
- }
-
- if(setalign != -1)
- if(empty) {
- error("Cannot align sequences. No input file");
- exit(1);
- }
- else
- do_align = TRUE;
-
- if(setconvert != -1)
- if(empty) {
- error("Cannot convert sequences. No input file");
- exit(1);
- }
- else
- do_convert = TRUE;
-
- if(setusetree != -1)
- if(empty) {
- error("Cannot align sequences. No input file");
- exit(1);
- }
- else {
- if(strlen(param_arg[setusetree]) == 0) {
- error("Cannot align sequences. No tree file specified");
- exit(1);
- }
- else {
- strcpy(phylip_tree_name, param_arg[setusetree]);
- }
- use_tree_file = TRUE;
- do_align_only = TRUE;
- }
-
- if(setnewtree != -1)
- if(empty) {
- error("Cannot align sequences. No input file");
- exit(1);
- }
- else {
- if(strlen(param_arg[setnewtree]) == 0) {
- error("Cannot align sequences. No tree file specified");
- exit(1);
- }
- else {
- strcpy(phylip_tree_name, param_arg[setnewtree]);
- }
- new_tree_file = TRUE;
- do_tree_only = TRUE;
- }
-
- if(setusetree1 != -1)
- if(profile1_empty) {
- error("Cannot align profiles. No input file");
- exit(1);
- }
- else if(profile_type == SEQUENCE) {
- error("Invalid option /usetree1.");
- exit(1);
- }
- else {
- if(strlen(param_arg[setusetree1]) == 0) {
- error("Cannot align profiles. No tree file specified");
- exit(1);
- }
- else {
- strcpy(p1_tree_name, param_arg[setusetree1]);
- }
- use_tree1_file = TRUE;
- do_align_only = TRUE;
- }
-
- if(setnewtree1 != -1)
- if(profile1_empty) {
- error("Cannot align profiles. No input file");
- exit(1);
- }
- else if(profile_type == SEQUENCE) {
- error("Invalid option /newtree1.");
- exit(1);
- }
- else {
- if(strlen(param_arg[setnewtree1]) == 0) {
- error("Cannot align profiles. No tree file specified");
- exit(1);
- }
- else {
- strcpy(p1_tree_name, param_arg[setnewtree1]);
- }
- new_tree1_file = TRUE;
- }
-
- if(setusetree2 != -1)
- if(profile2_empty) {
- error("Cannot align profiles. No input file");
- exit(1);
- }
- else if(profile_type == SEQUENCE) {
- error("Invalid option /usetree2.");
- exit(1);
- }
- else {
- if(strlen(param_arg[setusetree2]) == 0) {
- error("Cannot align profiles. No tree file specified");
- exit(1);
- }
- else {
- strcpy(p2_tree_name, param_arg[setusetree2]);
- }
- use_tree2_file = TRUE;
- do_align_only = TRUE;
- }
-
- if(setnewtree2 != -1)
- if(profile2_empty) {
- error("Cannot align profiles. No input file");
- exit(1);
- }
- else if(profile_type == SEQUENCE) {
- error("Invalid option /newtree2.");
- exit(1);
- }
- else {
- if(strlen(param_arg[setnewtree2]) == 0) {
- error("Cannot align profiles. No tree file specified");
- exit(1);
- }
- else {
- strcpy(p2_tree_name, param_arg[setnewtree2]);
- }
- new_tree2_file = TRUE;
- }
-
-/* default action: sequences are loaded and no other action was requested, so align */
- if( (!do_tree) && (!do_boot) && (!empty) && (!do_profile) && (!do_align_only) && (!do_tree_only) && (!do_convert))
- do_align = TRUE;
-
-/*** ? /quicktree */
- if(setquicktree != -1)
- quick_pairalign = TRUE;
-
- if(dnaflag) { /* make the working parameters match the sequence type in use */
- gap_open = dna_gap_open;
- gap_extend = dna_gap_extend;
- pw_go_penalty = dna_pw_go_penalty;
- pw_ge_penalty = dna_pw_ge_penalty;
- ktup = dna_ktup;
- window = dna_window;
- signif = dna_signif;
- wind_gap = dna_wind_gap;
-
- }
- else {
- gap_open = prot_gap_open;
- gap_extend = prot_gap_extend;
- pw_go_penalty = prot_pw_go_penalty;
- pw_ge_penalty = prot_pw_ge_penalty;
- ktup = prot_ktup;
- window = prot_window;
- signif = prot_signif;
- wind_gap = prot_wind_gap;
- }
-
- if(interactive) { /* interactive mode: hand control back without running an action */
- if (!xmenus) usemenu = TRUE;
- return;
- }
-
-
- if(!do_something) {
- error("No input file(s) specified");
- exit(1);
- }
-
-
-
-
-/****************************************************************************/
-/* Now do whatever has been requested ***************************************/
-/****************************************************************************/
-
-/* only the first flag that is set in this chain is acted on */
- if(do_profile) {
- if (profile_type == PROFILE) profile_align(p1_tree_name,p2_tree_name);
- else new_sequence_align(phylip_tree_name);
- }
-
- else if(do_align)
- align(phylip_tree_name);
-
- else if(do_convert) {
- get_path(seqname,path);
- if(!open_alignment_output(path)) exit(1);
- create_alignment_output(1,nseqs);
- }
-
- else if (do_align_only)
- get_tree(phylip_tree_name);
-
- else if(do_tree_only)
- make_tree(phylip_tree_name);
-
- else if(do_tree)
- phylogenetic_tree(phylip_tree_name,clustal_tree_name,dist_tree_name,nexus_tree_name,pim_name);
-
- else if(do_boot)
- bootstrap_tree(phylip_tree_name,clustal_tree_name,nexus_tree_name);
-
- fprintf(stdout,"\n");
- exit(0);
-
-/*******whew!***now*go*home****/
-}
-
-
-/*
- * user_mat(): load a user-supplied comparison matrix.
- *
- * The file name is read from the user when usemenu is set, otherwise it is
- * taken from str.  On success the name is copied back into str and the
- * matrix is read by read_user_matrix() into mat/xref.
- *
- * Returns TRUE on success; FALSE when the name is empty, the file cannot
- * be opened, or read_user_matrix() reports no residues (result <= 0).
- */
-Boolean user_mat(char *str, short *mat, short *xref)
-{
-    sint maxres;
-    FILE *infile;
-
-    if(usemenu)
-        getstr("Enter name of the matrix file",MAXLINE+1,lin2);
-    else
-        strcpy(lin2,str);
-
-    if(*lin2 == EOS) return FALSE;
-
-    /* the file is opened only to check that it is readable ... */
-    if((infile=fopen(lin2,"r"))==NULL) {
-        error("Cannot find matrix file [%s]",lin2);
-        return FALSE;
-    }
-    /* ... the reader is given the name, not the stream, so close the probe */
-    fclose(infile);
-
-    strcpy(str, lin2);
-
-    maxres = read_user_matrix(str, mat, xref);
-    if (maxres <= 0) return FALSE;
-
-    return TRUE;
-}
-
-/*
- * user_mat_series(): load a user-supplied series of comparison matrices.
- *
- * The file name is read from the user when usemenu is set, otherwise it is
- * taken from str.  On success the name is copied back into str and the
- * file is read by read_matrix_series() into mat/xref.
- *
- * Returns TRUE on success; FALSE when the name is empty, the file cannot
- * be opened, or read_matrix_series() reports no residues (result <= 0).
- */
-Boolean user_mat_series(char *str, short *mat, short *xref)
-{
-    sint maxres;
-    FILE *infile;
-
-    if(usemenu)
-        getstr("Enter name of the matrix file",MAXLINE+1,lin2);
-    else
-        strcpy(lin2,str);
-
-    if(*lin2 == EOS) return FALSE;
-
-    /* the file is opened only to check that it is readable ... */
-    if((infile=fopen(lin2,"r"))==NULL) {
-        error("Cannot find matrix file [%s]",lin2);
-        return FALSE;
-    }
-    /* ... the reader is given the name, not the stream, so close the probe */
-    fclose(infile);
-
-    strcpy(str, lin2);
-
-    maxres = read_matrix_series(str, mat, xref);
-    if (maxres <= 0) return FALSE;
-
-    return TRUE;
-}
-
-
-
-
-
-
-/*
- * seq_input(): read a set of sequences through readseqs().
- *
- * With append set, the new sequences are stored after the nseqs already
- * held and nseqs grows by the number read; otherwise reading starts at
- * sequence 1 and nseqs is replaced.  A successful read discards every
- * secondary-structure / gap-penalty mask of both profiles, reloads the
- * gap penalties for the current sequence type and clears `empty`.
- *
- * Returns the value of readseqs(): negative when the file could not be
- * opened, 0 when it held no sequences, else the number of sequences read.
- */
-sint seq_input(Boolean append)
-{
-    sint seq;
-    sint nread;
-    sint first_seq;
-
-    if (usemenu) {
-        fputs("\n\nSequences should all be in 1 file.\n", stdout);
-        fputs("\n7 formats accepted: \n", stdout);
-        fputs("NBRF/PIR, EMBL/SwissProt, Pearson (Fasta), GDE, Clustal, GCG/MSF, RSF.\n\n\n", stdout);
-    }
-
-    /* slot that receives the first sequence read */
-    first_seq = append ? nseqs+(sint)1 : (sint)1;
-    nread = readseqs(first_seq);
-
-    if (nread < 0)      /* file could not be opened */
-        return nread;
-
-    if (nread == 0) {   /* no sequences */
-        error("No sequences in file! Bad format?");
-        return nread;
-    }
-
-    /* masks computed for the previous data no longer apply */
-    struct_penalties1 = struct_penalties2 = NONE;
-    if (sec_struct_mask1 != NULL) sec_struct_mask1=ckfree(sec_struct_mask1);
-    if (sec_struct_mask2 != NULL) sec_struct_mask2=ckfree(sec_struct_mask2);
-    if (gap_penalty_mask1 != NULL) gap_penalty_mask1=ckfree(gap_penalty_mask1);
-    if (gap_penalty_mask2 != NULL) gap_penalty_mask2=ckfree(gap_penalty_mask2);
-    if (ss_name1 != NULL) ss_name1=ckfree(ss_name1);
-    if (ss_name2 != NULL) ss_name2=ckfree(ss_name2);
-
-    if (append)
-        nseqs += nread;
-    else
-        nseqs = nread;
-
-    info("Sequences assumed to be %s",
-        dnaflag?"DNA":"PROTEIN");
-
-    if (usemenu) {
-        fprintf(stdout,"\n\n");
-        for (seq = 1; seq <= nseqs; seq++)
-            info("Sequence %d: %-*s %6.d %s",
-                (pint)seq,max_names,names[seq],(pint)seqlen_array[seq],dnaflag?"bp":"aa");
-    }
-
-    if (dnaflag) {
-        gap_open = dna_gap_open;
-        gap_extend = dna_gap_extend;
-    }
-    else {
-        gap_open = prot_gap_open;
-        gap_extend = prot_gap_extend;
-    }
-    empty = FALSE;
-
-    return nread;
-}
-
-
-
-
-
-
-
-/*
- * profile_input(): read profile number profile_no through readseqs().
- *
- * Profile 1 is stored from sequence 1; any other profile number is stored
- * after the profile1_nseqs sequences of profile 1, which must already be
- * loaded.  If the file carried structure penalties (struct_penalties !=
- * NONE) the shared masks and mask name are copied into the per-profile
- * copies; the shared ones are then released.
- *
- * Returns 0 when profile 1 is missing, the readseqs() result when it is
- * <= 0, otherwise the new total nseqs.
- */
-sint profile_input(void) /* read a profile */
-{ /* profile_no is 1 or 2 */
-    sint nread, k;
-    int is_first;
-
-    if (profile_no == 2 && profile1_empty) {
-        error("You must read in profile number 1 first");
-        return 0;
-    }
-
-    /* decide once, before reading, which profile is being filled */
-    is_first = (profile_no == 1);
-
-    if (is_first)
-        nread = readseqs((sint)1);              /* 1st seq to be read = no. 1 */
-    else
-        nread = readseqs(profile1_nseqs+(sint)1); /* first seq = profile1_nseqs + 1 */
-
-    if (nread < 0)      /* file could not be opened */
-        return nread;
-
-    if (nread == 0) {   /* no sequences */
-        error("No sequences in file! Bad format?");
-        return nread;
-    }
-
-    /* from here on at least one sequence was read */
-    if (is_first) {
-        struct_penalties1 = NONE;
-        if (sec_struct_mask1 != NULL) sec_struct_mask1=ckfree(sec_struct_mask1);
-        if (gap_penalty_mask1 != NULL) gap_penalty_mask1=ckfree(gap_penalty_mask1);
-        if (ss_name1 != NULL) ss_name1=ckfree(ss_name1);
-
-        if (struct_penalties != NONE) { /* feature table / mask in alignment */
-            struct_penalties1 = struct_penalties;
-            if (struct_penalties == SECST) {
-                sec_struct_mask1 = (char *)ckalloc((max_aln_length) * sizeof (char));
-                for (k = 0; k < max_aln_length; k++)
-                    sec_struct_mask1[k] = sec_struct_mask[k];
-            }
-            gap_penalty_mask1 = (char *)ckalloc((max_aln_length) * sizeof (char));
-            for (k = 0; k < max_aln_length; k++)
-                gap_penalty_mask1[k] = gap_penalty_mask[k];
-            ss_name1 = (char *)ckalloc( (MAXNAMES+1) * sizeof (char));
-            strcpy(ss_name1,ss_name);
-
-            if (debug>0) {
-                for (k = 0; k < seqlen_array[1]; k++)
-                    fputc(gap_penalty_mask1[k], stdout);
-                fprintf(stdout,"\n");
-            }
-        }
-
-        nseqs = profile1_nseqs = nread;
-        info("No. of seqs=%d",(pint)nseqs);
-        profile1_empty=FALSE;
-        profile2_empty=TRUE;
-    }
-    else {
-        struct_penalties2 = NONE;
-        if (sec_struct_mask2 != NULL) sec_struct_mask2=ckfree(sec_struct_mask2);
-        if (gap_penalty_mask2 != NULL) gap_penalty_mask2=ckfree(gap_penalty_mask2);
-        if (ss_name2 != NULL) ss_name2=ckfree(ss_name2);
-
-        if (struct_penalties != NONE) { /* feature table / mask in alignment */
-            struct_penalties2 = struct_penalties;
-            if (struct_penalties == SECST) {
-                sec_struct_mask2 = (char *)ckalloc((max_aln_length) * sizeof (char));
-                for (k = 0; k < max_aln_length; k++)
-                    sec_struct_mask2[k] = sec_struct_mask[k];
-            }
-            gap_penalty_mask2 = (char *)ckalloc((max_aln_length) * sizeof (char));
-            for (k = 0; k < max_aln_length; k++)
-                gap_penalty_mask2[k] = gap_penalty_mask[k];
-            ss_name2 = (char *)ckalloc( (MAXNAMES+1) * sizeof (char));
-            strcpy(ss_name2,ss_name);
-
-            if (debug>0) {
-                for (k = 0; k < seqlen_array[profile1_nseqs+1]; k++)
-                    fputc(gap_penalty_mask2[k], stdout);
-                fprintf(stdout,"\n");
-            }
-        }
-
-        info("No. of seqs in profile=%d",(pint)nread);
-        nseqs = profile1_nseqs + nread;
-        info("Total no. of seqs =%d",(pint)nseqs);
-        profile2_empty=FALSE;
-        empty = FALSE;
-    }
-
-    /* the shared masks have been copied where needed; release them */
-    if (sec_struct_mask != NULL) sec_struct_mask=ckfree(sec_struct_mask);
-    if (gap_penalty_mask != NULL) gap_penalty_mask=ckfree(gap_penalty_mask);
-    if (ss_name != NULL) ss_name=ckfree(ss_name);
-
-    info("Sequences assumed to be %s",
-        dnaflag?"DNA":"PROTEIN");
-    if (usemenu) fprintf(stdout,"\n\n");
-
-    /* list only the sequences of the profile just read */
-    k = profile2_empty ? 1 : profile1_nseqs+1;
-    while (k <= nseqs) {
-        info("Sequence %d: %-*s %6.d %s",
-            (pint)k,max_names,names[k],(pint)seqlen_array[k],dnaflag?"bp":"aa");
-        k++;
-    }
-
-    if (dnaflag) {
-        gap_open = dna_gap_open;
-        gap_extend = dna_gap_extend;
-    }
-    else {
-        gap_open = prot_gap_open;
-        gap_extend = prot_gap_extend;
-    }
-
-    return nseqs;
-}
-
-
-
-/*
- * calc_gap_penalty_mask(): turn a secondary-structure mask into a gap
- * penalty mask.
- *
- *   prf_length  number of positions in mask and gap_mask
- *   mask        input: 'a'/'A' or '$' marks helix, 'b'/'B' or '%' strand
- *   gap_mask    output: one digit character per position
- *
- * A working copy (struct_mask) classifies each position as element core
- * ('A'/'B'), element end or flank ('a'/'b'), or neither.  Within an
- * element the first and last helix_end_minus / strand_end_minus positions
- * are ends; helix_end_plus / strand_end_plus positions outside each
- * boundary are flanks.  A '$' or '%' is recorded as core and closes the
- * current element.  The classes are then mapped to helix_penalty,
- * helix_end_penalty, strand_penalty, strand_end_penalty or loop_penalty,
- * each written as the character '0' + penalty.
- */
-static void calc_gap_penalty_mask(int prf_length, char *mask, char *gap_mask)
-{
-    int i,j;
-    char *struct_mask;
-
-    struct_mask = (char *)ckalloc((prf_length+1) * sizeof(char));
-    /* the look-behind loops read struct_mask before anything is written to
-       it, so do not depend on the allocator handing back cleared memory */
-    memset(struct_mask, 0, (size_t)(prf_length+1));
-/*
-    calculate the gap penalty mask from the secondary structures
-*/
-    i=0;
-    while (i<prf_length) {
-        if (tolower(mask[i]) == 'a' || mask[i] == '$') {
-            /* flank before the helix, unless already claimed by an element */
-            for (j = -helix_end_plus; j<0; j++) {
-                if ((i+j>=0) && (tolower(struct_mask[i+j]) != 'a')
-                        && (tolower(struct_mask[i+j]) != 'b'))
-                    struct_mask[i+j] = 'a';
-            }
-            /* leading end positions inside the helix */
-            for (j = 0; j<helix_end_minus; j++) {
-                if (i+j>=prf_length || (tolower(mask[i+j]) != 'a'
-                        && mask[i+j] != '$')) break;
-                struct_mask[i+j] = 'a';
-            }
-            i += j;
-            /* helix core; test the bound first so mask[prf_length] is never read */
-            while (i<prf_length && (tolower(mask[i]) == 'a'
-                    || mask[i] == '$')) {
-                if (mask[i] == '$') {
-                    struct_mask[i] = 'A';
-                    i++;
-                    break;
-                }
-                else struct_mask[i] = mask[i];
-                i++;
-            }
-            /* trailing end positions inside the helix */
-            for (j = 0; j<helix_end_minus; j++) {
-                if ((i-j-1>=0) && (tolower(mask[i-j-1]) == 'a'
-                        || mask[i-j-1] == '$'))
-                    struct_mask[i-j-1] = 'a';
-            }
-            /* flank after the helix */
-            for (j = 0; j<helix_end_plus; j++) {
-                if (i+j>=prf_length) break;
-                struct_mask[i+j] = 'a';
-            }
-        }
-        else if (tolower(mask[i]) == 'b' || mask[i] == '%') {
-            /* flank before the strand, unless already claimed by an element */
-            for (j = -strand_end_plus; j<0; j++) {
-                if ((i+j>=0) && (tolower(struct_mask[i+j]) != 'a')
-                        && (tolower(struct_mask[i+j]) != 'b'))
-                    struct_mask[i+j] = 'b';
-            }
-            /* leading end positions inside the strand */
-            for (j = 0; j<strand_end_minus; j++) {
-                if (i+j>=prf_length || (tolower(mask[i+j]) != 'b'
-                        && mask[i+j] != '%')) break;
-                struct_mask[i+j] = 'b';
-            }
-            i += j;
-            /* strand core; test the bound first so mask[prf_length] is never read */
-            while (i<prf_length && (tolower(mask[i]) == 'b'
-                    || mask[i] == '%')) {
-                if (mask[i] == '%') {
-                    struct_mask[i] = 'B';
-                    i++;
-                    break;
-                }
-                else struct_mask[i] = mask[i];
-                i++;
-            }
-            /* trailing end positions inside the strand */
-            for (j = 0; j<strand_end_minus; j++) {
-                if ((i-j-1>=0) && (tolower(mask[i-j-1]) == 'b'
-                        || mask[i-j-1] == '%'))
-                    struct_mask[i-j-1] = 'b';
-            }
-            /* flank after the strand */
-            for (j = 0; j<strand_end_plus; j++) {
-                if (i+j>=prf_length) break;
-                struct_mask[i+j] = 'b';
-            }
-        }
-        else i++;
-    }
-
-    /* map every structural class to its penalty digit */
-    for(i=0;i<prf_length;i++) {
-        switch (struct_mask[i]) {
-            case 'A':
-                gap_mask[i] = helix_penalty+'0';
-                break;
-            case 'a':
-                gap_mask[i] = helix_end_penalty+'0';
-                break;
-            case 'B':
-                gap_mask[i] = strand_penalty+'0';
-                break;
-            case 'b':
-                gap_mask[i] = strand_end_penalty+'0';
-                break;
-            default:
-                gap_mask[i] = loop_penalty+'0';
-                break;
-        }
-    }
-
-    struct_mask=ckfree(struct_mask);
-
-}
-
-/*
- * print_sec_struct_mask(): classify the positions of a secondary-structure
- * mask into core and end positions.
- *
- *   prf_length   number of positions in mask and struct_mask
- *   mask         input: 'a'/'A' or '$' marks helix, 'b'/'B' or '%' strand
- *   struct_mask  output, supplied by the caller
- *
- * For each helix the first and last helix_end_minus positions are written
- * as 'a'; the remaining positions are copied from mask, except that a '$'
- * is written as 'A' and closes the element.  Strands are handled the same
- * way with 'b', 'B', '%' and strand_end_minus.  Positions outside any
- * element are not written, so the caller's initial contents remain there.
- * Despite its name the function prints nothing.
- */
-void print_sec_struct_mask(int prf_length, char *mask, char *struct_mask)
-{
-    int i,j;
-
-    i=0;
-    while (i<prf_length) {
-        if (tolower(mask[i]) == 'a' || mask[i] == '$') {
-            /* leading end positions inside the helix */
-            for (j = 0; j<helix_end_minus; j++) {
-                if (i+j>=prf_length || (tolower(mask[i+j]) != 'a'
-                        && mask[i+j] != '$')) break;
-                struct_mask[i+j] = 'a';
-            }
-            i += j;
-            /* helix core; test the bound first so mask[prf_length] is never read */
-            while (i<prf_length && (tolower(mask[i]) == 'a'
-                    || mask[i] == '$')) {
-                if (mask[i] == '$') {
-                    struct_mask[i] = 'A';
-                    i++;
-                    break;
-                }
-                else struct_mask[i] = mask[i];
-                i++;
-            }
-            /* trailing end positions inside the helix */
-            for (j = 0; j<helix_end_minus; j++) {
-                if ((i-j-1>=0) && (tolower(mask[i-j-1]) == 'a'
-                        || mask[i-j-1] == '$'))
-                    struct_mask[i-j-1] = 'a';
-            }
-        }
-        else if (tolower(mask[i]) == 'b' || mask[i] == '%') {
-            /* leading end positions inside the strand */
-            for (j = 0; j<strand_end_minus; j++) {
-                if (i+j>=prf_length || (tolower(mask[i+j]) != 'b'
-                        && mask[i+j] != '%')) break;
-                struct_mask[i+j] = 'b';
-            }
-            i += j;
-            /* strand core; test the bound first so mask[prf_length] is never read */
-            while (i<prf_length && (tolower(mask[i]) == 'b'
-                    || mask[i] == '%')) {
-                if (mask[i] == '%') {
-                    struct_mask[i] = 'B';
-                    i++;
-                    break;
-                }
-                else struct_mask[i] = mask[i];
-                i++;
-            }
-            /* trailing end positions inside the strand */
-            for (j = 0; j<strand_end_minus; j++) {
-                if ((i-j-1>=0) && (tolower(mask[i-j-1]) == 'b'
-                        || mask[i-j-1] == '%'))
-                    struct_mask[i-j-1] = 'b';
-            }
-        }
-        else i++;
-    }
-}
-
-
-
-/*
- * open_output_file(): choose an output file name and open it for writing.
- *
- *   prompt          text shown when asking the user to confirm the name
- *   path            leading part of the default name
- *   file_name       in/out buffer; always overwritten with
- *                   path + file_extension, then with the user's answer
- *                   if one is typed
- *   file_extension  trailing part of the default name
- *
- * A warning is issued when the default name equals the input file name
- * (seqname).  The user is asked for a name only when usemenu is set; an
- * empty answer keeps the default.
- *
- * Returns the open stream, or NULL (after reporting an error) when the
- * file cannot be opened.
- */
-FILE * open_output_file(char *prompt, char *path,
-        char *file_name, char *file_extension)
-{
-    static char temp[FILENAMELEN+1];
-    static char local_prompt_tmp[MAXLINE+FILENAMELEN+1];
-    FILE * file_handle;
-    char *ask;
-    int clash;
-
-    strcpy(file_name,path);
-    strcat(file_name,file_extension);
-
-    clash = (strcmp(file_name,seqname)==0);
-    if (clash)
-        warning("Output file name is the same as input file.");
-
-    if (usemenu) {
-        ask = clash ? "\n\nEnter new name to avoid overwriting " : prompt;
-        /* the prompt is passed as data, never as the format string, and
-           both fields are length-limited so the buffer cannot overflow */
-        sprintf(local_prompt_tmp,"%.*s [%.*s]",
-            (int)(MAXLINE-6),ask,(int)FILENAMELEN,file_name);
-        getstr(local_prompt_tmp,FILENAMELEN+1,temp);
-        if(*temp != EOS) strcpy(file_name,temp);
-    }
-
-#ifdef VMS
-    file_handle = fopen(file_name,"w","rat=cr","rfm=var");
-#else
-    file_handle = fopen(file_name,"w");
-#endif
-    if (file_handle == NULL) {
-        error("Cannot open output file [%s]",file_name);
-        return NULL;
-    }
-    return file_handle;
-}
-
-
-
-/*
- * open_explicit_file(): open the named file for writing.
- *
- * Reports an error and returns NULL when file_name is empty or the file
- * cannot be opened; otherwise returns the open stream.
- */
-FILE * open_explicit_file(char *file_name)
-{
-    FILE *out = NULL;
-
-    if (*file_name != EOS) {
-#ifdef VMS
-        out = fopen(file_name,"w","rat=cr","rfm=var");
-#else
-        out = fopen(file_name,"w");
-#endif
-        if (out == NULL)
-            error("Cannot open output file [%s]",file_name);
-    }
-    else
-        error("Bad output file [%s]",file_name);
-
-    return out;
-}
-
-
-/* Ramu void */
-
-/*
- * Full multiple alignment driver: clears any structure-penalty masks,
- * opens the alignment output (and, for nseqs >= 2, the guide tree file),
- * runs the pairwise stage (show_pair() or pairalign() depending on
- * quick_pairalign), writes the guide tree, then calls malign() and
- * create_alignment_output().
- *
- * phylip_name: guide tree file name; when empty, open_output_file() fills
- * it in.  It is reset to the empty string on the normal exit path only.
- */
-void align(char *phylip_name)
-{
- char path[FILENAMELEN+1];
- FILE *tree;
- sint count;
-
- if(empty && usemenu) {
- error("No sequences in memory. Load sequences first.");
- return;
- }
-
- /* drop all secondary-structure / gap-penalty mask state */
- struct_penalties1 = struct_penalties2 = NONE;
- if (sec_struct_mask1 != NULL) sec_struct_mask1=ckfree(sec_struct_mask1);
- if (sec_struct_mask2 != NULL) sec_struct_mask2=ckfree(sec_struct_mask2);
- if (gap_penalty_mask1 != NULL) gap_penalty_mask1=ckfree(gap_penalty_mask1);
- if (gap_penalty_mask2 != NULL) gap_penalty_mask2=ckfree(gap_penalty_mask2);
- if (ss_name1 != NULL) ss_name1=ckfree(ss_name1);
- if (ss_name2 != NULL) ss_name2=ckfree(ss_name2);
-
-
- get_path(seqname,path);
-/* DES DEBUG
- fprintf(stdout,"\n\n Seqname = %s \n Path = %s \n\n",seqname,path);
-*/
- if(usemenu || !interactive) {
- if(!open_alignment_output(path)) return;
- }
-
- /* tree is only assigned here, and only used below under the same nseqs >= 2 test */
- if (nseqs >= 2) {
-
- get_path(seqname,path);
- if (phylip_name[0]!=EOS) {
- if((tree = open_explicit_file(
- phylip_name))==NULL) return;
- }
- else {
- if((tree = open_output_file(
- "\nEnter name for new GUIDE TREE file ",path,
- phylip_name,"dnd")) == NULL) return;
- }
- }
-
- if (save_parameters) create_parameter_output();
-
- if(reset_alignments_new || reset_alignments_all) reset_align();
-
- info("Start of Pairwise alignments");
- info("Aligning...");
- /* copy the DNA or protein parameter set into the working parameters */
- if(dnaflag) {
- gap_open = dna_gap_open;
- gap_extend = dna_gap_extend;
- pw_go_penalty = dna_pw_go_penalty;
- pw_ge_penalty = dna_pw_ge_penalty;
- ktup = dna_ktup;
- window = dna_window;
- signif = dna_signif;
- wind_gap = dna_wind_gap;
-
- }
- else {
- gap_open = prot_gap_open;
- gap_extend = prot_gap_extend;
- pw_go_penalty = prot_pw_go_penalty;
- pw_ge_penalty = prot_pw_ge_penalty;
- ktup = prot_ktup;
- window = prot_window;
- signif = prot_signif;
- wind_gap = prot_wind_gap;
-
- }
-
- if (quick_pairalign)
- show_pair((sint)0,nseqs,(sint)0,nseqs);
- else
- pairalign((sint)0,nseqs,(sint)0,nseqs);
-
- if (nseqs >= 2) {
-
- guide_tree(tree,1,nseqs);
- info("Guide tree file created: [%s]",
- phylip_name);
- }
-
-
- count = malign((sint)0,phylip_name);
-
- /* NOTE(review): this early exit leaves phylip_name set - confirm callers expect that */
- if (count <= 0) return;
-
- if (usemenu) fprintf(stdout,"\n\n\n");
-
- create_alignment_output(1,nseqs);
- if (showaln && usemenu) show_aln();
- phylip_name[0]=EOS;
- return ;
-}
-
-
-
-
-
-/*
- * Align the sequences that follow profile 1 to that profile.
- *
- * Steps, in order: open the alignment output; fill tmat from countid() for
- * the pairs 1..new_seq; optionally reuse an existing "<path>dnd" guide
- * tree (menu mode) or a user-supplied tree (use_tree_file); otherwise run
- * the pairwise stage and write a new guide tree; finally call seqalign()
- * and create_alignment_output().
- *
- * use_ss2 is forced to FALSE for the duration of the call when profile 2
- * holds more than one sequence; it is restored on every exit path taken
- * after that override.
- *
- * phylip_name: guide tree file name (may be filled in here); reset to the
- * empty string on the normal exit path.
- */
-void new_sequence_align(char *phylip_name)
-{
-	char path[FILENAMELEN+1];
-	char tree_name[FILENAMELEN+1],temp[MAXLINE+1];
-	char tmp_msg[MAXLINE+1+300];
-	Boolean use_tree;
-	FILE *tree;
-	sint i,j,count;
-	float dscore;
-	Boolean save_ss2;
-
-	if(profile1_empty && usemenu) {
-		error("No profile in memory. Input 1st profile first.");
-		return;
-	}
-
-	if(profile2_empty && usemenu) {
-		error("No sequences in memory. Input sequences first.");
-		return;
-	}
-
-	get_path(profile2_name,path);
-
-	if(usemenu || !interactive) {
-		if(!open_alignment_output(path)) return;
-	}
-
-	new_seq = profile1_nseqs+1;
-
-/* check for secondary structure information for list of sequences */
-
-	save_ss2 = use_ss2;
-	if (struct_penalties2 != NONE && use_ss2 == TRUE && (nseqs - profile1_nseqs > 1)) {
-		if (struct_penalties2 == SECST)
-			warning("Warning: ignoring secondary structure for a list of sequences");
-		else if (struct_penalties2 == GMASK)
-			warning("Warning: ignoring gap penalty mask for a list of sequences");
-		use_ss2 = FALSE;
-	}
-
-	for (i=1;i<=new_seq;i++) {
-		for (j=i+1;j<=new_seq;j++) {
-			dscore = countid(i,j);
-			tmat[i][j] = ((double)100.0 - (double)dscore)/(double)100.0;
-			tmat[j][i] = tmat[i][j];
-		}
-	}
-
-	tree_name[0] = EOS;
-	use_tree = FALSE;
-	if (nseqs >= 2) {
-		if (check_tree && usemenu) {
-			strcpy(tree_name,path);
-			strcat(tree_name,"dnd");
-#ifdef VMS
-			tree=fopen(tree_name,"r","rat=cr","rfm=var");
-#else
-			tree=fopen(tree_name,"r");
-#endif
-			if(tree!=NULL) {
-				/* usemenu is already known to be set on this branch */
-				sprintf(tmp_msg,"\nUse the existing GUIDE TREE file,  %s  (y/n) ? [y]",
-					tree_name);
-				getstr(tmp_msg,MAXLINE+1,temp);
-				if(*temp != 'n' && *temp != 'N') {
-					strcpy(phylip_name,tree_name);
-					use_tree = TRUE;
-				}
-				fclose(tree);
-			}
-		}
-		else if (!usemenu && use_tree_file) {
-			use_tree = TRUE;
-		}
-	}
-
-	if (save_parameters) create_parameter_output();
-
-	if(reset_alignments_new || reset_alignments_all) {
-/*
-		reset_prf1();
-*/
-		reset_prf2();
-	}
-	else fix_gaps();
-
-	if (struct_penalties1 == SECST)
-		calc_gap_penalty_mask(seqlen_array[1],sec_struct_mask1,gap_penalty_mask1);
-
-	if (struct_penalties2 == SECST)
-		calc_gap_penalty_mask(seqlen_array[profile1_nseqs+1],sec_struct_mask2,gap_penalty_mask2);
-
-
-/* create the new tree file, if necessary */
-
-	if (use_tree == FALSE) {
-
-		if (nseqs >= 2) {
-			get_path(profile2_name,path);
-			if (phylip_name[0]!=EOS)
-				tree = open_explicit_file(phylip_name);
-			else
-				tree = open_output_file(
-					"\nEnter name for new GUIDE TREE file ",path,
-					phylip_name,"dnd");
-			if (tree == NULL) {
-				/* undo the temporary override before bailing out */
-				use_ss2 = save_ss2;
-				return;
-			}
-		}
-		info("Start of Pairwise alignments");
-		info("Aligning...");
-		if(dnaflag) {
-			gap_open   = dna_gap_open;
-			gap_extend = dna_gap_extend;
-			pw_go_penalty  = dna_pw_go_penalty;
-			pw_ge_penalty  = dna_pw_ge_penalty;
-			ktup       = dna_ktup;
-			window     = dna_window;
-			signif     = dna_signif;
-			wind_gap   = dna_wind_gap;
-		}
-		else {
-			gap_open   = prot_gap_open;
-			gap_extend = prot_gap_extend;
-			pw_go_penalty  = prot_pw_go_penalty;
-			pw_ge_penalty  = prot_pw_ge_penalty;
-			ktup       = prot_ktup;
-			window     = prot_window;
-			signif     = prot_signif;
-			wind_gap   = prot_wind_gap;
-		}
-
-		if (quick_pairalign)
-			show_pair((sint)0,nseqs,new_seq-2,nseqs);
-		else
-			pairalign((sint)0,nseqs,new_seq-2,nseqs);
-
-		if (nseqs >= 2) {
-			guide_tree(tree,1,nseqs);
-			info("Guide tree file created:   [%s]",
-				phylip_name);
-		}
-	}
-
-	if (new_tree_file) {
-		/* tree-only run: still restore the caller's use_ss2 setting */
-		use_ss2 = save_ss2;
-		return;
-	}
-
-	count = seqalign(new_seq-2,phylip_name);
-
-	use_ss2 = save_ss2;
-
-	if (count <= 0) return;
-
-	if (usemenu) fprintf(stdout,"\n\n\n");
-
-	create_alignment_output(1,nseqs);
-	if (showaln && usemenu) show_aln();
-
-	phylip_name[0]=EOS;
-
-}
-
-
-
-
-
-/*
- * Build a guide tree only: clears structure-penalty masks, runs the
- * pairwise stage over all sequences, opens the tree file (explicit name or
- * prompted "dnd" default) and writes it with guide_tree().  No multiple
- * alignment is performed.
- *
- * phylip_name: guide tree file name; reset to the empty string on the
- * normal exit path.
- */
-void make_tree(char *phylip_name)
-{
- char path[FILENAMELEN+1];
- FILE *tree;
-
- if(empty) {
- error("No sequences in memory. Load sequences first.");
- return;
- }
-
- /* drop all secondary-structure / gap-penalty mask state */
- struct_penalties1 = struct_penalties2 = NONE;
- if (sec_struct_mask1 != NULL) sec_struct_mask1=ckfree(sec_struct_mask1);
- if (sec_struct_mask2 != NULL) sec_struct_mask2=ckfree(sec_struct_mask2);
- if (gap_penalty_mask1 != NULL) gap_penalty_mask1=ckfree(gap_penalty_mask1);
- if (gap_penalty_mask2 != NULL) gap_penalty_mask2=ckfree(gap_penalty_mask2);
- if (ss_name1 != NULL) ss_name1=ckfree(ss_name1);
- if (ss_name2 != NULL) ss_name2=ckfree(ss_name2);
-
- if(reset_alignments_new || reset_alignments_all) reset_align();
-
- get_path(seqname,path);
-
- if (nseqs < 2) {
- error("Less than 2 sequences in memory. Phylogenetic tree cannot be built.");
- return;
- }
-
- if (save_parameters) create_parameter_output();
-
- info("Start of Pairwise alignments");
- info("Aligning...");
- /* copy the DNA or protein parameter set into the working parameters */
- if(dnaflag) {
- gap_open = dna_gap_open;
- gap_extend = dna_gap_extend;
- pw_go_penalty = dna_pw_go_penalty;
- pw_ge_penalty = dna_pw_ge_penalty;
- ktup = dna_ktup;
- window = dna_window;
- signif = dna_signif;
- wind_gap = dna_wind_gap;
-
- }
- else {
- gap_open = prot_gap_open;
- gap_extend = prot_gap_extend;
- pw_go_penalty = prot_pw_go_penalty;
- pw_ge_penalty = prot_pw_ge_penalty;
- ktup = prot_ktup;
- window = prot_window;
- signif = prot_signif;
- wind_gap = prot_wind_gap;
-
-
- }
-
- if (quick_pairalign)
- show_pair((sint)0,nseqs,(sint)0,nseqs);
- else
- pairalign((sint)0,nseqs,(sint)0,nseqs);
-
- /* always true here: the nseqs < 2 case returned above */
- if (nseqs >= 2) {
- get_path(seqname,path);
- if (phylip_name[0]!=EOS) {
- if((tree = open_explicit_file(
- phylip_name))==NULL) return;
- }
- else {
- if((tree = open_output_file(
- "\nEnter name for new GUIDE TREE file ",path,
- phylip_name,"dnd")) == NULL) return;
- }
-
- guide_tree(tree,1,nseqs);
- info("Guide tree file created: [%s]",
- phylip_name);
- }
-
- if(reset_alignments_new || reset_alignments_all) reset_align();
-
- phylip_name[0]=EOS;
-}
-
-
-
-
-
-
-
-
-
-/*
- * Multiple alignment from an existing guide tree: clears structure-penalty
- * masks, opens the alignment output, obtains the tree file name (prompting
- * in menu mode), checks that the file can be opened for reading, then
- * calls malign() and create_alignment_output().
- *
- * phylip_name: guide tree file name; reset to the empty string on the
- * normal exit path.
- */
-void get_tree(char *phylip_name)
-{
- char path[FILENAMELEN+1],temp[MAXLINE+1];
- char tmp_msg[FILENAMELEN+300];
- sint count;
-
- if(empty) {
- error("No sequences in memory. Load sequences first.");
- return;
- }
- /* drop all secondary-structure / gap-penalty mask state */
- struct_penalties1 = struct_penalties2 = NONE;
- if (sec_struct_mask1 != NULL) sec_struct_mask1=ckfree(sec_struct_mask1);
- if (sec_struct_mask2 != NULL) sec_struct_mask2=ckfree(sec_struct_mask2);
- if (gap_penalty_mask1 != NULL) gap_penalty_mask1=ckfree(gap_penalty_mask1);
- if (gap_penalty_mask2 != NULL) gap_penalty_mask2=ckfree(gap_penalty_mask2);
- if (ss_name1 != NULL) ss_name1=ckfree(ss_name1);
- if (ss_name2 != NULL) ss_name2=ckfree(ss_name2);
-
-
- get_path(seqname,path);
-
- if(usemenu || !interactive) {
- if(!open_alignment_output(path)) return;
- }
-
- if(reset_alignments_new || reset_alignments_all) reset_align();
-
- get_path(seqname,path);
-
- if (nseqs >= 2) {
-
- if(usemenu) {
- /* offer "<path>dnd" as the default tree file name */
- strcpy(phylip_name,path);
- strcat(phylip_name,"dnd");
-
- sprintf(tmp_msg,"\nEnter a name for the guide tree file [%s]",
- phylip_name);
- getstr(tmp_msg,MAXLINE+1,temp);
- if(*temp != EOS)
- strcpy(phylip_name,temp);
- }
-
- /* NOTE(review): tree is not declared in this function (presumably file scope), and the stream opened here is never closed in this function - confirm who owns it */
- if(usemenu || !interactive) {
-#ifdef VMS
- if((tree=fopen(phylip_name,"r","rat=cr","rfm=var"))==NULL) {
-#else
- if((tree=fopen(phylip_name,"r"))==NULL) {
-#endif
- error("Cannot open tree file [%s]",phylip_name);
- return;
- }
- }
- }
- /* NOTE(review): the pairwise stage runs only when nseqs < 2 - confirm this is intended */
- else {
- info("Start of Pairwise alignments");
- info("Aligning...");
- if(dnaflag) {
- gap_open = dna_gap_open;
- gap_extend = dna_gap_extend;
- pw_go_penalty = dna_pw_go_penalty;
- pw_ge_penalty = dna_pw_ge_penalty;
- ktup = dna_ktup;
- window = dna_window;
- signif = dna_signif;
- wind_gap = dna_wind_gap;
-
- }
- else {
- gap_open = prot_gap_open;
- gap_extend = prot_gap_extend;
- pw_go_penalty = prot_pw_go_penalty;
- pw_ge_penalty = prot_pw_ge_penalty;
- ktup = prot_ktup;
- window = prot_window;
- signif = prot_signif;
- wind_gap = prot_wind_gap;
-
- }
-
- if (quick_pairalign)
- show_pair((sint)0,nseqs,(sint)0,nseqs);
- else
- pairalign((sint)0,nseqs,(sint)0,nseqs);
- }
-
- if (save_parameters) create_parameter_output();
-
- count = malign(0,phylip_name);
- if (count <= 0) return;
-
- if (usemenu) fprintf(stdout,"\n\n\n");
-
- create_alignment_output(1,nseqs);
- if (showaln && usemenu) show_aln();
-
- phylip_name[0]=EOS;
-}
-
-
-
-/*
- * Align profile 2 to profile 1.
- *
- * For each profile holding at least two sequences, either reuse a guide
- * tree (an existing "<path>dnd" file confirmed by the user in menu mode,
- * or use_tree1_file / use_tree2_file otherwise) or compute one from
- * countid() distances and write it with guide_tree().  Then palign1()
- * runs, both profiles are reset, and palign2() runs.
- *
- * p1_tree_name, p2_tree_name: guide tree file names for the two profiles;
- * both are reset to the empty string on the normal exit path.
- */
-void profile_align(char *p1_tree_name,char *p2_tree_name)
-{
- char path[FILENAMELEN+1];
- char tree_name[FILENAMELEN+1];
- char temp[MAXLINE+1];
- char tmp_msg[FILENAMELEN+300];
-
- Boolean use_tree1,use_tree2;
- FILE *tree;
- /* NOTE(review): dscore is an integer type here but float in new_sequence_align() - confirm the truncation of countid() is intended */
- sint count,i,j,dscore;
-
- if(profile1_empty || profile2_empty) {
- error("No sequences in memory. Load sequences first.");
- return;
- }
-
- get_path(profile1_name,path);
-
- if(usemenu || !interactive) {
- if(!open_alignment_output(path)) return;
- }
-
- if(reset_alignments_new || reset_alignments_all) {
- reset_prf1();
- reset_prf2();
- }
- else fix_gaps();
-
- /* decide whether an existing guide tree is used for profile 1 */
- tree_name[0] = EOS;
- use_tree1 = FALSE;
- if (profile1_nseqs >= 2) {
- if (check_tree && usemenu) {
- strcpy(tree_name,path);
- strcat(tree_name,"dnd");
-#ifdef VMS
- if((tree=fopen(tree_name,"r","rat=cr","rfm=var"))!=NULL) {
-#else
- if((tree=fopen(tree_name,"r"))!=NULL) {
-#endif
- sprintf(tmp_msg,"\nUse the existing GUIDE TREE file for Profile 1, %s (y/n) ? [y]",
- tree_name);
- getstr(tmp_msg,MAXLINE+1,temp);
- if(*temp != 'n' && *temp != 'N') {
- strcpy(p1_tree_name,tree_name);
- use_tree1 = TRUE;
- }
- fclose(tree);
- }
- }
- else if (!usemenu && use_tree1_file) {
- use_tree1 = TRUE;
- }
- }
- /* same decision for profile 2 */
- tree_name[0] = EOS;
- use_tree2 = FALSE;
- get_path(profile2_name,path);
- if (nseqs-profile1_nseqs >= 2) {
- if (check_tree && usemenu) {
- strcpy(tree_name,path);
- strcat(tree_name,"dnd");
-#ifdef VMS
- if((tree=fopen(tree_name,"r","rat=cr","rfm=var"))!=NULL)
-#else
- if((tree=fopen(tree_name,"r"))!=NULL)
-#endif
- {
- sprintf(tmp_msg,"\nUse the existing GUIDE TREE file for Profile 2, %s (y/n) ? [y]",
- tree_name);
- getstr(tmp_msg,MAXLINE+1,temp);
- if(*temp != 'n' && *temp != 'N') {
- strcpy(p2_tree_name,tree_name);
- use_tree2 = TRUE;
- }
- fclose(tree);
- }
- }
- else if (!usemenu && use_tree2_file) {
- use_tree2 = TRUE;
- }
- }
-
- if (save_parameters) create_parameter_output();
-
- if (struct_penalties1 == SECST)
-
- calc_gap_penalty_mask(seqlen_array[1],sec_struct_mask1,gap_penalty_mask1);
-
- if (struct_penalties2 == SECST)
-
- calc_gap_penalty_mask(seqlen_array[profile1_nseqs+1],sec_struct_mask2,gap_penalty_mask2);
-
- /* no tree to reuse for profile 1: fill tmat from countid() and write a new tree */
- if (use_tree1 == FALSE)
- if (profile1_nseqs >= 2) {
- for (i=1;i<=profile1_nseqs;i++) {
- for (j=i+1;j<=profile1_nseqs;j++) {
- dscore = countid(i,j);
- tmat[i][j] = (100.0 - dscore)/100.0;
- tmat[j][i] = tmat[i][j];
- }
- }
- get_path(profile1_name,path);
- if (p1_tree_name[0]!=EOS) {
- if((tree = open_explicit_file(p1_tree_name))==NULL) return;
- }
- else {
- if((tree = open_output_file(
- "\nEnter name for new GUIDE TREE file for profile 1 ",path,
- p1_tree_name,"dnd")) == NULL) return;
- }
-
- guide_tree(tree,1,profile1_nseqs);
- info("Guide tree file created: [%s]",
- p1_tree_name);
- }
- /* same for profile 2, over sequences profile1_nseqs+1 .. nseqs */
- if (use_tree2 == FALSE)
- if(nseqs-profile1_nseqs >= 2) {
- for (i=1+profile1_nseqs;i<=nseqs;i++) {
- for (j=i+1;j<=nseqs;j++) {
- dscore = countid(i,j);
- tmat[i][j] = (100.0 - dscore)/100.0;
- tmat[j][i] = tmat[i][j];
- }
- }
- if (p2_tree_name[0]!=EOS) {
- if((tree = open_explicit_file(p2_tree_name))==NULL) return;
- }
- else {
- get_path(profile2_name,path);
- if((tree = open_output_file(
- "\nEnter name for new GUIDE TREE file for profile 2 ",path,
- p2_tree_name,"dnd")) == NULL) return;
- }
- guide_tree(tree,profile1_nseqs+1,nseqs-profile1_nseqs);
- info("Guide tree file created: [%s]",
- p2_tree_name);
- }
-
- /* tree-only run: stop before the alignment itself */
- if (new_tree1_file || new_tree2_file) return;
-
-/* do an initial alignment to get the pairwise identities between the two
-profiles - used to set parameters for the final alignment */
- count = palign1();
- if (count == 0) return;
-
- reset_prf1();
- reset_prf2();
-
- count = palign2(p1_tree_name,p2_tree_name);
-
- if (count == 0) return;
-
- if(usemenu) fprintf(stdout,"\n\n\n");
-
- create_alignment_output(1,nseqs);
- if (showaln && usemenu) show_aln();
-
- p1_tree_name[0]=EOS;
- p2_tree_name[0]=EOS;
-}
-
-
-
-
-
-
- /* Residue range (start .. end) computed by fillrange() and printed after a sequence name as "name/start-end". */
- typedef struct rangeNum {
- int start;
- int end;
- } rangeNum;
-
-
-/*
- * Compute the residue range to print after a sequence name.
- *
- * INPUT:   rnum  receives the result
- *          fres  first alignment column written (1-based)
- *          len   number of alignment columns written
- *          fseq  position of the sequence in output order (mapped through
- *                output_index)
- *
- * RETURNS: nothing; rnum->start and rnum->end are filled in.
- *
- * If the sequence name already has the form "name/start-end", its start
- * value is used as an offset.  Diagnostic text is written to stdout.
- */
-void fillrange(rangeNum *rnum, sint fres, sint len, sint fseq)
-{
-	sint val;
-	sint i;
-	sint j;
-
-	int istart = 0;
-	int iend = 0;      /* to print sequence start-end with names */
-	int found = 0;
-	int ngaps = 0;
-	int tmpStart = 0;
-	int tmpEnd = 0;
-	int ntermgaps = 0;
-	int pregaps = 0;
-	int tmpk = 0;
-	int isRange = 0;
-	int formula = 0;
-
-	i = output_index[fseq];
-
-	/*
-	 * The text before '/' is skipped rather than stored: it was never
-	 * read, and storing it could overrun a fixed buffer on long names.
-	 * Two conversions now mean the full "name/start-end" form matched.
-	 */
-	if (sscanf(names[i],"%*[^/]/%d-%d",&tmpStart,&tmpEnd) == 2) {
-		isRange = 1;
-	}
-
-	/* gaps in the columns before fres; done irrespective of the sscanf above */
-	for(tmpk=1; tmpk<fres; tmpk++) {
-		val = seq_array[i][tmpk];
-		if ((val < 0) || (val > max_aa)) { /* it is a gap */
-			pregaps++;
-		}
-	}
-
-	for(j=fres; j<fres+len; j++) {
-		val = seq_array[i][j];
-		if((val == -3) || (val == 253))
-			break;
-		else if((val < 0) || (val > max_aa)) {
-			ngaps++;
-		}
-		else {
-			found = j;
-		}
-		/* remember the first residue column and the gaps seen before it */
-		if ( found && (istart == 0) ) {
-			istart = found;
-			ntermgaps = ngaps;
-		}
-	}
-
-	if( seqRange) {
-		printf("Name : %s ",names[i]);
-		printf("\n  fres = %d ",fres);
-		printf("   len = %d ",len);
-		printf("\n  istart = %d ",istart);
-		printf("\n  tmpStart = %d ",tmpStart);
-		printf("\n  ngaps = %d ",ngaps);
-		printf("\n  pregaps = %d ",pregaps);
-		if (!isRange)
-			formula = istart - pregaps;
-		else
-			formula = istart - pregaps +  ( tmpStart == 1 ? 0: tmpStart-1) ;
-
-		printf("\n\nsuggestion  istart - pregaps + tmpStart - ntermgaps = %d - %d + %d - %d",istart,
-			pregaps,tmpStart,ntermgaps);
-		printf(" formula %d ",formula);
-	}
-	else {
-		printf("\n no range found .... strange,  istart = %d",istart);
-		formula = 1;
-	}
-
-	if (pregaps == fres-1) /* all gaps -  now the conditions........ */
-		formula = tmpStart ; /* keep the previous start... */
-	formula = (formula <= 0) ? 1: formula;
-	if (pregaps ==0 && tmpStart == 0) {
-		formula = fres;
-	}
-	iend = formula + len - ngaps -1;
-
-	rnum->start = formula;
-	rnum->end = iend;
-	printf("\n check... %s %d - %d",names[i],rnum->start,rnum->end);
-	printf(" Done checking.........");
-}
-
-
-/*
- * Write columns fres .. fres+len-1 of sequences fseq .. lseq (in
- * output_index order) to fastaout in FASTA format.  Out-of-range residue
- * codes are written as '-'.  When seqRange is set, "/start-end" from
- * fillrange() is appended to each name.
- */
-void fasta_out(FILE *fastaout, sint fres, sint len, sint fseq, sint lseq)
-{
-	char *seq, residue;
-	sint val;
-	sint i,ii;
-	sint j,slen;
-	sint line_length;
-	rangeNum rnum;	/* automatic storage: nothing to allocate, check or free */
-
-	seq = (char *)ckalloc((len+1) * sizeof(char));
-
-	line_length=PAGEWIDTH-max_names;
-	line_length=line_length-line_length % 10; /* round to a multiple of 10*/
-	if (line_length > LINELENGTH) line_length=LINELENGTH;
-
-	for(ii=fseq; ii<=lseq; ii++) {
-		i = output_index[ii];
-		slen = 0;
-		for(j=fres; j<fres+len; j++) {
-			val = seq_array[i][j];
-			if((val == -3) || (val == 253))
-				break;
-			else if((val < 0) || (val > max_aa)) {
-				residue = '-';
-			}
-			else {
-				residue = amino_acid_codes[val];
-			}
-			if (lowercase)
-				seq[j-fres] = (char)tolower((int)residue);
-			else
-				seq[j-fres] = residue;
-			slen++;
-		}
-		fprintf(fastaout, ">%-s",nameonly(names[i]));
-		if(seqRange) {
-			fillrange(&rnum,fres, len, ii);
-			fprintf(fastaout,"/%d-%d",rnum.start, rnum.end);
-		}
-		fprintf(fastaout,"\n");
-		/* NOTE(review): toupper() here undoes the lowercase option applied above - confirm intended */
-		for(j=1; j<=slen; j++) {
-			fprintf(fastaout,"%c",toupper(seq[j-1]));
-			if((j % line_length == 0) || (j == slen))
-				fprintf(fastaout,"\n");
-		}
-	}
-	seq=ckfree((void *)seq);
-}
-
-
-/*
- * Write columns fres .. fres+len-1 of sequences fseq .. lseq (in
- * output_index order) to clusout in CLUSTAL format: a header line, then
- * blocks of at most line_length columns, each followed by a conservation
- * line ('*', ':' or '.').  Secondary-structure and gap-penalty mask lines
- * are printed above each block according to output_struct_penalties.
- * When seqRange is set, names are printed as name/start-end using
- * fillrange().
- *
- * clusout is used as passed in; reopening it here created duplicate files
- * on VMS.
- */
-void clustal_out(FILE *clusout, sint fres, sint len, sint fseq, sint lseq)
-{
-	static char *seq1;
-	static sint *seq_no;
-	static sint *print_seq_no;
-	char *ss_mask1 = NULL, *ss_mask2 = NULL;
-	char temp[MAXLINE];
-	char c;
-	sint val;
-	sint ii,lv1,catident1[NUMRES],catident2[NUMRES],ident,chunks;
-	sint i,j,k,l;
-	sint pos,ptr;
-	sint line_length;
-	rangeNum rnum;	/* automatic storage: nothing to allocate, check or free */
-	char tmpStr[FILENAMELEN+15];
-
-	seq_no = (sint *)ckalloc((nseqs+1) * sizeof(sint));
-	print_seq_no = (sint *)ckalloc((nseqs+1) * sizeof(sint));
-	for (i=fseq;i<=lseq;i++)
-	{
-		print_seq_no[i] = seq_no[i] = 0;
-		for(j=1;j<fres;j++) {
-			val = seq_array[i][j];
-			/* only real residue codes advance the running residue number */
-			if((val >=0) && (val <=max_aa)) seq_no[i]++;
-		}
-	}
-
-	seq1 = (char *)ckalloc((max_aln_length+1) * sizeof(char));
-
-	if (struct_penalties1 == SECST && use_ss1 == TRUE) {
-		ss_mask1 = (char *)ckalloc((seqlen_array[1]+10) * sizeof(char));
-		for (i=0;i<seqlen_array[1];i++)
-			ss_mask1[i] = sec_struct_mask1[i];
-		print_sec_struct_mask(seqlen_array[1],sec_struct_mask1,ss_mask1);
-	}
-	if (struct_penalties2 == SECST && use_ss2 == TRUE) {
-		ss_mask2 = (char *)ckalloc((seqlen_array[profile1_nseqs+1]+10) * sizeof(char));
-		for (i=0;i<seqlen_array[profile1_nseqs+1];i++)
-			ss_mask2[i] = sec_struct_mask2[i];
-		print_sec_struct_mask(seqlen_array[profile1_nseqs+1],sec_struct_mask2,ss_mask2);
-	}
-
-	fprintf(clusout,"CLUSTAL %s multiple sequence alignment\n\n",
-		revision_level);
-
-	/* decide the line length for this alignment - maximum is LINELENGTH */
-	line_length=PAGEWIDTH-max_names;
-	line_length=line_length-line_length % 10; /* round to a multiple of 10*/
-	if (line_length > LINELENGTH) line_length=LINELENGTH;
-
-	chunks = len/line_length;
-	if(len % line_length != 0)
-		++chunks;
-
-	for(lv1=1;lv1<=chunks;++lv1) {
-		/* pos..ptr are the 1-based columns of this block, relative to fres */
-		pos = ((lv1-1)*line_length)+1;
-		ptr = (len<pos+line_length-1) ? len : pos+line_length-1;
-
-		fprintf(clusout,"\n");
-
-		if (output_struct_penalties == 0 || output_struct_penalties == 2) {
-			if (struct_penalties1 == SECST && use_ss1 == TRUE) {
-				for(i=pos;i<=ptr;++i) {
-					val=ss_mask1[i+fres-2];
-					if (val == gap_pos1 || val == gap_pos2)
-						temp[i-pos]='-';
-					else
-						temp[i-pos]=val;
-				}
-				temp[ptr-pos+1]=EOS;
-				if(seqRange) /*Ramu*/
-					fprintf(clusout,"!SS_%-*s  %s\n",max_names+15,ss_name1,temp);
-				else
-					fprintf(clusout,"!SS_%-*s  %s\n",max_names,ss_name1,temp);
-			}
-		}
-		if (output_struct_penalties == 1 || output_struct_penalties == 2) {
-			if (struct_penalties1 != NONE && use_ss1 == TRUE) {
-				for(i=pos;i<=ptr;++i) {
-					val=gap_penalty_mask1[i+fres-2];
-					if (val == gap_pos1 || val == gap_pos2)
-						temp[i-pos]='-';
-					else
-						temp[i-pos]=val;
-				}
-				temp[ptr-pos+1]=EOS;
-				fprintf(clusout,"!GM_%-*s  %s\n",max_names,ss_name1,temp);
-			}
-		}
-		if (output_struct_penalties == 0 || output_struct_penalties == 2) {
-			if (struct_penalties2 == SECST && use_ss2 == TRUE) {
-				for(i=pos;i<=ptr;++i) {
-					val=ss_mask2[i+fres-2];
-					if (val == gap_pos1 || val == gap_pos2)
-						temp[i-pos]='-';
-					else
-						temp[i-pos]=val;
-				}
-				temp[ptr-pos+1]=EOS;
-				if (seqRange )
-					fprintf(clusout,"!SS_%-*s  %s\n",max_names+15,ss_name2,temp);
-				else
-					fprintf(clusout,"!SS_%-*s  %s\n",max_names,ss_name2,temp);
-			}
-		}
-		if (output_struct_penalties == 1 || output_struct_penalties == 2) {
-			if (struct_penalties2 != NONE && use_ss2 == TRUE) {
-				for(i=pos;i<=ptr;++i) {
-					val=gap_penalty_mask2[i+fres-2];
-					if (val == gap_pos1 || val == gap_pos2)
-						temp[i-pos]='-';
-					else
-						temp[i-pos]=val;
-				}
-				temp[ptr-pos+1]=EOS;
-				fprintf(clusout,"!GM_%-*s  %s\n",max_names,ss_name2,temp);
-			}
-		}
-
-		/* one line per sequence for this block */
-		for(ii=fseq;ii<=lseq;++ii) {
-			i=output_index[ii];
-			print_seq_no[i] = 0;
-			for(j=pos;j<=ptr;++j) {
-				if (j+fres-1<=seqlen_array[i])
-					val = seq_array[i][j+fres-1];
-				else val = -3;
-				if((val == -3) || (val == 253)) break;
-				else if((val < 0) || (val > max_aa)){
-					seq1[j]='-';
-				}
-				else {
-					seq1[j]=amino_acid_codes[val];
-					seq_no[i]++;
-					print_seq_no[i]=1;
-				}
-			}
-			/* pad a short sequence with gaps to the end of the block */
-			for(;j<=ptr;++j) seq1[j]='-';
-			strncpy(temp,&seq1[pos],ptr-pos+1);
-			temp[ptr-pos+1]=EOS;
-			if (!seqRange) {
-				fprintf(clusout,"%-*s",max_names+5,names[i]);
-			}
-			else {
-				fillrange(&rnum,fres, len, ii);
-				sprintf(tmpStr,"%s/%d-%d", nameonly(names[i]), rnum.start, rnum.end);
-				fprintf(clusout,"%-*s",max_names+15,tmpStr);
-			}
-			fprintf(clusout," %s",temp);
-			if (cl_seq_numbers && print_seq_no[i])
-				fprintf(clusout," %d",seq_no[i]);
-			fprintf(clusout,"\n");
-		}
-
-		/* conservation line: compare every sequence against sequence fseq, column by column */
-		for(i=pos;i<=ptr;++i) {
-			seq1[i]=' ';
-			ident=0;
-			for(j=1;res_cat1[j-1]!=NULL;j++) catident1[j-1] = 0;
-			for(j=1;res_cat2[j-1]!=NULL;j++) catident2[j-1] = 0;
-			for(j=fseq;j<=lseq;++j) {
-				if((seq_array[fseq][i+fres-1] >=0) &&
-				   (seq_array[fseq][i+fres-1] <= max_aa)) {
-					if(seq_array[fseq][i+fres-1] == seq_array[j][i+fres-1])
-						++ident;
-					for(k=1;res_cat1[k-1]!=NULL;k++) {
-						for(l=0;(c=res_cat1[k-1][l]);l++) {
-							if (amino_acid_codes[seq_array[j][i+fres-1]]==c)
-							{
-								catident1[k-1]++;
-								break;
-							}
-						}
-					}
-					for(k=1;res_cat2[k-1]!=NULL;k++) {
-						for(l=0;(c=res_cat2[k-1][l]);l++) {
-							if (amino_acid_codes[seq_array[j][i+fres-1]]==c)
-							{
-								catident2[k-1]++;
-								break;
-							}
-						}
-					}
-				}
-			}
-			if(ident==lseq-fseq+1)
-				seq1[i]='*';
-			else if (!dnaflag) {
-				for(k=1;res_cat1[k-1]!=NULL;k++) {
-					if (catident1[k-1]==lseq-fseq+1) {
-						seq1[i]=':';
-						break;
-					}
-				}
-				if(seq1[i]==' ')
-					for(k=1;res_cat2[k-1]!=NULL;k++) {
-						if (catident2[k-1]==lseq-fseq+1) {
-							seq1[i]='.';
-							break;
-						}
-					}
-			}
-		}
-		strncpy(temp,&seq1[pos],ptr-pos+1);
-		temp[ptr-pos+1]=EOS;
-		for(k=0;k<max_names+6;k++) fprintf(clusout," ");
-		if(seqRange) /*<ramu>*/
-			fprintf(clusout,"          "); /*</ramu>*/
-		fprintf(clusout,"%s\n",temp);
-	}
-
-	seq1=ckfree((void *)seq1);
-	/* the per-call counters were never released before */
-	seq_no=ckfree((void *)seq_no);
-	print_seq_no=ckfree((void *)print_seq_no);
-	if (struct_penalties1 == SECST && use_ss1 == TRUE) ckfree(ss_mask1);
-	if (struct_penalties2 == SECST && use_ss2 == TRUE) ckfree(ss_mask2);
-	/* DES	ckfree(output_index); */
-}
-
-
-
-
-/*
- * Write columns fres .. fres+len-1 of sequences fseq .. lseq (in
- * output_index order) to gcgout in GCG/MSF format: a header with one
- * checksum per sequence (SeqGCGCheckSum) and their sum modulo 10000,
- * then blocks of GCG_LINELENGTH columns.  Gaps and positions past the end
- * of a sequence are written as '.'.  When seqRange is set, names are
- * printed as name/start-end using fillrange().
- */
-void gcg_out(FILE *gcgout, sint fres, sint len, sint fseq, sint lseq)
-{
-	char *seq, residue;
-	sint val;
-	sint *all_checks;
-	sint i,ii,chunks,block;
-	sint j,pos1,pos2;
-	long grand_checksum;
-	rangeNum rnum;	/* automatic storage: nothing to allocate, check or free */
-	char tmpStr[FILENAMELEN+15];
-
-	seq = (char *)ckalloc((max_aln_length+1) * sizeof(char));
-	all_checks = (sint *)ckalloc((lseq+1) * sizeof(sint));
-
-	for(i=fseq; i<=lseq; i++) {
-		for(j=fres; j<=fres+len-1; j++) {
-			val = seq_array[i][j];
-			if((val == -3) || (val == 253)) break;
-			else if((val < 0) || (val > max_aa))
-				residue = '.';
-			else {
-				residue = amino_acid_codes[val];
-			}
-			seq[j-fres+1] = residue;
-		}
-		/* pad any short sequences with gaps, to make all sequences the same length */
-		for(; j<=fres+len-1; j++)
-			seq[j-fres+1] = '.';
-		all_checks[i] = SeqGCGCheckSum(seq+1, (int)len);
-	}
-
-	/*
-	 * Sum only the checksums of the sequences actually written; all_checks
-	 * holds lseq+1 entries and only fseq..lseq were filled in above.
-	 */
-	grand_checksum = 0;
-	for(ii=fseq; ii<=lseq; ii++) grand_checksum += all_checks[output_index[ii]];
-	grand_checksum = grand_checksum % 10000;
-	fprintf(gcgout,"PileUp\n\n");
-	fprintf(gcgout,"\n\n   MSF:%5d  Type: ",(pint)len);
-	if(dnaflag)
-		fprintf(gcgout,"N");
-	else
-		fprintf(gcgout,"P");
-	fprintf(gcgout,"    Check:%6ld   .. \n\n", (long)grand_checksum);
-	for(ii=fseq; ii<=lseq; ii++) {
-		i = output_index[ii];
-		fprintf(gcgout,
-			" Name: %s oo  Len:%5d  Check:%6ld  Weight:  %.1f\n",
-			names[i],(pint)len,(long)all_checks[i],(float)seq_weight[i-1]*100.0/(float)INT_SCALE_FACTOR);
-	}
-	fprintf(gcgout,"\n//\n");
-
-	chunks = len/GCG_LINELENGTH;
-	if(len % GCG_LINELENGTH != 0) ++chunks;
-
-	for(block=1; block<=chunks; block++) {
-		fprintf(gcgout,"\n\n");
-		pos1 = ((block-1) * GCG_LINELENGTH) + 1;
-		pos2 = (len<pos1+GCG_LINELENGTH-1)? len : pos1+GCG_LINELENGTH-1;
-		for(ii=fseq; ii<=lseq; ii++) {
-			i = output_index[ii];
-			if (!seqRange) {
-				fprintf(gcgout,"\n%-*s ",max_names+5,names[i]);
-			}
-			else {
-				fillrange(&rnum,fres, len, ii);
-				sprintf(tmpStr,"%s/%d-%d",nameonly(names[i]),rnum.start,rnum.end);
-				fprintf(gcgout,"\n%-*s",max_names+15,tmpStr);
-			}
-			for(j=pos1; j<=pos2; j++) {
-				/* short sequences are padded with '.' to the end of the alignment */
-				if (j+fres-1<=seqlen_array[i])
-					val = seq_array[i][j+fres-1];
-				else val = -3;
-				if((val == -3) || (val == 253) || (val < 0) || (val > max_aa))
-					residue = '.';
-				else {
-					residue = amino_acid_codes[val];
-				}
-				fprintf(gcgout,"%c",residue);
-				if(j % 10 == 0) fprintf(gcgout," ");
-			}
-		}
-	}
-	/* DES	ckfree(output_index); */
-
-	seq=ckfree((void *)seq);
-	all_checks=ckfree((void *)all_checks);
-	fprintf(gcgout,"\n\n");
-}
-
-
-/* <Ramu> */
-/************************************************************************
- *
- *
- * Removes the sequence range from sequence name
- *
- *
- * INPUT: Sequence name
- * (e.g. finc_rat/1-200 )
- *
- *
- * RETURNS: pointer to string
- */
-
-char *nameonly(char *s)
-{
-	/* static result buffer: overwritten by the next call, so copy it if it must outlive one */
-	static char tmp[FILENAMELEN+1];
-	int i =0;
-
-	/* copy up to the first '/'; stop at FILENAMELEN characters so a long name is truncated instead of overrunning tmp */
-	while (*s != '/' && *s != '\0' && i < FILENAMELEN) {
-		tmp[i++] = *s++;
-	}
-	tmp[i] = '\0';
-	return &tmp[0];
-}
-
-
-/*
- * Debug helper: count the leading out-of-range (gap) codes in s and stop
- * at the first code in 0..max_aa.
- *
- * The original loop advanced s only on a gap, so the first in-range code
- * made it spin forever; it also used the out-of-range code as an index
- * into amino_acid_codes.  The count returned for an all-gap string is
- * unchanged.
- * NOTE(review): "leading gaps" is inferred from the function name -
- * confirm against any caller.
- */
-int startFind(char *s)
-{
-	int i = 0;
-	sint val;
-	printf("\n Debug.....\n %s",s);
-
-	while( *s ) {
-		val = *s;
-		if ( (val <0 ) || (val > max_aa)) {
-			i++;
-			s++;
-		}
-		else
-			break;
-	}
-	return i;
-}
-
-/*
-void fasta_out(FILE *fastaout, sint fres, sint len, sint fseq, sint lseq)
-{
- char residue;
- sint val;
- sint i,ii;
- sint j,k;
-
- for(ii=fseq; ii<=lseq; ii++) {
- i = output_index[ii];
- fprintf(fastaout,">%-s",names[i],len);
- j = 1;
- while(j<len) {
- if ( ! (j%80) ) {
- fprintf(fastaout,"\n");
- }
- val = seq_array[i][j];
- if((val < 0) || (val > max_aa))
- residue = '-';
- else {
- residue = amino_acid_codes[val];
- }
- fprintf(fastaout,"%c",residue);
- j++;
- }
- fprintf(fastaout,"\n");
- }
-
-}
-*/
-
-/* </Ramu> */
-
-/*
- * Write columns fres .. fres+len-1 of sequences fseq .. lseq (in
- * output_index order) to nxsout as an interleaved NEXUS data block, in
- * chunks of GCG_LINELENGTH columns.  Out-of-range residue codes are
- * written as '-'; a row stops at the end of its sequence.  When seqRange
- * is set, names are printed as name/start-end using fillrange().
- */
-void nexus_out(FILE *nxsout, sint fres, sint len, sint fseq, sint lseq)
-{
-	char residue;
-	sint val;
-	sint i,ii,chunks,block;
-	sint j,pos1,pos2;
-	rangeNum rnum;	/* automatic storage: nothing to allocate, check or free */
-	char tmpStr[FILENAMELEN+15];
-
-	chunks = len/GCG_LINELENGTH;
-	if(len % GCG_LINELENGTH != 0) ++chunks;
-
-	fprintf(nxsout,"#NEXUS\n");
-	fprintf(nxsout,"BEGIN DATA;\n");
-	fprintf(nxsout,"dimensions ntax=%d nchar=%d;\n",(pint)nseqs,(pint)len);
-	fprintf(nxsout,"format missing=?\n");
-	fprintf(nxsout,"symbols=\"");
-	for(i=0;i<=max_aa;i++)
-		fprintf(nxsout,"%c",amino_acid_codes[i]);
-	fprintf(nxsout,"\"\n");
-	fprintf(nxsout,"interleave datatype=");
-	fprintf(nxsout, dnaflag ? "DNA " : "PROTEIN ");
-	fprintf(nxsout,"gap= -;\n");
-	fprintf(nxsout,"\nmatrix");
-
-	for(block=1; block<=chunks; block++) {
-		pos1 = ((block-1) * GCG_LINELENGTH)+1;
-		pos2 = (len<pos1+GCG_LINELENGTH-1)? len : pos1+GCG_LINELENGTH-1;
-		for(ii=fseq; ii<=lseq; ii++) {
-			i = output_index[ii];
-			if (!seqRange) {
-				fprintf(nxsout,"\n%-*s ",max_names+1,names[i]);
-			}
-			else {
-				fillrange(&rnum,fres, len, ii);
-				sprintf(tmpStr,"%s/%d-%d",nameonly(names[i]),rnum.start,rnum.end);
-				fprintf(nxsout,"\n%-*s",max_names+15,tmpStr);
-			}
-			for(j=pos1; j<=pos2; j++) {
-				if (j+fres-1<=seqlen_array[i])
-					val = seq_array[i][j+fres-1];
-				else val = -3;
-				if((val == -3) || (val == 253))
-					break;
-				else if((val < 0) || (val > max_aa))
-					residue = '-';
-				else {
-					residue = amino_acid_codes[val];
-				}
-				fprintf(nxsout,"%c",residue);
-			}
-		}
-		fprintf(nxsout,"\n");
-	}
-	fprintf(nxsout,";\nend;\n");
-	/* DES	ckfree(output_index); */
-}
-
-
-
-
-/*
- * Write columns fres .. fres+len-1 of sequences fseq .. lseq (in
- * output_index order) to phyout in interleaved PHYLIP format, in blocks
- * of GCG_LINELENGTH columns.  Names are printed in the first block only,
- * truncated to 10 characters (a warning is issued when any name is
- * longer, and says so if the truncated names collide).  When seqRange is
- * set, names are printed as name/start-end using fillrange().
- */
-void phylip_out(FILE *phyout, sint fres, sint len, sint fseq, sint lseq)
-{
-	char residue;
-	sint val;
-	sint i,ii,chunks,block;
-	sint j,pos1,pos2;
-	sint name_len;
-	Boolean warn;
-	char **snames;
-	rangeNum rnum;	/* automatic storage: nothing to allocate, check or free */
-	char tmpStr[FILENAMELEN+15];
-
-	/* snames is indexed by sequence number (fseq..lseq), so it needs lseq+1 slots, not lseq-fseq+2 */
-	snames=(char **)ckalloc((lseq+1)*sizeof(char *));
-	name_len=0;
-	for(i=fseq; i<=lseq; i++) {
-		snames[i]=(char *)ckalloc((11)*sizeof(char));
-		ii=strlen(names[i]);
-		strncpy(snames[i],names[i],10);
-		/* strncpy() does not terminate when the name has 10 or more characters */
-		snames[i][10]=EOS;
-		if(name_len<ii) name_len=ii;
-	}
-	if(name_len>10) {
-		warn=FALSE;
-		for(i=fseq; i<=lseq; i++) {
-			for(j=i+1;j<=lseq;j++) {
-				if (strcmp(snames[i],snames[j]) == 0)
-					warn=TRUE;
-			}
-		}
-		if(warn)
-			warning("Truncating sequence names to 10 characters for PHYLIP output.\n"
-			"Names in the PHYLIP format file are NOT unambiguous.");
-		else
-			warning("Truncating sequence names to 10 characters for PHYLIP output.");
-	}
-
-
-	chunks = len/GCG_LINELENGTH;
-	if(len % GCG_LINELENGTH != 0) ++chunks;
-
-	fprintf(phyout,"%6d %6d",(pint)nseqs,(pint)len);
-
-	for(block=1; block<=chunks; block++) {
-		pos1 = ((block-1) * GCG_LINELENGTH)+1;
-		pos2 = (len<pos1+GCG_LINELENGTH-1)? len : pos1+GCG_LINELENGTH-1;
-		for(ii=fseq; ii<=lseq; ii++) {
-			i = output_index[ii];
-			if(block == 1) {
-				if(!seqRange) {
-					fprintf(phyout,"\n%-10s ",snames[i]);
-				}
-				else
-				{
-					fillrange(&rnum,fres, len, ii);
-					sprintf(tmpStr,"%s/%d-%d",nameonly(names[i]),rnum.start,rnum.end);
-					fprintf(phyout,"\n%-*s",max_names+15,tmpStr);
-				}
-			}
-			else
-				fprintf(phyout,"\n           ");
-			for(j=pos1; j<=pos2; j++) {
-				if (j+fres-1<=seqlen_array[i])
-					val = seq_array[i][j+fres-1];
-				else val = -3;
-				if((val == -3) || (val == 253))
-					break;
-				else if((val < 0) || (val > max_aa))
-					residue = '-';
-				else {
-					residue = amino_acid_codes[val];
-				}
-				fprintf(phyout,"%c",residue);
-				if(j % 10 == 0) fprintf(phyout," ");
-			}
-		}
-		fprintf(phyout,"\n");
-	}
-	/* DES	ckfree(output_index); */
-
-	for(i=fseq;i<=lseq;i++)
-		ckfree(snames[i]);
-	ckfree(snames);
-}
-
-
-
-
-
-void nbrf_out(FILE *nbout, sint fres, sint len, sint fseq, sint lseq)
-{
-/* static char *aacids = "XCSTPAGNDEQHRKMILVFYW";*/
-/* static char *nbases = "XACGT"; */
- char *seq, residue;
- sint val;
- sint i,ii;
- sint j,slen;
- sint line_length;
-
-
- /*<ramu>*/
- rangeNum *rnum;
- char tmpStr[FILENAMELEN+15];
- int tmpk;
-
- if(seqRange) {
- rnum = (struct rangeNum *) malloc(sizeof(struct rangeNum));
- if ( rnum ==NULL ) {
- printf("cannot alloc memory for rnum");
- }
- }
-
- seq = (char *)ckalloc((max_aln_length+1) * sizeof(char));
-
- /* decide the line length for this alignment - maximum is LINELENGTH */
- line_length=PAGEWIDTH-max_names;
- line_length=line_length-line_length % 10; /* round to a multiple of 10*/
- if (line_length > LINELENGTH) line_length=LINELENGTH;
-
- for(ii=fseq; ii<=lseq; ii++) {
- i = output_index[ii];
- fprintf(nbout, dnaflag ? ">DL;" : ">P1;");
- if (!seqRange) {
- fprintf(nbout, "%s\n%s\n", names[i], titles[i]);
- }
- else {
- fillrange(rnum,fres, len, ii);
- sprintf(tmpStr,"%s/%d-%d",nameonly(names[i]),rnum->start,rnum->end);
- fprintf(nbout,"%s\n%s\n",tmpStr,titles[i]);
- }
- slen = 0;
- for(j=fres; j<fres+len; j++) {
- val = seq_array[i][j];
- if((val == -3) || (val == 253))
- break;
- else if((val < 0) || (val > max_aa))
- residue = '-';
- else {
- residue = amino_acid_codes[val];
- }
- seq[j-fres] = residue;
- slen++;
- }
- for(j=1; j<=slen; j++) {
- fprintf(nbout,"%c",seq[j-1]);
- if((j % line_length == 0) || (j == slen))
- fprintf(nbout,"\n");
- }
- fprintf(nbout,"*\n");
- }
- /* DES ckfree(output_index); */
-
- seq=ckfree((void *)seq);
-
- if(seqRange) if (rnum) free(rnum);
-
-}
-
-
-void gde_out(FILE *gdeout, sint fres, sint len, sint fseq, sint lseq)
-{
-/* static char *aacids = "XCSTPAGNDEQHRKMILVFYW";*/
-/* static char *nbases = "XACGT"; */
- char *seq, residue;
- sint val;
- char *ss_mask1, *ss_mask2;
- sint i,ii;
- sint j,slen;
- sint line_length;
-
-
- /*<ramu>*/
- rangeNum *rnum;
- char tmpStr[FILENAMELEN+15];
- int tmpk;
-
- if(seqRange) {
- rnum = (struct rangeNum *) malloc(sizeof(struct rangeNum));
- if ( rnum ==NULL ) {
- printf("cannot alloc memory for rnum");
- }
- }
-
- seq = (char *)ckalloc((max_aln_length+1) * sizeof(char));
-
- /* decide the line length for this alignment - maximum is LINELENGTH */
- line_length=PAGEWIDTH-max_names;
- line_length=line_length-line_length % 10; /* round to a multiple of 10*/
- if (line_length > LINELENGTH) line_length=LINELENGTH;
-
- if (struct_penalties1 == SECST && use_ss1 == TRUE) {
- ss_mask1 = (char *)ckalloc((seqlen_array[1]+10) * sizeof(char));
- for (i=0;i<seqlen_array[1];i++)
- ss_mask1[i] = sec_struct_mask1[i];
- print_sec_struct_mask(seqlen_array[1],sec_struct_mask1,ss_mask1);
- }
- if (struct_penalties2 == SECST && use_ss2 == TRUE) {
- ss_mask2 = (char *)ckalloc((seqlen_array[profile1_nseqs+1]+10) *
- sizeof(char));
- for (i=0;i<seqlen_array[profile1_nseqs+1];i++)
- ss_mask2[i] = sec_struct_mask2[i];
- print_sec_struct_mask(seqlen_array[profile1_nseqs+1],sec_struct_mask2,ss_mask2);
- }
-
-
- for(ii=fseq; ii<=lseq; ii++) {
- i = output_index[ii];
- fprintf(gdeout, dnaflag ? "#" : "%%");
- if(!seqRange) {
- fprintf(gdeout, "%s\n", names[i]);
- }
- else {
- fillrange(rnum,fres, len, ii);
- fprintf(gdeout,"%s/%d-%d\n",nameonly(names[i]),rnum->start,rnum->end);
- }
- slen = 0;
- for(j=fres; j<fres+len; j++) {
- val = seq_array[i][j];
- if((val == -3) || (val == 253))
- break;
- else if((val < 0) || (val > max_aa))
- residue = '-';
- else {
- residue = amino_acid_codes[val];
- }
- if (lowercase)
- seq[j-fres] = (char)tolower((int)residue);
- else
- seq[j-fres] = residue;
- slen++;
- }
- for(j=1; j<=slen; j++) {
- fprintf(gdeout,"%c",seq[j-1]);
- if((j % line_length == 0) || (j == slen))
- fprintf(gdeout,"\n");
- }
- }
- /* DES ckfree(output_index); */
-
- if (output_struct_penalties == 0 || output_struct_penalties == 2) {
- if (struct_penalties1 == SECST && use_ss1 == TRUE) {
- fprintf(gdeout,"\"SS_%-*s\n",max_names,ss_name1);
- for(i=fres; i<fres+len; i++) {
- val=ss_mask1[i-1];
- if (val == gap_pos1 || val == gap_pos2)
- seq[i-fres]='-';
- else
- seq[i-fres]=val;
- }
- seq[i-fres]=EOS;
- for(i=1; i<=len; i++) {
- fprintf(gdeout,"%c",seq[i-1]);
- if((i % line_length == 0) || (i == len))
- fprintf(gdeout,"\n");
- }
- }
-
- if (struct_penalties2 == SECST && use_ss2 == TRUE) {
- fprintf(gdeout,"\"SS_%-*s\n",max_names,ss_name2);
- for(i=fres; i<fres+len; i++) {
- val=ss_mask2[i-1];
- if (val == gap_pos1 || val == gap_pos2)
- seq[i-fres]='-';
- else
- seq[i-fres]=val;
- }
- seq[i]=EOS;
- for(i=1; i<=len; i++) {
- fprintf(gdeout,"%c",seq[i-1]);
- if((i % line_length == 0) || (i == len))
- fprintf(gdeout,"\n");
- }
- }
- }
- if (output_struct_penalties == 1 || output_struct_penalties == 2) {
- if (struct_penalties1 != NONE && use_ss1 == TRUE) {
- fprintf(gdeout,"\"GM_%-*s\n",max_names,ss_name1);
- for(i=fres; i<fres+len; i++) {
- val=gap_penalty_mask1[i-1];
- if (val == gap_pos1 || val == gap_pos2)
- seq[i-fres]='-';
- else
- seq[i-fres]=val;
- }
- seq[i]=EOS;
- for(i=1; i<=len; i++) {
- fprintf(gdeout,"%c",seq[i-1]);
- if((i % line_length == 0) || (i == len))
- fprintf(gdeout,"\n");
- }
- }
- if (struct_penalties2 != NONE && use_ss2 == TRUE) {
- fprintf(gdeout,"\"GM_%-*s\n",max_names,ss_name2);
- for(i=fres; i<fres+len; i++) {
- val=gap_penalty_mask2[i-1];
- if (val == gap_pos1 || val == gap_pos2)
- seq[i-fres]='-';
- else
- seq[i-fres]=val;
- }
- seq[i]=EOS;
- for(i=1; i<=len; i++) {
- fprintf(gdeout,"%c",seq[i-1]);
- if((i % line_length == 0) || (i == len))
- fprintf(gdeout,"\n");
- }
- }
- }
-
- if (struct_penalties1 == SECST && use_ss1 == TRUE) ckfree(ss_mask1);
- if (struct_penalties2 == SECST && use_ss2 == TRUE) ckfree(ss_mask2);
- seq=ckfree((void *)seq);
-
-
- if(seqRange) if (rnum) free(rnum);
-
-}
-
-
-Boolean open_alignment_output(char *path)
-{
-
- if(!output_clustal && !output_nbrf && !output_gcg &&
- !output_phylip && !output_gde && !output_nexus && !output_fasta) {
- error("You must select an alignment output format");
- return FALSE;
- }
-
- if(output_clustal)
- if (outfile_name[0]!=EOS) {
- strcpy(clustal_outname,outfile_name);
- if((clustal_outfile = open_explicit_file(
- clustal_outname))==NULL) return FALSE;
- }
- else {
- /* DES DEBUG
- fprintf(stdout,"\n\n path = %s\n clustal_outname = %s\n\n",
- path,clustal_outname);
- */
- if((clustal_outfile = open_output_file(
- "\nEnter a name for the CLUSTAL output file ",path,
- clustal_outname,"aln"))==NULL) return FALSE;
- /* DES DEBUG
- fprintf(stdout,"\n\n path = %s\n clustal_outname = %s\n\n",
- path,clustal_outname);
- */
- }
- if(output_nbrf)
- if (outfile_name[0]!=EOS) {
- strcpy(nbrf_outname,outfile_name);
- if( (nbrf_outfile = open_explicit_file(nbrf_outname))==NULL)
- return FALSE;
- }
- else
- if((nbrf_outfile = open_output_file(
- "\nEnter a name for the NBRF/PIR output file",path,
- nbrf_outname,"pir"))==NULL) return FALSE;
- if(output_gcg)
- if (outfile_name[0]!=EOS) {
- strcpy(gcg_outname,outfile_name);
- if((gcg_outfile = open_explicit_file( gcg_outname))==NULL)
- return FALSE;
- }
- else
- if((gcg_outfile = open_output_file(
- "\nEnter a name for the GCG output file ",path,
- gcg_outname,"msf"))==NULL) return FALSE;
- if(output_phylip)
- if (outfile_name[0]!=EOS) {
- strcpy(phylip_outname,outfile_name);
- if((phylip_outfile = open_explicit_file(
- phylip_outname))==NULL) return FALSE;
- }
- else
- if((phylip_outfile = open_output_file(
- "\nEnter a name for the PHYLIP output file ",path,
- phylip_outname,"phy"))==NULL) return FALSE;
- if(output_gde)
- if (outfile_name[0]!=EOS) {
- strcpy(gde_outname,outfile_name);
- if((gde_outfile = open_explicit_file(
- gde_outname))==NULL) return FALSE;
- }
- else
- if((gde_outfile = open_output_file(
- "\nEnter a name for the GDE output file ",path,
- gde_outname,"gde"))==NULL) return FALSE;
- if(output_nexus)
- if (outfile_name[0]!=EOS) {
- strcpy(nexus_outname,outfile_name);
- if((nexus_outfile = open_explicit_file(
- nexus_outname))==NULL) return FALSE;
- }
- else
- if((nexus_outfile = open_output_file(
- "\nEnter a name for the NEXUS output file ",path,
- nexus_outname,"nxs"))==NULL) return FALSE;
-
- /* Ramu */
- if(output_fasta)
- if (outfile_name[0]!=EOS) {
- strcpy(fasta_outname,outfile_name);
- if((fasta_outfile = open_explicit_file(
- fasta_outname))==NULL) return FALSE;
- }
- else
- if((fasta_outfile = open_output_file(
- "\nEnter a name for the Fasta output file ",path,
- fasta_outname,"fasta"))==NULL) return FALSE;
-
- return TRUE;
-}
-
-
-
-
-void create_alignment_output(sint fseq, sint lseq)
-{
- sint i,length;
-
- sint ifres; /* starting sequence range - Ramu */
- sint ilres; /* ending sequence range */
- char ignore;
- Boolean rangeOK;
-
- length=0;
-
- ifres = 1;
- ilres = 0;
- rangeOK = FALSE;
- for (i=fseq;i<=lseq;i++)
- if (length < seqlen_array[i])
- length = seqlen_array[i];
- ilres=length;
-
-
- if (setrange != -1 ) {
- /* printf("\n ==================== seqRange is set \n"); */
- if ( sscanf(param_arg[setrange],"%d%[ :,-]%d",&ifres,&ignore,&ilres) !=3) {
- info("seqrange numers are not set properly, using default....");
- ifres = 1;
- ilres = length;
- }
- else
- rangeOK = TRUE;
- }
- if ( rangeOK && ilres > length ) {
- ilres = length; /* if asked for more, set the limit, Ramui */
- info("Seqrange %d is more than the %d setting it to %d ",ilres,length,length);
- }
-
- /* if (usemenu) info("Consensus length = %d",(pint)length);*/
-
- if (usemenu) info("Consensus length = %d",(pint)ilres); /* Ramu */
-
- /*
- printf("\n creating output ....... normal.... setrange = %d \n",setrange);
- printf(" ---------> %d %d \n\n ",ifres,ilres);
- printf(" ---------> %d \n\n ",length);
- */
-
- if(output_clustal) {
- clustal_out(clustal_outfile, ifres, ilres, fseq, lseq);
- fclose(clustal_outfile);
- info("CLUSTAL-Alignment file created [%s]",clustal_outname);
- }
- if(output_nbrf) {
- nbrf_out(nbrf_outfile, ifres, ilres, /*1, length */ fseq, lseq);
- fclose(nbrf_outfile);
- info("NBRF/PIR-Alignment file created [%s]",nbrf_outname);
- }
- if(output_gcg) {
- gcg_out(gcg_outfile, ifres, ilres, /*1, length */ fseq, lseq);
- fclose(gcg_outfile);
- info("GCG-Alignment file created [%s]",gcg_outname);
- }
- if(output_phylip) {
- phylip_out(phylip_outfile, ifres, ilres, /*1, length */ fseq, lseq);
- fclose(phylip_outfile);
- info("PHYLIP-Alignment file created [%s]",phylip_outname);
- }
- if(output_gde) {
- gde_out(gde_outfile, ifres, ilres /*1, length */, fseq, lseq);
- fclose(gde_outfile);
- info("GDE-Alignment file created [%s]",gde_outname);
- }
- if(output_nexus) {
- nexus_out(nexus_outfile, ifres, ilres /*1, length */, fseq, lseq);
- fclose(nexus_outfile);
- info("NEXUS-Alignment file created [%s]",nexus_outname);
- }
- /* Ramu */
- if(output_fasta) {
- fasta_out(fasta_outfile, ifres, ilres /*1, length */, fseq, lseq);
- fclose(fasta_outfile);
- info("Fasta-Alignment file created [%s]",fasta_outname);
- }
-}
-
-
-static void reset_align(void) /* remove gaps from older alignments (code =
- gap_pos1) */
-{ /* EXCEPT for gaps that were INPUT with the seqs.*/
- register sint sl; /* which have code = gap_pos2 */
- sint i,j;
-
- for(i=1;i<=nseqs;++i) {
- sl=0;
- for(j=1;j<=seqlen_array[i];++j) {
- if(seq_array[i][j] == gap_pos1 &&
- ( reset_alignments_new ||
- reset_alignments_all)) continue;
- if(seq_array[i][j] == gap_pos2 && (reset_alignments_all)) continue;
- ++sl;
- seq_array[i][sl]=seq_array[i][j];
- }
- seqlen_array[i]=sl;
- }
-}
-
-
-
-static void reset_prf1(void) /* remove gaps from older alignments (code =
- gap_pos1) */
-{ /* EXCEPT for gaps that were INPUT with the seqs.*/
- register sint sl; /* which have code = gap_pos2 */
- sint i,j;
-
- if (struct_penalties1 != NONE) {
- sl=0;
- for (j=0;j<seqlen_array[1];++j) {
- if (gap_penalty_mask1[j] == gap_pos1 && (reset_alignments_new ||
- reset_alignments_all)) continue;
- if (gap_penalty_mask1[j] == gap_pos2 && (reset_alignments_all)) continue;
- gap_penalty_mask1[sl]=gap_penalty_mask1[j];
- ++sl;
- }
- }
-
- if (struct_penalties1 == SECST) {
- sl=0;
- for (j=0;j<seqlen_array[1];++j) {
- if (sec_struct_mask1[j] == gap_pos1 && (reset_alignments_new ||
- reset_alignments_all)) continue;
- if (sec_struct_mask1[j] == gap_pos2 && (reset_alignments_all)) continue;
- sec_struct_mask1[sl]=sec_struct_mask1[j];
- ++sl;
- }
- }
-
- for(i=1;i<=profile1_nseqs;++i) {
- sl=0;
- for(j=1;j<=seqlen_array[i];++j) {
- if(seq_array[i][j] == gap_pos1 && (reset_alignments_new ||
- reset_alignments_all)) continue;
- if(seq_array[i][j] == gap_pos2 && (reset_alignments_all)) continue;
- ++sl;
- seq_array[i][sl]=seq_array[i][j];
- }
- seqlen_array[i]=sl;
- }
-
-
-}
-
-
-
-static void reset_prf2(void) /* remove gaps from older alignments (code =
- gap_pos1) */
-{ /* EXCEPT for gaps that were INPUT with the seqs.*/
- register sint sl; /* which have code = gap_pos2 */
- sint i,j;
-
- if (struct_penalties2 != NONE) {
- sl=0;
- for (j=0;j<seqlen_array[profile1_nseqs+1];++j) {
- if (gap_penalty_mask2[j] == gap_pos1 && (reset_alignments_new ||
- reset_alignments_all)) continue;
- if (gap_penalty_mask2[j] == gap_pos2 && (reset_alignments_all)) continue;
- gap_penalty_mask2[sl]=gap_penalty_mask2[j];
- ++sl;
- }
- }
-
- if (struct_penalties2 == SECST) {
- sl=0;
- for (j=0;j<seqlen_array[profile1_nseqs+1];++j) {
- if (sec_struct_mask2[j] == gap_pos1 && (reset_alignments_new ||
- reset_alignments_all)) continue;
- if (sec_struct_mask2[j] == gap_pos2 && (reset_alignments_all)) continue;
- sec_struct_mask2[sl]=sec_struct_mask2[j];
- ++sl;
- }
- }
-
- for(i=profile1_nseqs+1;i<=nseqs;++i) {
- sl=0;
- for(j=1;j<=seqlen_array[i];++j) {
- if(seq_array[i][j] == gap_pos1 && (reset_alignments_new ||
- reset_alignments_all)) continue;
- if(seq_array[i][j] == gap_pos2 && (reset_alignments_all)) continue;
- ++sl;
- seq_array[i][sl]=seq_array[i][j];
- }
- seqlen_array[i]=sl;
- }
-
-
-}
-
-
-
-void fix_gaps(void) /* fix gaps introduced in older alignments (code = gap_pos1) */
-{
- sint i,j;
-
- if (struct_penalties1 != NONE) {
- for (j=0;j<seqlen_array[1];++j) {
- if (gap_penalty_mask1[j] == gap_pos1)
- gap_penalty_mask1[j]=gap_pos2;
- }
- }
-
- if (struct_penalties1 == SECST) {
- for (j=0;j<seqlen_array[1];++j) {
- if (sec_struct_mask1[j] == gap_pos1)
- sec_struct_mask1[j]=gap_pos2;
- }
- }
-
- for(i=1;i<=nseqs;++i) {
- for(j=1;j<=seqlen_array[i];++j) {
- if(seq_array[i][j] == gap_pos1)
- seq_array[i][j]=gap_pos2;
- }
- }
-}
-
-static sint find_match(char *probe, char *list[], sint n)
-{
- sint i,j,len;
- sint count,match=0;
-
- len = (sint)strlen(probe);
- for (i=0;i<len;i++) {
- count = 0;
- for (j=0;j<n;j++) {
- if (probe[i] == list[j][i]) {
- match = j;
- count++;
- }
- }
- if (count == 0) return((sint)-1);
- if (count == 1) return(match);
- }
- return((sint)-1);
-}
-
-static void create_parameter_output(void)
-{
- char parname[FILENAMELEN+1], temp[FILENAMELEN+1];
- char path[FILENAMELEN+1];
- char tmp_msg[FILENAMELEN+300];
- FILE *parout;
-
- get_path(seqname,path);
- strcpy(parname,path);
- strcat(parname,"par");
-
- if(usemenu) {
- sprintf(tmp_msg,"\nEnter a name for the parameter output file [%s]",
- parname);
- getstr(tmp_msg,FILENAMELEN+1,temp);
- if(*temp != EOS)
- strcpy(parname,temp);
- }
-
-/* create a file with execute permissions first */
- remove(parname);
- /*
- fd = creat(parname, 0777);
- close(fd);
- */
-
- if((parout = open_explicit_file(parname))==NULL) return;
-
- fprintf(parout,"clustalw \\\n");
- if (!empty && profile1_empty) fprintf(parout,"-infile=%s \\\n",seqname);
- if (!profile1_empty) fprintf(parout,"-profile1=%s\\\n",profile1_name);
- if (!profile2_empty) fprintf(parout,"-profile2=%s \\\n",profile2_name);
- if (dnaflag == TRUE)
- fprintf(parout,"-type=dna \\\n");
- else
- fprintf(parout,"-type=protein \\\n");
-
- if (quick_pairalign) {
- fprintf(parout,"-quicktree \\\n");
- fprintf(parout,"-ktuple=%d \\\n",(pint)ktup);
- fprintf(parout,"-window=%d \\\n",(pint)window);
- fprintf(parout,"-pairgap=%d \\\n",(pint)wind_gap);
- fprintf(parout,"-topdiags=%d \\\n",(pint)signif);
- if (percent) fprintf(parout,"-score=percent \\\n");
- else
- fprintf(parout,"-score=absolute \\\n");
- }
- else {
- if (!dnaflag) {
- fprintf(parout,"-pwmatrix=%s \\\n",pw_mtrxname);
- fprintf(parout,"-pwgapopen=%.2f \\\n",prot_pw_go_penalty);
- fprintf(parout,"-pwgapext=%.2f \\\n",prot_pw_ge_penalty);
- }
- else {
- fprintf(parout,"-pwgapopen=%.2f \\\n",pw_go_penalty);
- fprintf(parout,"-pwgapext=%.2f \\\n",pw_ge_penalty);
- }
- }
-
- if (!dnaflag) {
- fprintf(parout,"-matrix=%s \\\n",mtrxname);
- fprintf(parout,"-gapopen=%.2f \\\n",prot_gap_open);
- fprintf(parout,"-gapext=%.2f \\\n",prot_gap_extend);
- }
- else {
- fprintf(parout,"-gapopen=%.2f \\\n",dna_gap_open);
- fprintf(parout,"-gapext=%.2f \\\n",dna_gap_extend);
- }
-
- fprintf(parout,"-maxdiv=%d \\\n",(pint)divergence_cutoff);
- if (!use_endgaps) fprintf(parout,"-endgaps \\\n");
-
- if (!dnaflag) {
- if (neg_matrix) fprintf(parout,"-negative \\\n");
- if (no_pref_penalties) fprintf(parout,"-nopgap \\\n");
- if (no_hyd_penalties) fprintf(parout,"-nohgap \\\n");
- if (no_var_penalties) fprintf(parout,"-novgap \\\n");
- fprintf(parout,"-hgapresidues=%s \\\n",hyd_residues);
- fprintf(parout,"-gapdist=%d \\\n",(pint)gap_dist);
- }
- else {
- fprintf(parout,"-transweight=%.2f \\\n",transition_weight);
- }
-
- if (output_gcg) fprintf(parout,"-output=gcg \\\n");
- else if (output_gde) fprintf(parout,"-output=gde \\\n");
- else if (output_nbrf) fprintf(parout,"-output=pir \\\n");
- else if (output_phylip) fprintf(parout,"-output=phylip \\\n");
- else if (output_nexus) fprintf(parout,"-output=nexus \\\n");
- if (outfile_name[0]!=EOS) fprintf(parout,"-outfile=%s \\\n",outfile_name);
- if (output_order==ALIGNED) fprintf(parout,"-outorder=aligned \\\n");
- else fprintf(parout,"-outorder=input \\\n");
- if (output_gde)
- if (lowercase) fprintf(parout,"-case=lower \\\n");
- else fprintf(parout,"-case=upper \\\n");
-
-
- fprintf(parout,"-interactive\n");
-
- /*
- if (kimura) fprintf(parout,"-kimura \\\n");
- if (tossgaps) fprintf(parout,"-tossgaps \\\n");
- fprintf(parout,"-seed=%d \\\n",(pint)boot_ran_seed);
- fprintf(parout,"-bootstrap=%d \\\n",(pint)boot_ntrials);
- */
- fclose(parout);
-}
-
-
-#define isgap(val1) ( (val1 < 0) || (val1 > max_aa) )
-#define isend(val1) ((val1 == -3)||(val1 == 253) )
-
-void calc_percidentity(FILE *pfile)
-{
- double **pmat;
- char residue;
-
- float ident;
- int nmatch;
-
- sint val1, val2;
-
- sint i,j,k, length_longest;
- sint length_shortest;
-
- int rs=0, rl=0;
- /* findout sequence length, longest and shortest ; */
- length_longest=0;
- length_shortest=0;
-
- for (i=1;i<=nseqs;i++) {
- /*printf("\n %d : %d ",i,seqlen_array[i]);*/
- if (length_longest < seqlen_array[i]){
- length_longest = seqlen_array[i];
- rs = i;
- }
- if (length_shortest > seqlen_array[i]) {
- length_shortest = seqlen_array[i];
- rl = i;
- }
- }
- /*
- printf("\n shortest length %s %d ",names[rs], length_shortest);
- printf("\n longest est length %s %d",names[rl], length_longest);
- */
-
- pmat = (double **)ckalloc((nseqs+1) * sizeof(double *));
- for (i=0;i<=nseqs;i++)
- pmat[i] = (double *)ckalloc((nseqs+1) * sizeof(double));
- for (i = 0; i <= nseqs; i++)
- for (j = 0; j <= nseqs; j++)
- pmat[i][j] = 0.0;
-
- nmatch = 0;
-
- for (i=1; i <= nseqs; i++) {
- /*printf("\n %5d: comparing %s with ",i,names[i]); */
- for (j=i; j<=nseqs ; j++) {
- printf("\n %s ",names[j]);
- ident = 0;
- nmatch = 0;
- for(k=1; k<=length_longest; k++) {
- val1 = seq_array[i][k];
- val2 = seq_array[j][k];
- if ( isend(val1) || isend(val2)) break; /* end of sequence ????? */
- if ( isgap(val1) || isgap(val2) ) continue; /* residue = '-'; */
- if (val1 == val2) {
- ident++ ;
- nmatch++;
- /* residue = amino_acid_codes[val1];
- printf("%c:",residue);
- residue = amino_acid_codes[val2];
- printf("%c ",residue);*/
- }
- else {
- nmatch++ ;
- }
- }
- ident = ident/nmatch * 100.0 ;
- pmat[i][j] = ident;
- pmat[j][i]= ident;
- /* printf(" %d x %d .... match %d %d \n",i,j,ident,pmat[i][j]); */
- }
-
- }
- /* printf("\n nmatch = %d\n ", nmatch);*/
- fprintf(pfile,"#\n#\n# Percent Identity Matrix - created by Clustal%s \n#\n#\n",revision_level);
- for(i=1;i<=nseqs;i++) {
- fprintf(pfile,"\n %5d: %-*s",i,max_names,names[i]);
- for(j=1;j<=nseqs;j++) {
- fprintf(pfile,"%8.0f",pmat[i][j]);
- }
- }
- fprintf(pfile,"\n");
-
- for (i=0;i<nseqs;i++)
- pmat[i]=ckfree((void *)pmat[i]);
- pmat=ckfree((void *)pmat);
-
-}
Deleted: trunk/packages/clustalw/trunk/makefile
===================================================================
--- trunk/packages/clustalw/trunk/makefile 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/makefile 2007-08-12 15:08:40 UTC (rev 399)
@@ -1,99 +0,0 @@
-
-RM=/bin/rm -f
-
-BINDIR=$(DESTDIR)/usr/bin
-XBINDIR=$(DESTDIR)/usr/X11R6/bin
-DOCDIR=$(DESTDIR)/usr/share/doc/clustalw
-XDOCDIR=$(DESTDIR)/usr/share/doc/clustalx
-LIBDIR=$(DESTDIR)/usr/share/clustalw
-MANDIR=$(DESTDIR)/usr/share/man/man1
-XMANDIR=$(DESTDIR)/usr/X11R6/man/man1
-DOCS=clustalv.doc clustalw.doc clustalw.ms README_W
-XDOCS=README_X clustalx.html
-
-OBJECTS = interface.o sequence.o showpair.o malign.o \
- util.o trees.o gcgcheck.o prfalign.o pairalign.o \
- calcgapcoeff.o calcprf1.o calcprf2.o calctree.o \
- readmat.o alnscore.o random.o
-
-XOBJECTS = xutils.o xmenu.o xcolor.o xdisplay.o xscore.o
-
-HEADERS = general.h clustalw.h
-
-CC = gcc
-CFLAGS = -c -O2
-
-MACHINE=$(shell uname -m)
-ifeq ("$(MACHINE)","alpha")
- # -mieee is for the Alpha only: ClustalW divides by zero (yes, I know it's bad)
- # and expect the processor to goes on. -mieee tells the Alpha to comply with
- # the IEEE standard and to shut up about divisions by zero.
- CFLAGS += -mieee
-endif
-
-LFLAGS = -O -lm
-NCBI_INC= /usr/include/ncbi
-NCBI_LIB= /usr/lib
-CXFLAGS = -DWIN_MOTIF -I$(NCBI_INC)
-LXFLAGS = -L/usr/X11R6/lib -lvibrant -lncbi -lpthread -lXm -lXmu -lXt -lX11 -lm
-
-all: clustalx clustalw
-
-machine:
- echo $(MACHINE)
-
-clustalw : $(OBJECTS) $(XOBJECTS) amenu.o clustalw.o
- $(CC) -o $@ -I$(NCBI_INC) $(OBJECTS) amenu.o clustalw.o $(LFLAGS)
-
-clustalx : $(OBJECTS) $(XOBJECTS) clustalx.o
- $(CC) -o $@ -I$(NCBI_INC) $(OBJECTS) $(XOBJECTS) clustalx.o $(LFLAGS) $(LXFLAGS)
-
-clustalw.o : clustalw.c $(HEADERS)
- $(CC) $(CFLAGS) $*.c
-
-clustalx.o : clustalx.c $(HEADERS)
- $(CC) $(CFLAGS) $(CXFLAGS) $*.c
-
-xmenu.o : xmenu.c $(HEADERS) param.h
- $(CC) $(CFLAGS) $(CXFLAGS) $*.c
-
-xdisplay.o : xdisplay.c $(HEADERS) param.h
- $(CC) $(CFLAGS) $(CXFLAGS) $*.c
-
-xutils.o : xutils.c $(HEADERS) param.h
- $(CC) $(CFLAGS) $(CXFLAGS) $*.c
-
-xcolor.o : xcolor.c $(HEADERS) param.h
- $(CC) $(CFLAGS) $(CXFLAGS) $*.c
-
-xscore.o : xscore.c $(HEADERS) param.h
- $(CC) $(CFLAGS) $(CXFLAGS) $*.c
-
-readmat.o : readmat.c $(HEADERS) matrices.h
- $(CC) $(CFLAGS) $*.c
-
-trees.o : trees.c $(HEADERS) dayhoff.h
- $(CC) $(CFLAGS) $*.c
-
-
-
-install: all
- install -d $(BINDIR) $(XBINDIR) $(LIBDIR) $(DOCDIR)/examples $(MANDIR) $(XMANDIR) $(XDOCDIR)
- install -m 0755 clustalw $(BINDIR)
- install -m 0755 clustalx $(XBINDIR)
- install -m 0644 clustalw_help clustalx_help $(LIBDIR)
- install -m 0644 clustalw.1 $(MANDIR)
- install -m 0644 clustalx.1 $(MANDIR)
- install -m 0644 $(DOCS) $(DOCDIR)
- install -m 0644 $(XDOCS) $(XDOCDIR)
- cp -a -R tests.clustalw $(DOCDIR)/examples/tests
-
-.PHONY: clean distclean
-
-clean:
- $(RM) *.o
-
-distclean: clean
- $(RM) clustalw clustalx
- cd tests.clustalw; make clean
-
Deleted: trunk/packages/clustalw/trunk/makefile.alpha
===================================================================
--- trunk/packages/clustalw/trunk/makefile.alpha 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/makefile.alpha 2007-08-12 15:08:40 UTC (rev 399)
@@ -1,65 +0,0 @@
-install: clustalx clustalw
-
-clean:
- rm *.o
-
-OBJECTS = interface.o sequence.o showpair.o malign.o \
- util.o trees.o gcgcheck.o prfalign.o pairalign.o \
- calcgapcoeff.o calcprf1.o calcprf2.o calctree.o \
- readmat.o alnscore.o random.o
-
-XOBJECTS = xutils.o xmenu.o xcolor.o xdisplay.o xscore.o
-
-HEADERS = general.h clustalw.h
-
-CC = cc
-CFLAGS = -c -O
-LFLAGS = -O -lm
-NCBI_INC = /dec/biolo/ncbi/include
-NCBI_LIB = /dec/biolo/ncbi/lib
-CXFLAGS = -DWIN_MOTIF -I$(NCBI_INC)
-LXFLAGS = -L$(NCBI_LIB) -lvibrant -lncbi -lpthread -lXm -lXmu -lXt -lX11 -lm
-
-clustalw : $(OBJECTS) amenu.o clustalw.o
- $(CC) -o $@ $(OBJECTS) amenu.o clustalw.o $(LFLAGS)
-
-clustalt : $(OBJECTS) amenu.o clustalw.o
- $(CC) -o clustalt $(OBJECTS) amenu.o clustalw.o $(LFLAGS)
-
-
-interface.o : interface.c $(HEADERS) param.h
- $(CC) $(CFLAGS) $*.c
-
-amenu.o : amenu.c $(HEADERS) param.h
- $(CC) $(CFLAGS) $*.c
-
-clustalx : $(OBJECTS) $(XOBJECTS) clustalx.o
- $(CC) -o $@ $(OBJECTS) $(XOBJECTS) clustalx.o $(LFLAGS) $(LXFLAGS)
-
-clustalx.o : clustalx.c $(HEADERS)
- $(CC) $(CFLAGS) $(CXFLAGS) $*.c
-
-xmenu.o : xmenu.c $(HEADERS) param.h
- $(CC) $(CFLAGS) $(CXFLAGS) $*.c
-
-xdisplay.o : xdisplay.c $(HEADERS) param.h
- $(CC) $(CFLAGS) $(CXFLAGS) $*.c
-
-xutils.o : xutils.c $(HEADERS) param.h
- $(CC) $(CFLAGS) $(CXFLAGS) $*.c
-
-xcolor.o : xcolor.c $(HEADERS) param.h
- $(CC) $(CFLAGS) $(CXFLAGS) $*.c
-
-xscore.o : xscore.c $(HEADERS) param.h
- $(CC) $(CFLAGS) $(CXFLAGS) $*.c
-
-readmat.o : readmat.c $(HEADERS) matrices.h
- $(CC) $(CFLAGS) $*.c
-
-trees.o : trees.c $(HEADERS) dayhoff.h
- $(CC) $(CFLAGS) $*.c
-
-.c.o :
- $(CC) $(CFLAGS) $?
-
Deleted: trunk/packages/clustalw/trunk/makefile.linux
===================================================================
--- trunk/packages/clustalw/trunk/makefile.linux 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/makefile.linux 2007-08-12 15:08:40 UTC (rev 399)
@@ -1,58 +0,0 @@
-install: clustalx clustalw
-
-clean:
- rm *.o
-
-OBJECTS = interface.o sequence.o showpair.o malign.o util.o trees.o gcgcheck.o prfalign.o pairalign.o calcgapcoeff.o calcprf1.o calcprf2.o calctree.o readmat.o alnscore.o random.o
-
-XOBJECTS = xutils.o xmenu.o xcolor.o xdisplay.o xscore.o
-
-HEADERS = general.h clustalw.h
-
-CC = cc
-CFLAGS = -c -O
-LFLAGS = -O -lm
-CXFLAGS = -DWIN_MOTIF -I/usr/bio/src/ncbi/include
-LXFLAGS = -L/usr/bio/src/ncbi/lib -L/usr/ccs/lib -L/usr/X11R6/lib -lvibrant -lncbi -lXm -lXmu -lXpm -lXt -lX11 -lm
-
-
-static: $(OBJECTS) amenu.o clustalw.o $(XOBJECTS) clustalx.o
- $(CC) -o clustalx.static $(OBJECTS) $(XOBJECTS) clustalx.o $(LFLAGS) $(LXFLAGS) -lXext -lX11 -lSM -static /usr/X11R6/lib/libICE.a
- $(CC) -o clustalw $(OBJECTS) amenu.o clustalw.o $(LFLAGS)
-
-clustalw : $(OBJECTS) amenu.o clustalw.o
- $(CC) -o $@ $(OBJECTS) amenu.o clustalw.o $(LFLAGS)
-
-amenu.o : amenu.c $(HEADERS) param.h
- $(CC) $(CFLAGS) $*.c
-
-clustalx : $(OBJECTS) $(XOBJECTS) clustalx.o
- $(CC) -o $@ $(OBJECTS) $(XOBJECTS) clustalx.o $(LFLAGS) $(LXFLAGS)
-
-clustalx.o : clustalx.c $(HEADERS)
- $(CC) $(CFLAGS) $(CXFLAGS) $*.c
-
-xmenu.o : xmenu.c $(HEADERS) param.h
- $(CC) $(CFLAGS) $(CXFLAGS) $*.c
-
-xdisplay.o : xdisplay.c $(HEADERS) param.h
- $(CC) $(CFLAGS) $(CXFLAGS) $*.c
-
-xutils.o : xutils.c $(HEADERS) param.h
- $(CC) $(CFLAGS) $(CXFLAGS) $*.c
-
-xcolor.o : xcolor.c $(HEADERS) param.h
- $(CC) $(CFLAGS) $(CXFLAGS) $*.c
-
-xscore.o : xscore.c $(HEADERS) param.h
- $(CC) $(CFLAGS) $(CXFLAGS) $*.c
-
-readmat.o : readmat.c $(HEADERS) matrices.h
- $(CC) $(CFLAGS) $*.c
-
-trees.o : trees.c $(HEADERS) dayhoff.h
- $(CC) $(CFLAGS) $*.c
-
-.c.o :
- $(CC) $(CFLAGS) $?
-
Deleted: trunk/packages/clustalw/trunk/makefile.sgi
===================================================================
--- trunk/packages/clustalw/trunk/makefile.sgi 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/makefile.sgi 2007-08-12 15:08:40 UTC (rev 399)
@@ -1,58 +0,0 @@
-install: clustalx clustalw
-
-clean:
- rm *.o
-
-OBJECTS = interface.o sequence.o showpair.o malign.o \
- util.o trees.o gcgcheck.o prfalign.o pairalign.o \
- calcgapcoeff.o calcprf1.o calcprf2.o calctree.o \
- readmat.o alnscore.o random.o
-
-XOBJECTS = xutils.o xmenu.o xcolor.o xdisplay.o xscore.o
-
-HEADERS = general.h clustalw.h
-
-CC = cc
-CFLAGS = -c -O
-LFLAGS = -O -lm
-NCBI_INC = /biolo/ncbi/include
-NCBI_LIB = /biolo/ncbi/lib
-CXFLAGS = -DWIN_MOTIF -I$(NCBI_INC)
-LXFLAGS = -L$(NCBI_LIB) -L/usr/ccs/lib/ -lvibrant -lncbi -lXm -lXt -lX11 -lXmu -lm
-
-clustalw : $(OBJECTS) amenu.o clustalw.o
- $(CC) -o $@ $(OBJECTS) amenu.o clustalw.o $(LFLAGS)
-
-amenu.o : amenu.c $(HEADERS) param.h
- $(CC) $(CFLAGS) $*.c
-
-clustalx : $(OBJECTS) $(XOBJECTS) clustalx.o
- $(CC) -o $@ $(OBJECTS) $(XOBJECTS) clustalx.o $(LFLAGS) $(LXFLAGS)
-
-clustalx.o : clustalx.c $(HEADERS)
- $(CC) $(CFLAGS) $(CXFLAGS) $*.c
-
-xmenu.o : xmenu.c $(HEADERS) param.h
- $(CC) $(CFLAGS) $(CXFLAGS) $*.c
-
-xdisplay.o : xdisplay.c $(HEADERS) param.h
- $(CC) $(CFLAGS) $(CXFLAGS) $*.c
-
-xutils.o : xutils.c $(HEADERS) param.h
- $(CC) $(CFLAGS) $(CXFLAGS) $*.c
-
-xcolor.o : xcolor.c $(HEADERS) param.h
- $(CC) $(CFLAGS) $(CXFLAGS) $*.c
-
-xscore.o : xscore.c $(HEADERS) param.h
- $(CC) $(CFLAGS) $(CXFLAGS) $*.c
-
-readmat.o : readmat.c $(HEADERS) matrices.h
- $(CC) $(CFLAGS) $*.c
-
-trees.o : trees.c $(HEADERS) dayhoff.h
- $(CC) $(CFLAGS) $*.c
-
-.c.o :
- $(CC) $(CFLAGS) $?
-
Deleted: trunk/packages/clustalw/trunk/makefile.sun
===================================================================
--- trunk/packages/clustalw/trunk/makefile.sun 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/makefile.sun 2007-08-12 15:08:40 UTC (rev 399)
@@ -1,61 +0,0 @@
-install: clustalx clustalw
-
-clean:
- rm *.o
-
-OBJECTS = interface.o sequence.o showpair.o malign.o \
- util.o trees.o gcgcheck.o prfalign.o pairalign.o \
- calcgapcoeff.o calcprf1.o calcprf2.o calctree.o \
- readmat.o alnscore.o random.o
-
-XOBJECTS = xutils.o xmenu.o xcolor.o xdisplay.o xscore.o
-
-HEADERS = general.h clustalw.h
-
-CC = cc
-CFLAGS = -c -O
-LFLAGS = -O -lm
-NCBI_INC = /workbench/include/ncbi
-NCBI_LIB = /workbench/lib/ncbi
-CXFLAGS = -DWIN_MOTIF -I$(NCBI_INC) -I/opt/SUNWmotif/include
-LXFLAGS = -L$(NCBI_LIB) -L/usr/ccs/lib/ -L/opt/SUNWmotif/lib -Bstatic -lvibrant -lncbi -Bdynamic -lXm -lXmu -Bdynamic -lXt -lX11 -lgen
-
-clustalw : $(OBJECTS) amenu.o clustalw.o
- $(CC) -o $@ $(OBJECTS) amenu.o clustalw.o $(LFLAGS)
-
-interface.o : interface.c $(HEADERS) param.h
- $(CC) $(CFLAGS) $*.c
-
-amenu.o : amenu.c $(HEADERS) param.h
- $(CC) $(CFLAGS) $*.c
-
-clustalx : $(OBJECTS) $(XOBJECTS) clustalx.o
- $(CC) -o $@ $(OBJECTS) $(XOBJECTS) clustalx.o $(LFLAGS) $(LXFLAGS)
-
-clustalx.o : clustalx.c $(HEADERS)
- $(CC) $(CFLAGS) $(CXFLAGS) $*.c
-
-xmenu.o : xmenu.c $(HEADERS) param.h
- $(CC) $(CFLAGS) $(CXFLAGS) $*.c
-
-xdisplay.o : xdisplay.c $(HEADERS) param.h
- $(CC) $(CFLAGS) $(CXFLAGS) $*.c
-
-xutils.o : xutils.c $(HEADERS) param.h
- $(CC) $(CFLAGS) $(CXFLAGS) $*.c
-
-xcolor.o : xcolor.c $(HEADERS) param.h
- $(CC) $(CFLAGS) $(CXFLAGS) $*.c
-
-xscore.o : xscore.c $(HEADERS) param.h
- $(CC) $(CFLAGS) $(CXFLAGS) $*.c
-
-readmat.o : readmat.c $(HEADERS) matrices.h
- $(CC) $(CFLAGS) $*.c
-
-trees.o : trees.c $(HEADERS) dayhoff.h
- $(CC) $(CFLAGS) $*.c
-
-.c.o :
- $(CC) $(CFLAGS) $?
-
Deleted: trunk/packages/clustalw/trunk/malign.c
===================================================================
--- trunk/packages/clustalw/trunk/malign.c 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/malign.c 2007-08-12 15:08:40 UTC (rev 399)
@@ -1,654 +0,0 @@
-#include <stdio.h>
-#include <string.h>
-#include <ctype.h>
-#include <stdlib.h>
-#include "clustalw.h"
-
-
-/*
- * Prototypes
- */
-
-/*
- * Global Variables
- */
-
-extern double **tmat;
-extern Boolean no_weights;
-extern sint debug;
-extern sint max_aa;
-extern sint nseqs;
-extern sint profile1_nseqs;
-extern sint nsets;
-extern sint **sets;
-extern sint divergence_cutoff;
-extern sint *seq_weight;
-extern sint output_order, *output_index;
-extern Boolean distance_tree;
-extern char seqname[];
-extern sint *seqlen_array;
-extern char **seq_array;
-
-sint malign(sint istart,char *phylip_name) /* full progressive alignment*/
-{
- static sint *aligned;
- static sint *group;
- static sint ix;
-
- sint *maxid, max, sum;
- sint *tree_weight;
- sint i,j,set,iseq=0;
- sint status,entries;
- lint score = 0;
-
-
- info("Start of Multiple Alignment");
-
-/* get the phylogenetic tree from *.ph */
-
- if (nseqs >= 2)
- {
- status = read_tree(phylip_name, (sint)0, nseqs);
- if (status == 0) return((sint)0);
- }
-
-/* calculate sequence weights according to branch lengths of the tree -
- weights in global variable seq_weight normalised to sum to 100 */
-
- calc_seq_weights((sint)0, nseqs, seq_weight);
-
-/* recalculate tmat matrix as percent similarity matrix */
-
- status = calc_similarities(nseqs);
- if (status == 0) return((sint)0);
-
-/* for each sequence, find the most closely related sequence */
-
- maxid = (sint *)ckalloc( (nseqs+1) * sizeof (sint));
- for (i=1;i<=nseqs;i++)
- {
- maxid[i] = -1;
- for (j=1;j<=nseqs;j++)
- if (j!=i && maxid[i] < tmat[i][j]) maxid[i] = tmat[i][j];
- }
-
-/* group the sequences according to their relative divergence */
-
- if (istart == 0)
- {
- sets = (sint **) ckalloc( (nseqs+1) * sizeof (sint *) );
- for(i=0;i<=nseqs;i++)
- sets[i] = (sint *)ckalloc( (nseqs+1) * sizeof (sint) );
-
- create_sets((sint)0,nseqs);
- info("There are %d groups",(pint)nsets);
-
-/* clear the memory used for the phylogenetic tree */
-
- if (nseqs >= 2)
- clear_tree(NULL);
-
-/* start the multiple alignments......... */
-
- info("Aligning...");
-
-/* first pass, align closely related sequences first.... */
-
- ix = 0;
- aligned = (sint *)ckalloc( (nseqs+1) * sizeof (sint) );
- for (i=0;i<=nseqs;i++) aligned[i] = 0;
-
- for(set=1;set<=nsets;++set)
- {
- entries=0;
- for (i=1;i<=nseqs;i++)
- {
- if ((sets[set][i] != 0) && (maxid[i] > divergence_cutoff))
- {
- entries++;
- if (aligned[i] == 0)
- {
- if (output_order==INPUT)
- {
- ++ix;
- output_index[i] = i;
- }
- else output_index[++ix] = i;
- aligned[i] = 1;
- }
- }
- }
-
- if(entries > 0) score = prfalign(sets[set], aligned);
- else score=0.0;
-
-
-/* negative score means fatal error... exit now! */
-
- if (score < 0)
- {
- return(-1);
- }
- if ((entries > 0) && (score > 0))
- info("Group %d: Sequences:%4d Score:%d",
- (pint)set,(pint)entries,(pint)score);
- else
- info("Group %d: Delayed",
- (pint)set);
- }
-
- for (i=0;i<=nseqs;i++)
- sets[i]=ckfree((void *)sets[i]);
- sets=ckfree(sets);
- }
- else
- {
-/* clear the memory used for the phylogenetic tree */
-
- if (nseqs >= 2)
- clear_tree(NULL);
-
- aligned = (sint *)ckalloc( (nseqs+1) * sizeof (sint) );
- ix = 0;
- for (i=1;i<=istart+1;i++)
- {
- aligned[i] = 1;
- ++ix;
- output_index[i] = i;
- }
- for (i=istart+2;i<=nseqs;i++) aligned[i] = 0;
- }
-
-/* second pass - align remaining, more divergent sequences..... */
-
-/* if not all sequences were aligned, for each unaligned sequence,
- find it's closest pair amongst the aligned sequences. */
-
- group = (sint *)ckalloc( (nseqs+1) * sizeof (sint));
- tree_weight = (sint *) ckalloc( (nseqs) * sizeof(sint) );
- for (i=0;i<nseqs;i++)
- tree_weight[i] = seq_weight[i];
-
-/* if we haven't aligned any sequences, in the first pass - align the
-two most closely related sequences now */
- if(ix==0)
- {
- max = -1;
- iseq = 0;
- for (i=1;i<=nseqs;i++)
- {
- for (j=i+1;j<=nseqs;j++)
- {
- if (max < tmat[i][j])
- {
- max = tmat[i][j];
- iseq = i;
- }
- }
- }
- aligned[iseq]=1;
- if (output_order == INPUT)
- {
- ++ix;
- output_index[iseq] = iseq;
- }
- else
- output_index[++ix] = iseq;
- }
-
- while (ix < nseqs)
- {
- for (i=1;i<=nseqs;i++) {
- if (aligned[i] == 0)
- {
- maxid[i] = -1;
- for (j=1;j<=nseqs;j++)
- if ((maxid[i] < tmat[i][j]) && (aligned[j] != 0))
- maxid[i] = tmat[i][j];
- }
- }
-/* find the most closely related sequence to those already aligned */
-
- max = -1;
- iseq = 0;
- for (i=1;i<=nseqs;i++)
- {
- if ((aligned[i] == 0) && (maxid[i] > max))
- {
- max = maxid[i];
- iseq = i;
- }
- }
-
-
-/* align this sequence to the existing alignment */
-/* weight sequences with percent identity with profile*/
-/* OR...., multiply sequence weights from tree by percent identity with new sequence */
- if(no_weights==FALSE) {
- for (j=0;j<nseqs;j++)
- if (aligned[j+1] != 0)
- seq_weight[j] = tree_weight[j] * tmat[j+1][iseq];
-/*
- Normalise the weights, such that the sum of the weights = INT_SCALE_FACTOR
-*/
-
- sum = 0;
- for (j=0;j<nseqs;j++)
- if (aligned[j+1] != 0)
- sum += seq_weight[j];
- if (sum == 0)
- {
- for (j=0;j<nseqs;j++)
- seq_weight[j] = 1;
- sum = j;
- }
- for (j=0;j<nseqs;j++)
- if (aligned[j+1] != 0)
- {
- seq_weight[j] = (seq_weight[j] * INT_SCALE_FACTOR) / sum;
- if (seq_weight[j] < 1) seq_weight[j] = 1;
- }
- }
-
- entries = 0;
- for (j=1;j<=nseqs;j++)
- if (aligned[j] != 0)
- {
- group[j] = 1;
- entries++;
- }
- else if (iseq==j)
- {
- group[j] = 2;
- entries++;
- }
- aligned[iseq] = 1;
-
- score = prfalign(group, aligned);
- info("Sequence:%d Score:%d",(pint)iseq,(pint)score);
- if (output_order == INPUT)
- {
- ++ix;
- output_index[iseq] = iseq;
- }
- else
- output_index[++ix] = iseq;
- }
-
- group=ckfree((void *)group);
- aligned=ckfree((void *)aligned);
- maxid=ckfree((void *)maxid);
- tree_weight=ckfree((void *)tree_weight);
-
- aln_score();
-
-/* make the rest (output stuff) into routine clustal_out in file amenu.c */
-
- return(nseqs);
-
-}
-
-sint seqalign(sint istart,char *phylip_name) /* sequence alignment to existing profile */
-{
- static sint *aligned, *tree_weight;
- static sint *group;
- static sint ix;
-
- sint *maxid, max;
- sint i,j,status,iseq;
- sint sum,entries;
- lint score = 0;
-
-
- info("Start of Multiple Alignment");
-
-/* get the phylogenetic tree from *.ph */
-
- if (nseqs >= 2)
- {
- status = read_tree(phylip_name, (sint)0, nseqs);
- if (status == 0) return(0);
- }
-
-/* calculate sequence weights according to branch lengths of the tree -
- weights in global variable seq_weight normalised to sum to 100 */
-
- calc_seq_weights((sint)0, nseqs, seq_weight);
-
- tree_weight = (sint *) ckalloc( (nseqs) * sizeof(sint) );
- for (i=0;i<nseqs;i++)
- tree_weight[i] = seq_weight[i];
-
-/* recalculate tmat matrix as percent similarity matrix */
-
- status = calc_similarities(nseqs);
- if (status == 0) return((sint)0);
-
-/* for each sequence, find the most closely related sequence */
-
- maxid = (sint *)ckalloc( (nseqs+1) * sizeof (sint));
- for (i=1;i<=nseqs;i++)
- {
- maxid[i] = -1;
- for (j=1;j<=nseqs;j++)
- if (maxid[i] < tmat[i][j]) maxid[i] = tmat[i][j];
- }
-
-/* clear the memory used for the phylogenetic tree */
-
- if (nseqs >= 2)
- clear_tree(NULL);
-
- aligned = (sint *)ckalloc( (nseqs+1) * sizeof (sint) );
- ix = 0;
- for (i=1;i<=istart+1;i++)
- {
- aligned[i] = 1;
- ++ix;
- output_index[i] = i;
- }
- for (i=istart+2;i<=nseqs;i++) aligned[i] = 0;
-
-/* for each unaligned sequence, find it's closest pair amongst the
- aligned sequences. */
-
- group = (sint *)ckalloc( (nseqs+1) * sizeof (sint));
-
- while (ix < nseqs)
- {
- if (ix > 0)
- {
- for (i=1;i<=nseqs;i++) {
- if (aligned[i] == 0)
- {
- maxid[i] = -1;
- for (j=1;j<=nseqs;j++)
- if ((maxid[i] < tmat[i][j]) && (aligned[j] != 0))
- maxid[i] = tmat[i][j];
- }
- }
- }
-
-/* find the most closely related sequence to those already aligned */
-
- max = -1;
- for (i=1;i<=nseqs;i++)
- {
- if ((aligned[i] == 0) && (maxid[i] > max))
- {
- max = maxid[i];
- iseq = i;
- }
- }
-
-/* align this sequence to the existing alignment */
-
- entries = 0;
- for (j=1;j<=nseqs;j++)
- if (aligned[j] != 0)
- {
- group[j] = 1;
- entries++;
- }
- else if (iseq==j)
- {
- group[j] = 2;
- entries++;
- }
- aligned[iseq] = 1;
-
-
-/* EITHER....., set sequence weights equal to percent identity with new sequence */
-/*
- for (j=0;j<nseqs;j++)
- seq_weight[j] = tmat[j+1][iseq];
-*/
-/* OR...., multiply sequence weights from tree by percent identity with new sequence */
- for (j=0;j<nseqs;j++)
- seq_weight[j] = tree_weight[j] * tmat[j+1][iseq];
-if (debug>1)
- for (j=0;j<nseqs;j++) if (group[j+1] == 1)fprintf (stdout,"sequence %d: %d\n", j+1,tree_weight[j]);
-/*
- Normalise the weights, such that the sum of the weights = INT_SCALE_FACTOR
-*/
-
- sum = 0;
- for (j=0;j<nseqs;j++)
- if (group[j+1] == 1) sum += seq_weight[j];
- if (sum == 0)
- {
- for (j=0;j<nseqs;j++)
- seq_weight[j] = 1;
- sum = j;
- }
- for (j=0;j<nseqs;j++)
- {
- seq_weight[j] = (seq_weight[j] * INT_SCALE_FACTOR) / sum;
- if (seq_weight[j] < 1) seq_weight[j] = 1;
- }
-
-if (debug > 1) {
- fprintf(stdout,"new weights\n");
- for (j=0;j<nseqs;j++) if (group[j+1] == 1)fprintf( stdout,"sequence %d: %d\n", j+1,seq_weight[j]);
-}
-
- score = prfalign(group, aligned);
- info("Sequence:%d Score:%d",(pint)iseq,(pint)score);
- if (output_order == INPUT)
- {
- ++ix;
- output_index[iseq] = iseq;
- }
- else
- output_index[++ix] = iseq;
- }
-
- group=ckfree((void *)group);
- aligned=ckfree((void *)aligned);
- maxid=ckfree((void *)maxid);
-
- aln_score();
-
-/* make the rest (output stuff) into routine clustal_out in file amenu.c */
-
- return(nseqs);
-
-}
-
-
-sint palign1(void) /* a profile alignment */
-{
- sint i,j,temp;
- sint entries;
- sint *aligned, *group;
- float dscore;
- lint score;
-
- info("Start of Initial Alignment");
-
-/* calculate sequence weights according to branch lengths of the tree -
- weights in global variable seq_weight normalised to sum to INT_SCALE_FACTOR */
-
- temp = INT_SCALE_FACTOR/nseqs;
- for (i=0;i<nseqs;i++) seq_weight[i] = temp;
-
- distance_tree = FALSE;
-
-/* do the initial alignment......... */
-
- group = (sint *)ckalloc( (nseqs+1) * sizeof (sint));
-
- for(i=1; i<=profile1_nseqs; ++i)
- group[i] = 1;
- for(i=profile1_nseqs+1; i<=nseqs; ++i)
- group[i] = 2;
- entries = nseqs;
-
- aligned = (sint *)ckalloc( (nseqs+1) * sizeof (sint) );
- for (i=1;i<=nseqs;i++) aligned[i] = 1;
-
- score = prfalign(group, aligned);
- info("Sequences:%d Score:%d",(pint)entries,(pint)score);
- group=ckfree((void *)group);
- aligned=ckfree((void *)aligned);
-
- for (i=1;i<=nseqs;i++) {
- for (j=i+1;j<=nseqs;j++) {
- dscore = countid(i,j);
- tmat[i][j] = ((double)100.0 - (double)dscore)/(double)100.0;
- tmat[j][i] = tmat[i][j];
- }
- }
-
- return(nseqs);
-}
-
-float countid(sint s1, sint s2)
-{
- char c1,c2;
- sint i;
- sint count,total;
- float score;
-
- count = total = 0;
- for (i=1;i<=seqlen_array[s1] && i<=seqlen_array[s2];i++) {
- c1 = seq_array[s1][i];
- c2 = seq_array[s2][i];
- if ((c1>=0) && (c1<max_aa)) {
- total++;
- if (c1 == c2) count++;
- }
-
- }
-
- if(total==0) score=0;
- else
- score = 100.0 * (float)count / (float)total;
- return(score);
-
-}
-
-sint palign2(char *p1_tree_name,char *p2_tree_name) /* a profile alignment */
-{
- sint i,j,sum,entries,status;
- lint score;
- sint *aligned, *group;
- sint *maxid,*p1_weight,*p2_weight;
- sint dscore;
-
- info("Start of Multiple Alignment");
-
-/* get the phylogenetic trees from *.ph */
-
- if (profile1_nseqs >= 2)
- {
- status = read_tree(p1_tree_name, (sint)0, profile1_nseqs);
- if (status == 0) return(0);
- }
-
-/* calculate sequence weights according to branch lengths of the tree -
- weights in global variable seq_weight normalised to sum to 100 */
-
- p1_weight = (sint *) ckalloc( (profile1_nseqs) * sizeof(sint) );
-
- calc_seq_weights((sint)0, profile1_nseqs, p1_weight);
-
-/* clear the memory for the phylogenetic tree */
-
- if (profile1_nseqs >= 2)
- clear_tree(NULL);
-
- if (nseqs-profile1_nseqs >= 2)
- {
- status = read_tree(p2_tree_name, profile1_nseqs, nseqs);
- if (status == 0) return(0);
- }
-
- p2_weight = (sint *) ckalloc( (nseqs) * sizeof(sint) );
-
- calc_seq_weights(profile1_nseqs,nseqs, p2_weight);
-
-
-/* clear the memory for the phylogenetic tree */
-
- if (nseqs-profile1_nseqs >= 2)
- clear_tree(NULL);
-
-/* convert tmat distances to similarities */
-
- for (i=1;i<nseqs;i++)
- for (j=i+1;j<=nseqs;j++) {
- tmat[i][j]=100.0-tmat[i][j]*100.0;
- tmat[j][i]=tmat[i][j];
- }
-
-
-/* weight sequences with max percent identity with other profile*/
-
- maxid = (sint *)ckalloc( (nseqs+1) * sizeof (sint));
- for (i=0;i<profile1_nseqs;i++) {
- maxid[i] = 0;
- for (j=profile1_nseqs+1;j<=nseqs;j++)
- if(maxid[i]<tmat[i+1][j]) maxid[i] = tmat[i+1][j];
- seq_weight[i] = maxid[i]*p1_weight[i];
- }
-
- for (i=profile1_nseqs;i<nseqs;i++) {
- maxid[i] = -1;
- for (j=1;j<=profile1_nseqs;j++)
- if(maxid[i]<tmat[i+1][j]) maxid[i] = tmat[i+1][j];
- seq_weight[i] = maxid[i]*p2_weight[i];
- }
-/*
- Normalise the weights, such that the sum of the weights = INT_SCALE_FACTOR
-*/
-
- sum = 0;
- for (j=0;j<nseqs;j++)
- sum += seq_weight[j];
- if (sum == 0)
- {
- for (j=0;j<nseqs;j++)
- seq_weight[j] = 1;
- sum = j;
- }
- for (j=0;j<nseqs;j++)
- {
- seq_weight[j] = (seq_weight[j] * INT_SCALE_FACTOR) / sum;
- if (seq_weight[j] < 1) seq_weight[j] = 1;
- }
-if (debug > 1) {
- fprintf(stdout,"new weights\n");
- for (j=0;j<nseqs;j++) fprintf( stdout,"sequence %d: %d\n", j+1,seq_weight[j]);
-}
-
-
-/* do the alignment......... */
-
- info("Aligning...");
-
- group = (sint *)ckalloc( (nseqs+1) * sizeof (sint));
-
- for(i=1; i<=profile1_nseqs; ++i)
- group[i] = 1;
- for(i=profile1_nseqs+1; i<=nseqs; ++i)
- group[i] = 2;
- entries = nseqs;
-
- aligned = (sint *)ckalloc( (nseqs+1) * sizeof (sint) );
- for (i=1;i<=nseqs;i++) aligned[i] = 1;
-
- score = prfalign(group, aligned);
- info("Sequences:%d Score:%d",(pint)entries,(pint)score);
- group=ckfree((void *)group);
- p1_weight=ckfree((void *)p1_weight);
- p2_weight=ckfree((void *)p2_weight);
- aligned=ckfree((void *)aligned);
- maxid=ckfree((void *)maxid);
-
-/* DES output_index = (int *)ckalloc( (nseqs+1) * sizeof (int)); */
- for (i=1;i<=nseqs;i++) output_index[i] = i;
-
- return(nseqs);
-}
-
Deleted: trunk/packages/clustalw/trunk/matrices.h
===================================================================
--- trunk/packages/clustalw/trunk/matrices.h 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/matrices.h 2007-08-12 15:08:40 UTC (rev 399)
@@ -1,854 +0,0 @@
-char *amino_acid_order = "ABCDEFGHIKLMNPQRSTVWXYZ";
-
-short blosum30mt[]={
- 4,
- 0, 5,
- -3, -2, 17,
- 0, 5, -3, 9,
- 0, 0, 1, 1, 6,
- -2, -3, -3, -5, -4, 10,
- 0, 0, -4, -1, -2, -3, 8,
- -2, -2, -5, -2, 0, -3, -3, 14,
- 0, -2, -2, -4, -3, 0, -1, -2, 6,
- 0, 0, -3, 0, 2, -1, -1, -2, -2, 4,
- -1, -1, 0, -1, -1, 2, -2, -1, 2, -2, 4,
- 1, -2, -2, -3, -1, -2, -2, 2, 1, 2, 2, 6,
- 0, 4, -1, 1, -1, -1, 0, -1, 0, 0, -2, 0, 8,
- -1, -2, -3, -1, 1, -4, -1, 1, -3, 1, -3, -4, -3, 11,
- 1, -1, -2, -1, 2, -3, -2, 0, -2, 0, -2, -1, -1, 0, 8,
- -1, -2, -2, -1, -1, -1, -2, -1, -3, 1, -2, 0, -2, -1, 3, 8,
- 1, 0, -2, 0, 0, -1, 0, -1, -1, 0, -2, -2, 0, -1, -1, -1, 4,
- 1, 0, -2, -1, -2, -2, -2, -2, 0, -1, 0, 0, 1, 0, 0, -3, 2, 5,
- 1, -2, -2, -2, -3, 1, -3, -3, 4, -2, 1, 0, -2, -4, -3, -1, -1, 1, 5,
- -5, -5, -2, -4, -1, 1, 1, -5, -3, -2, -2, -3, -7, -3, -1, 0, -3, -5, -3, 20,
- 0, -1, -2, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, -1, 0, -1, 0, 0, 0, -2, -1,
- -4, -3, -6, -1, -2, 3, -3, 0, -1, -1, 3, -1, -4, -2, -1, 0, -2, -1, 1, 5, -1, 9,
- 0, 0, 0, 0, 5, -4, -2, 0, -3, 1, -1, -1, -1, 0, 4, 0, -1, -1, -3, -1, 0, -2, 4};
-
-/*
-short blosum35mt[]={
- 5,
- -1, 5,
- -2, -2, 15,
- -1, 5, -3, 8,
- -1, 0, -1, 2, 6,
- -2, -2, -4, -3, -3, 8,
- 0, 0, -3, -2, -2, -3, 7,
- -2, 0, -4, 0, -1, -3, -2, 12,
- -1, -2, -4, -3, -3, 1, -3, -3, 5,
- 0, 0, -2, -1, 1, -1, -1, -2, -2, 5,
- -2, -2, -2, -2, -1, 2, -3, -2, 2, -2, 5,
- 0, -2, -4, -3, -2, 0, -1, 1, 1, 0, 3, 6,
- -1, 4, -1, 1, -1, -1, 1, 1, -1, 0, -2, -1, 7,
- -2, -1, -4, -1, 0, -4, -2, -1, -1, 0, -3, -3, -2, 10,
- 0, 0, -3, -1, 2, -4, -2, -1, -2, 0, -2, -1, 1, 0, 7,
- -1, -1, -3, -1, -1, -1, -2, -1, -3, 2, -2, 0, -1, -2, 2, 8,
- 1, 0, -3, -1, 0, -1, 1, -1, -2, 0, -2, -1, 0, -2, 0, -1, 4,
- 0, -1, -1, -1, -1, -1, -2, -2, -1, 0, 0, 0, 0, 0, 0, -2, 2, 5,
- 0, -2, -2, -2, -2, 1, -3, -4, 4, -2, 2, 1, -2, -3, -3, -1, -1, 1, 5,
- -2, -3, -5, -3, -1, 1, -1, -4, -1, 0, 0, 1, -2, -4, -1, 0, -2, -2, -2, 16,
- 0, -1, -2, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, -1, -1, -1, 0, 0, 0, -1, -1,
- -1, -2, -5, -2, -1, 3, -2, 0, 0, -1, 0, 0, -2, -3, 0, 0, -1, -2, 0, 3, -1, 8,
- -1, 0, -2, 1, 5, -3, -2, -1, -3, 1, -2, -2, 0, 0, 4, 0, 0, -1, -2, -1, 0, -1, 4};
-*/
-short blosum40mt[]={
- 5,
- -1, 5,
- -2, -2, 16,
- -1, 6, -2, 9,
- -1, 1, -2, 2, 7,
- -3, -3, -2, -4, -3, 9,
- 1, -1, -3, -2, -3, -3, 8,
- -2, 0, -4, 0, 0, -2, -2, 13,
- -1, -3, -4, -4, -4, 1, -4, -3, 6,
- -1, 0, -3, 0, 1, -3, -2, -1, -3, 6,
- -2, -3, -2, -3, -2, 2, -4, -2, 2, -2, 6,
- -1, -3, -3, -3, -2, 0, -2, 1, 1, -1, 3, 7,
- -1, 4, -2, 2, -1, -3, 0, 1, -2, 0, -3, -2, 8,
- -2, -2, -5, -2, 0, -4, -1, -2, -2, -1, -4, -2, -2, 11,
- 0, 0, -4, -1, 2, -4, -2, 0, -3, 1, -2, -1, 1, -2, 8,
- -2, -1, -3, -1, -1, -2, -3, 0, -3, 3, -2, -1, 0, -3, 2, 9,
- 1, 0, -1, 0, 0, -2, 0, -1, -2, 0, -3, -2, 1, -1, 1, -1, 5,
- 0, 0, -1, -1, -1, -1, -2, -2, -1, 0, -1, -1, 0, 0, -1, -2, 2, 6,
- 0, -3, -2, -3, -3, 0, -4, -4, 4, -2, 2, 1, -3, -3, -3, -2, -1, 1, 5,
- -3, -4, -6, -5, -2, 1, -2, -5, -3, -2, -1, -2, -4, -4, -1, -2, -5, -4, -3, 19,
- 0, -1, -2, -1, -1, -1, -1, -1, -1, -1, -1, 0, -1, -2, -1, -1, 0, 0, -1, -2, -1,
- -2, -3, -4, -3, -2, 4, -3, 2, 0, -1, 0, 1, -2, -3, -1, -1, -2, -1, -1, 3, -1, 9,
- -1, 2, -3, 1, 5, -4, -2, 0, -4, 1, -2, -2, 0, -1, 4, 0, 0, -1, -3, -2, -1, -2, 5};
-
-short blosum45mt[]={
- 5,
- -1, 4,
- -1, -2, 12,
- -2, 5, -3, 7,
- -1, 1, -3, 2, 6,
- -2, -3, -2, -4, -3, 8,
- 0, -1, -3, -1, -2, -3, 7,
- -2, 0, -3, 0, 0, -2, -2, 10,
- -1, -3, -3, -4, -3, 0, -4, -3, 5,
- -1, 0, -3, 0, 1, -3, -2, -1, -3, 5,
- -1, -3, -2, -3, -2, 1, -3, -2, 2, -3, 5,
- -1, -2, -2, -3, -2, 0, -2, 0, 2, -1, 2, 6,
- -1, 4, -2, 2, 0, -2, 0, 1, -2, 0, -3, -2, 6,
- -1, -2, -4, -1, 0, -3, -2, -2, -2, -1, -3, -2, -2, 9,
- -1, 0, -3, 0, 2, -4, -2, 1, -2, 1, -2, 0, 0, -1, 6,
- -2, -1, -3, -1, 0, -2, -2, 0, -3, 3, -2, -1, 0, -2, 1, 7,
- 1, 0, -1, 0, 0, -2, 0, -1, -2, -1, -3, -2, 1, -1, 0, -1, 4,
- 0, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, 0, -1, -1, -1, 2, 5,
- 0, -3, -1, -3, -3, 0, -3, -3, 3, -2, 1, 1, -3, -3, -3, -2, -1, 0, 5,
- -2, -4, -5, -4, -3, 1, -2, -3, -2, -2, -2, -2, -4, -3, -2, -2, -4, -3, -3, 15,
- 0, -1, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, -1, -2, -1,
- -2, -2, -3, -2, -2, 3, -3, 2, 0, -1, 0, 0, -2, -3, -1, -1, -2, -1, -1, 3, -1, 8,
- -1, 2, -3, 1, 4, -3, -2, 0, -3, 1, -2, -1, 0, -1, 4, 0, 0, -1, -3, -2, -1, -2, 4};
-
-/*
-short blosum50mt[]={
- 5,
- -2, 5,
- -1, -3, 13,
- -2, 5, -4, 8,
- -1, 1, -3, 2, 6,
- -3, -4, -2, -5, -3, 8,
- 0, -1, -3, -1, -3, -4, 8,
- -2, 0, -3, -1, 0, -1, -2, 10,
- -1, -4, -2, -4, -4, 0, -4, -4, 5,
- -1, 0, -3, -1, 1, -4, -2, 0, -3, 6,
- -2, -4, -2, -4, -3, 1, -4, -3, 2, -3, 5,
- -1, -3, -2, -4, -2, 0, -3, -1, 2, -2, 3, 7,
- -1, 4, -2, 2, 0, -4, 0, 1, -3, 0, -4, -2, 7,
- -1, -2, -4, -1, -1, -4, -2, -2, -3, -1, -4, -3, -2, 10,
- -1, 0, -3, 0, 2, -4, -2, 1, -3, 2, -2, 0, 0, -1, 7,
- -2, -1, -4, -2, 0, -3, -3, 0, -4, 3, -3, -2, -1, -3, 1, 7,
- 1, 0, -1, 0, -1, -3, 0, -1, -3, 0, -3, -2, 1, -1, 0, -1, 5,
- 0, 0, -1, -1, -1, -2, -2, -2, -1, -1, -1, -1, 0, -1, -1, -1, 2, 5,
- 0, -4, -1, -4, -3, -1, -4, -4, 4, -3, 1, 1, -3, -3, -3, -3, -2, 0, 5,
- -3, -5, -5, -5, -3, 1, -3, -3, -3, -3, -2, -1, -4, -4, -1, -3, -4, -3, -3, 15,
- -1, -1, -2, -1, -1, -2, -2, -1, -1, -1, -1, -1, -1, -2, -1, -1, -1, 0, -1, -3, -1,
- -2, -3, -3, -3, -2, 4, -3, 2, -1, -2, -1, 0, -2, -3, -1, -1, -2, -2, -1, 2, -1, 8,
- -1, 2, -3, 1, 5, -4, -2, 0, -3, 1, -3, -1, 0, -1, 4, 0, 0, -1, -3, -2, -1, -2, 5};
-
-short blosum55mt[]={
- 5,
- -2, 5,
- 0, -4, 13,
- -2, 5, -4, 8,
- -1, 1, -4, 2, 7,
- -3, -5, -3, -5, -4, 9,
- 0, -1, -3, -2, -3, -4, 8,
- -2, 0, -4, -1, -1, -1, -2, 11,
- -2, -4, -2, -4, -4, 0, -5, -4, 6,
- -1, 0, -4, -1, 1, -4, -2, 0, -4, 6,
- -2, -4, -2, -5, -4, 1, -5, -3, 2, -3, 6,
- -1, -3, -2, -4, -3, 0, -3, -2, 2, -2, 3, 8,
- -2, 4, -3, 2, 0, -4, 0, 1, -4, 0, -4, -3, 8,
- -1, -2, -3, -2, -1, -5, -3, -3, -3, -1, -4, -3, -2, 10,
- -1, 0, -4, 0, 2, -4, -2, 1, -4, 2, -3, 0, 0, -1, 7,
- -2, -1, -4, -2, 0, -3, -3, 0, -4, 3, -3, -2, -1, -3, 1, 8,
- 2, 0, -1, 0, 0, -3, 0, -1, -3, 0, -3, -2, 1, -1, 0, -1, 5,
- 0, -1, -1, -1, -1, -3, -2, -2, -1, -1, -2, -1, 0, -1, -1, -1, 2, 6,
- 0, -4, -1, -4, -3, -1, -4, -4, 4, -3, 1, 1, -4, -3, -3, -3, -2, 0, 5,
- -4, -5, -4, -5, -3, 2, -3, -3, -3, -4, -3, -2, -5, -5, -2, -3, -4, -3, -4, 15,
- -1, -1, -2, -2, -1, -2, -2, -1, -1, -1, -1, -1, -1, -2, -1, -1, -1, -1, -1, -3, -1,
- -2, -3, -3, -3, -2, 4, -4, 2, -1, -2, -1, -1, -2, -4, -1, -2, -2, -2, -2, 3, -1, 9,
- -1, 2, -4, 1, 5, -4, -3, 0, -4, 1, -3, -2, 0, -1, 4, 0, 0, -1, -3, -3, -1, -2, 5};
-
-
-short blosum62mt[]={
- 4,
- -2, 4,
- 0, -3, 9,
- -2, 4, -3, 6,
- -1, 1, -4, 2, 5,
- -2, -3, -2, -3, -3, 6,
- 0, -1, -3, -1, -2, -3, 6,
- -2, 0, -3, -1, 0, -1, -2, 8,
- -1, -3, -1, -3, -3, 0, -4, -3, 4,
- -1, 0, -3, -1, 1, -3, -2, -1, -3, 5,
- -1, -4, -1, -4, -3, 0, -4, -3, 2, -2, 4,
- -1, -3, -1, -3, -2, 0, -3, -2, 1, -1, 2, 5,
- -2, 3, -3, 1, 0, -3, 0, 1, -3, 0, -3, -2, 6,
- -1, -2, -3, -1, -1, -4, -2, -2, -3, -1, -3, -2, -2, 7,
- -1, 0, -3, 0, 2, -3, -2, 0, -3, 1, -2, 0, 0, -1, 5,
- -1, -1, -3, -2, 0, -3, -2, 0, -3, 2, -2, -1, 0, -2, 1, 5,
- 1, 0, -1, 0, 0, -2, 0, -1, -2, 0, -2, -1, 1, -1, 0, -1, 4,
- 0, -1, -1, -1, -1, -2, -2, -2, -1, -1, -1, -1, 0, -1, -1, -1, 1, 5,
- 0, -3, -1, -3, -2, -1, -3, -3, 3, -2, 1, 1, -3, -2, -2, -3, -2, 0, 4,
- -3, -4, -2, -4, -3, 1, -2, -2, -3, -3, -2, -1, -4, -4, -2, -3, -3, -2, -3, 11,
- 0, -1, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -1, -1, 0, 0, -1, -2, -1,
- -2, -3, -2, -3, -2, 3, -3, 2, -1, -2, -1, -1, -2, -3, -1, -2, -2, -2, -1, 2, -1, 7,
- -1, 1, -3, 1, 4, -3, -2, 0, -3, 1, -3, -1, 0, -1, 3, 0, 0, -1, -2, -3, -1, -2, 4};
-*/
-
-short blosum62mt2[]={
- 8,
- -4, 8,
- 0, -6, 18,
- -4, 8, -6, 12,
- -2, 2, -8, 4, 10,
- -4, -6, -4, -6, -6, 12,
- 0, -2, -6, -2, -4, -6, 12,
- -4, 0, -6, -2, 0, -2, -4, 16,
- -2, -6, -2, -6, -6, 0, -8, -6, 8,
- -2, 0, -6, -2, 2, -6, -4, -2, -6, 10,
- -2, -8, -2, -8, -6, 0, -8, -6, 4, -4, 8,
- -2, -6, -2, -6, -4, 0, -6, -4, 2, -2, 4, 10,
- -4, 6, -6, 2, 0, -6, 0, 2, -6, 0, -6, -4, 12,
- -2, -4, -6, -2, -2, -8, -4, -4, -6, -2, -6, -4, -4, 14,
- -2, 0, -6, 0, 4, -6, -4, 0, -6, 2, -4, 0, 0, -2, 10,
- -2, -2, -6, -4, 0, -6, -4, 0, -6, 4, -4, -2, 0, -4, 2, 10,
- 2, 0, -2, 0, 0, -4, 0, -2, -4, 0, -4, -2, 2, -2, 0, -2, 8,
- 0, -2, -2, -2, -2, -4, -4, -4, -2, -2, -2, -2, 0, -2, -2, -2, 2, 10,
- 0, -6, -2, -6, -4, -2, -6, -6, 6, -4, 2, 2, -6, -4, -4, -6, -4, 0, 8,
- -6, -8, -4, -8, -6, 2, -4, -4, -6, -6, -4, -2, -8, -8, -4, -6, -6, -4, -6, 22,
- 0, -2, -4, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -4, -2, -2, 0, 0, -2, -4, -2,
- -4, -6, -4, -6, -4, 6, -6, 4, -2, -4, -2, -2, -4, -6, -2, -4, -4, -4, -2, 4, -2, 14,
- -2, 2, -6, 2, 8, -6, -4, 0, -6, 2, -6, -2, 0, -2, 6, 0, 0, -2, -4, -6, -2, -4, 8};
-
-/*
-short blosum65mt[]={
- 4,
- -2, 4,
- 0, -3, 9,
- -2, 4, -4, 6,
- -1, 1, -4, 2, 5,
- -2, -3, -2, -4, -3, 6,
- 0, -1, -3, -1, -2, -3, 6,
- -2, 0, -3, -1, 0, -1, -2, 8,
- -1, -3, -1, -3, -3, 0, -4, -3, 4,
- -1, 0, -3, -1, 1, -3, -2, -1, -3, 5,
- -2, -4, -1, -4, -3, 0, -4, -3, 2, -3, 4,
- -1, -3, -2, -3, -2, 0, -3, -2, 1, -2, 2, 6,
- -2, 3, -3, 1, 0, -3, -1, 1, -3, 0, -4, -2, 6,
- -1, -2, -3, -2, -1, -4, -2, -2, -3, -1, -3, -3, -2, 8,
- -1, 0, -3, 0, 2, -3, -2, 1, -3, 1, -2, 0, 0, -1, 6,
- -1, -1, -4, -2, 0, -3, -2, 0, -3, 2, -2, -2, 0, -2, 1, 6,
- 1, 0, -1, 0, 0, -2, 0, -1, -2, 0, -3, -2, 1, -1, 0, -1, 4,
- 0, -1, -1, -1, -1, -2, -2, -2, -1, -1, -1, -1, 0, -1, -1, -1, 1, 5,
- 0, -3, -1, -3, -3, -1, -3, -3, 3, -2, 1, 1, -3, -2, -2, -3, -2, 0, 4,
- -3, -4, -2, -5, -3, 1, -3, -2, -2, -3, -2, -2, -4, -4, -2, -3, -3, -3, -3, 10,
- -1, -1, -2, -1, -1, -2, -2, -1, -1, -1, -1, -1, -1, -2, -1, -1, -1, -1, -1, -2, -1,
- -2, -3, -2, -3, -2, 3, -3, 2, -1, -2, -1, -1, -2, -3, -2, -2, -2, -2, -1, 2, -1, 7,
- -1, 1, -4, 1, 4, -3, -2, 0, -3, 1, -3, -2, 0, -1, 3, 0, 0, -1, -2, -3, -1, -2, 4};
-
-short blosum70mt[]={
- 4,
- -2, 4,
- -1, -4, 9,
- -2, 4, -4, 6,
- -1, 1, -4, 1, 5,
- -2, -4, -2, -4, -4, 6,
- 0, -1, -3, -2, -2, -4, 6,
- -2, -1, -4, -1, 0, -1, -2, 8,
- -2, -4, -1, -4, -4, 0, -4, -4, 4,
- -1, -1, -4, -1, 1, -3, -2, -1, -3, 5,
- -2, -4, -2, -4, -3, 0, -4, -3, 2, -3, 4,
- -1, -3, -2, -3, -2, 0, -3, -2, 1, -2, 2, 6,
- -2, 3, -3, 1, 0, -3, -1, 0, -4, 0, -4, -2, 6,
- -1, -2, -3, -2, -1, -4, -3, -2, -3, -1, -3, -3, -2, 8,
- -1, 0, -3, -1, 2, -3, -2, 1, -3, 1, -2, 0, 0, -2, 6,
- -2, -1, -4, -2, 0, -3, -3, 0, -3, 2, -3, -2, -1, -2, 1, 6,
- 1, 0, -1, 0, 0, -3, -1, -1, -3, 0, -3, -2, 0, -1, 0, -1, 4,
- 0, -1, -1, -1, -1, -2, -2, -2, -1, -1, -2, -1, 0, -1, -1, -1, 1, 5,
- 0, -3, -1, -4, -3, -1, -4, -3, 3, -3, 1, 1, -3, -3, -2, -3, -2, 0, 4,
- -3, -4, -3, -5, -4, 1, -3, -2, -3, -3, -2, -2, -4, -4, -2, -3, -3, -3, -3, 11,
- -1, -1, -2, -2, -1, -2, -2, -1, -1, -1, -1, -1, -1, -2, -1, -1, -1, -1, -1, -3, -1,
- -2, -3, -3, -4, -3, 3, -4, 2, -1, -2, -1, -1, -2, -3, -2, -2, -2, -2, -2, 2, -2, 7,
- -1, 0, -4, 1, 4, -4, -2, 0, -3, 1, -3, -2, 0, -1, 3, 0, 0, -1, -3, -3, -1, -2, 4};
-
-short blosum75mt[]={
- 4,
- -2, 4,
- -1, -4, 9,
- -2, 4, -4, 6,
- -1, 1, -5, 1, 5,
- -3, -4, -2, -4, -4, 6,
- 0, -1, -3, -2, -3, -4, 6,
- -2, -1, -4, -1, 0, -2, -2, 8,
- -2, -4, -1, -4, -4, 0, -5, -4, 4,
- -1, -1, -4, -1, 1, -4, -2, -1, -3, 5,
- -2, -4, -2, -4, -4, 0, -4, -3, 1, -3, 4,
- -1, -3, -2, -4, -2, 0, -3, -2, 1, -2, 2, 6,
- -2, 3, -3, 1, -1, -4, -1, 0, -4, 0, -4, -3, 6,
- -1, -2, -4, -2, -1, -4, -3, -2, -3, -1, -3, -3, -3, 8,
- -1, 0, -3, -1, 2, -4, -2, 1, -3, 1, -3, 0, 0, -2, 6,
- -2, -1, -4, -2, 0, -3, -3, 0, -3, 2, -3, -2, -1, -2, 1, 6,
- 1, 0, -1, -1, 0, -3, -1, -1, -3, 0, -3, -2, 0, -1, 0, -1, 5,
- 0, -1, -1, -1, -1, -2, -2, -2, -1, -1, -2, -1, 0, -1, -1, -1, 1, 5,
- 0, -4, -1, -4, -3, -1, -4, -4, 3, -3, 1, 1, -3, -3, -2, -3, -2, 0, 4,
- -3, -5, -3, -5, -4, 1, -3, -2, -3, -4, -2, -2, -4, -5, -2, -3, -3, -3, -3, 11,
- -1, -2, -2, -2, -1, -2, -2, -1, -2, -1, -1, -1, -1, -2, -1, -1, -1, -1, -1, -3, -1,
- -2, -3, -3, -4, -3, 3, -4, 2, -2, -2, -1, -2, -3, -4, -2, -2, -2, -2, -2, 2, -2, 7,
- -1, 0, -4, 1, 4, -4, -2, 0, -4, 1, -3, -2, 0, -2, 3, 0, 0, -1, -3, -3, -1, -3, 4};
-*/
-
-short blosum80mt[]={
- 7,
- -3, 6,
- -1, -6, 13,
- -3, 6, -7, 10,
- -2, 1, -7, 2, 8,
- -4, -6, -4, -6, -6, 10,
- 0, -2, -6, -3, -4, -6, 9,
- -3, -1, -7, -2, 0, -2, -4, 12,
- -3, -6, -2, -7, -6, -1, -7, -6, 7,
- -1, -1, -6, -2, 1, -5, -3, -1, -5, 8,
- -3, -7, -3, -7, -6, 0, -7, -5, 2, -4, 6,
- -2, -5, -3, -6, -4, 0, -5, -4, 2, -3, 3, 9,
- -3, 5, -5, 2, -1, -6, -1, 1, -6, 0, -6, -4, 9,
- -1, -4, -6, -3, -2, -6, -5, -4, -5, -2, -5, -4, -4, 12,
- -2, -1, -5, -1, 3, -5, -4, 1, -5, 2, -4, -1, 0, -3, 9,
- -3, -2, -6, -3, -1, -5, -4, 0, -5, 3, -4, -3, -1, -3, 1, 9,
- 2, 0, -2, -1, -1, -4, -1, -2, -4, -1, -4, -3, 1, -2, -1, -2, 7,
- 0, -1, -2, -2, -2, -4, -3, -3, -2, -1, -3, -1, 0, -3, -1, -2, 2, 8,
- -1, -6, -2, -6, -4, -2, -6, -5, 4, -4, 1, 1, -5, -4, -4, -4, -3, 0, 7,
- -5, -8, -5, -8, -6, 0, -6, -4, -5, -6, -4, -3, -7, -7, -4, -5, -6, -5, -5, 16,
- -1, -3, -4, -3, -2, -3, -3, -2, -2, -2, -2, -2, -2, -3, -2, -2, -1, -1, -2, -5, -2,
- -4, -5, -5, -6, -5, 4, -6, 3, -3, -4, -2, -3, -4, -6, -3, -4, -3, -3, -3, 3, -3, 11,
- -2, 0, -7, 1, 6, -6, -4, 0, -6, 1, -5, -3, -1, -2, 5, 0, -1, -2, -4, -5, -1, -4, 6};
-
-/*
-short blosum85mt[]={
- 5,
- -2, 4,
- -1, -4, 9,
- -2, 4, -5, 7,
- -1, 0, -5, 1, 6,
- -3, -4, -3, -4, -4, 7,
- 0, -1, -4, -2, -3, -4, 6,
- -2, -1, -5, -2, -1, -2, -3, 8,
- -2, -5, -2, -5, -4, -1, -5, -4, 5,
- -1, -1, -4, -1, 0, -4, -2, -1, -3, 6,
- -2, -5, -2, -5, -4, 0, -5, -3, 1, -3, 4,
- -2, -4, -2, -4, -3, -1, -4, -3, 1, -2, 2, 7,
- -2, 4, -4, 1, -1, -4, -1, 0, -4, 0, -4, -3, 7,
- -1, -3, -4, -2, -2, -4, -3, -3, -4, -2, -4, -3, -3, 8,
- -1, -1, -4, -1, 2, -4, -3, 1, -4, 1, -3, 0, 0, -2, 6,
- -2, -2, -4, -2, -1, -4, -3, 0, -4, 2, -3, -2, -1, -2, 1, 6,
- 1, 0, -2, -1, -1, -3, -1, -1, -3, -1, -3, -2, 0, -1, -1, -1, 5,
- 0, -1, -2, -2, -1, -3, -2, -2, -1, -1, -2, -1, 0, -2, -1, -2, 1, 5,
- -1, -4, -1, -4, -3, -1, -4, -4, 3, -3, 0, 0, -4, -3, -3, -3, -2, 0, 5,
- -3, -5, -4, -6, -4, 0, -4, -3, -3, -5, -3, -2, -5, -5, -3, -4, -4, -4, -3, 11,
- -1, -2, -3, -2, -1, -2, -2, -2, -2, -1, -2, -1, -2, -2, -1, -2, -1, -1, -1, -3, -2,
- -3, -4, -3, -4, -4, 3, -5, 2, -2, -3, -2, -2, -3, -4, -2, -3, -2, -2, -2, 2, -2, 7,
- -1, 0, -5, 1, 4, -4, -3, 0, -4, 1, -4, -2, -1, -2, 4, 0, -1, -1, -3, -4, -1, -3, 4};
-
-short blosum90mt[]={
- 5,
- -2, 4,
- -1, -4, 9,
- -3, 4, -5, 7,
- -1, 0, -6, 1, 6,
- -3, -4, -3, -5, -5, 7,
- 0, -2, -4, -2, -3, -5, 6,
- -2, -1, -5, -2, -1, -2, -3, 8,
- -2, -5, -2, -5, -4, -1, -5, -4, 5,
- -1, -1, -4, -1, 0, -4, -2, -1, -4, 6,
- -2, -5, -2, -5, -4, 0, -5, -4, 1, -3, 5,
- -2, -4, -2, -4, -3, -1, -4, -3, 1, -2, 2, 7,
- -2, 4, -4, 1, -1, -4, -1, 0, -4, 0, -4, -3, 7,
- -1, -3, -4, -3, -2, -4, -3, -3, -4, -2, -4, -3, -3, 8,
- -1, -1, -4, -1, 2, -4, -3, 1, -4, 1, -3, 0, 0, -2, 7,
- -2, -2, -5, -3, -1, -4, -3, 0, -4, 2, -3, -2, -1, -3, 1, 6,
- 1, 0, -2, -1, -1, -3, -1, -2, -3, -1, -3, -2, 0, -2, -1, -1, 5,
- 0, -1, -2, -2, -1, -3, -3, -2, -1, -1, -2, -1, 0, -2, -1, -2, 1, 6,
- -1, -4, -2, -5, -3, -2, -5, -4, 3, -3, 0, 0, -4, -3, -3, -3, -2, -1, 5,
- -4, -6, -4, -6, -5, 0, -4, -3, -4, -5, -3, -2, -5, -5, -3, -4, -4, -4, -3, 11,
- -1, -2, -3, -2, -2, -2, -2, -2, -2, -1, -2, -1, -2, -2, -1, -2, -1, -1, -2, -3, -2,
- -3, -4, -4, -4, -4, 3, -5, 1, -2, -3, -2, -2, -3, -4, -3, -3, -3, -2, -3, 2, -2, 8,
- -1, 0, -5, 0, 4, -4, -3, 0, -4, 1, -4, -2, -1, -2, 4, 0, -1, -1, -3, -4, -1, -3, 4};
-*/
-
-short pam20mt[]={
- 6,
- -5, 6,
- -8,-14, 10,
- -4, 6,-16, 8,
- -3, 0,-16, 2, 8,
- -9,-12,-15,-17,-16, 9,
- -3, -4,-11, -4, -5,-10, 7,
- -8, -2, -8, -5, -6, -7,-10, 9,
- -6, -7, -7, -9, -6, -3,-13,-11, 9,
- -8, -3,-16, -6, -5,-16, -8, -8, -7, 7,
- -7,-10,-17,-15,-10, -4,-12, -7, -2, -9, 7,
- -6,-12,-16,-13, -8, -5,-10,-13, -2, -3, 0, 11,
- -5, 6,-13, 1, -3,-10, -4, -1, -6, -2, -8,-11, 8,
- -2, -8, -9, -9, -7,-11, -7, -5,-10, -8, -8, -9, -7, 8,
- -5, -4,-16, -4, 0,-15, -8, 0, -9, -4, -6, -5, -5, -4, 9,
- -8, -9, -9,-12,-11,-10,-11, -3, -6, -1,-10, -5, -7, -5, -2, 9,
- -1, -2, -4, -5, -5, -7, -3, -7, -8, -5, -9, -6, -1, -3, -6, -4, 7,
- -1, -4, -9, -6, -7,-10, -7, -8, -3, -4, -8, -5, -3, -5, -7, -8, 0, 7,
- -3, -9, -7, -9, -8, -9, -7, -7, 1,-10, -3, -2, -9, -7, -8, -9, -8, -4, 7,
--16,-11,-18,-17,-19, -6,-17, -8,-16,-14, -7,-15, -9,-16,-15, -3, -6,-15,-18, 13,
- -4, -6,-11, -7, -6, -9, -6, -6, -6, -6, -7, -6, -4, -6, -6, -7, -4, -5, -6,-13, -6,
- -9, -7, -5,-13, -9, 1,-16, -4, -7,-10, -8,-13, -5,-16,-14,-11, -8, -7, -8, -6, -9, 10,
- -4, -1,-16, 0, 6,-16, -6, -2, -7, -5, -8, -6, -4, -5, 7, -5, -6, -7, -8,-17, -6,-11, 6};
-
-short pam60mt[]={
- 5,
- -2, 5,
- -5, -9, 9,
- -2, 5,-10, 7,
- -1, 2,-10, 3, 7,
- -6, -8, -9,-11,-10, 8,
- 0, -2, -7, -2, -2, -7, 6,
- -5, 0, -6, -2, -3, -4, -6, 8,
- -3, -4, -4, -5, -4, -1, -7, -6, 7,
- -5, -1,-10, -2, -3,-10, -5, -4, -4, 6,
- -4, -7,-11, -9, -7, -1, -8, -4, 0, -6, 6,
- -3, -6,-10, -7, -5, -2, -6, -7, 1, 0, 2, 10,
- -2, 5, -7, 2, 0, -6, -1, 1, -4, 0, -5, -6, 6,
- 0, -4, -6, -5, -3, -7, -4, -2, -6, -4, -5, -6, -4, 7,
- -3, -1,-10, -1, 2, -9, -5, 2, -5, -1, -3, -2, -2, -1, 7,
- -5, -5, -6, -6, -6, -7, -7, 0, -4, 2, -6, -2, -3, -2, 0, 8,
- 1, 0, -1, -2, -2, -5, 0, -4, -4, -2, -6, -4, 1, 0, -3, -2, 5,
- 1, -2, -5, -3, -4, -6, -3, -5, -1, -2, -5, -2, -1, -2, -4, -4, 1, 6,
- -1, -5, -4, -6, -4, -5, -4, -5, 3, -6, -1, 0, -5, -4, -5, -5, -4, -1, 6,
--10, -8,-12,-11,-12, -3,-11, -5,-10, -8, -4, -9, -6,-10, -9, 0, -4, -9,-11, 13,
- -2, -3, -6, -3, -3, -5, -3, -3, -3, -3, -4, -3, -2, -3, -3, -4, -2, -2, -3, -8, -3,
- -6, -5, -2, -8, -7, 3,-10, -2, -4, -7, -5, -7, -3,-10, -8, -8, -5, -5, -5, -3, -5, 9,
- -2, 1,-10, 2, 5,-10, -3, 0, -4, -2, -5, -4, -1, -2, 6, -2, -3, -4, -5,-11, -3, -7, 5};
-
-short pam120mt[]={
- 3,
- 0, 4,
- -3, -6, 9,
- 0, 4, -7, 5,
- 0, 3, -7, 3, 5,
- -4, -5, -6, -7, -7, 8,
- 1, 0, -4, 0, -1, -5, 5,
- -3, 1, -4, 0, -1, -3, -4, 7,
- -1, -3, -3, -3, -3, 0, -4, -4, 6,
- -2, 0, -7, -1, -1, -7, -3, -2, -3, 5,
- -3, -4, -7, -5, -4, 0, -5, -3, 1, -4, 5,
- -2, -4, -6, -4, -3, -1, -4, -4, 1, 0, 3, 8,
- -1, 3, -5, 2, 1, -4, 0, 2, -2, 1, -4, -3, 4,
- 1, -2, -4, -3, -2, -5, -2, -1, -3, -2, -3, -3, -2, 6,
- -1, 0, -7, 1, 2, -6, -3, 3, -3, 0, -2, -1, 0, 0, 6,
- -3, -2, -4, -3, -3, -5, -4, 1, -2, 2, -4, -1, -1, -1, 1, 6,
- 1, 0, 0, 0, -1, -3, 1, -2, -2, -1, -4, -2, 1, 1, -2, -1, 3,
- 1, 0, -3, -1, -2, -4, -1, -3, 0, -1, -3, -1, 0, -1, -2, -2, 2, 4,
- 0, -3, -3, -3, -3, -3, -2, -3, 3, -4, 1, 1, -3, -2, -3, -3, -2, 0, 5,
- -7, -6, -8, -8, -8, -1, -8, -3, -6, -5, -3, -6, -4, -7, -6, 1, -2, -6, -8, 12,
- -1, -1, -4, -2, -1, -3, -2, -2, -1, -2, -2, -2, -1, -2, -1, -2, -1, -1, -1, -5, -2,
- -4, -3, -1, -5, -5, 4, -6, -1, -2, -5, -2, -4, -2, -6, -5, -5, -3, -3, -3, -2, -3, 8,
- -1, 2, -7, 3, 4, -6, -2, 1, -3, -1, -3, -2, 0, -1, 4, -1, -1, -2, -3, -7, -1, -5, 4};
-
-/*
-short pam160mt[]={
- 2,
- 0, 3,
- -2, -4, 9,
- 0, 3, -5, 4,
- 0, 2, -5, 3, 4,
- -3, -4, -5, -6, -5, 7,
- 1, 0, -3, 0, 0, -4, 4,
- -2, 1, -3, 0, 0, -2, -3, 6,
- -1, -2, -2, -3, -2, 0, -3, -3, 5,
- -2, 0, -5, 0, -1, -5, -2, -1, -2, 4,
- -2, -4, -6, -4, -3, 1, -4, -2, 2, -3, 5,
- -1, -3, -5, -3, -2, 0, -3, -3, 2, 0, 3, 7,
- 0, 2, -4, 2, 1, -3, 0, 2, -2, 1, -3, -2, 3,
- 1, -1, -3, -2, -1, -4, -1, -1, -2, -2, -3, -2, -1, 5,
- -1, 1, -5, 1, 2, -5, -2, 2, -2, 0, -2, -1, 0, 0, 5,
- -2, -1, -3, -2, -2, -4, -3, 1, -2, 3, -3, -1, -1, -1, 1, 6,
- 1, 0, 0, 0, 0, -3, 1, -1, -2, -1, -3, -2, 1, 1, -1, -1, 2,
- 1, 0, -2, -1, -1, -3, -1, -2, 0, 0, -2, -1, 0, 0, -1, -1, 1, 3,
- 0, -2, -2, -3, -2, -2, -2, -2, 3, -3, 1, 1, -2, -2, -2, -3, -1, 0, 4,
- -5, -5, -7, -6, -7, -1, -7, -3, -5, -4, -2, -4, -4, -5, -5, 1, -2, -5, -6, 12,
- 0, -1, -3, -1, -1, -3, -1, -1, -1, -1, -2, -1, 0, -1, -1, -1, 0, 0, -1, -4, -1,
- -3, -3, 0, -4, -4, 5, -5, 0, -2, -4, -2, -3, -2, -5, -4, -4, -3, -3, -3, -1, -3, 8,
- 0, 2, -5, 2, 3, -5, -1, 1, -2, 0, -3, -2, 1, -1, 3, 0, -1, -1, -2, -6, -1, -4, 3};
-
-short pam250mt[]={
- 2,
- 0, 3,
- -2, -4, 12,
- 0, 3, -5, 4,
- 0, 3, -5, 3, 4,
- -3, -4, -4, -6, -5, 9,
- 1, 0, -3, 1, 0, -5, 5,
- -1, 1, -3, 1, 1, -2, -2, 6,
- -1, -2, -2, -2, -2, 1, -3, -2, 5,
- -1, 1, -5, 0, 0, -5, -2, 0, -2, 5,
- -2, -3, -6, -4, -3, 2, -4, -2, 2, -3, 6,
- -1, -2, -5, -3, -2, 0, -3, -2, 2, 0, 4, 6,
- 0, 2, -4, 2, 1, -3, 0, 2, -2, 1, -3, -2, 2,
- 1, -1, -3, -1, -1, -5, 0, 0, -2, -1, -3, -2, 0, 6,
- 0, 1, -5, 2, 2, -5, -1, 3, -2, 1, -2, -1, 1, 0, 4,
- -2, -1, -4, -1, -1, -4, -3, 2, -2, 3, -3, 0, 0, 0, 1, 6,
- 1, 0, 0, 0, 0, -3, 1, -1, -1, 0, -3, -2, 1, 1, -1, 0, 2,
- 1, 0, -2, 0, 0, -3, 0, -1, 0, 0, -2, -1, 0, 0, -1, -1, 1, 3,
- 0, -2, -2, -2, -2, -1, -1, -2, 4, -2, 2, 2, -2, -1, -2, -2, -1, 0, 4,
- -6, -5, -8, -7, -7, 0, -7, -3, -5, -3, -2, -4, -4, -6, -5, 2, -2, -5, -6, 17,
- 0, -1, -3, -1, -1, -2, -1, -1, -1, -1, -1, -1, 0, -1, -1, -1, 0, 0, -1, -4, -1,
- -3, -3, 0, -4, -4, 7, -5, 0, -1, -4, -1, -2, -2, -5, -4, -4, -3, -3, -2, 0, -2, 10,
- 0, 2, -5, 3, 3, -5, 0, 2, -2, 0, -3, -2, 1, 0, 3, 0, 0, -1, -2, -6, -1, -4, 3};
-*/
-short pam350mt[]={
- 2,
- 1, 3,
- -2, -5, 18,
- 1, 3, -6, 4,
- 1, 3, -6, 4, 4,
- -4, -5, -5, -6, -6, 13,
- 2, 1, -4, 1, 1, -6, 5,
- -1, 1, -4, 1, 1, -2, -2, 7,
- 0, -2, -3, -2, -2, 2, -2, -2, 5,
- -1, 1, -6, 1, 0, -6, -1, 1, -2, 5,
- -2, -4, -7, -4, -4, 3, -4, -2, 4, -3, 8,
- -1, -2, -6, -3, -2, 1, -3, -2, 3, 0, 5, 6,
- 0, 2, -4, 2, 2, -4, 1, 2, -2, 1, -3, -2, 2,
- 1, 0, -3, 0, 0, -5, 0, 0, -2, -1, -3, -2, 0, 6,
- 0, 2, -6, 2, 3, -5, -1, 3, -2, 1, -2, -1, 1, 1, 4,
- -1, 0, -4, -1, 0, -5, -2, 2, -2, 4, -3, 0, 1, 0, 2, 7,
- 1, 1, 0, 1, 0, -4, 1, -1, -1, 0, -3, -2, 1, 1, 0, 0, 1,
- 1, 0, -2, 0, 0, -3, 1, -1, 0, 0, -2, -1, 1, 1, 0, -1, 1, 2,
- 0, -2, -2, -2, -2, -1, -1, -2, 4, -2, 3, 2, -2, -1, -2, -3, -1, 0, 5,
- -7, -6,-10, -8, -8, 1, -8, -3, -6, -4, -2, -5, -5, -7, -5, 4, -3, -6, -7, 27,
- 0, 0, -3, -1, 0, -2, -1, 0, 0, -1, -1, 0, 0, 0, 0, -1, 0, 0, 0, -5, -1,
- -4, -4, 1, -5, -5, 11, -6, 0, 0, -5, 0, -2, -3, -6, -5, -5, -3, -3, -2, 1, -2, 14,
- 0, 2, -6, 3, 3, -6, 0, 2, -2, 1, -3, -2, 2, 0, 3, 1, 0, 0, -2, -7, 0, -5, 3};
-
-/*
-short md_40mt[]={
- 9,
- 0, 0,
- -7, 0, 16,
- -6, 0,-13, 11,
- -5, 0,-15, 3, 11,
--11, 0, -5,-15,-16, 13,
- -3, 0, -7, -4, -4,-15, 10,
- -9, 0, -6, -4, -8, -7,-10, 14,
- -6, 0,-11,-12,-12, -5,-13,-11, 11,
- -8, 0,-12, -8, -3,-16, -9, -6,-11, 11,
- -9, 0,-10,-14,-13, -1,-14, -7, -1,-12, 9,
- -6, 0, -9,-12,-11, -7,-12, -9, 1, -7, 1, 14,
- -6, 0, -8, 1, -5,-12, -5, 0, -8, -1,-12, -9, 12,
- -2, 0,-11,-11,-11,-11, -9, -4,-11,-10, -5,-10, -9, 12,
- -7, 0,-12, -6, 0,-14, -9, 2,-12, -1, -6, -8, -5, -3, 12,
- -7, 0, -5,-10, -8,-15, -4, 0,-10, 3, -9, -8, -6, -6, 0, 11,
- 0, 0, -2, -6, -8, -6, -2, -6, -8, -7, -7, -8, 1, -1, -7, -5, 9,
- 1, 0, -7, -8, -8,-11, -7, -7, -2, -5, -9, -2, -2, -4, -7, -6, 1, 10,
- -1, 0, -7, -9, -8, -6, -8,-12, 4,-12, -2, 0,-10, -9,-11,-11, -7, -4, 10,
--14, 0, -4,-15,-15, -7, -7,-13,-13,-13, -8,-11,-14,-14,-11, -4, -9,-12,-10, 18,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
--13, 0, -2, -8,-14, 2,-13, 2, -9,-13, -9,-11, -6,-13, -9,-10, -7,-10,-11, -6, 0, 14,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
-
-short md_120mt[]={
- 6,
- 0, 0,
- -3, 0, 14,
- -2, 0, -7, 8,
- -2, 0, -8, 5, 8,
- -6, 0, -2, -9,-10, 11,
- 0, 0, -3, 0, -1, -9, 8,
- -4, 0, -2, -1, -3, -2, -4, 11,
- -1, 0, -5, -7, -7, -1, -6, -6, 7,
- -4, 0, -6, -2, 0, -9, -4, -1, -6, 8,
- -4, 0, -5, -8, -8, 2, -8, -4, 2, -6, 7,
- -2, 0, -5, -7, -6, -2, -6, -5, 3, -4, 3, 10,
- -1, 0, -3, 3, -1, -6, -1, 2, -4, 1, -6, -5, 8,
- 0, 0, -5, -5, -5, -5, -4, -1, -5, -4, -2, -5, -3, 9,
- -3, 0, -6, -1, 2, -7, -4, 4, -6, 2, -3, -4, -1, 0, 9,
- -3, 0, -2, -4, -3, -8, -1, 2, -6, 4, -5, -4, -2, -2, 2, 8,
- 2, 0, 0, -2, -3, -3, 0, -2, -3, -3, -3, -3, 2, 1, -3, -2, 5,
- 2, 0, -3, -3, -4, -6, -2, -3, 0, -2, -4, 0, 1, 0, -3, -3, 2, 6,
- 1, 0, -3, -5, -5, -2, -4, -6, 5, -6, 1, 2, -5, -4, -6, -6, -3, 0, 7,
- -8, 0, 0, -9, -9, -3, -3, -6, -7, -6, -4, -6, -8, -8, -6, -1, -5, -7, -6, 17,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- -7, 0, 2, -4, -7, 5, -8, 4, -5, -7, -4, -6, -2, -7, -4, -5, -3, -6, -6, -2, 0, 12,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
-
-short md_250mt[]={
- 2,
- 0, 0,
- -1, 0, 11,
- -1, 0, -3, 5,
- -1, 0, -4, 4, 5,
- -3, 0, 0, -5, -5, 8,
- 1, 0, -1, 1, 1, -5, 5,
- -2, 0, 0, 0, 0, 0, -2, 6,
- 0, 0, -2, -3, -3, 0, -3, -3, 4,
- -1, 0, -3, 0, 1, -5, -1, 1, -3, 5,
- -1, 0, -2, -4, -4, 2, -4, -2, 2, -3, 5,
- 0, 0, -2, -3, -3, 0, -3, -2, 3, -2, 3, 6,
- 0, 0, -1, 2, 1, -3, 0, 1, -2, 1, -3, -2, 3,
- 1, 0, -2, -2, -2, -2, -1, 0, -2, -1, 0, -2, -1, 6,
- -1, 0, -3, 0, 2, -4, -1, 3, -3, 2, -2, -2, 0, 0, 5,
- -1, 0, -1, -1, 0, -4, 0, 2, -3, 4, -3, -2, 0, -1, 2, 5,
- 1, 0, 1, 0, -1, -2, 1, -1, -1, -1, -2, -1, 1, 1, -1, -1, 2,
- 2, 0, -1, -1, -1, -2, 0, -1, 1, -1, -1, 0, 1, 1, -1, -1, 1, 2,
- 1, 0, -2, -3, -2, 0, -2, -3, 4, -3, 2, 2, -2, -1, -3, -3, -1, 0, 4,
- -4, 0, 1, -5, -5, -1, -1, -3, -4, -3, -2, -3, -4, -4, -3, 0, -3, -4, -3, 15,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- -3, 0, 2, -2, -4, 5, -4, 4, -2, -3, -1, -3, -1, -3, -2, -2, -1, -3, -3, 0, 0, 9,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
-
-short md_350mt[]={
- 1,
- 0, 0,
- 0, 0, 9,
- 0, 0, -2, 3,
- 0, 0, -2, 3, 3,
- -2, 0, 1, -3, -4, 6,
- 1, 0, 0, 1, 1, -3, 4,
- -1, 0, 0, 0, 0, 0, -1, 3,
- 0, 0, -1, -2, -2, 1, -2, -2, 3,
- -1, 0, -1, 0, 1, -3, 0, 1, -2, 3,
- -1, 0, -1, -3, -3, 2, -2, -1, 2, -2, 3,
- 0, 0, -1, -2, -2, 1, -2, -1, 2, -2, 2, 3,
- 0, 0, -1, 1, 1, -2, 0, 1, -1, 1, -2, -1, 2,
- 1, 0, -1, -1, -1, -2, -1, 0, -1, -1, 0, -1, 0, 4,
- -1, 0, -2, 1, 1, -2, 0, 2, -2, 2, -1, -1, 0, 0, 3,
- -1, 0, 0, 0, 0, -3, 0, 1, -2, 3, -2, -1, 0, 0, 2, 3,
- 1, 0, 0, 0, 0, -1, 1, 0, -1, 0, -1, -1, 1, 1, 0, 0, 1,
- 1, 0, 0, 0, -1, -1, 0, -1, 0, 0, -1, 0, 0, 1, -1, 0, 1, 1,
- 0, 0, -1, -2, -2, 0, -1, -2, 2, -2, 1, 2, -1, -1, -2, -2, 0, 0, 2,
- -3, 0, 1, -4, -3, 0, -1, -2, -3, -2, -1, -2, -3, -3, -2, 0, -2, -3, -2, 14,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- -2, 0, 2, -2, -2, 5, -3, 3, -1, -2, 0, -1, -1, -2, -1, -1, -1, -2, -2, 0, 0, 7,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
-*/
-
-short idmat[]={
-10,
- 0, 10,
- 0, 0, 10,
- 0, 0, 0, 10,
- 0, 0, 0, 0, 10,
- 0, 0, 0, 0, 0, 10,
- 0, 0, 0, 0, 0, 0, 10,
- 0, 0, 0, 0, 0, 0, 0, 10,
- 0, 0, 0, 0, 0, 0, 0, 0, 10,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 10,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,10,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,10,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,10,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,10,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,10};
-
-short gon40mt[]={
- 92,
- 0, 0,
- -31, 0, 163,
- -56, 0,-135, 111,
- -37, 0,-140, 16, 105,
- -92, 0, -64,-152,-143, 126,
- -32, 0, -91, -51, -76,-152, 105,
- -65, 0, -67, -41, -40, -50, -81, 145,
- -76, 0, -87,-150,-106, -39,-158, -94, 104,
- -54, 0,-132, -47, -13,-127, -79, -34, -86, 103,
- -68, 0, -85,-155,-108, -13,-141, -85, 5, -85, 89,
- -45, 0, -63,-130, -80, -16,-114, -60, 10, -57, 16, 140,
- -62, 0, -83, 6, -38,-104, -40, -7, -99, -20,-112, -91, 115,
- -37, 0,-137, -69, -60,-128, -87, -71,-108, -62, -83,-119, -78, 124,
- -43, 0,-113, -32, 10,-100, -71, 0, -91, 2, -60, -35, -25, -46, 118,
- -61, 0, -86, -77, -50,-130, -69, -31,-103, 19, -84, -81, -47, -73, -6, 112,
- 0, 0, -35, -36, -41,-111, -37, -48, -95, -43, -95, -64, -11, -35, -35, -51, 99,
- -25, 0, -59, -47, -52, -90, -85, -46, -51, -34, -78, -44, -27, -42, -39, -52, 13, 100,
- -22, 0, -43,-133, -74, -58,-122, -98, 28, -82, -18, -22,-103, -86, -79, -88, -74, -25, 97,
--120, 0, -68,-171,-131, -6,-108, -70, -93,-127, -71, -72,-119,-149, -87, -63, -98,-120,-115, 181,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- -95, 0, -56, -98,-107, 31,-129, 5, -76, -88, -64, -66, -62,-106, -81, -75, -69, -87, -73, 1, 0, 135,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
-
-short gon80mt[]={
- 75,
- 0, 0,
- -10, 0, 154,
- -31, 0, -93, 96,
- -17, 0, -94, 31, 88,
- -64, 0, -39,-111,-102, 114,
- -11, 0, -61, -26, -47,-115, 97,
- -39, 0, -43, -17, -17, -26, -53, 127,
- -43, 0, -54,-106, -73, -15,-114, -64, 86,
- -30, 0, -88, -21, 4, -89, -50, -12, -59, 85,
- -43, 0, -55,-109, -75, 7,-104, -57, 22, -58, 77,
- -26, 0, -39, -88, -53, 3, -83, -38, 25, -37, 31, 117,
- -34, 0, -55, 21, -13, -75, -18, 9, -71, -2, -79, -62, 97,
- -16, 0, -93, -42, -35, -93, -58, -45, -75, -37, -58, -78, -48, 114,
- -22, 0, -76, -9, 23, -70, -44, 14, -60, 17, -39, -19, -6, -24, 95,
- -36, 0, -60, -44, -23, -90, -43, -10, -71, 33, -58, -53, -22, -45, 11, 97,
- 14, 0, -15, -14, -19, -77, -16, -25, -62, -20, -64, -41, 5, -14, -15, -27, 78,
- -5, 0, -34, -24, -27, -62, -52, -24, -28, -15, -49, -25, -7, -20, -18, -27, 25, 81,
- -6, 0, -21, -89, -51, -31, -86, -65, 41, -54, 3, 1, -69, -57, -51, -60, -43, -9, 80,
- -87, 0, -43,-124, -98, 16, -81, -43, -63, -89, -44, -45, -86,-112, -62, -41, -72, -87, -80, 173,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- -65, 0, -32, -69, -74, 49, -94, 21, -47, -60, -35, -37, -39, -76, -53, -50, -46, -58, -47, 23, 0, 123,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
-
-short gon120mt[]={
- 59,
- 0, 0,
- -1, 0, 144,
- -18, 0, -69, 82,
- -9, 0, -68, 35, 72,
- -48, 0, -26, -87, -78, 102,
- -3, 0, -45, -14, -31, -92, 90,
- -26, 0, -31, -7, -6, -14, -37, 110,
- -27, 0, -36, -80, -55, -3, -87, -48, 72,
- -19, 0, -64, -8, 11, -67, -34, -2, -44, 69,
- -30, 0, -39, -82, -57, 15, -82, -42, 28, -44, 66,
- -17, 0, -26, -64, -40, 11, -65, -28, 29, -27, 34, 95,
- -20, 0, -41, 26, -1, -58, -7, 14, -55, 5, -61, -46, 80,
- -6, 0, -68, -28, -22, -72, -41, -31, -56, -24, -44, -56, -32, 105,
- -12, 0, -56, 1, 25, -53, -30, 17, -43, 20, -30, -14, 1, -14, 74,
- -23, 0, -45, -27, -10, -68, -30, -1, -53, 36, -44, -38, -10, -30, 16, 83,
- 16, 0, -7, -5, -9, -58, -6, -14, -44, -10, -47, -29, 10, -5, -7, -15, 60,
- 2, 0, -21, -13, -15, -47, -35, -14, -17, -6, -34, -16, 0, -10, -9, -16, 26, 64,
- 0, 0, -11, -65, -38, -17, -65, -47, 42, -39, 13, 10, -50, -42, -36, -44, -28, -3, 65,
- -68, 0, -29, -96, -78, 27, -66, -28, -46, -68, -29, -31, -68, -89, -49, -30, -57, -67, -59, 166,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- -48, 0, -20, -53, -56, 55, -74, 26, -31, -44, -20, -22, -28, -59, -38, -37, -35, -42, -33, 33, 0, 111,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
-
-short gon160mt[]={
- 46,
- 0, 0,
- 3, 0, 135,
- -11, 0, -53, 70,
- -4, 0, -52, 34, 59,
- -38, 0, -18, -70, -62, 91,
- 2, 0, -34, -7, -21, -76, 82,
- -18, 0, -23, -1, -1, -7, -27, 93,
- -18, 0, -25, -62, -43, 3, -70, -37, 59,
- -12, 0, -48, -1, 13, -53, -24, 2, -35, 55,
- -22, 0, -29, -65, -45, 19, -67, -32, 30, -34, 57,
- -12, 0, -19, -50, -31, 14, -52, -21, 29, -21, 34, 76,
- -12, 0, -31, 26, 5, -47, -2, 15, -44, 8, -48, -36, 65,
- -1, 0, -52, -19, -14, -58, -30, -22, -43, -16, -35, -42, -22, 96,
- -7, 0, -42, 6, 23, -41, -21, 17, -32, 20, -24, -12, 5, -8, 56,
- -16, 0, -35, -16, -3, -53, -21, 3, -41, 35, -35, -29, -4, -21, 17, 71,
- 16, 0, -2, 0, -3, -45, -1, -8, -33, -4, -36, -23, 11, 0, -2, -9, 44,
- 5, 0, -14, -6, -8, -36, -24, -8, -12, -2, -24, -11, 3, -4, -4, -9, 23, 50,
- 1, 0, -6, -49, -30, -8, -52, -35, 40, -30, 17, 14, -38, -32, -27, -34, -20, 0, 53,
- -55, 0, -21, -78, -64, 32, -55, -19, -34, -54, -20, -22, -55, -74, -40, -24, -47, -54, -45, 158,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- -37, 0, -13, -42, -44, 56, -60, 27, -20, -35, -11, -13, -22, -48, -29, -29, -28, -32, -24, 38, 0, 100,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
-
-short gon250mt[]={
- 24,
- 0, 0,
- 5, 0, 115,
- -3, 0, -32, 47,
- 0, 0, -30, 27, 36,
- -23, 0, -8, -45, -39, 70,
- 5, 0, -20, 1, -8, -52, 66,
- -8, 0, -13, 4, 4, -1, -14, 60,
- -8, 0, -11, -38, -27, 10, -45, -22, 40,
- -4, 0, -28, 5, 12, -33, -11, 6, -21, 32,
- -12, 0, -15, -40, -28, 20, -44, -19, 28, -21, 40,
- -7, 0, -9, -30, -20, 16, -35, -13, 25, -14, 28, 43,
- -3, 0, -18, 22, 9, -31, 4, 12, -28, 8, -30, -22, 38,
- 3, 0, -31, -7, -5, -38, -16, -11, -26, -6, -23, -24, -9, 76,
- -2, 0, -24, 9, 17, -26, -10, 12, -19, 15, -16, -10, 7, -2, 27,
- -6, 0, -22, -3, 4, -32, -10, 6, -24, 27, -22, -17, 3, -9, 15, 47,
- 11, 0, 1, 5, 2, -28, 4, -2, -18, 1, -21, -14, 9, 4, 2, -2, 22,
- 6, 0, -5, 0, -1, -22, -11, -3, -6, 1, -13, -6, 5, 1, 0, -2, 15, 25,
- 1, 0, 0, -29, -19, 1, -33, -20, 31, -17, 18, 16, -22, -18, -15, -20, -10, 0, 34,
- -36, 0, -10, -52, -43, 36, -40, -8, -18, -35, -7, -10, -36, -50, -27, -16, -33, -35, -26, 142,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- -22, 0, -5, -28, -27, 51, -40, 22, -7, -21, 0, -2, -14, -31, -17, -18, -19, -19, -11, 41, 0, 78,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
-
-short gon300mt[]={
- 16,
- 0, 0,
- 5, 0, 104,
- -1, 0, -24, 37,
- 1, 0, -23, 23, 27,
- -18, 0, -5, -37, -31, 60,
- 5, 0, -15, 3, -4, -42, 58,
- -6, 0, -10, 5, 4, 0, -10, 45,
- -6, 0, -7, -30, -21, 11, -36, -16, 33,
- -2, 0, -21, 6, 11, -26, -7, 5, -17, 24,
- -9, 0, -10, -32, -22, 19, -36, -14, 25, -17, 33,
- -5, 0, -6, -24, -16, 15, -28, -10, 22, -11, 24, 31,
- -1, 0, -14, 18, 9, -25, 5, 10, -22, 8, -24, -17, 27,
- 3, 0, -23, -4, -2, -30, -11, -8, -20, -3, -18, -19, -6, 66,
- -1, 0, -18, 9, 14, -20, -6, 9, -15, 13, -13, -8, 7, -1, 18,
- -4, 0, -17, 0, 5, -25, -6, 6, -19, 22, -18, -13, 4, -6, 13, 37,
- 8, 0, 1, 5, 3, -22, 4, -1, -14, 2, -17, -11, 7, 4, 2, 0, 15,
- 5, 0, -3, 1, 1, -17, -7, -1, -4, 2, -9, -5, 4, 2, 1, -1, 11, 17,
- 0, 0, 1, -23, -15, 4, -26, -15, 26, -13, 17, 15, -17, -14, -12, -15, -8, 0, 26,
- -29, 0, -7, -42, -36, 36, -34, -5, -13, -28, -4, -6, -30, -41, -23, -14, -27, -28, -19, 132,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- -17, 0, -3, -22, -22, 46, -33, 18, -3, -17, 3, 1, -12, -25, -14, -14, -15, -15, -7, 40, 0, 67,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
-
-short gon350mt[]={
- 10,
- 0, 0,
- 4, 0, 93,
- 0, 0, -19, 29,
- 1, 0, -17, 19, 20,
- -14, 0, -3, -30, -25, 51,
- 5, 0, -12, 4, -2, -35, 51,
- -4, 0, -8, 5, 4, 1, -7, 33,
- -4, 0, -5, -24, -17, 11, -29, -13, 27,
- -1, 0, -16, 6, 9, -21, -4, 5, -13, 18,
- -7, 0, -7, -25, -18, 18, -30, -11, 22, -14, 28,
- -4, 0, -4, -19, -13, 14, -23, -8, 19, -9, 21, 23,
- 0, 0, -11, 15, 9, -20, 5, 8, -18, 7, -19, -14, 20,
- 3, 0, -18, -2, 0, -25, -7, -5, -16, -2, -15, -14, -3, 56,
- 0, 0, -14, 8, 11, -16, -4, 7, -11, 10, -11, -7, 6, 0, 12,
- -2, 0, -13, 2, 6, -20, -4, 6, -15, 18, -14, -11, 4, -4, 10, 28,
- 6, 0, 1, 5, 3, -18, 5, 0, -11, 2, -13, -9, 6, 4, 2, 1, 10,
- 4, 0, -2, 2, 1, -13, -5, -1, -3, 2, -7, -4, 4, 2, 1, 0, 8, 11,
- 0, 0, 2, -18, -12, 5, -21, -11, 22, -10, 16, 14, -13, -11, -9, -12, -6, 0, 21,
- -24, 0, -4, -35, -29, 35, -30, -3, -9, -23, -1, -3, -24, -34, -19, -12, -22, -23, -14, 124,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- -14, 0, -1, -18, -17, 42, -27, 15, -1, -14, 5, 2, -10, -20, -11, -12, -12, -12, -4, 39, 0, 57,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
-
-
-char *nucleic_acid_order = "ABCDGHKMNRSTUVWXY";
-
-short clustalvdnamt[]={
- 10,
- 0, 0,
- 0, 0, 10,
- 0, 0, 0, 0,
- 0, 0, 0, 0, 10,
- 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
-
-short swgapdnamt[]={
- 10,
- -9, 10,
- -9, 10, 10,
- 10, 10, -9, 10,
- -9, 10, -9, 10, 10,
- 10, 10, 10, 10, -9, 10,
- -9, 10, -9, 10, 10, 10, 10,
- 10, 10, 10, 10, -9, 10, -9, 10,
- 10, 10, 10, 10, 10, 10, 10, 10, 10,
- 10, 10, -9, 10, 10, 10, 10, 10, 10, 10,
- -9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
- -9, 10, -9, 10, -9, 10, 10, -9, 10, -9, -9, 10,
- -9, 10, -9, 10, -9, 10, 10, -9, 10, -9, -9, 10, 10,
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, -9, -9, 10,
- 10, 10, -9, 10, -9, 10, 10, 10, 10, 10, -9, 10, 10, 10, 10,
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
- -9, 10, 10, 10, -9, 10, 10, 10, 10, -9, 10, 10, 10, 10, 10, 10, 10};
-
Deleted: trunk/packages/clustalw/trunk/matrixseries.gon
===================================================================
--- trunk/packages/clustalw/trunk/matrixseries.gon 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/matrixseries.gon 2007-08-12 15:08:40 UTC (rev 399)
@@ -1,7 +0,0 @@
-CLUSTAL_SERIES
-
-
-MATRIX 61 100 /us1/user/julie/matrices/gon80.bla
-MATRIX 41 60 /us1/user/julie/matrices/gon120.bla
-MATRIX 21 40 /us1/user/julie/matrices/gon250.bla
-MATRIX 0 40 /us1/user/julie/matrices/gon350.bla
Deleted: trunk/packages/clustalw/trunk/pairalign.c
===================================================================
--- trunk/packages/clustalw/trunk/pairalign.c 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/pairalign.c 2007-08-12 15:08:40 UTC (rev 399)
@@ -1,615 +0,0 @@
-/* Change int h to int gh everywhere DES June 1994 */
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-#include "clustalw.h"
-
-#define MIN(a,b) ((a)<(b)?(a):(b))
-#define MAX(a,b) ((a)>(b)?(a):(b))
-
-#define gap(k) ((k) <= 0 ? 0 : g + gh * (k))
-#define tbgap(k) ((k) <= 0 ? 0 : tb + gh * (k))
-#define tegap(k) ((k) <= 0 ? 0 : te + gh * (k))
-
-/*
- * Prototypes
- */
-static void add(sint v);
-static sint calc_score(sint iat, sint jat, sint v1, sint v2);
-static float tracepath(sint tsb1,sint tsb2);
-static void forward_pass(char *ia, char *ib, sint n, sint m);
-static void reverse_pass(char *ia, char *ib);
-static sint diff(sint A, sint B, sint M, sint N, sint tb, sint te);
-static void del(sint k);
-
-/*
- * Global variables
- */
-#ifdef MAC
-#define pwint short
-#else
-#define pwint int
-#endif
-static sint int_scale;
-
-extern double **tmat;
-extern float pw_go_penalty;
-extern float pw_ge_penalty;
-extern float transition_weight;
-extern sint nseqs;
-extern sint max_aa;
-extern sint gap_pos1,gap_pos2;
-extern sint max_aln_length;
-extern sint *seqlen_array;
-extern sint debug;
-extern sint mat_avscore;
-extern short blosum30mt[],pam350mt[],idmat[],pw_usermat[],pw_userdnamat[];
-extern short clustalvdnamt[],swgapdnamt[];
-extern short gon250mt[];
-extern short def_dna_xref[],def_aa_xref[],pw_dna_xref[],pw_aa_xref[];
-extern Boolean dnaflag;
-extern char **seq_array;
-extern char *amino_acid_codes;
-extern char pw_mtrxname[];
-extern char pw_dnamtrxname[];
-
-static float mm_score;
-static sint print_ptr,last_print;
-static sint *displ;
-static pwint *HH, *DD, *RR, *SS;
-static sint g, gh;
-static sint seq1, seq2;
-static sint matrix[NUMRES][NUMRES];
-static pwint maxscore;
-static sint sb1, sb2, se1, se2;
-
-
-sint pairalign(sint istart, sint iend, sint jstart, sint jend)
-{
- short *mat_xref;
- static sint si, sj, i;
- static sint n,m,len1,len2;
- static sint maxres;
- static short *matptr;
- static char c;
- static float gscale,ghscale;
-
- displ = (sint *)ckalloc((2*max_aln_length+1) * sizeof(sint));
- HH = (pwint *)ckalloc((max_aln_length) * sizeof(pwint));
- DD = (pwint *)ckalloc((max_aln_length) * sizeof(pwint));
- RR = (pwint *)ckalloc((max_aln_length) * sizeof(pwint));
- SS = (pwint *)ckalloc((max_aln_length) * sizeof(pwint));
-
-#ifdef MAC
- int_scale = 10;
-#else
- int_scale = 100;
-#endif
- gscale=ghscale=1.0;
- if (dnaflag)
- {
- if (debug>1) fprintf(stdout,"matrix %s\n",pw_dnamtrxname);
- if (strcmp(pw_dnamtrxname, "iub") == 0)
- {
- matptr = swgapdnamt;
- mat_xref = def_dna_xref;
- }
- else if (strcmp(pw_dnamtrxname, "clustalw") == 0)
- {
- matptr = clustalvdnamt;
- mat_xref = def_dna_xref;
- gscale=0.6667;
- ghscale=0.751;
- }
- else
- {
- matptr = pw_userdnamat;
- mat_xref = pw_dna_xref;
- }
- maxres = get_matrix(matptr, mat_xref, matrix, TRUE, int_scale);
- if (maxres == 0) return((sint)-1);
-
- matrix[0][4]=transition_weight*matrix[0][0];
- matrix[4][0]=transition_weight*matrix[0][0];
- matrix[2][11]=transition_weight*matrix[0][0];
- matrix[11][2]=transition_weight*matrix[0][0];
- matrix[2][12]=transition_weight*matrix[0][0];
- matrix[12][2]=transition_weight*matrix[0][0];
- }
- else
- {
- if (debug>1) fprintf(stdout,"matrix %s\n",pw_mtrxname);
- if (strcmp(pw_mtrxname, "blosum") == 0)
- {
- matptr = blosum30mt;
- mat_xref = def_aa_xref;
- }
- else if (strcmp(pw_mtrxname, "pam") == 0)
- {
- matptr = pam350mt;
- mat_xref = def_aa_xref;
- }
- else if (strcmp(pw_mtrxname, "gonnet") == 0)
- {
- matptr = gon250mt;
- int_scale /= 10;
- mat_xref = def_aa_xref;
- }
- else if (strcmp(pw_mtrxname, "id") == 0)
- {
- matptr = idmat;
- mat_xref = def_aa_xref;
- }
- else
- {
- matptr = pw_usermat;
- mat_xref = pw_aa_xref;
- }
-
- maxres = get_matrix(matptr, mat_xref, matrix, TRUE, int_scale);
- if (maxres == 0) return((sint)-1);
- }
-
-
- for (si=MAX(0,istart);si<nseqs && si<iend;si++)
- {
- n = seqlen_array[si+1];
- len1 = 0;
- for (i=1;i<=n;i++) {
- c = seq_array[si+1][i];
- if ((c!=gap_pos1) && (c != gap_pos2)) len1++;
- }
-
- for (sj=MAX(si+1,jstart+1);sj<nseqs && sj<jend;sj++)
- {
- m = seqlen_array[sj+1];
- if(n==0 || m==0) {
- tmat[si+1][sj+1]=1.0;
- tmat[sj+1][si+1]=1.0;
- continue;
- }
- len2 = 0;
- for (i=1;i<=m;i++) {
- c = seq_array[sj+1][i];
- if ((c!=gap_pos1) && (c != gap_pos2)) len2++;
- }
-
- if (dnaflag) {
- g = 2 * (float)pw_go_penalty * int_scale*gscale;
- gh = pw_ge_penalty * int_scale*ghscale;
- }
- else {
- if (mat_avscore <= 0)
- g = 2 * (float)(pw_go_penalty + log((double)(MIN(n,m))))*int_scale;
- else
- g = 2 * mat_avscore * (float)(pw_go_penalty +
- log((double)(MIN(n,m))))*gscale;
- gh = pw_ge_penalty * int_scale;
- }
-
- if (debug>1) fprintf(stdout,"go %d ge %d\n",(pint)g,(pint)gh);
-
- /*
- align the sequences
- */
- seq1 = si+1;
- seq2 = sj+1;
-
- forward_pass(&seq_array[seq1][0], &seq_array[seq2][0],
- n, m);
-
- reverse_pass(&seq_array[seq1][0], &seq_array[seq2][0]);
-
- last_print = 0;
- print_ptr = 1;
-/*
- sb1 = sb2 = 1;
- se1 = n-1;
- se2 = m-1;
-*/
-
-/* use Myers and Miller to align two sequences */
-
- maxscore = diff(sb1-1, sb2-1, se1-sb1+1, se2-sb2+1,
- (sint)0, (sint)0);
-
-/* calculate percentage residue identity */
-
- mm_score = tracepath(sb1,sb2);
-
- if(len1==0 || len2==0) mm_score=0;
- else
- mm_score /= (float)MIN(len1,len2);
-
- tmat[si+1][sj+1] = ((float)100.0 - mm_score)/(float)100.0;
- tmat[sj+1][si+1] = ((float)100.0 - mm_score)/(float)100.0;
-
-if (debug>1)
-{
- fprintf(stdout,"Sequences (%d:%d) Aligned. Score: %d CompScore: %d\n",
- (pint)si+1,(pint)sj+1,
- (pint)mm_score,
- (pint)maxscore/(MIN(len1,len2)*100));
-}
-else
-{
- info("Sequences (%d:%d) Aligned. Score: %d",
- (pint)si+1,(pint)sj+1,
- (pint)mm_score);
-}
-
- }
- }
- displ=ckfree((void *)displ);
- HH=ckfree((void *)HH);
- DD=ckfree((void *)DD);
- RR=ckfree((void *)RR);
- SS=ckfree((void *)SS);
-
-
- return((sint)1);
-}
-
-static void add(sint v)
-{
-
- if(last_print<0) {
- displ[print_ptr-1] = v;
- displ[print_ptr++] = last_print;
- }
- else
- last_print = displ[print_ptr++] = v;
-}
-
-static sint calc_score(sint iat,sint jat,sint v1,sint v2)
-{
- sint ipos,jpos;
- sint ret;
-
- ipos = v1 + iat;
- jpos = v2 + jat;
-
- ret=matrix[(int)seq_array[seq1][ipos]][(int)seq_array[seq2][jpos]];
-
- return(ret);
-}
-
-
-static float tracepath(sint tsb1,sint tsb2)
-{
- char c1,c2;
- sint i1,i2,r;
- sint i,k,pos,to_do;
- sint count;
- float score;
- char s1[600], s2[600];
-
- to_do=print_ptr-1;
- i1 = tsb1;
- i2 = tsb2;
-
- pos = 0;
- count = 0;
- for(i=1;i<=to_do;++i) {
-
- if (debug>1) fprintf(stdout,"%d ",(pint)displ[i]);
- if(displ[i]==0) {
- c1 = seq_array[seq1][i1];
- c2 = seq_array[seq2][i2];
-
- if (debug>0)
- {
- if (c1>max_aa) s1[pos] = '-';
- else s1[pos]=amino_acid_codes[c1];
- if (c2>max_aa) s2[pos] = '-';
- else s2[pos]=amino_acid_codes[c2];
- }
-
- if ((c1!=gap_pos1) && (c1 != gap_pos2) &&
- (c1 == c2)) count++;
- ++i1;
- ++i2;
- ++pos;
- }
- else {
- if((k=displ[i])>0) {
-
- if (debug>0)
- for (r=0;r<k;r++)
- {
- s1[pos+r]='-';
- if (seq_array[seq2][i2+r]>max_aa) s2[pos+r] = '-';
- else s2[pos+r]=amino_acid_codes[seq_array[seq2][i2+r]];
- }
-
- i2 += k;
- pos += k;
- }
- else {
-
- if (debug>0)
- for (r=0;r<(-k);r++)
- {
- s2[pos+r]='-';
- if (seq_array[seq1][i1+r]>max_aa) s1[pos+r] = '-';
- else s1[pos+r]=amino_acid_codes[seq_array[seq1][i1+r]];
- }
-
- i1 -= k;
- pos -= k;
- }
- }
- }
- if (debug>0) fprintf(stdout,"\n");
- if (debug>0)
- {
- for (i=0;i<pos;i++) fprintf(stdout,"%c",s1[i]);
- fprintf(stdout,"\n");
- for (i=0;i<pos;i++) fprintf(stdout,"%c",s2[i]);
- fprintf(stdout,"\n");
- }
- /*
- if (count <= 0) count = 1;
- */
- score = 100.0 * (float)count;
- return(score);
-}
-
-
-static void forward_pass(char *ia, char *ib, sint n, sint m)
-{
-
- sint i,j;
- pwint f,hh,p,t;
-
- maxscore = 0;
- se1 = se2 = 0;
- for (i=0;i<=m;i++)
- {
- HH[i] = 0;
- DD[i] = -g;
- }
-
- for (i=1;i<=n;i++)
- {
- hh = p = 0;
- f = -g;
-
- for (j=1;j<=m;j++)
- {
-
- f -= gh;
- t = hh - g - gh;
- if (f<t) f = t;
-
- DD[j] -= gh;
- t = HH[j] - g - gh;
- if (DD[j]<t) DD[j] = t;
-
- hh = p + matrix[(int)ia[i]][(int)ib[j]];
- if (hh<f) hh = f;
- if (hh<DD[j]) hh = DD[j];
- if (hh<0) hh = 0;
-
- p = HH[j];
- HH[j] = hh;
-
- if (hh > maxscore)
- {
- maxscore = hh;
- se1 = i;
- se2 = j;
- }
- }
- }
-
-}
-
-
-static void reverse_pass(char *ia, char *ib)
-{
-
- sint i,j;
- pwint f,hh,p,t;
- pwint cost;
-
- cost = 0;
- sb1 = sb2 = 1;
- for (i=se2;i>0;i--)
- {
- HH[i] = -1;
- DD[i] = -1;
- }
-
- for (i=se1;i>0;i--)
- {
- hh = f = -1;
- if (i == se1) p = 0;
- else p = -1;
-
- for (j=se2;j>0;j--)
- {
-
- f -= gh;
- t = hh - g - gh;
- if (f<t) f = t;
-
- DD[j] -= gh;
- t = HH[j] - g - gh;
- if (DD[j]<t) DD[j] = t;
-
- hh = p + matrix[(int)ia[i]][(int)ib[j]];
- if (hh<f) hh = f;
- if (hh<DD[j]) hh = DD[j];
-
- p = HH[j];
- HH[j] = hh;
-
- if (hh > cost)
- {
- cost = hh;
- sb1 = i;
- sb2 = j;
- if (cost >= maxscore) break;
- }
- }
- if (cost >= maxscore) break;
- }
-
-}
-
-static int diff(sint A,sint B,sint M,sint N,sint tb,sint te)
-{
- sint type;
- sint midi,midj,i,j;
- int midh;
- static pwint f, hh, e, s, t;
-
- if(N<=0) {
- if(M>0) {
- del(M);
- }
-
- return(-(int)tbgap(M));
- }
-
- if(M<=1) {
- if(M<=0) {
- add(N);
- return(-(int)tbgap(N));
- }
-
- midh = -(tb+gh) - tegap(N);
- hh = -(te+gh) - tbgap(N);
- if (hh>midh) midh = hh;
- midj = 0;
- for(j=1;j<=N;j++) {
- hh = calc_score(1,j,A,B)
- - tegap(N-j) - tbgap(j-1);
- if(hh>midh) {
- midh = hh;
- midj = j;
- }
- }
-
- if(midj==0) {
- del(1);
- add(N);
- }
- else {
- if(midj>1)
- add(midj-1);
- displ[print_ptr++] = last_print = 0;
- if(midj<N)
- add(N-midj);
- }
- return midh;
- }
-
-/* Divide: Find optimum midpoint (midi,midj) of cost midh */
-
- midi = M / 2;
- HH[0] = 0.0;
- t = -tb;
- for(j=1;j<=N;j++) {
- HH[j] = t = t-gh;
- DD[j] = t-g;
- }
-
- t = -tb;
- for(i=1;i<=midi;i++) {
- s=HH[0];
- HH[0] = hh = t = t-gh;
- f = t-g;
- for(j=1;j<=N;j++) {
- if ((hh=hh-g-gh) > (f=f-gh)) f=hh;
- if ((hh=HH[j]-g-gh) > (e=DD[j]-gh)) e=hh;
- hh = s + calc_score(i,j,A,B);
- if (f>hh) hh = f;
- if (e>hh) hh = e;
-
- s = HH[j];
- HH[j] = hh;
- DD[j] = e;
- }
- }
-
- DD[0]=HH[0];
-
- RR[N]=0;
- t = -te;
- for(j=N-1;j>=0;j--) {
- RR[j] = t = t-gh;
- SS[j] = t-g;
- }
-
- t = -te;
- for(i=M-1;i>=midi;i--) {
- s = RR[N];
- RR[N] = hh = t = t-gh;
- f = t-g;
-
- for(j=N-1;j>=0;j--) {
-
- if ((hh=hh-g-gh) > (f=f-gh)) f=hh;
- if ((hh=RR[j]-g-gh) > (e=SS[j]-gh)) e=hh;
- hh = s + calc_score(i+1,j+1,A,B);
- if (f>hh) hh = f;
- if (e>hh) hh = e;
-
- s = RR[j];
- RR[j] = hh;
- SS[j] = e;
-
- }
- }
-
- SS[N]=RR[N];
-
- midh=HH[0]+RR[0];
- midj=0;
- type=1;
- for(j=0;j<=N;j++) {
- hh = HH[j] + RR[j];
- if(hh>=midh)
- if(hh>midh || (HH[j]!=DD[j] && RR[j]==SS[j])) {
- midh=hh;
- midj=j;
- }
- }
-
- for(j=N;j>=0;j--) {
- hh = DD[j] + SS[j] + g;
- if(hh>midh) {
- midh=hh;
- midj=j;
- type=2;
- }
- }
-
- /* Conquer recursively around midpoint */
-
-
- if(type==1) { /* Type 1 gaps */
- diff(A,B,midi,midj,tb,g);
- diff(A+midi,B+midj,M-midi,N-midj,g,te);
- }
- else {
- diff(A,B,midi-1,midj,tb,0.0);
- del(2);
- diff(A+midi+1,B+midj,M-midi-1,N-midj,0.0,te);
- }
-
- return midh; /* Return the score of the best alignment */
-}
-
-static void del(sint k)
-{
- if(last_print<0)
- last_print = displ[print_ptr-1] -= k;
- else
- last_print = displ[print_ptr++] = -(k);
-}
-
-
Deleted: trunk/packages/clustalw/trunk/param.h
===================================================================
--- trunk/packages/clustalw/trunk/param.h 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/param.h 2007-08-12 15:08:40 UTC (rev 399)
@@ -1,383 +0,0 @@
-#define MAXARGS 100
-
-typedef struct {
- char *str;
- sint *flag;
- int type;
- char **arg;
-} cmd_line_data;
-
-/*
- command line switches
-*/
-sint setoptions = -1;
-sint sethelp = -1;
-sint setinteractive = -1;
-sint setbatch = -1;
-sint setgapopen = -1;
-sint setgapext = -1;
-sint setpwgapopen = -1;
-sint setpwgapext = -1;
-sint setoutorder = -1;
-sint setbootlabels = -1;
-sint setpwmatrix = -1;
-sint setmatrix = -1;
-sint setpwdnamatrix = -1;
-sint setdnamatrix = -1;
-sint setnegative = -1;
-sint setnoweights = -1;
-sint setoutput = -1;
-sint setoutputtree = -1;
-sint setquicktree = -1;
-sint settype = -1;
-sint setcase = -1;
-sint setseqno = -1;
-
-sint setseqno_range = -1;
-sint setrange = -1;
-
-sint settransweight = -1;
-sint setseed = -1;
-sint setscore = -1;
-sint setwindow = -1;
-sint setktuple = -1;
-sint setkimura = -1;
-sint settopdiags = -1;
-sint setpairgap = -1;
-sint settossgaps = -1;
-sint setnopgap = -1;
-sint setnohgap = -1;
-sint setnovgap = -1;
-sint sethgapres = -1;
-sint setvgapres = -1;
-sint setuseendgaps = -1;
-sint setmaxdiv = -1;
-sint setgapdist = -1;
-sint setdebug = -1;
-sint setoutfile = -1;
-sint setinfile = -1;
-sint setprofile1 = -1;
-sint setprofile2 = -1;
-sint setalign = -1;
-sint setconvert = -1;
-sint setnewtree = -1;
-sint setusetree = -1;
-sint setnewtree1 = -1;
-sint setusetree1 = -1;
-sint setnewtree2 = -1;
-sint setusetree2 = -1;
-sint setbootstrap = -1;
-sint settree = -1;
-sint setprofile = -1;
-sint setsequences = -1;
-sint setsecstr1 = -1;
-sint setsecstr2 = -1;
-sint setsecstroutput = -1;
-sint sethelixgap = -1;
-sint setstrandgap = -1;
-sint setloopgap = -1;
-sint setterminalgap = -1;
-sint sethelixendin = -1;
-sint sethelixendout = -1;
-sint setstrandendin = -1;
-sint setstrandendout = -1;
-
-/*
- multiple alignment parameters
-*/
-float dna_gap_open = 15.0, dna_gap_extend = 6.66;
-float prot_gap_open = 10.0, prot_gap_extend = 0.2;
-sint profile_type = PROFILE;
-sint gap_dist = 4;
-sint output_order = ALIGNED;
-sint divergence_cutoff = 30;
-sint matnum = 3;
-char mtrxname[FILENAMELEN+1] = "gonnet";
-sint dnamatnum = 1;
-char dnamtrxname[FILENAMELEN+1] = "iub";
-char hyd_residues[] = "GPSNDQEKR";
-Boolean no_weights = FALSE;
-Boolean neg_matrix = FALSE;
-Boolean no_hyd_penalties = FALSE;
-Boolean no_var_penalties = TRUE;
-Boolean no_pref_penalties = FALSE;
-Boolean use_endgaps = FALSE;
-Boolean endgappenalties = FALSE;
-Boolean reset_alignments_new = FALSE; /* DES */
-Boolean reset_alignments_all = FALSE; /* DES */
-sint output_struct_penalties = 0;
-sint struct_penalties1 = NONE;
-sint struct_penalties2 = NONE;
-Boolean use_ss1 = TRUE;
-Boolean use_ss2 = TRUE;
-sint helix_penalty = 4;
-sint strand_penalty = 4;
-sint loop_penalty = 1;
-sint helix_end_minus = 3;
-sint helix_end_plus = 0;
-sint strand_end_minus = 1;
-sint strand_end_plus = 1;
-sint helix_end_penalty = 2;
-sint strand_end_penalty = 2;
-Boolean use_ambiguities = FALSE;
-
-/*
- pairwise alignment parameters
-*/
-float dna_pw_go_penalty = 15.0, dna_pw_ge_penalty = 6.66;
-float prot_pw_go_penalty = 10.0, prot_pw_ge_penalty = 0.1;
-sint pw_matnum = 3;
-char pw_mtrxname[FILENAMELEN+1] = "gonnet";
-sint pw_dnamatnum = 1;
-char pw_dnamtrxname[FILENAMELEN+1] = "iub";
-char usermtrxname[FILENAMELEN+1], pw_usermtrxname[FILENAMELEN+1];
-char dnausermtrxname[FILENAMELEN+1], pw_dnausermtrxname[FILENAMELEN+1];
-
-Boolean quick_pairalign = FALSE;
-float transition_weight = 0.5;
-sint new_seq;
-
-/*
- quick pairwise alignment parameters
-*/
-sint dna_ktup = 2; /* default parameters for DNA */
-sint dna_wind_gap = 5;
-sint dna_signif = 4;
-sint dna_window = 4;
-
-sint prot_ktup = 1; /* default parameters for proteins */
-sint prot_wind_gap = 3;
-sint prot_signif = 5;
-sint prot_window = 5;
-Boolean percent=TRUE;
-Boolean tossgaps = FALSE;
-Boolean kimura = FALSE;
-
-
-sint boot_ntrials = 1000;
-unsigned sint boot_ran_seed = 111;
-
-
-sint debug = 0;
-
-Boolean explicit_dnaflag = FALSE; /* Explicit setting of sequence type on comm.line*/
-Boolean lowercase = TRUE; /* Flag for GDE output - set on comm. line*/
-Boolean cl_seq_numbers = FALSE;
-
-Boolean seqRange = FALSE; /* Ramu */
-
-Boolean output_clustal = TRUE;
-Boolean output_gcg = FALSE;
-Boolean output_phylip = FALSE;
-Boolean output_nbrf = FALSE;
-Boolean output_gde = FALSE;
-Boolean output_nexus = FALSE;
-Boolean output_fasta = FALSE;
-
-Boolean showaln = TRUE;
-Boolean save_parameters = FALSE;
-
-/* DES */
-Boolean output_tree_clustal = FALSE;
-Boolean output_tree_phylip = TRUE;
-Boolean output_tree_distances = FALSE;
-Boolean output_tree_nexus = FALSE;
-Boolean output_pim = FALSE;
-
-
-sint bootstrap_format = BS_BRANCH_LABELS;
-
-/*These are all the positively scoring groups that occur in the Gonnet Pam250
-matrix. There are strong and weak groups, defined as strong score >0.5 and
-weak score =<0.5. Strong matching columns to be assigned ':' and weak matches
-assigned '.' in the clustal output format.
-*/
-
-char *res_cat1[] = {
- "STA",
- "NEQK",
- "NHQK",
- "NDEQ",
- "QHRK",
- "MILV",
- "MILF",
- "HY",
- "FYW",
- NULL };
-
-char *res_cat2[] = {
- "CSA",
- "ATV",
- "SAG",
- "STNK",
- "STPA",
- "SGND",
- "SNDEQK",
- "NDEQHK",
- "NEQHRK",
- "FVLIM",
- "HFY",
- NULL };
-
-
-
-static char *type_arg[] = {
- "protein",
- "dna",
- ""};
-
-static char *bootlabels_arg[] = {
- "node",
- "branch",
- ""};
-
-static char *outorder_arg[] = {
- "input",
- "aligned",
- ""};
-
-static char *case_arg[] = {
- "lower",
- "upper",
- ""};
-
-static char *seqno_arg[] = {
- "off",
- "on",
- ""};
-
-static char *seqno_range_arg[] = {
- "off",
- "on",
- ""};
-
-static char *score_arg[] = {
- "percent",
- "absolute",
- ""};
-
-static char *output_arg[] = {
- "gcg",
- "gde",
- "pir",
- "phylip",
- "nexus",
- "fasta",
- ""};
-
-static char *outputtree_arg[] = {
- "nj",
- "phylip",
- "dist",
- "nexus",
- ""};
-
-static char *outputsecstr_arg[] = {
- "structure",
- "mask",
- "both",
- "none",
- ""};
-
-/*
- command line initialisation
-
- type = 0 no argument
- type = 1 integer argument
- type = 2 float argument
- type = 3 string argument
- type = 4 filename
- type = 5 opts
-*/
-#define NOARG 0
-#define INTARG 1
-#define FLTARG 2
-#define STRARG 3
-#define FILARG 4
-#define OPTARG 5
-
-
-/* command line switches for DATA **************************/
-cmd_line_data cmd_line_file[] = {
- "infile", &setinfile, FILARG, NULL,
- "profile1", &setprofile1, FILARG, NULL,
- "profile2", &setprofile2, FILARG, NULL,
- "", NULL, -1};
-/* command line switches for VERBS **************************/
-cmd_line_data cmd_line_verb[] = {
- "help", &sethelp, NOARG, NULL,
- "check", &sethelp, NOARG, NULL,
- "options", &setoptions, NOARG, NULL,
- "align", &setalign, NOARG, NULL,
- "newtree", &setnewtree, FILARG, NULL,
- "usetree", &setusetree, FILARG, NULL,
- "newtree1", &setnewtree1, FILARG, NULL,
- "usetree1", &setusetree1, FILARG, NULL,
- "newtree2", &setnewtree2, FILARG, NULL,
- "usetree2", &setusetree2, FILARG, NULL,
- "bootstrap", &setbootstrap, NOARG, NULL,
- "tree", &settree, NOARG, NULL,
- "quicktree", &setquicktree, NOARG, NULL,
- "convert", &setconvert, NOARG, NULL,
- "interactive", &setinteractive, NOARG, NULL,
- "batch", &setbatch, NOARG, NULL,
- "", NULL, -1};
-/* command line switches for PARAMETERS **************************/
-cmd_line_data cmd_line_para[] = {
- "type", &settype, OPTARG, type_arg,
- "profile", &setprofile, NOARG, NULL,
- "sequences", &setsequences, NOARG, NULL,
- "matrix", &setmatrix, FILARG, NULL,
- "dnamatrix", &setdnamatrix, FILARG, NULL,
- "negative", &setnegative, NOARG, NULL,
- "noweights", &setnoweights, NOARG, NULL,
- "gapopen", &setgapopen, FLTARG, NULL,
- "gapext", &setgapext, FLTARG, NULL,
- "endgaps", &setuseendgaps, NOARG, NULL,
- "nopgap", &setnopgap, NOARG, NULL,
- "nohgap", &setnohgap, NOARG, NULL,
- "novgap", &setnovgap, NOARG, NULL,
- "hgapresidues", &sethgapres, STRARG, NULL,
- "maxdiv", &setmaxdiv, INTARG, NULL,
-
- "gapdist", &setgapdist, INTARG, NULL,
- "pwmatrix", &setpwmatrix, FILARG, NULL,
- "pwdnamatrix", &setpwdnamatrix, FILARG, NULL,
- "pwgapopen", &setpwgapopen, FLTARG, NULL,
- "pwgapext", &setpwgapext, FLTARG, NULL,
- "ktuple", &setktuple, INTARG, NULL,
- "window", &setwindow, INTARG, NULL,
- "pairgap", &setpairgap, INTARG, NULL,
- "topdiags", &settopdiags, INTARG, NULL,
- "score", &setscore, OPTARG, score_arg,
- "transweight", &settransweight, FLTARG, NULL,
- "seed", &setseed, INTARG, NULL,
- "kimura", &setkimura, NOARG, NULL,
- "tossgaps", &settossgaps, NOARG, NULL,
- "bootlabels", &setbootlabels, OPTARG, bootlabels_arg,
- "debug", &setdebug, INTARG, NULL,
- "output", &setoutput, OPTARG, output_arg,
- "outputtree", &setoutputtree, OPTARG, outputtree_arg,
- "outfile", &setoutfile, FILARG, NULL,
- "outorder", &setoutorder, OPTARG, outorder_arg,
- "case", &setcase, OPTARG, case_arg,
- "seqnos", &setseqno, OPTARG, seqno_arg,
-
- "seqno_range", &setseqno_range, OPTARG, seqno_range_arg, /* this one should be on/off and */
- "range", &setrange, STRARG, NULL, /* this one should be like 10:20 , messy option settings */
-
- "nosecstr1", &setsecstr1, NOARG, NULL,
- "nosecstr2", &setsecstr2, NOARG, NULL,
- "secstrout", &setsecstroutput, OPTARG, outputsecstr_arg,
- "helixgap", &sethelixgap, INTARG, NULL,
- "strandgap", &setstrandgap, INTARG, NULL,
- "loopgap", &setloopgap, INTARG, NULL,
- "terminalgap", &setterminalgap, INTARG, NULL,
- "helixendin", &sethelixendin, INTARG, NULL,
- "helixendout", &sethelixendout, INTARG, NULL,
- "strandendin", &setstrandendin, INTARG, NULL,
- "strandendout",&setstrandendout, INTARG, NULL,
-
- "", NULL, -1};
-
-
Deleted: trunk/packages/clustalw/trunk/prfalign.c
===================================================================
--- trunk/packages/clustalw/trunk/prfalign.c 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/prfalign.c 2007-08-12 15:08:40 UTC (rev 399)
@@ -1,1132 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-#include "clustalw.h"
-#define ENDALN 127
-
-#define MAX(a,b) ((a)>(b)?(a):(b))
-#define MIN(a,b) ((a)<(b)?(a):(b))
-
-/*
- * Prototypes
- */
-static lint pdiff(sint A,sint B,sint i,sint j,sint go1,sint go2);
-static lint prfscore(sint n, sint m);
-static sint gap_penalty1(sint i, sint j,sint k);
-static sint open_penalty1(sint i, sint j);
-static sint ext_penalty1(sint i, sint j);
-static sint gap_penalty2(sint i, sint j,sint k);
-static sint open_penalty2(sint i, sint j);
-static sint ext_penalty2(sint i, sint j);
-static void padd(sint k);
-static void pdel(sint k);
-static void palign(void);
-static void ptracepath(sint *alen);
-static void add_ggaps(void);
-static char * add_ggaps_mask(char *mask, int len, char *path1, char *path2);
-
-/*
- * Global variables
- */
-extern double **tmat;
-extern float gap_open, gap_extend;
-extern float transition_weight;
-extern sint gap_pos1, gap_pos2;
-extern sint max_aa;
-extern sint nseqs;
-extern sint *seqlen_array;
-extern sint *seq_weight;
-extern sint debug;
-extern Boolean neg_matrix;
-extern sint mat_avscore;
-extern short blosum30mt[], blosum40mt[], blosum45mt[];
-extern short blosum62mt2[], blosum80mt[];
-extern short pam20mt[], pam60mt[];
-extern short pam120mt[], pam160mt[], pam350mt[];
-extern short gon40mt[], gon80mt[];
-extern short gon120mt[], gon160mt[], gon250mt[], gon350mt[];
-extern short clustalvdnamt[],swgapdnamt[];
-extern short idmat[];
-extern short usermat[];
-extern short userdnamat[];
-extern Boolean user_series;
-extern UserMatSeries matseries;
-
-extern short def_dna_xref[],def_aa_xref[],dna_xref[],aa_xref[];
-extern sint max_aln_length;
-extern Boolean distance_tree;
-extern Boolean dnaflag;
-extern char mtrxname[];
-extern char dnamtrxname[];
-extern char **seq_array;
-extern char *amino_acid_codes;
-extern char *gap_penalty_mask1,*gap_penalty_mask2;
-extern char *sec_struct_mask1,*sec_struct_mask2;
-extern sint struct_penalties1, struct_penalties2;
-extern Boolean use_ss1, use_ss2;
-extern Boolean endgappenalties;
-
-static sint print_ptr,last_print;
-static sint *displ;
-
-static char **alignment;
-static sint *aln_len;
-static sint *aln_weight;
-static char *aln_path1, *aln_path2;
-static sint alignment_len;
-static sint **profile1, **profile2;
-static lint *HH, *DD, *RR, *SS;
-static lint *gS;
-static sint matrix[NUMRES][NUMRES];
-static sint nseqs1, nseqs2;
-static sint prf_length1, prf_length2;
-static sint *gaps;
-static sint gapcoef1,gapcoef2;
-static sint lencoef1,lencoef2;
-static Boolean switch_profiles;
-
-lint prfalign(sint *group, sint *aligned)
-{
-
- static Boolean found;
- static Boolean negative;
- static Boolean error_given=FALSE;
- static sint i, j, count = 0;
- static sint NumSeq;
- static sint len, len1, len2, is, minlen;
- static sint se1, se2, sb1, sb2;
- static sint maxres;
- static sint int_scale;
- static short *matptr;
- static short *mat_xref;
- static char c;
- static lint score;
- static float scale;
- static double logmin,logdiff;
- static double pcid;
-
-
- alignment = (char **) ckalloc( nseqs * sizeof (char *) );
- aln_len = (sint *) ckalloc( nseqs * sizeof (sint) );
- aln_weight = (sint *) ckalloc( nseqs * sizeof (sint) );
-
- for (i=0;i<nseqs;i++)
- if (aligned[i+1] == 0) group[i+1] = 0;
-
- nseqs1 = nseqs2 = 0;
- for (i=0;i<nseqs;i++)
- {
- if (group[i+1] == 1) nseqs1++;
- else if (group[i+1] == 2) nseqs2++;
- }
-
- if ((nseqs1 == 0) || (nseqs2 == 0)) return(0.0);
-
- if (nseqs2 > nseqs1)
- {
- switch_profiles = TRUE;
- for (i=0;i<nseqs;i++)
- {
- if (group[i+1] == 1) group[i+1] = 2;
- else if (group[i+1] == 2) group[i+1] = 1;
- }
- }
- else
- switch_profiles = FALSE;
-
- int_scale = 100;
-
-/*
- calculate the mean of the sequence pc identities between the two groups
-*/
- count = 0;
- pcid = 0.0;
- negative=neg_matrix;
- for (i=0;i<nseqs;i++)
- {
- if (group[i+1] == 1)
- for (j=0;j<nseqs;j++)
- if (group[j+1] == 2)
- {
- count++;
- pcid += tmat[i+1][j+1];
- }
- }
-
- pcid = pcid/(float)count;
-
-if (debug > 0) fprintf(stdout,"mean tmat %3.1f\n", pcid);
-
-
-/*
- Make the first profile.
-*/
- prf_length1 = 0;
- for (i=0;i<nseqs;i++)
- if (group[i+1] == 1)
- if(seqlen_array[i+1]>prf_length1) prf_length1=seqlen_array[i+1];
-
- nseqs1 = 0;
-if (debug>0) fprintf(stdout,"sequences profile 1:\n");
- for (i=0;i<nseqs;i++)
- {
- if (group[i+1] == 1)
- {
-if (debug>0) {
-extern char **names;
-fprintf(stdout,"%s\n",names[i+1]);
-}
- len = seqlen_array[i+1];
- alignment[nseqs1] = (char *) ckalloc( (prf_length1+2) * sizeof (char) );
- for (j=0;j<len;j++)
- alignment[nseqs1][j] = seq_array[i+1][j+1];
- for(j=len;j<prf_length1;j++)
- alignment[nseqs1][j]=gap_pos1;
- alignment[nseqs1][j] = ENDALN;
- aln_len[nseqs1] = prf_length1;
- aln_weight[nseqs1] = seq_weight[i];
- nseqs1++;
- }
- }
-
-/*
- Make the second profile.
-*/
- prf_length2 = 0;
- for (i=0;i<nseqs;i++)
- if (group[i+1] == 2)
- if(seqlen_array[i+1]>prf_length2) prf_length2=seqlen_array[i+1];
-
- nseqs2 = 0;
-if (debug>0) fprintf(stdout,"sequences profile 2:\n");
- for (i=0;i<nseqs;i++)
- {
- if (group[i+1] == 2)
- {
-if (debug>0) {
-extern char **names;
-fprintf(stdout,"%s\n",names[i+1]);
-}
- len = seqlen_array[i+1];
- alignment[nseqs1+nseqs2] =
- (char *) ckalloc( (prf_length2+2) * sizeof (char) );
- for (j=0;j<len;j++)
- alignment[nseqs1+nseqs2][j] = seq_array[i+1][j+1];
- for(j=len;j<prf_length2;j++)
- alignment[nseqs1+nseqs2][j]=gap_pos1;
- alignment[nseqs1+nseqs2][j] = ENDALN;
- aln_len[nseqs1+nseqs2] = prf_length2;
- aln_weight[nseqs1+nseqs2] = seq_weight[i];
- nseqs2++;
- }
- }
-
- max_aln_length = prf_length1 + prf_length2+2;
-
-/*
- calculate real length of profiles - removing gaps!
-*/
- len1=0;
- for (i=0;i<nseqs1;i++)
- {
- is=0;
- for (j=0; j<MIN(aln_len[i],prf_length1); j++)
- {
- c = alignment[i][j];
- if ((c !=gap_pos1) && (c != gap_pos2)) is++;
- }
- len1+=is;
- }
- len1/=(float)nseqs1;
-
- len2=0;
- for (i=nseqs1;i<nseqs2+nseqs1;i++)
- {
- is=0;
- for (j=0; j<MIN(aln_len[i],prf_length2); j++)
- {
- c = alignment[i][j];
- if ((c !=gap_pos1) && (c != gap_pos2)) is++;
- }
- len2+=is;
- }
- len2/=(float)nseqs2;
-
- if (dnaflag)
- {
- scale=1.0;
- if (strcmp(dnamtrxname, "iub") == 0)
- {
- matptr = swgapdnamt;
- mat_xref = def_dna_xref;
- }
- else if (strcmp(dnamtrxname, "clustalw") == 0)
- {
- matptr = clustalvdnamt;
- mat_xref = def_dna_xref;
- scale=0.66;
- }
- else
- {
- matptr = userdnamat;
- mat_xref = dna_xref;
- }
- maxres = get_matrix(matptr, mat_xref, matrix, neg_matrix, int_scale);
- if (maxres == 0) return((sint)-1);
-/*
- matrix[0][4]=transition_weight*matrix[0][0];
- matrix[4][0]=transition_weight*matrix[0][0];
- matrix[2][11]=transition_weight*matrix[0][0];
- matrix[11][2]=transition_weight*matrix[0][0];
- matrix[2][12]=transition_weight*matrix[0][0];
- matrix[12][2]=transition_weight*matrix[0][0];
-*/
-/* fix suggested by Chanan Rubin at Compugen */
- matrix[mat_xref[0]][mat_xref[4]]=transition_weight*matrix[0][0];
- matrix[mat_xref[4]][mat_xref[0]]=transition_weight*matrix[0][0];
- matrix[mat_xref[2]][mat_xref[11]]=transition_weight*matrix[0][0];
- matrix[mat_xref[11]][mat_xref[2]]=transition_weight*matrix[0][0];
- matrix[mat_xref[2]][mat_xref[12]]=transition_weight*matrix[0][0];
- matrix[mat_xref[12]][mat_xref[2]]=transition_weight*matrix[0][0];
-
- gapcoef1 = gapcoef2 = 100.0 * gap_open *scale;
- lencoef1 = lencoef2 = 100.0 * gap_extend *scale;
- }
- else
- {
- if(len1==0 || len2==0) {
- logmin=1.0;
- logdiff=1.0;
- }
- else {
- minlen = MIN(len1,len2);
- logmin = 1.0/log10((double)minlen);
- if (len2<len1)
- logdiff = 1.0+0.5*log10((double)((float)len2/(float)len1));
- else if (len1<len2)
- logdiff = 1.0+0.5*log10((double)((float)len1/(float)len2));
- else logdiff=1.0;
- if(logdiff<0.9) logdiff=0.9;
- }
-if(debug>0) fprintf(stdout,"%d %d logmin %f logdiff %f\n",
-(pint)len1,(pint)len2, logmin,logdiff);
- scale=0.75;
- if (strcmp(mtrxname, "blosum") == 0)
- {
- scale=0.75;
- if (negative || distance_tree == FALSE) matptr = blosum40mt;
- else if (pcid > 80.0)
- {
- matptr = blosum80mt;
- }
- else if (pcid > 60.0)
- {
- matptr = blosum62mt2;
- }
- else if (pcid > 40.0)
- {
- matptr = blosum45mt;
- }
- else if (pcid > 30.0)
- {
- scale=0.5;
- matptr = blosum45mt;
- }
- else if (pcid > 20.0)
- {
- scale=0.6;
- matptr = blosum45mt;
- }
- else
- {
- scale=0.6;
- matptr = blosum30mt;
- }
- mat_xref = def_aa_xref;
-
- }
- else if (strcmp(mtrxname, "pam") == 0)
- {
- scale=0.75;
- if (negative || distance_tree == FALSE) matptr = pam120mt;
- else if (pcid > 80.0) matptr = pam20mt;
- else if (pcid > 60.0) matptr = pam60mt;
- else if (pcid > 40.0) matptr = pam120mt;
- else matptr = pam350mt;
- mat_xref = def_aa_xref;
- }
- else if (strcmp(mtrxname, "gonnet") == 0)
- {
- scale/=2.0;
- if (negative || distance_tree == FALSE) matptr = gon250mt;
- else if (pcid > 35.0)
- {
- matptr = gon80mt;
- scale/=2.0;
- }
- else if (pcid > 25.0)
- {
- if(minlen<100) matptr = gon250mt;
- else matptr = gon120mt;
- }
- else
- {
- if(minlen<100) matptr = gon350mt;
- else matptr = gon160mt;
- }
- mat_xref = def_aa_xref;
- int_scale /= 10;
- }
- else if (strcmp(mtrxname, "id") == 0)
- {
- matptr = idmat;
- mat_xref = def_aa_xref;
- }
- else if(user_series)
- {
- matptr=NULL;
- found=FALSE;
- for(i=0;i<matseries.nmat;i++)
- if(pcid>=matseries.mat[i].llimit && pcid<=matseries.mat[i].ulimit)
- {
- j=i;
- found=TRUE;
- break;
- }
- if(found==FALSE)
- {
- if(!error_given)
- warning(
-"\nSeries matrix not found for sequence percent identity = %d.\n"
-"(Using first matrix in series as a default.)\n"
-"This alignment may not be optimal!\n"
-"SUGGESTION: Check your matrix series input file and try again.",(int)pcid);
- error_given=TRUE;
- j=0;
- }
-if (debug>0) fprintf(stdout,"pcid %d matrix %d\n",(pint)pcid,(pint)j+1);
-
- matptr = matseries.mat[j].matptr;
- mat_xref = matseries.mat[j].aa_xref;
-/* this gives a scale of 0.5 for pcid=llimit and 1.0 for pcid=ulimit */
- scale=0.5+(pcid-matseries.mat[j].llimit)/((matseries.mat[j].ulimit-matseries.mat[j].llimit)*2.0);
- }
- else
- {
- matptr = usermat;
- mat_xref = aa_xref;
- }
-if(debug>0) fprintf(stdout,"pcid %3.1f scale %3.1f\n",pcid,scale);
- maxres = get_matrix(matptr, mat_xref, matrix, negative, int_scale);
- if (maxres == 0)
- {
- fprintf(stdout,"Error: matrix %s not found\n", mtrxname);
- return(-1);
- }
-
- if (negative) {
- gapcoef1 = gapcoef2 = 100.0 * (float)(gap_open);
- lencoef1 = lencoef2 = 100.0 * gap_extend;
- }
- else {
- if (mat_avscore <= 0)
- gapcoef1 = gapcoef2 = 100.0 * (float)(gap_open + logmin);
- else
- gapcoef1 = gapcoef2 = scale * mat_avscore * (float)(gap_open/(logdiff*logmin));
- lencoef1 = lencoef2 = 100.0 * gap_extend;
- }
- }
-if (debug>0)
-{
-fprintf(stdout,"matavscore %d\n",mat_avscore);
-fprintf(stdout,"Gap Open1 %d Gap Open2 %d Gap Extend1 %d Gap Extend2 %d\n",
- (pint)gapcoef1,(pint)gapcoef2, (pint)lencoef1,(pint)lencoef2);
-fprintf(stdout,"Matrix %s\n", mtrxname);
-}
-
- profile1 = (sint **) ckalloc( (prf_length1+2) * sizeof (sint *) );
- for(i=0; i<prf_length1+2; i++)
- profile1[i] = (sint *) ckalloc( (LENCOL+2) * sizeof(sint) );
-
- profile2 = (sint **) ckalloc( (prf_length2+2) * sizeof (sint *) );
- for(i=0; i<prf_length2+2; i++)
- profile2[i] = (sint *) ckalloc( (LENCOL+2) * sizeof(sint) );
-
-/*
- calculate the Gap Coefficients.
-*/
- gaps = (sint *) ckalloc( (max_aln_length+1) * sizeof (sint) );
-
- if (switch_profiles == FALSE)
- calc_gap_coeff(alignment, gaps, profile1, (struct_penalties1 && use_ss1), gap_penalty_mask1,
- (sint)0, nseqs1, prf_length1, gapcoef1, lencoef1);
- else
- calc_gap_coeff(alignment, gaps, profile1, (struct_penalties2 && use_ss2), gap_penalty_mask2,
- (sint)0, nseqs1, prf_length1, gapcoef1, lencoef1);
-/*
- calculate the profile matrix.
-*/
- calc_prf1(profile1, alignment, gaps, matrix,
- aln_weight, prf_length1, (sint)0, nseqs1);
-
-if (debug>4)
-{
-extern char *amino_acid_codes;
- for (j=0;j<=max_aa;j++)
- fprintf(stdout,"%c ", amino_acid_codes[j]);
- fprintf(stdout,"\n");
- for (i=0;i<prf_length1;i++)
- {
- for (j=0;j<=max_aa;j++)
- fprintf(stdout,"%d ", (pint)profile1[i+1][j]);
- fprintf(stdout,"%d ", (pint)profile1[i+1][gap_pos1]);
- fprintf(stdout,"%d ", (pint)profile1[i+1][gap_pos2]);
- fprintf(stdout,"%d %d\n",(pint)profile1[i+1][GAPCOL],(pint)profile1[i+1][LENCOL]);
- }
-}
-
-/*
- calculate the Gap Coefficients.
-*/
-
- if (switch_profiles == FALSE)
- calc_gap_coeff(alignment, gaps, profile2, (struct_penalties2 && use_ss2), gap_penalty_mask2,
- nseqs1, nseqs1+nseqs2, prf_length2, gapcoef2, lencoef2);
- else
- calc_gap_coeff(alignment, gaps, profile2, (struct_penalties1 && use_ss1), gap_penalty_mask1,
- nseqs1, nseqs1+nseqs2, prf_length2, gapcoef2, lencoef2);
-/*
- calculate the profile matrix.
-*/
- calc_prf2(profile2, alignment, aln_weight,
- prf_length2, nseqs1, nseqs1+nseqs2);
-
- aln_weight=ckfree((void *)aln_weight);
-
-if (debug>4)
-{
-extern char *amino_acid_codes;
- for (j=0;j<=max_aa;j++)
- fprintf(stdout,"%c ", amino_acid_codes[j]);
- fprintf(stdout,"\n");
- for (i=0;i<prf_length2;i++)
- {
- for (j=0;j<=max_aa;j++)
- fprintf(stdout,"%d ", (pint)profile2[i+1][j]);
- fprintf(stdout,"%d ", (pint)profile2[i+1][gap_pos1]);
- fprintf(stdout,"%d ", (pint)profile2[i+1][gap_pos2]);
- fprintf(stdout,"%d %d\n",(pint)profile2[i+1][GAPCOL],(pint)profile2[i+1][LENCOL]);
- }
-}
-
- aln_path1 = (char *) ckalloc( (max_aln_length+1) * sizeof(char) );
- aln_path2 = (char *) ckalloc( (max_aln_length+1) * sizeof(char) );
-
-
-/*
- align the profiles
-*/
-/* use Myers and Miller to align two sequences */
-
- last_print = 0;
- print_ptr = 1;
-
- sb1 = sb2 = 0;
- se1 = prf_length1;
- se2 = prf_length2;
-
- HH = (lint *) ckalloc( (max_aln_length+1) * sizeof (lint) );
- DD = (lint *) ckalloc( (max_aln_length+1) * sizeof (lint) );
- RR = (lint *) ckalloc( (max_aln_length+1) * sizeof (lint) );
- SS = (lint *) ckalloc( (max_aln_length+1) * sizeof (lint) );
- gS = (lint *) ckalloc( (max_aln_length+1) * sizeof (lint) );
- displ = (sint *) ckalloc( (max_aln_length+1) * sizeof (sint) );
-
- score = pdiff(sb1, sb2, se1-sb1, se2-sb2, profile1[0][GAPCOL], profile1[prf_length1][GAPCOL]);
-
- HH=ckfree((void *)HH);
- DD=ckfree((void *)DD);
- RR=ckfree((void *)RR);
- SS=ckfree((void *)SS);
- gS=ckfree((void *)gS);
-
- ptracepath( &alignment_len);
-
- displ=ckfree((void *)displ);
-
- add_ggaps();
-
- for (i=0;i<prf_length1+2;i++)
- profile1[i]=ckfree((void *)profile1[i]);
- profile1=ckfree((void *)profile1);
-
- for (i=0;i<prf_length2+2;i++)
- profile2[i]=ckfree((void *)profile2[i]);
- profile2=ckfree((void *)profile2);
-
- prf_length1 = alignment_len;
-
- aln_path1=ckfree((void *)aln_path1);
- aln_path2=ckfree((void *)aln_path2);
-
- NumSeq = 0;
- for (j=0;j<nseqs;j++)
- {
- if (group[j+1] == 1)
- {
- seqlen_array[j+1] = prf_length1;
- realloc_seq(j+1,prf_length1);
- for (i=0;i<prf_length1;i++)
- seq_array[j+1][i+1] = alignment[NumSeq][i];
- NumSeq++;
- }
- }
- for (j=0;j<nseqs;j++)
- {
- if (group[j+1] == 2)
- {
- seqlen_array[j+1] = prf_length1;
- seq_array[j+1] = (char *)realloc(seq_array[j+1], (prf_length1+2) * sizeof (char));
- realloc_seq(j+1,prf_length1);
- for (i=0;i<prf_length1;i++)
- seq_array[j+1][i+1] = alignment[NumSeq][i];
- NumSeq++;
- }
- }
-
- for (i=0;i<nseqs1+nseqs2;i++)
- alignment[i]=ckfree((void *)alignment[i]);
- alignment=ckfree((void *)alignment);
-
- aln_len=ckfree((void *)aln_len);
- gaps=ckfree((void *)gaps);
-
- return(score/100);
-}
-
-static void add_ggaps(void)
-{
- sint j;
- sint i,ix;
- sint len;
- char *ta;
-
- ta = (char *) ckalloc( (alignment_len+1) * sizeof (char) );
-
- for (j=0;j<nseqs1;j++)
- {
- ix = 0;
- for (i=0;i<alignment_len;i++)
- {
- if (aln_path1[i] == 2)
- {
- if (ix < aln_len[j])
- ta[i] = alignment[j][ix];
- else
- ta[i] = ENDALN;
- ix++;
- }
- else if (aln_path1[i] == 1)
- {
-/*
- insertion in first alignment...
-*/
- ta[i] = gap_pos1;
- }
- else
- {
- fprintf(stdout,"Error in aln_path\n");
- }
- }
- ta[i] = ENDALN;
-
- len = alignment_len;
- alignment[j] = (char *)realloc(alignment[j], (len+2) * sizeof (char));
- for (i=0;i<len;i++)
- alignment[j][i] = ta[i];
- alignment[j][len] = ENDALN;
- aln_len[j] = len;
- }
-
- for (j=nseqs1;j<nseqs1+nseqs2;j++)
- {
- ix = 0;
- for (i=0;i<alignment_len;i++)
- {
- if (aln_path2[i] == 2)
- {
- if (ix < aln_len[j])
- ta[i] = alignment[j][ix];
- else
- ta[i] = ENDALN;
- ix++;
- }
- else if (aln_path2[i] == 1)
- {
-/*
- insertion in second alignment...
-*/
- ta[i] = gap_pos1;
- }
- else
- {
- fprintf(stdout,"Error in aln_path\n");
- }
- }
- ta[i] = ENDALN;
-
- len = alignment_len;
- alignment[j] = (char *) realloc(alignment[j], (len+2) * sizeof (char) );
- for (i=0;i<len;i++)
- alignment[j][i] = ta[i];
- alignment[j][len] = ENDALN;
- aln_len[j] = len;
- }
-
- ta=ckfree((void *)ta);
-
- if (struct_penalties1 != NONE)
- gap_penalty_mask1 = add_ggaps_mask(gap_penalty_mask1,alignment_len,aln_path1,aln_path2);
- if (struct_penalties1 == SECST)
- sec_struct_mask1 = add_ggaps_mask(sec_struct_mask1,alignment_len,aln_path1,aln_path2);
-
- if (struct_penalties2 != NONE)
- gap_penalty_mask2 = add_ggaps_mask(gap_penalty_mask2,alignment_len,aln_path2,aln_path1);
- if (struct_penalties2 == SECST)
- sec_struct_mask2 = add_ggaps_mask(sec_struct_mask2,alignment_len,aln_path2,aln_path1);
-
-if (debug>0)
-{
- char c;
- extern char *amino_acid_codes;
-
- for (i=0;i<nseqs1+nseqs2;i++)
- {
- for (j=0;j<alignment_len;j++)
- {
- if (alignment[i][j] == ENDALN) break;
- else if ((alignment[i][j] == gap_pos1) || (alignment[i][j] == gap_pos2)) c = '-';
- else c = amino_acid_codes[alignment[i][j]];
- fprintf(stdout,"%c", c);
- }
- fprintf(stdout,"\n\n");
- }
-}
-
-}
-
-static char * add_ggaps_mask(char *mask, int len, char *path1, char *path2)
-{
- int i,ix;
- char *ta;
-
- ta = (char *) ckalloc( (len+1) * sizeof (char) );
-
- ix = 0;
- if (switch_profiles == FALSE)
- {
- for (i=0;i<len;i++)
- {
- if (path1[i] == 2)
- {
- ta[i] = mask[ix];
- ix++;
- }
- else if (path1[i] == 1)
- ta[i] = gap_pos1;
- }
- }
- else
- {
- for (i=0;i<len;i++)
- {
- if (path2[i] == 2)
- {
- ta[i] = mask[ix];
- ix++;
- }
- else if (path2[i] == 1)
- ta[i] = gap_pos1;
- }
- }
- mask = (char *)realloc(mask,(len+2) * sizeof (char));
- for (i=0;i<len;i++)
- mask[i] = ta[i];
- mask[i] ='\0';
-
- ta=ckfree((void *)ta);
-
- return(mask);
-}
-
-static lint prfscore(sint n, sint m)
-{
- sint ix;
- lint score;
-
- score = 0.0;
- for (ix=0; ix<=max_aa; ix++)
- {
- score += (profile1[n][ix] * profile2[m][ix]);
- }
- score += (profile1[n][gap_pos1] * profile2[m][gap_pos1]);
- score += (profile1[n][gap_pos2] * profile2[m][gap_pos2]);
- return(score/10);
-
-}
-
-static void ptracepath(sint *alen)
-{
- sint i,j,k,pos,to_do;
-
- pos = 0;
-
- to_do=print_ptr-1;
-
- for(i=1;i<=to_do;++i) {
-if (debug>1) fprintf(stdout,"%d ",(pint)displ[i]);
- if(displ[i]==0) {
- aln_path1[pos]=2;
- aln_path2[pos]=2;
- ++pos;
- }
- else {
- if((k=displ[i])>0) {
- for(j=0;j<=k-1;++j) {
- aln_path2[pos+j]=2;
- aln_path1[pos+j]=1;
- }
- pos += k;
- }
- else {
- k = (displ[i]<0) ? displ[i] * -1 : displ[i];
- for(j=0;j<=k-1;++j) {
- aln_path1[pos+j]=2;
- aln_path2[pos+j]=1;
- }
- pos += k;
- }
- }
- }
-if (debug>1) fprintf(stdout,"\n");
-
- (*alen) = pos;
-
-}
-
-static void pdel(sint k)
-{
- if(last_print<0)
- last_print = displ[print_ptr-1] -= k;
- else
- last_print = displ[print_ptr++] = -(k);
-}
-
-static void padd(sint k)
-{
-
- if(last_print<0) {
- displ[print_ptr-1] = k;
- displ[print_ptr++] = last_print;
- }
- else
- last_print = displ[print_ptr++] = k;
-}
-
-static void palign(void)
-{
- displ[print_ptr++] = last_print = 0;
-}
-
-
-static lint pdiff(sint A,sint B,sint M,sint N,sint go1, sint go2)
-{
- sint midi,midj,type;
- lint midh;
-
- static lint t, tl, g, h;
-
-{ static sint i,j;
- static lint hh, f, e, s;
-
-/* Boundary cases: M <= 1 or N == 0 */
-if (debug>2) fprintf(stdout,"A %d B %d M %d N %d midi %d go1 %d go2 %d\n",
-(pint)A,(pint)B,(pint)M,(pint)N,(pint)M/2,(pint)go1,(pint)go2);
-
-/* if sequence B is empty.... */
-
- if(N<=0) {
-
-/* if sequence A is not empty.... */
-
- if(M>0) {
-
-/* delete residues A[1] to A[M] */
-
- pdel(M);
- }
- return(-gap_penalty1(A,B,M));
- }
-
-/* if sequence A is empty.... */
-
- if(M<=1) {
- if(M<=0) {
-
-/* insert residues B[1] to B[N] */
-
- padd(N);
- return(-gap_penalty2(A,B,N));
- }
-
-/* if sequence A has just one residue.... */
-
- if (go1 == 0)
- midh = -gap_penalty1(A+1,B+1,N);
- else
- midh = -gap_penalty2(A+1,B,1)-gap_penalty1(A+1,B+1,N);
- midj = 0;
- for(j=1;j<=N;j++) {
- hh = -gap_penalty1(A,B+1,j-1) + prfscore(A+1,B+j)
- -gap_penalty1(A+1,B+j+1,N-j);
- if(hh>midh) {
- midh = hh;
- midj = j;
- }
- }
-
- if(midj==0) {
- padd(N);
- pdel(1);
- }
- else {
- if(midj>1) padd(midj-1);
- palign();
- if(midj<N) padd(N-midj);
- }
- return midh;
- }
-
-
-/* Divide sequence A in half: midi */
-
- midi = M / 2;
-
-/* In a forward phase, calculate all HH[j] and HH[j] */
-
- HH[0] = 0.0;
- t = -open_penalty1(A,B+1);
- tl = -ext_penalty1(A,B+1);
- for(j=1;j<=N;j++) {
- HH[j] = t = t+tl;
- DD[j] = t-open_penalty2(A+1,B+j);
- }
-
- if (go1 == 0) t = 0;
- else t = -open_penalty2(A+1,B);
- tl = -ext_penalty2(A+1,B);
- for(i=1;i<=midi;i++) {
- s = HH[0];
- HH[0] = hh = t = t+tl;
- f = t-open_penalty1(A+i,B+1);
-
- for(j=1;j<=N;j++) {
- g = open_penalty1(A+i,B+j);
- h = ext_penalty1(A+i,B+j);
- if ((hh=hh-g-h) > (f=f-h)) f=hh;
- g = open_penalty2(A+i,B+j);
- h = ext_penalty2(A+i,B+j);
- if ((hh=HH[j]-g-h) > (e=DD[j]-h)) e=hh;
- hh = s + prfscore(A+i, B+j);
- if (f>hh) hh = f;
- if (e>hh) hh = e;
-
- s = HH[j];
- HH[j] = hh;
- DD[j] = e;
-
- }
- }
-
- DD[0]=HH[0];
-
-/* In a reverse phase, calculate all RR[j] and SS[j] */
-
- RR[N]=0.0;
- tl = 0.0;
- for(j=N-1;j>=0;j--) {
- g = -open_penalty1(A+M,B+j+1);
- tl -= ext_penalty1(A+M,B+j+1);
- RR[j] = g+tl;
- SS[j] = RR[j]-open_penalty2(A+M,B+j);
- gS[j] = open_penalty2(A+M,B+j);
- }
-
- tl = 0.0;
- for(i=M-1;i>=midi;i--) {
- s = RR[N];
- if (go2 == 0) g = 0;
- else g = -open_penalty2(A+i+1,B+N);
- tl -= ext_penalty2(A+i+1,B+N);
- RR[N] = hh = g+tl;
- t = open_penalty1(A+i,B+N);
- f = RR[N]-t;
-
- for(j=N-1;j>=0;j--) {
- g = open_penalty1(A+i,B+j+1);
- h = ext_penalty1(A+i,B+j+1);
- if ((hh=hh-g-h) > (f=f-h-g+t)) f=hh;
- t = g;
- g = open_penalty2(A+i+1,B+j);
- h = ext_penalty2(A+i+1,B+j);
- hh=RR[j]-g-h;
- if (i==(M-1)) {
- e=SS[j]-h;
- }
- else {
- e=SS[j]-h-g+open_penalty2(A+i+2,B+j);
- gS[j] = g;
- }
- if (hh > e) e=hh;
- hh = s + prfscore(A+i+1, B+j+1);
- if (f>hh) hh = f;
- if (e>hh) hh = e;
-
- s = RR[j];
- RR[j] = hh;
- SS[j] = e;
-
- }
- }
- SS[N]=RR[N];
- gS[N] = open_penalty2(A+midi+1,B+N);
-
-/* find midj, such that HH[j]+RR[j] or DD[j]+SS[j]+gap is the maximum */
-
- midh=HH[0]+RR[0];
- midj=0;
- type=1;
- for(j=0;j<=N;j++) {
- hh = HH[j] + RR[j];
- if(hh>=midh)
- if(hh>midh || (HH[j]!=DD[j] && RR[j]==SS[j])) {
- midh=hh;
- midj=j;
- }
- }
-
- for(j=N;j>=0;j--) {
- hh = DD[j] + SS[j] + gS[j];
- if(hh>midh) {
- midh=hh;
- midj=j;
- type=2;
- }
- }
-}
-
-/* Conquer recursively around midpoint */
-
-
- if(type==1) { /* Type 1 gaps */
-if (debug>2) fprintf(stdout,"Type 1,1: midj %d\n",(pint)midj);
- pdiff(A,B,midi,midj,go1,1);
-if (debug>2) fprintf(stdout,"Type 1,2: midj %d\n",(pint)midj);
- pdiff(A+midi,B+midj,M-midi,N-midj,1,go2);
- }
- else {
-if (debug>2) fprintf(stdout,"Type 2,1: midj %d\n",(pint)midj);
- pdiff(A,B,midi-1,midj,go1, 0);
- pdel(2);
-if (debug>2) fprintf(stdout,"Type 2,2: midj %d\n",(pint)midj);
- pdiff(A+midi+1,B+midj,M-midi-1,N-midj,0,go2);
- }
-
- return midh; /* Return the score of the best alignment */
-}
-
-/* calculate the score for opening a gap at residues A[i] and B[j] */
-
-static sint open_penalty1(sint i, sint j)
-{
- sint g;
-
- if (!endgappenalties &&(i==0 || i==prf_length1)) return(0);
-
- g = profile2[j][GAPCOL] + profile1[i][GAPCOL];
- return(g);
-}
-
-/* calculate the score for extending an existing gap at A[i] and B[j] */
-
-static sint ext_penalty1(sint i, sint j)
-{
- sint h;
-
- if (!endgappenalties &&(i==0 || i==prf_length1)) return(0);
-
- h = profile2[j][LENCOL];
- return(h);
-}
-
-/* calculate the score for a gap of length k, at residues A[i] and B[j] */
-
-static sint gap_penalty1(sint i, sint j, sint k)
-{
- sint ix;
- sint gp;
- sint g, h = 0;
-
- if (k <= 0) return(0);
- if (!endgappenalties &&(i==0 || i==prf_length1)) return(0);
-
- g = profile2[j][GAPCOL] + profile1[i][GAPCOL];
- for (ix=0;ix<k && ix+j<prf_length2;ix++)
- h += profile2[ix+j][LENCOL];
-
- gp = g + h;
- return(gp);
-}
-/* calculate the score for opening a gap at residues A[i] and B[j] */
-
-static sint open_penalty2(sint i, sint j)
-{
- sint g;
-
- if (!endgappenalties &&(j==0 || j==prf_length2)) return(0);
-
- g = profile1[i][GAPCOL] + profile2[j][GAPCOL];
- return(g);
-}
-
-/* calculate the score for extending an existing gap at A[i] and B[j] */
-
-static sint ext_penalty2(sint i, sint j)
-{
- sint h;
-
- if (!endgappenalties &&(j==0 || j==prf_length2)) return(0);
-
- h = profile1[i][LENCOL];
- return(h);
-}
-
-/* calculate the score for a gap of length k, at residues A[i] and B[j] */
-
-static sint gap_penalty2(sint i, sint j, sint k)
-{
- sint ix;
- sint gp;
- sint g, h = 0;
-
- if (k <= 0) return(0);
- if (!endgappenalties &&(j==0 || j==prf_length2)) return(0);
-
- g = profile1[i][GAPCOL] + profile2[j][GAPCOL];
- for (ix=0;ix<k && ix+i<prf_length1;ix++)
- h += profile1[ix+i][LENCOL];
-
- gp = g + h;
- return(gp);
-}
Deleted: trunk/packages/clustalw/trunk/random.c
===================================================================
--- trunk/packages/clustalw/trunk/random.c 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/random.c 2007-08-12 15:08:40 UTC (rev 399)
@@ -1,81 +0,0 @@
-/*
-*
-* Rand.c
-*
-* - linear and additive congruential random number generators
-* (see R. Sedgewick, Algorithms, Chapter 35)
-*
-* Implementation: R. Fuchs, EMBL Data Library, 1991
-*
-*/
-#include <stdio.h>
-
-unsigned long linrand(unsigned long r);
-unsigned long addrand(unsigned long r);
-void addrandinit(unsigned long s);
-
-static unsigned long mult(unsigned long p,unsigned long q);
-
-
-#define m1 10000
-#define m 100000000
-
-static unsigned long mult(unsigned long p, unsigned long q);
-
-/* linear congruential method
-*
-* linrand() returns an unsigned long random number in the range 0 to r-1
-*/
-
-
-unsigned long linrand(unsigned long r)
-{
- static unsigned long a=1234567;
-
- a = (mult(a,31415821)+1) % m;
- return( ( (a / m1) * r) / m1 );
-}
-
-static unsigned long mult(unsigned long p, unsigned long q)
-{
- unsigned long p1,p0,q1,q0;
-
- p1 = p/m1; p0 = p % m1;
- q1 = q/m1; q0 = q % m1;
- return((((p0*q1 + p1*q0) % m1) * m1 + p0*q0) % m);
-}
-
-
-/* additive congruential method
-*
-* addrand() returns an unsigned long random number in the range 0 to r-1
-* The random number generator is initialized by addrandinit()
-*/
-
-static unsigned long j;
-static unsigned long a[55];
-
-unsigned long addrand(unsigned long r)
-{
-int x,y;
-/* fprintf(stdout,"\n j = %d",j); */
- j = (j + 1) % 55;
-/* fprintf(stdout,"\n j = %d",j); */
- x = (j+23)%55;
- y = (j+54)%55;
- a[j] = (a[x] + a[y]) % m;
-/* a[j] = (a[(j+23)%55] + a[(j+54)%55]) % m; */
-/* fprintf(stdout,"\n a[j] = %d",a[j]); */
- return( ((a[j] / m1) * r) / m1 );
-}
-
-void addrandinit(unsigned long s)
-{
- a[0] = s;
- j = 0;
- do {
- ++j;
- a[j] = (mult(31,a[j-1]) + 1) % m;
- } while (j<54);
-}
-
Deleted: trunk/packages/clustalw/trunk/readmat.c
===================================================================
--- trunk/packages/clustalw/trunk/readmat.c 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/readmat.c 2007-08-12 15:08:40 UTC (rev 399)
@@ -1,476 +0,0 @@
-#include <stdio.h>
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-#include <ctype.h>
-#include "clustalw.h"
-#include "matrices.h"
-
-
-/*
- * Prototypes
- */
-static Boolean commentline(char *line);
-
-
-/*
- * Global variables
- */
-
-extern char *amino_acid_codes;
-extern sint gap_pos1, gap_pos2;
-extern sint max_aa;
-extern short def_dna_xref[],def_aa_xref[];
-extern sint mat_avscore;
-extern sint debug;
-extern Boolean dnaflag;
-
-extern Boolean user_series;
-extern UserMatSeries matseries;
-extern short usermatseries[MAXMAT][NUMRES][NUMRES];
-extern short aa_xrefseries[MAXMAT][NUMRES+1];
-
-
-void init_matrix(void)
-{
-
- char c1,c2;
- short i, j, maxres;
-
- max_aa = strlen(amino_acid_codes)-2;
- gap_pos1 = NUMRES-2; /* code for gaps inserted by clustalw */
- gap_pos2 = NUMRES-1; /* code for gaps already in alignment */
-
-/*
- set up cross-reference for default matrices hard-coded in matrices.h
-*/
- for (i=0;i<NUMRES;i++) def_aa_xref[i] = -1;
- for (i=0;i<NUMRES;i++) def_dna_xref[i] = -1;
-
- maxres = 0;
- for (i=0;(c1=amino_acid_order[i]);i++)
- {
- for (j=0;(c2=amino_acid_codes[j]);j++)
- {
- if (c1 == c2)
- {
- def_aa_xref[i] = j;
- maxres++;
- break;
- }
- }
- if ((def_aa_xref[i] == -1) && (amino_acid_order[i] != '*'))
- {
- error("residue %c in matrices.h is not recognised",
- amino_acid_order[i]);
- }
- }
-
- maxres = 0;
- for (i=0;(c1=nucleic_acid_order[i]);i++)
- {
- for (j=0;(c2=amino_acid_codes[j]);j++)
- {
- if (c1 == c2)
- {
- def_dna_xref[i] = j;
- maxres++;
- break;
- }
- }
- if ((def_dna_xref[i] == -1) && (nucleic_acid_order[i] != '*'))
- {
- error("nucleic acid %c in matrices.h is not recognised",
- nucleic_acid_order[i]);
- }
- }
-}
-
-sint get_matrix(short *matptr, short *xref, sint matrix[NUMRES][NUMRES], Boolean neg_flag, sint scale)
-{
- sint gg_score = 0;
- sint gr_score = 0;
- sint i, j, k, ix = 0;
- sint ti, tj;
- sint maxres;
- sint av1,av2,av3,min, max;
-/*
- default - set all scores to 0
-*/
- for (i=0;i<=max_aa;i++)
- for (j=0;j<=max_aa;j++)
- matrix[i][j] = 0;
-
- ix = 0;
- maxres = 0;
- for (i=0;i<=max_aa;i++)
- {
- ti = xref[i];
- for (j=0;j<=i;j++)
- {
- tj = xref[j];
- if ((ti != -1) && (tj != -1))
- {
- k = matptr[ix];
- if (ti==tj)
- {
- matrix[ti][ti] = k * scale;
- maxres++;
- }
- else
- {
- matrix[ti][tj] = k * scale;
- matrix[tj][ti] = k * scale;
- }
- ix++;
- }
- }
- }
-
- --maxres;
-
- av1 = av2 = av3 = 0;
- for (i=0;i<=max_aa;i++)
- {
- for (j=0;j<=i;j++)
- {
- av1 += matrix[i][j];
- if (i==j)
- {
- av2 += matrix[i][j];
- }
- else
- {
- av3 += matrix[i][j];
- }
- }
- }
-
- av1 /= (maxres*maxres)/2;
- av2 /= maxres;
- av3 /= ((float)(maxres*maxres-maxres))/2;
- mat_avscore = -av3;
-
- min = max = matrix[0][0];
- for (i=0;i<=max_aa;i++)
- for (j=1;j<=i;j++)
- {
- if (matrix[i][j] < min) min = matrix[i][j];
- if (matrix[i][j] > max) max = matrix[i][j];
- }
-if (debug>1) fprintf(stdout,"maxres %d\n",(pint)max_aa);
-if (debug>1) fprintf(stdout,"average mismatch score %d\n",(pint)av3);
-if (debug>1) fprintf(stdout,"average match score %d\n",(pint)av2);
-if (debug>1) fprintf(stdout,"average score %d\n",(pint)av1);
-
-/*
- if requested, make a positive matrix - add -(lowest score) to every entry
-*/
- if (neg_flag == FALSE)
- {
-
-if (debug>1) fprintf(stdout,"min %d max %d\n",(pint)min,(pint)max);
- if (min < 0)
- {
- for (i=0;i<=max_aa;i++)
- {
- ti = xref[i];
- if (ti != -1)
- {
- for (j=0;j<=max_aa;j++)
- {
- tj = xref[j];
-/*
- if (tj != -1) matrix[ti][tj] -= (2*av3);
-*/
- if (tj != -1) matrix[ti][tj] -= min;
- }
- }
- }
- }
-/*
- gr_score = av3;
- gg_score = -av3;
-*/
-
- }
-
-
-
- for (i=0;i<gap_pos1;i++)
- {
- matrix[i][gap_pos1] = gr_score;
- matrix[gap_pos1][i] = gr_score;
- matrix[i][gap_pos2] = gr_score;
- matrix[gap_pos2][i] = gr_score;
- }
- matrix[gap_pos1][gap_pos1] = gg_score;
- matrix[gap_pos2][gap_pos2] = gg_score;
- matrix[gap_pos2][gap_pos1] = gg_score;
- matrix[gap_pos1][gap_pos2] = gg_score;
-
- maxres += 2;
-
- return(maxres);
-}
-
-
-sint read_matrix_series(char *filename, short *usermat, short *xref)
-{
- FILE *fd = NULL, *matfd = NULL;
- char mat_filename[FILENAMELEN];
- char inline1[1024];
- sint maxres = 0;
- sint nmat;
- sint n,llimit,ulimit;
-
- if (filename[0] == '\0')
- {
- error("comparison matrix not specified");
- return((sint)0);
- }
- if ((fd=fopen(filename,"r"))==NULL)
- {
- error("cannot open %s", filename);
- return((sint)0);
- }
-
-/* check the first line to see if it's a series or a single matrix */
- while (fgets(inline1,1024,fd) != NULL)
- {
- if (commentline(inline1)) continue;
- if(linetype(inline1,"CLUSTAL_SERIES"))
- user_series=TRUE;
- else
- user_series=FALSE;
- break;
- }
-
-/* it's a single matrix */
- if(user_series == FALSE)
- {
- fclose(fd);
- maxres=read_user_matrix(filename,usermat,xref);
- return(maxres);
- }
-
-/* it's a series of matrices, find the next MATRIX line */
- nmat=0;
- matseries.nmat=0;
- while (fgets(inline1,1024,fd) != NULL)
- {
- if (commentline(inline1)) continue;
- if(linetype(inline1,"MATRIX"))
- {
- if(sscanf(inline1+6,"%d %d %s",&llimit,&ulimit,mat_filename)!=3)
- {
- error("Bad format in file %s\n",filename);
- fclose(fd);
- return((sint)0);
- }
- if(llimit<0 || llimit > 100 || ulimit <0 || ulimit>100)
- {
- error("Bad format in file %s\n",filename);
- fclose(fd);
- return((sint)0);
- }
- if(ulimit<=llimit)
- {
- error("in file %s: lower limit is greater than upper (%d-%d)\n",filename,llimit,ulimit);
- fclose(fd);
- return((sint)0);
- }
- n=read_user_matrix(mat_filename,&usermatseries[nmat][0][0],&aa_xrefseries[nmat][0]);
- if(n<=0)
- {
- error("Bad format in matrix file %s\n",mat_filename);
- fclose(fd);
- return((sint)0);
- }
- matseries.mat[nmat].llimit=llimit;
- matseries.mat[nmat].ulimit=ulimit;
- matseries.mat[nmat].matptr=&usermatseries[nmat][0][0];
- matseries.mat[nmat].aa_xref=&aa_xrefseries[nmat][0];
- nmat++;
- }
- }
- fclose(fd);
- matseries.nmat=nmat;
-
- maxres=n;
- return(maxres);
-
-}
-
-sint read_user_matrix(char *filename, short *usermat, short *xref)
-{
- double f;
- FILE *fd;
- sint numargs,farg;
- sint i, j, k = 0;
- char codes[NUMRES];
- char inline1[1024];
- char *args[NUMRES+4];
- char c1,c2;
- sint ix1, ix = 0;
- sint maxres = 0;
- float scale;
-
- if (filename[0] == '\0')
- {
- error("comparison matrix not specified");
- return((sint)0);
- }
-
- if ((fd=fopen(filename,"r"))==NULL)
- {
- error("cannot open %s", filename);
- return((sint)0);
- }
- maxres = 0;
- while (fgets(inline1,1024,fd) != NULL)
- {
- if (commentline(inline1)) continue;
- if(linetype(inline1,"CLUSTAL_SERIES"))
- {
- error("in %s - single matrix expected.", filename);
- fclose(fd);
- return((sint)0);
- }
-/*
- read residue characters.
-*/
- k = 0;
- for (j=0;j<strlen(inline1);j++)
- {
- if (isalpha((int)inline1[j])) codes[k++] = inline1[j];
- if (k>NUMRES)
- {
- error("too many entries in matrix %s",filename);
- fclose(fd);
- return((sint)0);
- }
- }
- codes[k] = '\0';
- break;
- }
-
- if (k == 0)
- {
- error("wrong format in matrix %s",filename);
- fclose(fd);
- return((sint)0);
- }
-
-/*
- cross-reference the residues
-*/
- for (i=0;i<NUMRES;i++) xref[i] = -1;
-
- maxres = 0;
- for (i=0;(c1=codes[i]);i++)
- {
- for (j=0;(c2=amino_acid_codes[j]);j++)
- if (c1 == c2)
- {
- xref[i] = j;
- maxres++;
- break;
- }
- if ((xref[i] == -1) && (codes[i] != '*'))
- {
- warning("residue %c in matrix %s not recognised",
- codes[i],filename);
- }
- }
-
-
-/*
- get the weights
-*/
-
- ix = ix1 = 0;
- while (fgets(inline1,1024,fd) != NULL)
- {
- if (inline1[0] == '\n') continue;
- if (inline1[0] == '#' ||
- inline1[0] == '!') break;
- numargs = getargs(inline1, args, (int)(k+1));
- if (numargs < maxres)
- {
- error("wrong format in matrix %s",filename);
- fclose(fd);
- return((sint)0);
- }
- if (isalpha(args[0][0])) farg=1;
- else farg=0;
-
-/* decide whether the matrix values are float or decimal */
- scale=1.0;
- for(i=0;i<strlen(args[farg]);i++)
- if(args[farg][i]=='.')
- {
-/* we've found a float value */
- scale=10.0;
- break;
- }
-
- for (i=0;i<=ix;i++)
- {
- if (xref[i] != -1)
- {
- f = atof(args[i+farg]);
- usermat[ix1++] = (short)(f*scale);
- }
- }
- ix++;
- }
- if (ix != k+1)
- {
- error("wrong format in matrix %s",filename);
- fclose(fd);
- return((sint)0);
- }
-
-
- maxres += 2;
- fclose(fd);
-
- return(maxres);
-}
-
-int getargs(char *inline1,char *args[],int max)
-{
-
- char *inptr;
-/*
-#ifndef MAC
- char *strtok(char *s1, const char *s2);
-#endif
-*/
- int i;
-
- inptr=inline1;
- for (i=0;i<=max;i++)
- {
- if ((args[i]=strtok(inptr," \t\n"))==NULL)
- break;
- inptr=NULL;
- }
-
- return(i);
-}
-
-
-static Boolean commentline(char *line)
-{
- int i;
-
- if(line[0] == '#') return TRUE;
- for(i=0;line[i]!='\n' && line[i]!=EOS;i++) {
- if(!isspace(line[i]))
- return FALSE;
- }
- return TRUE;
-}
-
Deleted: trunk/packages/clustalw/trunk/sequence.c
===================================================================
--- trunk/packages/clustalw/trunk/sequence.c 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/sequence.c 2007-08-12 15:08:40 UTC (rev 399)
@@ -1,1379 +0,0 @@
-/********* Sequence input routines for CLUSTAL W *******************/
-/* DES was here. FEB. 1994 */
-/* Now reads PILEUP/MSF and CLUSTAL alignment files */
-
-#include <stdio.h>
-#include <string.h>
-#include <ctype.h>
-#include <stdlib.h>
-#include "clustalw.h"
-
-#define MIN(a,b) ((a)<(b)?(a):(b))
-
-
-
-/*
-* Prototypes
-*/
-
-static char * get_seq(char *,sint *,char *);
-static char * get_clustal_seq(char *,sint *,char *,sint);
-static char * get_msf_seq(char *,sint *,char *,sint);
-static void check_infile(sint *);
-static void p_encode(char *, char *, sint);
-static void n_encode(char *, char *, sint);
-static sint res_index(char *,char);
-static Boolean check_dnaflag(char *, sint);
-static sint count_clustal_seqs(void);
-static sint count_pir_seqs(void);
-static sint count_msf_seqs(void);
-static sint count_rsf_seqs(void);
-static void get_swiss_feature(char *line,sint len);
-static void get_rsf_feature(char *line,sint len);
-static void get_swiss_mask(char *line,sint len);
-static void get_clustal_ss(sint length);
-static void get_embl_ss(sint length);
-static void get_rsf_ss(sint length);
-static void get_gde_ss(sint length);
-static Boolean cl_blankline(char *line);
-
-/*
- * Global variables
- */
-extern sint max_names;
-FILE *fin;
-extern Boolean usemenu, dnaflag, explicit_dnaflag;
-extern Boolean interactive;
-extern char seqname[];
-extern sint nseqs;
-extern sint *seqlen_array;
-extern sint *output_index;
-extern char **names,**titles;
-extern char **seq_array;
-extern Boolean profile1_empty, profile2_empty;
-extern sint gap_pos2;
-extern sint max_aln_length;
-extern char *gap_penalty_mask, *sec_struct_mask;
-extern sint struct_penalties;
-extern char *ss_name;
-extern sint profile_no;
-extern sint debug;
-
-char *amino_acid_codes = "ABCDEFGHIKLMNPQRSTUVWXYZ-"; /* DES */
-static sint seqFormat;
-static char chartab[128];
-static char *formatNames[] = {"unknown","EMBL/Swiss-Prot","PIR",
- "Pearson","GDE","Clustal","Pileup/MSF","RSF","USER","PHYLIP","NEXUS"};
-
-void fill_chartab(void) /* Create translation and check table */
-{
- register sint i;
- register char c;
-
- for(i=0;i<128;chartab[i++]=0);
- for(i=0;(c=amino_acid_codes[i]);i++)
- chartab[(int)c]=chartab[tolower(c)]=c;
-}
-
-static char * get_msf_seq(char *sname,sint *len,char *tit,sint seqno)
-/* read the seqno_th. sequence from a PILEUP multiple alignment file */
-{
- static char line[MAXLINE+1];
- char *seq = NULL;
- sint i,j,k;
- unsigned char c;
-
- fseek(fin,0,0); /* start at the beginning */
-
- *len=0; /* initialise length to zero */
- for(i=0;;i++) {
- if(fgets(line,MAXLINE+1,fin)==NULL) return NULL; /* read the title*/
- if(linetype(line,"//") ) break; /* lines...ignore*/
- }
-
- while (fgets(line,MAXLINE+1,fin) != NULL) {
- if(!blankline(line)) {
-
- for(i=1;i<seqno;i++) fgets(line,MAXLINE+1,fin);
- for(j=0;j<=strlen(line);j++) if(line[j] != ' ') break;
- for(k=j;k<=strlen(line);k++) if(line[k] == ' ') break;
- strncpy(sname,line+j,MIN(MAXNAMES,k-j));
- sname[MIN(MAXNAMES,k-j)]=EOS;
- rtrim(sname);
- blank_to_(sname);
-
- if(seq==NULL)
- seq=(char *)ckalloc((MAXLINE+2)*sizeof(char));
- else
- seq=(char *)ckrealloc(seq,((*len)+MAXLINE+2)*sizeof(char));
- for(i=k;i<=MAXLINE;i++) {
- c=line[i];
- if(c == '.' || c == '~' ) c = '-';
- if(c == '*') c = 'X';
- if(c == '\n' || c == EOS) break; /* EOL */
- c=chartab[c];
- if(c) seq[++(*len)]=c;
- }
-
- for(i=0;;i++) {
- if(fgets(line,MAXLINE+1,fin)==NULL) return seq;
- if(blankline(line)) break;
- }
- }
- }
- return seq;
-}
-
-static Boolean cl_blankline(char *line)
-{
- int i;
-
- if (line[0] == '!') return TRUE;
-
- for(i=0;line[i]!='\n' && line[i]!=EOS;i++) {
- if( isdigit(line[i]) ||
- isspace(line[i]) ||
- (line[i] == '*') ||
- (line[i] == ':') ||
- (line[i] == '.'))
- ;
- else
- return FALSE;
- }
- return TRUE;
-}
-
-static char * get_clustal_seq(char *sname,sint *len,char *tit,sint seqno)
-/* read the seqno_th. sequence from a clustal multiple alignment file */
-{
- static char line[MAXLINE+1];
- static char tseq[MAXLINE+1];
- char *seq = NULL;
- sint i,j;
- unsigned char c;
-
- fseek(fin,0,0); /* start at the beginning */
-
- *len=0; /* initialise length to zero */
- fgets(line,MAXLINE+1,fin); /* read the title line...ignore it */
-
- while (fgets(line,MAXLINE+1,fin) != NULL) {
- if(!cl_blankline(line)) {
-
- for(i=1;i<seqno;i++) fgets(line,MAXLINE+1,fin);
- for(j=0;j<=strlen(line);j++) if(line[j] != ' ') break;
-
- sscanf(line,"%s%s",sname,tseq);
- for(j=0;j<MAXNAMES;j++) if(sname[j] == ' ') break;
- sname[j]=EOS;
- rtrim(sname);
- blank_to_(sname);
-
- if(seq==NULL)
- seq=(char *)ckalloc((MAXLINE+2)*sizeof(char));
- else
- seq=(char *)ckrealloc(seq,((*len)+MAXLINE+2)*sizeof(char));
- for(i=0;i<=MAXLINE;i++) {
- c=tseq[i];
- /*if(c == '\n' || c == EOS) break;*/ /* EOL */
- if(isspace(c) || c == EOS) break; /* EOL */
- c=chartab[c];
- if(c) seq[++(*len)]=c;
- }
-
- for(i=0;;i++) {
- if(fgets(line,MAXLINE+1,fin)==NULL) return seq;
- if(cl_blankline(line)) break;
- }
- }
- }
-
- return seq;
-}
-
-static void get_clustal_ss(sint length)
-/* read the structure data from a clustal multiple alignment file */
-{
- static char title[MAXLINE+1];
- static char line[MAXLINE+1];
- static char lin2[MAXLINE+1];
- static char tseq[MAXLINE+1];
- static char sname[MAXNAMES+1];
- sint i,j,len,ix,struct_index=0;
- char c;
-
-
- fseek(fin,0,0); /* start at the beginning */
-
- len=0; /* initialise length to zero */
- if (fgets(line,MAXLINE+1,fin) == NULL) return; /* read the title line...ignore it */
-
- if (fgets(line,MAXLINE+1,fin) == NULL) return; /* read the next line... */
-/* skip any blank lines */
- for (;;) {
- if(fgets(line,MAXLINE+1,fin)==NULL) return;
- if(!blankline(line)) break;
- }
-
-/* look for structure table lines */
- ix = -1;
- for(;;) {
- if(line[0] != '!') break;
- if(strncmp(line,"!SS",3) == 0) {
- ix++;
- sscanf(line+4,"%s%s",sname,tseq);
- for(j=0;j<MAXNAMES;j++) if(sname[j] == ' ') break;
- sname[j]=EOS;
- rtrim(sname);
- blank_to_(sname);
- if (interactive) {
- strcpy(title,"Found secondary structure in alignment file: ");
- strcat(title,sname);
- (*lin2)=prompt_for_yes_no(title,"Use it to set local gap penalties ");
- }
- else (*lin2) = 'y';
- if ((*lin2 != 'n') && (*lin2 != 'N')) {
- struct_penalties = SECST;
- struct_index = ix;
- for (i=0;i<length;i++)
- {
- sec_struct_mask[i] = '.';
- gap_penalty_mask[i] = '.';
- }
- strcpy(ss_name,sname);
- for(i=0;len < length;i++) {
- c = tseq[i];
- if(c == '\n' || c == EOS) break; /* EOL */
- if (!isspace(c)) sec_struct_mask[len++] = c;
- }
- }
- }
- else if(strncmp(line,"!GM",3) == 0) {
- ix++;
- sscanf(line+4,"%s%s",sname,tseq);
- for(j=0;j<MAXNAMES;j++) if(sname[j] == ' ') break;
- sname[j]=EOS;
- rtrim(sname);
- blank_to_(sname);
- if (interactive) {
- strcpy(title,"Found gap penalty mask in alignment file: ");
- strcat(title,sname);
- (*lin2)=prompt_for_yes_no(title,"Use it to set local gap penalties ");
- }
- else (*lin2) = 'y';
- if ((*lin2 != 'n') && (*lin2 != 'N')) {
- struct_penalties = GMASK;
- struct_index = ix;
- for (i=0;i<length;i++)
- gap_penalty_mask[i] = '1';
- strcpy(ss_name,sname);
- for(i=0;len < length;i++) {
- c = tseq[i];
- if(c == '\n' || c == EOS) break; /* EOL */
- if (!isspace(c)) gap_penalty_mask[len++] = c;
- }
- }
- }
- if (struct_penalties != NONE) break;
- if(fgets(line,MAXLINE+1,fin)==NULL) return;
- }
-
- if (struct_penalties == NONE) return;
-
-/* skip any more comment lines */
- while (line[0] == '!') {
- if(fgets(line,MAXLINE+1,fin)==NULL) return;
- }
-
-/* skip the sequence lines and any comments after the alignment */
- for (;;) {
- if(isspace(line[0])) break;
- if(fgets(line,MAXLINE+1,fin)==NULL) return;
- }
-
-
-/* read the rest of the alignment */
-
- for (;;) {
-/* skip any blank lines */
- for (;;) {
- if(!blankline(line)) break;
- if(fgets(line,MAXLINE+1,fin)==NULL) return;
- }
-/* get structure table line */
- for(ix=0;ix<struct_index;ix++) {
- if (line[0] != '!') {
- if(struct_penalties == SECST)
- error("bad secondary structure format");
- else
- error("bad gap penalty mask format");
- struct_penalties = NONE;
- return;
- }
- if(fgets(line,MAXLINE+1,fin)==NULL) return;
- }
- if(struct_penalties == SECST) {
- if (strncmp(line,"!SS",3) != 0) {
- error("bad secondary structure format");
- struct_penalties = NONE;
- return;
- }
- sscanf(line+4,"%s%s",sname,tseq);
- for(i=0;len < length;i++) {
- c = tseq[i];
- if(c == '\n' || c == EOS) break; /* EOL */
- if (!isspace(c)) sec_struct_mask[len++] = c;
- }
- }
- else if (struct_penalties == GMASK) {
- if (strncmp(line,"!GM",3) != 0) {
- error("bad gap penalty mask format");
- struct_penalties = NONE;
- return;
- }
- sscanf(line+4,"%s%s",sname,tseq);
- for(i=0;len < length;i++) {
- c = tseq[i];
- if(c == '\n' || c == EOS) break; /* EOL */
- if (!isspace(c)) gap_penalty_mask[len++] = c;
- }
- }
-
-/* skip any more comment lines */
- while (line[0] == '!') {
- if(fgets(line,MAXLINE+1,fin)==NULL) return;
- }
-
-/* skip the sequence lines */
- for (;;) {
- if(isspace(line[0])) break;
- if(fgets(line,MAXLINE+1,fin)==NULL) return;
- }
- }
-}
-
-static void get_embl_ss(sint length)
-{
- static char title[MAXLINE+1];
- static char line[MAXLINE+1];
- static char lin2[MAXLINE+1];
- static char sname[MAXNAMES+1];
- char feature[MAXLINE+1];
- sint i;
-
-/* find the start of the sequence entry */
- for (;;) {
- while( !linetype(line,"ID") )
- if (fgets(line,MAXLINE+1,fin) == NULL) return;
-
- for(i=5;i<=strlen(line);i++) /* DES */
- if(line[i] != ' ') break;
- strncpy(sname,line+i,MAXNAMES); /* remember entryname */
- for(i=0;i<=strlen(sname);i++)
- if(sname[i] == ' ') {
- sname[i]=EOS;
- break;
- }
- sname[MAXNAMES]=EOS;
- rtrim(sname);
- blank_to_(sname);
-
-/* look for secondary structure feature table / gap penalty mask */
- while(fgets(line,MAXLINE+1,fin) != NULL) {
- if (linetype(line,"FT")) {
- sscanf(line+2,"%s",feature);
- if (strcmp(feature,"HELIX") == 0 ||
- strcmp(feature,"STRAND") == 0)
- {
-
- if (interactive) {
- strcpy(title,"Found secondary structure in alignment file: ");
- strcat(title,sname);
- (*lin2)=prompt_for_yes_no(title,"Use it to set local gap penalties ");
- }
- else (*lin2) = 'y';
- if ((*lin2 != 'n') && (*lin2 != 'N')) {
- struct_penalties = SECST;
- for (i=0;i<length;i++)
- sec_struct_mask[i] = '.';
- do {
- get_swiss_feature(&line[2],length);
- fgets(line,MAXLINE+1,fin);
- } while( linetype(line,"FT") );
- }
- else {
- do {
- fgets(line,MAXLINE+1,fin);
- } while( linetype(line,"FT") );
- }
- strcpy(ss_name,sname);
- }
- }
- else if (linetype(line,"GM")) {
- if (interactive) {
- strcpy(title,"Found gap penalty mask in alignment file: ");
- strcat(title,sname);
- (*lin2)=prompt_for_yes_no(title,"Use it to set local gap penalties ");
- }
- else (*lin2) = 'y';
- if ((*lin2 != 'n') && (*lin2 != 'N')) {
- struct_penalties = GMASK;
- for (i=0;i<length;i++)
- gap_penalty_mask[i] = '1';
- do {
- get_swiss_mask(&line[2],length);
- fgets(line,MAXLINE+1,fin);
- } while( linetype(line,"GM") );
- }
- else {
- do {
- fgets(line,MAXLINE+1,fin);
- } while( linetype(line,"GM") );
- }
- strcpy(ss_name,sname);
- }
- if (linetype(line,"SQ"))
- break;
-
- if (struct_penalties != NONE) break;
- }
-
- }
-
-}
-
-static void get_rsf_ss(sint length)
-{
- static char title[MAXLINE+1];
- static char line[MAXLINE+1];
- static char lin2[MAXLINE+1];
- static char sname[MAXNAMES+1];
- sint i;
-
-/* skip the comments */
- while (fgets(line,MAXLINE+1,fin) != NULL) {
- if(line[strlen(line)-2]=='.' &&
- line[strlen(line)-3]=='.')
- break;
- }
-
-/* find the start of the sequence entry */
- for (;;) {
- while (fgets(line,MAXLINE+1,fin) != NULL)
- if( *line == '{' ) break;
-
- while( !keyword(line,"name") )
- if (fgets(line,MAXLINE+1,fin) == NULL) return;
-
- for(i=5;i<=strlen(line);i++) /* DES */
- if(line[i] != ' ') break;
- strncpy(sname,line+i,MAXNAMES); /* remember entryname */
- for(i=0;i<=strlen(sname);i++)
- if(sname[i] == ' ') {
- sname[i]=EOS;
- break;
- }
- sname[MAXNAMES]=EOS;
- rtrim(sname);
- blank_to_(sname);
-
-/* look for secondary structure feature table / gap penalty mask */
- while(fgets(line,MAXLINE+1,fin) != NULL) {
- if (keyword(line,"feature")) {
- if (interactive) {
- strcpy(title,"Found secondary structure in alignment file: ");
- strcat(title,sname);
- (*lin2)=prompt_for_yes_no(title,"Use it to set local gap penalties ");
- }
- else (*lin2) = 'y';
- if ((*lin2 != 'n') && (*lin2 != 'N')) {
- struct_penalties = SECST;
- for (i=0;i<length;i++)
- sec_struct_mask[i] = '.';
- do {
- if(keyword(line,"feature"))
- get_rsf_feature(&line[7],length);
- fgets(line,MAXLINE+1,fin);
- } while( !keyword(line,"sequence") );
- }
- else {
- do {
- fgets(line,MAXLINE+1,fin);
- } while( !keyword(line,"sequence") );
- }
- strcpy(ss_name,sname);
- }
- else if (keyword(line,"sequence"))
- break;
-
- if (struct_penalties != NONE) break;
- }
-
- }
-
-}
-
-static void get_gde_ss(sint length)
-{
- static char title[MAXLINE+1];
- static char line[MAXLINE+1];
- static char lin2[MAXLINE+1];
- static char sname[MAXNAMES+1];
- sint i, len, offset = 0;
- unsigned char c;
-
- for (;;) {
- line[0] = '\0';
-/* search for the next comment line */
- while(*line != '"')
- if (fgets(line,MAXLINE+1,fin) == NULL) return;
-
-/* is it a secondary structure entry? */
- if (strncmp(&line[1],"SS_",3) == 0) {
- for (i=1;i<=MAXNAMES-3;i++) {
- if (line[i+3] == '(' || line[i+3] == '\n')
- break;
- sname[i-1] = line[i+3];
- }
- i--;
- sname[i]=EOS;
- if (sname[i-1] == '(') sscanf(&line[i+3],"%d",&offset);
- else offset = 0;
- for(i--;i > 0;i--)
- if(isspace(sname[i])) {
- sname[i]=EOS;
- }
- else break;
- blank_to_(sname);
-
- if (interactive) {
- strcpy(title,"Found secondary structure in alignment file: ");
- strcat(title,sname);
- (*lin2)=prompt_for_yes_no(title,"Use it to set local gap penalties ");
- }
- else (*lin2) = 'y';
- if ((*lin2 != 'n') && (*lin2 != 'N')) {
- struct_penalties = SECST;
- for (i=0;i<length;i++)
- sec_struct_mask[i] = '.';
- len = 0;
- while(fgets(line,MAXLINE+1,fin)) {
- if(*line == '%' || *line == '#' || *line == '"') break;
- for(i=offset;i < length;i++) {
- c=line[i];
- if(c == '\n' || c == EOS)
- break; /* EOL */
- sec_struct_mask[len++]=c;
- }
- if (len > length) break;
- }
- strcpy(ss_name,sname);
- }
- }
-/* or is it a gap penalty mask entry? */
- else if (strncmp(&line[1],"GM_",3) == 0) {
- for (i=1;i<=MAXNAMES-3;i++) {
- if (line[i+3] == '(' || line[i+3] == '\n')
- break;
- sname[i-1] = line[i+3];
- }
- i--;
- sname[i]=EOS;
- if (sname[i-1] == '(') sscanf(&line[i+3],"%d",&offset);
- else offset = 0;
- for(i--;i > 0;i--)
- if(isspace(sname[i])) {
- sname[i]=EOS;
- }
- else break;
- blank_to_(sname);
-
- if (interactive) {
- strcpy(title,"Found gap penalty mask in alignment file: ");
- strcat(title,sname);
- (*lin2)=prompt_for_yes_no(title,"Use it to set local gap penalties ");
- }
- else (*lin2) = 'y';
- if ((*lin2 != 'n') && (*lin2 != 'N')) {
- struct_penalties = GMASK;
- for (i=0;i<length;i++)
- gap_penalty_mask[i] = '1';
- len = 0;
- while(fgets(line,MAXLINE+1,fin)) {
- if(*line == '%' || *line == '#' || *line == '"') break;
- for(i=offset;i < length;i++) {
- c=line[i];
- if(c == '\n' || c == EOS)
- break; /* EOL */
- gap_penalty_mask[len++]=c;
- }
- if (len > length) break;
- }
- strcpy(ss_name,sname);
- }
- }
- if (struct_penalties != NONE) break;
- }
-
-}
-
-static void get_swiss_feature(char *line, sint len)
-{
- char c, s, feature[MAXLINE+1];
- int i, start_pos, end_pos;
-
- if (sscanf(line,"%s%d%d",feature,&start_pos,&end_pos) != 3) {
- return;
- }
-
- if (strcmp(feature,"HELIX") == 0) {
- c = 'A';
- s = '$';
- }
- else if (strcmp(feature,"STRAND") == 0) {
- c = 'B';
- s = '%';
- }
- else
- return;
-
- if(start_pos >=len || end_pos>=len) return;
-
- sec_struct_mask[start_pos-1] = s;
- for (i=start_pos;i<end_pos-1;i++)
- sec_struct_mask[i] = c;
- sec_struct_mask[end_pos-1] = s;
-
-}
-
-static void get_rsf_feature(char *line, sint len)
-{
- char c, s;
- char str1[MAXLINE+1],str2[MAXLINE+1],feature[MAXLINE+1];
- int i, tmp,start_pos, end_pos;
-
- if (sscanf(line,"%d%d%d%s%s%s",&start_pos,&end_pos,&tmp,str1,str2,feature) != 6) {
- return;
- }
-
- if (strcmp(feature,"HELIX") == 0) {
- c = 'A';
- s = '$';
- }
- else if (strcmp(feature,"STRAND") == 0) {
- c = 'B';
- s = '%';
- }
- else
- return;
-
- if(start_pos>=len || end_pos >= len) return;
- sec_struct_mask[start_pos-1] = s;
- for (i=start_pos;i<end_pos-1;i++)
- sec_struct_mask[i] = c;
- sec_struct_mask[end_pos-1] = s;
-
-}
-
-static void get_swiss_mask(char *line, sint len)
-{
- int i, value, start_pos, end_pos;
-
- if (sscanf(line,"%d%d%d",&value,&start_pos,&end_pos) != 3) {
- return;
- }
-
- if (value < 1 || value > 9) return;
-
- if(start_pos>=len || end_pos >= len) return;
- for (i=start_pos-1;i<end_pos;i++)
- gap_penalty_mask[i] = value+'0';
-
-}
-
-static char * get_seq(char *sname,sint *len,char *tit)
-{
- static char line[MAXLINE+1];
- char *seq = NULL;
- sint i, offset = 0;
- unsigned char c=EOS;
- Boolean got_seq=FALSE;
-
- switch(seqFormat) {
-
-/************************************/
- case EMBLSWISS:
- while( !linetype(line,"ID") )
- if (fgets(line,MAXLINE+1,fin) == NULL) return NULL;
-
- for(i=5;i<=strlen(line);i++) /* DES */
- if(line[i] != ' ') break;
- strncpy(sname,line+i,MAXNAMES); /* remember entryname */
- for(i=0;i<=strlen(sname);i++)
- if(sname[i] == ' ') {
- sname[i]=EOS;
- break;
- }
-
- sname[MAXNAMES]=EOS;
- rtrim(sname);
- blank_to_(sname);
-
-
- while( !linetype(line,"SQ") )
- fgets(line,MAXLINE+1,fin);
-
- *len=0;
- while(fgets(line,MAXLINE+1,fin)) {
- if(got_seq && blankline(line)) break;
- if( strlen(line) > 2 && line[strlen(line)-2]=='.' && line[strlen(line)-3]=='.' )
- continue;
- if(seq==NULL)
- seq=(char *)ckalloc((MAXLINE+2)*sizeof(char));
- else
- seq=(char *)ckrealloc(seq,((*len)+MAXLINE+2)*sizeof(char));
- for(i=0;i<=MAXLINE;i++) {
- c=line[i];
- if(c == '\n' || c == EOS || c == '/')
- break; /* EOL */
- c=chartab[c];
- if(c) {
- got_seq=TRUE;
- seq[++(*len)]=c;
- }
- }
- if(c == '/') break;
- }
- break;
-
-/************************************/
- case PIR:
- while(*line != '>')
- fgets(line,MAXLINE+1,fin);
- for(i=4;i<=strlen(line);i++) /* DES */
- if(line[i] != ' ') break;
- strncpy(sname,line+i,MAXNAMES); /* remember entryname */
- sname[MAXNAMES]=EOS;
- rtrim(sname);
- blank_to_(sname);
-
- fgets(line,MAXLINE+1,fin);
- strncpy(tit,line,MAXTITLES);
- tit[MAXTITLES]=EOS;
- i=strlen(tit);
- if(tit[i-1]=='\n') tit[i-1]=EOS;
-
- *len=0;
- while(fgets(line,MAXLINE+1,fin)) {
- if(seq==NULL)
- seq=(char *)ckalloc((MAXLINE+2)*sizeof(char));
- else
- seq=(char *)ckrealloc(seq,((*len)+MAXLINE+2)*sizeof(char));
- for(i=0;i<=MAXLINE;i++) {
- c=line[i];
- if(c == '\n' || c == EOS || c == '*')
- break; /* EOL */
-
- c=chartab[c];
- if(c) seq[++(*len)]=c;
- }
- if(c == '*') break;
- }
- break;
-/***********************************************/
- case PEARSON:
- while(*line != '>')
- fgets(line,MAXLINE+1,fin);
-
- for(i=1;i<=strlen(line);i++) /* DES */
- if(line[i] != ' ') break;
- strncpy(sname,line+i,MAXNAMES); /* remember entryname */
- for(i=1;i<=strlen(sname);i++) /* DES */
- if(sname[i] == ' ') break;
- sname[i]=EOS;
- rtrim(sname);
- blank_to_(sname);
-
- *tit=EOS;
-
- *len=0;
- while(fgets(line,MAXLINE+1,fin)) {
- if(seq==NULL)
- seq=(char *)ckalloc((MAXLINE+2)*sizeof(char));
- else
- seq=(char *)ckrealloc(seq,((*len)+MAXLINE+2)*sizeof(char));
- for(i=0;i<=MAXLINE;i++) {
- c=line[i];
- if(c == '\n' || c == EOS || c == '>')
- break; /* EOL */
-
- c=chartab[c];
- if(c) seq[++(*len)]=c;
- }
- if(c == '>') break;
- }
- break;
-/**********************************************/
- case GDE:
- if (dnaflag) {
- while(*line != '#')
- fgets(line,MAXLINE+1,fin);
- }
- else {
- while(*line != '%')
- fgets(line,MAXLINE+1,fin);
- }
-
- for (i=1;i<=MAXNAMES;i++) {
- if (line[i] == '(' || line[i] == '\n')
- break;
- sname[i-1] = line[i];
- }
- i--;
- sname[i]=EOS;
- if (sname[i-1] == '(') sscanf(&line[i],"%d",&offset);
- else offset = 0;
- for(i--;i > 0;i--)
- if(isspace(sname[i])) {
- sname[i]=EOS;
- }
- else break;
- blank_to_(sname);
-
- *tit=EOS;
-
- *len=0;
- for (i=0;i<offset;i++) seq[++(*len)] = '-';
- while(fgets(line,MAXLINE+1,fin)) {
- if(*line == '%' || *line == '#' || *line == '"') break;
- if(seq==NULL)
- seq=(char *)ckalloc((MAXLINE+2)*sizeof(char));
- else
- seq=(char *)ckrealloc(seq,((*len)+MAXLINE+2)*sizeof(char));
- for(i=0;i<=MAXLINE;i++) {
- c=line[i];
- if(c == '\n' || c == EOS)
- break; /* EOL */
-
- c=chartab[c];
- if(c) seq[++(*len)]=c;
- }
- }
- break;
-/***********************************************/
- case RSF:
- while(*line != '{')
- if (fgets(line,MAXLINE+1,fin) == NULL) return NULL;
-
- while( !keyword(line,"name") )
- if (fgets(line,MAXLINE+1,fin) == NULL) return NULL;
-
- for(i=5;i<=strlen(line);i++) /* DES */
- if(line[i] != ' ') break;
- strncpy(sname,line+i,MAXNAMES); /* remember entryname */
- for(i=0;i<=strlen(sname);i++)
- if(sname[i] == ' ') {
- sname[i]=EOS;
- break;
- }
-
- sname[MAXNAMES]=EOS;
- rtrim(sname);
- blank_to_(sname);
-
-
- while( !keyword(line,"sequence") )
- if (fgets(line,MAXLINE+1,fin) == NULL) return NULL;
-
- *len=0;
- while(fgets(line,MAXLINE+1,fin)) {
- if(seq==NULL)
- seq=(char *)ckalloc((MAXLINE+2)*sizeof(char));
- else
- seq=(char *)ckrealloc(seq,((*len)+MAXLINE+2)*sizeof(char));
- for(i=0;i<=MAXLINE;i++) {
- c=line[i];
- if(c == EOS || c == '}')
- break; /* EOL */
- if( c=='.')
- seq[++(*len)]='-';
- c=chartab[c];
- if(c) seq[++(*len)]=c;
- }
- if(c == '}') break;
- }
- break;
-/***********************************************/
- }
-
- seq[*len+1]=EOS;
-
- return seq;
-}
-
-
-/*
- * readseqs - load every sequence of the input file into the global
- * alignment arrays (names, titles, seqlen_array, seq_array, output_index).
- *
- * first_seq is the 1-based slot that receives the first sequence read.
- * When it is 1 the previous alignment is freed and new arrays allocated;
- * otherwise the arrays are reallocated and slots 1..first_seq-1 are kept
- * (profile 2).
- *
- * The file name is prompted for when usemenu is set, otherwise taken from
- * seqname; a leading "file://" is stripped.  On success seqname holds the
- * name that was opened.
- *
- * Returns the number of sequences counted by check_infile(), 0 when none
- * were found or two sequences share a name, and -1 when no name was given
- * or the file could not be opened.  The stream fin is closed on every
- * path that opened it (the two "return 0" paths used to leave it open).
- */
-sint readseqs(sint first_seq) /*first_seq is the #no. of the first seq. to read */
-{
-    char line[FILENAMELEN+1];
-    char fileName[FILENAMELEN+1];
-
-    static char *seq1,sname1[MAXNAMES+1],title[MAXTITLES+1];
-    sint i,j;
-    sint no_seqs;
-    static sint l1;
-    static Boolean dnaflag1;
-
-    if(usemenu)
-        getstr("Enter the name of the sequence file",FILENAMELEN+1,line);
-    else
-        strcpy(line,seqname);
-    if(*line == EOS) return -1;
-
-    /* accept URL-style names by dropping the scheme prefix */
-    if ((sscanf(line,"file://%s",fileName) == 1 )) {
-        strcpy(line,fileName);
-    }
-
-    if((fin=fopen(line,"r"))==NULL) {
-        error("Could not open sequence file (%s) ",line);
-        return -1;      /* DES -1 => file not found */
-    }
-    strcpy(seqname,line);
-    no_seqs=0;
-    check_infile(&no_seqs);
-    info("Sequence format is %s",formatNames[seqFormat]);
-    if(seqFormat==NEXUS)
-        error("Cannot read nexus format");
-
-    if(no_seqs == 0) {
-        fclose(fin);    /* nothing to read: do not leak the stream */
-        return 0;       /* return the number of seqs. (zero here)*/
-    }
-
-    /* (the former MAXN limit on the number of sequences is no longer enforced) */
-
-    max_aln_length = 0;
-
-/* if this is a multiple alignment, or profile 1 - free any memory used
-by previous alignments, then allocate memory for the new alignment */
-    if(first_seq == 1) {
-        max_names = 0;
-        free_aln(nseqs);
-        alloc_aln(no_seqs);
-    }
-/* otherwise, this is a profile 2, and we need to reallocate the arrays,
-leaving the data for profile 1 intact */
-    else realloc_aln(first_seq,no_seqs);
-
-    /* account for the sequences already held in slots 1..first_seq-1 */
-    for(i=1;i<first_seq;i++)
-    {
-        if(seqlen_array[i]>max_aln_length)
-            max_aln_length=seqlen_array[i];
-        if(strlen(names[i])>max_names)
-            max_names=strlen(names[i]);
-    }
-
-    for(i=first_seq;i<=first_seq+no_seqs-1;i++) {    /* get the seqs now*/
-        output_index[i] = i;    /* default output order */
-        if(seqFormat == CLUSTAL)
-            seq1=get_clustal_seq(sname1,&l1,title,i-first_seq+1);
-        else if(seqFormat == MSF)
-            seq1=get_msf_seq(sname1,&l1,title,i-first_seq+1);
-        else
-            seq1=get_seq(sname1,&l1,title);
-
-        if(seq1==NULL) break;
-/* JULIE */
-/* Set max length of dynamically allocated arrays in prfalign.c */
-        if (l1 > max_aln_length) max_aln_length = l1;
-        seqlen_array[i]=l1;                   /* store the length */
-        strcpy(names[i],sname1);              /*    "   "  name   */
-        strcpy(titles[i],title);              /*    "   "  title  */
-
-        if(!explicit_dnaflag) {
-            dnaflag1 = check_dnaflag(seq1,l1); /* check DNA/Prot */
-            if(i == 1) dnaflag = dnaflag1;
-        }                                      /* type decided by first seq*/
-        else
-            dnaflag1 = dnaflag;
-
-        alloc_seq(i,l1);
-
-        if(dnaflag)
-            n_encode(seq1,seq_array[i],l1);    /* encode the sequence*/
-        else                                   /* as ints  */
-            p_encode(seq1,seq_array[i],l1);
-        if(seq1!=NULL) seq1=ckfree(seq1);
-    }
-
-
-    max_aln_length *= 2;
-/*
-   JULIE
-   check sequence names are all different - otherwise phylip tree is
-   confused.
-*/
-    for(i=1;i<=first_seq+no_seqs-1;i++) {
-        for(j=i+1;j<=first_seq+no_seqs-1;j++) {
-            if (strncmp(names[i],names[j],MAXNAMES) == 0) {
-                error("Multiple sequences found with same name, %s (first %d chars are significant)", names[i],MAXNAMES);
-                fclose(fin);    /* error exit: do not leak the stream */
-                return 0;
-            }
-        }
-    }
-    for(i=first_seq;i<=first_seq+no_seqs-1;i++)
-    {
-        if(seqlen_array[i]>max_aln_length)
-            max_aln_length=seqlen_array[i];
-    }
-
-/* look for a feature table / gap penalty mask (only if this is a profile) */
-    if (profile_no > 0) {
-        rewind(fin);
-        struct_penalties = NONE;
-        gap_penalty_mask = (char *)ckalloc((max_aln_length+1) * sizeof (char));
-        sec_struct_mask = (char *)ckalloc((max_aln_length+1) * sizeof (char));
-        ss_name = (char *)ckalloc((MAXNAMES+1) * sizeof (char));
-
-        if (seqFormat == CLUSTAL) {
-            get_clustal_ss(max_aln_length);
-        }
-        else if (seqFormat == GDE) {
-            get_gde_ss(max_aln_length);
-        }
-        else if (seqFormat == EMBLSWISS) {
-            get_embl_ss(max_aln_length);
-        }
-        else if (seqFormat == RSF) {
-            get_rsf_ss(max_aln_length);
-        }
-    }
-
-    for(i=first_seq;i<=first_seq+no_seqs-1;i++)
-    {
-        if(strlen(names[i])>max_names)
-            max_names=strlen(names[i]);
-    }
-
-    if(max_names<10) max_names=10;
-
-    fclose(fin);
-
-    return no_seqs;    /* return the number of seqs. read in this call */
-}
-
-
-static Boolean check_dnaflag(char *seq, sint slen)
-/* Decide whether seq[1..slen] is DNA or protein.
-   Gap characters ('-') are ignored; every other character counts as a
-   residue, and those found in "ACGTUN" count as bases.
-   Returns TRUE (DNA) when bases make up at least 85% of the residues,
-   FALSE otherwise, including when there are no bases or no residues. */
-{
-    char *nucleotides = "ACGTUN";
-    sint pos;
-    sint counted = 0;       /* characters that are not gaps */
-    sint hits = 0;          /* of those, the ones that are bases */
-    float fraction;
-
-    for (pos = 1; pos <= slen; pos++) {
-        if (seq[pos] == '-')
-            continue;
-        counted++;
-        if (seq[pos] == 'N' || res_index(nucleotides, seq[pos]) >= 0)
-            hits++;
-    }
-
-    if (hits == 0 || counted == 0)
-        return FALSE;
-
-    fraction = (float)hits/(float)counted;
-    return (fraction >= 0.85) ? TRUE : FALSE;
-}
-
-
-
-/*
- * check_infile - identify the format of the open input stream fin and
- * count the sequences it holds.
- *
- * The first non-blank line selects seqFormat (EMBLSWISS, CLUSTAL, MSF,
- * RSF, NEXUS, PIR, PEARSON, GDE or UNKNOWN); for GDE and the two
- * "!!..._MULTIPLE_ALIGNMENT" headers it also sets dnaflag.  The count is
- * stored in *nseqs (left at 0 for NEXUS and UNKNOWN).
- *
- * The stream is repositioned to the start before returning, except on the
- * NEXUS and UNKNOWN early returns, which leave it where it was.
- *
- * Fixes over the previous version: an empty file no longer causes a read
- * of the uninitialised line buffer, upper-casing stops at the terminator,
- * ctype calls receive unsigned char values, and the PIR test no longer
- * looks at line[3] on lines shorter than four characters.
- */
-static void check_infile(sint *nseqs)
-{
-    char line[MAXLINE+1];
-    size_t len;
-    sint i;
-
-    *nseqs=0;
-    line[0]=EOS;        /* defined content even if fgets never succeeds */
-    while (fgets(line,MAXLINE+1,fin) != NULL) {
-        if(!blankline(line))
-            break;
-    }
-
-    /* strip trailing non-printing characters (newline, blanks, ...) */
-    for(i=(sint)strlen(line)-1;i>=0;i--)
-        if(isgraph((unsigned char)line[i])) break;
-    line[i+1]=EOS;
-    len=strlen(line);
-
-    /* upper-case at most the first 7 characters, never past the end */
-    for(i=0;i<=6 && line[i]!=EOS;i++)
-        line[i] = (char)toupper((unsigned char)line[i]);
-
-    if( linetype(line,"ID") ) {            /* EMBL/Swiss-Prot format ? */
-        seqFormat=EMBLSWISS;
-        (*nseqs)++;
-    }
-    else if( linetype(line,"CLUSTAL") ) {
-        seqFormat=CLUSTAL;
-    }
-    else if( linetype(line,"PILEUP") ) {
-        seqFormat = MSF;
-    }
-    else if( linetype(line,"!!AA_MULTIPLE_ALIGNMENT") ) {
-        seqFormat = MSF;
-        dnaflag = FALSE;
-    }
-    else if( linetype(line,"!!NA_MULTIPLE_ALIGNMENT") ) {
-        seqFormat = MSF;
-        dnaflag = TRUE;
-    }
-    else if( strstr(line,"MSF") && len >= 2 && line[len-1]=='.' &&
-             line[len-2]=='.' ) {
-        seqFormat = MSF;
-    }
-    else if( linetype(line,"!!RICH_SEQUENCE") ) {
-        seqFormat = RSF;
-    }
-    else if( linetype(line,"#NEXUS") ) {
-        seqFormat=NEXUS;
-        return;
-    }
-    else if(*line == '>') {
-        /* distinguish PIR (">XX;name") and Pearson (">name") */
-        seqFormat=(len > 3 && line[3] == ';')?PIR:PEARSON;
-        (*nseqs)++;
-    }
-    else if((*line == '"') || (*line == '%') || (*line == '#')) {
-        seqFormat=GDE;                     /* GDE format */
-        if (*line == '%') {
-            (*nseqs)++;
-            dnaflag = FALSE;
-        }
-        else if (*line == '#') {
-            (*nseqs)++;
-            dnaflag = TRUE;
-        }
-    }
-    else {
-        seqFormat=UNKNOWN;
-        return;
-    }
-
-    /* Line-per-entry formats are counted here; the alignment formats hand
-       over to their own counters after one further line has been read. */
-    while(fgets(line,MAXLINE+1,fin) != NULL) {
-        switch(seqFormat) {
-            case EMBLSWISS:
-                if( linetype(line,"ID") )
-                    (*nseqs)++;
-                break;
-            case PIR:
-                *nseqs = count_pir_seqs();
-                fseek(fin,0L,SEEK_SET);
-                return;
-            case PEARSON:
-                if( *line == '>' )
-                    (*nseqs)++;
-                break;
-            case GDE:
-                if(( *line == '%' ) && ( dnaflag == FALSE))
-                    (*nseqs)++;
-                else if (( *line == '#') && ( dnaflag == TRUE))
-                    (*nseqs)++;
-                break;
-            case CLUSTAL:
-                *nseqs = count_clustal_seqs();
-                fseek(fin,0L,SEEK_SET);
-                return;
-            case MSF:
-                *nseqs = count_msf_seqs();
-                fseek(fin,0L,SEEK_SET);
-                return;
-            case RSF:
-                fseek(fin,0L,SEEK_SET);    /* count_rsf_seqs scans from the top */
-                *nseqs = count_rsf_seqs();
-                fseek(fin,0L,SEEK_SET);
-                return;
-            case USER:
-            default:
-                break;
-        }
-    }
-    fseek(fin,0L,SEEK_SET);
-}
-
-
-/* Return TRUE when line contains the PIR end-of-sequence marker '*'
-   before its newline or terminator, FALSE otherwise. */
-static Boolean pir_line_ends_seq(char *line)
-{
-    sint k;
-
-    for (k = 0; line[k] != '\n' && line[k] != EOS; k++)
-        if (line[k] == '*')
-            return TRUE;
-    return FALSE;
-}
-
-static sint count_pir_seqs(void)
-/* Count the sequences in a PIR alignment file, reading from the current
-   position of fin.  Every entry must be closed by a '*'; if one is not,
-   an error is reported and 0 is returned. */
-{
-    char line[MAXLINE+1];
-    sint total;
-    Boolean terminated = FALSE;
-
-    /* The header of the first entry has already been consumed by the
-       caller: look for the '*' that ends it. */
-    while (!terminated && fgets(line,MAXLINE+1,fin) != NULL) {
-        if (*line == '>')
-            break;
-        terminated = pir_line_ends_seq(line);
-    }
-    if (!terminated) {
-        error("PIR format sequence end marker '*'\nmissing for one or more sequences.");
-        return (sint)0;        /* funny format*/
-    }
-
-    total = 1;
-
-    /* Each later '>' starts a new entry, counted once its '*' is found. */
-    while (fgets(line,MAXLINE+1,fin) != NULL) {
-        if (*line != '>')
-            continue;
-        terminated = FALSE;
-        while (!terminated && fgets(line,MAXLINE+1,fin) != NULL) {
-            if (*line == '>') {
-                error("PIR format sequence end marker '*' missing for one or more sequences.");
-                return (sint)0;        /* funny format*/
-            }
-            terminated = pir_line_ends_seq(line);
-        }
-        if (terminated)
-            total++;
-    }
-    return (sint)total;
-}
-
-
-static sint count_clustal_seqs(void)
-/* Count the sequences in a clustal alignment file: the number of lines
-   in the first block, i.e. from the first line that cl_blankline()
-   rejects up to the next one it accepts.  Returns 0 if the file ends
-   before that closing blank line is seen. */
-{
-    char line[MAXLINE+1];
-    sint count;
-
-    /* skip forward to the first line of the block */
-    while (fgets(line,MAXLINE+1,fin) != NULL && cl_blankline(line))
-        ;
-
-    /* that line is sequence 1; every further line adds one */
-    for (count = 1; fgets(line,MAXLINE+1,fin) != NULL; count++)
-        if (cl_blankline(line))
-            return count;
-
-    return (sint)0;    /* if you got to here-funny format/no seqs.*/
-}
-
-static sint count_msf_seqs(void)
-{
-/* Count the sequences in a PILEUP (MSF) alignment file: skip to the "//"
-   separator line, then count the lines of the first non-blank block that
-   follows.  Returns 0 if the file ends before the block is closed by a
-   blank line. */
-
-    char line[MAXLINE+1];
-    sint count;
-
-    /* header: everything up to and including the "//" line */
-    while (fgets(line,MAXLINE+1,fin) != NULL && !linetype(line,"//"))
-        ;
-
-    /* blank lines between the separator and the first block */
-    while (fgets(line,MAXLINE+1,fin) != NULL && blankline(line))
-        ;
-
-    /* the line just read is sequence 1; every further line adds one */
-    for (count = 1; fgets(line,MAXLINE+1,fin) != NULL; count++)
-        if (blankline(line))
-            return count;
-
-    return (sint)0;    /* if you got to here-funny format/no seqs.*/
-}
-
-static sint count_rsf_seqs(void)
-{
-/* Count the sequences in a GCG RSF alignment file, reading fin from its
-   current position.  The comment header is skipped up to the first line
-   whose last two characters before the final one are "..", then every
-   following line that starts with '{' is counted as one sequence.
-
-   Lines shorter than three characters are now ignored by the ".." test;
-   previously it indexed before the start of the buffer for them. */
-
-    char line[MAXLINE+1];
-    size_t len;
-    sint nseqs;
-
-    nseqs = 0;
-/* skip the comments */
-    while (fgets(line,MAXLINE+1,fin) != NULL) {
-        len = strlen(line);
-        if(len >= 3 &&
-           line[len-2]=='.' &&
-           line[len-3]=='.')
-            break;
-    }
-
-    while (fgets(line,MAXLINE+1,fin) != NULL) {
-        if( *line == '{' )
-            nseqs++;
-    }
-    return (sint)nseqs;
-}
-
-/* Encode protein sequence seq[1..l] into naseq[1..l]: '-' becomes
-   gap_pos2, any other character becomes its index in amino_acid_codes
-   (-1 when it is not listed).  The slot after the last one written
-   receives the end marker -3. */
-static void p_encode(char *seq, char *naseq, sint l)
-{
-    register sint pos;
-
-    for (pos = 1; pos <= l; pos++) {
-        if (seq[pos] == '-') {
-            naseq[pos] = gap_pos2;
-        } else {
-            naseq[pos] = res_index(amino_acid_codes, seq[pos]);
-        }
-    }
-    naseq[pos] = -3;    /* end-of-sequence marker */
-}
-
-/* Encode nucleotide sequence seq[1..l] into naseq[1..l].  A gap
-   character '-' in the input is stored as gap_pos2; every other
-   character is stored as its index in amino_acid_codes (the same table
-   the protein encoder uses), or -1 when not listed.  The slot after the
-   last one written receives the end marker -3. */
-static void n_encode(char *seq,char *naseq,sint l)
-{
-    register sint pos;
-
-    for (pos = 1; pos <= l; pos++)
-        naseq[pos] = (seq[pos] == '-')
-                         ? gap_pos2
-                         : res_index(amino_acid_codes, seq[pos]);
-    naseq[pos] = -3;    /* end-of-sequence marker */
-}
-
-/* Return the 0-based position of the first occurrence of c in the
-   NUL-terminated string t, or -1 when c does not occur (a NUL c is
-   never found). */
-static sint res_index(char *t,char c)
-{
-    register sint pos = 0;
-
-    while (t[pos] != '\0') {
-        if (t[pos] == c)
-            return(pos);
-        pos++;
-    }
-    return -1;
-}
Deleted: trunk/packages/clustalw/trunk/showpair.c
===================================================================
--- trunk/packages/clustalw/trunk/showpair.c 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/showpair.c 2007-08-12 15:08:40 UTC (rev 399)
@@ -1,486 +0,0 @@
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-#include <math.h>
-#include "clustalw.h"
-
-static void make_p_ptrs(sint *tptr, sint *pl, sint naseq, sint l);
-static void make_n_ptrs(sint *tptr, sint *pl, sint naseq, sint len);
-static void put_frag(sint fs, sint v1, sint v2, sint flen);
-static sint frag_rel_pos(sint a1, sint b1, sint a2, sint b2);
-static void des_quick_sort(sint *array1, sint *array2, sint array_size);
-static void pair_align(sint seq_no, sint l1, sint l2);
-
-
-/*
-* Prototypes
-*/
-
-/*
-* Global variables
-*/
-extern sint *seqlen_array;
-extern char **seq_array;
-extern sint dna_ktup, dna_window, dna_wind_gap, dna_signif; /* params for DNA */
-extern sint prot_ktup,prot_window,prot_wind_gap,prot_signif; /* params for prots */
-extern sint nseqs;
-extern Boolean dnaflag;
-extern double **tmat;
-extern sint max_aa;
-extern sint max_aln_length;
-
-static sint next;
-static sint curr_frag,maxsf,vatend;
-static sint **accum;
-static sint *diag_index;
-static char *slopes;
-
-sint ktup,window,wind_gap,signif; /* Pairwise aln. params */
-sint *displ;
-sint *zza, *zzb, *zzc, *zzd;
-
-extern Boolean percent;
-
-
-/*
- * make_p_ptrs - build the k-tuple lookup chains for sequence naseq of
- * length l, using max_aa+1 as the number base.
- *
- * Each window of ktup residues starting at position i is turned into a
- * code in 1..(max_aa+1)^ktup.  On return pl[code] holds the last start
- * position with that code (0 if none) and tptr[i] holds the previous
- * start position with the same code as position i (0 if none).  Windows
- * containing a residue outside 0..max_aa are skipped.
- */
-static void make_p_ptrs(sint *tptr,sint *pl,sint naseq,sint l)
-{
-    static sint weight[10];    /* weight[k] = (max_aa+1)^(k-1) */
-    sint start, k, ncodes, code;
-    char res;
-
-    for (k = 1; k <= ktup; k++)
-        weight[k] = (sint) pow((double)(max_aa+1),(double)(k-1));
-    ncodes = (sint) pow((double)(max_aa+1),(double)ktup);
-
-    for (code = 1; code <= ncodes; ++code)
-        pl[code] = 0;
-    for (start = 1; start <= l; ++start)
-        tptr[start] = 0;
-
-    for (start = 1; start <= (l-ktup+1); ++start) {
-        code = 0;
-        for (k = 1; k <= ktup; ++k) {
-            res = seq_array[naseq][start+k-1];
-            if ((res < 0) || (res > max_aa))
-                break;            /* unusable residue: drop this window */
-            code += ((res) * weight[k]);
-        }
-        if (k <= ktup)
-            continue;
-
-        ++code;                   /* codes are 1-based */
-        if (pl[code] != 0)
-            tptr[start] = pl[code];
-        pl[code] = start;
-    }
-}
-
-
-/*
- * make_n_ptrs - build the k-tuple lookup chains for sequence naseq of
- * length len, using base 4.
- *
- * Each window of ktup residues starting at position i is turned into a
- * code in 1..4^ktup.  On return pl[code] holds the last start position
- * with that code (0 if none) and tptr[i] holds the previous start
- * position with the same code as position i (0 if none).  Windows
- * containing a residue outside 0..4 are skipped.
- */
-static void make_n_ptrs(sint *tptr,sint *pl,sint naseq,sint len)
-{
-    static sint pow4[]={ 0, 1, 4, 16, 64, 256, 1024, 4096 };
-    sint start, k, ncodes, code;
-    char res;
-
-    ncodes = (sint) pow((double)4,(double)ktup);
-
-    for (code = 1; code <= ncodes; ++code)
-        pl[code] = 0;
-    for (start = 1; start <= len; ++start)
-        tptr[start] = 0;
-
-    for (start = 1; start <= len-ktup+1; ++start) {
-        code = 0;
-        for (k = 1; k <= ktup; ++k) {
-            res = seq_array[naseq][start+k-1];
-            if ((res < 0) || (res > 4))
-                break;            /* unusable residue: drop this window */
-            code += ((res) * pow4[k]);    /* DES */
-        }
-        if (k <= ktup)
-            continue;
-
-        ++code;                   /* codes are 1-based */
-        if (pl[code] != 0)
-            tptr[start] = pl[code];
-        pl[code] = start;
-    }
-}
-
-
-static void put_frag(sint fs,sint v1,sint v2,sint flen) /* Store fragment
- * number curr_frag in the accum table and link it into the fragment list.
- *
- * Rows of accum written for column curr_frag:
- *   [0] = fs, [1] = v1, [2] = v2, [3] = flen, [4] = index of the next
- *   fragment in the list (0 ends the list).
- * pair_align calls this as put_frag(fs,i,vn2,flen), i.e. v1 and v2 are the
- * positions in the first and second sequence, and flen is the index of
- * the fragment this one was extended from (0 if none).
- *
- * maxsf is the head of the list.  A new fragment becomes the head when
- * its fs is >= the head's; otherwise the list is walked until an entry
- * with accum[0] <= fs is found and the fragment is inserted before it.
- */
-{
- sint end; /* last list entry visited before the insertion point */
- accum[0][curr_frag]=fs;
- accum[1][curr_frag]=v1;
- accum[2][curr_frag]=v2;
- accum[3][curr_frag]=flen;
-
- if(!maxsf) { /* list is empty: this is the first fragment */
- maxsf=1;
- accum[4][curr_frag]=0;
- return;
- }
-
- if(fs >= accum[0][maxsf]) { /* new best score: insert at the head */
- accum[4][curr_frag]=maxsf;
- maxsf=curr_frag;
- return;
- }
- else {
- next=maxsf;
- while(TRUE) { /* NOTE(review): ends at link 0 only if fs >= accum[0][0] - presumably zero from ckalloc; confirm */
- end=next;
- next=accum[4][next];
- if(fs>=accum[0][next])
- break;
- }
- accum[4][curr_frag]=next; /* splice curr_frag between end and next */
- accum[4][end]=curr_frag;
- }
-}
-
-
-/*
- * frag_rel_pos - decide whether the fragment starting at (a2,b2) lies
- * before the fragment starting at (a1,b1).
- *
- * On the same diagonal (a1-b1 == a2-b2) that means a2 < a1.  On different
- * diagonals the second fragment, ktup long, must end before the first one
- * starts in both coordinates.  Returns TRUE or FALSE.
- */
-static sint frag_rel_pos(sint a1,sint b1,sint a2,sint b2)
-{
-    if (a1-b1 == a2-b2)
-        return (a2 < a1) ? TRUE : FALSE;
-
-    return (a2+ktup-1 < a1 && b2+ktup-1 < b1) ? TRUE : FALSE;
-}
-
-
-static void des_quick_sort(sint *array1, sint *array2, sint array_size)
-/* Non-recursive quicksort, adapted from chapter 4, page 115 of Software */
-/* Tools by Kernighan and Plauger, (1986). */
-/* Sorts array1 into ascending order and applies the same swaps to array2, */
-/* so array2[k] stays paired with array1[k]. */
-/* Only indices 1 .. array_size-1 are touched (lst[1]=1, ust[1]=array_size-1). */
-/* NOTE(review): pair_align passes tl1 but uses indices 1..tl1 - confirm. */
-{
- sint temp1, temp2;
- sint p, pivlin; /* p: top of the range stack; pivlin: pivot value */
- sint i, j;
- sint lst[50], ust[50]; /* explicit stack of pending sub-ranges [lst,ust]; */
- /* about log2(array_size) deep, so that must stay < 50 */
-
- lst[1] = 1;
- ust[1] = array_size-1;
- p = 1;
-
- while(p > 0) {
- if(lst[p] >= ust[p]) /* range of 0 or 1 elements: nothing to sort, pop it */
- p--;
- else {
- i = lst[p] - 1;
- j = ust[p];
- pivlin = array1[j]; /* the last element of the range is the pivot */
- while(i < j) {
- for(i=i+1; array1[i] < pivlin; i++) /* stops at the pivot slot at the latest */
- ;
- for(j=j-1; j > i; j--)
- if(array1[j] <= pivlin) break;
- if(i < j) { /* out-of-place pair found: swap it in both arrays */
- temp1 = array1[i];
- array1[i] = array1[j];
- array1[j] = temp1;
-
- temp2 = array2[i];
- array2[i] = array2[j];
- array2[j] = temp2;
- }
- }
-
- j = ust[p];
-
- temp1 = array1[i]; /* move the pivot into its final slot i */
- array1[i] = array1[j];
- array1[j] = temp1;
-
- temp2 = array2[i];
- array2[i] = array2[j];
- array2[j] = temp2;
-
- if(i-lst[p] < ust[p] - i) { /* left part is smaller: it goes on top and is sorted first */
- lst[p+1] = lst[p];
- ust[p+1] = i - 1;
- lst[p] = i + 1;
- }
- else { /* right part is the smaller (or equal) one: it goes on top */
- lst[p+1] = i + 1;
- ust[p+1] = ust[p];
- ust[p] = i - 1;
- }
- p = p + 1;
- }
- }
- return;
-
-}
-
-
-
-
-
-static void pair_align(sint seq_no,sint l1,sint l2) /* Fast k-tuple
- * comparison of sequence seq_no (length l1) with a second sequence of
- * length l2.
- *
- * Inputs taken from file-level state: the k-tuple chains zzc/zza for the
- * first sequence and zzd/zzb for the second (show_pair fills them with
- * make_n_ptrs / make_p_ptrs just before the call), plus ktup, window,
- * signif and wind_gap.
- *
- * Steps:
- *   1. count k-tuple matches per diagonal in displ (diagonal = i-j+l2);
- *   2. sort the counts and flag in slopes[] every diagonal within
- *      `window` of one of the `signif` best diagonals with a count > 0;
- *   3. scan the first sequence again and, for every match on a flagged
- *      diagonal, create a fragment with put_frag, scored by extending the
- *      best compatible earlier fragment.
- *
- * Outputs: the fragment table accum, maxsf (best fragment, 0 if none),
- * curr_frag, and vatend (1 if the fragment table filled up and the scan
- * stopped early, 0 otherwise).
- */
-{
- sint pot[8],i,j,l,m,flag,limit,pos,tl1,vn1,vn2,flen,osptr,fs;
- sint tv1,tv2,encrypt,subt1,subt2,rmndr;
- char residue;
-
- if(dnaflag) { /* pot[k] = base^(k-1), limit = base^ktup; base is 4 for DNA */
- for(i=1;i<=ktup;++i)
- pot[i] = (sint) pow((double)4,(double)(i-1));
- limit = (sint) pow((double)4,(double)ktup);
- }
- else { /* ... and max_aa+1 for protein */
- for (i=1;i<=ktup;i++)
- pot[i] = (sint) pow((double)(max_aa+1),(double)(i-1));
- limit = (sint) pow((double)(max_aa+1),(double)ktup);
- }
-
- tl1 = (l1+l2)-1; /* number of diagonals */
-
- for(i=1;i<=tl1;++i) {
- slopes[i]=displ[i]=0;
- diag_index[i] = i;
- }
-
-
-/* increment diagonal score for each k_tuple match */
-
- for(i=1;i<=limit;++i) {
- vn1=zzc[i]; /* last position in sequence 1 with k-tuple code i */
- while(TRUE) {
- if(!vn1) break;
- vn2=zzd[i]; /* last position in sequence 2 with the same code */
- while(vn2 != 0) {
- osptr=vn1-vn2+l2;
- ++displ[osptr];
- vn2=zzb[vn2];
- }
- vn1=zza[vn1];
- }
- }
-
-/* choose the top SIGNIF diagonals */
-
- des_quick_sort(displ, diag_index, tl1);
-
- j = tl1 - signif + 1;
- if(j < 1) j = 1;
-
-/* flag all diagonals within WINDOW of a top diagonal */
-
- for(i=tl1; i>=j; i--)
- if(displ[i] > 0) {
- pos = diag_index[i];
- l = (1 >pos-window) ? 1 : pos-window;
- m = (tl1<pos+window) ? tl1 : pos+window;
- for(; l <= m; l++)
- slopes[l] = 1;
- }
-
- for(i=1; i<=tl1; i++) displ[i] = 0; /* from here on displ[d] is the latest fragment on diagonal d */
-
-
- curr_frag=maxsf=0;
-
- for(i=1;i<=(l1-ktup+1);++i) {
- encrypt=flag=0;
- for(j=1;j<=ktup;++j) {
- residue = seq_array[seq_no][i+j-1];
- if((residue<0) || (residue>max_aa)) { /* NOTE(review): make_n_ptrs tests residue>4 for DNA; this test uses max_aa in both cases - confirm */
- flag=TRUE;
- break;
- }
- encrypt += ((residue)*pot[j]);
- }
- if(flag) continue;
- ++encrypt; /* k-tuple codes are 1-based, as in make_n_ptrs / make_p_ptrs */
-
- vn2=zzd[encrypt];
-
- flag=FALSE;
- while(TRUE) {
- if(!vn2) { /* end of the chain of matching positions in sequence 2 */
- flag=TRUE;
- break;
- }
- osptr=i-vn2+l2;
- if(slopes[osptr]!=1) { /* match lies on a diagonal that was not flagged: ignore it */
- vn2=zzb[vn2];
- continue;
- }
- flen=0;
- fs=ktup;
- next=maxsf;
-
- /*
- * A-loop
- */
-
- while(TRUE) {
- if(!next) { /* predecessor search finished: store the new fragment */
- ++curr_frag;
- if(curr_frag>=2*max_aln_length) {
- info("(Partial alignment)");
- vatend=1;
- return;
- }
- displ[osptr]=curr_frag;
- put_frag(fs,i,vn2,flen);
- }
- else {
- tv1=accum[1][next];
- tv2=accum[2][next];
- if(frag_rel_pos(i,vn2,tv1,tv2)) {
- if(i-vn2==accum[1][next]-accum[2][next]) { /* same diagonal: extend that fragment */
- if(i>accum[1][next]+(ktup-1))
- fs=accum[0][next]+ktup;
- else {
- rmndr=i-accum[1][next];
- fs=accum[0][next]+rmndr;
- }
- flen=next;
- next=0;
- continue;
- }
- else { /* different diagonal: compare extending this diagonal with jumping from next */
- if(displ[osptr]==0)
- subt1=ktup;
- else {
- if(i>accum[1][displ[osptr]]+(ktup-1))
- subt1=accum[0][displ[osptr]]+ktup;
- else {
- rmndr=i-accum[1][displ[osptr]];
- subt1=accum[0][displ[osptr]]+rmndr;
- }
- }
- subt2=accum[0][next]-wind_gap+ktup; /* jumping diagonals costs wind_gap */
- if(subt2>subt1) {
- flen=next;
- fs=subt2;
- }
- else {
- flen=displ[osptr];
- fs=subt1;
- }
- next=0;
- continue;
- }
- }
- else { /* fragment next is not before this match: try the following list entry */
- next=accum[4][next];
- continue;
- }
- }
- break;
- }
- /*
- * End of Aloop
- */
-
- vn2=zzb[vn2];
- }
- }
- vatend=0;
-}
-
-void show_pair(sint istart, sint iend, sint jstart, sint jend) /* Fill the
- * distance matrix tmat using the fast k-tuple comparison in pair_align.
- *
- * For every i in istart+1..iend and every j in jstart+2..jend the two
- * sequences are indexed, pair_align is run, and
- *     tmat[i][j] = tmat[j][i] = (100 - score) / 100
- * is stored, where score is accum[0][maxsf] (0 when no fragment was
- * found), expressed as a percentage of the shorter sequence length when
- * `percent` is set.  One line per pair is reported through info().
- *
- * The working arrays (accum, displ, slopes, diag_index, zza..zzd) are
- * allocated here from max_aln_length and released before returning.
- */
-{
- sint i,j,dsr;
- double calc_score;
-
- accum = (sint **)ckalloc( 5*sizeof (sint *) );
- for (i=0;i<5;i++)
- accum[i] = (sint *) ckalloc((2*max_aln_length+1) * sizeof (sint) );
-
- displ = (sint *) ckalloc( (2*max_aln_length +1) * sizeof (sint) );
- slopes = (char *)ckalloc( (2*max_aln_length +1) * sizeof (char));
- diag_index = (sint *) ckalloc( (2*max_aln_length +1) * sizeof (sint) );
-
- zza = (sint *)ckalloc( (max_aln_length+1) * sizeof (sint) );
- zzb = (sint *)ckalloc( (max_aln_length+1) * sizeof (sint) );
-
- zzc = (sint *)ckalloc( (max_aln_length+1) * sizeof (sint) ); /* NOTE(review): make_p_ptrs/make_n_ptrs index zzc and zzd up to (max_aa+1)^ktup or 4^ktup; assumes that fits in max_aln_length - confirm */
- zzd = (sint *)ckalloc( (max_aln_length+1) * sizeof (sint) );
-
- if(dnaflag) { /* pick the DNA or protein set of pairwise parameters */
- ktup = dna_ktup;
- window = dna_window;
- signif = dna_signif;
- wind_gap = dna_wind_gap;
- }
- else {
- ktup = prot_ktup;
- window = prot_window;
- signif = prot_signif;
- wind_gap = prot_wind_gap;
- }
-
- fprintf(stdout,"\n\n");
-
- for(i=istart+1;i<=iend;++i) {
- if(dnaflag) /* index sequence i into zza/zzc */
- make_n_ptrs(zza,zzc,i,seqlen_array[i]);
- else
- make_p_ptrs(zza,zzc,i,seqlen_array[i]);
- for(j=jstart+2;j<=jend;++j) {
- if(dnaflag) /* index sequence j into zzb/zzd */
- make_n_ptrs(zzb,zzd,j,seqlen_array[j]);
- else
- make_p_ptrs(zzb,zzd,j,seqlen_array[j]);
- pair_align(i,seqlen_array[i],seqlen_array[j]);
- if(!maxsf) /* no fragment found at all */
- calc_score=0.0;
- else {
- calc_score=(double)accum[0][maxsf];
- if(percent) {
- dsr=(seqlen_array[i]<seqlen_array[j]) ?
- seqlen_array[i] : seqlen_array[j];
- calc_score = (calc_score/(double)dsr) * 100.0;
- }
- }
-/*
- tmat[i][j]=calc_score;
- tmat[j][i]=calc_score;
-*/
-
- tmat[i][j] = (100.0 - calc_score)/100.0; /* store as a distance rather than a score */
- tmat[j][i] = (100.0 - calc_score)/100.0;
- if(calc_score>0.1)
- info("Sequences (%d:%d) Aligned. Score: %lg",
- (pint)i,(pint)j,calc_score);
- else
- info("Sequences (%d:%d) Not Aligned",
- (pint)i,(pint)j);
- }
- }
-
- for (i=0;i<5;i++)
- accum[i]=ckfree((void *)accum[i]);
- accum=ckfree((void *)accum);
-
- displ=ckfree((void *)displ);
- slopes=ckfree((void *)slopes);
- diag_index=ckfree((void *)diag_index);
-
- zza=ckfree((void *)zza);
- zzb=ckfree((void *)zzb);
- zzc=ckfree((void *)zzc);
- zzd=ckfree((void *)zzd);
-}
-
Deleted: trunk/packages/clustalw/trunk/trees.c
===================================================================
--- trunk/packages/clustalw/trunk/trees.c 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/trees.c 2007-08-12 15:08:40 UTC (rev 399)
@@ -1,2166 +0,0 @@
-/* Phyle of filogenetic tree calculating functions for CLUSTAL W */
-/* DES was here FEB. 1994 */
-
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-#include <math.h>
-#include "clustalw.h"
-#include "dayhoff.h" /* set correction for amino acid distances >= 75% */
-
-
-/*
- * Prototypes
- */
-Boolean transition(sint base1, sint base2);
-void tree_gap_delete(void);
-void distance_matrix_output(FILE *ofile);
-void nj_tree(char **tree_description, FILE *tree);
-void compare_tree(char **tree1, char **tree2, sint *hits, sint n);
-void print_phylip_tree(char **tree_description, FILE *tree, sint bootstrap);
-void print_nexus_tree(char **tree_description, FILE *tree, sint bootstrap);
-sint two_way_split(char **tree_description, FILE *tree, sint start_row, sint flag, sint bootstrap);
-sint two_way_split_nexus(char **tree_description, FILE *tree, sint start_row, sint flag, sint bootstrap);
-void print_tree(char **tree_description, FILE *tree, sint *totals);
-static Boolean is_ambiguity(char c);
-static void overspill_message(sint overspill,sint total_dists);
-
-
-/*
- * Global variables
- */
-
-extern sint max_names;
-
-extern double **tmat; /* general nxn array of reals; allocated from main */
- /* this is used as a distance matrix */
-extern Boolean dnaflag; /* TRUE for DNA seqs; FALSE for proteins */
-extern Boolean tossgaps; /* Ignore places in align. where ANY seq. has a gap*/
-extern Boolean kimura; /* Use correction for multiple substitutions */
-extern Boolean output_tree_clustal; /* clustal text output for trees */
-extern Boolean output_tree_phylip; /* phylip nested parentheses format */
-extern Boolean output_tree_distances; /* phylip distance matrix */
-extern Boolean output_tree_nexus; /* nexus format tree */
-extern Boolean output_pim; /* perc identity matrix output Ramu */
-
-extern sint bootstrap_format; /* bootstrap file format */
-extern Boolean empty; /* any sequences in memory? */
-extern Boolean usemenu; /* interactive (TRUE) or command line (FALSE) */
-extern sint nseqs;
-extern sint max_aln_length;
-extern sint *seqlen_array; /* the lengths of the sequences */
-extern char **seq_array; /* the sequences */
-extern char **names; /* the seq. names */
-extern char seqname[]; /* name of input file */
-extern sint gap_pos1,gap_pos2;
-extern Boolean use_ambiguities;
-extern char *amino_acid_codes;
-
-static double *av;
-static double *left_branch, *right_branch;
-static double *save_left_branch, *save_right_branch;
-static sint *boot_totals;
-static sint *tkill;
-/*
- The next line is a fossil from the days of using the cc ran()
-static int ran_factor;
-*/
-static sint *boot_positions;
-static FILE *phylip_phy_tree_file;
-static FILE *clustal_phy_tree_file;
-static FILE *distances_phy_tree_file;
-static FILE *nexus_phy_tree_file;
-static FILE *pim_file; /* Ramu */
-static Boolean verbose;
-static char *tree_gaps;
-static sint first_seq, last_seq;
- /* array of weights; 1 for use this posn.; 0 don't */
-
-extern sint boot_ntrials; /* number of bootstrap trials */
-extern unsigned sint boot_ran_seed; /* random number generator seed */
-
-void phylogenetic_tree(char *phylip_name,char *clustal_name,char *dist_name, char *nexus_name, char *pim_name)
-/*
- Calculate a tree using the distances in the nseqs*nseqs array tmat.
- This is the routine for getting the REAL trees after alignment.
-
- Each *_name argument is the output file name for one output type and is
- only used when the matching output_* flag is set.  A non-empty name is
- opened with open_explicit_file(); an empty one goes through
- open_output_file() with a prompt, the path of seqname and a default
- extension (nj, ph, dst, tre, pim).
-
- For every requested output the distance matrix is recomputed with
- dna_distance_matrix() or prot_distance_matrix() according to dnaflag;
- nj_tree() is then run once and the tree is printed in the requested
- formats.  All sequences 1..nseqs are used.
-
- The function returns early, after reporting an error, when no alignment
- is loaded or it has fewer than 2 sequences, and silently when an output
- file cannot be opened.
- NOTE(review): the early returns on a failed open do not close output
- files that were opened just above - looks like a stream leak; confirm.
-*/
-{ char path[FILENAMELEN+1];
- sint i, j;
- sint overspill = 0;
- sint total_dists;
- static char **standard_tree;
- static char **save_tree;
- char lin2[10]; /* not used anywhere in this function */
-
- if(empty) {
- error("You must load an alignment first");
- return;
- }
-
- if(nseqs<2) {
- error("Alignment has only %d sequences",nseqs);
- return;
- }
- first_seq=1;
- last_seq=nseqs;
-
- get_path(seqname,path);
-
-if(output_tree_clustal) {
- if (clustal_name[0]!=EOS) {
- if((clustal_phy_tree_file = open_explicit_file(
- clustal_name))==NULL) return;
- }
- else {
- if((clustal_phy_tree_file = open_output_file(
- "\nEnter name for CLUSTAL tree output file ",path,
- clustal_name,"nj")) == NULL) return;
- }
-}
-
-if(output_tree_phylip) {
- if (phylip_name[0]!=EOS) {
- if((phylip_phy_tree_file = open_explicit_file(
- phylip_name))==NULL) return;
- }
- else {
- if((phylip_phy_tree_file = open_output_file(
- "\nEnter name for PHYLIP tree output file ",path,
- phylip_name,"ph")) == NULL) return;
- }
-}
-
-if(output_tree_distances)
-{
- if (dist_name[0]!=EOS) {
- if((distances_phy_tree_file = open_explicit_file(
- dist_name))==NULL) return;
- }
- else {
- if((distances_phy_tree_file = open_output_file(
- "\nEnter name for distance matrix output file ",path,
- dist_name,"dst")) == NULL) return;
- }
-}
-
-if(output_tree_nexus)
-{
- if (nexus_name[0]!=EOS) {
- if((nexus_phy_tree_file = open_explicit_file(
- nexus_name))==NULL) return;
- }
- else {
- if((nexus_phy_tree_file = open_output_file(
- "\nEnter name for NEXUS tree output file ",path,
- nexus_name,"tre")) == NULL) return;
- }
-}
-
-if(output_pim)
-{
- if (pim_name[0]!=EOS) {
- if((pim_file = open_explicit_file(
- pim_name))==NULL) return;
- }
- else {
- if((pim_file = open_output_file(
- "\nEnter name for % Identity matrix output file ",path,
- pim_name,"pim")) == NULL) return;
- }
-}
-
- boot_positions = (sint *)ckalloc( (seqlen_array[first_seq]+2) * sizeof (sint) );
-
- for(j=1; j<=seqlen_array[first_seq]; ++j) /* identity mapping: no resampling of columns here */
- boot_positions[j] = j;
-
- if(output_tree_clustal) {
- verbose = TRUE; /* Turn on file output */
- if(dnaflag)
- overspill = dna_distance_matrix(clustal_phy_tree_file);
- else
- overspill = prot_distance_matrix(clustal_phy_tree_file);
- }
-
- if(output_tree_phylip) {
- verbose = FALSE; /* Turn off file output */
- if(dnaflag)
- overspill = dna_distance_matrix(phylip_phy_tree_file);
- else
- overspill = prot_distance_matrix(phylip_phy_tree_file);
- }
-
- if(output_tree_nexus) {
- verbose = FALSE; /* Turn off file output */
- if(dnaflag)
- overspill = dna_distance_matrix(nexus_phy_tree_file);
- else
- overspill = prot_distance_matrix(nexus_phy_tree_file);
- }
-
- if(output_pim) { /* Ramu */
- verbose = FALSE; /* Turn off file output */
- if(dnaflag) /* both branches make the same call */
- calc_percidentity(pim_file);
- else
- calc_percidentity(pim_file);
- }
-
-
- if(output_tree_distances) {
- verbose = FALSE; /* Turn off file output */
- if(dnaflag)
- overspill = dna_distance_matrix(distances_phy_tree_file);
- else
- overspill = prot_distance_matrix(distances_phy_tree_file);
- distance_matrix_output(distances_phy_tree_file);
- }
-
-/* check if any distances overflowed the distance corrections */
- if ( overspill > 0 ) { /* overspill holds the result of the last matrix computed above */
- total_dists = (nseqs*(nseqs-1))/2;
- overspill_message(overspill,total_dists);
- }
-
- if(output_tree_clustal) verbose = TRUE; /* Turn on file output */
-
- standard_tree = (char **) ckalloc( (nseqs+1) * sizeof (char *) );
- for(i=0; i<nseqs+1; i++)
- standard_tree[i] = (char *) ckalloc( (nseqs+1) * sizeof(char) );
- save_tree = (char **) ckalloc( (nseqs+1) * sizeof (char *) );
- for(i=0; i<nseqs+1; i++)
- save_tree[i] = (char *) ckalloc( (nseqs+1) * sizeof(char) );
-
- if(output_tree_clustal || output_tree_phylip || output_tree_nexus)
- nj_tree(standard_tree,clustal_phy_tree_file);
-
- for(i=1; i<nseqs+1; i++) /* keep a copy of the tree description before printing it */
- for(j=1; j<nseqs+1; j++)
- save_tree[i][j] = standard_tree[i][j];
-
- if(output_tree_phylip)
- print_phylip_tree(standard_tree,phylip_phy_tree_file,0);
-
- for(i=1; i<nseqs+1; i++) /* restore the description from the copy for the next printer */
- for(j=1; j<nseqs+1; j++)
- standard_tree[i][j] = save_tree[i][j];
-
- if(output_tree_nexus)
- print_nexus_tree(standard_tree,nexus_phy_tree_file,0);
-
-/*
- print_tree(standard_tree,phy_tree_file);
-*/
- tree_gaps=ckfree((void *)tree_gaps);
- boot_positions=ckfree((void *)boot_positions);
- if (left_branch != NULL) left_branch=ckfree((void *)left_branch);
- if (right_branch != NULL) right_branch=ckfree((void *)right_branch);
- if (tkill != NULL) tkill=ckfree((void *)tkill);
- if (av != NULL) av=ckfree((void *)av);
- for (i=0;i<nseqs+1;i++)
- standard_tree[i]=ckfree((void *)standard_tree[i]);
- standard_tree=ckfree((void *)standard_tree);
-
- for (i=0;i<nseqs+1;i++)
- save_tree[i]=ckfree((void *)save_tree[i]);
- save_tree=ckfree((void *)save_tree);
-
-if(output_tree_clustal) {
- fclose(clustal_phy_tree_file);
- info("Phylogenetic tree file created: [%s]",clustal_name);
-}
-
-if(output_tree_phylip) {
- fclose(phylip_phy_tree_file);
- info("Phylogenetic tree file created: [%s]",phylip_name);
-}
-
-if(output_tree_distances) {
- fclose(distances_phy_tree_file);
- info("Distance matrix file created: [%s]",dist_name);
-}
-
-if(output_tree_nexus) {
- fclose(nexus_phy_tree_file);
- info("Nexus tree file created: [%s]",nexus_name);
-}
-
-if(output_pim) {
- fclose(pim_file);
- info(" perc identity matrix file created: [%s]",pim_name);
-}
-
-}
-
-/*
- * overspill_message - warn that `overspill` of the `total_dists` pairwise
- * distances were out of range for the distance correction, and list the
- * suggested remedies.  The text is assembled in a local buffer and handed
- * to warning().
- */
-static void overspill_message(sint overspill,sint total_dists)
-{
-    static const char *advice[] = {
-        "\n were out of range for the distance correction.",
-        "\n",
-        "\n SUGGESTIONS: 1) remove the most distant sequences",
-        "\n or 2) use the PHYLIP package",
-        "\n or 3) turn off the correction.",
-        "\n Note: Use option 3 with caution! With this degree",
-        "\n of divergence you will have great difficulty",
-        "\n getting robust and reliable trees.",
-        "\n\n"
-    };
-    char err_mess[1024]="";
-    size_t k;
-
-    sprintf(err_mess,"%d of the distances out of a total of %d",
-            (pint)overspill,(pint)total_dists);
-    for (k = 0; k < sizeof advice / sizeof advice[0]; k++)
-        strcat(err_mess,advice[k]);
-
-    warning(err_mess);
-}
-
-
-
-Boolean transition(sint base1, sint base2) /* TRUE if transition; else FALSE */
-/*
-
-   Tests the pair against the residue codes
-       a,A = 0;  c,C = 2;  g,G = 6;  t,T,u,U = 17
-   (the second of the two code tables listed in the original comment; the
-   other one, a=0 c=1 g=2 t=3 N=4, is not what this code checks).
-
-   A <--> G and T <--> C are transitions; all others are transversions.
-
-*/
-{
-    Boolean a_g, t_c;
-
-    a_g = ((base1 == 0) && (base2 == 6)) || ((base1 == 6) && (base2 == 0));
-    t_c = ((base1 ==17) && (base2 == 2)) || ((base1 == 2) && (base2 ==17));
-
-    if (a_g || t_c)
-        return TRUE;
-    return FALSE;
-}
-
-
-/*
- * tree_gap_delete - flag all positions in the alignment that have a gap
- * in ANY sequence.
- *
- * Allocates tree_gaps (max_aln_length+1 chars) and, for every position
- * 1..seqlen_array[first_seq], sets tree_gaps[posn] to 1 when any sequence
- * first_seq..last_seq holds gap_pos1 or gap_pos2 there, else to 0.
- */
-void tree_gap_delete(void)
-{
-    sint col;
-    sint s;
-    char res;
-
-    tree_gaps = (char *)ckalloc( (max_aln_length+1) * sizeof (char) );
-
-    for (col = 1; col <= seqlen_array[first_seq]; ++col) {
-        tree_gaps[col] = 0;
-        for (s = first_seq; s <= last_seq; ++s) {
-            res = seq_array[s][col];
-            if (res == gap_pos1 || res == gap_pos2) {
-                tree_gaps[col] = 1;
-                break;        /* one gap is enough to flag the column */
-            }
-        }
-    }
-}
-
-/*
- * distance_matrix_output - write the distance matrix tmat to ofile.
- *
- * The first line holds the number of sequences (last_seq-first_seq+1).
- * Each sequence then gets its name, left-justified in a field of
- * max_names characters, followed by its row of distances printed as
- * "%6.3f "; the row is wrapped after every 8th value unless that value
- * is the last one.
- *
- * The count is now converted to pint as a whole (the old expression
- * "(pint)last_seq-first_seq+1" converted only last_seq, leaving an sint
- * argument for %d), and the '*' field width is passed as pint too.
- */
-void distance_matrix_output(FILE *ofile)
-{
-    sint i,j;
-    sint n = last_seq-first_seq+1;    /* number of sequences in the matrix */
-
-    fprintf(ofile,"%6d",(pint)n);
-    for(i=1;i<=n;i++) {
-        fprintf(ofile,"\n%-*s ",(pint)max_names,names[i]);
-        for(j=1;j<=n;j++) {
-            fprintf(ofile,"%6.3f ",tmat[i][j]);
-            if(j % 8 == 0 && j != n) {
-                /* continuation line for the same row */
-                fprintf(ofile,"\n");
-                fprintf(ofile," ");
-            }
-        }
-    }
-}
-
-
-
-#ifdef ORIGINAL_NJ_TREE
-void nj_tree(char **tree_description, FILE *tree) /* neighbor-joining over tmat; this version is compiled only when ORIGINAL_NJ_TREE is defined */
-{ /* tree_description[cycle][seq] receives the group membership of each cycle; tree receives the report when verbose is set */
- register int i;
- sint l[4],nude,k;
- sint nc,mini,minj,j,ii,jj;
- double fnseqs,fnseqs2=0,sumd;
- double diq,djq,dij,d2r,dr,dio,djo,da;
- double tmin,total,dmin;
- double bi,bj,b1,b2,b3,branch[4];
- sint typei,typej; /* 0 = node; 1 = OTU */
-/* fnseqs is the number of entries still to be joined; it is decremented once per cycle. */
- fnseqs = (double)last_seq-first_seq+1;
-
-/*********************** First initialisation ***************************/
-
- if(verbose) {
- fprintf(tree,"\n\n\t\t\tNeighbor-joining Method\n");
- fprintf(tree,"\n Saitou, N. and Nei, M. (1987)");
- fprintf(tree," The Neighbor-joining Method:");
- fprintf(tree,"\n A New Method for Reconstructing Phylogenetic Trees.");
- fprintf(tree,"\n Mol. Biol. Evol., 4(4), 406-425\n");
- fprintf(tree,"\n\n This is an UNROOTED tree\n");
- fprintf(tree,"\n Numbers in parentheses are branch lengths\n\n");
- }
-/* Two sequences: nothing to join. Returns before any of the work arrays below is allocated. */
- if (fnseqs == 2) {
- if (verbose) fprintf(tree,"Cycle 1 = SEQ: 1 (%9.5f) joins SEQ: 2 (%9.5f)",tmat[first_seq][first_seq+1],tmat[first_seq][first_seq+1]);
- return;
- }
-
- mini = minj = 0;
-/* The work arrays are declared outside this function; they are allocated here and not freed here. */
- left_branch = (double *) ckalloc( (nseqs+2) * sizeof (double) );
- right_branch = (double *) ckalloc( (nseqs+2) * sizeof (double) );
- tkill = (sint *) ckalloc( (nseqs+1) * sizeof (sint) );
- av = (double *) ckalloc( (nseqs+1) * sizeof (double) );
-
- for(i=1;i<=last_seq-first_seq+1;++i)
- {
- tmat[i][i] = av[i] = 0.0;
- tkill[i] = 0;
- }
-
-/*********************** Enter The Main Cycle ***************************/
-/* One join per cycle; the loop stops when three entries are left. */
- /* for(nc=1; nc<=(last_seq-first_seq+1-3); ++nc) { */ /**start main cycle**/
- for(nc=1; nc<=(last_seq-first_seq+1-3); ++nc) {
- sumd = 0.0;
- for(j=2; j<=last_seq-first_seq+1; ++j)
- for(i=1; i<j; ++i) {
- tmat[j][i] = tmat[i][j]; /* mirror the upper triangle into the lower one */
- sumd = sumd + tmat[i][j]; /* sumd = sum over the upper triangle */
- }
-/* 99999.0 is the starting value for the minimum search below. */
- tmin = 99999.0;
-
-/*.................compute SMATij values and find the smallest one ........*/
-
- for(jj=2; jj<=last_seq-first_seq+1; ++jj)
- if(tkill[jj] != 1)
- for(ii=1; ii<jj; ++ii)
- if(tkill[ii] != 1) {
- diq = djq = 0.0;
-
- for(i=1; i<=last_seq-first_seq+1; ++i) {
- diq = diq + tmat[i][ii];
- djq = djq + tmat[i][jj];
- }
-
- dij = tmat[ii][jj];
- d2r = diq + djq - (2.0*dij);
- dr = sumd - dij -d2r;
- fnseqs2 = fnseqs - 2.0;
- total= d2r+ fnseqs2*dij +dr*2.0;
- total= total / (2.0*fnseqs2);
-
- if(total < tmin) { /* strict '<': the first pair reaching the minimum is kept */
- tmin = total;
- mini = ii;
- minj = jj;
- }
- }
-
-
-/*.................compute branch lengths and print the results ........*/
-
-
- dio = djo = 0.0;
- for(i=1; i<=last_seq-first_seq+1; ++i) {
- dio = dio + tmat[i][mini];
- djo = djo + tmat[i][minj];
- }
-
- dmin = tmat[mini][minj];
- dio = (dio - dmin) / fnseqs2;
- djo = (djo - dmin) / fnseqs2;
- bi = (dmin + dio - djo) * 0.5;
- bj = dmin - bi;
- bi = bi - av[mini];
- bj = bj - av[minj];
-
- if( av[mini] > 0.0 ) /* av[] > 0 marks an entry that was produced by an earlier join */
- typei = 0;
- else
- typei = 1;
- if( av[minj] > 0.0 )
- typej = 0;
- else
- typej = 1;
-
- if(verbose)
- fprintf(tree,"\n Cycle%4d = ",(pint)nc);
-
-/*
- set tiny branch lengths (magnitude below 0.0001, either sign) to zero.
- Negative lengths of larger magnitude are left as computed.
-*/ if( fabs(bi) < 0.0001) bi = 0.0;
- if( fabs(bj) < 0.0001) bj = 0.0;
-
- if(verbose) {
- if(typei == 0)
- fprintf(tree,"Node:%4d (%9.5f) joins ",(pint)mini,bi);
- else
- fprintf(tree," SEQ:%4d (%9.5f) joins ",(pint)mini,bi);
-
- if(typej == 0)
- fprintf(tree,"Node:%4d (%9.5f)",(pint)minj,bj);
- else
- fprintf(tree," SEQ:%4d (%9.5f)",(pint)minj,bj);
-
- fprintf(tree,"\n");
- }
-
-/* Record this cycle: the two branch lengths, then a 0/1 membership row in tree_description[nc]. */
- left_branch[nc] = bi;
- right_branch[nc] = bj;
-
- for(i=1; i<=last_seq-first_seq+1; i++)
- tree_description[nc][i] = 0;
-
- if(typei == 0) { /* mini is a node: copy the members of the most recent earlier row that contains it */
- for(i=nc-1; i>=1; i--)
- if(tree_description[i][mini] == 1) {
- for(j=1; j<=last_seq-first_seq+1; j++)
- if(tree_description[i][j] == 1)
- tree_description[nc][j] = 1;
- break;
- }
- }
- else
- tree_description[nc][mini] = 1;
-
- if(typej == 0) { /* same for minj */
- for(i=nc-1; i>=1; i--)
- if(tree_description[i][minj] == 1) {
- for(j=1; j<=last_seq-first_seq+1; j++)
- if(tree_description[i][j] == 1)
- tree_description[nc][j] = 1;
- break;
- }
- }
- else
- tree_description[nc][minj] = 1;
-
-
-/*
- Here is where the -0.00005 branch lengths come from for 3 or more
- identical seqs.
-*/
-/* if(dmin <= 0.0) dmin = 0.0001; */
- if(dmin <= 0.0) dmin = 0.000001; /* presumably keeps av[mini] > 0 so the entry is later treated as a node - confirm */
- av[mini] = dmin * 0.5;
-
-/*........................Re-initialisation................................*/
-
- fnseqs = fnseqs - 1.0;
- tkill[minj] = 1;
-/* mini now stands for the joined pair: its distance to each live j becomes the mean of the two old distances. */
- for(j=1; j<=last_seq-first_seq+1; ++j)
- if( tkill[j] != 1 ) {
- da = ( tmat[mini][j] + tmat[minj][j] ) * 0.5;
- if( (mini - j) < 0 ) /* store with the smaller index first (upper triangle); nothing is stored when j == mini */
- tmat[mini][j] = da;
- if( (mini - j) > 0)
- tmat[j][mini] = da;
- }
-
- for(j=1; j<=last_seq-first_seq+1; ++j)
- tmat[minj][j] = tmat[j][minj] = 0.0;
-
-
-/****/ } /**end main cycle**/
-
-/******************************Last Cycle (3 Seqs. left)********************/
-/* The indices of the entries not yet killed go into l[1..3]; three such entries are expected here. */
- nude = 1;
-
- for(i=1; i<=last_seq-first_seq+1; ++i)
- if( tkill[i] != 1 ) {
- l[nude] = i;
- nude = nude + 1;
- }
-
- b1 = (tmat[l[1]][l[2]] + tmat[l[1]][l[3]] - tmat[l[2]][l[3]]) * 0.5;
- b2 = tmat[l[1]][l[2]] - b1;
- b3 = tmat[l[1]][l[3]] - b1;
-
- branch[1] = b1 - av[l[1]];
- branch[2] = b2 - av[l[2]];
- branch[3] = b3 - av[l[3]];
-
-/* Reset tiny negative and positive branch lengths to zero */
- if( fabs(branch[1]) < 0.0001) branch[1] = 0.0;
- if( fabs(branch[2]) < 0.0001) branch[2] = 0.0;
- if( fabs(branch[3]) < 0.0001) branch[3] = 0.0;
-
- left_branch[last_seq-first_seq+1-2] = branch[1];
- left_branch[last_seq-first_seq+1-1] = branch[2];
- left_branch[last_seq-first_seq+1] = branch[3];
-
- for(i=1; i<=last_seq-first_seq+1; i++)
- tree_description[last_seq-first_seq+1-2][i] = 0;
-
- if(verbose)
- fprintf(tree,"\n Cycle%4d (Last cycle, trichotomy):\n",(pint)nc);
-/* The final row labels each sequence 1, 2 or 3 according to which of the three remaining groups it belongs to. */
- for(i=1; i<=3; ++i) {
- if( av[l[i]] > 0.0) {
- if(verbose)
- fprintf(tree,"\n\t\t Node:%4d (%9.5f) ",(pint)l[i],branch[i]);
- for(k=last_seq-first_seq+1-3; k>=1; k--)
- if(tree_description[k][l[i]] == 1) {
- for(j=1; j<=last_seq-first_seq+1; j++)
- if(tree_description[k][j] == 1)
- tree_description[last_seq-first_seq+1-2][j] = i;
- break;
- }
- }
- else {
- if(verbose)
- fprintf(tree,"\n\t\t SEQ:%4d (%9.5f) ",(pint)l[i],branch[i]);
- tree_description[last_seq-first_seq+1-2][l[i]] = i;
- }
- if(i < 3) {
- if(verbose)
- fprintf(tree,"joins");
- }
- }
-
- if(verbose)
- fprintf(tree,"\n");
-
-}
-
-#else /* ORIGINAL_NJ_TREE */
-
-void nj_tree(char **tree_description, FILE *tree) { /* used when ORIGINAL_NJ_TREE is not defined: forwards both arguments to fast_nj_tree() */
- void fast_nj_tree(); /* NOTE(review): declaration without a parameter list, so the call below is not type-checked */
-
- /*fprintf(stderr, "****** call fast_nj_tree() !!!! ******\n");*/
- fast_nj_tree(tree_description, tree);
-}
-
-
-/****************************************************************************
- * [ Improvement ideas in fast_nj_tree() ] by DDBJ & FUJITSU Limited.
- * written by Tadashi Koike
- * (takoike at genes.nig.ac.jp)
- *******************
- * <IMPROVEMENT 1> : Store the value of sum of the score to temporary array,
- * and use again and again.
- *
- * In the main cycle, these are calculated again and again :
- * diq = sum of tmat[n][ii] (n:1 to last_seq-first_seq+1),
- * djq = sum of tmat[n][jj] (n:1 to last_seq-first_seq+1),
- * dio = sum of tmat[n][mini] (n:1 to last_seq-first_seq+1),
- * djo = sum of tmat[n][minj] (n:1 to last_seq-first_seq+1)
- * // 'last_seq' and 'first_seq' are both constant values //
- * and the result of above calculations is always same until
- * a best pair of neighbour nodes is joined.
- *
- * So, we change the logic to calculate the sum[i] (=sum of tmat[n][i]
- * (n:1 to last_seq-first_seq+1)) and store it to array, before
- * beginning to find a best pair of neighbour nodes, and after that
- * we use them again and again.
- *
- * tmat[i][j]
- * 1 2 3 4 5
- * +---+---+---+---+---+
- * 1 | | | | | |
- * +---+---+---+---+---+
- * 2 | | | | | | 1) calculate sum of tmat[n][i]
- * +---+---+---+---+---+ (n: 1 to last_seq-first_seq+1)
- * 3 | | | | | | 2) store that sum value to sum[i]
- * +---+---+---+---+---+
- * 4 | | | | | | 3) use sum[i] during finding a best
- * +---+---+---+---+---+ pair of neighbour nodes.
- * 5 | | | | | |
- * +---+---+---+---+---+
- * | | | | |
- * V V V V V Calculate sum , and store it to sum[i]
- * +---+---+---+---+---+
- * sum[i] | | | | | |
- * +---+---+---+---+---+
- *
- * At this time, we thought that we use upper triangle of the matrix
- * because tmat[i][j] is equal to tmat[j][i] and tmat[i][i] is equal
- * to zero. Therefore, we prepared sum_rows[i] and sum_cols[i] instead
- * of sum[i] for storing the sum value.
- *
- * tmat[i][j]
- * 1 2 3 4 5 sum_cols[i]
- * +---+---+---+---+---+ +---+
- * 1 | # | # | # | # | --> | | ... sum of tmat[1][2..5]
- * + - +---+---+---+---+ +---+
- * 2 | # | # | # | --> | | ... sum of tmat[2][3..5]
- * + - + - +---+---+---+ +---+
- * 3 | # | # | --> | | ... sum of tmat[3][4..5]
- * + - + - + - +---+---+ +---+
- * 4 | # | --> | | ... sum of tmat[4][5]
- * + - + - + - + - +---+ +---+
- * 5 | --> | | ... zero
- * + - + - + - + - + - + +---+
- * | | | | |
- * V V V V V Calculate sum, store to sum_rows[i]
- * +---+---+---+---+---+
- * sum_rows[i] | | | | | |
- * +---+---+---+---+---+
- * | | | | |
- * | | | | +----- sum of tmat[1..4][5]
- * | | | +--------- sum of tmat[1..3][4]
- * | | +------------- sum of tmat[1..2][3]
- * | +----------------- sum of tmat[1][2]
- * +--------------------- zero
- *
- * And we use (sum_rows[i] + sum_cols[i]) instead of sum[i].
- *
- *******************
- * <IMPROVEMENT 2> : We manage valid nodes with chain list, instead of
- * tkill[i] flag array.
- *
- * In the original logic, nodes that become invalid (killed) by a join
- * are tracked with the tkill[i] flag array (set to 1 when killed).
- * With that method the search loops still visit every killed node only
- * to skip it, which is wasteful in the later cycles of the search.
- *
- * So, we thought that we managed valid nodes by using a chain list
- * as below:
- *
- * 1) declare the list structure.
- * struct {
- * sint n; // entry number of node.
- * void *prev; // pointer to previous entry.
- * void *next; // pointer to next entry.
- * }
- * 2) construct a valid node list.
- *
- * +-----+ +-----+ +-----+ +-----+ +-----+
- * NULL<-|prev |<---|prev |<---|prev |<---|prev |<- - - -|prev |
- * | 0 | | 1 | | 2 | | 3 | | n |
- * | next|--->| next|--->| next|--->| next|- - - ->| next|->NULL
- * +-----+ +-----+ +-----+ +-----+ +-----+
- *
- * 3) when finding a best pair of neighbor nodes, we use
- * this chain list as loop counter.
- *
- * 4) If an entry was killed by node-joining, this chain list is
- * modified to remove that entry.
- *
- * EX) remove the entry No 2.
- * +-----+ +-----+ +-----+ +-----+
- * NULL<-|prev |<---|prev |<--------------|prev |<- - - -|prev |
- * | 0 | | 1 | | 3 | | n |
- * | next|--->| next|-------------->| next|- - - ->| next|->NULL
- * +-----+ +-----+ +-----+ +-----+
- * +-----+
- * NULL<-|prev |
- * | 2 |
- * | next|->NULL
- * +-----+
- *
- * By this method, the later cycles of the search for a best pair of
- * neighbor nodes become faster.
- *
- *******************
- * <IMPROVEMENT 3> : Cut the frequency of division.
- *
- * At comparison between 'total' and 'tmin' in the main cycle, total is
- * divided by (2.0*fnseqs2) before comparison. If N nodes are available,
- * that division happen (N*(N-1))/2 order.
- *
- * Comparing total/(2.0*fnseqs2) against tmin gives the same result as
- * comparing total against (tmin*2.0*fnseqs2), because fnseqs2 is positive.
- * (tmin*2.0*fnseqs2) needs to be calculated only once per cycle, so we stop
- * dividing each total and scale the initial tmin by (2.0*fnseqs2) instead.
- *
- *******************
- * <IMPROVEMENT 4> : some transformation of the equation (to cut operations).
- *
- * We transform an equation of calculating 'total' in the main cycle.
- *
- */
-
-
-void fast_nj_tree(char **tree_description, FILE *tree) /* neighbor-joining over tmat using cached row/column sums and a linked list of live nodes */
-{ /* tree_description[cycle][seq] receives the group membership of each cycle; tree receives the report when verbose is set */
- register int i;
- sint l[4],nude,k;
- sint nc,mini,minj,j,ii,jj;
- double fnseqs,fnseqs2=0,sumd;
- double diq,djq,dij,d2r,dr,dio,djo,da; /* d2r and dr are not used in this version */
- double tmin,total,dmin;
- double bi,bj,b1,b2,b3,branch[4];
- sint typei,typej; /* 0 = node; 1 = OTU */
-
- /* IMPROVEMENT 1, STEP 0 : declare variables */
- double *sum_cols, *sum_rows, *join;
-
- /* IMPROVEMENT 2, STEP 0 : declare variables */
- sint loop_limit;
- typedef struct _ValidNodeID {
- sint n;
- struct _ValidNodeID *prev;
- struct _ValidNodeID *next;
- } ValidNodeID;
- ValidNodeID *tvalid, *lpi, *lpj, *lpii, *lpjj, *lp_prev, *lp_next;
-
- /*
- * correspondence of the loop counter variables.
- * i .. lpi->n, ii .. lpii->n
- * j .. lpj->n, jj .. lpjj->n
- */
-/* fnseqs is the number of entries still to be joined; it is decremented once per cycle. */
- fnseqs = (double)last_seq-first_seq+1;
-
-/*********************** First initialisation ***************************/
-
- if(verbose) {
- fprintf(tree,"\n\n\t\t\tNeighbor-joining Method\n");
- fprintf(tree,"\n Saitou, N. and Nei, M. (1987)");
- fprintf(tree," The Neighbor-joining Method:");
- fprintf(tree,"\n A New Method for Reconstructing Phylogenetic Trees.");
- fprintf(tree,"\n Mol. Biol. Evol., 4(4), 406-425\n");
- fprintf(tree,"\n\n This is an UNROOTED tree\n");
- fprintf(tree,"\n Numbers in parentheses are branch lengths\n\n");
- }
-/* Two sequences: nothing to join. Returns before any of the arrays below is allocated. */
- if (fnseqs == 2) {
- if (verbose) fprintf(tree,"Cycle 1 = SEQ: 1 (%9.5f) joins SEQ: 2 (%9.5f)",tmat[first_seq][first_seq+1],tmat[first_seq][first_seq+1]);
- return;
- }
-
- mini = minj = 0;
-/* These four arrays are declared outside this function; they are allocated here and not freed here. */
- left_branch = (double *) ckalloc( (nseqs+2) * sizeof (double) );
- right_branch = (double *) ckalloc( (nseqs+2) * sizeof (double) );
- tkill = (sint *) ckalloc( (nseqs+1) * sizeof (sint) ); /* still written below, but never read in this version */
- av = (double *) ckalloc( (nseqs+1) * sizeof (double) );
-
- /* IMPROVEMENT 1, STEP 1 : Allocate memory */
- sum_cols = (double *) ckalloc( (nseqs+1) * sizeof (double) );
- sum_rows = (double *) ckalloc( (nseqs+1) * sizeof (double) );
- join = (double *) ckalloc( (nseqs+1) * sizeof (double) );
-
- /* IMPROVEMENT 2, STEP 1 : Allocate memory */
- tvalid = (ValidNodeID *) ckalloc( (nseqs+1) * sizeof (ValidNodeID) );
- /* tvalid[0] is a special entry: it is the list header and its next pointer is the first valid entry */
- tvalid[0].n = 0;
- tvalid[0].prev = NULL;
- tvalid[0].next = &tvalid[1];
-
- /* IMPROVEMENT 2, STEP 2 : Construct and initialize the entry chain list */
- for(i=1, loop_limit = last_seq-first_seq+1,
- lpi=&tvalid[1], lp_prev=&tvalid[0], lp_next=&tvalid[2] ;
- i<=loop_limit ;
- ++i, ++lpi, ++lp_prev, ++lp_next)
- {
- tmat[i][i] = av[i] = 0.0;
- tkill[i] = 0;
- lpi->n = i;
- lpi->prev = lp_prev;
- lpi->next = lp_next; /* for the last entry this points past it; corrected to NULL after the loop */
-
- /* IMPROVEMENT 1, STEP 2 : Initialize arrays */
- sum_cols[i] = sum_rows[i] = join[i] = 0.0;
- }
- tvalid[loop_limit].next = NULL;
-
- /*
- * IMPROVEMENT 1, STEP 3 : Calculate the sum of score value that
- * is sequence[i] to others.
- */
- sumd = 0.0; /* reassigned at the top of every main cycle */
- for (lpj=tvalid[0].next ; lpj!=NULL ; lpj = lpj->next) {
- double tmp_sum = 0.0;
- j = lpj->n;
- /* calculate sum_rows[j] */
- for (lpi=tvalid[0].next ; lpi->n < j ; lpi = lpi->next) {
- i = lpi->n;
- tmp_sum += tmat[i][j];
- /* tmat[j][i] = tmat[i][j]; */
- }
- sum_rows[j] = tmp_sum;
-
- tmp_sum = 0.0;
- /* Advance lpi past entry j so that lpi->n is greater than j */
- if ((lpi != NULL) && (lpi->n == j)) {
- lpi = lpi->next;
- }
- /* calculate sum_cols[j] */
- for( ; lpi!=NULL ; lpi = lpi->next) {
- i = lpi->n;
- tmp_sum += tmat[j][i];
- /* tmat[i][j] = tmat[j][i]; */
- }
- sum_cols[j] = tmp_sum;
- }
-
-/*********************** Enter The Main Cycle ***************************/
-/* One join per cycle; the loop stops when three entries are left. */
- for(nc=1, loop_limit = (last_seq-first_seq+1-3); nc<=loop_limit; ++nc) {
-
- sumd = 0.0;
- /* IMPROVEMENT 1, STEP 4 : use sum value */
- for(lpj=tvalid[0].next ; lpj!=NULL ; lpj = lpj->next) {
- sumd += sum_cols[lpj->n];
- }
-
- /* IMPROVEMENT 3, STEP 0 : multiply tmin and 2*fnseqs2 */
- fnseqs2 = fnseqs - 2.0; /* Set fnseqs2 at this point. */
- tmin = 99999.0 * 2.0 * fnseqs2;
-
-
-/*.................compute SMATij values and find the smallest one ........*/
-
- mini = minj = 0;
-
- /* jj must start at 2 or more: skip the first entry when it is entry 1 */
- if ((tvalid[0].next != NULL) && (tvalid[0].next->n == 1)) {
- lpjj = tvalid[0].next->next;
- } else {
- lpjj = tvalid[0].next;
- }
-
- for( ; lpjj != NULL; lpjj = lpjj->next) {
- jj = lpjj->n;
- for(lpii=tvalid[0].next ; lpii->n < jj ; lpii = lpii->next) {
- ii = lpii->n;
- diq = djq = 0.0;
-
- /* IMPROVEMENT 1, STEP 4 : use sum value */
- diq = sum_cols[ii] + sum_rows[ii];
- djq = sum_cols[jj] + sum_rows[jj];
- /*
- * always ii < jj in this point. Use upper
- * triangle of score matrix.
- */
- dij = tmat[ii][jj];
-
- /*
- * IMPROVEMENT 3, STEP 1 : fnseqs2 is
- * already calculated.
- */
- /* fnseqs2 = fnseqs - 2.0 */
-
- /* IMPROVEMENT 4 : transform the equation */
- /*-------------------------------------------------------------------*
- * OPTIMIZE of expression 'total = d2r + fnseqs2*dij + dr*2.0' *
- * total = d2r + fnseq2*dij + 2.0*dr *
- * = d2r + fnseq2*dij + 2(sumd - dij - d2r) *
- * = d2r + fnseq2*dij + 2*sumd - 2*dij - 2*d2r *
- * = fnseq2*dij + 2*sumd - 2*dij - 2*d2r + d2r *
- * = fnseq2*dij + 2*sumd - 2*dij - d2r *
- * = fnseq2*dij + 2*sumd - 2*dij - (diq + djq - 2*dij) *
- * = fnseq2*dij + 2*sumd - 2*dij - diq - djq + 2*dij *
- * = fnseq2*dij + 2*sumd - 2*dij + 2*dij - diq - djq *
- * = fnseq2*dij + 2*sumd - diq - djq *
- *-------------------------------------------------------------------*/
- total = fnseqs2*dij + 2.0*sumd - diq - djq;
-
- /*
- * IMPROVEMENT 3, STEP 2 : omit
- * the division on comparison between
- * total and tmin.
- */
- /* total = total / (2.0*fnseqs2); */
-
- if(total < tmin) { /* strict '<': the first pair reaching the minimum is kept */
- tmin = total;
- mini = ii;
- minj = jj;
- }
- }
- }
-
- /* MEMO: always ii < jj in the above loop, so mini < minj */
-/* NOTE(review): if no pair is below the starting tmin, mini and minj stay 0 and the unlink below dereferences tvalid[0].prev, which is NULL. */
-/*.................compute branch lengths and print the results ........*/
-
-
- dio = djo = 0.0;
-
- /* IMPROVEMENT 1, STEP 4 : use sum value */
- dio = sum_cols[mini] + sum_rows[mini];
- djo = sum_cols[minj] + sum_rows[minj];
-
- dmin = tmat[mini][minj];
- dio = (dio - dmin) / fnseqs2;
- djo = (djo - dmin) / fnseqs2;
- bi = (dmin + dio - djo) * 0.5;
- bj = dmin - bi;
- bi = bi - av[mini];
- bj = bj - av[minj];
-
- if( av[mini] > 0.0 ) /* av[] > 0 marks an entry that was produced by an earlier join */
- typei = 0;
- else
- typei = 1;
- if( av[minj] > 0.0 )
- typej = 0;
- else
- typej = 1;
-
- if(verbose)
- fprintf(tree,"\n Cycle%4d = ",(pint)nc);
-
-/*
- set tiny branch lengths (magnitude below 0.0001, either sign) to zero.
- Negative lengths of larger magnitude are left as computed.
-*/ if( fabs(bi) < 0.0001) bi = 0.0;
- if( fabs(bj) < 0.0001) bj = 0.0;
-
- if(verbose) {
- if(typei == 0)
- fprintf(tree,"Node:%4d (%9.5f) joins ",(pint)mini,bi);
- else
- fprintf(tree," SEQ:%4d (%9.5f) joins ",(pint)mini,bi);
-
- if(typej == 0)
- fprintf(tree,"Node:%4d (%9.5f)",(pint)minj,bj);
- else
- fprintf(tree," SEQ:%4d (%9.5f)",(pint)minj,bj);
-
- fprintf(tree,"\n");
- }
-
-/* Record this cycle: the two branch lengths, then a 0/1 membership row in tree_description[nc]. */
- left_branch[nc] = bi;
- right_branch[nc] = bj;
-
- for(i=1; i<=last_seq-first_seq+1; i++)
- tree_description[nc][i] = 0;
-
- if(typei == 0) { /* mini is a node: copy the members of the most recent earlier row that contains it */
- for(i=nc-1; i>=1; i--)
- if(tree_description[i][mini] == 1) {
- for(j=1; j<=last_seq-first_seq+1; j++)
- if(tree_description[i][j] == 1)
- tree_description[nc][j] = 1;
- break;
- }
- }
- else
- tree_description[nc][mini] = 1;
-
- if(typej == 0) { /* same for minj */
- for(i=nc-1; i>=1; i--)
- if(tree_description[i][minj] == 1) {
- for(j=1; j<=last_seq-first_seq+1; j++)
- if(tree_description[i][j] == 1)
- tree_description[nc][j] = 1;
- break;
- }
- }
- else
- tree_description[nc][minj] = 1;
-
-
-/*
- Here is where the -0.00005 branch lengths come from for 3 or more
- identical seqs.
-*/
-/* if(dmin <= 0.0) dmin = 0.0001; */
- if(dmin <= 0.0) dmin = 0.000001; /* presumably keeps av[mini] > 0 so the entry is later treated as a node - confirm */
- av[mini] = dmin * 0.5;
-
-/*........................Re-initialisation................................*/
-
- fnseqs = fnseqs - 1.0;
- tkill[minj] = 1;
-
- /* IMPROVEMENT 2, STEP 3 : Remove tvalid[minj] from chain list. */
- /* [ Before ]
- * +---------+ +---------+ +---------+
- * |prev |<-------|prev |<-------|prev |<---
- * | n | | n(=minj)| | n |
- * | next|------->| next|------->| next|----
- * +---------+ +---------+ +---------+
- *
- * [ After ]
- * +---------+ +---------+
- * |prev |<--------------------------|prev |<---
- * | n | | n |
- * | next|-------------------------->| next|----
- * +---------+ +---------+
- * +---------+
- * NULL---|prev |
- * | n(=minj)|
- * | next|---NULL
- * +---------+
- */
- (tvalid[minj].prev)->next = tvalid[minj].next;
- if (tvalid[minj].next != NULL) {
- (tvalid[minj].next)->prev = tvalid[minj].prev;
- }
- tvalid[minj].prev = tvalid[minj].next = NULL;
-
- /* IMPROVEMENT 1, STEP 5 : re-calculate sum values. */
- for(lpj=tvalid[0].next ; lpj != NULL ; lpj = lpj->next) {
- double tmp_di = 0.0;
- double tmp_dj = 0.0;
- j = lpj->n;
-
- /*
- * subtract a score value related with 'minj' from
- * sum arrays .
- */
- if (j < minj) {
- tmp_dj = tmat[j][minj];
- sum_cols[j] -= tmp_dj;
- } else if (j > minj) {
- tmp_dj = tmat[minj][j];
- sum_rows[j] -= tmp_dj;
- } /* nothing to do when j is equal to minj. */
-
-
- /*
- * subtract a score value related with 'mini' from
- * sum arrays .
- */
- if (j < mini) {
- tmp_di = tmat[j][mini];
- sum_cols[j] -= tmp_di;
- } else if (j > mini) {
- tmp_di = tmat[mini][j];
- sum_rows[j] -= tmp_di;
- } /* nothing to do when j is equal to mini. */
-
- /*
- * calculate a score value of the new inner node.
- * then, store it temporary to join[] array.
- */
- join[j] = (tmp_dj + tmp_di) * 0.5;
- }
-
- /*
- * 1)
- * Set the score values (stored in join[]) into the matrix,
- * row/column position is 'mini'.
- * 2)
- * Add a score value of the new inner node to sum arrays.
- */
- for(lpj=tvalid[0].next ; lpj != NULL; lpj = lpj->next) {
- j = lpj->n;
- if (j < mini) {
- tmat[j][mini] = join[j];
- sum_cols[j] += join[j];
- } else if (j > mini) {
- tmat[mini][j] = join[j];
- sum_rows[j] += join[j];
- } /* nothing to do when j is equal to mini. */
- }
-
- /* Re-calculate sum_rows[mini],sum_cols[mini]. */
- sum_cols[mini] = sum_rows[mini] = 0.0;
-
- /* calculate sum_rows[mini] */
- da = 0.0;
- for(lpj=tvalid[0].next ; lpj->n < mini ; lpj = lpj->next) {
- da += join[lpj->n];
- }
- sum_rows[mini] = da;
-
- /* skip if 'lpj->n' is equal to 'mini' */
- if ((lpj != NULL) && (lpj->n == mini)) {
- lpj = lpj->next;
- }
-
- /* calculate sum_cols[mini] */
- da = 0.0;
- for( ; lpj != NULL; lpj = lpj->next) {
- da += join[lpj->n];
- }
- sum_cols[mini] = da;
-
- /*
- * Clean up sum_rows[minj], sum_cols[minj] and score matrix
- * related with 'minj'.
- */
- sum_cols[minj] = sum_rows[minj] = 0.0;
- for(j=1; j<=last_seq-first_seq+1; ++j)
- tmat[minj][j] = tmat[j][minj] = join[j] = 0.0;
-
-
-/****/ } /**end main cycle**/
-
-/******************************Last Cycle (3 Seqs. left)********************/
-/* The entries still on the list go into l[1..3]; three such entries are expected here. */
- nude = 1;
-
- for(lpi=tvalid[0].next; lpi != NULL; lpi = lpi->next) {
- l[nude] = lpi->n;
- ++nude;
- }
-
- b1 = (tmat[l[1]][l[2]] + tmat[l[1]][l[3]] - tmat[l[2]][l[3]]) * 0.5;
- b2 = tmat[l[1]][l[2]] - b1;
- b3 = tmat[l[1]][l[3]] - b1;
-
- branch[1] = b1 - av[l[1]];
- branch[2] = b2 - av[l[2]];
- branch[3] = b3 - av[l[3]];
-
-/* Reset tiny negative and positive branch lengths to zero */
- if( fabs(branch[1]) < 0.0001) branch[1] = 0.0;
- if( fabs(branch[2]) < 0.0001) branch[2] = 0.0;
- if( fabs(branch[3]) < 0.0001) branch[3] = 0.0;
-
- left_branch[last_seq-first_seq+1-2] = branch[1];
- left_branch[last_seq-first_seq+1-1] = branch[2];
- left_branch[last_seq-first_seq+1] = branch[3];
-
- for(i=1; i<=last_seq-first_seq+1; i++)
- tree_description[last_seq-first_seq+1-2][i] = 0;
-
- if(verbose)
- fprintf(tree,"\n Cycle%4d (Last cycle, trichotomy):\n",(pint)nc);
-/* The final row labels each sequence 1, 2 or 3 according to which of the three remaining groups it belongs to. */
- for(i=1; i<=3; ++i) {
- if( av[l[i]] > 0.0) {
- if(verbose)
- fprintf(tree,"\n\t\t Node:%4d (%9.5f) ",(pint)l[i],branch[i]);
- for(k=last_seq-first_seq+1-3; k>=1; k--)
- if(tree_description[k][l[i]] == 1) {
- for(j=1; j<=last_seq-first_seq+1; j++)
- if(tree_description[k][j] == 1)
- tree_description[last_seq-first_seq+1-2][j] = i;
- break;
- }
- }
- else {
- if(verbose)
- fprintf(tree,"\n\t\t SEQ:%4d (%9.5f) ",(pint)l[i],branch[i]);
- tree_description[last_seq-first_seq+1-2][l[i]] = i;
- }
- if(i < 3) {
- if(verbose)
- fprintf(tree,"joins");
- }
- }
-
- if(verbose)
- fprintf(tree,"\n");
-
-/* Only the arrays local to this function are released; left_branch, right_branch, tkill and av stay allocated. */
- /* IMPROVEMENT 1, STEP 6 : release memory area */
- ckfree(sum_cols);
- ckfree(sum_rows);
- ckfree(join);
-
- /* IMPROVEMENT 2, STEP 4 : release memory area */
- ckfree(tvalid);
-
-}
-#endif /* ORIGINAL_NJ_TREE */
-
-
-
-void bootstrap_tree(char *phylip_name,char *clustal_name, char *nexus_name)
-{
- sint i,j;
- int ranno;
- char path[MAXLINE+1];
- char dummy[10];
- char err_mess[1024];
- static char **sample_tree;
- static char **standard_tree;
- static char **save_tree;
- sint total_dists, overspill = 0, total_overspill = 0;
- sint nfails = 0;
-
- if(empty) {
- error("You must load an alignment first");
- return;
- }
-
- if(nseqs<4) {
- error("Alignment has only %d sequences",nseqs);
- return;
- }
-
- if(!output_tree_clustal && !output_tree_phylip && !output_tree_nexus) {
- error("You must select either clustal or phylip or nexus tree output format");
- return;
- }
- get_path(seqname, path);
-
- if (output_tree_clustal) {
- if (clustal_name[0]!=EOS) {
- if((clustal_phy_tree_file = open_explicit_file(
- clustal_name))==NULL) return;
- }
- else {
- if((clustal_phy_tree_file = open_output_file(
- "\nEnter name for bootstrap output file ",path,
- clustal_name,"njb")) == NULL) return;
- }
- }
-
- first_seq=1;
- last_seq=nseqs;
-
- if (output_tree_phylip) {
- if (phylip_name[0]!=EOS) {
- if((phylip_phy_tree_file = open_explicit_file(
- phylip_name))==NULL) return;
- }
- else {
- if((phylip_phy_tree_file = open_output_file(
- "\nEnter name for bootstrap output file ",path,
- phylip_name,"phb")) == NULL) return;
- }
- }
-
- if (output_tree_nexus) {
- if (nexus_name[0]!=EOS) {
- if((nexus_phy_tree_file = open_explicit_file(
- nexus_name))==NULL) return;
- }
- else {
- if((nexus_phy_tree_file = open_output_file(
- "\nEnter name for bootstrap output file ",path,
- nexus_name,"treb")) == NULL) return;
- }
- }
-
- boot_totals = (sint *)ckalloc( (nseqs+1) * sizeof (sint) );
- for(i=0;i<nseqs+1;i++)
- boot_totals[i]=0;
-
- boot_positions = (sint *)ckalloc( (seqlen_array[first_seq]+2) * sizeof (sint) );
-
- for(j=1; j<=seqlen_array[first_seq]; ++j) /* First select all positions for */
- boot_positions[j] = j; /* the "standard" tree */
-
- if(output_tree_clustal) {
- verbose = TRUE; /* Turn on file output */
- if(dnaflag)
- overspill = dna_distance_matrix(clustal_phy_tree_file);
- else
- overspill = prot_distance_matrix(clustal_phy_tree_file);
- }
-
- if(output_tree_phylip) {
- verbose = FALSE; /* Turn off file output */
- if(dnaflag)
- overspill = dna_distance_matrix(phylip_phy_tree_file);
- else
- overspill = prot_distance_matrix(phylip_phy_tree_file);
- }
-
- if(output_tree_nexus) {
- verbose = FALSE; /* Turn off file output */
- if(dnaflag)
- overspill = dna_distance_matrix(nexus_phy_tree_file);
- else
- overspill = prot_distance_matrix(nexus_phy_tree_file);
- }
-
-/* check if any distances overflowed the distance corrections */
- if ( overspill > 0 ) {
- total_dists = (nseqs*(nseqs-1))/2;
- overspill_message(overspill,total_dists);
- }
-
- tree_gaps=ckfree((void *)tree_gaps);
-
- if (output_tree_clustal) verbose = TRUE; /* Turn on screen output */
-
- standard_tree = (char **) ckalloc( (nseqs+1) * sizeof (char *) );
- for(i=0; i<nseqs+1; i++)
- standard_tree[i] = (char *) ckalloc( (nseqs+1) * sizeof(char) );
-
-/* compute the standard tree */
-
- if(output_tree_clustal || output_tree_phylip || output_tree_nexus)
- nj_tree(standard_tree,clustal_phy_tree_file);
-
- if (output_tree_clustal)
- fprintf(clustal_phy_tree_file,"\n\n\t\t\tBootstrap Confidence Limits\n\n");
-
-/* save the left_branch and right_branch for phylip output */
- save_left_branch = (double *) ckalloc( (nseqs+2) * sizeof (double) );
- save_right_branch = (double *) ckalloc( (nseqs+2) * sizeof (double) );
- for (i=1;i<=nseqs;i++) {
- save_left_branch[i] = left_branch[i];
- save_right_branch[i] = right_branch[i];
- }
-/*
- The next line is a fossil from the days of using the cc ran()
- ran_factor = RAND_MAX / seqlen_array[first_seq];
-*/
-
- if(usemenu)
- boot_ran_seed =
-getint("\n\nEnter seed no. for random number generator ",1,1000,boot_ran_seed);
-
-/* do not use the native cc ran()
- srand(boot_ran_seed);
-*/
- addrandinit((unsigned long) boot_ran_seed);
-
- if (output_tree_clustal)
- fprintf(clustal_phy_tree_file,"\n Random number generator seed = %7u\n",
- boot_ran_seed);
-
- if(usemenu)
- boot_ntrials =
-getint("\n\nEnter number of bootstrap trials ",1,10000,boot_ntrials);
-
- if (output_tree_clustal) {
- fprintf(clustal_phy_tree_file,"\n Number of bootstrap trials = %7d\n",
- (pint)boot_ntrials);
-
- fprintf(clustal_phy_tree_file,
- "\n\n Diagrammatic representation of the above tree: \n");
- fprintf(clustal_phy_tree_file,"\n Each row represents 1 tree cycle;");
- fprintf(clustal_phy_tree_file," defining 2 groups.\n");
- fprintf(clustal_phy_tree_file,"\n Each column is 1 sequence; ");
- fprintf(clustal_phy_tree_file,"the stars in each line show 1 group; ");
- fprintf(clustal_phy_tree_file,"\n the dots show the other\n");
- fprintf(clustal_phy_tree_file,"\n Numbers show occurences in bootstrap samples.");
- }
-/*
- print_tree(standard_tree, clustal_phy_tree_file, boot_totals);
-*/
- verbose = FALSE; /* Turn OFF screen output */
-
- left_branch=ckfree((void *)left_branch);
- right_branch=ckfree((void *)right_branch);
- tkill=ckfree((void *)tkill);
- av=ckfree((void *)av);
-
- sample_tree = (char **) ckalloc( (nseqs+1) * sizeof (char *) );
- for(i=0; i<nseqs+1; i++)
- sample_tree[i] = (char *) ckalloc( (nseqs+1) * sizeof(char) );
-
- if (usemenu)
- fprintf(stdout,"\n\nEach dot represents 10 trials\n\n");
- total_overspill = 0;
- nfails = 0;
- for(i=1; i<=boot_ntrials; ++i) {
- for(j=1; j<=seqlen_array[first_seq]; ++j) { /* select alignment */
- /* positions for */
- ranno = addrand( (unsigned long) seqlen_array[1]) + 1;
- boot_positions[j] = ranno; /* bootstrap sample */
- }
- if(output_tree_clustal) {
- if(dnaflag)
- overspill = dna_distance_matrix(clustal_phy_tree_file);
- else
- overspill = prot_distance_matrix(clustal_phy_tree_file);
- }
-
- if(output_tree_phylip) {
- if(dnaflag)
- overspill = dna_distance_matrix(phylip_phy_tree_file);
- else
- overspill = prot_distance_matrix(phylip_phy_tree_file);
- }
-
- if(output_tree_nexus) {
- if(dnaflag)
- overspill = dna_distance_matrix(nexus_phy_tree_file);
- else
- overspill = prot_distance_matrix(nexus_phy_tree_file);
- }
-
- if( overspill > 0) {
- total_overspill = total_overspill + overspill;
- nfails++;
- }
-
- tree_gaps=ckfree((void *)tree_gaps);
-
- if(output_tree_clustal || output_tree_phylip || output_tree_nexus)
- nj_tree(sample_tree,clustal_phy_tree_file);
-
- left_branch=ckfree((void *)left_branch);
- right_branch=ckfree((void *)right_branch);
- tkill=ckfree((void *)tkill);
- av=ckfree((void *)av);
-
- compare_tree(standard_tree, sample_tree, boot_totals, last_seq-first_seq+1);
- if (usemenu) {
- if(i % 10 == 0) fprintf(stdout,".");
- if(i % 100 == 0) fprintf(stdout,"\n");
- }
- }
-
-/* check if any distances overflowed the distance corrections */
- if ( nfails > 0 ) {
- total_dists = (nseqs*(nseqs-1))/2;
- fprintf(stdout,"\n");
- fprintf(stdout,"\n WARNING: %ld of the distances out of a total of %ld times %ld",
- (long)total_overspill,(long)total_dists,(long)boot_ntrials);
- fprintf(stdout,"\n were out of range for the distance correction.");
- fprintf(stdout,"\n This affected %d out of %d bootstrap trials.",
- (pint)nfails,(pint)boot_ntrials);
- fprintf(stdout,"\n This may not be fatal but you have been warned!");
- fprintf(stdout,"\n");
- fprintf(stdout,"\n SUGGESTIONS: 1) turn off the correction");
- fprintf(stdout,"\n or 2) remove the most distant sequences");
- fprintf(stdout,"\n or 3) use the PHYLIP package.");
- fprintf(stdout,"\n\n");
- if (usemenu)
- getstr("Press [RETURN] to continue",10,dummy);
- }
-
-
- boot_positions=ckfree((void *)boot_positions);
-
- for (i=1;i<nseqs+1;i++)
- sample_tree[i]=ckfree((void *)sample_tree[i]);
- sample_tree=ckfree((void *)sample_tree);
-/*
- fprintf(clustal_phy_tree_file,"\n\n Bootstrap totals for each group\n");
-*/
- if (output_tree_clustal)
- print_tree(standard_tree, clustal_phy_tree_file, boot_totals);
-
- save_tree = (char **) ckalloc( (nseqs+1) * sizeof (char *) );
- for(i=0; i<nseqs+1; i++)
- save_tree[i] = (char *) ckalloc( (nseqs+1) * sizeof(char) );
-
- for(i=1; i<nseqs+1; i++)
- for(j=1; j<nseqs+1; j++)
- save_tree[i][j] = standard_tree[i][j];
-
- if(output_tree_phylip) {
- left_branch = (double *) ckalloc( (nseqs+2) * sizeof (double) );
- right_branch = (double *) ckalloc( (nseqs+2) * sizeof (double) );
- for (i=1;i<=nseqs;i++) {
- left_branch[i] = save_left_branch[i];
- right_branch[i] = save_right_branch[i];
- }
- print_phylip_tree(standard_tree,phylip_phy_tree_file,
- bootstrap_format);
- left_branch=ckfree((void *)left_branch);
- right_branch=ckfree((void *)right_branch);
- }
-
- for(i=1; i<nseqs+1; i++)
- for(j=1; j<nseqs+1; j++)
- standard_tree[i][j] = save_tree[i][j];
-
- if(output_tree_nexus) {
- left_branch = (double *) ckalloc( (nseqs+2) * sizeof (double) );
- right_branch = (double *) ckalloc( (nseqs+2) * sizeof (double) );
- for (i=1;i<=nseqs;i++) {
- left_branch[i] = save_left_branch[i];
- right_branch[i] = save_right_branch[i];
- }
- print_nexus_tree(standard_tree,nexus_phy_tree_file,
- bootstrap_format);
- left_branch=ckfree((void *)left_branch);
- right_branch=ckfree((void *)right_branch);
- }
-
- boot_totals=ckfree((void *)boot_totals);
- save_left_branch=ckfree((void *)save_left_branch);
- save_right_branch=ckfree((void *)save_right_branch);
-
- for (i=1;i<nseqs+1;i++)
- standard_tree[i]=ckfree((void *)standard_tree[i]);
- standard_tree=ckfree((void *)standard_tree);
-
- for (i=0;i<nseqs+1;i++)
- save_tree[i]=ckfree((void *)save_tree[i]);
- save_tree=ckfree((void *)save_tree);
-
- if (output_tree_clustal)
- fclose(clustal_phy_tree_file);
-
- if (output_tree_phylip)
- fclose(phylip_phy_tree_file);
-
- if (output_tree_nexus)
- fclose(nexus_phy_tree_file);
-
- if (output_tree_clustal)
- info("Bootstrap output file completed [%s]"
- ,clustal_name);
- if (output_tree_phylip)
- info("Bootstrap output file completed [%s]"
- ,phylip_name);
- if (output_tree_nexus)
- info("Bootstrap output file completed [%s]"
- ,nexus_name);
-}
-
-
-void compare_tree(char **tree1, char **tree2, sint *hits, sint n)
-{
- sint i,j,k;
- sint nhits1, nhits2;
-
- for(i=1; i<=n-3; i++) {
- for(j=1; j<=n-3; j++) {
- nhits1 = 0;
- nhits2 = 0;
- for(k=1; k<=n; k++) {
- if(tree1[i][k] == tree2[j][k]) nhits1++;
- if(tree1[i][k] != tree2[j][k]) nhits2++;
- }
- if((nhits1 == last_seq-first_seq+1) || (nhits2 == last_seq-first_seq+1)) hits[i]++;
- }
- }
-}
-
-
-void print_nexus_tree(char **tree_description, FILE *tree, sint bootstrap)
-{
- sint i;
- sint old_row;
-
- fprintf(tree,"#NEXUS\n\n");
-
- fprintf(tree,"BEGIN TREES;\n\n");
- fprintf(tree,"\tTRANSLATE\n");
- for(i=1;i<nseqs;i++) {
- fprintf(tree,"\t\t%d %s,\n",(pint)i,names[i]);
- }
- fprintf(tree,"\t\t%d %s\n",(pint)nseqs,names[nseqs]);
- fprintf(tree,"\t\t;\n");
-
- fprintf(tree,"\tUTREE PAUP_1= ");
-
- if(last_seq-first_seq+1==2) {
- fprintf(tree,"(%d:%7.5f,%d:%7.5f);",first_seq,tmat[first_seq][first_seq+1],first_seq+1,tmat[first_seq][first_seq+1]);
- }
- else {
-
- fprintf(tree,"(");
-
- old_row=two_way_split_nexus(tree_description, tree, last_seq-first_seq+1-2,1,bootstrap);
- fprintf(tree,":%7.5f",left_branch[last_seq-first_seq+1-2]);
- if ((bootstrap==BS_BRANCH_LABELS) && (old_row>0) && (boot_totals[old_row]>0))
- fprintf(tree,"[%d]",(pint)boot_totals[old_row]);
- fprintf(tree,",");
-
- old_row=two_way_split_nexus(tree_description, tree, last_seq-first_seq+1-2,2,bootstrap);
- fprintf(tree,":%7.5f",left_branch[last_seq-first_seq+1-1]);
- if ((bootstrap==BS_BRANCH_LABELS) && (old_row>0) && (boot_totals[old_row]>0))
- fprintf(tree,"[%d]",(pint)boot_totals[old_row]);
- fprintf(tree,",");
-
- old_row=two_way_split_nexus(tree_description, tree, last_seq-first_seq+1-2,3,bootstrap);
- fprintf(tree,":%7.5f",left_branch[last_seq-first_seq+1]);
- if ((bootstrap==BS_BRANCH_LABELS) && (old_row>0) && (boot_totals[old_row]>0))
- fprintf(tree,"[%d]",(pint)boot_totals[old_row]);
- fprintf(tree,")");
- if (bootstrap==BS_NODE_LABELS) fprintf(tree,"TRICHOTOMY");
- fprintf(tree,";");
- }
- fprintf(tree,"\nENDBLOCK;\n");
-}
-
-
-sint two_way_split_nexus
-(char **tree_description, FILE *tree, sint start_row, sint flag, sint bootstrap)
-{
- sint row, new_row = 0, old_row, col, test_col = 0;
- Boolean single_seq;
-
- if(start_row != last_seq-first_seq+1-2) fprintf(tree,"(");
-
- for(col=1; col<=last_seq-first_seq+1; col++) {
- if(tree_description[start_row][col] == flag) {
- test_col = col;
- break;
- }
- }
-
- single_seq = TRUE;
- for(row=start_row-1; row>=1; row--)
- if(tree_description[row][test_col] == 1) {
- single_seq = FALSE;
- new_row = row;
- break;
- }
-
- if(single_seq) {
- tree_description[start_row][test_col] = 0;
- fprintf(tree,"%d",test_col+first_seq-1);
- if(start_row == last_seq-first_seq+1-2) {
- return(0);
- }
-
- fprintf(tree,":%7.5f,",left_branch[start_row]);
- }
- else {
- for(col=1; col<=last_seq-first_seq+1; col++) {
- if((tree_description[start_row][col]==1)&&
- (tree_description[new_row][col]==1))
- tree_description[start_row][col] = 0;
- }
- old_row=two_way_split_nexus(tree_description, tree, new_row, (sint)1, bootstrap);
- if(start_row == last_seq-first_seq+1-2) {
- return(new_row);
- }
-
- fprintf(tree,":%7.5f",left_branch[start_row]);
- if ((bootstrap==BS_BRANCH_LABELS) && (boot_totals[old_row]>0))
- fprintf(tree,"[%d]",(pint)boot_totals[old_row]);
-
- fprintf(tree,",");
- }
-
-
- for(col=1; col<=last_seq-first_seq+1; col++)
- if(tree_description[start_row][col] == flag) {
- test_col = col;
- break;
- }
-
- single_seq = TRUE;
- new_row = 0;
- for(row=start_row-1; row>=1; row--)
- if(tree_description[row][test_col] == 1) {
- single_seq = FALSE;
- new_row = row;
- break;
- }
-
- if(single_seq) {
- tree_description[start_row][test_col] = 0;
- fprintf(tree,"%d",test_col+first_seq-1);
- fprintf(tree,":%7.5f)",right_branch[start_row]);
- }
- else {
- for(col=1; col<=last_seq-first_seq+1; col++) {
- if((tree_description[start_row][col]==1)&&
- (tree_description[new_row][col]==1))
- tree_description[start_row][col] = 0;
- }
- old_row=two_way_split_nexus(tree_description, tree, new_row, (sint)1, bootstrap);
- fprintf(tree,":%7.5f",right_branch[start_row]);
- if ((bootstrap==BS_BRANCH_LABELS) && (boot_totals[old_row]>0))
- fprintf(tree,"[%d]",(pint)boot_totals[old_row]);
-
- fprintf(tree,")");
- }
- if ((bootstrap==BS_NODE_LABELS) && (boot_totals[start_row]>0))
- fprintf(tree,"%d",(pint)boot_totals[start_row]);
-
- return(start_row);
-}
-
-
-void print_phylip_tree(char **tree_description, FILE *tree, sint bootstrap)
-{
- sint old_row;
-
- if(last_seq-first_seq+1==2) {
- fprintf(tree,"(%s:%7.5f,%s:%7.5f);",names[first_seq],tmat[first_seq][first_seq+1],names[first_seq+1],tmat[first_seq][first_seq+1]);
- return;
- }
-
- fprintf(tree,"(\n");
-
- old_row=two_way_split(tree_description, tree, last_seq-first_seq+1-2,1,bootstrap);
- fprintf(tree,":%7.5f",left_branch[last_seq-first_seq+1-2]);
- if ((bootstrap==BS_BRANCH_LABELS) && (old_row>0) && (boot_totals[old_row]>0))
- fprintf(tree,"[%d]",(pint)boot_totals[old_row]);
- fprintf(tree,",\n");
-
- old_row=two_way_split(tree_description, tree, last_seq-first_seq+1-2,2,bootstrap);
- fprintf(tree,":%7.5f",left_branch[last_seq-first_seq+1-1]);
- if ((bootstrap==BS_BRANCH_LABELS) && (old_row>0) && (boot_totals[old_row]>0))
- fprintf(tree,"[%d]",(pint)boot_totals[old_row]);
- fprintf(tree,",\n");
-
- old_row=two_way_split(tree_description, tree, last_seq-first_seq+1-2,3,bootstrap);
- fprintf(tree,":%7.5f",left_branch[last_seq-first_seq+1]);
- if ((bootstrap==BS_BRANCH_LABELS) && (old_row>0) && (boot_totals[old_row]>0))
- fprintf(tree,"[%d]",(pint)boot_totals[old_row]);
- fprintf(tree,")");
- if (bootstrap==BS_NODE_LABELS) fprintf(tree,"TRICHOTOMY");
- fprintf(tree,";\n");
-}
-
-
-sint two_way_split
-(char **tree_description, FILE *tree, sint start_row, sint flag, sint bootstrap)
-{
- sint row, new_row = 0, old_row, col, test_col = 0;
- Boolean single_seq;
-
- if(start_row != last_seq-first_seq+1-2) fprintf(tree,"(\n");
-
- for(col=1; col<=last_seq-first_seq+1; col++) {
- if(tree_description[start_row][col] == flag) {
- test_col = col;
- break;
- }
- }
-
- single_seq = TRUE;
- for(row=start_row-1; row>=1; row--)
- if(tree_description[row][test_col] == 1) {
- single_seq = FALSE;
- new_row = row;
- break;
- }
-
- if(single_seq) {
- tree_description[start_row][test_col] = 0;
- fprintf(tree,"%.*s",max_names,names[test_col+first_seq-1]);
- if(start_row == last_seq-first_seq+1-2) {
- return(0);
- }
-
- fprintf(tree,":%7.5f,\n",left_branch[start_row]);
- }
- else {
- for(col=1; col<=last_seq-first_seq+1; col++) {
- if((tree_description[start_row][col]==1)&&
- (tree_description[new_row][col]==1))
- tree_description[start_row][col] = 0;
- }
- old_row=two_way_split(tree_description, tree, new_row, (sint)1, bootstrap);
- if(start_row == last_seq-first_seq+1-2) {
- return(new_row);
- }
-
- fprintf(tree,":%7.5f",left_branch[start_row]);
- if ((bootstrap==BS_BRANCH_LABELS) && (boot_totals[old_row]>0))
- fprintf(tree,"[%d]",(pint)boot_totals[old_row]);
-
- fprintf(tree,",\n");
- }
-
-
- for(col=1; col<=last_seq-first_seq+1; col++)
- if(tree_description[start_row][col] == flag) {
- test_col = col;
- break;
- }
-
- single_seq = TRUE;
- new_row = 0;
- for(row=start_row-1; row>=1; row--)
- if(tree_description[row][test_col] == 1) {
- single_seq = FALSE;
- new_row = row;
- break;
- }
-
- if(single_seq) {
- tree_description[start_row][test_col] = 0;
- fprintf(tree,"%.*s",max_names,names[test_col+first_seq-1]);
- fprintf(tree,":%7.5f)\n",right_branch[start_row]);
- }
- else {
- for(col=1; col<=last_seq-first_seq+1; col++) {
- if((tree_description[start_row][col]==1)&&
- (tree_description[new_row][col]==1))
- tree_description[start_row][col] = 0;
- }
- old_row=two_way_split(tree_description, tree, new_row, (sint)1, bootstrap);
- fprintf(tree,":%7.5f",right_branch[start_row]);
- if ((bootstrap==BS_BRANCH_LABELS) && (boot_totals[old_row]>0))
- fprintf(tree,"[%d]",(pint)boot_totals[old_row]);
-
- fprintf(tree,")\n");
- }
- if ((bootstrap==BS_NODE_LABELS) && (boot_totals[start_row]>0))
- fprintf(tree,"%d",(pint)boot_totals[start_row]);
-
- return(start_row);
-}
-
-
-
-void print_tree(char **tree_description, FILE *tree, sint *totals)
-{
- sint row,col;
-
- fprintf(tree,"\n");
-
- for(row=1; row<=last_seq-first_seq+1-3; row++) {
- fprintf(tree," \n");
- for(col=1; col<=last_seq-first_seq+1; col++) {
- if(tree_description[row][col] == 0)
- fprintf(tree,"*");
- else
- fprintf(tree,".");
- }
- if(totals[row] > 0)
- fprintf(tree,"%7d",(pint)totals[row]);
- }
- fprintf(tree," \n");
- for(col=1; col<=last_seq-first_seq+1; col++)
- fprintf(tree,"%1d",(pint)tree_description[last_seq-first_seq+1-2][col]);
- fprintf(tree,"\n");
-}
-
-
-
-sint dna_distance_matrix(FILE *tree)
-{
- sint m,n;
- sint j,i;
- sint res1, res2;
- sint overspill = 0;
- double p,q,e,a,b,k;
-
- tree_gap_delete(); /* flag positions with gaps (tree_gaps[i] = 1 ) */
-
- if(verbose) {
- fprintf(tree,"\n");
- fprintf(tree,"\n DIST = percentage divergence (/100)");
- fprintf(tree,"\n p = rate of transition (A <-> G; C <-> T)");
- fprintf(tree,"\n q = rate of transversion");
- fprintf(tree,"\n Length = number of sites used in comparison");
- fprintf(tree,"\n");
- if(tossgaps) {
- fprintf(tree,"\n All sites with gaps (in any sequence) deleted!");
- fprintf(tree,"\n");
- }
- if(kimura) {
- fprintf(tree,"\n Distances corrected by Kimura's 2 parameter model:");
- fprintf(tree,"\n\n Kimura, M. (1980)");
- fprintf(tree," A simple method for estimating evolutionary ");
- fprintf(tree,"rates of base");
- fprintf(tree,"\n substitutions through comparative studies of ");
- fprintf(tree,"nucleotide sequences.");
- fprintf(tree,"\n J. Mol. Evol., 16, 111-120.");
- fprintf(tree,"\n\n");
- }
- }
-
- for(m=1; m<last_seq-first_seq+1; ++m) /* for every pair of sequence */
- for(n=m+1; n<=last_seq-first_seq+1; ++n) {
- p = q = e = 0.0;
- tmat[m][n] = tmat[n][m] = 0.0;
- for(i=1; i<=seqlen_array[first_seq]; ++i) {
- j = boot_positions[i];
- if(tossgaps && (tree_gaps[j] > 0) )
- goto skip; /* gap position */
- res1 = seq_array[m+first_seq-1][j];
- res2 = seq_array[n+first_seq-1][j];
- if( (res1 == gap_pos1) || (res1 == gap_pos2) ||
- (res2 == gap_pos1) || (res2 == gap_pos2))
- goto skip; /* gap in a seq*/
- if(!use_ambiguities)
- if( is_ambiguity(res1) || is_ambiguity(res2))
- goto skip; /* ambiguity code in a seq*/
- e = e + 1.0;
- if(res1 != res2) {
- if(transition(res1,res2))
- p = p + 1.0;
- else
- q = q + 1.0;
- }
- skip:;
- }
-
-
- /* Kimura's 2 parameter correction for multiple substitutions */
-
- if(!kimura) {
- if (e == 0) {
- fprintf(stdout,"\n WARNING: sequences %d and %d are non-overlapping\n",m,n);
- k = 0.0;
- p = 0.0;
- q = 0.0;
- }
- else {
- k = (p+q)/e;
- if(p > 0.0)
- p = p/e;
- else
- p = 0.0;
- if(q > 0.0)
- q = q/e;
- else
- q = 0.0;
- }
- tmat[m][n] = tmat[n][m] = k;
- if(verbose) /* if screen output */
- fprintf(tree,
- "%4d vs.%4d: DIST = %7.4f; p = %6.4f; q = %6.4f; length = %6.0f\n"
- ,(pint)m,(pint)n,k,p,q,e);
- }
- else {
- if (e == 0) {
- fprintf(stdout,"\n WARNING: sequences %d and %d are non-overlapping\n",m,n);
- p = 0.0;
- q = 0.0;
- }
- else {
- if(p > 0.0)
- p = p/e;
- else
- p = 0.0;
- if(q > 0.0)
- q = q/e;
- else
- q = 0.0;
- }
-
- if( ((2.0*p)+q) == 1.0 )
- a = 0.0;
- else
- a = 1.0/(1.0-(2.0*p)-q);
-
- if( q == 0.5 )
- b = 0.0;
- else
- b = 1.0/(1.0-(2.0*q));
-
-/* watch for values going off the scale for the correction. */
- if( (a<=0.0) || (b<=0.0) ) {
- overspill++;
- k = 3.5; /* arbitrary high score */
- }
- else
- k = 0.5*log(a) + 0.25*log(b);
- tmat[m][n] = tmat[n][m] = k;
- if(verbose) /* if screen output */
- fprintf(tree,
- "%4d vs.%4d: DIST = %7.4f; p = %6.4f; q = %6.4f; length = %6.0f\n"
- ,(pint)m,(pint)n,k,p,q,e);
-
- }
- }
- return overspill; /* return the number of off-scale values */
-}
-
-
-sint prot_distance_matrix(FILE *tree)
-{
- sint m,n;
- sint j,i;
- sint res1, res2;
- sint overspill = 0;
- double p,e,k, table_entry;
-
-
- tree_gap_delete(); /* flag positions with gaps (tree_gaps[i] = 1 ) */
-
- if(verbose) {
- fprintf(tree,"\n");
- fprintf(tree,"\n DIST = percentage divergence (/100)");
- fprintf(tree,"\n Length = number of sites used in comparison");
- fprintf(tree,"\n\n");
- if(tossgaps) {
- fprintf(tree,"\n All sites with gaps (in any sequence) deleted");
- fprintf(tree,"\n");
- }
- if(kimura) {
- fprintf(tree,"\n Distances up tp 0.75 corrected by Kimura's empirical method:");
- fprintf(tree,"\n\n Kimura, M. (1983)");
- fprintf(tree," The Neutral Theory of Molecular Evolution.");
- fprintf(tree,"\n Page 75. Cambridge University Press, Cambridge, England.");
- fprintf(tree,"\n\n");
- }
- }
-
- for(m=1; m<nseqs; ++m) /* for every pair of sequence */
- for(n=m+1; n<=nseqs; ++n) {
- p = e = 0.0;
- tmat[m][n] = tmat[n][m] = 0.0;
- for(i=1; i<=seqlen_array[1]; ++i) {
- j = boot_positions[i];
- if(tossgaps && (tree_gaps[j] > 0) ) goto skip; /* gap position */
- res1 = seq_array[m][j];
- res2 = seq_array[n][j];
- if( (res1 == gap_pos1) || (res1 == gap_pos2) ||
- (res2 == gap_pos1) || (res2 == gap_pos2))
- goto skip; /* gap in a seq*/
- e = e + 1.0;
- if(res1 != res2) p = p + 1.0;
- skip:;
- }
-
- if(p <= 0.0)
- k = 0.0;
- else
- k = p/e;
-
-/* DES debug */
-/* fprintf(stdout,"Seq1=%4d Seq2=%4d k =%7.4f \n",(pint)m,(pint)n,k); */
-/* DES debug */
-
- if(kimura) {
- if(k < 0.75) { /* use Kimura's formula */
- if(k > 0.0) k = - log(1.0 - k - (k * k/5.0) );
- }
- else {
- if(k > 0.930) {
- overspill++;
- k = 10.0; /* arbitrarily set to 1000% */
- }
- else {
- table_entry = (k*1000.0) - 750.0;
- k = (double)dayhoff_pams[(int)table_entry];
- k = k/100.0;
- }
- }
- }
-
- tmat[m][n] = tmat[n][m] = k;
- if(verbose) /* if screen output */
- fprintf(tree,
- "%4d vs.%4d DIST = %6.4f; length = %6.0f\n",
- (pint)m,(pint)n,k,e);
- }
- return overspill;
-}
-
-
-void guide_tree(FILE *tree,sint firstseq,sint numseqs)
-/*
- Routine for producing unrooted NJ trees from seperately aligned
- pairwise distances. This produces the GUIDE DENDROGRAMS in
- PHYLIP format.
-*/
-{
- static char **standard_tree;
- sint i;
- float dist;
-
- phylip_phy_tree_file=tree;
- verbose = FALSE;
- first_seq=firstseq;
- last_seq=first_seq+numseqs-1;
-
- if(numseqs==2) {
- dist=tmat[firstseq][firstseq+1]/2.0;
- fprintf(tree,"(%s:%0.5f,%s:%0.5f);\n",
- names[firstseq],dist,names[firstseq+1],dist);
- }
- else {
- standard_tree = (char **) ckalloc( (last_seq-first_seq+2) * sizeof (char *) );
- for(i=0; i<last_seq-first_seq+2; i++)
- standard_tree[i] = (char *) ckalloc( (last_seq-first_seq+2) * sizeof(char));
-
- nj_tree(standard_tree,clustal_phy_tree_file);
-
- print_phylip_tree(standard_tree,phylip_phy_tree_file,0);
-
- if(left_branch != NULL) left_branch=ckfree((void *)left_branch);
- if(right_branch != NULL) right_branch=ckfree((void *)right_branch);
- if(tkill != NULL) tkill=ckfree((void *)tkill);
- if(av != NULL) av=ckfree((void *)av);
- for (i=1;i<last_seq-first_seq+2;i++)
- standard_tree[i]=ckfree((void *)standard_tree[i]);
- standard_tree=ckfree((void *)standard_tree);
- }
- fclose(phylip_phy_tree_file);
-
-}
-
-static Boolean is_ambiguity(char c)
-{
- int i;
- char codes[]="ACGTU";
-
- if(use_ambiguities==TRUE)
- {
- return FALSE;
- }
-
- for(i=0;i<5;i++)
- if(amino_acid_codes[c]==codes[i])
- return FALSE;
-
- return TRUE;
-}
-
Deleted: trunk/packages/clustalw/trunk/util.c
===================================================================
--- trunk/packages/clustalw/trunk/util.c 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/util.c 2007-08-12 15:08:40 UTC (rev 399)
@@ -1,413 +0,0 @@
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <errno.h>
-#include <stdarg.h>
-#include <ctype.h>
-#include "clustalw.h"
-
-extern char **seq_array;
-extern sint *seqlen_array;
-extern char **names,**titles;
-extern sint *output_index;
-extern sint *seq_weight;
-extern double **tmat;
-
-
-/*
-* ckalloc()
-*
-* Tries to allocate "bytes" bytes of memory. Exits program if failed.
-* Return value:
-* Generic pointer to the newly allocated memory.
-*/
-
-void *ckalloc(size_t bytes)
-{
- register void *ret;
-
- if( (ret = calloc(bytes, sizeof(char))) == NULL)
-/*
- if( (ret = malloc(bytes)) == NULL)
-*/
- fatal("Out of memory\n");
- else
- return ret;
-
- return ret;
-}
-
-/*
-* ckrealloc()
-*
-* Tries to reallocate "bytes" bytes of memory. Exits program if failed.
-* Return value:
-* Generic pointer to the re-allocated memory.
-*/
-
-void *ckrealloc(void *ptr, size_t bytes)
-{
- register void *ret=NULL;
-
- if (ptr == NULL)
- fatal("Bad call to ckrealloc\n");
- else if( (ret = realloc(ptr, bytes)) == NULL)
- fatal("Out of memory\n");
- else
- return ret;
-
- return ret;
-}
-
-/*
-* ckfree()
-*
-* Tries to free memory allocated by ckalloc.
-* Return value:
-* None.
-*/
-
-void *ckfree(void *ptr)
-{
- if (ptr == NULL)
- warning("Bad call to ckfree\n");
- else {
- free(ptr);
- ptr = NULL;
- }
- return ptr;
-}
-
-
-/*
-* rtrim()
-*
-* Removes trailing blanks from a string
-*
-* Return values:
-* Pointer to the processed string
-*/
-
-char * rtrim(char *str)
-{
- register int p;
-
- p = strlen(str) - 1;
-
- while ( isspace(str[p]) )
- p--;
-
- str[p + 1] = EOS;
-
- return str;
-}
-
-
-/*
-* blank_to_()
-*
-* Replace blanks in a string with underscores
-*
-* Also replaces , ; : ( or ) with _
-*
-* Return value:
-* Pointer to the processed string
-*/
-
-char * blank_to_(char *str)
-{
- int i,p;
-
- p = strlen(str) - 1;
-
- for(i=0;i<=p;i++)
- if(
- (str[i]==' ') ||
- (str[i]==';') ||
- (str[i]==',') ||
- (str[i]=='(') ||
- (str[i]==')') ||
- (str[i]==':')
- )
- str[i] = '_';
-
- return str;
-}
-
-
-/*
-* upstr()
-*
-* Converts string str to uppercase.
-* Return values:
-* Pointer to the converted string.
-*/
-
-char * upstr(char *str)
-{
- register char *s = str;
-
- while( (*s = toupper(*s)) )
- s++;
-
- return str;
-}
-
-/*
-* lowstr()
-*
-* Converts string str to lower case.
-* Return values:
-* Pointer to the converted string.
-*/
-
-char * lowstr(char *str)
-{
- register char *s = str;
-
- while( (*s = tolower(*s)) )
- s++;
-
- return str;
-}
-
-void getstr(char *instr, int n, char *outstr)
-{
- int sl;
- fprintf(stdout,"%s: ",instr);
- fgets(outstr,n,stdin);
- /*
- * modify outstr for compatibility with prior used (insecure) gets()
- */
- sl=strlen(outstr);
- if(sl>0 && '\n'==outstr[sl-1]) {
- outstr[sl-1]=0;
- }
-}
-
-double getreal(char *instr,double minx,double maxx,double def)
-{
- int status;
- float ret;
- char line[MAXLINE];
-
- while(TRUE) {
- fprintf(stdout,"%s (%.1f-%.1f) [%.1f]: ",instr,minx,maxx,def);
- fgets(line,MAXLINE,stdin);
- status=sscanf(line,"%f",&ret);
- if(status == EOF) return def;
- if(ret>maxx) {
- fprintf(stdout,"ERROR: Max. value=%.1f\n\n",maxx);
- continue;
- }
- if(ret<minx) {
- fprintf(stdout,"ERROR: Min. value=%.1f\n\n",minx);
- continue;
- }
- break;
- }
- return (double)ret;
-}
-
-
-int getint(char *instr,int minx,int maxx, int def)
-{
- int ret,status;
- char line[MAXLINE];
-
- while(TRUE) {
- fprintf(stdout,"%s (%d..%d) [%d]: ",
- instr,(pint)minx,(pint)maxx,(pint)def);
- fgets(line,MAXLINE,stdin);
- status=sscanf(line,"%d",&ret);
- if(status == EOF) return def;
- if(ret>maxx) {
- fprintf(stdout,"ERROR: Max. value=%d\n\n",(pint)maxx);
- continue;
- }
- if(ret<minx) {
- fprintf(stdout,"ERROR: Min. value=%d\n\n",(pint)minx);
- continue;
- }
- break;
- }
- return ret;
-}
-
-void do_system(void)
-{
- char line[MAXLINE];
-
- getstr("\n\nEnter system command",MAXLINE,line);
- if(*line != EOS)
- system(line);
- fprintf(stdout,"\n\n");
-}
-
-
-Boolean linetype(char *line,char *code)
-{
- return( strncmp(line,code,strlen(code)) == 0 );
-}
-
-Boolean keyword(char *line,char *code)
-{
- int i;
- char key[MAXLINE];
-
- for(i=0;!isspace(line[i]) && line[i]!=EOS;i++)
- key[i]=line[i];
- key[i]=EOS;
- return( strcmp(key,code) == 0 );
-}
-
-Boolean blankline(char *line)
-{
- int i;
-
- for(i=0;line[i]!='\n' && line[i]!=EOS;i++) {
- if( isdigit(line[i]) ||
- isspace(line[i]) ||
- (line[i] == '*') ||
- (line[i] == ':') ||
- (line[i] == '.'))
- ;
- else
- return FALSE;
- }
- return TRUE;
-}
-
-
-void get_path(char *str,char *path)
-{
- register int i;
-
- strcpy(path,str);
- for(i=strlen(path)-1;i>-1;--i) {
- if(str[i]==DIRDELIM) {
- i = -1;
- break;
- }
- if(str[i]=='.') break;
- }
- if(i<0)
- strcat(path,".");
- else
- path[i+1]=EOS;
-}
-
-void alloc_aln(sint nseqs)
-{
- sint i,j;
-
- seqlen_array = (sint *)ckalloc( (nseqs+1) * sizeof (sint));
-
- seq_array = (char **)ckalloc( (nseqs + 1) * sizeof (char *) );
- for(i=0;i<nseqs+1;i++)
- seq_array[i]=NULL;
-
- names = (char **)ckalloc( (nseqs+1) * sizeof (char *) );
- for(i=1;i<=nseqs;i++)
- names[i] = (char *)ckalloc((MAXNAMES+1) * sizeof (char));
-
- titles = (char **)ckalloc( (nseqs+1) * sizeof (char *) );
- for(i=1;i<=nseqs;i++)
- titles[i] = (char *)ckalloc((MAXTITLES+1) * sizeof (char));
-
- output_index = (sint *)ckalloc( (nseqs+1) * sizeof (sint));
-
- tmat = (double **) ckalloc( (nseqs+1) * sizeof (double *) );
- for(i=1;i<=nseqs;i++)
- tmat[i] = (double *)ckalloc( (nseqs+1) * sizeof (double) );
- for(i=1;i<=nseqs;i++)
- for(j=1;j<=nseqs;j++)
- tmat[i][j]=0.0;
-
- seq_weight = (sint *)ckalloc( (nseqs+1) * sizeof (sint));
- for(i=1;i<=nseqs;i++)
- seq_weight[i]=100;
-}
-
-void realloc_aln(sint first_seq,sint nseqs)
-{
- sint i,j;
-
- seqlen_array = (sint *)ckrealloc(seqlen_array, (first_seq+nseqs+1) * sizeof (sint));
-
- seq_array = (char **)ckrealloc(seq_array, (first_seq+nseqs+1) * sizeof (char *) );
- for(i=first_seq;i<first_seq+nseqs+1;i++)
- seq_array[i]=NULL;
-
- names = (char **)ckrealloc(names, (first_seq+nseqs+1) * sizeof (char *) );
- for(i=first_seq;i<first_seq+nseqs;i++)
- names[i] = (char *)ckalloc((MAXNAMES+1) * sizeof (char));
-
- titles = (char **)ckrealloc(titles, (first_seq+nseqs+1) * sizeof (char *) );
- for(i=first_seq;i<first_seq+nseqs;i++)
- titles[i] = (char *)ckalloc((MAXTITLES+1) * sizeof (char));
-
- output_index = (sint *)ckrealloc(output_index, (first_seq+nseqs+1) * sizeof (sint));
-
- seq_weight = (sint *)ckrealloc(seq_weight, (first_seq+nseqs+1) * sizeof (sint));
- for(i=first_seq;i<first_seq+nseqs;i++)
- seq_weight[i]=100;
-
- tmat = (double **) ckrealloc(tmat, (first_seq+nseqs+1) * sizeof (double *) );
- for(i=1;i<first_seq;i++)
- tmat[i] = (double *)ckrealloc(tmat[i], (first_seq+nseqs+1) * sizeof (double) );
- for(i=first_seq;i<first_seq+nseqs;i++)
- tmat[i] = (double *)ckalloc( (first_seq+nseqs+1) * sizeof (double) );
- for(i=1;i<first_seq;i++)
- for(j=first_seq;j<first_seq+nseqs;j++)
- {
- tmat[i][j]=0.0;
- tmat[j][i]=0.0;
- }
-}
-
-void free_aln(sint nseqs)
-{
- sint i;
-
- if(nseqs<=0) return;
-
- seqlen_array = ckfree(seqlen_array);
-
- for(i=1;i<=nseqs;i++)
- seq_array[i] = ckfree(seq_array[i]);
- seq_array = ckfree(seq_array);
-
- for(i=1;i<=nseqs;i++)
- names[i] = ckfree(names[i]);
- names = ckfree(names);
-
- for(i=1;i<=nseqs;i++)
- titles[i] = ckfree(titles[i]);
- titles = ckfree(titles);
-
- output_index = ckfree(output_index);
-
- seq_weight = ckfree(seq_weight);
-
- for(i=1;i<=nseqs;i++)
- tmat[i] = ckfree(tmat[i]);
- tmat = ckfree(tmat);
-}
-
-void alloc_seq(sint seq_no,sint length)
-{
- seq_array[seq_no] = (char *)ckalloc((length+2) * sizeof (char));
-}
-
-void realloc_seq(sint seq_no,sint length)
-{
- seq_array[seq_no] = (char *)realloc(seq_array[seq_no], (length+2) * sizeof (char));
-
-}
-
-void free_seq(sint seq_no)
-{
- seq_array[seq_no]=ckfree(seq_array[seq_no]);
-}
-
Deleted: trunk/packages/clustalw/trunk/xcolor.c
===================================================================
--- trunk/packages/clustalw/trunk/xcolor.c 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/xcolor.c 2007-08-12 15:08:40 UTC (rev 399)
@@ -1,1191 +0,0 @@
-#include <stdio.h>
-#include <stdarg.h>
-#include <string.h>
-#include <time.h>
-#include <ctype.h>
-
-#include <vibrant.h>
-
-#include "clustalw.h"
-#include "xmenu.h"
-
-#define SIMPLE 1
-#define COMPOUND 2
-
-#define LEFTMARGIN 20
-#define SEPARATION 2
-#define CHARHEIGHT 10
-#define CHARWIDTH 6
-#define A4X 564
-#define A4Y 800
-#define A3X 832
-#define A3Y 1159
-#define USLETTERX 564
-#define USLETTERY 750
-#define SCOREY 3
-#define HEADER 7
-#define NOHEADER 0
-#define MAXRESNO 6
-
-#define MAXPARLEN 10
-#define MAXPAR 100
-
-static void print_ps_info(FILE *fd,int pagesize);
-static void print_page_header(FILE *fd,int ps_rotation,int maxx,int maxy,
-int page,int numpages,Boolean header,char *str_time,
-char *ps_file,int ps_xtrans,int ps_ytrans,float ps_scale);
-static void print_header_line(FILE *fd,panel_data name_data, panel_data seq_data,
-int ix,int fr,int lr);
-static void print_footer_line(FILE *fd,panel_data name_data, panel_data seq_data,
-int ix,int fr,int lr);
-static void print_quality_curve(FILE *fd,panel_data seq_data
-,int fr,int lr,int score_height);
-static void print_seq_line(FILE *fd,panel_data name_data, panel_data seq_data,
-int row,int seq,int fr,int lr,int res_number);
-
-
-typedef struct consensus_parameters
-{
-char consensus;
-int cutoff;
-int length;
-char cutoff_list[20];
-} consensus_para;
-
-typedef struct color_parameters
-{
-int type;
-char residue;
-int color;
-int length;
-char cons_list[20];
-} color_para;
-
-static void init_color_lut(FILE *fd);
-static int init_printer_lut(char *filename);
-static char *init_consensus(panel_data data);
-static int SaveColPara(char word[MAXPAR][MAXPARLEN],int num_words,int count);
-static int SaveConPara(char word[MAXPAR][MAXPARLEN],int num_words,int count);
-static int get_line(char *sinline,char word[MAXPAR][MAXPARLEN]);
-static int residue_color(char res,char consensus);
-static Boolean commentline(char *line);
-
-#define DEF_NCOLORS 4
-#define MAX_NCOLORS 8
-#define DEFAULT_COLOR 0
-
-typedef struct rgb_color {
- char name[20];
- float r,g,b;
-} rgb_color;
-
-rgb_color def_color_lut[MAX_NCOLORS]={
- "RED" ,0.9, 0.1, 0.1,
- "BLUE" ,0.1, 0.1, 0.7,
- "GREEN" ,0.1, 0.9, 0.1,
- "ORANGE" ,0.9, 0.6, 0.3,
- "CYAN" ,0.1, 0.9, 0.9,
- "PINK" ,0.9, 0.5, 0.5,
- "MAGENTA" ,0.9, 0.1, 0.9,
- "YELLOW" ,0.9, 0.9, 0.0,
-};
-
-char def_aacolor[MAX_NCOLORS][26]={"krh",
- "fwy",
- "ilmv",
- "gpst"};
-
-char def_dnacolor[MAX_NCOLORS][26]={"a",
- "c",
- "tu",
- "g"};
-
-extern char revision_level[];
-
-extern int max_names;
-
-extern int ncolors;
-extern int ncolor_pars;
-extern color color_lut[];
-extern int inverted;
-extern Boolean residue_exceptions;
-extern Boolean segment_exceptions;
-extern Boolean dnaflag;
-
-int NumColParas;
-int NumConParas;
-
-color_para Col_Par[100];
-consensus_para Con_Par[100];
-
-
-void make_colormask(panel_data data)
-{
- int i,j;
-
- for(i=0;i<data.nseqs;i++)
- for(j=0;j<data.ncols;j++)
- data.colormask[i][j] = DEFAULT_COLOR;
-
- if (ncolors > 1)
- {
- data.consensus=init_consensus(data);
-
- for(i=0;i<data.nseqs;i++)
- for(j=0;j<data.ncols;j++)
- data.colormask[i][j] = residue_color(data.lines[i][j],data.consensus[j]);
-
- }
-}
-
-static void init_color_lut(FILE *fd)
-{
- char sinline[1025];
- char *args[10];
- int i,numargs;
- Boolean found=FALSE;
-
- if (inverted==FALSE)
- {
- strcpy(color_lut[0].name,"BLACK");
- color_lut[0].r=0.4;
- color_lut[0].g=0.4;
- color_lut[0].b=0.4;
- SelectColor(color_lut[0].r*255, color_lut[0].g*255, color_lut[0].b*255);
- color_lut[0].val=GetColor();
- }
- else
- {
- strcpy(color_lut[0].name,"WHITE");
- color_lut[0].r=1.0;
- color_lut[0].g=1.0;
- color_lut[0].b=1.0;
- SelectColor(color_lut[0].r*255, color_lut[0].g*255, color_lut[0].b*255);
- color_lut[0].val=GetColor();
- }
-
- ncolors=1;
- if (fd != NULL)
- {
- for (;fgets(sinline,1024,fd)!=NULL;)
- {
- sinline[strlen(sinline)-1] = '\0';
- if (strcmp(sinline,"@rgbindex")==0)
- {
- found = TRUE;
- break;
- }
- }
- }
- if (found == TRUE)
- {
- for (;fgets(sinline,1024,fd)!=NULL;)
- {
- if (commentline(sinline)) continue;
- if (sinline[0]=='@') break;
- numargs = getargs(sinline, args, 4);
- if (numargs != 4)
- {
- error("Problem in color rgb index - line %d\n",ncolors+1);
- break;
- }
- else
- {
- strcpy(color_lut[ncolors].name, args[0]);
- color_lut[ncolors].r=atof(args[1]);
- color_lut[ncolors].g=atof(args[2]);
- color_lut[ncolors].b=atof(args[3]);
- SelectColor(color_lut[ncolors].r*255, color_lut[ncolors].g*255, color_lut[ncolors].b*255);
- color_lut[ncolors].val=GetColor();
- ncolors++;
- if (ncolors>=MAXCOLORS)
- {
- warning("Only using first %d colors in rgb index.",MAXCOLORS);
- break;
- }
- }
- }
-
- }
-
-/* if we can't find a table, use the hard-coded colors */
- if (ncolors==1)
- {
- ncolors=MAX_NCOLORS+1;
- for(i=1;i<ncolors;i++)
- {
- strcpy(color_lut[i].name,def_color_lut[i-1].name);
- color_lut[i].r=def_color_lut[i-1].r;
- color_lut[i].g=def_color_lut[i-1].g;
- color_lut[i].b=def_color_lut[i-1].b;
- SelectColor(color_lut[i].r*255, color_lut[i].g*255, color_lut[i].b*255);
- color_lut[i].val=GetColor();
- }
- }
-
-}
-
-void init_color_parameters(char *par_file)
-{
-
- int i,j,err;
- char sinline[1025];
- int maxparas = 50;
- char inword[MAXPAR][MAXPARLEN];
- int num_words;
- int in_consensus=FALSE,in_color=FALSE;
- int consensus_found=FALSE,color_found=FALSE;
- FILE *par_fd=NULL;
-
- if(par_file!=NULL)
- par_fd=fopen(par_file,"r");
- if(par_fd==NULL)
- {
- info("No color file found - using defaults");
- ncolor_pars=0;
- }
-
- init_color_lut(par_fd);
- if (par_fd != NULL) rewind(par_fd);
- if (ncolors==0) return;
-
- NumColParas=0;
- NumConParas=0;
- if (par_fd != NULL)
- {
- for(;fgets(sinline,1024,par_fd) != NULL;)
- {
- sinline[strlen(sinline)-1] = '\0';
- if (commentline(sinline)) continue;
- switch(sinline[0])
- {
- case '\0':
- break;
- case '@':
- if (strcmp((char*)(sinline+1),"consensus")==0)
- {
- in_consensus = TRUE;
- in_color = FALSE;
- consensus_found = TRUE;
- }
- else if (strcmp((char*)(sinline+1),"color")==0)
- {
- in_consensus = FALSE;
- in_color = TRUE;
- color_found = TRUE;
- }
- break;
- default:
- num_words = get_line(sinline,inword);
- if (in_consensus == TRUE)
- {
- err = SaveConPara(inword,num_words,NumConParas);
- if (err == 0) NumConParas++;
- }
- else if (in_color == TRUE)
- {
- err = SaveColPara(inword,num_words,NumColParas);
- if (err == 0) NumColParas++;
- }
-
- if((NumColParas>maxparas) || (NumConParas>maxparas))
- error("Too many parameters in color file");
-
- }
- }
- if (color_found == FALSE)
- {
- error("@color not found in parameter file - using defaults\n");
- ncolor_pars=0;
- }
- fclose(par_fd);
- }
- ncolor_pars=NumColParas;
-
-/* if no color parameters found, use the default aa groupings */
- if(ncolor_pars==0)
- {
- if (dnaflag)
- {
- for(i=0;i<DEF_NCOLORS;i++)
- {
- for(j=0;j<strlen(def_dnacolor[i]);j++)
- {
- Col_Par[ncolor_pars].type=SIMPLE;
- Col_Par[ncolor_pars].residue=def_dnacolor[i][j];
- Col_Par[ncolor_pars].color=i+1;
- ncolor_pars++;
- }
- }
- }
- else
- {
- for(i=0;i<DEF_NCOLORS;i++)
- {
- for(j=0;j<strlen(def_aacolor[i]);j++)
- {
- Col_Par[ncolor_pars].type=SIMPLE;
- Col_Par[ncolor_pars].residue=def_aacolor[i][j];
- Col_Par[ncolor_pars].color=i+1;
- ncolor_pars++;
- }
- }
- }
- }
- NumColParas=ncolor_pars;
-}
-
-char *find_file(char *def_file)
-{
- char filename[FILENAMELEN];
- char *retname;
- FILE *fd;
- Boolean found=FALSE;
-#ifdef UNIX
- char *path, *path1, *deb, *fin;
- sint lf, ltot;
- char *home;
-#endif
-
-
- strcpy(filename,def_file);
- fd = fopen(filename,"r");
- if (fd != NULL)
- found=TRUE;
-#ifdef UNIX
- if (found == FALSE)
- {
- home = getenv("HOME");
- if (home != NULL)
- {
- sprintf(filename,"%s/%s",home,def_file);
- fd = fopen(filename,"r");
- if (fd != NULL)
- found=TRUE;
- }
- if (found == FALSE)
- {
- path=getenv("PATH");/* get the list of path directories,
- separated by : */
- /* added for File System Standards - Francois */
- path1=(char *)ckalloc((strlen(path)+64)*sizeof(char));
- strcpy(path1,path);
- strcat(path1,"/usr/share/clustalx:/usr/local/share/clustalx");
-
- lf=(sint)strlen(def_file);
- deb=path1;
- do
- {
- fin=strchr(deb,':');
- if(fin!=NULL)
- {
- strncpy(filename,deb,fin-deb);
- ltot=fin-deb;
- }
- else
- {
- strcpy(filename,deb);
- ltot=(sint)strlen(filename);
- }
- /* now one directory is in filename */
- if( ltot + lf + 1 <= FILENAMELEN)
- {
- filename[ltot]='/';
- strcpy(filename+ltot+1,def_file); /* now dir is appended with filename */
- if( (fd = fopen(filename,"r") ) != NULL)
- {
- found=TRUE;
- break;
- }
- }
- else found = FALSE;
- deb=fin+1;
- }
- while (fin != NULL);
- }
- }
-#endif
- if (found == TRUE)
- {
- fclose(fd);
- retname=(char *)ckalloc((strlen(filename)+1)*sizeof(char));
- strcpy(retname,filename);
- }
- else
- retname=NULL;
- return(retname);
-}
-
-static char *init_consensus(panel_data data)
-{
- char *cons_data;
- int num_res,seq,res,par,cons_total,i;
- char residue;
-
- cons_data=(char *)ckalloc((data.ncols+1)*sizeof(char));
-
- for (res=0;res<data.ncols;res++)
- {
- cons_data[res] = '.';
- for (par=0;par<NumConParas;par++)
- {
- cons_total = num_res = 0;
- for (seq=0;seq<data.nseqs;seq++)
- {
- residue=tolower(data.lines[seq][res]);
- if (isalpha(residue))
- num_res++;
- for (i=0;i<Con_Par[par].length;i++)
- if (residue==tolower(Con_Par[par].cutoff_list[i]))
- cons_total++;
- }
- if (num_res != 0)
- if (((cons_total*100)/num_res) >= Con_Par[par].cutoff)
- cons_data[res] = Con_Par[par].consensus;
- }
- }
-
- return(cons_data);
-}
-
-static int SaveColPara(char word[MAXPAR][MAXPARLEN],int num_words,int count)
-{
-
- int i;
-
- if (num_words < 3)
- {
- error("Wrong format in color list");
- return(1);
- }
-
- if (word[1][0] != '=')
- {
- error("Wrong format in color list");
- return(2);
- }
-
- if (num_words == 3)
- {
- Col_Par[count].type = SIMPLE;
- Col_Par[count].residue = word[0][0];
- Col_Par[count].color = -1;
- for (i=0;i<ncolors;i++)
- if (strcmp(word[2],color_lut[i].name)==0) Col_Par[count].color = i;
- if (Col_Par[count].color == -1)
- {
- error("%s not found in rgb index - using %s",word[2],color_lut[0].name);
- Col_Par[count].color = 0;
- }
- }
- else
- {
- if (strcmp(word[3],"if")==0)
- {
- Col_Par[count].type = COMPOUND;
- Col_Par[count].residue = word[0][0];
- Col_Par[count].color = -1;
- for (i=0;i<ncolors;i++)
- if (strcmp(word[2],color_lut[i].name)==0) Col_Par[count].color = i;
- if (Col_Par[count].color == -1)
- {
- error("%s not found in rgb index - using %s",word[2],color_lut[0].name);
- Col_Par[count].color = 0;
- }
- Col_Par[count].length = num_words - 4;
- for (i=4;i<num_words;i++)
- Col_Par[count].cons_list[i-4] = word[i][0];
- }
- else
- {
- error("Wrong format in color list");
- return(3);
- }
- }
-
- return(0);
-
-}
-
-
-static int SaveConPara(char word[MAXPAR][MAXPARLEN],int num_words,int count)
-{
-
- int i;
-
- if (num_words < 3)
- {
- error("Wrong format in consensus list");
- return(1);
- }
-
- if (word[1][0] != '=')
- {
- error("Wrong format in consensus list");
- return(2);
- }
-
- Con_Par[count].consensus = word[0][0];
- for (i=0;i<MAXPARLEN-1;i++)
- {
- if(word[2][i]=='%') word[2][i] = '\0';
- }
- Con_Par[count].cutoff = atoi(word[2]);
- Con_Par[count].length = num_words - 3;
- for (i=3;i<num_words;i++)
- {
- Con_Par[count].cutoff_list[i-3] = word[i][0];
- }
-
- return(0);
-
-}
-
-static int get_line(char *sinline,char word[MAXPAR][MAXPARLEN])
-{
- int i=0, j, word_count=0, char_count=0;
- int in_word=FALSE;
-
- for(i=0;i<MAXPAR-1;i++)
- for(j=0;j<MAXPARLEN-1;j++)
- word[i][j]='\0';
-
- for (i=0;i<=strlen(sinline);i++)
- {
- switch (sinline[i])
- {
- case ' ':
- case '\t':
- case '\0':
- case ':':
- if (in_word)
- {
- word[word_count][char_count] = '\0';
- word_count++;
- char_count = 0;
- in_word = FALSE;
- }
- break;
- default:
- in_word = TRUE;
- word[word_count][char_count] = sinline[i];
- char_count++;
- break;
- }
-
- }
- return(word_count);
-}
-
-static int residue_color(char res,char consensus)
-{
- int i,j;
-
- for (i=0;i<NumColParas;i++)
- {
- if (tolower(res) == tolower(Col_Par[i].residue))
- {
- switch (Col_Par[i].type)
- {
- case SIMPLE:
- return(Col_Par[i].color);
- case COMPOUND:
- for (j=0;j<Col_Par[i].length;j++)
- {
- if (consensus == Col_Par[i].cons_list[j]
-)
- return(Col_Par[i].color);
- }
- break;
- default:
- return(DEFAULT_COLOR);
- }
- }
- }
- return(DEFAULT_COLOR);
-}
-
-static Boolean commentline(char *line)
-{
- int i;
-
- if (line[0] == '#') return TRUE;
- for(i=0;line[i]!='\n' && line[i]!=EOS;i++) {
- if( !isspace(line[i]) )
- return FALSE;
- }
- return TRUE;
-}
-
-int block_height,block_left,block_top;
-int header_top,seq_top,footer_top,curve_top;
-
-void write_ps_file(spanel p,char *ps_file,char *par_file,int pagesize,
-int orientation,Boolean header, Boolean ruler, Boolean resno, Boolean resize,
-int first_printres,int last_printres,
-int blength,Boolean show_curve)
-{
- int i,j,bn,seq,numseqs;
- int err;
- int blocklen,numpages;
- int fr,lr;
- int page,row;
- int ps_rotation=0,ps_xtrans=0,ps_ytrans=0;
- float ps_scale,hscale,wscale;
- int maxseq;
- int maxx=0,maxy=0;
- int score_height=0;
- int main_header=0;
- int numelines,numecols;
- int nhead,nfoot;
- int ppix_width; /* width of the page in pixels */
- int pchar_height; /* height of the page in chars for sequences */
- int ppix_height; /* height of the page in pixels for sequences */
- int blocksperpage,numblocks;
- int *res_number;
- panel_data name_data,seq_data;
- FILE *fd;
-
- time_t *tptr=NULL,ttime;
- char *str_time;
-
-/* open the output file */
- if ((fd=fopen(ps_file,"w"))==NULL)
- {
- error("Cannot open file %s",ps_file);
- return;
- }
-
-/* check for printer-specific rgb values */
- err=init_printer_lut(par_file);
- if(err>0) warning("No PS Colors file: using default colors\n");
-
-/* get the page size parameters */
-
- if (pagesize==A4)
- {
- if (orientation==PORTRAIT)
- {
- maxx=A4X;
- maxy=A4Y;
- ps_rotation=0;
- }
- else
- {
- maxx=A4Y;
- maxy=A4X;
- ps_rotation=-90;
- }
- }
- else if (pagesize==A3)
- {
- if (orientation==PORTRAIT)
- {
- maxx=A3X;
- maxy=A3Y;
- ps_rotation=0;
- }
- else
- {
- maxx=A3Y;
- maxy=A3X;
- ps_rotation=-90;
- }
- }
- else if (pagesize==USLETTER)
- {
- if (orientation==PORTRAIT)
- {
- maxx=USLETTERX;
- maxy=USLETTERY;
- ps_rotation=0;
- }
- else
- {
- maxx=USLETTERY;
- maxy=USLETTERX;
- ps_rotation=-90;
- }
- }
- if(show_curve) score_height=SCOREY;
- if(header) main_header=HEADER;
- else main_header=NOHEADER;
- ppix_width=maxx-LEFTMARGIN*2;
- ppix_height=maxy-main_header*CHARHEIGHT;
-
-/* get the name data */
- GetPanelExtra(p.names,&name_data);
-
-/* get the sequence data */
- GetPanelExtra(p.seqs,&seq_data);
- numseqs=seq_data.nseqs;
- nhead=seq_data.nhead;
- if(ruler)
- nfoot=seq_data.nfoot;
- else
- nfoot=seq_data.nfoot-1;
- numelines=nhead+nfoot+score_height+SEPARATION;
-
-/* check the block length, residue range parameters */
- if(first_printres<=0)
- first_printres=1;
- if((last_printres<=0) || (last_printres>seq_data.ncols))
- last_printres=seq_data.ncols;
- if(first_printres>last_printres)
- {
- error("Bad residue range - cannot write postscript");
- return;
- }
- if (blength==0 || last_printres-first_printres+1<blength)
- blocklen=last_printres-first_printres+1;
- else
- blocklen=blength;
-
- res_number=(int *)ckalloc((name_data.nseqs+1)*sizeof(int));
- for(i=0;i<name_data.nseqs;i++)
- {
- res_number[i]=0;
- for(j=0;j<first_printres-1;j++)
- if(isalpha(seq_data.lines[i][j])) res_number[i]++;
- }
- if(resno)
- numecols=MAXRESNO+1+max_names;
- else
- numecols=1+max_names;
-
-/* print out the PS revision level etc. */
- ttime = time(tptr);
- str_time = ctime(&ttime);
- print_ps_info(fd,pagesize);
-
-/* calculate scaling factors, block sizes to fit the page etc. */
-
- if (resize==FALSE || blocklen==last_printres-first_printres+1)
- {
-/* split the alignment into blocks of sequences. If the blocks are too long
-for the page - tough! */
- if(resize==FALSE)
- ps_scale=1.0;
- else
- ps_scale=(float)ppix_width/(float)((blocklen+numecols)*CHARWIDTH);
- ps_xtrans= LEFTMARGIN * (1-ps_scale);
- ps_ytrans= ppix_height * (1-ps_scale);
- if (pagesize!=A3 && orientation==LANDSCAPE)
- ps_xtrans-=LEFTMARGIN;
-
- pchar_height=((maxy/CHARHEIGHT)-main_header)/ps_scale;
- maxseq=pchar_height-numelines;
- block_height = (maxseq+numelines) * CHARHEIGHT;
- numpages = (numseqs/maxseq) + 1;
- seq=0;
- for (page=0;page<numpages;page++)
- {
-/* print the top of page header */
- print_page_header(fd,ps_rotation,maxx,maxy,
- page,numpages,header,str_time,
- ps_file,ps_xtrans,ps_ytrans,ps_scale);
-
- block_top = maxy - main_header*CHARHEIGHT;
- block_left = LEFTMARGIN + (1+max_names)*CHARWIDTH;
- header_top = block_top;
-
- fr=first_printres-1;
- lr=last_printres-1;
-/* show the header lines */
- for (i=0;i<nhead;i++)
- print_header_line(fd,name_data,seq_data,i,fr,lr);
-
- seq_top = block_top-nhead*CHARHEIGHT;
-/* show the sequence lines */
- for (row=0;row<maxseq ;row++)
- {
- if(resno)
- {
- for(i=fr;i<=lr;i++)
- if(isalpha(seq_data.lines[seq][i]))
- res_number[seq]++;
- }
- print_seq_line(fd,name_data,seq_data,row,seq,fr,lr,res_number[seq]);
- seq++;
- if(seq>=numseqs)
- {
- row++;
- break;
- }
- }
-
- footer_top = seq_top-row*CHARHEIGHT;
-/* show the footer lines */
- for (i=0;i<nfoot;i++)
- print_footer_line(fd,name_data,seq_data,i,fr,lr);
-
- curve_top = footer_top-nfoot*CHARHEIGHT;
-/* show the quality curve */
- if(show_curve)
- print_quality_curve(fd,seq_data,fr,lr,score_height);
-
- fprintf(fd,"\nshowpage\n");
- fprintf(fd,"restore\n");
- }
- }
- else
- {
-/* split the alignment into blocks of residues, and scale the blocks to fit the page */
- maxseq=ppix_height/CHARHEIGHT-numelines-main_header;
- hscale=(float)maxseq/(float)numseqs;
- wscale=(float)ppix_width/(float)((blocklen+numecols)*CHARWIDTH);
- ps_scale=MIN(hscale,wscale);
- ps_xtrans= LEFTMARGIN * (1-ps_scale);
- ps_ytrans= ppix_height * (1-ps_scale);
- if (pagesize!=A3 && orientation==LANDSCAPE)
- ps_xtrans-=LEFTMARGIN;
-
- pchar_height=((maxy/CHARHEIGHT)-main_header)/ps_scale;
- maxseq=pchar_height-numelines;
- block_height = (numseqs+numelines) * CHARHEIGHT;
- blocksperpage = pchar_height/(numseqs+numelines);
- if (blocksperpage==0)
- {
- error("illegal combination of print parameters");
- return;
- }
- numblocks = (last_printres-first_printres) / blocklen + 1;
- if (numblocks % blocksperpage == 0)
- numpages = numblocks / blocksperpage;
- else
- numpages = numblocks / blocksperpage + 1;
-
- for (bn=0;bn<numblocks;bn++)
- {
- page = bn / blocksperpage;
-/* print the top of page header */
- if (bn % blocksperpage == 0)
- print_page_header(fd,ps_rotation,maxx,maxy,
- page,numpages,header,str_time,
- ps_file,ps_xtrans,ps_ytrans,ps_scale);
-
- block_top = maxy - main_header*CHARHEIGHT-block_height*(bn%blocksperpage);
- block_left = LEFTMARGIN + (1+max_names)*CHARWIDTH;
- header_top = block_top;
- seq_top = block_top-nhead*CHARHEIGHT;
- footer_top = block_top-(nhead+numseqs)*CHARHEIGHT;
- curve_top = block_top-(nhead+numseqs+nfoot)*CHARHEIGHT;
-
- fr=first_printres-1 + blocklen*bn;
- lr=fr+blocklen-1;
- if(lr>=last_printres) lr=last_printres-1;
-/* show the header lines */
- for (i=0;i<nhead;i++)
- print_header_line(fd,name_data,seq_data,i,fr,lr);
-
-/* show the sequence lines */
- for (i=0;i<numseqs;i++)
- {
- row = i % maxseq;
- if(resno)
- {
- for(j=fr;j<=lr;j++)
- if(isalpha(seq_data.lines[i][j]))
- res_number[i]++;
- }
- print_seq_line(fd,name_data,seq_data,row,i,fr,lr,res_number[i]);
- }
-/* show the footer lines */
- for (i=0;i<nfoot;i++)
- print_footer_line(fd,name_data,seq_data,i,fr,lr);
-
-/* show the quality curve */
- if(show_curve)
- print_quality_curve(fd,seq_data,fr,lr,score_height);
-
- if ((bn == (numblocks-1)) || ((bn % blocksperpage == blocksperpage-1)))
- {
- fprintf(fd,"\nshowpage\n");
- fprintf(fd,"restore\n");
- }
- }
- }
- fclose(fd);
- return;
-}
-
-static int init_printer_lut(char *filename)
-{
- FILE *fd;
- char sinline[1025];
- char *args[10];
- char name[20];
- int i,numargs;
- Boolean found=FALSE;
- char *par_file=NULL;
-
-/* reset the printer rgb colors to the color file rgb values */
- for(i=0;i<ncolors;i++)
- {
- color_lut[i].pr=color_lut[i].r;
- color_lut[i].pg=color_lut[i].g;
- color_lut[i].pb=color_lut[i].b;
- }
-
-/* search for the printer color file */
- if(filename[0]==EOS) return 1;
- par_file=find_file(filename);
- if(par_file==NULL)
- {
- error("Cannot find printer file %s",filename);
- return 1;
- }
- if ((fd=fopen(par_file,"r"))==NULL)
- {
- error("Cannot open printer file %s",par_file);
- return 1;
- }
-
- for (;fgets(sinline,1024,fd)!=NULL;)
- {
- if (commentline(sinline)) continue;
- numargs = getargs(sinline, args, 4);
- if (numargs != 4)
- {
- error("Problem in parameter file - line %d\n",ncolors+1);
- break;
- }
- else
- {
-/* we've found a color - find the index the color lut */
- strcpy(name, args[0]);
- for(i=0;i<ncolors;i++)
- {
- if(strcmp(name,color_lut[i].name)==0)
- {
- color_lut[i].pr=atof(args[1]);
- color_lut[i].pg=atof(args[2]);
- color_lut[i].pb=atof(args[3]);
- }
- }
- }
- }
- ckfree(par_file);
- return 0;
-}
-
-static void print_ps_info(FILE *fd,int pagesize)
-{
- fprintf(fd,"%%!PS-Adobe-1.0\n");
- fprintf(fd,"%%%%Creator: Julie Thompson\n");
- fprintf(fd,"%%%%Title:ClustalX Alignment\n");
- fprintf(fd,"%%%%EndComments\n");
- fprintf(fd,"/box { newpath\n");
- fprintf(fd,"\t-0 -3 moveto\n");
- fprintf(fd,"\t-0 %d lineto\n",CHARHEIGHT-3);
- fprintf(fd,"\t%d %d lineto\n",CHARWIDTH,CHARHEIGHT-3);
- fprintf(fd,"\t%d -3 lineto\n",CHARWIDTH);
- fprintf(fd,"\tclosepath\n");
- fprintf(fd," } def\n\n");
-
- fprintf(fd,"/color_char { gsave\n");
- fprintf(fd,"\tsetrgbcolor\n");
- fprintf(fd,"\tmoveto\n");
- fprintf(fd,"\tshow\n");
- fprintf(fd,"\tgrestore\n");
- fprintf(fd," } def\n\n");
-
- fprintf(fd,"/cbox { gsave\n");
- fprintf(fd,"\ttranslate\n");
- fprintf(fd,"\tnewpath\n");
- fprintf(fd,"\t0 0 moveto\n");
- fprintf(fd,"\tlineto\n");
- fprintf(fd,"\tlineto\n");
- fprintf(fd,"\tlineto\n");
- fprintf(fd,"\tclosepath\n");
- fprintf(fd,"\tfill\n");
- fprintf(fd,"\tgrestore\n");
- fprintf(fd," } def\n\n");
-
- fprintf(fd,"/color_inv { gsave\n");
- fprintf(fd,"\tsetrgbcolor\n");
- fprintf(fd,"\ttranslate\n");
- fprintf(fd,"\tbox fill\n");
- fprintf(fd,"\tgrestore\n");
- fprintf(fd,"\tmoveto\n");
- fprintf(fd,"\tshow\n");
- fprintf(fd," } def\n\n");
-
- fprintf(fd,"/white_inv { gsave\n");
- fprintf(fd,"\tsetrgbcolor\n");
- fprintf(fd,"\ttranslate\n");
- fprintf(fd,"\tbox fill\n");
- fprintf(fd,"\tgrestore\n");
- fprintf(fd,"\tgsave\n");
- fprintf(fd,"\tsetrgbcolor\n");
- fprintf(fd,"\tmoveto\n");
- fprintf(fd,"\tshow\n");
- fprintf(fd,"\tgrestore\n");
- fprintf(fd," } def\n\n");
-
- if (pagesize==A3)
- fprintf(fd,"statusdict begin a3 end\n\n");
-/* For canon color printer, use a3tray instead of a3!! */
-}
-
-static void print_page_header(FILE *fd,int ps_rotation,int maxx,int maxy,
-int page,int numpages,Boolean header,char *str_time,
-char *ps_file,int ps_xtrans,int ps_ytrans,float ps_scale)
-{
- int ps_x,ps_y;
- char tstr[50];
-
- fprintf(fd,"%%%%Page: P%d\n",page);
- fprintf(fd,"save\n\n");
-
- if (ps_rotation==-90)
- {
- fprintf(fd,"0 %d translate\n",maxx);
- fprintf(fd,"%d rotate\n",ps_rotation);
- }
-
- if (header)
- {
- sprintf(tstr,"CLUSTAL %s MULTIPLE SEQUENCE ALIGNMENT",revision_level);
- ps_x = (maxx-strlen(tstr)*10)/2;
- ps_y = maxy - 2*CHARHEIGHT;
- fprintf(fd,"%d %d moveto\n",ps_x,ps_y);
- fprintf(fd,"/Times-Bold findfont 14 scalefont setfont\n");
- fprintf(fd,"(%s) show\n\n",tstr);
-
- ps_x = 20;
- ps_y = maxy - 4*CHARHEIGHT;
- fprintf(fd,"%d %d moveto\n",ps_x,ps_y);
- fprintf(fd,"(File: %s) show\n\n",ps_file);
-
- sprintf(tstr,"Date: %s",str_time);
- ps_x = maxx-strlen(tstr)*8-20;
- ps_y = maxy - 4*CHARHEIGHT;
- fprintf(fd,"%d %d moveto\n",ps_x,ps_y);
- fprintf(fd,"(%s) show\n\n",tstr);
-
- sprintf(tstr,"Page %d of %d",page+1,numpages);
- ps_x = 20;
- ps_y = maxy - 5*CHARHEIGHT-4;
- fprintf(fd,"%d %d moveto\n",ps_x,ps_y);
- fprintf(fd,"(%s) show\n\n",tstr);
- }
- fprintf(fd,"%d %d translate\n",ps_xtrans,ps_ytrans);
- fprintf(fd,"%#3.2f %#3.2f scale\n",ps_scale,ps_scale);
- fprintf(fd,"/Courier-Bold findfont 10 scalefont setfont\n");
-}
-
-static void print_header_line(FILE *fd,panel_data name_data, panel_data seq_data,
-int ix,int fr,int lr)
-{
- int i;
- int ps_x,ps_y;
-
- ps_x = LEFTMARGIN;
- ps_y = header_top - (ix * CHARHEIGHT);
- fprintf(fd,"%d %d moveto\n",ps_x,ps_y);
- fprintf(fd,"(%*s ) show\n",max_names,name_data.header[ix]);
- for(i=fr;i<=lr;i++)
- {
- ps_x = block_left + (i-fr) * CHARWIDTH;
- fprintf(fd,"(");
- fprintf(fd,"%c",seq_data.header[ix][i]);
- fprintf(fd,") ");
- fprintf(fd,"%d %d %d %d 1.0 1.0 1.0 color_inv\n",ps_x,ps_y,ps_x,ps_y);
- }
- fprintf(fd,"\n");
-}
-
-static void print_footer_line(FILE *fd,panel_data name_data, panel_data seq_data,
-int ix,int fr,int lr)
-{
- int i;
- int ps_x,ps_y;
-
- ps_x = LEFTMARGIN;
- ps_y = footer_top - (ix * CHARHEIGHT);
- fprintf(fd,"%d %d moveto\n",ps_x,ps_y);
- fprintf(fd,"(%*s ) show\n",max_names,name_data.footer[ix]);
- for(i=fr;i<=lr;i++)
- {
- ps_x = block_left + (i-fr) * CHARWIDTH;
- fprintf(fd,"(");
- fprintf(fd,"%c",seq_data.footer[ix][i]);
- fprintf(fd,") ");
- fprintf(fd,"%d %d %d %d 1.0 1.0 1.0 color_inv\n",ps_x,ps_y,ps_x,ps_y);
- }
- fprintf(fd,"\n");
-}
-
-static void print_quality_curve(FILE *fd,panel_data seq_data,
-int fr,int lr,int score_height)
-{
- int i,w,h;
- int ps_x,ps_y,curve_bottom;
-
- w=CHARWIDTH;
- ps_x = block_left+CHARWIDTH;
- curve_bottom=curve_top-score_height*CHARHEIGHT;
- fprintf(fd,"0.3 0.3 0.3 setrgbcolor\n");
- for(i=fr+1;i<=lr;i++)
- {
- fprintf(fd,"%d %d moveto\n",ps_x,curve_bottom);
- h=score_height*CHARHEIGHT*((float)seq_data.colscore[i]/100.0);
- if(h<1) h=1;
- fprintf(fd,"%d 0 %d %d 0 %d %d %d cbox\n",w,w,h,h,ps_x,curve_bottom);
- ps_x+=CHARWIDTH;
- }
- fprintf(fd,"0.0 0.0 0.0 setrgbcolor\n");
-}
-
-static void print_seq_line(FILE *fd,panel_data name_data, panel_data seq_data,
-int row,int seq,int fr,int lr,int res_number)
-{
- int i,color;
- int ps_x,ps_y;
- float red, green, blue;
-
- ps_x = LEFTMARGIN;
- ps_y = seq_top - (row * CHARHEIGHT);
- fprintf(fd,"%d %d moveto\n",ps_x,ps_y);
- fprintf(fd,"(%*s ) show\n",max_names,name_data.lines[seq]);
- for(i=fr;i<=lr;i++)
- {
- color = seq_data.colormask[seq][i];
- red = color_lut[color].pr;
- green = color_lut[color].pg;
- blue = color_lut[color].pb;
- ps_x = block_left + (i-fr) * CHARWIDTH;
- fprintf(fd,"(");
- fprintf(fd,"%c",seq_data.lines[seq][i]);
- fprintf(fd,") ");
- if(segment_exceptions && seq_data.segment_exception[seq][i] > 0)
- {
- fprintf(fd,"%d %d %1.1f %1.1f %1.1f %d %d %1.1f %1.1f %1.1f white_inv\n",
- ps_x,ps_y,1.0,1.0,1.0,ps_x,ps_y,0.1,0.1,0.1);
- }
- else if(residue_exceptions && seq_data.residue_exception[seq][i] == TRUE)
- {
- fprintf(fd,"%d %d %1.1f %1.1f %1.1f %d %d %1.1f %1.1f %1.1f white_inv\n",
- ps_x,ps_y,1.0,1.0,1.0,ps_x,ps_y,0.4,0.4,0.4);
- }
- else
- {
- if(inverted)
- fprintf(fd,"%d %d %d %d %1.1f %1.1f %1.1f color_inv\n",
- ps_x,ps_y,ps_x,ps_y,red,green,blue);
- else
- fprintf(fd,"%d %d %1.1f %1.1f %1.1f color_char\n",
- ps_x,ps_y,red,green,blue);
- }
- }
-
- if(res_number>0)
- {
- ps_x = block_left + (lr-fr+1) * CHARWIDTH;
- ps_y = seq_top - (row * CHARHEIGHT);
- fprintf(fd,"%d %d moveto\n",ps_x,ps_y);
- fprintf(fd,"(%*d) show\n",MAXRESNO,res_number);
- }
- fprintf(fd,"\n");
-}
Deleted: trunk/packages/clustalw/trunk/xdisplay.c
===================================================================
--- trunk/packages/clustalw/trunk/xdisplay.c 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/xdisplay.c 2007-08-12 15:08:40 UTC (rev 399)
@@ -1,2191 +0,0 @@
-#include <stdio.h>
-#include <stdarg.h>
-#include <string.h>
-
-#include <vibrant.h>
-#include <document.h>
-
-#include "clustalw.h"
-#include "xmenu.h"
-
-static void VscrollMulti(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval);
-static void HscrollMultiN(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval);
-static void HscrollMultiS(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval);
-static void VscrollPrf1(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval);
-static void HscrollPrf1N(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval);
-static void HscrollPrf1S(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval);
-static void VscrollPrf2(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval);
-static void HscrollPrf2N(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval);
-static void HscrollPrf2S(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval);
-
-static void NameClick(PaneL panel, PoinT pt);
-static void NameDrag(PaneL panel, PoinT pt);
-static void NameRelease(PaneL panel, PoinT pt);
-static void SeqClick(PaneL panel, PoinT pt);
-static void SeqDrag(PaneL panel, PoinT pt);
-static void SeqRelease(PaneL panel, PoinT pt);
-
-static void fit_seq_display(RecT wr,Boolean mv_message);
-static void fit_prf_displays(RecT wr,int numseqs1,int numseqs2,Boolean mv_message);
-
-static void vscrollnames(BaR bar, int newval, int oldval);
-static void hscrollnames(BaR bar, int newval, int oldval);
-static void vscrollseqs(BaR bar, int newval, int oldval);
-static void hscrollseqs(BaR bar, int newval, int oldval);
-
-static void correct_scrollbar(BaR b,int visible,int total,int value,Boolean reset);
-
-static PaneL make_panel(int type,GrouP g,int width,int height,int firstline,int tlines);
-static panel_data free_panel_data(panel_data data);
-static panel_data alloc_name_data(panel_data data);
-static panel_data alloc_seq_data(panel_data data);
-
-extern sint max_names;
-
-extern int mheader; /* maximum header lines */
-extern int mfooter; /* maximum footer lines */
-extern int max_plines; /* profile align display length */
-extern int min_plines1; /* profile align display length */
-extern int min_plines2; /* profile align display length */
-extern int loffset,boffset,toffset;
-extern int roffset;
-extern int poffset;
-
-extern Boolean aln_mode;
-extern Boolean fixed_prf_scroll;
-extern Boolean window_displayed;
-
-extern PrompT message; /* used in temporary message window */
-
-extern spanel seq_panel; /* data for multiple alignment area */
-extern spanel prf_panel[]; /* data for profile alignment areas */
-extern spanel active_panel; /* 'in-use' panel -scrolling,clicking etc. */
-extern FonT datafont;
-extern WindoW mainw;
-extern GrouP seq_display,prf1_display,prf2_display;
-
-extern int ncolors;
-extern int inverted;
-
-extern Boolean dnaflag;
-extern sint nseqs;
-extern sint profile1_nseqs;
-extern sint output_order;
-extern sint *output_index;
-extern sint *seqlen_array;
-extern char **seq_array;
-extern char **names, **titles;
-extern char *amino_acid_codes;
-extern sint gap_pos1, gap_pos2;
-extern char *gap_penalty_mask1,*gap_penalty_mask2;
-extern char *sec_struct_mask1,*sec_struct_mask2;
-extern sint struct_penalties1,struct_penalties2;
-extern sint output_struct_penalties;
-extern Boolean use_ss1, use_ss2;
-
-extern char *explicit_par_file;
-extern char *par_file;
-extern char def_protpar_file[];
-extern char def_dnapar_file[];
-extern sint ncutseqs;
-extern Boolean residue_exceptions;
-extern Boolean segment_exceptions;
-extern color color_lut[];
-extern char *res_cat1[];
-extern char *res_cat2[];
-
-static range selected_seqs; /* sequences selected by clicking on names */
-static range selected_res; /* residues selected by clicking on seqs */
-
-
-static int fromvscroll,fromhscroll; /* set by scrolling functions,
- used by DrawPanel, draw_names, draw_seqs */
-
-
-void resize_multi_window(void)
-{
- RecT r;
-
- ObjectRect(mainw,&r);
- fit_seq_display(r,FALSE);
-}
-
-void resize_prf_window(int numseqs1,int numseqs2)
-{
- RecT r;
-
- SelectFont(datafont);
- stdCharWidth=CharWidth('A');
- stdLineHeight=LineHeight();
-
- if(numseqs1>max_plines)
- numseqs1=max_plines;
- else if(numseqs1<min_plines1)
- numseqs1=min_plines1;
- if(numseqs2>max_plines)
- numseqs2=max_plines;
- else if(numseqs2<min_plines2)
- numseqs2=min_plines2;
- ObjectRect(mainw,&r);
-
- fit_prf_displays(r,numseqs1,numseqs2,FALSE);
-}
-
-static void fit_seq_display(RecT wr,Boolean mv_message)
-{
- int width,height,moffset;
- RecT nr,sr,mr;
- panel_data data;
-
- ObjectRect(seq_panel.names,&nr);
- ObjectRect(message,&mr);
- moffset=mr.top-nr.bottom;
- width=nr.right-nr.left;
- height=wr.bottom-wr.top-boffset-toffset;
- nr.top=toffset;
- nr.left=loffset;
- nr.bottom=nr.top+height;
- nr.right=nr.left+width;
- SetPosition(seq_panel.names,&nr);
-
- GetPanelExtra(seq_panel.names,&data);
- data.vlines=(height-SCOREHEIGHT)/data.lineheight - MARGIN;
- data.vseqs=data.vlines-data.nhead-data.nfoot;
- SetPanelExtra(seq_panel.names,&data);
-
-
- sr.top=nr.top;
- sr.left=nr.right;
- sr.bottom=sr.top+height;
- sr.right=wr.right-wr.left-roffset;
- width=sr.right-sr.left;
- SetPosition(seq_panel.seqs,&sr);
-
- GetPanelExtra(seq_panel.seqs,&data);
- data.vcols=width/data.charwidth - MARGIN*2;
- data.vlines=(height-SCOREHEIGHT)/data.lineheight - MARGIN;
- data.vseqs=data.vlines-data.nhead-data.nfoot;
- SetPanelExtra(seq_panel.seqs,&data);
-
- if(mv_message) {
- height=mr.bottom-mr.top;
- mr.top=nr.bottom+moffset;
- mr.bottom=mr.top+height;
- SetPosition(message,&mr);
- }
-
-
- position_scrollbars(seq_panel);
- correct_name_bars(TRUE);
- correct_seq_bars(TRUE);
-
-}
-
-static void fit_prf_displays(RecT wr,int numseqs1,int numseqs2,Boolean mv_message)
-{
- int width,height,moffset;
- RecT mr,nr,sr;
- panel_data data;
-
- ObjectRect(prf_panel[1].names,&nr);
- ObjectRect(message,&mr);
- moffset=mr.top-nr.bottom;
-
- ObjectRect(prf_panel[0].names,&nr);
- width=nr.right-nr.left;
-
- nr.top=toffset;
- nr.left=loffset;
- height=(wr.bottom-wr.top-boffset-toffset-poffset)*numseqs1/(numseqs1+numseqs2);
- nr.bottom=nr.top+height;
- nr.right=nr.left+width;
- SetPosition(prf_panel[0].names,&nr);
- GetPanelExtra(prf_panel[0].names,&data);
- data.vlines=(height-SCOREHEIGHT)/data.lineheight - MARGIN;
- data.vseqs=data.vlines-data.nhead-data.nfoot;
- SetPanelExtra(prf_panel[0].names,&data);
- sr.top=nr.top;
- sr.left=nr.right;
- sr.bottom=sr.top+height;
- sr.right=wr.right-wr.left-roffset;
- width=sr.right-sr.left;
- SetPosition(prf_panel[0].seqs,&sr);
- GetPanelExtra(prf_panel[0].seqs,&data);
- data.vcols=width/data.charwidth - MARGIN*2;
- data.vlines=(height-SCOREHEIGHT)/data.lineheight - MARGIN;
- data.vseqs=data.vlines-data.nhead-data.nfoot;
- SetPanelExtra(prf_panel[0].seqs,&data);
- position_scrollbars(prf_panel[0]);
-
-
-
- nr.top=nr.bottom+poffset;
- height=(wr.bottom-wr.top-boffset-toffset-poffset)*numseqs2/(numseqs1+numseqs2);
- nr.bottom=nr.top+height;
- SetPosition(prf_panel[1].names,&nr);
- GetPanelExtra(prf_panel[1].names,&data);
- data.vlines=(height-SCOREHEIGHT)/data.lineheight - MARGIN;
- data.vseqs=data.vlines-data.nhead-data.nfoot;
- SetPanelExtra(prf_panel[1].names,&data);
- sr.top=nr.top;
- sr.bottom=sr.top+height;
- SetPosition(prf_panel[1].seqs,&sr);
- GetPanelExtra(prf_panel[1].seqs,&data);
- data.vcols=width/data.charwidth - MARGIN*2;
- data.vlines=(height-SCOREHEIGHT)/data.lineheight - MARGIN;
- data.vseqs=data.vlines-data.nhead-data.nfoot;
- SetPanelExtra(prf_panel[1].seqs,&data);
- position_scrollbars(prf_panel[1]);
-
- if(mv_message) {
- height=mr.bottom-mr.top;
- mr.top=nr.bottom+moffset;
- mr.bottom=mr.top+height;
- SetPosition(message,&mr);
- }
-
- correct_name_bars(TRUE);
- correct_seq_bars(TRUE);
-}
-
-void ResizeWindowProc(WindoW w)
-{
- int numseqs1,numseqs2;
- RecT wr;
- panel_data data;
-
- if(window_displayed==FALSE) return;
-
- ObjectRect(w,&wr);
- if (aln_mode==MULTIPLEM)
- {
-/* if the window is too small, hide everything */
- if(wr.bottom-wr.top < toffset+boffset)
- {
- Hide(seq_display);
- Hide(message);
- return;
- }
- fit_seq_display(wr,TRUE);
- Show(seq_display);
- Show(message);
- }
- else
- {
-/* if the window is too small, hide everything */
- if(wr.bottom-wr.top < toffset+boffset+2*poffset)
- {
- Hide(prf1_display);
- Hide(prf2_display);
- Hide(message);
- return;
- }
- GetPanelExtra(prf_panel[0].names,&data);
- numseqs1=data.nseqs;
- if(numseqs1<min_plines1)numseqs1=min_plines1;
- else if(numseqs1>max_plines)numseqs1=max_plines;
- GetPanelExtra(prf_panel[1].names,&data);
- numseqs2=data.nseqs;
- if(numseqs2<min_plines2)numseqs2=min_plines2;
- else if(numseqs2>max_plines)numseqs2=max_plines;
-
- fit_prf_displays(wr,numseqs1,numseqs2,TRUE);
- Show(prf1_display);
- Show(prf2_display);
- Show(message);
- }
-
-}
-
-void position_scrollbars(spanel p)
-{
- int height;
- RecT hr,vr,nr,sr;
- panel_data data;
-
- ObjectRect(p.names,&nr);
- GetPanelExtra(p.names,&data);
- ObjectRect(data.hscrollbar,&hr);
- height=hr.bottom-hr.top;
- LoadRect(&hr,nr.left,nr.bottom,nr.right,nr.bottom+height);
- SetPosition(data.hscrollbar,&hr);
-#ifdef WIN_MAC
- AdjustPrnt(data.hscrollbar,&hr,FALSE);
-#endif
- ObjectRect(p.seqs,&sr);
- GetPanelExtra(p.seqs,&data);
- ObjectRect(data.hscrollbar,&hr);
- height=hr.bottom-hr.top;
- LoadRect(&hr,sr.left,sr.bottom,sr.right,sr.bottom+height);
- SetPosition(data.hscrollbar,&hr);
-#ifdef WIN_MAC
- AdjustPrnt(data.hscrollbar,&hr,FALSE);
-#endif
- ObjectRect(data.vscrollbar,&vr);
- LoadRect(&vr,vr.left,sr.top,vr.right,sr.bottom);
- SetPosition(data.vscrollbar,&vr);
-#ifdef WIN_MAC
- AdjustPrnt(data.vscrollbar,&vr,FALSE);
-#endif
-}
-
-
-
-
-
-/*
- * Rebuild the display buffers of a names/sequences panel pair from the
- * global alignment arrays.
- *
- * p     - the panel pair to fill
- * fs,ls - first and last sequence to load; the global arrays (names,
- *         seqlen_array, seq_array) are read at index i+1 for each i in fs..ls
- * reset - when TRUE the first visible line/column of both panels is set to 0
- *
- * The previous buffers are released through free_panel_data(), new ones are
- * allocated when at least one sequence is requested, and header/footer,
- * column scores and the colour mask are regenerated.
- */
-void load_aln_data(spanel p,int fs,int ls,Boolean reset)
-{
- int i,j,slength=0;
- int nhead;
- sint val;
- panel_data name_data,seq_data;
-
- WatchCursor();
-
- GetPanelExtra(p.names,&name_data);
- GetPanelExtra(p.seqs,&seq_data);
- name_data=free_panel_data(name_data);
- seq_data=free_panel_data(seq_data);
- SetPanelExtra(p.names,&name_data);
- SetPanelExtra(p.seqs,&seq_data);
-
- name_data.nseqs=ls-fs+1;
- seq_data.nseqs=name_data.nseqs;
- name_data.firstseq=fs;
- seq_data.firstseq=fs;
-
-/* find the maximum length of sequence */
- for(i=fs;i<=ls;i++)
- if (slength < seqlen_array[i+1]) slength = seqlen_array[i+1];
- name_data.ncols=max_names;
- seq_data.ncols=slength;
-
- if (name_data.nseqs>0)
- {
- name_data=alloc_name_data(name_data);
- seq_data=alloc_seq_data(seq_data);
-
-
- for(i=fs;i<=ls;i++)
- {
- strncpy(name_data.lines[i-fs],names[i+1],MAXNAMES);
- name_data.lines[i-fs][MAXNAMES]='\0';
- for(j=0;j<seqlen_array[i+1];j++)
- {
- val = seq_array[i+1][j+1];
-/* -3 / 253 stop the copy early; presumably an end-of-sequence marker - TODO confirm */
- if((val == -3) || (val == 253))
- break;
- else if((val == gap_pos1) || (val == gap_pos2))
- seq_data.lines[i-fs][j] = '-';
- else {
- seq_data.lines[i-fs][j] = amino_acid_codes[val];
- }
- }
-/* pad shorter sequences with blanks up to the longest sequence */
-/* NOTE(review): after an early break the columns between the break position */
-/* and seqlen_array[i+1] are not written by this function - confirm ckalloc */
-/* returns zeroed memory or that the marker only occurs at the very end */
- for(j=seqlen_array[i+1];j<slength;j++)
- seq_data.lines[i-fs][j] = ' ';
- seq_data.lines[i-fs][j]='\0';
-
- name_data.selected[i-fs]=FALSE;
-
- }
-
-
-/* header line 0 is filled by make_consensus; header line 1 by make_struct_data, */
-/* or by make_gp_data when make_struct_data returns 0 */
- make_consensus(seq_data,name_data.header[0],seq_data.header[0]);
- nhead=make_struct_data(seq_data.prf_no,slength,name_data.header[1],seq_data.header[1]);
- if (nhead==0)
- nhead=make_gp_data(seq_data.prf_no,slength,name_data.header[1],seq_data.header[1]);
- seq_data.nhead=name_data.nhead=nhead+1;
-
- seq_data.nfoot=name_data.nfoot=1;
- seq_data.consensus=NULL;
- make_ruler(slength,name_data.footer[0],seq_data.footer[0]);
- make_colscores(seq_data);
- }
- else
- {
- seq_data.ncols=name_data.ncols=0;
- }
-
- if(reset==TRUE)
- {
- name_data.firstvline=0;
- name_data.firstvcol=0;
- seq_data.firstvline=0;
- seq_data.firstvcol=0;
- }
-/* visible sequence rows = visible lines minus header and footer lines */
- name_data.vseqs=name_data.vlines-name_data.nhead-name_data.nfoot;
- seq_data.vseqs=seq_data.vlines-seq_data.nhead-seq_data.nfoot;
-
- if(seq_data.nseqs>0)
- {
-/* try to find the user's color parameter file */
- if (explicit_par_file == NULL)
- {
- if (par_file != NULL)
- ckfree(par_file);
- if(dnaflag)
- par_file=find_file(def_dnapar_file);
- else
- par_file=find_file(def_protpar_file);
- }
- init_color_parameters(par_file);
- make_colormask(seq_data);
- }
-
-/* store the rebuilt data back into both panels */
- SetPanelExtra(p.names,&name_data);
- SetPanelExtra(p.seqs,&seq_data);
-
- ArrowCursor();
-}
-
-/*
- * Reload sequences fs..ls into the panel pair p, repaint both panels and
- * bring the name and sequence scroll bars back in line with the new data.
- * reset is forwarded unchanged to every step.
- */
-void load_aln(spanel p,int fs,int ls,Boolean reset)
-{
-    /* rebuild the display buffers first; everything below reads them */
-    load_aln_data(p, fs, ls, reset);
-
-    /* repaint: names panel, then sequence panel */
-    DrawPanel(p.names);
-    DrawPanel(p.seqs);
-
-    /* finally re-sync the scroll bars */
-    correct_name_bars(reset);
-    correct_seq_bars(reset);
-}
-
-/*
- * Allocate the buffers of a names panel and return the updated descriptor.
- *
- * For data.nseqs sequences this creates the line table (each line holding up
- * to MAXNAMES characters, pre-filled from names[1..nseqs]), the per-sequence
- * selection flags, and mheader/mfooter header and footer lines. A names panel
- * has no colour mask, so data.colormask is set to NULL.
- */
-static panel_data alloc_name_data(panel_data data)
-{
-    int row;
-
-    data.lines = (char **)ckalloc((data.nseqs + 1) * sizeof(char *));
-    data.colormask = NULL;
-    data.selected = (int *)ckalloc((data.nseqs + 1) * sizeof(int));
-
-    for (row = 0; row < data.nseqs; row++) {
-        char *dst = (char *)ckalloc((MAXNAMES + 1) * sizeof(char));
-
-        data.lines[row] = dst;
-        strncpy(dst, names[row + 1], MAXNAMES);
-        dst[MAXNAMES] = '\0';   /* strncpy does not terminate on truncation */
-    }
-
-    data.header = (char **)ckalloc((mheader + 1) * sizeof(char *));
-    for (row = 0; row < mheader; row++) {
-        data.header[row] = (char *)ckalloc((MAXNAMES + 1) * sizeof(char));
-    }
-
-    data.footer = (char **)ckalloc((mfooter + 1) * sizeof(char *));
-    for (row = 0; row < mfooter; row++) {
-        data.footer[row] = (char *)ckalloc((MAXNAMES + 1) * sizeof(char));
-    }
-
-    return data;
-}
-
-/*
- * Allocate the buffers of a sequence panel and return the updated descriptor.
- *
- * Sized from data.nseqs rows and data.ncols columns: text lines and colour
- * mask per sequence, per-column selection flags (all cleared), header and
- * footer lines, per-column scores, and the residue/segment exception tables.
- * The current column selection is cleared (firstsel = lastsel = -1).
- */
-static panel_data alloc_seq_data(panel_data data)
-{
-    int k;
-
-    data.lines = (char **)ckalloc((data.nseqs + 1) * sizeof(char *));
-    data.colormask = (char **)ckalloc((data.nseqs + 1) * sizeof(char *));
-    data.lastsel = -1;
-    data.firstsel = data.lastsel;
-
-    for (k = 0; k < data.nseqs; k++) {
-        data.lines[k] = (char *)ckalloc((data.ncols + 1) * sizeof(char));
-        data.colormask[k] = (char *)ckalloc((data.ncols + 1) * sizeof(char));
-    }
-
-    /* one selection flag per column, nothing selected yet */
-    data.selected = (int *)ckalloc((data.ncols + 1) * sizeof(int));
-    for (k = 0; k < data.ncols; k++) {
-        data.selected[k] = FALSE;
-    }
-
-    data.header = (char **)ckalloc((mheader + 1) * sizeof(char *));
-    for (k = 0; k < mheader; k++) {
-        data.header[k] = (char *)ckalloc((data.ncols + 1) * sizeof(char));
-    }
-
-    data.colscore = (sint *)ckalloc((data.ncols + 1) * sizeof(sint));
-
-    data.residue_exception = (Boolean **)ckalloc((data.nseqs + 1) * sizeof(Boolean *));
-    for (k = 0; k < data.nseqs; k++) {
-        data.residue_exception[k] = (Boolean *)ckalloc((data.ncols + 1) * sizeof(Boolean));
-    }
-
-    data.segment_exception = (short **)ckalloc((data.nseqs + 1) * sizeof(short *));
-    for (k = 0; k < data.nseqs; k++) {
-        data.segment_exception[k] = (short *)ckalloc((data.ncols + 1) * sizeof(short));
-    }
-
-    data.footer = (char **)ckalloc((mfooter + 1) * sizeof(char *));
-    for (k = 0; k < mfooter; k++) {
-        data.footer[k] = (char *)ckalloc((data.ncols + 1) * sizeof(char));
-    }
-
-    return data;
-}
-
-/*
- * Bring the scroll bars of the names panel(s) in line with the panel data.
- *
- * In profile mode (aln_mode==PROFILEM) both profile name panels are handled,
- * otherwise the single sequence name panel. When reset is TRUE the first
- * visible column/line are set to 0 before the bars are corrected.
- *
- * Fix: the horizontal bar of profile 2 is now positioned from profile 2's own
- * firstvcol; it previously used profile 1's value, so the bar and the panel
- * contents could disagree whenever reset was FALSE.
- */
-void correct_name_bars(Boolean reset)
-{
-    panel_data data,data1;
-
-    if(aln_mode==PROFILEM)
-    {
-        GetPanelExtra(prf_panel[0].names,&data);
-        GetPanelExtra(prf_panel[1].names,&data1);
-        if(reset==TRUE)
-        {
-            data.firstvcol=0;
-            data1.firstvcol=0;
-        }
-        correct_scrollbar(data.hscrollbar,data.vcols,data.ncols,data.firstvcol,reset);
-        /* each profile uses its own horizontal offset */
-        correct_scrollbar(data1.hscrollbar,data1.vcols,data1.ncols,data1.firstvcol,reset);
-        if(reset==TRUE)
-        {
-            data.firstvline=0;
-            data1.firstvline=0;
-        }
-        correct_scrollbar(data.vscrollbar,data.vseqs,data.nseqs,data.firstvline,reset);
-        correct_scrollbar(data1.vscrollbar,data1.vseqs,data1.nseqs,data1.firstvline,reset);
-        SetPanelExtra(prf_panel[0].names,&data);
-        SetPanelExtra(prf_panel[1].names,&data1);
-    }
-    else
-    {
-        GetPanelExtra(seq_panel.names,&data);
-        if(reset==TRUE)
-        {
-            data.firstvcol=0;
-            data.firstvline=0;
-        }
-        correct_scrollbar(data.vscrollbar,data.vseqs,data.nseqs,data.firstvline,reset);
-        correct_scrollbar(data.hscrollbar,data.vcols,data.ncols,data.firstvcol,reset);
-
-        SetPanelExtra(seq_panel.names,&data);
-    }
-
-}
-
-/*
- * Bring the scroll bars of the sequence panel(s) in line with the panel data.
- *
- * Profile mode handles both profile panels. With fixed_prf_scroll TRUE the
- * horizontal bar of profile 1 is hidden and profile 2's bar drives both
- * panels; each panel's lockoffset then records how far it is shifted against
- * the shared bar. Otherwise both bars are shown and corrected independently.
- * In single-alignment mode only seq_panel is handled. When reset is TRUE the
- * first visible column/line are set to 0.
- *
- * Fix: in the independent (unlocked) case the horizontal bar of profile 2,
- * and in both cases its vertical bar, are now positioned from profile 2's own
- * firstvcol/firstvline; they previously used profile 1's values.
- */
-void correct_seq_bars(Boolean reset)
-{
-    int maxcols,m1,m2;
-    panel_data data,data1;
-
-    if(aln_mode==PROFILEM)
-    {
-        GetPanelExtra(prf_panel[0].seqs,&data);
-        GetPanelExtra(prf_panel[1].seqs,&data1);
-        if(fixed_prf_scroll==TRUE)
-        {
-            Hide(data.hscrollbar);
-            /* m1: columns left of the lock point, m2: columns from it onwards */
-            m1=MAX(data.firstvcol,data1.firstvcol);
-            m2=MAX(data.ncols-data.firstvcol,data1.ncols-data1.firstvcol);
-            maxcols=m1+m2;
-            if(reset==TRUE)
-            {
-                data.firstvcol=0;
-                data1.firstvcol=0;
-            }
-            data.lockoffset= -MAX(data1.firstvcol-data.firstvcol,0);
-            data1.lockoffset= -MAX(data.firstvcol-data1.firstvcol,0);
-            correct_scrollbar(data1.hscrollbar,data1.vcols,maxcols,m1,TRUE);
-        }
-        else
-        {
-            Show(data.hscrollbar);
-            if(reset==TRUE)
-            {
-                data.firstvcol=0;
-                data1.firstvcol=0;
-            }
-            data.lockoffset=0;
-            data1.lockoffset=0;
-            correct_scrollbar(data.hscrollbar,data.vcols,data.ncols,data.firstvcol,reset);
-            /* each profile uses its own horizontal offset */
-            correct_scrollbar(data1.hscrollbar,data1.vcols,data1.ncols,data1.firstvcol,reset);
-        }
-        if(reset==TRUE)
-        {
-            data.firstvline=0;
-            data1.firstvline=0;
-        }
-        correct_scrollbar(data.vscrollbar,data.vseqs,data.nseqs,data.firstvline,reset);
-        /* each profile uses its own vertical offset */
-        correct_scrollbar(data1.vscrollbar,data1.vseqs,data1.nseqs,data1.firstvline,reset);
-        SetPanelExtra(prf_panel[0].seqs,&data);
-        SetPanelExtra(prf_panel[1].seqs,&data1);
-    }
-    else
-    {
-        GetPanelExtra(seq_panel.seqs,&data);
-        if(reset==TRUE)
-        {
-            data.firstvcol=0;
-            data.firstvline=0;
-        }
-        correct_scrollbar(data.vscrollbar,data.vseqs,data.nseqs,data.firstvline,reset);
-        correct_scrollbar(data.hscrollbar,data.vcols,data.ncols,data.firstvcol,reset);
-
-        SetPanelExtra(seq_panel.seqs,&data);
-    }
-
-}
-
-/*
- * Update one scroll bar: page size = visible, maximum = total - visible
- * (0 when everything fits or visible is not positive), current value = value.
- * With reset TRUE the bar value is first forced to 0. A NULL bar is ignored.
- */
-static void correct_scrollbar(BaR b,int visible,int total,int value,Boolean reset)
-{
-    int upper;
-
-    if (b == NULL)
-        return;
-
-    upper = (visible > 0 && total > visible) ? total - visible : 0;
-
-    if (reset == TRUE)
-        CorrectBarValue(b, 0);
-    CorrectBarPage(b, visible, visible);
-    CorrectBarValue(b, value);
-    CorrectBarMax(b, upper);
-}
-
-
-/*
- * Recompute the colour mask of the main sequence panel and repaint it.
- * Does nothing when the panel holds no sequences.
- */
-void color_seqs(void)
-{
-    panel_data pd;
-
-    GetPanelExtra(seq_panel.seqs, &pd);
-    if (pd.nseqs != 0)
-    {
-        info("Coloring sequences...");
-        make_colormask(pd);
-        DrawPanel(seq_panel.seqs);
-        info("Done.");
-    }
-}
-
-/*
- * Recompute the colour mask of the profile 1 sequence panel and repaint it.
- * Does nothing when the panel holds no sequences.
- *
- * The progress message is issued before the colour mask is built, as in
- * color_seqs(); it used to appear only after that work was already done.
- */
-void color_prf1(void)
-{
-    panel_data data;
-
-    GetPanelExtra(prf_panel[0].seqs,&data);
-    if (data.nseqs == 0) return;
-
-    info("Coloring profile 1...");
-    make_colormask(data);
-    DrawPanel(prf_panel[0].seqs);
-    info("Done.");
-}
-
-/*
- * Recompute the colour mask of the profile 2 sequence panel and repaint it.
- * Does nothing when the panel holds no sequences.
- *
- * The progress message is issued before the colour mask is built, as in
- * color_seqs(); it used to appear only after that work was already done.
- */
-void color_prf2(void)
-{
-    panel_data data;
-
-    GetPanelExtra(prf_panel[1].seqs,&data);
-    if (data.nseqs == 0) return;
-
-    info("Coloring profile 2...");
-    make_colormask(data);
-    DrawPanel(prf_panel[1].seqs);
-    info("Done.");
-}
-
-/*
- * Delete every alignment column in which all of the sequences fseq..lseq
- * hold a gap (gap_pos1 or gap_pos2).
- *
- * Columns are 1-based in seq_array. Removing a column shifts the rest of
- * each sequence one position left and decrements its seqlen_array entry; the
- * secondary-structure and gap-penalty masks of the given profile (prf_no 1
- * or 2), when present, are shifted the same way. The scan length is taken
- * from seqlen_array[fseq]. Nothing is done unless fseq < lseq.
- */
-void remove_gap_pos(int fseq, int lseq,int prf_no)
-{
-    int col, seq, k, gaps;
-    int nseqs;
-
-    if (fseq >= lseq)
-        return;
-
-    nseqs = lseq - fseq + 1;
-    col = 1;
-    while (col <= seqlen_array[fseq])
-    {
-        /* count the gaps in this column */
-        gaps = 0;
-        for (seq = fseq; seq <= lseq; seq++)
-        {
-            if (seq_array[seq][col] == gap_pos1 || seq_array[seq][col] == gap_pos2)
-                gaps++;
-        }
-
-        if (gaps != nseqs)
-        {
-            col++;
-            continue;
-        }
-
-        /* all-gap column: close it up; col is NOT advanced so the column
-           that slides into this position is examined next */
-        for (seq = fseq; seq <= lseq; seq++)
-        {
-            for (k = col + 1; k <= seqlen_array[seq] + 1; k++)
-                seq_array[seq][k - 1] = seq_array[seq][k];
-            seqlen_array[seq]--;
-        }
-
-        if (prf_no == 1)
-        {
-            if (sec_struct_mask1 != NULL)
-                for (k = col; k <= seqlen_array[fseq]; k++)
-                    sec_struct_mask1[k - 1] = sec_struct_mask1[k];
-            if (gap_penalty_mask1 != NULL)
-                for (k = col; k <= seqlen_array[fseq]; k++)
-                    gap_penalty_mask1[k - 1] = gap_penalty_mask1[k];
-        }
-        else if (prf_no == 2)
-        {
-            if (sec_struct_mask2 != NULL)
-                for (k = col; k <= seqlen_array[fseq]; k++)
-                    sec_struct_mask2[k - 1] = sec_struct_mask2[k];
-            if (gap_penalty_mask2 != NULL)
-                for (k = col; k <= seqlen_array[fseq]; k++)
-                    gap_penalty_mask2[k - 1] = gap_penalty_mask2[k];
-        }
-
-        if (seqlen_array[fseq] <= 0)
-            break;
-    }
-}
-
-/* width and height passed here are in pixels */
-
-/*
- * Create a names or sequence panel inside group g and attach a freshly
- * initialised panel_data record to it.
- *
- * type             - NAMES for a names panel, anything else is treated as a
- *                    sequence panel
- * firstseq, nseqs  - range of sequences (indices into names / seqlen_array)
- *                    used to size data.ncols
- *
- * All buffers and scroll bars start out as NULL and the selection is empty;
- * visible columns/lines are derived from the pixel size and the font metrics
- * of datafont.
- */
-static PaneL make_panel(int type,GrouP g,int width,int height,int firstseq,int nseqs)
-{
-    int seq, len, longest = 0;
-    PaneL panel;
-    panel_data data;
-
-    data.type = type;
-    SelectFont(datafont);
-    data.lineheight = LineHeight();
-    data.charwidth = CharWidth('A');
-
-    if (type == NAMES)
-    {
-        /* longest sequence name in the range */
-        for (seq = firstseq; seq <= firstseq+nseqs-1; seq++)
-        {
-            len = strlen(names[seq]);
-            if (len > longest)
-                longest = len;
-        }
-        data.vcols = width/data.charwidth - MARGIN*2 - DNUMBER;
-    }
-    else
-    {
-        /* longest sequence in the range */
-        for (seq = firstseq; seq <= firstseq+nseqs-1; seq++)
-        {
-            if (seqlen_array[seq] > longest)
-                longest = seqlen_array[seq];
-        }
-        data.vcols = width/data.charwidth - MARGIN*2;
-    }
-
-    /* no text buffers yet */
-    data.lines = NULL;
-    data.nhead = 0;
-    data.nfoot = 0;
-    data.header = NULL;
-    data.footer = NULL;
-    data.consensus = NULL;
-    data.colormask = NULL;
-
-    /* geometry */
-    data.vlines = (height-SCOREHEIGHT)/data.lineheight - MARGIN;
-    data.vseqs = data.vlines-data.nhead-data.nfoot;
-    data.nseqs = nseqs;
-    data.ncols = longest;
-    data.firstseq = firstseq-1;
-    data.firstvline = 0;
-    data.firstvcol = 0;
-    data.lockoffset = 0;
-    data.ascent = Ascent();
-    data.descent = Descent();
-
-    /* empty selection, no derived data, no scroll bars */
-    data.selected = NULL;
-    data.firstsel = -1;
-    data.lastsel = -1;
-    data.colscore = NULL;
-    data.seqweight = NULL;
-    data.subgroup = NULL;
-    data.residue_exception = NULL;
-    data.segment_exception = NULL;
-    data.vscrollbar = NULL;
-    data.hscrollbar = NULL;
-
-    panel = AutonomousPanel(g, width, height, DrawPanel, NULL,NULL,sizeof(panel_data), NULL, NULL);
-    SetPanelExtra(panel, &data);
-
-    return panel;
-}
-
-/*
- * Draw callback for a names or sequence panel.
- *
- * On a full redraw (neither fromvscroll nor fromhscroll set) the visible
- * column/line counts are recomputed from the panel's current pixel size and
- * stored back, and the frame is drawn and its interior erased. Header, footer
- * and column scores are drawn unless the redraw comes from a vertical
- * scroll, and finally the name or sequence lines are drawn.
- *
- * Fix: the panel data is now fetched unconditionally. It used to be read only
- * inside the full-redraw branch, so data.type was tested uninitialised
- * whenever one of the scroll flags was non-zero.
- */
-void DrawPanel(PaneL p)
-{
-    RecT r;
-    panel_data data;
-    int pixelwidth,pixelheight;
-
-    UseWindow(mainw);
-    Select(p);
-
-    /* needed on every path: data.type selects the line-drawing routine below */
-    GetPanelExtra(p, &data);
-
-    if (fromvscroll==0 && fromhscroll==0)
-    {
-        ObjectRect(p,&r);
-        pixelwidth=r.right-r.left;
-        pixelheight=r.bottom-r.top;
-
-        SelectFont(datafont);
-        data.lineheight=LineHeight();
-        data.charwidth=CharWidth('A');
-        if (data.type==NAMES)
-            data.vcols=pixelwidth/data.charwidth-MARGIN*2-DNUMBER;
-        else
-            data.vcols=pixelwidth/data.charwidth-MARGIN*2;
-        data.vlines=(pixelheight-SCOREHEIGHT)/data.lineheight - MARGIN;
-        data.vseqs=data.vlines-data.nhead-data.nfoot;
-        /* a very small panel must not yield negative visible counts */
-        if(data.vseqs<0)data.vseqs=0;
-        if(data.vcols<0)data.vcols=0;
-        SetPanelExtra(p, &data);
-/* draw the outside frame */
-        ObjectRect (p, &r);
-        Black();
-        FrameRect(&r);
-        InsetRect(&r,1,1);
-        black_on_white();
-        EraseRect(&r);
-        if(data.nseqs == 0) return;
-    }
-
-/* draw the structure and gap penalty data */
-/* draw the footer */
-    if (fromvscroll==0)
-    {
-        draw_header(p);
-        draw_footer(p);
-        draw_colscores(p);
-    }
-
-/* draw the data lines */
-    if (data.type==NAMES)
-        draw_names(p);
-    else
-        draw_seqs(p);
-
-
-}
-
-/*
- * Horizontal scroll action for the active names panel: record newval as the
- * first visible column and redraw the names (skipped when no sequence rows
- * are visible). bar and oldval are not used.
- */
-void hscrollnames(BaR bar, int newval, int oldval)
-{
-    PaneL names_panel = active_panel.names;
-    panel_data pd;
-
-    GetPanelExtra(names_panel, &pd);
-    pd.firstvcol = newval;
-    SetPanelExtra(names_panel, &pd);
-    Select(names_panel);
-
-    if (pd.vseqs > 0)
-        draw_names(names_panel);
-}
-
-/*
- * Vertical scroll action for the active names panel: record newval as the
- * first visible line and redraw the names (skipped when no sequence rows are
- * visible). bar and oldval are not used.
- */
-void vscrollnames(BaR bar, int newval, int oldval)
-{
-    PaneL names_panel = active_panel.names;
-    panel_data pd;
-
-    GetPanelExtra(names_panel, &pd);
-    pd.firstvline = newval;
-    SetPanelExtra(names_panel, &pd);
-    Select(names_panel);
-
-    if (pd.vseqs > 0)
-        draw_names(names_panel);
-}
-
-/*
- * Vertical scroll action for the active sequence panel.
- *
- * Stores newval as the first visible line. A single-line step (newval is
- * oldval+1 or oldval-1) scrolls the existing pixels by one line height and
- * sets fromvscroll to +1/-1 so that draw_seqs() only repaints the newly
- * exposed row; anything else sets fromvscroll to 0. When fewer than three
- * rows are visible, or the rows from the previous first line onwards do not
- * fill the panel, the rows are redrawn without pixel scrolling.
- */
-void vscrollseqs(BaR bar, int newval, int oldval)
-{
-    PaneL seqs_panel = active_panel.seqs;
-    panel_data pd;
-    RecT rows, inner;
-    int prev_first;
-    int dir;
-
-    GetPanelExtra(seqs_panel, &pd);
-    prev_first = pd.firstvline;
-    pd.firstvline = newval;
-    SetPanelExtra(seqs_panel, &pd);
-    Select(seqs_panel);
-
-    if (pd.vseqs <= 0)
-        return;
-
-    if (pd.vseqs < 3 || pd.nseqs - prev_first < pd.vseqs)
-    {
-        fromvscroll = 0;
-        draw_seqs(seqs_panel);
-        return;
-    }
-
-    /* +1: one line down, -1: one line up, 0: any other jump */
-    if (newval == oldval + 1)
-        dir = 1;
-    else if (newval == oldval - 1)
-        dir = -1;
-    else
-        dir = 0;
-
-    fromvscroll = dir;
-    if (dir != 0)
-    {
-        ObjectRect(seqs_panel, &inner);
-        InsetRect(&inner, 1, 1);
-        rows.top = inner.top+(pd.nhead)*pd.lineheight+pd.descent+1;
-        rows.bottom = rows.top+(pd.vseqs)*pd.lineheight;
-        rows.left = inner.left;
-        rows.right = inner.right;
-        /* the pixels move opposite to the scroll direction */
-        if (dir == 1)
-            ScrollRect(&rows, 0, -pd.lineheight);
-        else
-            ScrollRect(&rows, 0, pd.lineheight);
-    }
-
-    draw_seqs(seqs_panel);
-}
-
-/*
- * Horizontal scroll action for the active sequence panel.
- *
- * Stores newval plus the panel's lockoffset as the first visible column. A
- * single-column step scrolls the existing pixels by one character width and
- * sets fromhscroll to +1/-1; anything else (or a panel with fewer than three
- * visible columns) sets fromhscroll to 0. Header, sequences, footer and
- * column scores are then redrawn, in that order.
- */
-void hscrollseqs(BaR bar, int newval, int oldval)
-{
-    PaneL seqs_panel = active_panel.seqs;
-    panel_data pd;
-    RecT area;
-
-    GetPanelExtra(seqs_panel, &pd);
-    pd.firstvcol = newval+pd.lockoffset;
-    SetPanelExtra(seqs_panel, &pd);
-    Select(seqs_panel);
-
-    if (pd.vcols <= 0)
-        return;
-
-    if (pd.vcols < 3)
-    {
-        /* too narrow for pixel scrolling */
-        fromhscroll = 0;
-    }
-    else if (newval == oldval + 1)
-    {
-        fromhscroll = 1;
-        ObjectRect(seqs_panel, &area);
-        InsetRect(&area, 1, 1);
-        area.left += pd.charwidth;
-        ScrollRect(&area, -pd.charwidth, 0);
-    }
-    else if (newval == oldval - 1)
-    {
-        fromhscroll = -1;
-        ObjectRect(seqs_panel, &area);
-        InsetRect(&area, 1, 1);
-        area.right = area.left+(pd.vcols+1)*pd.charwidth;
-        ScrollRect(&area, pd.charwidth, 0);
-    }
-    else
-    {
-        fromhscroll = 0;
-    }
-
-    draw_header(seqs_panel);
-    draw_seqs(seqs_panel);
-    draw_footer(seqs_panel);
-    draw_colscores(seqs_panel);
-}
-
-/*
- * Draw the visible name lines of a names panel.
- *
- * fromvscroll selects what is painted: 0 paints every visible row, -1 only
- * the first visible row, any other value only the last visible row. Rows past
- * the last sequence are skipped; selected rows are drawn highlighted.
- * Nothing is drawn when the panel has no line buffer.
- */
-void draw_names(PaneL p)
-{
-    int row, first, last;
-    panel_data pd;
-
-    UseWindow(mainw);
-    Select(p);
-    GetPanelExtra(p, &pd);
-    if (pd.lines == NULL)
-        return;
-    SelectFont(datafont);
-
-    switch (fromvscroll)
-    {
-    case 0:
-        first = pd.firstvline;
-        last = pd.firstvline+pd.vseqs-1;
-        break;
-    case -1:
-        first = last = pd.firstvline;
-        break;
-    default:
-        first = last = pd.firstvline+pd.vseqs-1;
-        break;
-    }
-
-    if (last >= pd.nseqs)
-        last = pd.nseqs-1;
-
-    for (row = first; row <= last; row++)
-        draw_nameline(p, row, row, (pd.selected[row]==TRUE) ? HIGHLIGHT : NORMAL);
-}
-
-/*
- * Draw the sequence lines of a sequence panel.
- *
- * What is painted depends on the scroll flags:
- *   fromhscroll == -1 : only the first visible column
- *   fromhscroll ==  1 : only the last visible column
- *   fromvscroll == -1 : only the first visible row
- *   fromvscroll ==  1 : only the last visible row
- *   otherwise         : every visible row
- * Both flags are set back to 0 at the end; the two early returns below leave
- * them untouched.
- */
-void draw_seqs(PaneL p)
-{
- int i,f,l,s,x,y,format;
- int fs,ls;
- panel_data data;
- PoinT pt;
- RecT r,block;
-
- UseWindow(mainw);
- Select(p);
- GetPanelExtra(p,&data);
- if(data.lines==NULL) return;
- SelectFont(datafont);
- black_on_white();
- if (fromhscroll==-1)
- {
-/* scrolled by one column: repaint the newly exposed first column */
- f=data.firstvcol;
- if ((f>=data.firstsel) && (f<=data.lastsel))
- format=HIGHLIGHT;
- else format=NORMAL;
- draw_seqcol(p,f,format);
- }
- else if (fromhscroll==1)
- {
-/* scrolled by one column: repaint the newly exposed last column */
- f=data.firstvcol+data.vcols-1;
- if ((f>=data.firstsel) && (f<=data.lastsel))
- format=HIGHLIGHT;
- else format=NORMAL;
- draw_seqcol(p,f,format);
- }
- else
- {
-/* f..l is the range of rows to repaint */
- if (fromvscroll==-1)
- {
- f=l=data.firstvline;
- }
- else if (fromvscroll==1)
- {
- f=l=data.firstvline+data.vseqs-1;
- }
- else
- {
- f=data.firstvline;
- l=data.firstvline+data.vseqs-1;
- }
-
- if(l>=data.nseqs) l=data.nseqs-1;
-/* s is the on-screen row of the first repainted line */
- s=f-data.firstvline;
- ObjectRect (p, &r);
- InsetRect(&r,1,1);
- data_colors();
- block.top=r.top+((s+data.nhead)*data.lineheight)+data.descent+1;
- block.bottom=block.top+(l-f+1)*data.lineheight;
- block.left=r.left;
- block.right=r.right;
- EraseRect(&block);
- if(data.nseqs == 0) return;
-
-/* if any part of the column selection is visible, erase that part of the */
-/* rows again with text_colors() so the selection shows behind the residues */
- if(data.firstsel != -1)
- {
- if ((data.firstsel>=data.firstvcol && data.firstsel<data.firstvcol+data.vcols)||
- (data.lastsel>=data.firstvcol && data.lastsel<data.firstvcol+data.vcols))
- {
- fs=data.firstsel-data.firstvcol;
- if (fs<0) fs=0;
- if (fs>=data.vcols) fs=data.vcols-1;
- ls=data.lastsel-data.firstvcol;
- if (ls<0) ls=0;
- if (ls>=data.vcols) ls=data.vcols-1;
- block.left=r.left+(fs+1)*data.charwidth;
- block.right=r.left+(ls+2)*data.charwidth;
- text_colors();
- EraseRect(&block);
- }
- }
-/* text starts one character width in from the left edge */
- x=r.left+data.charwidth;
-
- for(i=f;i<=l;i++)
- {
- y=block.top+(i-f+1)*data.lineheight-data.descent-1;
- LoadPt(&pt,x,y);
- draw_seqline(data,i,pt,data.firstvcol,data.firstvcol+data.vcols-1,NORMAL);
- }
- }
-
- black_on_white();
- fromvscroll=fromhscroll=0;
-}
-
-/*
- * Mouse-click handler for a names panel.
- *
- * If the panel belongs to profile 1 or 2, the name selection shown in the
- * other profile's names panel is cleared first. Unless the shift key is
- * held, the existing selection in this panel is cleared as well. The clicked
- * row (derived from pt.y) becomes both ends of selected_seqs and is
- * highlighted.
- */
-static void NameClick(PaneL panel, PoinT pt)
-{
- int i;
- panel_data data;
- RecT r;
-
- GetPanelExtra(panel,&data);
- if(data.prf_no==1)
- {
-/* revert selected area in profile 2 to normal */
- GetPanelExtra(prf_panel[1].names,&data);
- if(data.nseqs==0)
- draw_seq_pointer(prf_panel[1].names,0,NORMAL);
- for(i=0;i<data.nseqs;i++)
- if (data.selected[i]==TRUE)
- draw_nameline(prf_panel[1].names,i,i,NORMAL);
- SetPanelExtra(prf_panel[1].names,&data);
- }
- else if(data.prf_no==2)
- {
-/* revert selected area in profile 1 to normal */
- GetPanelExtra(prf_panel[0].names,&data);
- if(data.nseqs==0)
- draw_seq_pointer(prf_panel[0].names,0,NORMAL);
- for(i=0;i<data.nseqs;i++)
- if (data.selected[i]==TRUE)
- draw_nameline(prf_panel[0].names,i,i,NORMAL);
- SetPanelExtra(prf_panel[0].names,&data);
- }
-/* data was overwritten with the other panel's record above: reload ours */
- GetPanelExtra(panel,&data);
- Select(panel);
- ObjectRect(panel,&r);
- if (!shftKey)
- {
-/* revert existing selected area to normal */
- for(i=0;i<data.nseqs;i++)
- if (data.selected[i]==TRUE)
- draw_nameline(panel,i,i,NORMAL);
- }
-
-/* convert the click's y position to a sequence index and clamp to 0..nseqs-1 */
- selected_seqs.first = (pt.y - r.top-data.lineheight/2)/data.lineheight + data.firstvline-data.nhead;
- if (selected_seqs.first <0) selected_seqs.first=0;
- if (selected_seqs.first >=data.nseqs) selected_seqs.first=data.nseqs-1;
-/* after the clamps above, -1 can only result when the panel is empty (nseqs==0) */
- if (selected_seqs.first==-1 && ncutseqs > 0)
- {
- selected_seqs.last=selected_seqs.first=0;
- draw_seq_pointer(panel,0,HIGHLIGHT);
- }
- else
- {
- selected_seqs.last=selected_seqs.first;
- draw_nameline(panel,selected_seqs.first,selected_seqs.last,HIGHLIGHT);
- }
- black_on_white();
-
-}
-
-/*
- * Mouse-drag handler for a names panel.
- *
- * selected_seqs.first is the anchor row set by the click; s is the row now
- * under the pointer (clamped to 0..nseqs-1). Only the rows whose state
- * changes relative to the previous drag position (selected_seqs.last) are
- * redrawn: rows newly covered are highlighted, rows no longer covered are
- * reverted. selected_seqs.last is then updated to s.
- */
-static void NameDrag(PaneL panel, PoinT pt)
-{
- panel_data data;
- RecT r;
- int s;
-
- GetPanelExtra(panel,&data);
- Select(panel);
- ObjectRect(panel,&r);
- s = (pt.y - r.top-data.lineheight/2)/data.lineheight + data.firstvline-data.nhead;
- if (s<0) s=0;
- if (s>=data.nseqs) s=data.nseqs-1;
- if (s==selected_seqs.first)
- {
-/* back on the anchor row: collapse the selection to that single row */
- if (s!=selected_seqs.last)
- {
- draw_nameline(panel,selected_seqs.first,selected_seqs.last,NORMAL);
- draw_nameline(panel,selected_seqs.first,s,HIGHLIGHT);
- }
- }
- else if (s>selected_seqs.first)
- {
-/* pointer is below the anchor: grow or shrink the lower end */
- if (s>selected_seqs.last)
- draw_nameline(panel,selected_seqs.last+1,s,HIGHLIGHT);
- else if (s<selected_seqs.last)
- draw_nameline(panel,s+1,selected_seqs.last,NORMAL);
- }
- else
- {
-/* pointer is above the anchor: grow or shrink the upper end */
- if (s<selected_seqs.last)
- draw_nameline(panel,s,selected_seqs.last-1,HIGHLIGHT);
- else if (s>selected_seqs.last)
- draw_nameline(panel,selected_seqs.last,s-1,NORMAL);
- }
- selected_seqs.last=s;
-
- black_on_white();
-}
-
-/*
- * Mouse-release handler for a names panel: put selected_seqs into ascending
- * order and make this panel, together with the panel stored in its index
- * field, the active panel pair. pt is not used.
- */
-static void NameRelease(PaneL panel, PoinT pt)
-{
-    panel_data pd;
-
-    if (selected_seqs.last < selected_seqs.first)
-    {
-        int tmp = selected_seqs.last;
-
-        selected_seqs.last = selected_seqs.first;
-        selected_seqs.first = tmp;
-    }
-
-    active_panel.names = panel;
-    GetPanelExtra(panel, &pd);
-    active_panel.seqs = pd.index;
-}
-
-/*
- * Paint the full-width row of on-screen line seq (counted below the header
- * lines) in a names panel: black for HIGHLIGHT, white for any other format.
- */
-void draw_seq_pointer(PaneL panel,int seq,int format)
-{
-    RecT inner, row;
-    panel_data pd;
-
-    Select(panel);
-    GetPanelExtra(panel, &pd);
-
-    ObjectRect(panel, &inner);
-    InsetRect(&inner, 1, 1);
-
-    row.left = inner.left;
-    row.right = inner.right;
-    row.top = inner.top+((seq+pd.nhead)*pd.lineheight)+pd.descent+1;
-    row.bottom = row.top+pd.lineheight;
-
-    if (format == HIGHLIGHT)
-    {
-        Black();
-    }
-    else
-    {
-        White();
-    }
-    PaintRect(&row);
-}
-
-/*
- * Mouse-click handler for a sequence panel.
- *
- * If the panel belongs to profile 1 or 2, the column selection of the other
- * profile's sequence panel is cleared first. The clicked column s is derived
- * from pt.x and clamped to the existing and visible columns. With the shift
- * key held and a selection present, the selection is extended or shortened
- * to s; otherwise the old selection is cleared and column s alone becomes
- * the selection. selected_res and the panel's firstsel/lastsel are updated.
- */
-static void SeqClick(PaneL panel, PoinT pt)
-{
- int s;
- int f,l;
- panel_data data;
- RecT r;
-
- GetPanelExtra(panel,&data);
- if(data.prf_no==1)
- {
-/* revert selected area in profile 2 to normal */
- GetPanelExtra(prf_panel[1].seqs,&data);
- f=data.firstsel;
- l=data.lastsel;
- data.firstsel=-1;
- data.lastsel=-1;
- SetPanelExtra(prf_panel[1].seqs,&data);
- if (f != -1) highlight_seqrange(prf_panel[1].seqs,f,l,NORMAL);
- }
- else if(data.prf_no==2)
- {
-/* revert selected area in profile 1 to normal */
- GetPanelExtra(prf_panel[0].seqs,&data);
- f=data.firstsel;
- l=data.lastsel;
- data.firstsel=-1;
- data.lastsel=-1;
- SetPanelExtra(prf_panel[0].seqs,&data);
- if (f != -1) highlight_seqrange(prf_panel[0].seqs,f,l,NORMAL);
- }
-/* data was overwritten with the other panel's record above: reload ours */
- GetPanelExtra(panel,&data);
- Select(panel);
- ObjectRect(panel,&r);
-
-/* convert the click's x position to a column index, then clamp it */
- s = (pt.x - r.left-data.charwidth)/data.charwidth + data.firstvcol;
- if (s <0) s=0;
- if (s<data.firstvcol) s=data.firstvcol;
- if (s >=data.ncols) s=data.ncols-1;
- if (s >=data.firstvcol+data.vcols) s=data.firstvcol+data.vcols-1;
-
- if (shftKey && data.firstsel != -1)
- {
-/* shift-click: move the nearer end of the existing selection to s */
- if (s>data.lastsel)
- {
- highlight_seqrange(panel,data.firstsel,s,HIGHLIGHT);
- data.lastsel=s;
- }
- else if (s<data.firstsel)
- {
- highlight_seqrange(panel,s,data.lastsel,HIGHLIGHT);
- data.firstsel=s;
- }
- else
- {
- highlight_seqrange(panel,s+1,data.lastsel,NORMAL);
- highlight_seqrange(panel,data.firstsel,s,HIGHLIGHT);
- data.lastsel=s;
- }
- selected_res.first=data.firstsel;
- selected_res.last=data.lastsel;
- }
- else
- {
-/* revert existing selected area to normal */
- f=data.firstsel;
- l=data.lastsel;
- data.firstsel=-1;
- data.lastsel=-1;
-/* store the cleared selection before redrawing: highlight_seqrange re-reads the panel data */
- SetPanelExtra(panel,&data);
- if (f != -1) highlight_seqrange(panel,f,l,NORMAL);
- selected_res.first=selected_res.last=s;
- highlight_seqrange(panel,selected_res.first,selected_res.last,HIGHLIGHT);
- data.firstsel=selected_res.first;
- data.lastsel=selected_res.last;
- }
-
- SetPanelExtra(panel,&data);
- black_on_white();
-
-}
-
-/*
- * Mouse-drag handler for a sequence panel.
- *
- * selected_res.first is the anchor column set by the click; s is the column
- * now under the pointer (clamped to the existing and visible columns). Only
- * the columns whose state changes relative to the previous drag position
- * (selected_res.last) are redrawn. selected_res.last is then updated to s;
- * the panel's own firstsel/lastsel are not written here.
- */
-static void SeqDrag(PaneL panel, PoinT pt)
-{
- panel_data data;
- RecT r;
- int s;
-
- GetPanelExtra(panel,&data);
- Select(panel);
- ObjectRect(panel,&r);
- s = (pt.x - r.left-data.charwidth)/data.charwidth + data.firstvcol;
- if (s<0) s=0;
- if (s<data.firstvcol) s=data.firstvcol;
- if (s>=data.ncols) s=data.ncols-1;
- if (s >=data.firstvcol+data.vcols) s=data.firstvcol+data.vcols-1;
- if (s==selected_res.first)
- {
-/* back on the anchor column: collapse the selection to that single column */
- if (s!=selected_res.last)
- {
- highlight_seqrange(panel,selected_res.first,selected_res.last,NORMAL);
- highlight_seqrange(panel,selected_res.first,s,HIGHLIGHT);
- }
- }
- else if (s>selected_res.first)
- {
-/* pointer is right of the anchor: grow or shrink the right end */
- if (s>selected_res.last)
- highlight_seqrange(panel,selected_res.last+1,s,HIGHLIGHT);
- else if (s<selected_res.last)
- highlight_seqrange(panel,s+1,selected_res.last,NORMAL);
- }
- else
- {
-/* pointer is left of the anchor: grow or shrink the left end */
- if (s<selected_res.last)
- highlight_seqrange(panel,s,selected_res.last-1,HIGHLIGHT);
- else if (s>selected_res.last)
- highlight_seqrange(panel,selected_res.last,s-1,NORMAL);
- }
- selected_res.last=s;
-
- black_on_white();
-}
-
-/*
- * Mouse-release handler for a sequence panel: put selected_res into ascending
- * order, make this panel (and the panel stored in its index field) the active
- * pair, and store the final column selection in the panel data. pt is not
- * used.
- */
-static void SeqRelease(PaneL panel, PoinT pt)
-{
-    panel_data pd;
-
-    if (selected_res.last < selected_res.first)
-    {
-        int tmp = selected_res.last;
-
-        selected_res.last = selected_res.first;
-        selected_res.first = tmp;
-    }
-
-    active_panel.seqs = panel;
-    GetPanelExtra(panel, &pd);
-    active_panel.names = pd.index;
-
-    pd.firstsel = selected_res.first;
-    pd.lastsel = selected_res.last;
-    SetPanelExtra(panel, &pd);
-}
-
-/*
- * Draw the header lines at the top of a panel.
- *
- * The header area is erased with text_colors() and each of the nhead header
- * lines is painted starting at the first visible column. Columns with a
- * negative index are rendered as blanks. Nothing is drawn when the panel is
- * empty, has no header buffer, is too short for the header, or shows no
- * columns.
- */
-void draw_header(PaneL p)
-{
-    RecT area, inner;
-    PoinT pen;
-    int row, col, out, x, y;
-    panel_data pd;
-    char *text;
-
-    UseWindow(mainw);
-    Select(p);
-    SelectFont(datafont);
-    GetPanelExtra(p, &pd);
-
-    if (pd.nseqs == 0 || pd.header == NULL ||
-        pd.vlines < pd.nhead || pd.vcols <= 0)
-        return;
-
-    text = (char *)ckalloc((pd.vcols+1) * sizeof(char));
-
-    ObjectRect(p, &inner);
-    InsetRect(&inner, 1, 1);
-    area.top = inner.top+pd.descent/2;
-    area.bottom = area.top+(pd.nhead*pd.lineheight);
-    area.left = inner.left;
-    area.right = inner.right;
-    text_colors();
-    EraseRect(&area);
-
-    /* names panels leave room for the sequence number column */
-    x = (pd.type == NAMES) ? inner.left+DNUMBER*pd.charwidth
-                           : inner.left+pd.charwidth;
-    y = inner.top+pd.lineheight-pd.descent/2;
-
-    for (row = 0; row < pd.nhead; row++)
-    {
-        /* copy the visible slice of this header line */
-        col = pd.firstvcol;
-        for (out = 0; out < pd.vcols && col < pd.ncols; out++, col++)
-            text[out] = (col >= 0) ? pd.header[row][col] : ' ';
-        text[out] = '\0';
-
-        LoadPt(&pen, x, y);
-        SetPen(pen);
-        PaintString(text);
-        y += pd.lineheight;
-    }
-
-    black_on_white();
-    ckfree(text);
-}
-
-/*
- * Draw the footer lines at the bottom of a panel.
- *
- * The footer area (the last nfoot of the vlines visible lines) is erased
- * with text_colors() and each footer line is painted starting at the first
- * visible column. Columns with a negative index are rendered as blanks.
- * Nothing is drawn when the panel is empty, has no footer buffer, is too
- * short for the footer, or shows no columns.
- */
-void draw_footer(PaneL p)
-{
-    RecT area, inner;
-    PoinT pen;
-    int row, col, out, x, y;
-    panel_data pd;
-    char *text;
-
-    UseWindow(mainw);
-    Select(p);
-    SelectFont(datafont);
-    GetPanelExtra(p, &pd);
-
-    if (pd.nseqs == 0 || pd.footer == NULL ||
-        pd.vlines < pd.nfoot || pd.vcols <= 0)
-        return;
-
-    text = (char *)ckalloc((pd.vcols+1) * sizeof(char));
-
-    ObjectRect(p, &inner);
-    InsetRect(&inner, 1, 1);
-    area.top = inner.top+((pd.vlines-pd.nfoot)*pd.lineheight)+pd.descent+pd.ascent/2;
-    area.bottom = area.top+pd.nfoot*pd.lineheight;
-    area.left = inner.left;
-    area.right = inner.right;
-    text_colors();
-    EraseRect(&area);
-
-    /* names panels leave room for the sequence number column */
-    x = (pd.type == NAMES) ? area.left+DNUMBER*pd.charwidth
-                           : area.left+pd.charwidth;
-    y = area.top+pd.lineheight-1;
-
-    for (row = 0; row < pd.nfoot; row++)
-    {
-        /* copy the visible slice of this footer line */
-        col = pd.firstvcol;
-        for (out = 0; out < pd.vcols && col < pd.ncols; out++, col++)
-            text[out] = (col >= 0) ? pd.footer[row][col] : ' ';
-        text[out] = '\0';
-
-        LoadPt(&pen, x, y);
-        SetPen(pen);
-        PaintString(text);
-        y += pd.lineheight;
-    }
-
-    black_on_white();
-    ckfree(text);
-}
-
-
-/*
- * Mark sequences fseq..lseq of a names panel as selected (format==HIGHLIGHT)
- * or unselected (any other format) and repaint the part of that range that
- * is currently visible.
- *
- * The selection flags are updated for the whole range (the two ends may be
- * given in either order); drawing is clipped to the visible rows. Each row
- * shows the row number in gray, right-aligned to the number of digits in
- * nseqs, followed by the visible slice of the sequence name, which is looked
- * up through output_index.
- *
- * Fix: the row number is now formatted into its own fixed buffer. It used to
- * be written with sprintf into the name buffer of vcols+1 bytes, which
- * overflowed whenever the panel was narrower than the number (vcols can be
- * 0). The name buffer size is also kept at one byte or more for a negative
- * vcols.
- */
-void draw_nameline(PaneL p,int fseq,int lseq,int format)
-{
-    RecT block,r;
-    PoinT pt;
-    int n,i, j, t, f,l,x, y,ix;
-    panel_data data;
-    char *line;
-    char numbuf[32];    /* holds any int printed with a field width of <= 10 */
-
-    Select(p);
-    SelectFont(datafont);
-    GetPanelExtra(p, &data);
-    if(data.nseqs == 0) return;
-
-    /* n = number of decimal digits in nseqs = width of the number column */
-    n=1;
-    i=data.nseqs;
-    for(;;)
-    {
-        i/=10;
-        if(i==0) break;
-        n++;
-    }
-
-    line=(char *)ckalloc(((data.vcols>0 ? data.vcols : 0)+1) * sizeof(char));
-    if (fseq > lseq)
-    {
-        t=fseq;
-        fseq=lseq;
-        lseq=t;
-    }
-    if (format==HIGHLIGHT)
-        for(i=fseq;i<=lseq;i++) data.selected[i]=TRUE;
-    else
-        for(i=fseq;i<=lseq;i++) data.selected[i]=FALSE;
-    SetPanelExtra(p,&data);
-
-    /* clip the range to the visible rows; a range entirely below the view
-       ends up with fseq > lseq, so the drawing loops do not run */
-    if (fseq<data.firstvline)
-        fseq=data.firstvline;
-    if (fseq>=data.firstvline+data.vseqs)
-        fseq=data.firstvline+data.vseqs;
-    if (lseq<data.firstvline)
-        lseq=data.firstvline;
-    if (lseq>=data.firstvline+data.vseqs)
-        lseq=data.firstvline+data.vseqs-1;
-    f=fseq-data.firstvline;
-    l=lseq-data.firstvline;
-    ObjectRect (p, &r);
-    InsetRect(&r,1,1);
-    block.top=r.top+((f+data.nhead)*data.lineheight)+data.descent+1;
-    block.bottom=block.top+((l-f+1)*data.lineheight);
-    block.left=r.left;
-    block.right=r.right;
-    if (format==HIGHLIGHT)
-        white_on_black();
-    else
-        data_colors();
-    EraseRect(&block);
-
-    /* first pass: row numbers */
-    y=block.top+data.lineheight-data.descent-1;
-    for(i=fseq;i<=lseq;i++)
-    {
-        x=r.left+data.charwidth;
-        sprintf(numbuf,"%*d",n,i+1);
-        LoadPt(&pt, x, y);
-        SetPen(pt);
-        Gray();
-        PaintString(numbuf);
-        y+=data.lineheight;
-    }
-
-    /* second pass: visible slice of each name */
-    y=block.top+data.lineheight-data.descent-1;
-    for(i=fseq;i<=lseq;i++)
-    {
-        ix=output_index[i+1]-1;
-        x=r.left+DNUMBER*data.charwidth;
-        for(j=0;j<data.vcols && j<data.ncols-data.firstvcol;j++)
-            line[j]=data.lines[ix][j+data.firstvcol];
-        line[j]='\0';
-        LoadPt(&pt, x, y);
-        SetPen(pt);
-        if(format==HIGHLIGHT) White();
-        else Black();
-        PaintString(line);
-        y+=data.lineheight;
-    }
-    black_on_white();
-    ckfree(line);
-}
-
-/*
- * Paint columns fcol..lcol of sequence seq, character by character, starting
- * at pen position pt and advancing one character width per column.
- *
- * Columns with a negative index are skipped (the pen still advances) and
- * painting stops at data.ncols. The row actually drawn is looked up through
- * output_index. Per character, in order of priority:
- *   - segment exception  : dark gray box, white character
- *   - residue exception  : gray (150,150,150) box, white character
- *   - inverted mode      : black character on a box in the residue's colour,
- *                          or plain black when highlighted / selected
- *   - otherwise          : character in the residue's colour
- *
- * Fix: the per-colour scratch strings that were allocated, blank-filled and
- * freed on every call were never read, and would overrun their
- * MAXCOLORS+1 slots if ncolors were larger; they have been removed.
- */
-void draw_seqline(panel_data data,int seq,PoinT pt,int fcol,int lcol,int format)
-{
-    RecT r;
-    int j, ix;
-
-    if(data.nseqs == 0) return;
-
-    ix=output_index[seq+1]-1;
-
-    /* vertical extent of the character cell on this text line */
-    r.top=pt.y-data.lineheight+data.descent+1;
-    r.bottom=r.top+data.lineheight;
-    for(j=fcol;j<=lcol && j<data.ncols;j++)
-    {
-        if(j>=0)
-        {
-            if(segment_exceptions && data.segment_exception[ix][j] > 0)
-            {
-                r.left=pt.x;
-                r.right=r.left+data.charwidth;
-                DkGray();
-                PaintRect(&r);
-                White();
-            }
-            else if(residue_exceptions && data.residue_exception[ix][j] == TRUE)
-            {
-                r.left=pt.x;
-                r.right=r.left+data.charwidth;
-                /* LtGray(); */
-                SelectColor(150,150,150);
-                PaintRect(&r);
-                White();
-            }
-            else
-            {
-                if(inverted)
-                {
-                    if(format==HIGHLIGHT || (j>=data.firstsel && j<=data.lastsel))
-                        Black();
-                    else
-                    {
-                        r.left=pt.x;
-#ifdef UNIX
-                        r.right=r.left+data.charwidth-1;
-#else
-                        r.right=r.left+data.charwidth;
-#endif
-                        SetColor(color_lut[(int)data.colormask[ix][j]].val);
-                        PaintRect(&r);
-                        Black();
-                    }
-                }
-                else
-                    SetColor(color_lut[(int)data.colormask[ix][j]].val);
-
-            }
-            SetPen(pt);
-            PaintChar(data.lines[ix][j]);
-        }
-        pt.x+=data.charwidth;
-    }
-    Black();
-}
-
-/*
- * Repaint one column of a sequence panel for all visible rows.
- *
- * col is clipped to the visible column range; the column's cell strip is
- * erased with text_colors() (HIGHLIGHT) or data_colors() and each visible
- * residue is painted with the same priority rules as draw_seqline():
- * segment exception, residue exception, inverted mode, plain colour.
- *
- * Fixes:
- *  - the exception boxes now extend one line height from their own top
- *    (r2.top); they used the panel's top (r.top), giving a wrong or empty box
- *    on every row but the first;
- *  - a column outside 0..ncols-1 (possible while profiles are scroll-locked
- *    with an offset) is erased but no longer indexed, matching the range
- *    check in draw_seqline();
- *  - the SetPanelExtra() call that wrote back unmodified data was dropped.
- */
-void draw_seqcol(PaneL p,int col,int format)
-{
-    RecT block,r, r2;
-    PoinT pt;
-    int totseqs,i, c,x,y,ix;
-    panel_data data;
-
-    Select(p);
-    SelectFont(datafont);
-    GetPanelExtra(p, &data);
-    if(data.nseqs == 0) return;
-    if(data.ncols == 0) return;
-
-    if (col<data.firstvcol)
-        col=data.firstvcol;
-    if (col>=data.firstvcol+data.vcols)
-        col=data.firstvcol+data.vcols-1;
-    c=col-data.firstvcol;
-    totseqs=data.vseqs;
-    if (totseqs>data.nseqs) totseqs=data.nseqs;
-    ObjectRect (p, &r);
-    InsetRect(&r,1,1);
-    block.top=r.top+(data.nhead*data.lineheight)+data.descent+1;
-    block.bottom=block.top+(totseqs)*data.lineheight;
-    block.left=r.left+(c+1)*data.charwidth;
-    block.right=block.left+data.charwidth;
-    if (format==HIGHLIGHT)
-        text_colors();
-    else
-        data_colors();
-    EraseRect(&block);
-
-    /* nothing to paint for a column that holds no data */
-    if (col<0 || col>=data.ncols)
-    {
-        Black();
-        return;
-    }
-
-    x=r.left+(c+1)*data.charwidth;
-    y=block.top+data.lineheight-data.descent-1;
-    r2.left=x;
-    r2.right=r2.left+data.charwidth;
-    for(i=data.firstvline;i<data.firstvline+data.vseqs && i<data.nseqs;i++)
-    {
-        ix=output_index[i+1]-1;
-        /* character cell of this row */
-        r2.top=y-data.lineheight+data.descent+1;
-        r2.bottom=r2.top+data.lineheight;
-        if(segment_exceptions && data.segment_exception[ix][col] > 0)
-        {
-            DkGray();
-            PaintRect(&r2);
-            White();
-        }
-        else if(residue_exceptions && data.residue_exception[ix][col] == TRUE)
-        {
-            /* LtGray(); */
-            SelectColor(150,150,150);
-            PaintRect(&r2);
-            White();
-        }
-        else
-        {
-            if(inverted)
-            {
-                if(format==HIGHLIGHT)
-                {
-                    LtGray();
-                }
-                else
-                    SetColor(color_lut[(int)data.colormask[ix][col]].val);
-                PaintRect(&r2);
-                Black();
-            }
-            else
-                SetColor(color_lut[(int)data.colormask[ix][col]].val);
-
-        }
-        LoadPt(&pt,x,y);
-        SetPen(pt);
-        PaintChar(data.lines[ix][col]);
-        y+=data.lineheight;
-    }
-    Black();
-}
-
-/*
- * Repaint columns fcol..lcol of every visible row of a sequence panel, either
- * highlighted (format==HIGHLIGHT, background from text_colors()) or normal
- * (background from data_colors()).
- *
- * The two column arguments may be given in either order. The panel's stored
- * selection (firstsel/lastsel) is not modified here.
- */
-void highlight_seqrange(PaneL p,int fcol,int lcol, int format)
-{
- RecT block,r;
- int i,t,x,y;
- int fseq,lseq,s;
- panel_data data;
- PoinT pt;
-
- Select(p);
- SelectFont(datafont);
- GetPanelExtra(p, &data);
- if(data.nseqs == 0) return;
- if(data.ncols == 0) return;
-
- if (fcol > lcol)
- {
- t=fcol;
- fcol=lcol;
- lcol=t;
- }
-
-/* clip the range to the visible columns when at least one end is visible */
-/* NOTE(review): when neither end is visible (range entirely off-screen, or */
-/* spanning the whole view) no clipping happens and the code below still */
-/* draws with the unclipped columns - confirm this is intended */
- if ((fcol>=data.firstvcol && fcol<data.firstvcol+data.vcols)||
- (lcol>=data.firstvcol && lcol<data.firstvcol+data.vcols))
- {
- if (fcol<data.firstvcol) fcol=data.firstvcol;
- if (fcol>=data.firstvcol+data.vcols) fcol=data.firstvcol+data.vcols-1;
- if (lcol<data.firstvcol) lcol=data.firstvcol;
- if (lcol>=data.firstvcol+data.vcols) lcol=data.firstvcol+data.vcols-1;
- }
-
-/* rows: every visible sequence line; s is always 0 here */
- fseq=data.firstvline;
- lseq=data.firstvline+data.vseqs-1;
- if(lseq>=data.nseqs) lseq=data.nseqs-1;
- s=fseq-data.firstvline;
- ObjectRect (p, &r);
- InsetRect(&r,1,1);
- if(format==HIGHLIGHT)
- text_colors();
- else
- data_colors();
- block.top=r.top+((s+data.nhead)*data.lineheight)+data.descent+1;
- block.bottom=block.top+(lseq-fseq+1)*data.lineheight;
- block.left=r.left+(fcol-data.firstvcol+1)*data.charwidth;
- block.right=r.left+(lcol-data.firstvcol+2)*data.charwidth;
- EraseRect(&block);
-
- x=r.left+(fcol-data.firstvcol+1)*data.charwidth;
-
-/* repaint the residues of the affected columns row by row */
- for(i=fseq;i<=lseq;i++)
- {
- y=block.top+(i-fseq+1)*data.lineheight-data.descent-1;
- LoadPt(&pt,x,y);
- draw_seqline(data,i,pt,fcol,lcol,format);
- }
- black_on_white();
-}
-
-/*
- * Build one scrolling display area inside w: a vertical scroll bar, a names
- * panel, a sequence panel, and one horizontal scroll bar under each panel.
- *
- * prf_no          - 0 selects the multiple-alignment scroll callbacks, 1 the
- *                   profile 1 callbacks, any other value the profile 2 ones
- * nwidth, swidth  - pixel widths passed on for the names and sequence panels
- *                   (the names panel gets 5*max_names extra)
- * height          - pixel height of both panels
- * firstseq, nseqs - sequence range handed to make_panel()
- * p               - receives the two created panels
- *
- * Returns the group holding the widgets. The group is hidden while it is
- * being assembled and shown at the end.
- */
-GrouP make_scroll_area(GrouP w,int prf_no,int nwidth,int swidth,int height,int firstseq,int nseqs,spanel *p)
-{
- panel_data ndata,sdata;
- GrouP display;
- RecT rect;
- PoinT pt;
- PaneL names,seqs;
- BaR vscrollbar,hnscrollbar,hsscrollbar;
- BarScrlProc hscrollnameproc, hscrollseqproc, vscrollproc;
-
-/* pick the scroll callbacks that match this display */
- if(prf_no==0)
- {
- hscrollnameproc=HscrollMultiN;
- hscrollseqproc=HscrollMultiS;
- vscrollproc=VscrollMulti;
- }
- else if (prf_no==1)
- {
- hscrollnameproc=HscrollPrf1N;
- hscrollseqproc=HscrollPrf1S;
- vscrollproc=VscrollPrf1;
- }
- else
- {
- hscrollnameproc=HscrollPrf2N;
- hscrollseqproc=HscrollPrf2S;
- vscrollproc=VscrollPrf2;
- }
-
- display=HiddenGroup(w, 0, 0, NULL);
- SetGroupSpacing(display, 0, 0);
- Hide(display);
-
- vscrollbar=ScrollBar(display, -1, 1, vscrollproc);
-
-/* the names panel goes immediately to the right of the vertical bar */
- ObjectRect(vscrollbar, &rect);/* vscrollbar for names */
- pt.x=rect.right; /*how near they should be with name panel Ramu */
- pt.y=rect.top;
- SetNextPosition(display, pt);
- names=make_panel(NAMES,display, nwidth+(5*max_names), height, firstseq,nseqs); /* 5*max_names Ramu */
-
-/* the sequence panel goes immediately to the right of the names panel */
- ObjectRect(names, &rect);
- pt.x=rect.right;
- pt.y=rect.top;
- SetNextPosition(display, pt);
- seqs=make_panel(SEQS,display, swidth, height, firstseq,nseqs);
-
-/* horizontal scroll bars */
- ObjectRect(names, &rect);
- pt.x=rect.left;
- pt.y=rect.bottom;
- SetNextPosition(display, pt);
- hnscrollbar=ScrollBar(display, 1, -1, hscrollnameproc);
- ObjectRect(seqs, &rect);
- pt.x=rect.left;
- pt.y=rect.bottom;
- SetNextPosition(display, pt);
- hsscrollbar=ScrollBar(display, 1, -1, hscrollseqproc);
-
- SetRange(hsscrollbar,1,1,0);
- SetRange(hnscrollbar,1,1,0);
- SetRange(vscrollbar,1,1,0);
-
-/* cross-link the two panels through their index fields; only the sequence */
-/* panel's data records the vertical bar */
- GetPanelExtra(names,&ndata);
- ndata.hscrollbar=hnscrollbar;
- ndata.index=seqs;
- ndata.prf_no=prf_no;
-
- GetPanelExtra(seqs,&sdata);
- sdata.vscrollbar=vscrollbar;
- sdata.hscrollbar=hsscrollbar;
- sdata.index=names;
- sdata.prf_no=prf_no;
-
- SetPanelClick(names,NameClick, NameDrag, NULL, NameRelease);
- SetPanelClick(seqs,SeqClick, SeqDrag, NULL, SeqRelease);
-
- p->names = names;
- p->seqs = seqs;
-
-/* allocate the text buffers and store the completed records in the panels */
- ndata=alloc_name_data(ndata);
- sdata=alloc_seq_data(sdata);
- SetPanelExtra(names,&ndata);
- SetPanelExtra(seqs,&sdata);
-
- Show(display);
- return(display);
-}
-
-
-/* Calls Black(), InvertColors(), White() in that order.  Assuming
-   InvertColors() swaps foreground and background (confirm against the
-   Vibrant documentation), this selects white drawing on a black background. */
-void white_on_black(void)
-{
- Black(); InvertColors(); White();
-}
-/* Calls White(), InvertColors(), Black() in that order.  Assuming
-   InvertColors() swaps foreground and background (confirm against the
-   Vibrant documentation), this selects black drawing on a white background. */
-void black_on_white(void)
-{
- White(); InvertColors(); Black();
-}
-/* Colours used by highlight_seqrange() when format==HIGHLIGHT: selects the
-   light grey (220,220,220), calls InvertColors(), then selects black.
-   Presumably this yields black text on a grey background - confirm the
-   InvertColors() semantics against the Vibrant documentation. */
-void text_colors(void)
-{
- SelectColor(220,220,220);
- InvertColors();
- Black();
-}
-/* Colours used by highlight_seqrange() for any format other than HIGHLIGHT.
-   Performs exactly the same three calls as black_on_white(). */
-void data_colors(void)
-{
- White();
- InvertColors();
- Black();
-}
-
-
-
-
-/*
- * make_ruler - build the column ruler line for an alignment display.
- *
- *   length  number of ruler columns to produce (negative is treated as 0)
- *   name    receives the string "ruler"; must hold at least 6 bytes
- *   seq     receives the ruler text plus a terminating NUL; must hold at
- *           least length+1 bytes
- *
- * Column 1 is marked '1', every 10th column carries its own number written
- * so that its last digit sits on that column, and all other columns are '.'.
- * Example for length 20:  "1.......10........20"
- */
-void make_ruler(int length, char *name,char *seq)
-{
-    /* large enough for the decimal form of any int; the former 5-byte
-       buffer overflowed as soon as a label reached 10000 */
-    char marker[32];
-    int marker_len;
-    int col, k, start;
-
-    if (length < 0)
-        length = 0;
-
-    strcpy(name,"ruler");
-    seq[0] = '1';
-    for (col = 1; col < length; col++)
-    {
-        if ((col + 1) % 10 > 0)
-        {
-            seq[col] = '.';
-            continue;
-        }
-
-        /* col+1 is a multiple of 10: right-align its number on this column,
-           overwriting the dots written for the preceding columns */
-        sprintf(marker, "%d", col + 1);
-        marker_len = (int)strlen(marker);
-        start = col + 1 - marker_len;   /* >= 0: a number never has more digits than its value */
-        for (k = 0; k < marker_len; k++)
-            seq[start + k] = marker[k];
-    }
-    seq[length] = '\0';
-}
-
-/*
- * free_panel_data - release every array owned by a panel_data record.
- *
- * Takes the record by value and returns the updated copy, in which every
- * pointer that was freed has been set to NULL.  header and footer are freed
- * as mheader and mfooter rows respectively; lines, colormask,
- * residue_exception and segment_exception as data.nseqs rows.
- */
-panel_data free_panel_data(panel_data data)
-{
-    int row;
-
-/* free each non-NULL row of a 2-D array, then the array itself */
-#define RELEASE_ROWS(rows, count) \
-    do { \
-        if ((rows) != NULL) { \
-            for (row = 0; row < (count); row++) { \
-                if ((rows)[row] != NULL) ckfree((rows)[row]); \
-                (rows)[row] = NULL; \
-            } \
-            ckfree(rows); \
-            (rows) = NULL; \
-        } \
-    } while (0)
-
-/* free a plain 1-D array */
-#define RELEASE_BUF(buf) \
-    do { \
-        if ((buf) != NULL) ckfree(buf); \
-        (buf) = NULL; \
-    } while (0)
-
-    RELEASE_ROWS(data.header, mheader);
-    RELEASE_ROWS(data.footer, mfooter);
-    RELEASE_BUF(data.consensus);
-    RELEASE_ROWS(data.lines, data.nseqs);
-    RELEASE_ROWS(data.colormask, data.nseqs);
-
-    RELEASE_BUF(data.selected);
-    RELEASE_BUF(data.seqweight);
-    RELEASE_BUF(data.subgroup);
-    RELEASE_BUF(data.colscore);
-
-    RELEASE_ROWS(data.residue_exception, data.nseqs);
-    RELEASE_ROWS(data.segment_exception, data.nseqs);
-
-#undef RELEASE_BUF
-#undef RELEASE_ROWS
-
-    return(data);
-}
-
-
-/*
- * make_consensus - compute the conservation line for the alignment in data.
- *
- *   data   panel whose lines[0..nseqs-1][0..ncols-1] hold the aligned residues
- *   name   receives the empty string
- *   seq1   receives one character per column (data.ncols characters).
- *          NOTE: no terminating NUL is written here; the caller has to
- *          supply one if seq1 is used as a C string.
- *
- * For every column:
- *   '*'  every sequence has the same letter as the first sequence
- *   ':'  (protein only) every residue belongs to one group of res_cat1
- *   '.'  (protein only) every residue belongs to one group of res_cat2
- *   ' '  otherwise
- * A column whose first-sequence character is not a letter is always ' '.
- * With data.nseqs == 0 every column is '*', because zero matches equals the
- * sequence count.
- */
-void make_consensus(panel_data data,char *name,char *seq1)
-{
-    sint catident1[NUMRES],catident2[NUMRES],ident;
-    sint i,j,k;
-    char ref,res;
-
-    strcpy(name,"");
-    for (i=0; i<data.ncols; i++) {
-        seq1[i]=' ';
-        ident=0;
-        for (k=0; res_cat1[k]!=NULL; k++) catident1[k] = 0;
-        for (k=0; res_cat2[k]!=NULL; k++) catident2[k] = 0;
-
-        /* The reference residue is the same for every sequence, so test it
-           once per column.  The cast keeps isalpha() defined for characters
-           with the high bit set. */
-        if (data.nseqs > 0 && isalpha((unsigned char)data.lines[0][i])) {
-            ref = data.lines[0][i];
-            for (j=0; j<data.nseqs; ++j) {
-                res = data.lines[j][i];
-                if (res == ref)
-                    ++ident;
-                if (res == '\0')
-                    continue;   /* strchr() would match the terminator */
-                for (k=0; res_cat1[k]!=NULL; k++)
-                    if (strchr(res_cat1[k], res) != NULL)
-                        catident1[k]++;
-                for (k=0; res_cat2[k]!=NULL; k++)
-                    if (strchr(res_cat2[k], res) != NULL)
-                        catident2[k]++;
-            }
-        }
-
-        if (ident == data.nseqs) {
-            seq1[i]='*';
-            continue;
-        }
-        if (dnaflag)
-            continue;
-
-        /* strong groups win over weak groups */
-        for (k=0; res_cat1[k]!=NULL; k++) {
-            if (catident1[k] == data.nseqs) {
-                seq1[i]=':';
-                break;
-            }
-        }
-        if (seq1[i] != ' ')
-            continue;
-        for (k=0; res_cat2[k]!=NULL; k++) {
-            if (catident2[k] == data.nseqs) {
-                seq1[i]='.';
-                break;
-            }
-        }
-    }
-}
-
-/*
- * make_struct_data - build the "Structures" display line for a profile.
- *
- *   prf_no  1 or 2: which profile's secondary structure mask to show
- *   len     number of characters to write into seq
- *   name    receives "Structures", or "" when nothing is produced
- *   seq     receives len characters plus EOS, or "" when nothing is produced
- *
- * Returns 1 when a line was produced and 0 otherwise.  A line is produced
- * only when the chosen profile has struct_penalties == SECST and its use_ss
- * flag is TRUE.  The mask is copied, passed through print_sec_struct_mask(),
- * and every character equal to gap_pos1 is shown as '-'.
- */
-int make_struct_data(int prf_no,int len, char *name,char *seq)
-{
-    char *shown;
-    char val;
-    sint first_seq;     /* index into seqlen_array of the profile's first sequence */
-    int pos;
-
-    seq[0]='\0';
-    name[0]='\0';
-
-    if (prf_no == 1)
-    {
-        if (!(struct_penalties1 == SECST && use_ss1 == TRUE))
-            return(0);
-        first_seq = 1;
-    }
-    else if (prf_no == 2)
-    {
-        if (!(struct_penalties2 == SECST && use_ss2 == TRUE))
-            return(0);
-        first_seq = profile1_nseqs+1;
-    }
-    else
-        return(0);
-
-    strcpy(name,"Structures");
-
-    /* work on a private copy of the mask */
-    shown = (char *)ckalloc((seqlen_array[first_seq]+10) * sizeof(char));
-    for (pos=0; pos<seqlen_array[first_seq]; pos++)
-        shown[pos] = (prf_no == 1) ? sec_struct_mask1[pos] : sec_struct_mask2[pos];
-
-    if (prf_no == 1)
-        print_sec_struct_mask(seqlen_array[first_seq],sec_struct_mask1,shown);
-    else
-        print_sec_struct_mask(seqlen_array[first_seq],sec_struct_mask2,shown);
-
-    for (pos=0; pos<len; pos++)
-    {
-        val = shown[pos];
-        seq[pos] = (val == gap_pos1) ? '-' : val;
-    }
-    seq[pos]=EOS;
-
-    ckfree(shown);
-    return(1);
-}
-
-/*
- * make_gp_data - build the "Gap Penalties" display line for a profile.
- *
- *   prf_no  1 or 2: which profile's gap penalty mask to show
- *   len     number of characters to write into seq
- *   name    receives "Gap Penalties", or "" when nothing is produced
- *   seq     receives len characters plus EOS, or "" when nothing is produced
- *
- * Returns 1 when a line was produced and 0 otherwise.  A line is produced
- * only when the chosen profile has struct_penalties == GMASK and its use_ss
- * flag is TRUE.  Every mask character equal to gap_pos1 is shown as '-'.
- */
-int make_gp_data(int prf_no,int len, char *name,char *seq)
-{
-    int wanted;
-    int pos;
-    char val;
-
-    seq[0]='\0';
-    name[0]='\0';
-
-    wanted = (prf_no == 1 && struct_penalties1 == GMASK && use_ss1 == TRUE) ||
-             (prf_no == 2 && struct_penalties2 == GMASK && use_ss2 == TRUE);
-    if (!wanted)
-        return(0);
-
-    strcpy(name,"Gap Penalties");
-    for (pos=0; pos<len; pos++)
-    {
-        val = (prf_no == 1) ? gap_penalty_mask1[pos] : gap_penalty_mask2[pos];
-        seq[pos] = (val == gap_pos1) ? '-' : val;
-    }
-    seq[pos]=EOS;
-
-    return(1);
-}
-
-/* Vertical scroll callback installed by make_scroll_area() for prf_no 0:
-   makes seq_panel the active panel, then calls vscrollnames() and
-   vscrollseqs() with the new and old bar values.  p is not used. */
-static void VscrollMulti(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval)
-{
- active_panel=seq_panel;
- vscrollnames(bar, newval, oldval);
- vscrollseqs(bar, newval, oldval);
-}
-
-/* Horizontal scroll callback for the names pane of the multiple-alignment
-   display (prf_no 0): makes seq_panel active and calls hscrollnames().
-   p is not used. */
-static void HscrollMultiN(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval)
-{
- active_panel=seq_panel;
- hscrollnames(bar, newval, oldval);
-}
-
-/* Horizontal scroll callback for the sequence pane of the multiple-alignment
-   display (prf_no 0): makes seq_panel active and calls hscrollseqs().
-   p is not used. */
-static void HscrollMultiS(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval)
-{
- active_panel=seq_panel;
- hscrollseqs(bar, newval, oldval);
-}
-
-/* Vertical scroll callback for the profile 1 display (prf_no 1): makes
-   prf_panel[0] active, then calls vscrollnames() and vscrollseqs().
-   p is not used. */
-static void VscrollPrf1(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval)
-{
- active_panel=prf_panel[0];
- vscrollnames(bar, newval, oldval);
- vscrollseqs(bar, newval, oldval);
-}
-
-/* Horizontal scroll callback for the names pane of the profile 1 display:
-   makes prf_panel[0] active and calls hscrollnames().  p is not used. */
-static void HscrollPrf1N(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval)
-{
- active_panel=prf_panel[0];
- hscrollnames(bar, newval, oldval);
-}
-
-/* Horizontal scroll callback for the sequence pane of the profile 1 display:
-   scrolls prf_panel[0] with hscrollseqs().  When fixed_prf_scroll is TRUE the
-   same call is repeated with prf_panel[1] active, so both profiles move
-   together.  active_panel is left pointing at prf_panel[1] in that case.
-   p is not used. */
-static void HscrollPrf1S(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval)
-{
- active_panel=prf_panel[0];
- hscrollseqs(bar, newval, oldval);
- if(fixed_prf_scroll==TRUE)
- {
- active_panel=prf_panel[1];
- hscrollseqs(bar, newval, oldval);
- }
-}
-
-/* Vertical scroll callback for the profile 2 display (prf_no other than 0
-   or 1): makes prf_panel[1] active, then calls vscrollnames() and
-   vscrollseqs().  p is not used. */
-static void VscrollPrf2(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval)
-{
- active_panel=prf_panel[1];
- vscrollnames(bar, newval, oldval);
- vscrollseqs(bar, newval, oldval);
-}
-
-/* Horizontal scroll callback for the names pane of the profile 2 display:
-   makes prf_panel[1] active and calls hscrollnames().  p is not used. */
-static void HscrollPrf2N(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval)
-{
- active_panel=prf_panel[1];
- hscrollnames(bar, newval, oldval);
-}
-
-/* Horizontal scroll callback for the sequence pane of the profile 2 display.
-   When fixed_prf_scroll is TRUE, prf_panel[0] is scrolled first so both
-   profiles move together; prf_panel[1] is then always scrolled last, which
-   leaves it as the active panel.  p is not used. */
-static void HscrollPrf2S(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval)
-{
- if(fixed_prf_scroll==TRUE)
- {
- active_panel=prf_panel[0];
- hscrollseqs(bar, newval, oldval);
- }
- active_panel=prf_panel[1];
- hscrollseqs(bar, newval, oldval);
-}
-
-
Deleted: trunk/packages/clustalw/trunk/xmenu.c
===================================================================
--- trunk/packages/clustalw/trunk/xmenu.c 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/xmenu.c 2007-08-12 15:08:40 UTC (rev 399)
@@ -1,4636 +0,0 @@
-/***********************************************************************************************
- *
- *
- *
- * History
- *
- * 27.3.2002 - color parameter chooser can browse filenames like load sequences - Jose
- * 16.1.2002 - remove the 'cut sequences' dialog box, not needed - Jise
- * 17.1.2002 - 'Remove positions that contain gaps in all sequences ?' removed, no need for confirmation - Toby
- *
- *
- *
-*/
-
-#include <stdarg.h>
-#include <string.h>
-
-#include <vibrant.h>
-#include <document.h>
-
-/* #include <ncbi.h> ramu for time funs */
-
-#include "clustalw.h"
-#include "xmenu.h"
-
-
-static void RemoveWin(WindoW w);
-static void QuitWinW(WindoW w);
-static void QuitWinI(IteM i);
-static void QuitHelpW(WindoW w);
-static void QuitHelpB(ButtoN b);
-static void SearchStrWin (IteM item);
-static void SavePSSeqWin (IteM item);
-static void SavePSPrf1Win (IteM item);
-static void SavePSPrf2Win (IteM item);
-static void SaveSeqFileWin (IteM item);
-static void SavePrf1FileWin (IteM item);
-static void SavePrf2FileWin (IteM item);
-static void OpenColorParWin (IteM item);
-static void SearchStr(ButtoN but);
-static void SavePSSeqFile(ButtoN but);
-static void SavePSPrf1File(ButtoN but);
-static void SavePSPrf2File(ButtoN but);
-static void SaveSeqFile(ButtoN but);
-static void SavePrf1File(ButtoN but);
-static void SavePrf2File(ButtoN but);
-static void SaveScoresWin (IteM item);
-static void SaveScores(ButtoN but);
-static void OpenColorPar(ButtoN but);
-static void CancelWin(ButtoN but);
-static void SaveTreeWin (IteM item);
-static void CAlignWin (IteM item);
-static void RealignSeqsWin (IteM item);
-static void RealignSeqRangeWin (IteM item);
-static void DrawTreeWin (IteM item);
-static void AlignFromTreeWin(IteM item);
-static void PrfPrfAlignWin(IteM item);
-static void PrfPrfTreeAlignWin(IteM item);
-static void SeqPrfAlignWin(IteM item);
-static void SeqPrfTreeAlignWin(IteM item);
-static void BootstrapTreeWin (IteM item);
-static void CreateAlignTree(ButtoN but);
-static void CompleteAlign(ButtoN but);
-static void RealignSeqs(ButtoN but);
-static void RealignSeqRange(ButtoN but);
-static void DrawTree(ButtoN but);
-static void AlignFromTree(ButtoN but);
-static void PrfPrfAlign(ButtoN but);
-static void PrfPrfTreeAlign(ButtoN but);
-static void SeqPrfAlign(ButtoN but);
-static void SeqPrfTreeAlign(ButtoN but);
-static void BootstrapTree(ButtoN but);
-static void OpenSeqFile (IteM item);
-static void AppendSeqFile (IteM item);
-static void OpenPrf1File (IteM item);
-static void OpenPrf2File (IteM item);
-static void ScoreWin(IteM item);
-static void SegmentWin(IteM item);
-static void ScoreSegments(ButtoN but);
-static void PWParameters(IteM item);
-static void MultiParameters(IteM item);
-static void GapParameters(IteM item);
-static void SSParameters(IteM item);
-static void OutputParameters(IteM item);
-static void OutputTreeParameters(IteM item);
-static void HelpProc(IteM item);
-static void DefColorPar(IteM item);
-static void BlackandWhite(IteM item);
-static void set_reset_new_gaps(IteM i);
-static void set_reset_all_gaps(IteM i);
-static void SearchStringAgain(ButtoN but);
-
-static PopuP make_toggle(GrouP g,CharPtr title,CharPtr true_text, CharPtr false_text,
- Boolean *value,PupActnProc SetProc);
-static PrompT make_scale(GrouP g,CharPtr title,int length,int value,int max,BarScrlProc SetProc);
-static PrompT make_prompt(GrouP g,CharPtr title);
-
-static void CutSequences(IteM item);
-static void PasteSequences(IteM item);
-static void RemoveGaps(IteM item);
-static void RemoveGapPos(IteM item);
-
-static void SelectSeqs(IteM item);
-static void SelectPrf1(IteM item);
-static void SelectPrf2(IteM item);
-static void MergeProfiles(IteM item);
-static void ClearSeqs(IteM item);
-
-static void cut_multiplem(void);
-static void cut_profile1(void);
-static void cut_profile2(void);
-static void ssave(int j);
-static void sscpy(int i,int j);
-static void sload(int i);
-static void clear_seqrange(spanel p);
-static void select_seqs(spanel p,Boolean flag);
-static void clear_seg_exceptions(spanel p);
-
-static void make_menu_headers(WindoW w);
-static void make_help_menu(void);
-static void make_score_menu(void);
-static void make_file_menu(void);
-static void make_edit_menu(void);
-static void make_align_menu(void);
-static void make_tree_menu(void);
-static void make_color_menu(void);
-
-static void save_aln_window(int prf_no,char *title,char *prompt,void save_proc(ButtoN but));
-static void save_ps_window(int prf_no,char *prompt,void save_proc(ButtoN but));
-static void read_file_window(char *title,char *prompt,char *filename,void read_proc(ButtoN but));
-static void do_align_window(WindoW *alignw,TexT *treetext,Boolean treestatus,char *title,void align_proc(ButtoN but));
-static void do_palign_window(WindoW *alignw,TexT *tree1text,TexT *tree2test,Boolean treestatus,char *title,void align_proc(ButtoN but));
-static Boolean open_aln_files(void);
-static void write_file(int fseq,int lseq,int fres,int lres);
-
-
-Boolean x_menus=FALSE;
-
-int mheader = 2; /* maximum header lines */
-int mfooter = 1; /* maximum footer lines */
-int max_mlines = 20; /* multiple align display length */
-int min_mlines = 10; /* multiple align display length */
-int max_plines = 8; /* profile align display length */
-int min_plines1 = 5; /* profile align display length */
-int min_plines2 = 3; /* profile align display length */
-
-Boolean aln_mode = MULTIPLEM;
-Boolean window_displayed = FALSE;
-
-int save_format = CLUSTAL;
-Boolean fixed_prf_scroll = FALSE;
-int loffset,boffset,toffset;
-int roffset;
-int poffset;
-
-int score_cutoff=5; /* cutoff for residue exceptions */
-int score_hwin=5; /* half window for summing alignment column scores */
-int score_scale=5;
-int segment_dnascale=5;
-int length_cutoff=1; /* length cutoff for segment exceptions */
-Boolean residue_exceptions=FALSE;
-Boolean segment_exceptions=FALSE;
-int score_matnum=4;
-char score_mtrxname[FILENAMELEN];
-int segment_matnum=3;
-char segment_mtrxname[FILENAMELEN];
-int score_dnamatnum=1;
-char score_dnamtrxname[FILENAMELEN];
-int segment_dnamatnum=1;
-char segment_dnamtrxname[FILENAMELEN];
-
-Boolean output_ss;
-Boolean output_gp;
-
-extern char revision_level[];
-extern Boolean interactive;
-
-extern char seqname[];
-extern char outfile_name[];
-extern char profile1_name[];
-extern char profile2_name[];
-extern char usermtrxname[], pw_usermtrxname[];
-extern char dnausermtrxname[], pw_dnausermtrxname[];
-
-extern Boolean usemenu;
-extern Boolean use_tree_file;
-extern Boolean use_tree1_file,use_tree2_file;
-extern Boolean dnaflag;
-extern sint nseqs;
-extern sint profile1_nseqs;
-extern sint profile_no;
-extern sint max_aa;
-extern sint *seqlen_array;
-extern char **seq_array;
-extern char **names, **titles;
-extern Boolean empty;
-extern Boolean profile1_empty, profile2_empty;
-extern sint gap_pos1, gap_pos2;
-extern Boolean use_ambiguities;
-
-
-extern float gap_open, gap_extend;
-extern float dna_gap_open, dna_gap_extend;
-extern float prot_gap_open, prot_gap_extend;
-extern float pw_go_penalty, pw_ge_penalty;
-extern float dna_pw_go_penalty, dna_pw_ge_penalty;
-extern float prot_pw_go_penalty, prot_pw_ge_penalty;
-extern sint wind_gap,ktup,window,signif;
-extern sint dna_wind_gap, dna_ktup, dna_window, dna_signif;
-extern sint prot_wind_gap,prot_ktup,prot_window,prot_signif;
-extern sint helix_penalty;
-extern sint strand_penalty;
-extern sint loop_penalty;
-extern sint helix_end_minus;
-extern sint helix_end_plus;
-extern sint strand_end_minus;
-extern sint strand_end_plus;
-extern sint helix_end_penalty;
-extern sint strand_end_penalty;
-extern sint divergence_cutoff;
-extern sint gap_dist;
-extern sint boot_ntrials; /* number of bootstrap trials */
-extern unsigned sint boot_ran_seed; /* random number generator seed */
-
-extern sint matnum,pw_matnum;
-extern char mtrxname[], pw_mtrxname[];
-extern sint dnamatnum,pw_dnamatnum;
-extern char dnamtrxname[], pw_dnamtrxname[];
-
-extern MatMenu matrix_menu;
-extern MatMenu pw_matrix_menu;
-extern MatMenu dnamatrix_menu;
-
-extern Boolean quick_pairalign;
-extern sint matnum,pw_matnum;
-extern Boolean neg_matrix;
-extern float transition_weight;
-extern char hyd_residues[];
-extern Boolean no_var_penalties, no_hyd_penalties, no_pref_penalties;
-extern Boolean use_endgaps;
-extern Boolean endgappenalties;
-extern Boolean output_clustal, output_nbrf, output_phylip, output_gcg, output_gde, output_nexus;
-extern Boolean output_fasta; /* Ramu */
-
-extern Boolean save_parameters;
-extern Boolean output_tree_clustal, output_tree_phylip, output_tree_distances, output_tree_nexus, output_pim;
-extern Boolean lowercase; /* Flag for GDE output - set on comm. line*/
-extern Boolean cl_seq_numbers;
-
-extern Boolean seqRange;
-
-extern sint output_order;
-extern sint *output_index;
-extern Boolean reset_alignments_new; /* DES */
-extern Boolean reset_alignments_all; /* DES */
-
-extern FILE *clustal_outfile, *gcg_outfile, *nbrf_outfile, *phylip_outfile;
-extern FILE *gde_outfile, *nexus_outfile;
-extern FILE *fasta_outfile;
-
-extern sint max_aln_length;
-
-extern Boolean tossgaps; /* Ignore places in align. where ANY seq. has a gap*/
-extern Boolean kimura; /* Use correction for multiple substitutions */
-extern sint bootstrap_format; /* bootstrap file format */
-
-extern sint output_struct_penalties;
-extern Boolean use_ss1, use_ss2;
-extern char *res_cat1[];
-extern char *res_cat2[];
-
-extern char *amino_acid_codes;
-
-PrompT message; /* used in temporary message window */
-
-static Char filename[FILENAMELEN]; /* used in temporary file selection window */
-
-Boolean mess_output=TRUE;
-Boolean save_log=FALSE;
-FILE *save_log_fd=NULL;
-static char save_log_filename[FILENAMELEN];
-static IteM save_item1,save_item2,exc_item;
-
-spanel seq_panel; /* data for multiple alignment area */
-spanel prf_panel[2]; /* data for profile alignment areas */
-spanel active_panel; /* 'in-use' panel -scrolling,clicking etc. */
-static range selected_seqs; /* sequences selected by clicking on names */
-static range selected_res; /* residues selected by clicking on seqs */
-int firstres, lastres; /* range of alignment for saving as ... */
-
-/* data for Search function */
-
-char find_string[MAXFINDSTR]="";
-aln_pos find_pos;
-
-/* arrays for storing clustalw data for cut-and-paste sequences */
-static sint *saveseqlen_array=NULL;
-static char **saveseq_array=NULL;
-static char **savenames=NULL, **savetitles=NULL;
-sint ncutseqs=0;
-
-FonT datafont,helpfont;
-WindoW mainw=NULL;
-WindoW messagew=NULL;
-WindoW readfilew=NULL;
-WindoW savealnw=NULL;
-WindoW savescoresw=NULL;
-WindoW savepsw=NULL;
-WindoW findw=NULL;
-WindoW calignw=NULL;
-WindoW ralignw=NULL;
-WindoW rralignw=NULL;
-WindoW talignw=NULL;
-WindoW palignw=NULL;
-WindoW salignw=NULL;
-WindoW scorew=NULL;
-WindoW exceptionw=NULL;
-TexT savealntext;
-TexT savescorestext;
-TexT savepstext;
-TexT findtext;
-TexT pspartext;
-TexT ctreetext;
-TexT rtreetext;
-TexT rrtreetext;
-TexT ttreetext;
-TexT ptree1text,ptree2text;
-TexT streetext;
-TexT readfiletext;
-WindoW savetreew=NULL;
-TexT savetreetext;
-WindoW drawtreew=NULL;
-TexT drawnjtreetext;
-TexT drawphtreetext;
-TexT drawdsttreetext;
-TexT drawnxstreetext;
-
-TexT drawpimtext;
-
-WindoW boottreew=NULL;
-TexT bootnjtreetext;
-TexT bootphtreetext;
-TexT bootnxstreetext;
-TexT blocklentext;
-PrompT mattext,pwmattext,dnamattext,pwdnamattext,scoremattext,segmentmattext;
-PrompT scorednamattext,segmentdnamattext;
-GrouP seg_matrix_list,score_matrix_list;
-GrouP seg_dnamatrix_list,score_dnamatrix_list;
-GrouP matrix_list,pw_matrix_list,dnamatrix_list,pw_dnamatrix_list;
-
-TexT cl_outtext,pir_outtext,msf_outtext,phylip_outtext,gde_outtext,nexus_outtext;
-TexT fasta_outtext; /* Ramu */
-
-GrouP slow_para,fast_para;
-GrouP seq_display,prf1_display,prf2_display;
-
-MenU filem,alignm,editm,treem,colorm;
-menu_item file_item,align_item,edit_item,tree_item,color_item;
-MenU scorem,helpmenu;
-menu_item score_item,help_item;
-IteM segment_item;
-IteM bw_item,defcol_item,usercol_item;
-IteM new_gaps_item,all_gaps_item;
-WindoW helpw[MAXHELPW];
-int numhelp=0;
-
-PopuP modetext,flisttext;
-ButtoN pscrolltext;
-
-ButtoN selFonts;
-
-PopuP show_seg_toggle;
-PrompT residue_cutofftext;
-PrompT length_cutofftext;
-PrompT scorescaletext;
-PrompT segmentdnascaletext;
-
-/* Font sizes offered in the "Font Size" popup built by x_menu(). */
-#define MAXFONTS 6
-int nfonts=6; /* should be MAXFONTS ................ ramu */
-int av_font[MAXFONTS]={8,10,12,14,18,24};
-/* index into av_font of the size currently in use (1 selects size 10) */
-int font_size=1;
-
-int ncolors=0;
-int ncolor_pars=0;
-color color_lut[MAXCOLORS+1];
-char def_protpar_file[]="colprot.par";
-char def_dnapar_file[]="coldna.par";
-char *explicit_par_file = NULL;
-char *par_file = NULL;
-int inverted = TRUE;
-int usebw=FALSE,usedefcolors=TRUE,useusercolors=FALSE;
-
-char ps_par_file[FILENAMELEN]="colprint.par";
-int pagesize=A4;
-int orientation=LANDSCAPE;
-Boolean ps_header=TRUE;
-Boolean ps_ruler=TRUE;
-Boolean resize=TRUE;
-int first_printres=0,last_printres=0,blocklen;
-Boolean ps_curve=TRUE;
-Boolean ps_resno=TRUE;
-PoinT display_pos;
-int namewidth,seqwidth; /* fixed widths of sequence display areas */
-
-Boolean realign_endgappenalties=TRUE;
-Boolean align_endgappenalties=FALSE;
-
-char helptext[MAXHELPLENGTH];
-
-
-
-
-/* ramu */
-
-#include <time.h>
-#include <math.h>
-#include <unistd.h>
-#include <pwd.h>
-#include <sys/times.h>
-
-float cputime(float *seconds); /* Ramu: the baseline cannot be reset once set */
-
-/*
- * cputime - CPU time (user + system) consumed by this process, measured
- * from the first call to this function.
- *
- *   seconds  if not NULL, receives the CPU seconds used since the previous
- *            call (0 on the first call)
- *
- * Returns the CPU seconds used since the first call; the first call itself
- * therefore returns 0.
- *
- * The baseline and the previous reading are kept in static variables, so the
- * function is not reentrant.
- */
-float cputime(float *seconds)
-{
-    static clock_t first = 0;   /* tick count at the first call */
-    static clock_t last = 0;    /* previous reading, relative to first */
-    static int started = 0;
-    struct tms buf;
-    long hertz = sysconf(_SC_CLK_TCK);
-    clock_t now;
-
-    /* get the current number of user and system cpu ticks */
-
-    times(&buf);
-    now = buf.tms_utime + buf.tms_stime;
-
-    /* the first call defines time zero; every reading, including this
-       one, is expressed relative to it so that deltas are consistent */
-
-    if ( !started ) {
-        first = now;
-        started = 1;
-    }
-    now -= first;
-
-    if (seconds)
-        *seconds = ((float)(now - last))/(float)hertz;
-    last = now;
-    return ((float)now)/(float)hertz;
-}
-
-/* Ramu */
-
-
-/* main subroutine called from clustalx.c, initialises windows and enters a
-   forever loop monitoring user input.
-
-   In order: creates the main window and the pull-down menus, adds the
-   alignment-mode popup, the font-size popup and the "Lock Scroll" checkbox,
-   builds the multiple-alignment display and the two profile displays at the
-   same position, adds the message line, shows the display(s) matching the
-   start-up mode, records the pixel offsets used for later resizing, sets the
-   default PostScript block length and finally calls ProcessEvents().
-
-   NOTE(review): tstr and font are 30-byte buffers filled with sprintf();
-   tstr receives revision_level, whose length is not checked here. */
-
-void x_menu(void)
-{
- int i,n;
- char font[30];
- char tstr[30];
- int height;
- PrompT fsize;
- RecT wr,r,r1;
-
-
-/* make the pulldown menu bar */
-
-#ifdef WIN_MAC
- MenU m;
-
- m=AppleMenu (NULL);
- DeskAccGroup (m);
- make_menu_headers(NULL);
-#endif
-#ifndef UNIX
- ProcessUpdatesFirst(FALSE);
-#endif
-
- sprintf(tstr,"Clustal%s",revision_level);
-/*#ifdef WIN_MSWIN
- mainw = FixedWindow (-50,-33,-10,-10,tstr,QuitWinW);
-#else*/
- mainw = DocumentWindow (-50,-33,-10,-10,tstr,QuitWinW,ResizeWindowProc);
- /* NOTE(review): the next two lines make the same call twice */
-/*#endif*/ SetGroupSpacing(mainw,0,10);
- SetGroupSpacing(mainw,0,10);
-
- x_menus=TRUE;
-
-#ifndef WIN_MAC
- make_menu_headers(mainw);
-#endif
-/* decide if we're starting in profile or sequence mode */
- if (!profile1_empty) aln_mode=PROFILEM;
- else aln_mode=MULTIPLEM;
-
- make_file_menu();
- make_edit_menu();
- make_align_menu();
- make_tree_menu();
- make_color_menu();
- make_score_menu();
- make_help_menu();
-
-/* add a button to switch between multiple and profile alignment modes */
-
- modetext=PopupList(mainw,TRUE,set_aln_mode);
- PopupItem(modetext,"Multiple Alignment Mode");
- PopupItem(modetext,"Profile Alignment Mode");
- if(aln_mode==MULTIPLEM)
- SetValue(modetext,1);
- else
- SetValue(modetext,2);
-
- /* data font: courier at the size selected by font_size; help font: courier 10 */
- sprintf(font, "%s,%d,%c", "courier", av_font[font_size], 'm');
- datafont=ParseFont(font);
-
- sprintf(font, "%s,%d,%c", "courier", 10, 'm');
- helpfont=ParseFont(font);
-
- Advance(mainw);
- shift(mainw,20,0);
-
-/* add a button to select font size */
- /* fsize is assigned but not read again in this function */
- fsize=StaticPrompt(mainw,"Font Size:",0,dialogTextHeight,systemFont,'r');
- Advance(mainw);
- flisttext=PopupList(mainw,TRUE,set_font_size);
- for(i=0;i<nfonts;i++)
- {
- sprintf(tstr,"%d",av_font[i]);
- PopupItem(flisttext,tstr);
- }
- /* popup values are 1-based, font_size is a 0-based index */
- SetValue(flisttext,font_size+1);
-
- Advance(mainw);
- shift(mainw,20,0);
-
- /* ramu .........
- selFonts = PushButton(mainw,"Select Fonts",VSeqMgrFontProc);
- Advance(mainw);
- shift(mainw,20,0);
-
- end ramu ........... */
-
-/* add a button to switch profile scrolling modes */
- pscrolltext=CheckBox(mainw,"Lock Scroll",set_pscroll_mode);
- if(fixed_prf_scroll) SetStatus(pscrolltext,TRUE);
- Break(mainw);
-
-
- /* -1 marks "nothing selected" */
- selected_seqs.first=selected_seqs.last=-1;
- selected_res.first=selected_res.last=-1;
-
-
-/* initialise the multiple alignment display area */
-
- SelectFont(datafont);
- stdCharWidth=CharWidth('A');
- stdLineHeight=LineHeight();
-
- /* remembered so the profile displays can be created at the same spot */
- GetNextPosition(mainw,&display_pos);
-
-/* calculate initial pixel width and height of displays */
- namewidth=(DNAMES+DNUMBER+1)*stdCharWidth;
- seqwidth=(DCOLS+2*MARGIN)*stdCharWidth+2;
- n=screenRect.right-screenRect.left;
- if(seqwidth+namewidth>n) seqwidth=n-namewidth;
-
- height=(max_mlines+mfooter+MARGIN)*stdLineHeight+2+SCOREHEIGHT;
- n=screenRect.bottom-screenRect.top;
- if(height>n) height=n;
-
- seq_display=make_scroll_area(mainw,0,namewidth+20,seqwidth,height,1,nseqs,&seq_panel);
- position_scrollbars(seq_panel);
-
-/* initialise the profile alignment display area */
-
- SetNextPosition(mainw,display_pos);
- height=(max_plines+MARGIN)*stdLineHeight+2+SCOREHEIGHT;
- if(height>n) height=n;
- prf1_display=make_scroll_area(mainw,1,namewidth,seqwidth,height,1,profile1_nseqs,&prf_panel[0]);
- position_scrollbars(prf_panel[0]);
-
- prf2_display=make_scroll_area(mainw,2,namewidth,seqwidth,height,profile1_nseqs+1,nseqs-profile1_nseqs,&prf_panel[1]);
- position_scrollbars(prf_panel[1]);
-
-/* add the message line */
- Break(mainw);
- Advance(mainw);
- SelectFont(systemFont);
- stdCharWidth=CharWidth('A');
- stdLineHeight=LineHeight();
- message = StaticPrompt(mainw, "",500, 0,systemFont,'l');
-
-/* save some pixel sizes for future resizing events */
- /* boffset/loffset/toffset/roffset are the gaps between the window edges
-    and the displayed panes, measured once the window has been shown */
- if(aln_mode==PROFILEM)
- {
- Hide(seq_display);
- profile_no=1;
- Show(prf1_display);
- Show(prf2_display);
- Show(pscrolltext);
- active_panel=prf_panel[0];
- Select(prf1_display);
- load_aln(prf_panel[0],0,profile1_nseqs-1,TRUE);
- load_aln(prf_panel[1],profile1_nseqs,nseqs-1,TRUE);
-
- Show(mainw);
- ObjectRect(mainw,&wr);
- ObjectRect(prf_panel[0].names,&r);
- ObjectRect(prf_panel[1].names,&r1);
- boffset=wr.bottom-wr.top-r1.bottom;
- loffset=r.left;
- toffset=r.top;
- ObjectRect(prf_panel[0].seqs,&r);
- roffset=wr.right-wr.left-r.right;
- }
- else
- {
- Hide(prf1_display);
- Hide(prf2_display);
- Hide(pscrolltext);
- profile_no=0;
- Show(seq_display);
- active_panel=seq_panel;
-
- Select(seq_display);
- load_aln(seq_panel,0,nseqs-1,TRUE);
-
- Show(mainw);
- ObjectRect(mainw,&wr);
- ObjectRect(seq_panel.names,&r);
- boffset=wr.bottom-wr.top-r.bottom;
- loffset=r.left;
- toffset=r.top;
- ObjectRect(seq_panel.seqs,&r);
- roffset=wr.right-wr.left-r.right;
- }
- /* vertical gap between the two profile name panes, taken in both modes */
- ObjectRect(prf_panel[0].names,&r);
- ObjectRect(prf_panel[1].names,&r1);
- poffset=r1.top-r.bottom;
-
-/* initialise some variables before we display the window */
- /* default PostScript block length for the page size and orientation */
- if(orientation==LANDSCAPE)
- {
- if(pagesize==A4) blocklen=150;
- else if (pagesize==A3) blocklen=250;
- else blocklen=150;
- }
- else
- {
- if(pagesize==A4) blocklen=80;
- else if (pagesize==A3) blocklen=150;
- else blocklen=150;
- }
-
-/* ok - Go! */
- window_displayed=TRUE;
- ProcessEvents();
-
-}
-
-
-/* Window close callback: removes window w.  Passed to FixedWindow() when the
-   search and PostScript dialogs are created. */
-static void RemoveWin(WindoW w)
-{
- Remove(w);
-}
-
-
-/*
- * QuitWinW - close callback of the main window.
- *
- * Asks for confirmation once for every modified panel of the current
- * alignment mode (the alignment in multiple mode; profile 1, then profile 2
- * in profile mode).  Answering "no" to any question cancels the quit;
- * otherwise the program is terminated with QuitProgram().
- */
-static void QuitWinW(WindoW w)
-{
-    if (aln_mode == MULTIPLEM)
-    {
-        if (seq_panel.modified &&
-            Message(MSG_YN,"Alignment has not been saved.\n"
-                           "Quit program anyway?") == ANS_NO)
-            return;
-    }
-    else if (aln_mode == PROFILEM)
-    {
-        if (prf_panel[0].modified &&
-            Message(MSG_YN,"Profile 1 has not been saved.\n"
-                           "Quit program anyway?") == ANS_NO)
-            return;
-        if (prf_panel[1].modified &&
-            Message(MSG_YN,"Profile 2 has not been saved.\n"
-                           "Quit program anyway?") == ANS_NO)
-            return;
-    }
-    QuitProgram ();
-}
-
-/*
- * SearchStrWin - menu callback that opens the "search in selected
- * sequences" dialog.
- *
- * Requires that the active panel holds sequences and that at least one of
- * them is selected; otherwise a message is shown and no window is created.
- * The dialog (stored in findw) contains the search text field (findtext)
- * and three buttons: search from start, search again, and close.
- */
-static void SearchStrWin (IteM item)
-{
-    panel_data data;
-    Boolean sel=FALSE;
-    int i;
-
-    GetPanelExtra(active_panel.names,&data);
-    if (data.nseqs==0)
-    {
-        Message(MSG_OK,"No file loaded.");
-        return;
-    }
-
-    /* is any sequence name selected? */
-    for (i=0;i<data.nseqs;i++)
-    {
-        if (data.selected[i]==TRUE)
-        {
-            sel=TRUE;
-            break;
-        }
-    }
-    if (sel==FALSE)
-    {
-        Message(MSG_OK,"Select sequences by clicking on the names.");
-        return;
-    }
-
-    SelectFont(systemFont);
-    stdCharWidth=CharWidth('A');
-    stdLineHeight=LineHeight();
-    findw=FixedWindow(-50, -33, -10, -10, "SEARCH IN SELECTED SEQUENCES",RemoveWin);
-    stdLineHeight=18;
-    SelectFont(programFont);
-    findtext=DialogText(findw, "", 35, NULL);
-    Break(findw);
-    /* the button handles are not needed after creation */
-    PushButton(findw, "SEARCH FROM START", SearchStr);
-    Break(findw);
-    PushButton(findw, "SEARCH AGAIN", SearchStringAgain);
-    Break(findw);
-    PushButton(findw, "CLOSE", CancelWin);
-
-    Show(findw);
-}
-
-/* Menu callback: opens the PostScript output dialog for the multiple
-   alignment, or reports an error when no sequences are loaded. */
-static void SavePSSeqWin (IteM item)
-{
-    if (!empty)
-        save_ps_window(0,"WRITE SEQUENCES TO:",SavePSSeqFile);
-    else
-        error("No file loaded");
-}
-
-/* Menu callback: opens the PostScript output dialog for profile 1, or
-   reports an error when profile 1 is empty. */
-static void SavePSPrf1Win (IteM item)
-{
-    if (!profile1_empty)
-        save_ps_window(1,"WRITE PROFILE 1 TO:",SavePSPrf1File);
-    else
-        error("No file loaded");
-}
-
-/* Menu callback: opens the PostScript output dialog for profile 2, or
-   reports an error when profile 2 is empty. */
-static void SavePSPrf2Win (IteM item)
-{
-    if (!profile2_empty)
-        save_ps_window(2,"WRITE PROFILE 2 TO:",SavePSPrf2File);
-    else
-        error("No file loaded");
-}
-
-/*
- * save_ps_window - build and show the "WRITE POSTSCRIPT FILE" dialog.
- *
- *   prf_no     0 = multiple alignment, 1 = profile 1, anything else =
- *              profile 2; selects the panel that supplies the column count
- *              and the file name the default output name is derived from
- *   prompt     label shown above the output file name field
- *   save_proc  callback attached to the OK button
- *
- * Side effects: sets savepsw, savepstext, pspartext and blocklentext, resets
- * first_printres to 1 and last_printres to the panel's column count.
- *
- * The default output name is get_path() of the source file name with "ps"
- * appended.  The last branch of the name selection is a plain else, like the
- * panel selection above it, so that path is always initialised before
- * strcat() is applied to it.
- */
-static void save_ps_window(int prf_no,char *prompt,void save_proc(ButtoN but))
-{
-    GrouP savegr;
-    PopuP ps,orient;
-    char path[FILENAMELEN];
-    char str[FILENAMELEN];
-    panel_data data;
-
-    SelectFont(systemFont);
-    stdCharWidth=CharWidth('A');
-    stdLineHeight=LineHeight();
-    savepsw=FixedWindow(-50, -33, -10, -10, "WRITE POSTSCRIPT FILE",RemoveWin);
-    make_prompt(savepsw, prompt);
-    stdLineHeight=18;
-    SelectFont(programFont);
-    savepstext=DialogText(savepsw, "", 35, NULL);
-    Break(savepsw);
-    make_prompt(savepsw, "PS Colors File :");
-    pspartext=DialogText(savepsw, ps_par_file, 35, NULL);
-    Break(savepsw);
-
-    /* page size popup, preset from pagesize */
-    make_prompt(savepsw, "Page Size");
-    Advance(savepsw);
-    ps=PopupList(savepsw,TRUE,set_pagesize);
-    PopupItem(ps,"A4");
-    PopupItem(ps,"A3");
-    PopupItem(ps,"US Letter");
-    if (pagesize == A4)
-        SetValue(ps,1);
-    else if (pagesize == A3)
-        SetValue(ps,2);
-    else if (pagesize == USLETTER)
-        SetValue(ps,3);
-    Break(savepsw);
-
-    /* orientation popup, preset from orientation */
-    make_prompt(savepsw, "Orientation");
-    Advance(savepsw);
-    orient=PopupList(savepsw,TRUE,set_orientation);
-    PopupItem(orient,"LANDSCAPE");
-    PopupItem(orient,"PORTRAIT");
-    if (orientation == LANDSCAPE)
-        SetValue(orient,1);
-    else if (orientation == PORTRAIT)
-        SetValue(orient,2);
-    Break(savepsw);
-
-    make_toggle(savepsw,"Print Header :","YES","NO",&ps_header,set_header);
-    Advance(savepsw);
-    make_toggle(savepsw,"Print Quality Curve :","YES","NO",&ps_curve,set_curve);
-    Break(savepsw);
-    make_toggle(savepsw,"Print Ruler :","YES","NO",&ps_ruler,set_ruler);
-    Advance(savepsw);
-    make_toggle(savepsw,"Print Residue Numbers :","YES","NO",&ps_resno,set_resno);
-    Break(savepsw);
-    make_toggle(savepsw,"Resize to fit page:","YES","NO",&resize,set_resize);
-    Break(savepsw);
-
-    /* default print range: the whole alignment of the chosen panel */
-    first_printres=1;
-    if (prf_no==0)
-        GetPanelExtra(seq_panel.seqs,&data);
-    else if (prf_no==1)
-        GetPanelExtra(prf_panel[0].seqs,&data);
-    else
-        GetPanelExtra(prf_panel[1].seqs,&data);
-    last_printres=data.ncols;
-    make_prompt(savepsw, "Print from position :");
-    Advance(savepsw);
-    sprintf(str,"%5d",first_printres);
-    DialogText(savepsw, str, 5,set_fpres);
-    Advance(savepsw);
-    make_prompt(savepsw, "to :");
-    Advance(savepsw);
-    sprintf(str,"%5d",last_printres);
-    DialogText(savepsw, str, 5,set_lpres);
-    Break(savepsw);
-    make_prompt(savepsw, "Use block length :");
-    Advance(savepsw);
-    sprintf(str,"%5d",blocklen);
-    blocklentext=DialogText(savepsw, str, 5,set_blocklen);
-    Break(savepsw);
-
-    /* OK / CLOSE buttons; their handles are not needed afterwards */
-    savegr=HiddenGroup(savepsw, 2, 0, NULL);
-    shift(savegr, 60, 20);
-    PushButton(savegr, " OK ", save_proc);
-    shift(savegr, 20,0);
-    PushButton(savegr, "CLOSE", CancelWin);
-
-    /* default output file name */
-    if (prf_no==0)
-        get_path(seqname,path);
-    else if (prf_no==1)
-        get_path(profile1_name,path);
-    else
-        get_path(profile2_name,path);
-    strcat(path,"ps");
-    SetTitle(savepstext, path);
-    Show(savepsw);
-}
-
-static void SaveScoresWin (IteM item)
-{ /* Menu callback: build and show the "SAVE QUALITY SCORES" dialog.
-   * Refuses to open when no file is loaded or when no sequence is
-   * selected in the active panel's name list. */
- int i;
- Boolean sel=FALSE;
- GrouP scoregr;
- ButtoN score_can,score_ok;
- PopuP ps,or; /* NOTE(review): ps, or and str are never used in this function */
- char path[FILENAMELEN];
- char str[FILENAMELEN];
- panel_data data;
-
-
- if (empty)
- {
- error("No file loaded");
- return;
- }
-
- GetPanelExtra(active_panel.names,&data);
- for (i=0;i<data.nseqs;i++) /* look for at least one selected sequence */
- if(data.selected[i]==TRUE)
- {
- sel=TRUE;
- break;
- }
- if(sel==FALSE)
- {
- Message(MSG_OK,"Select sequences to be written by clicking on the names.");
- return;
- }
-
- get_path(seqname,path);
- strcat(path,"qscores"); /* default output name: get_path() result with "qscores" appended */
-
- SelectFont(systemFont);
- stdCharWidth=CharWidth('A');
- stdLineHeight=LineHeight();
- savescoresw=FixedWindow(-50, -33, -10, -10, "SAVE QUALITY SCORES",RemoveWin);
- stdLineHeight=18;
- SelectFont(programFont);
- make_prompt(savescoresw, "SAVE QUALITY SCORES TO:");
- stdLineHeight=18;
- SelectFont(programFont);
- Break(savescoresw);
- savescorestext=DialogText(savescoresw, "", 35, NULL);
- Break(savescoresw);
- scoregr=HiddenGroup(savescoresw, 2, 0, NULL);
- shift(scoregr, 60, 20);
- score_ok=PushButton(scoregr, " OK ", SaveScores);
- shift(scoregr, 20,0);
- score_can=PushButton(scoregr, "CANCEL", CancelWin);
-
- SetTitle(savescorestext, path);
- Show(savescoresw);
-
- Advance(savescoresw); /* NOTE(review): the window is shown a second time below; looks redundant - confirm */
- Show(savescoresw);
-}
-
-/* OK-button callback of the "SAVE QUALITY SCORES" dialog.
- *
- * Reads the output file name from savescorestext and writes one line per
- * alignment column that is not a gap in every selected sequence: the
- * residue of each selected sequence ('-' for a gap or for a position past
- * the end of that sequence), followed by a tab and the column score taken
- * from the active sequence panel.  Removes the dialog afterwards.
- *
- * If the output file cannot be opened the function returns without
- * writing anything and leaves the dialog open; presumably
- * open_explicit_file() has already reported the problem, since
- * open_aln_files() also fails silently on a NULL result - confirm.
- */
-static void SaveScores(ButtoN but)
-{
-    char c;
-    int i,j,val;
-    int length=0;
-    FILE *outfile;
-    panel_data name_data,seq_data;
-    Boolean gap;
-
-    GetPanelExtra(active_panel.names,&name_data);
-    GetPanelExtra(active_panel.seqs,&seq_data);
-
-    GetTitle(savescorestext, filename, FILENAMELEN);
-    stripspace(filename);
-
-    outfile=open_explicit_file(filename);
-    if (outfile==NULL)
-        return;
-
-/* get the maximum length of the selected sequences */
-    for (i=1;i<=nseqs;i++)
-        if (name_data.selected[i-1]==TRUE && length < seqlen_array[i])
-            length = seqlen_array[i];
-
-    for(j=1;j<=length;j++)
-    {
-/* first check for a column of gaps */
-        gap=TRUE;
-        for (i=1;i<=nseqs;i++)
-        {
-            if (name_data.selected[i-1]!=TRUE)
-                continue;
-            /* positions past the end of a sequence count as gaps; test the
-               bound before indexing so seq_array is never read out of range */
-            if (j>seqlen_array[i])
-                continue;
-            val = seq_array[i][j];
-            if ((val != gap_pos1) && (val != gap_pos2))
-            {
-                gap=FALSE;
-                break;
-            }
-        }
-        if(gap==FALSE)
-        {
-            for (i=1;i<=nseqs;i++)
-            {
-                if (name_data.selected[i-1]!=TRUE)
-                    continue;
-                if (j>seqlen_array[i])
-                    c = '-';
-                else
-                {
-                    val = seq_array[i][j];
-                    if ((val == gap_pos1) || (val == gap_pos2))
-                        c = '-';
-                    else
-                        c = amino_acid_codes[val];
-                }
-                fprintf(outfile,"%c ",c);
-            }
-            fprintf(outfile,"\t%3d\n",seq_data.colscore[j-1]);
-        }
-    }
-    fclose(outfile);
-
-    if (Visible(savescoresw))
-    {
-        Remove(savescoresw);
-        savescoresw=NULL;
-    }
-
-    info("File %s saved",filename);
-}
-
-/* Menu callback: open the "SAVE SEQUENCES" dialog, or report
- * "No file loaded" when there is nothing to save. */
-static void SaveSeqFileWin (IteM item)
-{
-    if (!empty)
-        save_aln_window(0,"SAVE SEQUENCES","SAVE SEQUENCES AS:",SaveSeqFile);
-    else
-        error("No file loaded");
-}
-
-/* Menu callback: open the "SAVE PROFILE" dialog for profile 1, or report
- * "No file loaded" when profile 1 is empty. */
-static void SavePrf1FileWin (IteM item)
-{
-    if (!profile1_empty)
-        save_aln_window(1,"SAVE PROFILE","SAVE PROFILE 1 AS:",SavePrf1File);
-    else
-        error("No file loaded");
-}
-/* Menu callback: open the "SAVE PROFILE" dialog for profile 2, or report
- * "No file loaded" when profile 2 is empty. */
-static void SavePrf2FileWin (IteM item)
-{
-    if (!profile2_empty)
-        save_aln_window(2,"SAVE PROFILE","SAVE PROFILE 2 AS:",SavePrf2File);
-    else
-        error("No file loaded");
-}
-
-static void save_aln_window(int prf_no,char *title,char *prompt,void save_proc(ButtoN but))
-{ /* Build and show the alignment "save as" dialog.
-   * prf_no    0 = sequences, 1 = profile 1, 2 = profile 2; selects which
-   *           input name the default output name is derived from.
-   * title     window title.
-   * prompt    label shown above the output file name field.
-   * save_proc callback attached to the OK button. */
- GrouP savegr;
- ButtoN save_ok, save_can;
- GrouP maing;
- GrouP format_list;
- ButtoN formatb[6+1]; /* + 1 for fasta */
- PopuP case_toggle,snos_toggle;
- PopuP seqRange_toggle; /* Ramu */
- char path[FILENAMELEN+1];
- char str[FILENAMELEN+1];
-
- SelectFont(systemFont);
- stdCharWidth=CharWidth('A');
- stdLineHeight=LineHeight();
-
- savealnw=FixedWindow(-50, -33, -10, -10, title,RemoveWin);
-
- format_list=NormalGroup(savealnw,3,0,"Format",systemFont,set_format);
- formatb[0]=RadioButton(format_list,"CLUSTAL");
- formatb[1]=RadioButton(format_list,"NBRF/PIR");
- formatb[2]=RadioButton(format_list,"GCG/MSF");
- formatb[3]=RadioButton(format_list,"PHYLIP");
- formatb[4]=RadioButton(format_list,"GDE");
- formatb[5]=RadioButton(format_list,"NEXUS");
- formatb[6]=RadioButton(format_list,"FASTA");
-
- if(prf_no==0) /* NOTE(review): path stays uninitialised if prf_no is not 0, 1 or 2 */
- get_path(seqname,path);
- else if(prf_no==1)
- get_path(profile1_name,path);
- else if(prf_no==2)
- get_path(profile2_name,path);
-
- if (save_format==CLUSTAL) /* preselect the current format and append its file extension */
- {
- SetValue(format_list,1);
- strcat(path,"aln");
- }
- else if (save_format==PIR)
- {
- SetValue(format_list,2);
- strcat(path,"pir");
- }
- else if (save_format==MSF)
- {
- SetValue(format_list,3);
- strcat(path,"msf");
- }
- else if (save_format==PHYLIP)
- {
- SetValue(format_list,4);
- strcat(path,"phy");
- }
- else if (save_format==GDE)
- {
- SetValue(format_list,5);
- strcat(path,"gde");
- }
- else if (save_format==NEXUS)
- {
- SetValue(format_list,6);
- strcat(path,"nxs");
- }
- else if (save_format==FASTA)
- {
- SetValue(format_list,7);
- strcat(path,"fasta");
- }
-
- maing=HiddenGroup(savealnw,0,0,NULL);
- SetGroupSpacing(maing,0,10);
-
- case_toggle=make_toggle(maing,"GDE output case :","Lower","Upper",&lowercase,set_case);
- Break(maing);
- snos_toggle=make_toggle(maing,"CLUSTALW sequence numbers :","ON","OFF",&cl_seq_numbers,set_snos);
-
- Break(maing);
- make_prompt(maing, "Save range from :");
- Advance(maing);
- firstres = 0; /* init always ramu */
- lastres = 0; /* init always ramu */
- sprintf(str,"%5d",firstres); /* write_file() treats values below 1 as "start at 1" / "up to the longest sequence" */
- DialogText(maing, str, 5,set_fres);
- Advance(maing);
- make_prompt(maing, "to :");
- Advance(maing);
- sprintf(str,"%5d",lastres);
- DialogText(maing, str, 5,set_lres);
- /* <Ramu> */
- Advance(maing);
- seqRange_toggle=make_toggle(maing," and include range numbers :","ON","OFF",&seqRange,setRange);
- /*</Ramu>*/
-
- Break(maing);
- shift(savealnw, 0, 20);
- make_prompt(savealnw, prompt);
- stdLineHeight=18;
- SelectFont(programFont);
- Break(savealnw);
- savealntext=DialogText(savealnw, "", 35, NULL); /* output file name; read back by write_file() */
- Break(savealnw);
- savegr=HiddenGroup(savealnw, 2, 0, NULL);
- shift(savegr, 60, 20);
- save_ok=PushButton(savegr, " OK ", save_proc);
- shift(savegr, 20,0);
- save_can=PushButton(savegr, "CANCEL", CancelWin);
-
- SetTitle(savealntext, path);
- Show(savealnw);
-
-}
-
-static void read_file_window(char *title,char *prompt,char *filename,void read_proc(ButtoN but))
-{ /* Build and show a small dialog asking for an input file name.
-   * title     window title.
-   * prompt    label shown above the text field.
-   * filename  initial content of the text field; may be NULL, in which
-   *           case the field starts empty.
-   * read_proc callback attached to the OK button. */
- GrouP readgr;
- ButtoN read_ok, read_can;
- GrouP maing;
-
- SelectFont(systemFont);
- stdCharWidth=CharWidth('A');
- stdLineHeight=LineHeight();
- readfilew=FixedWindow(-50, -33, -10, -10, title,RemoveWin);
-
- maing=HiddenGroup(readfilew,2,0,NULL);
- SetGroupSpacing(maing,0,10);
-
- shift(readfilew, 0, 20);
- make_prompt(readfilew, prompt);
- stdLineHeight=18;
- SelectFont(programFont);
- Break(readfilew);
- readfiletext=DialogText(readfilew, "", 35, NULL);
- if (filename != NULL) SetTitle(readfiletext, filename);
- Break(readfilew);
- readgr=HiddenGroup(readfilew, 2, 0, NULL);
- shift(readgr, 60, 20);
- read_ok=PushButton(readgr, " OK ", read_proc);
- shift(readgr, 20,0);
- read_can=PushButton(readgr, "CANCEL", CancelWin);
-
- Show(readfilew);
-}
-
-static void CancelWin (ButtoN but)
-{ /* Button callback used by the CANCEL/CLOSE buttons of the dialogs in this
-   * file: removes the window that ParentWindow() returns for the button. */
- Remove(ParentWindow(but));
-}
-
-static void SearchStr(ButtoN but)
-{ /* Button callback: start a fresh search.  Rewinds find_pos so that
-   * SearchStringAgain() begins at sequence 0, column 0 (it searches from
-   * find_pos.res+1, hence the -1). */
-
-/* reset the current position */
-
- find_pos.seq=0;
- find_pos.res=-1;
-
-/* find the next occurrence of the string */
- SearchStringAgain(but);
-
-
-}
-
-/* Button callback: find the next occurrence of the search string.
- *
- * The text of findtext is copied (at most MAXFINDSTR characters) into
- * find_string and upper-cased.  Starting just after find_pos, the selected
- * sequences of the active panel are scanned; '-' characters in a sequence
- * are skipped while matching, so a hit may span gaps.  On success find_pos
- * is updated to the start of the hit and its position is reported,
- * otherwise a "not found" message is shown.
- *
- * The partial-match state is reset whenever the scan moves to the next
- * sequence, so a match can no longer begin at the end of one sequence and
- * be completed at the start of the following one.
- *
- * NOTE(review): strncpy() does not terminate find_string when the input is
- * MAXFINDSTR characters or longer; whether that is safe depends on the
- * declared size of find_string, which is not visible here - confirm.
- */
-static void SearchStringAgain(ButtoN but)
-{
-    int i,ix,length;
-    int seq,res,start_res;
-    Boolean in_string,found;
-    panel_data ndata,sdata;
-
-    GetTitle(findtext, filename, FILENAMELEN);
-    stripspace(filename);
-
-    strncpy(find_string,filename,MAXFINDSTR);
-    length=strlen(find_string);
-    if(length==0) return;
-    /* toupper() requires a value representable as unsigned char */
-    for(i=0;i<length;i++)
-        find_string[i]=toupper((unsigned char)find_string[i]);
-
-    GetPanelExtra(active_panel.names,&ndata);
-    GetPanelExtra(active_panel.seqs,&sdata);
-
-    in_string=FALSE;
-    found=FALSE;
-    start_res=0;
-    ix=0;
-    seq=find_pos.seq;
-    res=find_pos.res+1;
-    while (seq<ndata.nseqs)
-    {
-        if(ndata.selected[seq]==TRUE)
-        {
-            while (res<sdata.ncols)
-            {
-                if(sdata.lines[seq][res]==find_string[ix])
-                {
-                    /* remember where this candidate match started */
-                    if(in_string==FALSE)
-                        start_res=res;
-                    ix++;
-                    in_string=TRUE;
-                }
-                else if(in_string==TRUE)
-                {
-                    /* mismatch inside a candidate: retry one past its start */
-                    res=start_res;
-                    ix=0;
-                    in_string=FALSE;
-                }
-                if(ix==length)
-                {
-                    find_pos.seq=seq;
-                    find_pos.res=start_res;
-                    found=TRUE;
-                    break;
-                }
-                res++;
-                /* gaps are transparent to the search */
-                while(res<sdata.ncols && sdata.lines[seq][res]=='-')
-                    res++;
-            }
-        }
-        if(found) break;
-        seq++;
-        res=0;
-        /* a candidate match must not carry over into the next sequence */
-        ix=0;
-        in_string=FALSE;
-    }
-
-    if(found==FALSE)
-        info("String %s not found",find_string);
-    else
-    {
-        info("String %s in sequence %s, column %d",find_string,names[find_pos.seq+1],find_pos.res+1);
-    }
-}
-
-/* OK-button callback of the PostScript dialog for the sequence panel.
- * Takes the output name from savepstext and the colour parameter file name
- * from pspartext (stored in the file-scope ps_par_file), then writes the
- * PostScript file for seq_panel using the current page settings. */
-static void SavePSSeqFile(ButtoN but)
-{
-    char *out_name=(char *)ckalloc(FILENAMELEN*sizeof(char));
-
-    /* output file name */
-    GetTitle(savepstext, filename, FILENAMELEN);
-    stripspace(filename);
-    strcpy(out_name,filename);
-
-    /* colour parameter file name */
-    GetTitle(pspartext, filename, FILENAMELEN);
-    stripspace(filename);
-    strcpy(ps_par_file,filename);
-
-    write_ps_file(seq_panel,out_name,ps_par_file,pagesize,orientation,
-                  ps_header,ps_ruler,ps_resno,
-                  resize,first_printres,last_printres,blocklen,ps_curve);
-
-    info("Postscript file %s written",out_name);
-    ckfree(out_name);
-}
-
-/* OK-button callback of the PostScript dialog for profile 1.
- * Takes the output name from savepstext and the colour parameter file name
- * from pspartext, then writes the PostScript file for prf_panel[0] using
- * the current page settings.  Both temporary name buffers are released
- * before returning (the parameter file buffer used to be leaked).
- *
- * NOTE(review): the local ps_par_file hides the file-scope variable of the
- * same name that SavePSSeqFile() updates, so the name typed here is not
- * remembered for the next dialog - confirm whether that is intended.
- */
-static void SavePSPrf1File(ButtoN but)
-{
-    char *ps_file;
-    char *ps_par_file;
-
-    GetTitle(savepstext, filename, FILENAMELEN);
-    stripspace(filename);
-
-    ps_file=(char *)ckalloc(FILENAMELEN*sizeof(char));
-    strcpy(ps_file,filename);
-
-    GetTitle(pspartext, filename, FILENAMELEN);
-    stripspace(filename);
-
-    ps_par_file=(char *)ckalloc(FILENAMELEN*sizeof(char));
-    strcpy(ps_par_file,filename);
-
-    write_ps_file(prf_panel[0],ps_file,ps_par_file,pagesize,orientation,
-                  ps_header,ps_ruler,ps_resno,
-                  resize,first_printres,last_printres,blocklen,ps_curve);
-
-    info("Postscript file %s written",ps_file);
-    ckfree(ps_par_file);
-    ckfree(ps_file);
-
-}
-
-/* OK-button callback of the PostScript dialog for profile 2.
- * Takes the output name from savepstext and the colour parameter file name
- * from pspartext, then writes the PostScript file for prf_panel[1] using
- * the current page settings.  Both temporary name buffers are released
- * before returning (the parameter file buffer used to be leaked).
- *
- * NOTE(review): the local ps_par_file hides the file-scope variable of the
- * same name that SavePSSeqFile() updates, so the name typed here is not
- * remembered for the next dialog - confirm whether that is intended.
- */
-static void SavePSPrf2File(ButtoN but)
-{
-    char *ps_file;
-    char *ps_par_file;
-
-    GetTitle(savepstext, filename, FILENAMELEN);
-    stripspace(filename);
-
-    ps_file=(char *)ckalloc(FILENAMELEN*sizeof(char));
-    strcpy(ps_file,filename);
-
-    GetTitle(pspartext, filename, FILENAMELEN);
-    stripspace(filename);
-
-    ps_par_file=(char *)ckalloc(FILENAMELEN*sizeof(char));
-    strcpy(ps_par_file,filename);
-
-    write_ps_file(prf_panel[1],ps_file,ps_par_file,pagesize,orientation,
-                  ps_header,ps_ruler,ps_resno,
-                  resize,first_printres,last_printres,blocklen,ps_curve);
-
-    info("Postscript file %s written",ps_file);
-    ckfree(ps_par_file);
-    ckfree(ps_file);
-
-}
-
-static void SaveSeqFile(ButtoN but)
-{ /* OK-button callback of the "SAVE SEQUENCES" dialog: writes sequences
-   * 1..nseqs for the chosen residue range, then clears the sequence
-   * panel's modified flag. */
- write_file(1,nseqs,firstres,lastres);
- seq_panel.modified=FALSE;
- info("File %s saved",filename); /* filename was read from the dialog by write_file() */
-}
-
-static void SavePrf1File(ButtoN but)
-{ /* OK-button callback of the "SAVE PROFILE" dialog for profile 1: writes
-   * sequences 1..profile1_nseqs, then clears that panel's modified flag. */
- write_file(1,profile1_nseqs,firstres,lastres);
- prf_panel[0].modified=FALSE;
- info("File %s saved",filename); /* filename was read from the dialog by write_file() */
-}
-
-static void SavePrf2File(ButtoN but)
-{ /* OK-button callback of the "SAVE PROFILE" dialog for profile 2: writes
-   * sequences profile1_nseqs+1..nseqs, then clears that panel's modified
-   * flag. */
- write_file(profile1_nseqs+1,nseqs,firstres,lastres);
- prf_panel[1].modified=FALSE;
- info("File %s saved",filename); /* filename was read from the dialog by write_file() */
-}
-
-/* This is equivalent to open_alignment_output(), but uses the window
- * interface to input file names.
- *
- * For every enabled output format the file name is read from the matching
- * dialog text field and the file is opened with open_explicit_file().
- * When logging is enabled the log file (get_path(seqname) + "log") is
- * opened for appending; failure to open the log is reported but is not
- * fatal.
- *
- * Returns TRUE when all selected alignment files were opened, FALSE when
- * no output format is selected or one of the files could not be opened.
- * FASTA now counts as a selected format, so choosing only FASTA output is
- * no longer rejected.
- *
- * NOTE(review): files opened before a later failure are left open here.
- */
-static Boolean open_aln_files(void)
-{
-    char path[FILENAMELEN];
-
-    if(!output_clustal && !output_nbrf && !output_gcg &&
-       !output_phylip && !output_gde && !output_nexus &&
-       !output_fasta) {
-        error("You must select an alignment output format");
-        return FALSE;
-    }
-
-    if(output_clustal) {
-        GetTitle(cl_outtext,filename,FILENAMELEN);
-        stripspace(filename);
-        if((clustal_outfile = open_explicit_file(
-            filename))==NULL) return FALSE;
-    }
-    if(output_nbrf) {
-        GetTitle(pir_outtext,filename,FILENAMELEN);
-        stripspace(filename);
-        if((nbrf_outfile = open_explicit_file(
-            filename))==NULL) return FALSE;
-    }
-    if(output_gcg) {
-        GetTitle(msf_outtext,filename,FILENAMELEN);
-        stripspace(filename);
-        if((gcg_outfile = open_explicit_file(
-            filename))==NULL) return FALSE;
-    }
-    if(output_phylip) {
-        GetTitle(phylip_outtext,filename,FILENAMELEN);
-        stripspace(filename);
-        if((phylip_outfile = open_explicit_file(
-            filename))==NULL) return FALSE;
-    }
-    if(output_gde) {
-        GetTitle(gde_outtext,filename,FILENAMELEN);
-        stripspace(filename);
-        if((gde_outfile = open_explicit_file(
-            filename))==NULL) return FALSE;
-    }
-    if(output_nexus) {
-        GetTitle(nexus_outtext,filename,FILENAMELEN);
-        stripspace(filename);
-        if((nexus_outfile = open_explicit_file(
-            filename))==NULL) return FALSE;
-    }
-
-/* <Ramu> */
-    if(output_fasta) {
-        GetTitle(fasta_outtext,filename,FILENAMELEN);
-        stripspace(filename);
-        if((fasta_outfile = open_explicit_file(
-            filename))==NULL) return FALSE;
-    }
-/* </Ramu> */
-    if(save_log)
-    {
-        get_path(seqname,path);
-        strcpy(save_log_filename,path);
-        strcat(save_log_filename,"log");
-        if ((save_log_fd=fopen(save_log_filename,"a"))==NULL)
-            error("Cannot open log file %s",save_log_filename);
-    }
-
-    return TRUE;
-}
-
-/* Write sequences fseq..lseq in the currently selected save_format.
- *
- * fseq, lseq  first and last sequence (1-based) to write.
- * fres, lres  first and last residue to write; fres < 1 means "from 1",
- *             lres < 1 means "up to the longest of the chosen sequences".
- *
- * The output name is read from savealntext into the file-scope filename.
- * The save dialog is removed after a successful write.
- *
- * If the file cannot be opened nothing is written and the dialog stays
- * open; presumably open_explicit_file() has already reported the problem,
- * since open_aln_files() also fails silently on a NULL result - confirm.
- * NOTE(review): the function returns void, so its callers still print
- * "File ... saved" in that case.
- */
-static void write_file(int fseq, int lseq, int fres, int lres)
-{
-    int i,length=0;
-    FILE *outfile;
-
-    GetTitle(savealntext, filename, FILENAMELEN);
-    stripspace(filename);
-
-    outfile=open_explicit_file(filename);
-    if (outfile==NULL)
-        return;
-
-    for (i=fseq;i<=lseq;i++)
-        if (length < seqlen_array[i]) length = seqlen_array[i];
-
-    if(fres<1) fres=1;
-    if(lres<1) lres=length;
-    length=lres-fres+1;
-
-    if(save_format==CLUSTAL) {
-        clustal_out(outfile, fres, length, fseq, lseq);
-        fclose(outfile);
-        info("CLUSTAL format file created [%s]",filename);
-    }
-    else if(save_format==PIR) {
-        nbrf_out(outfile, fres, length, fseq, lseq);
-        fclose(outfile);
-        info("NBRF/PIR format file created [%s]",filename);
-    }
-    else if(save_format==MSF) {
-        gcg_out(outfile, fres, length, fseq, lseq);
-        fclose(outfile);
-        info("GCG/MSF format file created [%s]",filename);
-    }
-    else if(save_format==PHYLIP) {
-        phylip_out(outfile, fres, length, fseq, lseq);
-        fclose(outfile);
-        info("PHYLIP format file created [%s]",filename);
-    }
-    else if(save_format==GDE) {
-        gde_out(outfile, fres, length, fseq, lseq);
-        fclose(outfile);
-        info("GDE format file created [%s]",filename);
-    }
-    else if(save_format==NEXUS) {
-        nexus_out(outfile, fres, length, fseq, lseq);
-        fclose(outfile);
-        info("NEXUS format file created [%s]",filename);
-    }
-
-/* <Ramu> */
-    else if(save_format==FASTA) {
-        fasta_out(outfile, fres, length, fseq, lseq);
-        fclose(outfile);
-        info("FASTA format file created [%s]",filename);
-    }
-/* </Ramu> */
-    else {
-        /* unknown format: nothing is written, but the stream must not leak */
-        fclose(outfile);
-    }
-
-    if (Visible(savealnw))
-    {
-        Remove(savealnw);
-        savealnw=NULL;
-    }
-
-
-}
-
-static void SaveTreeWin (IteM item)
-{ /* Menu callback: build and show the "CREATE TREE" dialog, which asks for
-   * the tree file name; its OK button runs CreateAlignTree().  Refuses to
-   * open when no file is loaded or fewer than two sequences exist. */
- GrouP savegr;
- ButtoN save_ok, save_can;
- char path[FILENAMELEN];
-
- if (empty)
- {
- error("No file loaded");
- return;
- }
- if (nseqs < 2)
- {
- error("Alignment has only %d sequences",nseqs);
- return;
- }
-
- SelectFont(systemFont);
- stdCharWidth=CharWidth('A');
- stdLineHeight=LineHeight();
- savetreew=FixedWindow(-50, -33, -10, -10, "CREATE TREE",RemoveWin);
- shift(savetreew, 0, 20);
- make_prompt(savetreew, "SAVE TREE AS :");
- Advance(savetreew);
- shift(savetreew, 0, -10);
- stdLineHeight=18;
- SelectFont(programFont);
- savetreetext=DialogText(savetreew, "", 35, NULL);
- SelectFont(systemFont);
- stdLineHeight=15;
- Break(savetreew);
- savegr=HiddenGroup(savetreew, 2, 0, NULL);
- shift(savegr, 140, 20);
- save_ok=PushButton(savegr, " OK ", CreateAlignTree);
- shift(savegr, 20, 0);
- save_can=PushButton(savegr, "CANCEL", CancelWin);
-
- get_path(seqname,path);
- strcat(path,"dnd"); /* default output name: get_path() result with "dnd" appended */
-
- SetTitle(savetreetext, path);
- Show(savetreew);
-}
-
-static void DrawTreeWin (IteM item)
-{ /* Menu callback: build and show the "DRAW TREE" dialog.  One file name
-   * field is created for each enabled output (CLUSTAL, PHYLIP, distance
-   * matrix, NEXUS, % identity matrix), pre-filled with get_path(seqname)
-   * plus the matching extension; the OK button runs DrawTree().  Refuses
-   * to open when no file is loaded or fewer than two sequences exist. */
- GrouP drawgr;
- GrouP output_list;
- ButtoN draw_ok, draw_can;
- char path[FILENAMELEN];
- char name[FILENAMELEN];
-
- if (empty)
- {
- error("No file loaded");
- return;
- }
- if (nseqs < 2)
- {
- error("Alignment has only %d sequences",nseqs);
- return;
- }
-
- get_path(seqname,path);
-
- SelectFont(systemFont);
- stdCharWidth=CharWidth('A');
- stdLineHeight=LineHeight();
- drawtreew=FixedWindow(-50, -33, -10, -10, "DRAW TREE",RemoveWin);
- output_list=HiddenGroup(drawtreew, 2, 0, NULL);
- if (output_tree_clustal)
- {
- make_prompt(output_list, "SAVE CLUSTAL TREE AS :");
- drawnjtreetext=DialogText(output_list, "", 35, NULL);
- strcpy(name,path);
- strcat(name,"nj");
- SetTitle(drawnjtreetext, name);
- Break(output_list);
- }
- if (output_tree_phylip)
- {
- make_prompt(output_list, "SAVE PHYLIP TREE AS :");
- drawphtreetext=DialogText(output_list, "", 35, NULL);
- strcpy(name,path);
- strcat(name,"ph");
- SetTitle(drawphtreetext, name);
- Break(output_list);
- }
- if (output_tree_distances)
- {
- make_prompt(output_list, "SAVE DISTANCE MATRIX AS :");
- drawdsttreetext=DialogText(output_list, "", 35, NULL);
- strcpy(name,path);
- strcat(name,"dst");
- SetTitle(drawdsttreetext, name);
- Break(output_list);
- }
- if (output_tree_nexus)
- {
- make_prompt(output_list, "SAVE NEXUS TREE AS :");
- drawnxstreetext=DialogText(output_list, "", 35, NULL);
- strcpy(name,path);
- strcat(name,"tre");
- SetTitle(drawnxstreetext, name);
- Break(output_list);
- }
-
- if (output_pim)
- {
- make_prompt(output_list, "SAVE % IDENTITY MATRIX AS :");
- drawpimtext=DialogText(output_list, "", 35, NULL);
- strcpy(name,path);
- strcat(name,"pim");
- SetTitle(drawpimtext, name);
- Break(output_list);
- }
-
- SelectFont(systemFont);
- stdLineHeight=15;
- Break(drawtreew);
- drawgr=HiddenGroup(drawtreew, 2, 0, NULL);
- shift(drawgr, 140, 20);
- draw_ok=PushButton(drawgr, " OK ", DrawTree);
- shift(drawgr, 20, 0);
- draw_can=PushButton(drawgr, "CANCEL", CancelWin);
-
- Show(drawtreew);
-}
-
-static void BootstrapTreeWin (IteM item)
-{ /* Menu callback: build and show the "BOOTSTRAP TREE" dialog with fields
-   * for the random seed and the number of trials, plus one file name
-   * field for each enabled tree output (CLUSTAL, PHYLIP, NEXUS),
-   * pre-filled with get_path(seqname) plus the matching extension.  The OK
-   * button runs BootstrapTree().  Refuses to open when no file is loaded
-   * or fewer than two sequences exist. */
- GrouP bootgr;
- ButtoN boot_ok, boot_can;
- TexT seed,ntrials;
- char name[FILENAMELEN];
- char path[FILENAMELEN];
- char str[FILENAMELEN];
- GrouP output_list;
-
- if (empty)
- {
- error("No file loaded");
- return;
- }
- if (nseqs < 2)
- {
- error("Alignment has only %d sequences",nseqs);
- return;
- }
-
- get_path(seqname,path);
-
- SelectFont(systemFont);
- stdCharWidth=CharWidth('A');
- stdLineHeight=LineHeight();
- boottreew=FixedWindow(-50, -33, -10, -10, "BOOTSTRAP TREE",RemoveWin);
- make_prompt(boottreew, "Random number generator seed [1-1000] :");
- Advance(boottreew);
- sprintf(str,"%4d",boot_ran_seed);
- seed=DialogText(boottreew, str, 4,set_ran_seed);
- Break(boottreew);
- make_prompt(boottreew, "Number of bootstrap trials [1-10000] :");
- Advance(boottreew);
- sprintf(str,"%5d",boot_ntrials);
- ntrials=DialogText(boottreew, str, 5,set_ntrials);
- Break(boottreew);
-
- output_list=HiddenGroup(boottreew, 2, 0, NULL);
- if (output_tree_clustal)
- {
- make_prompt(output_list, "SAVE CLUSTAL TREE AS :");
- bootnjtreetext=DialogText(output_list, "", 35, NULL);
- strcpy(name,path);
- strcat(name,"njb");
- SetTitle(bootnjtreetext, name);
- Break(output_list);
- }
- if (output_tree_phylip)
- {
- make_prompt(output_list, "SAVE PHYLIP TREE AS :");
- bootphtreetext=DialogText(output_list, "", 35, NULL);
- strcpy(name,path);
- strcat(name,"phb");
- SetTitle(bootphtreetext, name);
- Break(output_list);
- }
- if (output_tree_nexus)
- {
- make_prompt(output_list, "SAVE NEXUS TREE AS :");
- bootnxstreetext=DialogText(output_list, "", 35, NULL);
- strcpy(name,path);
- strcat(name,"treb");
- SetTitle(bootnxstreetext, name);
- Break(output_list);
- }
- SelectFont(systemFont);
- stdLineHeight=15;
- Break(boottreew);
- bootgr=HiddenGroup(boottreew, 2, 0, NULL);
- shift(bootgr, 140, 20);
- boot_ok=PushButton(bootgr, " OK ", BootstrapTree);
- shift(bootgr, 20, 0);
- boot_can=PushButton(bootgr, "CANCEL", CancelWin);
-
-
- Show(boottreew);
-}
-
-/* OK-button callback of the "CREATE TREE" dialog.
- *
- * Reads the tree file name from savetreetext, optionally opens the log
- * file (get_path(seqname) + "log", append mode), removes the dialog and
- * calls make_tree() with the file name.  The log file is closed again
- * afterwards.
- *
- * The name is now stripped of spaces before it is copied for make_tree(),
- * so the file that is written and the name shown in the final message
- * agree (previously the unstripped text was passed on).
- */
-static void CreateAlignTree(ButtoN but)
-{
-    char path[FILENAMELEN];
-    char phylip_name[FILENAMELEN];
-
-    GetTitle(savetreetext, filename, FILENAMELEN);
-    stripspace(filename);
-    strcpy(phylip_name,filename);
-
-    info("Doing pairwise alignments...");
-    if(save_log)
-    {
-        get_path(seqname,path);
-        strcpy(save_log_filename,path);
-        strcat(save_log_filename,"log");
-        /* a missing log file is reported but does not stop the run */
-        if ((save_log_fd=fopen(save_log_filename,"a"))==NULL)
-            error("Cannot open log file %s",save_log_filename);
-    }
-
-    WatchCursor();
-    if (Visible(savetreew))
-    {
-        Remove(savetreew);
-        savetreew=NULL;
-    }
-    make_tree(phylip_name);
-    if(save_log && save_log_fd!=NULL)
-    {
-        fclose(save_log_fd);
-        save_log_fd=NULL;
-    }
-    ArrowCursor();
-    info("Tree %s created",filename);
-}
-
-/* OK-button callback of the "DRAW TREE" dialog.
- *
- * Collects the file name of every enabled output from its dialog field,
- * optionally opens the log file (get_path(seqname) + "log", append mode),
- * removes the dialog and calls phylogenetic_tree().  The log file is
- * closed again afterwards.
- *
- * Name buffers of disabled outputs are passed on as empty strings rather
- * than uninitialised memory.
- */
-static void DrawTree(ButtoN but)
-{
-    char path[FILENAMELEN];
-    char phylip_name[FILENAMELEN]="";
-    char clustal_name[FILENAMELEN]="";
-    char dist_name[FILENAMELEN]="";
-    char nexus_name[FILENAMELEN]="";
-    char pim_name[FILENAMELEN]="";
-
-    if(output_tree_clustal)
-    {
-        GetTitle(drawnjtreetext, filename, FILENAMELEN);
-        stripspace(filename);
-        strcpy(clustal_name,filename);
-    }
-    if(output_tree_phylip)
-    {
-        GetTitle(drawphtreetext, filename, FILENAMELEN);
-        stripspace(filename);
-        strcpy(phylip_name,filename);
-    }
-    if(output_tree_distances)
-    {
-        GetTitle(drawdsttreetext, filename, FILENAMELEN);
-        stripspace(filename);
-        strcpy(dist_name,filename);
-    }
-    if(output_tree_nexus)
-    {
-        GetTitle(drawnxstreetext, filename, FILENAMELEN);
-        stripspace(filename);
-        strcpy(nexus_name,filename);
-    }
-
-    if(output_pim) /* if this is absent, no file gets created ??? */
-    {
-        GetTitle(drawpimtext, filename, FILENAMELEN);
-        stripspace(filename);
-        strcpy(pim_name,filename);
-    }
-
-    info("Calculating tree...");
-    WatchCursor();
-    if(save_log)
-    {
-        get_path(seqname,path);
-        strcpy(save_log_filename,path);
-        strcat(save_log_filename,"log");
-        /* a missing log file is reported but does not stop the run */
-        if ((save_log_fd=fopen(save_log_filename,"a"))==NULL)
-            error("Cannot open log file %s",save_log_filename);
-    }
-    if (Visible(drawtreew))
-    {
-        Remove(drawtreew);
-        drawtreew=NULL;
-    }
-    phylogenetic_tree(phylip_name,clustal_name,dist_name,nexus_name,pim_name);
-    if(save_log && save_log_fd!=NULL)
-    {
-        fclose(save_log_fd);
-        save_log_fd=NULL;
-    }
-    ArrowCursor();
-    /* filename holds whichever output name was read last */
-    info("Tree %s created",filename);
-}
-
-/* OK-button callback of the "BOOTSTRAP TREE" dialog.
- *
- * Collects the file name of every enabled tree output from its dialog
- * field, optionally opens the log file (get_path(seqname) + "log", append
- * mode), removes the dialog and calls bootstrap_tree().  The log file is
- * closed again afterwards.
- *
- * Name buffers of disabled outputs are passed on as empty strings rather
- * than uninitialised memory.
- */
-static void BootstrapTree(ButtoN but)
-{
-    char phylip_name[FILENAMELEN]="";
-    char clustal_name[FILENAMELEN]="";
-    char nexus_name[FILENAMELEN]="";
-    char path[FILENAMELEN];
-
-    if(output_tree_clustal)
-    {
-        GetTitle(bootnjtreetext, filename, FILENAMELEN);
-        stripspace(filename);
-        strcpy(clustal_name,filename);
-    }
-    if(output_tree_phylip)
-    {
-        GetTitle(bootphtreetext, filename, FILENAMELEN);
-        stripspace(filename);
-        strcpy(phylip_name,filename);
-    }
-    if(output_tree_nexus)
-    {
-        GetTitle(bootnxstreetext, filename, FILENAMELEN);
-        stripspace(filename);
-        strcpy(nexus_name,filename);
-    }
-
-    info("Bootstrapping tree...");
-
-    WatchCursor();
-    if(save_log)
-    {
-        get_path(seqname,path);
-        strcpy(save_log_filename,path);
-        strcat(save_log_filename,"log");
-        /* a missing log file is reported but does not stop the run */
-        if ((save_log_fd=fopen(save_log_filename,"a"))==NULL)
-            warning("Cannot open log file %s",save_log_filename);
-    }
-    if (Visible(boottreew))
-    {
-        Remove(boottreew);
-        boottreew=NULL;
-    }
-    bootstrap_tree(phylip_name,clustal_name,nexus_name);
-    if(save_log && save_log_fd!=NULL)
-    {
-        fclose(save_log_fd);
-        save_log_fd=NULL;
-    }
-    /* filename holds whichever output name was read last */
-    info("Bootstrap tree %s created",filename);
-    ArrowCursor();
-}
-
-
-/* Menu callback: load a sequence file into the sequence panel.
- * Asks for confirmation when sequences are already loaded, lets the user
- * pick a file, resets the sequence counts stored with both halves of the
- * panel, reads the file with seq_input(FALSE) and redisplays the panel. */
-static void OpenSeqFile (IteM item)
-{
-    panel_data pd;
-    int loaded;
-
-    if (nseqs>0 &&
-        Message(MSG_YN,"Replace existing sequences ?")==ANS_NO)
-        return;
-
-    if (!GetInputFileName (filename,FILENAMELEN,"","")) return;
-    strcpy(seqname,filename);
-
-    /* forget the sequences currently attached to the name list ... */
-    GetPanelExtra(seq_panel.names,&pd);
-    pd.nseqs=0;
-    pd.vseqs=0;
-    SetPanelExtra(seq_panel.names,&pd);
-    /* ... and to the sequence display */
-    GetPanelExtra(seq_panel.seqs,&pd);
-    pd.nseqs=0;
-    pd.vseqs=0;
-    SetPanelExtra(seq_panel.seqs,&pd);
-
-    loaded=seq_input(FALSE);
-    if (loaded>0)
-    {
-        load_aln(seq_panel,0,nseqs-1,TRUE);
-        ncutseqs=0;
-        info("File %s loaded.",seqname);
-    }
-    else
-    {
-        info("File %s not loaded.",seqname);
-    }
-}
-
-/* Menu callback: append the sequences of a user-chosen file to those
- * already loaded (seq_input(TRUE)) and redisplay the sequence panel. */
-static void AppendSeqFile (IteM item)
-{
-    panel_data pd;
-    int loaded;
-
-    if (!GetInputFileName (filename,FILENAMELEN,"","")) return;
-    strcpy(seqname,filename);
-
-    /* only the name list's sequence count is reset here */
-    GetPanelExtra(seq_panel.names,&pd);
-    pd.nseqs=0;
-    SetPanelExtra(seq_panel.names,&pd);
-
-    loaded=seq_input(TRUE);
-    if (loaded>0)
-    {
-        load_aln(seq_panel,0,nseqs-1,FALSE);
-        info("File %s appended.",seqname);
-    }
-    else
-    {
-        info("File %s not loaded.",seqname);
-    }
-}
-
-/* Menu callback: load a file as profile 1.
- *
- * When profile 2 is already loaded its sequences (names, titles, lengths,
- * output order and residues) are copied aside first, because they are
- * stored after profile 1 in the global arrays; after profile 1 has been
- * read they are put back behind the new profile 1.  Both profile panels
- * are redisplayed afterwards.
- *
- * The temporary copy is now released on every path, including when
- * profile_input() fails (it used to be leaked in that case).
- */
-static void OpenPrf1File (IteM item)
-{
-    int i,j,k,n,tmpn=0,tmpfs=0;
-    sint *tmplen_array=NULL;
-    sint *tmpindex=NULL;
-    char **tmp_array=NULL;
-    char **tmpnames=NULL;
-    char **tmptitles=NULL;
-    Boolean saved=FALSE;
-    panel_data data;
-
-    if (profile1_nseqs>0)
-    {
-        if (Message(MSG_YN,"Replace existing sequences ?")==ANS_NO)
-            return;
-    }
-
-    if (!GetInputFileName (filename,FILENAMELEN,"","")) return;
-
-    if(!profile2_empty)
-    {
-        /* set profile 2 aside; slot k holds old sequence profile1_nseqs+1+k */
-        saved=TRUE;
-        tmpn=nseqs-profile1_nseqs;
-        tmpfs=profile1_nseqs;
-        tmpnames=(char **)ckalloc((tmpn+1)*sizeof(char *));
-        tmptitles=(char **)ckalloc((tmpn+1)*sizeof(char *));
-        tmplen_array=(sint *)ckalloc((tmpn+1)*sizeof(sint));
-        tmpindex=(sint *)ckalloc((tmpn+1)*sizeof(sint));
-        tmp_array=(char **)ckalloc((tmpn+1)*sizeof(char *));
-        for(i=profile1_nseqs+1;i<=nseqs;i++)
-        {
-            k=i-profile1_nseqs-1;
-            tmpnames[k]=(char *)ckalloc((MAXNAMES+2)*sizeof(char));
-            tmptitles[k]=(char *)ckalloc((MAXTITLES+2)*sizeof(char));
-            strcpy(tmpnames[k],names[i]);
-            strcpy(tmptitles[k],titles[i]);
-            tmplen_array[k]=seqlen_array[i];
-            tmpindex[k]=output_index[i]-tmpfs+profile1_nseqs;
-            tmp_array[k]=(char *)ckalloc((seqlen_array[i]+2)*sizeof(char));
-            for(j=1;j<=seqlen_array[i];j++)
-                tmp_array[k][j]=seq_array[i][j];
-        }
-    }
-
-    strcpy(seqname,filename);
-    GetPanelExtra(prf_panel[0].names,&data);
-    data.nseqs=0;
-    data.vseqs=0;
-    SetPanelExtra(prf_panel[0].names,&data);
-    GetPanelExtra(prf_panel[0].seqs,&data);
-    data.nseqs=0;
-    data.vseqs=0;
-    SetPanelExtra(prf_panel[0].seqs,&data);
-    profile_no = 1;
-    n=profile_input();
-    if (n<=0)
-    {
-        /* nothing was loaded: drop the saved copy instead of leaking it */
-        if (saved)
-        {
-            for(k=0;k<tmpn;k++)
-            {
-                ckfree(tmpnames[k]);
-                ckfree(tmptitles[k]);
-                ckfree(tmp_array[k]);
-            }
-            ckfree(tmpnames);
-            ckfree(tmptitles);
-            ckfree(tmplen_array);
-            ckfree(tmpindex);
-            ckfree(tmp_array);
-        }
-        info("File %s not loaded.",seqname);
-        return;
-    }
-    strcpy(profile1_name,seqname);
-    load_aln(prf_panel[0],0,profile1_nseqs-1,TRUE);
-
-    if(tmpn!=0)
-    {
-        /* put the saved profile 2 back behind the new profile 1 */
-        nseqs=tmpn+profile1_nseqs;
-        realloc_aln(profile1_nseqs+1,nseqs);
-        for(i=profile1_nseqs+1;i<=nseqs;i++)
-        {
-            k=i-profile1_nseqs-1;
-            names[i]=(char *)ckalloc((MAXNAMES+2)*sizeof(char));
-            titles[i]=(char *)ckalloc((MAXTITLES+2)*sizeof(char));
-
-            strcpy(names[i],tmpnames[k]);
-            ckfree(tmpnames[k]);
-            strcpy(titles[i],tmptitles[k]);
-            ckfree(tmptitles[k]);
-            seqlen_array[i]=tmplen_array[k];
-            /* shift the output order by the change in profile 1's size */
-            output_index[i]=tmpindex[k]-tmpfs+profile1_nseqs;
-            seq_array[i]=(char *)ckalloc((seqlen_array[i]+2)*sizeof(char));
-            for(j=1;j<=seqlen_array[i];j++)
-                seq_array[i][j]=tmp_array[k][j];
-            ckfree(tmp_array[k]);
-        }
-        profile2_empty=FALSE;
-    }
-    if (saved)
-    {
-        ckfree(tmpnames);
-        ckfree(tmptitles);
-        ckfree(tmplen_array);
-        ckfree(tmpindex);
-        ckfree(tmp_array);
-    }
-    load_aln(prf_panel[1],profile1_nseqs,nseqs-1,TRUE);
-
-    ncutseqs=0;
-
-    info("File %s loaded.",profile1_name);
-}
-
-/* Menu callback: load a file as profile 2.
- * Requires profile 1 to be present, asks for confirmation when profile 2
- * already holds sequences, resets the sequence counts stored with both
- * halves of the profile 2 panel, reads the file with profile_input() and
- * redisplays the panel. */
-static void OpenPrf2File (IteM item)
-{
-    panel_data pd;
-    int loaded;
-
-    if(profile1_empty)
-    {
-        error("You must load profile 1 first.");
-        return;
-    }
-
-    if (nseqs>profile1_nseqs &&
-        Message(MSG_YN,"Replace existing sequences ?")==ANS_NO)
-        return;
-
-    if (!GetInputFileName (filename,FILENAMELEN,"","")) return;
-    strcpy(seqname,filename);
-
-    /* forget the sequences currently attached to the name list ... */
-    GetPanelExtra(prf_panel[1].names,&pd);
-    pd.nseqs=0;
-    pd.vseqs=0;
-    SetPanelExtra(prf_panel[1].names,&pd);
-    /* ... and to the sequence display */
-    GetPanelExtra(prf_panel[1].seqs,&pd);
-    pd.nseqs=0;
-    pd.vseqs=0;
-    SetPanelExtra(prf_panel[1].seqs,&pd);
-
-    profile_no = 2;
-    loaded=profile_input();
-    if (loaded<=0)
-    {
-        info("File %s not loaded.",seqname);
-        return;
-    }
-
-    strcpy(profile2_name,seqname);
-    ncutseqs=0;
-    load_aln(prf_panel[1],profile1_nseqs,nseqs-1,TRUE);
-
-    info("File %s loaded.",profile2_name);
-}
-
-
-/* Menu callback: switch the display to black and white.
- * Sets ncolors to 1, recolours the visible panel(s) and updates the three
- * colour-scheme menu items so that only "black and white" is checked. */
-static void BlackandWhite(IteM item)
-{
-    ncolors=1;
-
-    if (aln_mode != MULTIPLEM)
-    {
-        color_prf1();
-        color_prf2();
-    }
-    else
-    {
-        color_seqs();
-    }
-
-    usebw=TRUE;
-    usedefcolors=FALSE;
-    useusercolors=FALSE;
-
-    SetStatus(bw_item,usebw);
-    SetStatus(defcol_item,usedefcolors);
-    SetStatus(usercol_item,useusercolors);
-
-    info("Done.");
-}
-
-
-/* Menu callback: switch back to the default colour parameter file.
- * Discards any user-chosen parameter file name, looks up the default DNA
- * or protein file depending on dnaflag, reloads the colour parameters,
- * recolours the visible panel(s) and updates the colour-scheme menu items
- * so that only "default colours" is checked. */
-static void DefColorPar(IteM item)
-{
-    if (explicit_par_file != NULL)
-        ckfree(explicit_par_file);
-    explicit_par_file=NULL;
-
-    if (!dnaflag)
-        par_file=find_file(def_protpar_file);
-    else
-        par_file=find_file(def_dnapar_file);
-    init_color_parameters(par_file);
-
-    if (aln_mode != MULTIPLEM)
-    {
-        color_prf1();
-        color_prf2();
-    }
-    else
-    {
-        color_seqs();
-    }
-
-    usebw=FALSE;
-    usedefcolors=TRUE;
-    useusercolors=FALSE;
-
-    SetStatus(bw_item,usebw);
-    SetStatus(defcol_item,usedefcolors);
-    SetStatus(usercol_item,useusercolors);
-
-    info("Done.");
-}
-
-/* Menu callback for the "reset new gaps" item: stores the item's state in
- * reset_alignments_new and, when it was switched on, switches the "reset
- * all gaps" option and its menu item off. */
-void set_reset_new_gaps(IteM i)
-{
-    reset_alignments_new=GetStatus(i);
-    if(reset_alignments_new!=TRUE)
-        return;
-
-    reset_alignments_all=FALSE;
-    SetStatus(all_gaps_item,reset_alignments_all);
-}
-/* Menu callback for the "reset all gaps" item: stores the item's state in
- * reset_alignments_all and, when it was switched on, switches the "reset
- * new gaps" option and its menu item off. */
-void set_reset_all_gaps(IteM i)
-{
-    reset_alignments_all=GetStatus(i);
-    if(reset_alignments_all!=TRUE)
-        return;
-
-    reset_alignments_new=FALSE;
-    SetStatus(new_gaps_item,reset_alignments_new);
-}
-
-
-static void OpenColorParWin(IteM item)
-{ /* Menu callback: show the "Input Color File" dialog, pre-filled with the
-   * current explicit_par_file (which may be NULL); OK runs OpenColorPar(). */
- read_file_window("Input Color File","COLOR PARAMETER FILE NAME:",explicit_par_file,OpenColorPar);
-}
-
-/* OK-button callback of the "Input Color File" dialog.
- *
- * Lets the user pick a colour parameter file with the file chooser, stores
- * its name in par_file and explicit_par_file, loads the colour parameters,
- * removes the dialog, recolours the visible panel(s) and updates the
- * colour-scheme menu items so that only "user colours" is checked.
- *
- * The chosen name is read into a local buffer first, so cancelling the
- * file chooser leaves par_file untouched (previously it had already been
- * freed and replaced by an empty buffer at that point).
- */
-static void OpenColorPar(ButtoN but)
-{
-    char chosen[FILENAMELEN]="";
-
-    /*<ramu> this might do to open a file selection window */
-    if (!GetInputFileName (chosen,FILENAMELEN,"par","")) return;
-
-    if (par_file != NULL)
-        ckfree(par_file);
-    par_file=(char *)ckalloc(FILENAMELEN*sizeof(char));
-    strcpy(par_file,chosen);
-
-    if (explicit_par_file != NULL)
-        ckfree(explicit_par_file);
-    explicit_par_file=(char *)ckalloc(FILENAMELEN*sizeof(char));
-    strcpy(explicit_par_file,par_file);
-
-    info("Loading color file: %s\n",par_file);
-    init_color_parameters(par_file);
-    if (Visible(readfilew))
-    {
-        Remove(readfilew);
-        readfilew=NULL;
-    }
-    if (aln_mode == MULTIPLEM)
-        color_seqs();
-    else
-    {
-        color_prf1();
-        color_prf2();
-    }
-    usebw=FALSE;
-    usedefcolors=FALSE;
-    useusercolors=TRUE;
-    SetStatus(bw_item,usebw);
-    SetStatus(defcol_item,usedefcolors);
-    SetStatus(usercol_item,useusercolors);
-    info("Done.");
-}
-
-/* Menu callback: remove the alignment columns that contain only gaps.
- * In multiple-alignment mode this is done for all sequences at once; in
- * profile mode it is done separately for profile 1 and profile 2.  The
- * affected panel(s) are redisplayed.  No confirmation is requested.
- * (The unused locals of the earlier version have been dropped.)
- */
-static void RemoveGapPos(IteM item)
-{
-    if (nseqs==0)
-    {
-        Message(MSG_OK,"No file loaded.");
-        return;
-    }
-
-    if(aln_mode==MULTIPLEM)
-    {
-        remove_gap_pos(1,nseqs,0);
-        load_aln(seq_panel,0,nseqs-1,FALSE);
-    }
-    else
-    {
-        remove_gap_pos(1,profile1_nseqs,1);
-        load_aln(prf_panel[0],0,profile1_nseqs-1,FALSE);
-        remove_gap_pos(profile1_nseqs+1,nseqs,2);
-        load_aln(prf_panel[1],profile1_nseqs,nseqs-1,FALSE);
-    }
-    info("All the columns that contains only the gaps, are removed!");
-}
-
-
-/* Menu callback: strip every gap from the sequences selected in the
- * active panel.  Requires a loaded file and at least one selected
- * sequence, and asks for confirmation first.  Each selected sequence is
- * compacted in place, its length updated, and the panel redisplayed and
- * marked as modified. */
-static void RemoveGaps(IteM item)
-{
-    int k,seq,src,dst;
-    panel_data data;
-    Boolean any_selected=FALSE;
-
-    if (nseqs==0)
-    {
-        Message(MSG_OK,"No file loaded.");
-        return;
-    }
-
-    GetPanelExtra(active_panel.names,&data);
-    for (k=0;k<data.nseqs && any_selected==FALSE;k++)
-    {
-        if(data.selected[k]==TRUE)
-            any_selected=TRUE;
-    }
-    if(any_selected==FALSE)
-    {
-        Message(MSG_OK,"Select sequences by clicking on the names.");
-        return;
-    }
-
-    if (Message(MSG_YN,"Remove gaps from selected sequences ?")==ANS_NO)
-        return;
-
-    for (k=0;k<data.nseqs;k++)
-    {
-        if(data.selected[k]!=TRUE)
-            continue;
-
-        /* panel entry k is sequence firstseq+1+k in the global arrays */
-        seq=data.firstseq+1+k;
-        dst=0;
-        for(src=1;src<=seqlen_array[seq];++src)
-        {
-            if((seq_array[seq][src] != gap_pos1) &&
-               (seq_array[seq][src] != gap_pos2))
-            {
-                ++dst;
-                seq_array[seq][dst]=seq_array[seq][src];
-            }
-        }
-        seq_array[seq][dst+1]=-3;
-        seqlen_array[seq]=dst;
-    }
-
-    load_aln(active_panel,data.firstseq,data.firstseq+data.nseqs-1,FALSE);
-    active_panel.modified=TRUE;
-    info("Gaps in selected sequences removed.");
-}
-
-/* Frees a table of `count` saved strings together with the table itself;
- * a NULL table is ignored. */
-static void cs_free_saved_list(char **list,int count)
-{
-    int k;
-
-    if (list == NULL) return;
-    for (k=0; k<count; k++)
-    {
-        if (list[k]!=NULL) ckfree(list[k]);
-    }
-    ckfree(list);
-}
-
-/*
- * Menu callback: moves the sequences selected in the active panel into the
- * save (cut) buffers, replacing whatever was cut before.  Afterwards the row
- * at the position of the first cut sequence (clamped to the last remaining
- * row) is selected and the panel is marked as modified.
- */
-static void CutSequences(IteM item)
-{
-    int k;
-    int first=-1;
-    panel_data data;
-
-    if (nseqs==0)
-    {
-        Message(MSG_OK,"No file loaded.");
-        return;
-    }
-
-    /* remember the first selected row; bail out if nothing is selected */
-    GetPanelExtra(active_panel.names,&data);
-    for (k=0; k<data.nseqs && first<0; k++)
-    {
-        if (data.selected[k]==TRUE)
-            first=k;
-    }
-    if (first<0)
-    {
-        Message(MSG_OK,"Select sequences to be cut by clicking on the names.");
-        return;
-    }
-
-    /* throw away the previous contents of the cut buffers */
-    if (saveseqlen_array!=NULL) ckfree(saveseqlen_array);
-    cs_free_saved_list(saveseq_array,ncutseqs);
-    cs_free_saved_list(savetitles,ncutseqs);
-    cs_free_saved_list(savenames,ncutseqs);
-    ncutseqs=0;
-
-    /* fresh buffers, large enough for every row of the panel */
-    savenames=(char **)ckalloc((data.nseqs+1) * sizeof(char *));
-    savetitles=(char **)ckalloc((data.nseqs+1) * sizeof(char *));
-    saveseq_array=(char **)ckalloc((data.nseqs+1) * sizeof(char *));
-    saveseqlen_array=(sint *)ckalloc((data.nseqs+1) * sizeof(sint));
-    for (k=0; k<data.nseqs; k++)
-    {
-        savenames[k]=NULL;
-        savetitles[k]=NULL;
-        saveseq_array[k]=NULL;
-    }
-
-    switch (data.prf_no)
-    {
-    case 0:
-        cut_multiplem();
-        break;
-    case 1:
-        cut_profile1();
-        break;
-    case 2:
-        cut_profile2();
-        break;
-    default:
-        break;
-    }
-
-    /* re-read the panel state and keep one row selected */
-    GetPanelExtra(active_panel.names,&data);
-    if (first>=data.nseqs) first=data.nseqs-1;
-    if (data.nseqs>0)
-        data.selected[first]=TRUE;
-    SetPanelExtra(active_panel.names,&data);
-    DrawPanel(active_panel.names);
-
-    active_panel.modified=TRUE;
-    info("Cut %d sequences.",ncutseqs);
-}
-
-/*
- * Cuts the selected rows of the active panel in multiple-alignment mode:
- * each selected sequence is saved with ssave() and the following sequences
- * are shifted up by one.  Rows are visited bottom-up.  Updates nseqs/empty
- * and reloads the panel when something was cut.
- */
-static void cut_multiplem(void)
-{
-    int row,k;
-    panel_data data;
-
-    GetPanelExtra(active_panel.names,&data);
-    for (row=data.nseqs-1; row>=0; row--)
-    {
-        if (data.selected[row]!=TRUE)
-            continue;
-        ssave(row+1);
-        for (k=row+1; k<data.nseqs; k++)
-            sscpy(k,k+1);
-    }
-
-    nseqs-=ncutseqs;
-    if (nseqs<=0) empty=TRUE;
-
-    if (ncutseqs>0)
-    {
-        /* last argument is TRUE only when the remaining sequences fit in data.vseqs */
-        load_aln(active_panel,0,nseqs-1,(nseqs<=data.vseqs) ? TRUE : FALSE);
-    }
-}
-
-/*
- * Cuts the selected rows of profile 1 (the active panel).  Every sequence
- * below a cut one, including those of profile 2, is shifted up by one.
- * Updates the profile 1 and total counts and the empty flags, then reloads
- * profile 1 and, when it is not empty, profile 2.
- */
-static void cut_profile1(void)
-{
-    int row,k;
-    panel_data data;
-
-    GetPanelExtra(active_panel.names,&data);
-    for (row=data.nseqs-1; row>=0; row--)
-    {
-        if (data.selected[row]!=TRUE)
-            continue;
-        ssave(row+1);
-        for (k=row+1; k<nseqs; k++)
-            sscpy(k,k+1);
-    }
-
-    profile1_nseqs-=ncutseqs;
-    nseqs-=ncutseqs;
-    if (profile1_nseqs<=0) profile1_empty=TRUE;
-    if (nseqs<=0) empty=TRUE;
-
-    if (ncutseqs<=0)
-        return;
-
-    /* last argument is TRUE only when profile 1 now fits in data.vseqs */
-    load_aln(active_panel,0,profile1_nseqs-1,
-             (profile1_nseqs<=data.vseqs) ? TRUE : FALSE);
-    if (!profile2_empty)
-        load_aln(prf_panel[1],profile1_nseqs,nseqs-1,FALSE);
-}
-
-/*
- * Cuts the selected rows of profile 2 (the active panel).  Panel rows are
- * offset by profile1_nseqs in the global sequence arrays.  Each selected
- * sequence is saved with ssave() and the sequences after it are shifted up.
- * Updates nseqs and the empty flags and reloads the panel.
- *
- * The last load_aln() argument now follows the same rule as cut_multiplem()
- * and cut_profile1(): TRUE when the remaining sequences fit in data.vseqs,
- * FALSE otherwise.  The two values used to be swapped here.
- * NOTE(review): load_aln() is not visible from this block - confirm that the
- * sibling functions show the intended polarity.
- */
-static void cut_profile2(void)
-{
-    int i,j;
-    int remaining;
-    panel_data data;
-
-    GetPanelExtra(active_panel.names,&data);
-    for (i=data.nseqs;i>0;i--)
-    {
-        if(data.selected[i-1]==TRUE)
-        {
-            ssave(i+profile1_nseqs);
-            for(j=i+profile1_nseqs;j<nseqs;j++)
-                sscpy(j,j+1);
-        }
-    }
-    nseqs-=ncutseqs;
-    remaining=nseqs-profile1_nseqs;
-    if (remaining<=0) profile2_empty=TRUE;
-    if (nseqs<=0) empty=TRUE;
-    if (ncutseqs>0)
-    {
-        if(remaining<=data.vseqs)
-            load_aln(active_panel,profile1_nseqs,nseqs-1,TRUE);
-        else
-            load_aln(active_panel,profile1_nseqs,nseqs-1,FALSE);
-    }
-}
-
-/*
- * Menu callback: inserts the sequences held in the cut buffers into the
- * active panel, directly after the lowest selected row (or at the top when
- * the panel has no rows).  Existing sequences below the insertion point are
- * shifted down first; the counts and empty flags of the affected alignment
- * or profile are updated and the panels reloaded.
- */
-static void PasteSequences(IteM item)
-{
-    int after;            /* index after which the sequences are inserted */
-    int last;             /* last occupied slot that has to be shifted down */
-    int k,slot,npasted;
-    panel_data data;
-
-    if (ncutseqs<=0)
-    {
-        Message(MSG_OK,"No sequences available for pasting.\n"
-        " Cut selected sequences first.");
-        return;
-    }
-
-    GetPanelExtra(active_panel.names,&data);
-    npasted=ncutseqs;
-    after=-1;
-    if (data.nseqs>0)
-    {
-        for (k=data.nseqs-1; k>=0 && after==-1; k--)
-        {
-            if (data.selected[k]==TRUE)
-                after=k;
-        }
-        if (after==-1)
-        {
-            Message(MSG_OK,"Select a sequence by clicking on the name.\n"
-            " Cut sequences will be pasted after this one.");
-            return;
-        }
-    }
-
-    if (data.prf_no == 2)
-    {
-        /* profile 2 rows sit behind profile 1 in the global arrays */
-        after += profile1_nseqs;
-        last=profile1_nseqs+data.nseqs;
-    }
-    else
-        last=nseqs;
-
-    /* open a gap of ncutseqs slots, then refill it from the cut buffers */
-    for (k=last; k>after+1; k--)
-        sscpy(k+ncutseqs,k);
-    slot=after+2;
-    while (ncutseqs>0)
-    {
-        sload(slot);      /* sload() decrements ncutseqs */
-        slot++;
-    }
-
-    switch (data.prf_no)
-    {
-    case 0:
-        nseqs=data.nseqs+npasted;
-        if (nseqs>0) empty=FALSE;
-        load_aln(seq_panel,0,nseqs-1,FALSE);
-        break;
-    case 1:
-        profile1_nseqs=data.nseqs+npasted;
-        nseqs+=npasted;
-        if (profile1_nseqs>0) profile1_empty=FALSE;
-        load_aln(active_panel,0,profile1_nseqs-1,FALSE);
-        if (!profile2_empty)
-            load_aln(prf_panel[1],profile1_nseqs,nseqs-1,FALSE);
-        break;
-    case 2:
-        nseqs=profile1_nseqs+data.nseqs+npasted;
-        if (profile1_nseqs<nseqs)
-        {
-            profile2_empty=FALSE;
-            empty=FALSE;
-        }
-        load_aln(prf_panel[1],profile1_nseqs,nseqs-1,FALSE);
-        break;
-    default:
-        break;
-    }
-
-    active_panel.modified=TRUE;
-    info("Pasted %d sequences.",npasted);
-}
-
-/*
- * Appends sequence j of the clustal arrays (name, title, length, residues)
- * to the save buffers at slot ncutseqs and increments ncutseqs.  Anything
- * already stored in that slot is freed first.
- */
-static void ssave(int j)
-{
-    int pos;
-    int slot=ncutseqs;
-
-    if (saveseq_array[slot] != NULL) ckfree(saveseq_array[slot]);
-    if (savenames[slot] != NULL) ckfree(savenames[slot]);
-    if (savetitles[slot] != NULL) ckfree(savetitles[slot]);
-
-    savenames[slot]=(char *)ckalloc((MAXNAMES+2)*sizeof(char));
-    savetitles[slot]=(char *)ckalloc((MAXTITLES+2)*sizeof(char));
-    strcpy(savenames[slot],names[j]);
-    strcpy(savetitles[slot],titles[j]);
-
-    saveseqlen_array[slot]=seqlen_array[j];
-    saveseq_array[slot]=(char *)ckalloc((seqlen_array[j]+2)*sizeof(char));
-    pos=1;
-    while (pos<=seqlen_array[j])
-    {
-        saveseq_array[slot][pos]=seq_array[j][pos];
-        pos++;
-    }
-    saveseq_array[slot][pos]= -3;   /* end-of-sequence marker */
-
-    ncutseqs=slot+1;
-}
-
-/*
- * Copies the sequence at position j of the clustal arrays over position i
- * (i is the destination, j the source): name, title, length and residues.
- * The destination buffer is resized with realloc_seq() before the copy.
- */
-static void sscpy(int i,int j)
-{
-    int pos;
-
-    strcpy(names[i],names[j]);
-    strcpy(titles[i],titles[j]);
-
-    seqlen_array[i]=seqlen_array[j];
-    realloc_seq(i,seqlen_array[i]);
-
-    pos=1;
-    while (pos<=seqlen_array[j])
-    {
-        seq_array[i][pos]=seq_array[j][pos];
-        pos++;
-    }
-    seq_array[i][pos]= -3;   /* end-of-sequence marker */
-}
-
-/*
- * Pops the most recently saved sequence off the save buffers (decrementing
- * ncutseqs) and stores it at position i of the clustal arrays.  Does nothing
- * when the save buffers are empty.
- */
-static void sload(int i)
-{
-    int pos,top;
-
-    if (ncutseqs<1) return;
-
-    top=--ncutseqs;
-    strcpy(names[i],savenames[top]);
-    strcpy(titles[i],savetitles[top]);
-
-    seqlen_array[i]=saveseqlen_array[top];
-    realloc_seq(i,seqlen_array[i]);
-    pos=1;
-    while (pos<=seqlen_array[i])
-    {
-        seq_array[i][pos]=saveseq_array[top][pos];
-        pos++;
-    }
-    seq_array[i][pos]= -3;   /* end-of-sequence marker */
-}
-
-/* Menu callback: selects every sequence of the multiple-alignment panel. */
-static void SelectSeqs(IteM item)
-{
-    (void)item;
-    select_seqs(seq_panel,TRUE);
-}
-
-/* Menu callback: selects every sequence of the profile 1 panel. */
-static void SelectPrf1(IteM item)
-{
-    (void)item;
-    select_seqs(prf_panel[0],TRUE);
-}
-
-/* Menu callback: selects every sequence of the profile 2 panel. */
-static void SelectPrf2(IteM item)
-{
-    (void)item;
-    select_seqs(prf_panel[1],TRUE);
-}
-
-/*
- * Menu callback: makes all loaded sequences part of profile 1, marks
- * profile 2 as empty, reloads both profile panels and makes the profile 1
- * panel the active one.  Reports an error when profile 2 is not loaded.
- */
-static void MergeProfiles(IteM item)
-{
-    if (!profile2_empty)
-    {
-        profile_no=1;
-        profile1_nseqs=nseqs;
-        profile2_empty=TRUE;
-
-        load_aln(prf_panel[0],0,profile1_nseqs-1,FALSE);
-        load_aln(prf_panel[1],profile1_nseqs,nseqs-1,FALSE);
-        active_panel=prf_panel[0];
-
-        info("Added Profile 2 to Profile 1.");
-    }
-    else
-    {
-        error("Profile 2 not loaded");
-    }
-}
-
-
-/*
- * Menu callback: clears the selected residue range - in the single panel in
- * multiple-alignment mode, otherwise in profile 2 and then profile 1.
- */
-static void ClearSeqRange(IteM item)
-{
-    if (aln_mode==MULTIPLEM)
-    {
-        clear_seqrange(seq_panel);
-        return;
-    }
-    clear_seqrange(prf_panel[1]);
-    clear_seqrange(prf_panel[0]);
-}
-
-/*
- * Menu callback: deselects all sequences - in the single panel in
- * multiple-alignment mode, otherwise in profile 2 and then profile 1.
- */
-static void ClearSeqs(IteM item)
-{
-    if (aln_mode==MULTIPLEM)
-    {
-        select_seqs(seq_panel,FALSE);
-        return;
-    }
-    select_seqs(prf_panel[1],FALSE);
-    select_seqs(prf_panel[0],FALSE);
-}
-
-/*
- * Resets the residue-range selection stored with panel p (firstsel and
- * lastsel become -1) and redraws the previously selected range with the
- * NORMAL highlight.
- */
-static void clear_seqrange(spanel p)
-{
-    panel_data data;
-    int old_first,old_last;
-
-    GetPanelExtra(p.seqs,&data);
-    old_first=data.firstsel;
-    old_last=data.lastsel;
-
-    data.firstsel=-1;
-    data.lastsel=-1;
-    SetPanelExtra(p.seqs,&data);
-
-    highlight_seqrange(p.seqs,old_first,old_last,NORMAL);
-}
-
-/*
- * Sets the selection state of every sequence name in panel p to `flag` and
- * redraws the names.  When selecting (flag TRUE) the panel also becomes the
- * active panel.  A panel without sequences is left untouched.
- */
-static void select_seqs(spanel p,Boolean flag)
-{
-    panel_data data;
-    int row;
-
-    GetPanelExtra(p.names,&data);
-    if (data.nseqs != 0)
-    {
-        row=0;
-        while (row<data.nseqs)
-        {
-            data.selected[row]=flag;
-            row++;
-        }
-
-        SetPanelExtra(p.names,&data);
-        draw_names(p.names);
-        if (flag==TRUE)
-            active_panel=p;
-    }
-}
-
-/*
- * Menu callback: opens the "Complete Alignment" dialog.  Requires loaded
- * sequences and more than one of them; otherwise an error is reported.
- */
-static void CAlignWin (IteM item)
-{
-    if (empty)
-    {
-        error("No sequences loaded");
-    }
-    else if (nseqs <= 1)
-    {
-        error("Alignment has only %d sequences",nseqs);
-    }
-    else
-    {
-        do_align_window(&calignw,&ctreetext,NEW,"Complete Alignment",CompleteAlign);
-    }
-}
-
-/*
- * ALIGN-button callback of the "Complete Alignment" dialog.  Takes the guide
- * tree file name from the dialog, opens the alignment output files, runs
- * align(), closes the log file if one is open, reloads the alignment into
- * the sequence panel and reports the value returned by cputime().
- * Returns early, leaving the dialog open, when the output files cannot be
- * opened.
- */
-void CompleteAlign(ButtoN but)
-{
-    char phylip_name[FILENAMELEN];
-
-    GetTitle(ctreetext, filename, FILENAMELEN);
-    stripspace(filename);
-
-    strcpy(phylip_name,filename);
-
-    if (!open_aln_files()) return;
-
-    WatchCursor();
-    if (Visible(calignw))
-    {
-        Remove(calignw);
-        calignw=NULL;
-    }
-    cputime(0); /* Start timing ........ Ramu */
-
-    align(phylip_name);
-
-    if(save_log && save_log_fd!=NULL)
-    {
-        fclose(save_log_fd);
-        save_log_fd=NULL;
-    }
-    /* reload the sequences from the output file (so that the sequence order
-       is correct - either INPUT or ALIGNED), don't output messages */
-    reload_alignment();
-
-    load_aln(seq_panel,0,nseqs-1,FALSE);
-    ArrowCursor();
-    info("\n Elapsed time : %7.2f Secs ",cputime(0)); /* Ramu */
-}
-
-/*
- * Menu callback: opens the "Realign Sequences" dialog.  Requires loaded
- * sequences, more than one of them, and at least one sequence selected in
- * the sequence panel.
- */
-static void RealignSeqsWin (IteM item)
-{
-    int row;
-    Boolean any_selected=FALSE;
-    panel_data data;
-
-    if (empty)
-    {
-        error("No sequences loaded");
-        return;
-    }
-    if (nseqs <= 1)
-    {
-        error("Alignment has only %d sequences",nseqs);
-        return;
-    }
-
-    /* check some sequences have been selected */
-    GetPanelExtra(seq_panel.names,&data);
-    for (row=0; row<data.nseqs && any_selected==FALSE; row++)
-    {
-        if (data.selected[row]==TRUE)
-            any_selected=TRUE;
-    }
-    if (any_selected==FALSE)
-    {
-        Message(MSG_OK,"Select sequences to be realigned\n"
-        "by clicking on the names.");
-        return;
-    }
-
-    do_align_window(&ralignw,&rtreetext,NEW,"Realign Sequences",RealignSeqs);
-}
-
-/* Frees a table of `count` saved strings together with the table itself;
- * a NULL table is ignored. */
-static void rs_free_saved_list(char **list,int count)
-{
-    int k;
-
-    if (list == NULL) return;
-    for (k=0; k<count; k++)
-    {
-        if (list[k]!=NULL) ckfree(list[k]);
-    }
-    ckfree(list);
-}
-
-/*
- * ALIGN-button callback of the "Realign Sequences" dialog.
- *
- * The selected sequences are cut out of the alignment and appended again
- * behind the unselected ones; the unselected sequences then form profile 1
- * and new_sequence_align() is run on the result.  Afterwards the alignment
- * is reloaded and the realigned sequences (now the last ones) are selected.
- *
- * Fix: when every sequence was selected the cut step set `empty` to TRUE and
- * nothing cleared it again once the sequences had been put back; it is now
- * reset as soon as nseqs is restored (as PasteSequences() does).
- */
-static void RealignSeqs(ButtoN but)
-{
-    int insert;
-    int i,j,n;
-    panel_data data;
-    char phylip_name[FILENAMELEN];
-
-    GetTitle(rtreetext, filename, FILENAMELEN);
-    stripspace(filename);
-
-    strcpy(phylip_name,filename);
-
-    if (!open_aln_files()) return;
-
-    /* cut selected sequences */
-
-    GetPanelExtra(seq_panel.names,&data);
-
-    if (saveseqlen_array!=NULL) ckfree(saveseqlen_array);
-    rs_free_saved_list(saveseq_array,ncutseqs);
-    rs_free_saved_list(savetitles,ncutseqs);
-    rs_free_saved_list(savenames,ncutseqs);
-    ncutseqs=0;
-
-    savenames=(char **)ckalloc((data.nseqs+1) * sizeof(char *));
-    savetitles=(char **)ckalloc((data.nseqs+1) * sizeof(char *));
-    saveseq_array=(char **)ckalloc((data.nseqs+1) * sizeof(char *));
-    saveseqlen_array=(sint *)ckalloc((data.nseqs+1) * sizeof(sint));
-    for(i=0;i<data.nseqs;i++)
-    {
-        savenames[i]=NULL;
-        savetitles[i]=NULL;
-        saveseq_array[i]=NULL;
-    }
-    for (i=data.nseqs;i>0;i--)
-    {
-        if(data.selected[i-1]==TRUE)
-        {
-            ssave(i);
-            for(j=i;j<data.nseqs;j++)
-                sscpy(j,j+1);
-        }
-    }
-    nseqs=data.nseqs-ncutseqs;
-    if (nseqs<=0) empty=TRUE;
-
-    /* paste selected sequences at the end; nothing lies behind the insertion
-       point, so no existing sequence has to be shifted */
-    n=ncutseqs;
-    profile1_nseqs=nseqs;
-    insert=profile1_nseqs-1;
-
-    for(i=1;ncutseqs>0;i++)
-        sload(insert+i+1);
-
-    nseqs=profile1_nseqs+n;
-    if (nseqs>0) empty=FALSE;
-
-    /* align profile 2 sequences to profile 1 */
-    WatchCursor();
-    if (Visible(ralignw))
-    {
-        Remove(ralignw);
-        ralignw=NULL;
-    }
-    new_sequence_align(phylip_name);
-
-    if(save_log && save_log_fd!=NULL)
-    {
-        fclose(save_log_fd);
-        save_log_fd=NULL;
-    }
-    /* reload the sequences from the output file (so that the sequence order
-       is correct - either INPUT or ALIGNED */
-    reload_alignment();
-    load_aln(seq_panel,0,nseqs-1,FALSE);
-
-    /* select exactly the sequences that were realigned */
-    GetPanelExtra(seq_panel.names,&data);
-    for (i=0;i<profile1_nseqs;i++)
-        data.selected[i]=FALSE;
-    for (i=profile1_nseqs;i<nseqs;i++)
-        data.selected[i]=TRUE;
-    SetPanelExtra(seq_panel.names,&data);
-    draw_names(seq_panel.names);
-    ArrowCursor();
-    info("Selected sequences realigned.");
-}
-
-/* Adds one labelled output-file field to `list`, pre-filled with path+ext,
- * and stores the new text field in *field. */
-static void rr_output_field(GrouP list,char *label,TexT *field,char *path,char *ext)
-{
-    char fname[FILENAMELEN+1];
-
-    make_prompt(list,label);
-    *field=DialogText(list, "", 35, NULL);
-    strcpy(fname,path);
-    strcat(fname,ext);
-    SetTitle(*field, fname);
-    Break(list);
-}
-
-/*
- * Menu callback: builds and shows the "Realign Residue Range" dialog (guide
- * tree file, one field per enabled output format, end-gap-penalty toggle,
- * ALIGN and CANCEL buttons).  Requires loaded sequences, more than one of
- * them, and a selected residue range in the sequence panel.
- *
- * A FASTA field is now created when output_fasta is set, matching
- * do_align_window() and do_palign_window(); without it fasta_outtext kept
- * whatever an earlier dialog had left in it.
- * NOTE(review): assumes open_aln_files() reads fasta_outtext when
- * output_fasta is set - confirm.
- */
-static void RealignSeqRangeWin (IteM item)
-{
-    panel_data data;
-    GrouP aligngr;
-    GrouP output_list;
-    GrouP maing;
-    char name[FILENAMELEN+1];
-    char path[FILENAMELEN+1];
-
-    if (empty)
-    {
-        error("No sequences loaded");
-        return;
-    }
-    if (nseqs <= 1)
-    {
-        error("Alignment has only %d sequences",nseqs);
-        return;
-    }
-
-    /* check a range has been selected */
-
-    GetPanelExtra(seq_panel.seqs,&data);
-    if(data.firstsel==-1)
-    {
-        Message(MSG_OK,"Select residue range to be realigned\n"
-        "by clicking in the sequence display area.");
-        return;
-    }
-
-    get_path(seqname,path);
-    SelectFont(systemFont);
-    stdCharWidth=CharWidth('A');
-    stdLineHeight=LineHeight();
-    rralignw=FixedWindow(-50, -33, -10, -10,"Realign Residue Range",RemoveWin);
-
-    maing=HiddenGroup(rralignw,2,0,NULL);
-    SetGroupSpacing(maing,0,10);
-
-    make_prompt(rralignw, "Output Guide Tree File:");
-    stdLineHeight=18;
-    SelectFont(programFont);
-    Break(rralignw);
-    rrtreetext=DialogText(rralignw, "", 35, NULL);
-    strcpy(name,path);
-    strcat(name,"dnd");
-    SetTitle(rrtreetext, name);
-    Break(rralignw);
-
-    /* one file-name field per enabled output format */
-    make_prompt(rralignw, "Output Alignment Files:");
-    output_list=HiddenGroup(rralignw, 2, 0, NULL);
-    if(output_clustal)
-        rr_output_field(output_list,"Clustal: ",&cl_outtext,path,"aln");
-    if(output_nbrf)
-        rr_output_field(output_list,"NBRF/PIR: ",&pir_outtext,path,"pir");
-    if(output_gcg)
-        rr_output_field(output_list,"GCG/MSF: ",&msf_outtext,path,"msf");
-    if(output_phylip)
-        rr_output_field(output_list,"Phylip: ",&phylip_outtext,path,"phy");
-    if(output_gde)
-        rr_output_field(output_list,"GDE: ",&gde_outtext,path,"gde");
-    if(output_nexus)
-        rr_output_field(output_list,"Nexus: ",&nexus_outtext,path,"nxs");
-    if(output_fasta)
-        rr_output_field(output_list,"Fasta: ",&fasta_outtext,path,"fasta");
-
-    Break(rralignw);
-    make_toggle(rralignw,"Realign Segment End Gap Penalties","ON","OFF",&realign_endgappenalties,set_realign_endgappenalties);
-    Break(rralignw);
-
-    aligngr=HiddenGroup(rralignw, 2, 0, NULL);
-    shift(aligngr, 60, 20);
-    PushButton(aligngr, " ALIGN ", RealignSeqRange);
-    shift(aligngr, 20,0);
-    PushButton(aligngr, "CANCEL", CancelWin);
-
-    Show(rralignw);
-}
-
-static void RealignSeqRange(ButtoN but) /* ALIGN-button callback of the "Realign Residue Range" dialog:
- * copies the whole alignment aside, aligns only the selected column range
- * with align(), then splices the realigned range back between the untouched
- * left and right parts and re-highlights it. */
-{
- int i,j;
- int fs,save_order,length,length1,length2; /* fs: selection start kept for re-highlighting; length: width of the selected range; length1/length2: new total / new range length of the sequence being rebuilt */
- panel_data data;
- sint *tmplen_array; /* tmp_*: copy of the alignment as it was before realigning */
- char **tmp_array;
- sint *newlen_array; /* new_*: copy of the realigned range */
- char **new_array;
- char phylip_name[FILENAMELEN];
-
- GetTitle(rrtreetext, filename, FILENAMELEN);
- stripspace(filename);
-
- strcpy(phylip_name,filename);
-
- if (!open_aln_files()) return;
-
- WatchCursor();
- if (Visible(rralignw))
- {
- Remove(rralignw);
- rralignw=NULL;
- }
-/* save the alignment into a temporary area; rows shorter than data.ncols are padded with gap_pos2 */
-
- GetPanelExtra(seq_panel.seqs,&data);
-
- tmplen_array=(sint *)ckalloc((data.nseqs+2) * sizeof(sint));
- tmp_array=(char **)ckalloc((data.nseqs+2) * sizeof(char *));
- for (i=1;i<=data.nseqs;i++)
- {
- tmplen_array[i]=seqlen_array[i];
- tmp_array[i]=(char *)ckalloc((data.ncols+2) * sizeof(char));
- for(j=1;j<=seqlen_array[i];j++)
- tmp_array[i][j]=seq_array[i][j];
- for(j=seqlen_array[i]+1;j<=data.ncols;j++)
- tmp_array[i][j]=gap_pos2;
- }
-
-/* copy the selected residue range to the clustal alignment arrays */
-
- fs=data.firstsel;
- length=data.lastsel-data.firstsel+1;
- max_aln_length=2*length;
- for (i=1;i<=data.nseqs;i++)
- {
- seqlen_array[i]=length;
- realloc_seq(i,length);
- for(j=data.firstsel;j<=data.lastsel;j++)
- seq_array[i][j-data.firstsel+1]=tmp_array[i][j+1]; /* selection index j maps to array position j+1 (arrays are filled from 1) */
- seq_array[i][j-data.firstsel+1]=-3; /* -3 written one past the last copied residue */
- }
-/* temporarily set the output order to be the same as the input */
- save_order=output_order;
- output_order=INPUT;
-/* set the end gaps penalties */
- endgappenalties=realign_endgappenalties;
-
-/* align the residue range */
- align(phylip_name);
- if(save_log && save_log_fd!=NULL)
- {
- fclose(save_log_fd);
- save_log_fd=NULL;
- }
-
- output_order=save_order;
-/* reset the end gaps penalties */
- endgappenalties=align_endgappenalties;
-
-/* remove positions that contain just gaps */
- remove_gap_pos(1,nseqs,0);
-
-
-/* save the new alignment into another temporary area */
- newlen_array=(sint *)ckalloc((data.nseqs+2) * sizeof(sint));
- new_array=(char **)ckalloc((data.nseqs+2) * sizeof(char *));
- for (i=1;i<=data.nseqs;i++)
- {
- newlen_array[i]=seqlen_array[i];
- new_array[i]=(char *)ckalloc((seqlen_array[i]+2) * sizeof(char));
- for(j=1;j<=seqlen_array[i];j++)
- new_array[i][j]=seq_array[i][j];
- }
-
-/* paste the realigned range back into the alignment */
- max_aln_length=0;
- length1=length2=0;
- for (i=1;i<=data.nseqs;i++)
- {
- length1=tmplen_array[i]-length+newlen_array[i]; /* old length minus old range plus realigned range */
- if(length1>max_aln_length) max_aln_length=length1;
- length2=newlen_array[i];
- seqlen_array[i]=length1;
- realloc_seq(i,length1);
- for(j=1;j<=data.firstsel;j++) /* part left of the selection */
- seq_array[i][j]=tmp_array[i][j];
- for(j=data.firstsel+1;j<=data.firstsel+length2;j++) /* the realigned range */
- seq_array[i][j]=new_array[i][j-data.firstsel];
- for(j=data.firstsel+length2+1;j<=length1;j++) /* part right of the selection; first source position is lastsel+2 */
- seq_array[i][j]=tmp_array[i][data.lastsel+j-data.firstsel-length2+1];
- }
- max_aln_length*=2;
- ckfree(tmplen_array);
- for(i=1;i<=data.nseqs;i++)
- ckfree(tmp_array[i]);
- ckfree(tmp_array);
- ckfree(newlen_array);
- for(i=1;i<=data.nseqs;i++)
- ckfree(new_array[i]);
- ckfree(new_array);
-
- if (open_aln_files()) /* second call: the output is produced only if it succeeds */
- create_alignment_output(1,data.nseqs);
-
- load_aln(seq_panel,0,nseqs-1,FALSE);
- GetPanelExtra(seq_panel.seqs,&data);
- data.firstsel=fs;
- data.lastsel=data.firstsel+length2-1; /* length2 holds the realigned length of the last sequence processed above */
- SetPanelExtra(seq_panel.seqs,&data);
- highlight_seqrange(seq_panel.seqs,data.firstsel,data.lastsel,HIGHLIGHT);
- ArrowCursor();
- info("Selected sequence range realigned.");
-}
-
-
-/*
- * Menu callback: opens the "Alignment from Guide Tree" dialog, which asks
- * for an existing (OLD) guide tree file.  Requires loaded sequences and at
- * least two of them.
- */
-void AlignFromTreeWin(IteM item)
-{
-    if (empty)
-    {
-        error("No sequences loaded");
-    }
-    else if (nseqs < 2)
-    {
-        error("Alignment has only %d sequences",nseqs);
-    }
-    else
-    {
-        do_align_window(&talignw,&ttreetext,OLD,"Alignment from Guide Tree",AlignFromTree);
-    }
-}
-
-/* Adds one labelled output-file field to `list`, pre-filled with path+ext,
- * and stores the new text field in *field. */
-static void aw_output_field(GrouP list,char *label,TexT *field,char *path,char *ext)
-{
-    char fname[FILENAMELEN+1];
-
-    make_prompt(list,label);
-    *field=DialogText(list, "", 35, NULL);
-    strcpy(fname,path);
-    strcat(fname,ext);
-    SetTitle(*field, fname);
-    Break(list);
-}
-
-/*
- * Builds and shows an alignment dialog: a guide tree file field (labelled as
- * output when treestatus is NEW, as input otherwise), one field per enabled
- * output format, and ALIGN / CANCEL buttons.  Default file names are derived
- * from seqname via get_path().
- *
- *   ralignw     receives the created window
- *   rtreetext   receives the guide tree text field
- *   treestatus  NEW or OLD (selects the tree prompt)
- *   title       window title
- *   align_proc  callback attached to the ALIGN button
- */
-static void do_align_window(WindoW *ralignw,TexT *rtreetext,Boolean treestatus,char *title,void align_proc(ButtoN but))
-{
-    WindoW win;
-    TexT tree_field;
-    GrouP buttons;
-    GrouP formats;
-    GrouP top;
-    char tree_name[FILENAMELEN+1];
-    char path[FILENAMELEN+1];
-
-    get_path(seqname,path);
-    SelectFont(systemFont);
-    stdCharWidth=CharWidth('A');
-    stdLineHeight=LineHeight();
-    win=FixedWindow(-50, -33, -10, -10,title,RemoveWin);
-
-    top=HiddenGroup(win,2,0,NULL);
-    SetGroupSpacing(top,0,10);
-
-    if (treestatus!=NEW)
-        make_prompt(win, "Input Guide Tree File:");
-    else
-        make_prompt(win, "Output Guide Tree File:");
-    stdLineHeight=18;
-    SelectFont(programFont);
-    Break(win);
-    tree_field=DialogText(win, "", 35, NULL);
-    strcpy(tree_name,path);
-    strcat(tree_name,"dnd");
-    SetTitle(tree_field, tree_name);
-    Break(win);
-
-    /* one file-name field per enabled output format */
-    make_prompt(win, "Output Alignment Files:");
-    formats=HiddenGroup(win, 2, 0, NULL);
-    if (output_clustal)
-        aw_output_field(formats,"Clustal: ",&cl_outtext,path,"aln");
-    if (output_nbrf)
-        aw_output_field(formats,"NBRF/PIR: ",&pir_outtext,path,"pir");
-    if (output_gcg)
-        aw_output_field(formats,"GCG/MSF: ",&msf_outtext,path,"msf");
-    if (output_phylip)
-        aw_output_field(formats,"Phylip: ",&phylip_outtext,path,"phy");
-    if (output_gde)
-        aw_output_field(formats,"GDE: ",&gde_outtext,path,"gde");
-    if (output_nexus)
-        aw_output_field(formats,"Nexus: ",&nexus_outtext,path,"nxs");
-    if (output_fasta)
-        aw_output_field(formats,"Fasta: ",&fasta_outtext,path,"fasta");
-
-    Break(win);
-    buttons=HiddenGroup(win, 2, 0, NULL);
-    shift(buttons, 60, 20);
-    PushButton(buttons, " ALIGN ", align_proc);
-    shift(buttons, 20,0);
-    PushButton(buttons, "CANCEL", CancelWin);
-
-    *ralignw=win;
-    *rtreetext=tree_field;
-    Show(win);
-}
-
-
-/* Adds one labelled output-file field to `list`, pre-filled with path+ext,
- * and stores the new text field in *field. */
-static void pw_output_field(GrouP list,char *label,TexT *field,char *path,char *ext)
-{
-    char fname[FILENAMELEN+1];
-
-    make_prompt(list,label);
-    *field=DialogText(list, "", 35, NULL);
-    strcpy(fname,path);
-    strcat(fname,ext);
-    SetTitle(*field, fname);
-    Break(list);
-}
-
-/*
- * Builds and shows a profile alignment dialog: two guide tree file fields
- * (defaults derived from profile1_name and profile2_name), one field per
- * enabled output format (defaults derived from profile2_name, the last path
- * computed), and ALIGN / CANCEL buttons.
- *
- *   ralignw     receives the created window
- *   rtree1text  receives the profile 1 guide tree text field
- *   rtree2text  receives the profile 2 guide tree text field
- *   treestatus  NEW or OLD (selects the tree prompt)
- *   title       window title
- *   align_proc  callback attached to the ALIGN button
- */
-static void do_palign_window(WindoW *ralignw,TexT *rtree1text,TexT *rtree2text,Boolean treestatus,char *title,void align_proc(ButtoN but))
-{
-    WindoW win;
-    TexT tree1_field,tree2_field;
-    GrouP buttons;
-    GrouP formats;
-    GrouP top;
-    char tree_name[FILENAMELEN+1];
-    char path[FILENAMELEN+1];
-
-    SelectFont(systemFont);
-    stdCharWidth=CharWidth('A');
-    stdLineHeight=LineHeight();
-    win=FixedWindow(-50, -33, -10, -10,title,RemoveWin);
-
-    top=HiddenGroup(win,2,0,NULL);
-    SetGroupSpacing(top,0,10);
-
-    if (treestatus!=NEW)
-        make_prompt(win, "Input Guide Tree Files:");
-    else
-        make_prompt(win, "Output Guide Tree Files:");
-    stdLineHeight=18;
-    SelectFont(programFont);
-    Break(win);
-
-    tree1_field=DialogText(win, "", 35, NULL);
-    get_path(profile1_name,path);
-    strcpy(tree_name,path);
-    strcat(tree_name,"dnd");
-    SetTitle(tree1_field, tree_name);
-    Break(win);
-
-    tree2_field=DialogText(win, "", 35, NULL);
-    get_path(profile2_name,path);
-    strcpy(tree_name,path);
-    strcat(tree_name,"dnd");
-    SetTitle(tree2_field, tree_name);
-    Break(win);
-
-    /* one file-name field per enabled output format */
-    make_prompt(win, "Output Alignment Files:");
-    formats=HiddenGroup(win, 2, 0, NULL);
-    if (output_clustal)
-        pw_output_field(formats,"Clustal: ",&cl_outtext,path,"aln");
-    if (output_nbrf)
-        pw_output_field(formats,"NBRF/PIR: ",&pir_outtext,path,"pir");
-    if (output_gcg)
-        pw_output_field(formats,"GCG/MSF: ",&msf_outtext,path,"msf");
-    if (output_phylip)
-        pw_output_field(formats,"Phylip: ",&phylip_outtext,path,"phy");
-    if (output_gde)
-        pw_output_field(formats,"GDE: ",&gde_outtext,path,"gde");
-    if (output_nexus)
-        pw_output_field(formats,"Nexus: ",&nexus_outtext,path,"nxs");
-    if (output_fasta)
-        pw_output_field(formats,"Fasta: ",&fasta_outtext,path,"fasta");
-
-    Break(win);
-    buttons=HiddenGroup(win, 2, 0, NULL);
-    shift(buttons, 60, 20);
-    PushButton(buttons, " ALIGN ", align_proc);
-    shift(buttons, 20,0);
-    PushButton(buttons, "CANCEL", CancelWin);
-
-    *ralignw=win;
-    *rtree1text=tree1_field;
-    *rtree2text=tree2_field;
-    Show(win);
-}
-
-
-/*
- * ALIGN-button callback of the "Alignment from Guide Tree" dialog.  Checks
- * that the guide tree file named in the dialog can be opened, opens the
- * alignment output files, runs get_tree() on the file, closes the log file
- * if one is open and reloads the alignment into the sequence panel.
- *
- * The tree file is opened here only as a readability check (get_tree() is
- * given the file name, not the stream), so the stream is closed again at
- * once; it used to be left open on every call.
- */
-void AlignFromTree(ButtoN but)
-{
-    FILE *tree;
-    char phylip_name[FILENAMELEN];
-
-    GetTitle(ttreetext, filename, FILENAMELEN);
-    stripspace(filename);
-
-    strcpy(phylip_name,filename);
-#ifdef VMS
-    tree=fopen(phylip_name,"r","rat=cr","rfm=var");
-#else
-    tree=fopen(phylip_name,"r");
-#endif
-    if(tree==NULL) {
-        error("Cannot open tree file [%s]",phylip_name);
-        return;
-    }
-    fclose(tree);
-
-    if (!open_aln_files()) return;
-
-    WatchCursor();
-    info("Doing alignments from guide tree...");
-    if (Visible(talignw))
-    {
-        Remove(talignw);
-        talignw=NULL;
-    }
-    get_tree(phylip_name);
-    if(save_log && save_log_fd!=NULL)
-    {
-        fclose(save_log_fd);
-        save_log_fd=NULL;
-    }
-    /* reload the sequences from the output file (so that the sequence order
-       is correct - either INPUT or ALIGNED */
-    reload_alignment();
-    load_aln(seq_panel,0,nseqs-1,FALSE);
-    ArrowCursor();
-    info("Done.");
-}
-
-/*
- * Menu callback: opens the "Profile to Profile Alignment" dialog (new guide
- * trees).  Both profiles must be loaded.
- */
-static void PrfPrfAlignWin (IteM item)
-{
-    if (profile1_empty)
-    {
-        error("Profile 1 not loaded");
-    }
-    else if (profile2_empty)
-    {
-        error("Profile 2 not loaded");
-    }
-    else
-    {
-        do_palign_window(&palignw,&ptree1text,&ptree2text,NEW,"Profile to Profile Alignment",PrfPrfAlign);
-    }
-}
-
-/*
- * Menu callback: opens the "Profile Alignment from Tree" dialog (existing
- * guide trees).  Both profiles must be loaded.
- */
-static void PrfPrfTreeAlignWin (IteM item)
-{
-    if (profile1_empty)
-    {
-        error("Profile 1 not loaded");
-    }
-    else if (profile2_empty)
-    {
-        error("Profile 2 not loaded");
-    }
-    else
-    {
-        do_palign_window(&palignw,&ptree1text,&ptree2text,OLD,"Profile Alignment from Tree",PrfPrfTreeAlign);
-    }
-}
-
-/*
- * Menu callback: opens the "Sequence to Profile Alignment" dialog (new guide
- * tree).  Both profiles must be loaded.
- */
-static void SeqPrfAlignWin (IteM item)
-{
-    if (profile1_empty)
-    {
-        error("Profile 1 not loaded");
-    }
-    else if (profile2_empty)
-    {
-        error("Profile 2 not loaded");
-    }
-    else
-    {
-        do_align_window(&salignw,&streetext,NEW,"Sequence to Profile Alignment",SeqPrfAlign);
-    }
-}
-
-/*
- * Menu callback: opens the "Sequence to Profile Alignment from Tree" dialog
- * (existing guide tree).  Both profiles must be loaded.
- */
-static void SeqPrfTreeAlignWin (IteM item)
-{
-    if (profile1_empty)
-    {
-        error("Profile 1 not loaded");
-    }
-    else if (profile2_empty)
-    {
-        error("Profile 2 not loaded");
-    }
-    else
-    {
-        do_align_window(&salignw,&streetext,OLD,"Sequence to Profile Alignment from Tree",SeqPrfTreeAlign);
-    }
-}
-
-/*
- * ALIGN-button callback of the "Profile to Profile Alignment" dialog.  Reads
- * the two guide tree file names from the dialog, clears both "use tree file"
- * flags, opens the output files, runs profile_align(), closes the log file
- * if one is open and reloads both profile panels.
- */
-static void PrfPrfAlign(ButtoN but)
-{
-    char tree1_name[FILENAMELEN];
-    char tree2_name[FILENAMELEN];
-
-    /* profile 1 tree */
-    GetTitle(ptree1text, filename, FILENAMELEN);
-    stripspace(filename);
-    use_tree1_file=FALSE;
-    strcpy(tree1_name,filename);
-
-    /* profile 2 tree */
-    GetTitle(ptree2text, filename, FILENAMELEN);
-    stripspace(filename);
-    use_tree2_file=FALSE;
-    strcpy(tree2_name,filename);
-
-    if (!open_aln_files())
-        return;
-
-    WatchCursor();
-    if (Visible(palignw))
-    {
-        Remove(palignw);
-        palignw=NULL;
-    }
-
-    profile_align(tree1_name,tree2_name);
-
-    if (save_log && save_log_fd!=NULL)
-    {
-        fclose(save_log_fd);
-        save_log_fd=NULL;
-    }
-
-    load_aln(prf_panel[0],0,profile1_nseqs-1,FALSE);
-    load_aln(prf_panel[1],profile1_nseqs,nseqs-1,FALSE);
-    ArrowCursor();
-}
-
-/*
- * ALIGN-button callback of the "Profile Alignment from Tree" dialog.  Reads
- * the two guide tree file names from the dialog, opens the output files,
- * runs profile_align(), closes the log file if one is open and reloads both
- * profile panels.
- *
- * Each "use tree file" flag now mirrors its name field: TRUE when a name was
- * entered, FALSE when the field is empty.  Previously the flags were only
- * ever set to TRUE here, so a TRUE left over from an earlier run survived an
- * emptied field.
- */
-static void PrfPrfTreeAlign(ButtoN but)
-{
-    char p1_tree_name[FILENAMELEN];
-    char p2_tree_name[FILENAMELEN];
-
-    GetTitle(ptree1text, filename, FILENAMELEN);
-    stripspace(filename);
-    use_tree1_file=(filename[0]!=EOS) ? TRUE : FALSE;
-    strcpy(p1_tree_name,filename);
-
-    GetTitle(ptree2text, filename, FILENAMELEN);
-    stripspace(filename);
-    use_tree2_file=(filename[0]!=EOS) ? TRUE : FALSE;
-    strcpy(p2_tree_name,filename);
-
-    if (!open_aln_files()) return;
-
-    WatchCursor();
-    if (Visible(palignw))
-    {
-        Remove(palignw);
-        palignw=NULL;
-    }
-    profile_align(p1_tree_name,p2_tree_name);
-    if(save_log && save_log_fd!=NULL)
-    {
-        fclose(save_log_fd);
-        save_log_fd=NULL;
-    }
-    load_aln(prf_panel[0],0,profile1_nseqs-1,FALSE);
-    load_aln(prf_panel[1],profile1_nseqs,nseqs-1,FALSE);
-    ArrowCursor();
-}
-
-/*
- * ALIGN-button callback of the "Sequence to Profile Alignment" dialog.
- * Reads the guide tree file name, clears use_tree_file, opens the output
- * files, runs new_sequence_align(), closes the log file if one is open, then
- * reloads the alignment and both profile panels.
- */
-static void SeqPrfAlign(ButtoN but)
-{
-    char tree_name[FILENAMELEN];
-
-    GetTitle(streetext, filename, FILENAMELEN);
-    stripspace(filename);
-    strcpy(tree_name,filename);
-    use_tree_file=FALSE;
-
-    if (!open_aln_files())
-        return;
-
-    WatchCursor();
-    if (Visible(salignw))
-    {
-        Remove(salignw);
-        salignw=NULL;
-    }
-
-    new_sequence_align(tree_name);
-
-    if (save_log && save_log_fd!=NULL)
-    {
-        fclose(save_log_fd);
-        save_log_fd=NULL;
-    }
-
-    /* reload so that the sequence order (INPUT or ALIGNED) is correct */
-    reload_alignment();
-
-    load_aln(prf_panel[0],0,profile1_nseqs-1,FALSE);
-    load_aln(prf_panel[1],profile1_nseqs,nseqs-1,FALSE);
-    ArrowCursor();
-}
-
-/*
- * ALIGN-button callback of the "Sequence to Profile Alignment from Tree"
- * dialog.  Same as SeqPrfAlign() except that use_tree_file is set to TRUE
- * before new_sequence_align() is run.
- */
-static void SeqPrfTreeAlign(ButtoN but)
-{
-    char tree_name[FILENAMELEN];
-
-    GetTitle(streetext, filename, FILENAMELEN);
-    stripspace(filename);
-    strcpy(tree_name,filename);
-    use_tree_file=TRUE;
-
-    if (!open_aln_files())
-        return;
-
-    WatchCursor();
-    if (Visible(salignw))
-    {
-        Remove(salignw);
-        salignw=NULL;
-    }
-
-    new_sequence_align(tree_name);
-
-    if (save_log && save_log_fd!=NULL)
-    {
-        fclose(save_log_fd);
-        save_log_fd=NULL;
-    }
-
-    /* reload so that the sequence order (INPUT or ALIGNED) is correct */
-    reload_alignment();
-
-    load_aln(prf_panel[0],0,profile1_nseqs-1,FALSE);
-    load_aln(prf_panel[1],profile1_nseqs,nseqs-1,FALSE);
-    ArrowCursor();
-}
-/*
- * Reorders the clustal arrays (names, titles, lengths, residues) into the
- * order given by output_index[], then resets output_index[] to the identity
- * and sets ncutseqs to 0.  Nothing is done when no sequences are loaded or
- * when output_order is INPUT.
- */
-void reload_alignment(void)
-{
-    int i,pos,src;
-    sint *len_copy;
-    char **seq_copy;
-    char **name_copy, **title_copy;
-
-    if (nseqs==0 || output_order == INPUT)
-        return;
-
-    name_copy=(char **)ckalloc((nseqs+2) * sizeof(char *));
-    title_copy=(char **)ckalloc((nseqs+2) * sizeof(char *));
-    seq_copy=(char **)ckalloc((nseqs+2) * sizeof(char *));
-    len_copy=(sint *)ckalloc((nseqs+2) * sizeof(sint));
-
-    /* pass 1: take a copy of every sequence, already in output order */
-    for (i=1; i<=nseqs; i++)
-    {
-        src=output_index[i];
-        name_copy[i]=(char *)ckalloc((MAXNAMES+2)*sizeof(char));
-        title_copy[i]=(char *)ckalloc((MAXTITLES+2)*sizeof(char));
-        seq_copy[i]=(char *)ckalloc((seqlen_array[src]+2)*sizeof(char));
-        strcpy(name_copy[i],names[src]);
-        strcpy(title_copy[i],titles[src]);
-        len_copy[i]=seqlen_array[src];
-        for (pos=1; pos<=seqlen_array[src]; pos++)
-            seq_copy[i][pos]=seq_array[src][pos];
-    }
-
-    /* pass 2: write the copies back over the originals */
-    for (i=1; i<=nseqs; i++)
-    {
-        strcpy(names[i],name_copy[i]);
-        strcpy(titles[i],title_copy[i]);
-        seqlen_array[i]=len_copy[i];
-        realloc_seq(i,seqlen_array[i]);
-        for (pos=1; pos<=seqlen_array[i]; pos++)
-            seq_array[i][pos]=seq_copy[i][pos];
-        output_index[i]=i;
-    }
-
-    /* release the temporary copies */
-    ckfree(len_copy);
-    for (i=1; i<=nseqs; i++)
-    {
-        ckfree(seq_copy[i]);
-        ckfree(title_copy[i]);
-        ckfree(name_copy[i]);
-    }
-    ckfree(seq_copy);
-    ckfree(title_copy);
-    ckfree(name_copy);
-
-    ncutseqs=0;
-}
-
-/*
- * Menu callback: builds and shows the "Low-Scoring Segment Parameters"
- * window: minimum segment length, DNA marking scale (disabled unless dnaflag
- * is set), protein weight matrix choice with a user matrix loader, and DNA
- * weight matrix choice with a user matrix loader.
- *
- * Fix: seg_dnamatrix_list now receives the DNA radio group created here
- * (mat_list), mirroring how seg_matrix_list is set for the protein group; it
- * used to be assigned `matrix_list`, which this function never sets.
- */
-static void SegmentWin(IteM item)
-{
-    WindoW w;
-    GrouP maing;
-    GrouP mat_list;
-
-    SelectFont(systemFont);
-    stdCharWidth=CharWidth('A');
-    stdLineHeight=LineHeight();
-    w=FixedWindow(-50, -33, -10, -10, "Low-Scoring Segment Parameters",RemoveWin);
-    maing=HiddenGroup(w,0,0,NULL);
-    SetGroupSpacing(maing,20,10);
-    PushButton(maing, "CLOSE", CancelWin);
-    Break(maing);
-
-    length_cutofftext=make_scale(maing,"Minimum Length of Segments:",9,length_cutoff,19,set_lengthcutoff);
-    Break(maing);
-
-    segmentdnascaletext=make_scale(maing,"DNA Marking Scale:",9,segment_dnascale,9,set_segment_dnascale);
-    if(!dnaflag) Disable(segmentdnascaletext);
-    Break(maing);
-
-    /* protein weight matrix */
-    mat_list=NormalGroup(maing,4,0,"Protein Weight Matrix",systemFont,set_segment_matrix);
-    RadioButton(mat_list,"Gonnet PAM 80");
-    RadioButton(mat_list,"Gonnet PAM 120");
-    RadioButton(mat_list,"Gonnet PAM 250");
-    RadioButton(mat_list,"Gonnet PAM 350");
-    RadioButton(mat_list,"User defined");
-    SetValue(mat_list,segment_matnum);
-    seg_matrix_list=mat_list;
-    Break(maing);
-    PushButton(maing, "Load protein matrix: ", set_segment_user_matrix);
-    Advance(maing);
-    segmentmattext=StaticPrompt(maing,"", MAXPROMPTLEN, dialogTextHeight, systemFont, 'l');
-    SetTitle(segmentmattext,segment_mtrxname);
-    Break(maing);
-
-    /* DNA weight matrix */
-    mat_list=NormalGroup(maing,4,0,"DNA Weight Matrix",systemFont,set_segment_dnamatrix);
-    RadioButton(mat_list,"IUB");
-    RadioButton(mat_list,"CLUSTALW(1.6)");
-    RadioButton(mat_list,"User defined");
-    SetValue(mat_list,segment_dnamatnum);
-    seg_dnamatrix_list=mat_list;
-    Break(maing);
-    PushButton(maing, "Load DNA matrix: ", set_segment_user_dnamatrix);
-    Advance(maing);
-    segmentdnamattext=StaticPrompt(maing,"", MAXPROMPTLEN, dialogTextHeight, systemFont, 'l');
-    SetTitle(segmentdnamattext,segment_dnamtrxname);
-    Break(maing);
-
-    Show(w);
-}
-/*
- * Menu callback: build and show the "Score Parameters" window.
- *
- * The window holds a CLOSE button, a scale for the score plot scale, a scale
- * for the residue exception cutoff, and a protein and a DNA weight-matrix
- * radio group, each followed by a "load" button and a prompt showing the
- * current user matrix name.  The radio groups are published through the
- * globals score_matrix_list and score_dnamatrix_list.
- *
- * item: the menu item that triggered the callback (unused).
- */
-static void ScoreWin(IteM item)
-{
- WindoW w;
- GrouP maing;
- ButtoN closeb;
- GrouP mat_list;
- PopuP show_exceptions; /* NOTE(review): declared but never used here */
- ButtoN matrixb[6]; /* button handles are stored but never read afterwards */
-
- SelectFont(systemFont);
- stdCharWidth=CharWidth('A');
- stdLineHeight=LineHeight();
- w=FixedWindow(-50, -33, -10, -10, "Score Parameters",RemoveWin);
- maing=HiddenGroup(w,0,0,NULL);
- SetGroupSpacing(maing,20,10);
- closeb=PushButton(maing, "CLOSE", CancelWin);
- Break(maing);
-
-
-/* add a scale to set the scaling value for the alignment scoring function */
- scorescaletext=make_scale(maing,"Score Plot Scale:",9,score_scale,9,set_scorescale);
- Break(maing);
-
- residue_cutofftext=make_scale(maing,"Residue Exception Cutoff:",9,score_cutoff,9,set_scorecutoff);
- Break(maing);
-
- mat_list=NormalGroup(maing,4,0,"Protein Weight Matrix",systemFont,set_score_matrix); /* protein matrix choice */
- matrixb[0]=RadioButton(mat_list,"Identity");
- matrixb[1]=RadioButton(mat_list,"Gonnet PAM 80");
- matrixb[2]=RadioButton(mat_list,"Gonnet PAM 120");
- matrixb[3]=RadioButton(mat_list,"Gonnet PAM 250");
- matrixb[4]=RadioButton(mat_list,"Gonnet PAM 350");
- matrixb[5]=RadioButton(mat_list,"User defined");
- SetValue(mat_list,score_matnum); /* pre-select the current setting */
- score_matrix_list=mat_list;
- Break(maing);
- PushButton(maing, "Load protein matrix: ", set_score_user_matrix);
- Advance(maing);
- scoremattext=StaticPrompt(maing,"", MAXPROMPTLEN, dialogTextHeight, systemFont, 'l');
- SetTitle(scoremattext,score_mtrxname);
- Break(maing);
-
- mat_list=NormalGroup(maing,4,0,"DNA Weight Matrix",systemFont,set_score_dnamatrix); /* DNA matrix choice; mat_list is reused */
- matrixb[0]=RadioButton(mat_list,"IUB");
- matrixb[1]=RadioButton(mat_list,"CLUSTALW(1.6)");
- matrixb[2]=RadioButton(mat_list,"User defined");
- SetValue(mat_list,score_dnamatnum);
- score_dnamatrix_list=mat_list;
- Break(maing);
- PushButton(maing, "Load DNA matrix: ", set_score_user_dnamatrix);
- Advance(maing);
- scorednamattext=StaticPrompt(maing,"", MAXPROMPTLEN, dialogTextHeight, systemFont, 'l');
- SetTitle(scorednamattext,score_dnamtrxname);
- Break(maing);
-
- Show (w);
-}
-
-
-/*
- * Menu callback: build and show the "Pairwise Parameters" window.
- *
- * The working parameter globals are first loaded from the DNA or protein
- * defaults depending on dnaflag.  The window then offers a fast/slow toggle
- * and two alternative parameter groups created at the same position:
- *   slow_para - gap opening/extension penalties plus protein and DNA weight
- *               matrix choices with "load" buttons;
- *   fast_para - gap penalty, k-tuple size, top diagonals and window size.
- * Only the group matching quick_pairalign is shown.
- *
- * item: the menu item that triggered the callback (unused).
- */
-static void PWParameters(IteM item)
-{
-    int i;
-    WindoW w;
-    PoinT pt;
-    GrouP maing;
-    GrouP mat_list;
-    char str[FILENAMELEN];
-
-    if(dnaflag) {
-        gap_open = dna_gap_open;
-        gap_extend = dna_gap_extend;
-        pw_go_penalty = dna_pw_go_penalty;
-        pw_ge_penalty = dna_pw_ge_penalty;
-        ktup = dna_ktup;
-        window = dna_window;
-        signif = dna_signif;
-        wind_gap = dna_wind_gap;
-    }
-    else {
-        gap_open = prot_gap_open;
-        gap_extend = prot_gap_extend;
-        pw_go_penalty = prot_pw_go_penalty;
-        pw_ge_penalty = prot_pw_ge_penalty;
-        ktup = prot_ktup;
-        window = prot_window;
-        signif = prot_signif;
-        wind_gap = prot_wind_gap;
-    }
-
-    SelectFont(systemFont);
-    stdCharWidth=CharWidth('A');
-    stdLineHeight=LineHeight();
-    w=FixedWindow(-50, -33, -10, -10, "Pairwise Parameters",RemoveWin);
-    maing=HiddenGroup(w,0,0,NULL);
-    SetGroupSpacing(maing,0,10);
-    PushButton(maing, "CLOSE", CancelWin);
-    Break(maing);
-    make_toggle(maing,"Pairwise Alignments :","Fast-Approximate","Slow-Accurate",&quick_pairalign,set_fs_toggle);
-    Break(maing);
-
-    /* remember where the parameter group starts so that the fast group can
-       be placed on top of the slow one */
-    GetNextPosition(maing,&pt);
-
-/* slow parameters */
-    slow_para=NormalGroup(maing,0,0,"Pairwise Parameters",systemFont,NULL);
-    SetGroupSpacing(slow_para,0,10);
-
-    make_prompt(slow_para, "Gap Opening [0-100] :");
-    Advance(slow_para);
-    sprintf(str,"%.2f",pw_go_penalty);
-    DialogText(slow_para, str, 5, set_pw_go_penalty);
-    Break(slow_para);
-
-    make_prompt(slow_para, "Gap Extension [0-100] :");
-    Advance(slow_para);
-    sprintf(str,"%.2f",pw_ge_penalty);
-    DialogText(slow_para, str, 5, set_pw_ge_penalty);
-    Break(slow_para);
-
-    /* The radio button handles are not needed afterwards.  They used to be
-       stored in a fixed 5-element array indexed up to noptions, which would
-       overrun the stack as soon as a menu grew beyond five entries. */
-    mat_list=NormalGroup(slow_para,4,0,"Protein Weight Matrix",systemFont,set_pw_matrix);
-    for(i=0;i<pw_matrix_menu.noptions;i++)
-        RadioButton(mat_list,pw_matrix_menu.opt[i].title);
-    SetValue(mat_list,pw_matnum);
-    pw_matrix_list=mat_list;
-    Break(slow_para);
-    PushButton(slow_para, "Load protein matrix: ", set_pw_user_matrix);
-    Advance(slow_para);
-    pwmattext=StaticPrompt(slow_para,"", MAXPROMPTLEN, dialogTextHeight, systemFont, 'l');
-    SetTitle(pwmattext,pw_usermtrxname);
-
-    Break(slow_para);
-    mat_list=NormalGroup(slow_para,4,0,"DNA Weight Matrix",systemFont,set_pw_dnamatrix);
-    for(i=0;i<dnamatrix_menu.noptions;i++)
-        RadioButton(mat_list,dnamatrix_menu.opt[i].title);
-    SetValue(mat_list,pw_dnamatnum);
-    pw_dnamatrix_list=mat_list;
-    Break(slow_para);
-    PushButton(slow_para, "Load DNA matrix: ", set_pw_user_dnamatrix);
-    Advance(slow_para);
-    pwdnamattext=StaticPrompt(slow_para,"", MAXPROMPTLEN, dialogTextHeight, systemFont, 'l');
-    SetTitle(pwdnamattext,pw_dnausermtrxname);
-
-    Break(slow_para);
-
-/* fast parameters */
-    SetNextPosition(maing,pt);
-    fast_para=NormalGroup(maing,2,0,"Pairwise Parameters",systemFont,NULL);
-    SetGroupSpacing(fast_para,0,10);
-    make_prompt(fast_para, "Gap Penalty [1-500]:");
-    sprintf(str,"%d",wind_gap);
-    DialogText(fast_para, str, 3, set_gp);
-    make_prompt(fast_para, "K-Tuple Size [1-2]:");
-    sprintf(str,"%d",ktup);
-    DialogText(fast_para, str, 1, set_ktuple);
-    make_prompt(fast_para, "Top Diagonals [1-50]:");
-    sprintf(str,"%d",signif);
-    DialogText(fast_para, str, 2, set_topdiags);
-    make_prompt(fast_para, "Window Size [1-50]:");
-    sprintf(str,"%d",window);
-    DialogText(fast_para, str, 2, set_window);
-
-    /* show only the group that matches the current pairwise method */
-    if (quick_pairalign)
-    {
-        Hide(slow_para);
-        Show(fast_para);
-    }
-    else
-    {
-        Hide(fast_para);
-        Show(slow_para);
-    }
-
-    Break(maing);
-
-    Show (w);
-}
-
-
-/*
- * Menu callback: build and show the "Alignment Parameters" window for
- * multiple alignment.
- *
- * The working parameter globals are first loaded from the DNA or protein
- * defaults depending on dnaflag.  The window offers gap opening/extension
- * penalties, the divergent-sequence delay, the DNA transition weight, the
- * negative-matrix toggle, and protein and DNA weight matrix choices with
- * "load" buttons.  The radio groups are published through matrix_list and
- * dnamatrix_list.
- *
- * item: the menu item that triggered the callback (unused).
- */
-static void MultiParameters(IteM item)
-{
-    int i;
-    WindoW w;
-    GrouP maing;
-    GrouP mat_list;
-    GrouP multi_para;
-    char str[FILENAMELEN];
-
-    if(dnaflag) {
-        gap_open = dna_gap_open;
-        gap_extend = dna_gap_extend;
-        pw_go_penalty = dna_pw_go_penalty;
-        pw_ge_penalty = dna_pw_ge_penalty;
-        ktup = dna_ktup;
-        window = dna_window;
-        signif = dna_signif;
-        wind_gap = dna_wind_gap;
-    }
-    else {
-        gap_open = prot_gap_open;
-        gap_extend = prot_gap_extend;
-        pw_go_penalty = prot_pw_go_penalty;
-        pw_ge_penalty = prot_pw_ge_penalty;
-        ktup = prot_ktup;
-        window = prot_window;
-        signif = prot_signif;
-        wind_gap = prot_wind_gap;
-    }
-
-    SelectFont(systemFont);
-    stdCharWidth=CharWidth('A');
-    stdLineHeight=LineHeight();
-    w=FixedWindow(-50, -33, -10, -10, "Alignment Parameters",RemoveWin);
-    maing=HiddenGroup(w,0,0,NULL);
-    SetGroupSpacing(maing,0,10);
-    PushButton(maing, "CLOSE", CancelWin);
-    Break(maing);
-
-/* multiple alignment parameters */
-
-    multi_para=NormalGroup(maing,0,0,"Multiple Parameters",systemFont,NULL);
-    SetGroupSpacing(multi_para,0,10);
-    make_prompt(multi_para, "Gap Opening [0-100] :");
-    Advance(multi_para);
-    sprintf(str,"%.2f",gap_open);
-    DialogText(multi_para, str, 5, set_go_penalty);
-    Advance(multi_para);
-    /* label spelling corrected ("Extention" -> "Extension") to match the
-       pairwise parameter dialog */
-    make_prompt(multi_para, "Gap Extension [0-100] :");
-    Advance(multi_para);
-    sprintf(str,"%.2f",gap_extend);
-    DialogText(multi_para, str, 5, set_ge_penalty);
-    Break(multi_para);
-    make_prompt(multi_para, "Delay Divergent Sequences (%) :");
-    Advance(multi_para);
-    sprintf(str,"%d",divergence_cutoff);
-    DialogText(multi_para, str, 3, set_div_seq);
-    Break(multi_para);
-
-    make_prompt(multi_para, "DNA Transition Weight [0-1] :");
-    Advance(multi_para);
-    sprintf(str,"%.2f",transition_weight);
-    DialogText(multi_para, str, 5, set_transitions);
-    Break(multi_para);
-
-    make_toggle(multi_para,"Use Negative Matrix","ON","OFF",&neg_matrix,set_neg_matrix);
-
-    Break(multi_para);
-    /* The radio button handles are not needed afterwards.  They used to be
-       stored in a fixed 5-element array indexed up to noptions, which would
-       overrun the stack as soon as a menu grew beyond five entries. */
-    mat_list=NormalGroup(multi_para,2,0,"Protein Weight Matrix",systemFont,set_matrix);
-    for(i=0;i<matrix_menu.noptions;i++)
-        RadioButton(mat_list,matrix_menu.opt[i].title);
-    SetValue(mat_list,matnum);
-    matrix_list=mat_list;
-    Break(multi_para);
-    PushButton(multi_para, "Load protein matrix: ", set_user_matrix);
-    Advance(multi_para);
-    mattext=StaticPrompt(multi_para,"", MAXPROMPTLEN, dialogTextHeight, systemFont, 'l');
-    SetTitle(mattext,usermtrxname);
-
-    Break(multi_para);
-    mat_list=NormalGroup(multi_para,2,0,"DNA Weight Matrix",systemFont,set_dnamatrix);
-    for(i=0;i<dnamatrix_menu.noptions;i++)
-        RadioButton(mat_list,dnamatrix_menu.opt[i].title);
-    SetValue(mat_list,dnamatnum);
-    dnamatrix_list=mat_list;
-    Break(multi_para);
-    PushButton(multi_para, "Load DNA: ", set_user_dnamatrix);
-    Advance(multi_para);
-    dnamattext=StaticPrompt(multi_para,"", MAXPROMPTLEN, dialogTextHeight, systemFont, 'l');
-    SetTitle(dnamattext,dnausermtrxname);
-    Show (w);
-}
-/*
- * Menu callback: build and show the "Protein Gap Parameters" window.
- *
- * The window holds a CLOSE button, toggles for residue-specific and
- * hydrophilic penalties, a text field for the hydrophilic residue list,
- * a text field for the gap separation distance and a toggle for end gap
- * separation.
- *
- * item: the menu item that triggered the callback (unused).
- */
-static void GapParameters(IteM item)
-{
- WindoW gapparaw;
- GrouP maing;
- ButtoN closeb;
- PopuP rp_toggle,vp_toggle,hp_toggle,end_gap_toggle; /* NOTE(review): vp_toggle is never used */
- TexT gdist,hyd_text;
- char str[80];
-
- SelectFont(systemFont);
- stdCharWidth=CharWidth('A');
- stdLineHeight=LineHeight();
- gapparaw=FixedWindow(-50, -33, -10, -10, "Protein Gap Parameters",RemoveWin);
- maing=HiddenGroup(gapparaw,0,0,NULL);
- SetGroupSpacing(maing,0,10);
- closeb=PushButton(maing, "CLOSE", CancelWin);
- Break(maing);
- rp_toggle=make_toggle(maing,"Residue-specific Penalties","OFF","ON",&no_pref_penalties,set_pref_penalties); /* flag is a "no_" negation, so a TRUE flag selects the first label, "OFF" */
- Break(maing);
- hp_toggle=make_toggle(maing,"Hydrophilic Penalties","OFF","ON",&no_hyd_penalties,set_hyd_penalties); /* same inverted labelling as above */
- Break(maing);
- make_prompt(maing, "Hydrophilic Residues :");
- Advance(maing);
- hyd_text=DialogText(maing, hyd_residues, 20, set_hyd_res);
- Break(maing);
- make_prompt(maing, "Gap Separation Distance [0-100] :");
- Advance(maing);
- sprintf(str,"%d",gap_dist);
- gdist=DialogText(maing, str, 3, set_gap_dist);
- Break(maing);
- end_gap_toggle=make_toggle(maing,"End Gap Separation","ON","OFF",&use_endgaps,set_endgaps);
-
- Show (gapparaw);
-}
-
-/*
- * Menu callback: build and show the "Secondary Structure Options" window.
- *
- * The window holds a CLOSE button, two toggles selecting whether the
- * secondary structure / penalty mask of profile 1 and profile 2 is used,
- * two check boxes choosing what is written to the output, and text fields
- * for the helix, strand, loop and terminal gap penalties and for the number
- * of helix and strand terminal positions (within / outside).
- *
- * Side effect: output_ss and output_gp are recomputed from
- * output_struct_penalties (0 or 2 -> secondary structure, 1 or 2 -> gap
- * penalty mask) before the check boxes are initialised.
- *
- * item: the menu item that triggered the callback (unused).
- */
-static void SSParameters(IteM item)
-{
-    WindoW ssparaw;
-    GrouP maing;
-    GrouP output_list;
-    ButtoN ss_box, gp_box;
-    char str[80];
-
-    SelectFont(systemFont);
-    stdCharWidth=CharWidth('A');
-    stdLineHeight=LineHeight();
-    ssparaw=FixedWindow(-50, -33, -10, -10, "Secondary Structure Options",RemoveWin);
-    PushButton(ssparaw, "CLOSE", CancelWin);
-    Break(ssparaw);
-
-    make_toggle(ssparaw,"Use profile 1 secondary structure / penalty mask","YES","NO",&use_ss1,set_use_ss1);
-    Break(ssparaw);
-    make_toggle(ssparaw,"Use profile 2 secondary structure / penalty mask","YES","NO",&use_ss2,set_use_ss2);
-
-    Break(ssparaw);
-    output_list=NormalGroup(ssparaw,2,0,"Output ",systemFont,NULL);
-
-    ss_box=CheckBox(output_list,"Secondary Structure",set_ss_output);
-    if(output_struct_penalties==0 || output_struct_penalties==2)
-        output_ss=TRUE;
-    else
-        output_ss=FALSE;
-    SetStatus(ss_box,output_ss);
-
-    gp_box=CheckBox(output_list,"Gap Penalty Mask",set_gp_output);
-    if(output_struct_penalties==1 || output_struct_penalties==2)
-        output_gp=TRUE;
-    else
-        output_gp=FALSE;
-    SetStatus(gp_box,output_gp);
-
-    Break(ssparaw);
-    maing=HiddenGroup(ssparaw,2,0,NULL);
-    SetGroupSpacing(maing,0,10);
-    make_prompt(maing, "Helix Gap Penalty [0-9] :");
-    sprintf(str,"%d",helix_penalty);
-    DialogText(maing, str, 1, set_helix_gp);
-    make_prompt(maing, "Strand Gap Penalty [0-9] :");
-    sprintf(str,"%d",strand_penalty);
-    DialogText(maing, str, 1, set_strand_gp);
-    make_prompt(maing, "Loop Gap Penalty [0-9] :");
-    sprintf(str,"%d",loop_penalty);
-    DialogText(maing, str, 1, set_loop_gp);
-    make_prompt(maing, "Secondary Structure Terminal Penalty [0-9] :");
-    sprintf(str,"%d",helix_end_penalty);
-    DialogText(maing, str, 1, set_terminal_gp);
-    make_prompt(maing, "Helix Terminal Positions [0-3] within:");
-    sprintf(str,"%d",helix_end_minus);
-    DialogText(maing, str, 1, set_helix_minus);
-    make_prompt(maing, "outside:");
-    sprintf(str,"%d",helix_end_plus);
-    DialogText(maing, str, 1, set_helix_plus);
-    /* This field edits strand_end_minus, a count of terminal positions like
-       its helix counterpart above; the label used to say "Penalty". */
-    make_prompt(maing, "Strand Terminal Positions [0-3] within:");
-    sprintf(str,"%d",strand_end_minus);
-    DialogText(maing, str, 1, set_strand_minus);
-    make_prompt(maing, "outside:");
-    sprintf(str,"%d",strand_end_plus);
-    DialogText(maing, str, 1, set_strand_plus);
-
-    Show (ssparaw);
-}
-/*
- * Menu callback: build and show the "Output Format Options" window.
- *
- * One check box per alignment output format is created and ticked when the
- * corresponding output_* flag is set.  Below them are toggles for the GDE
- * output case and CLUSTALW sequence numbers, a popup for the output order
- * (INPUT / ALIGNED) and a toggle for parameter output.
- *
- * item: the menu item that triggered the callback (unused).
- */
-static void OutputParameters(IteM item)
-{
- WindoW outputparaw;
- GrouP maing;
- ButtoN closeb;
- GrouP output_list;
- ButtoN outputb[6+1]; /* +1 for fasta */
- PopuP order_toggle,para_toggle;
- PopuP case_toggle,snos_toggle;
-
- SelectFont(systemFont);
- stdCharWidth=CharWidth('A');
- stdLineHeight=LineHeight();
- outputparaw=FixedWindow(-50, -33, -10, -10, "Output Format Options",RemoveWin);
- closeb=PushButton(outputparaw, "CLOSE", CancelWin);
- Break(outputparaw);
- output_list=NormalGroup(outputparaw,2,0,"Output Files",systemFont,NULL);
-
- SelectFont(systemFont);
- outputb[0]=CheckBox(output_list,"CLUSTAL format",set_output_clustal);
- if(output_clustal) SetStatus(outputb[0],TRUE);
-
- outputb[1]=CheckBox(output_list,"NBRF/PIR format",set_output_nbrf);
- if(output_nbrf) SetStatus(outputb[1],TRUE);
-
- outputb[2]=CheckBox(output_list,"GCG/MSF format",set_output_gcg);
- if(output_gcg) SetStatus(outputb[2],TRUE);
-
- outputb[3]=CheckBox(output_list,"PHYLIP format",set_output_phylip);
- if(output_phylip) SetStatus(outputb[3],TRUE);
-
- outputb[4]=CheckBox(output_list,"GDE format",set_output_gde);
- if(output_gde) SetStatus(outputb[4],TRUE);
-
- outputb[5]=CheckBox(output_list,"NEXUS format",set_output_nexus);
- if(output_nexus) SetStatus(outputb[5],TRUE);
-
-/* Ramu */
- outputb[6]=CheckBox(output_list,"FASTA format",set_output_fasta);
- if(output_fasta) SetStatus(outputb[6],TRUE);
-/* Ramu */
-
- maing=HiddenGroup(outputparaw,2,0,NULL);
- SetGroupSpacing(maing,0,10);
-
-
- case_toggle=make_toggle(maing,"GDE output case :","Lower","Upper",&lowercase,set_case);
- snos_toggle=make_toggle(maing,"CLUSTALW sequence numbers :","ON","OFF",&cl_seq_numbers,set_snos);
- make_prompt(maing, "Output order");
- order_toggle=PopupList(maing,TRUE,set_output_order);
- PopupItem(order_toggle,"INPUT");
- PopupItem(order_toggle,"ALIGNED");
- if (output_order == INPUT) /* popup entries are numbered from 1 */
- SetValue(order_toggle,1);
- else if (output_order == ALIGNED)
- SetValue(order_toggle,2);
- para_toggle=make_toggle(maing,"Parameter output","ON","OFF",&save_parameters,set_save_paras);
-
- Show (outputparaw);
-}
-
-/*
- * Menu callback: build and show the "Output Tree Format Options" window.
- *
- * One check box per tree output file is created and ticked when the
- * corresponding output_* flag is set, followed by a popup choosing whether
- * bootstrap labels are placed on nodes or on branches.
- *
- * item: the menu item that triggered the callback (unused).
- */
-static void OutputTreeParameters(IteM item)
-{
- WindoW outputtreeparaw;
- ButtoN closeb;
- GrouP output_list;
- ButtoN outputb[5]; /* ButtoN outputb[3]; Ramu, crash????; 5th one for perc_ident mat */
- PopuP boot_format;
-
- SelectFont(systemFont);
- stdCharWidth=CharWidth('A');
- stdLineHeight=LineHeight();
- outputtreeparaw=FixedWindow(-50, -33, -10, -10, "Output Tree Format Options",RemoveWin);
- closeb=PushButton(outputtreeparaw, "CLOSE", CancelWin);
- Break(outputtreeparaw);
- output_list=NormalGroup(outputtreeparaw,2,0,"Output Files",systemFont,NULL);
- outputb[0]=CheckBox(output_list,"CLUSTAL format tree",set_output_tree_clustal);
- if(output_tree_clustal) SetStatus(outputb[0],TRUE);
-
- outputb[1]=CheckBox(output_list,"Phylip format tree",set_output_tree_phylip);
- if(output_tree_phylip) SetStatus(outputb[1],TRUE);
-
- outputb[2]=CheckBox(output_list,"Phylip distance matrix",set_output_tree_distances);
- if(output_tree_distances) SetStatus(outputb[2],TRUE);
-
- outputb[3]=CheckBox(output_list,"Nexus format tree",set_output_tree_nexus);
- if(output_tree_nexus) SetStatus(outputb[3],TRUE);
-
-
- outputb[4]=CheckBox(output_list,"% identity matrix",set_output_pim); /* pim? perc ident matr */
- if(output_pim) SetStatus(outputb[4],TRUE);
-
- Break(outputtreeparaw);
- make_prompt(outputtreeparaw, "Bootstrap labels on:");
- Advance(outputtreeparaw);
- boot_format=PopupList(outputtreeparaw,TRUE,set_boot_format);
- PopupItem(boot_format ,"NODE");
- PopupItem(boot_format ,"BRANCH");
- if (bootstrap_format == BS_NODE_LABELS) /* entry 1 = NODE, entry 2 = BRANCH */
- SetValue(boot_format ,1);
- if (bootstrap_format == BS_BRANCH_LABELS)
- SetValue(boot_format ,2);
- Show (outputtreeparaw);
-}
-
-/*
- * Create a left-aligned static prompt showing `title` inside group `g`.
- *
- * Returns the new prompt, or NULL when `title` is NULL (nothing is created
- * in that case).
- */
-static PrompT make_prompt(GrouP g,CharPtr title)
-{
-    if (title == NULL)
-        return NULL;
-
-    return StaticPrompt(g, title, 0, dialogTextHeight, systemFont, 'l');
-}
-
-/*
- * Create a labelled scroll bar inside group `g`.
- *
- * A prompt reading "<title> <value>" is created first, then a scroll bar of
- * the given `length` whose page size is 1, whose maximum is `max` and whose
- * current position is `value`.  `SetProc` is installed as the scroll
- * callback.
- *
- * Returns the prompt (not the scroll bar), so that callers can update the
- * displayed value later.
- */
-static PrompT make_scale(GrouP g,CharPtr title,int length, int value,int max,BarScrlProc SetProc)
-{
-    char label[FILENAMELEN];
-    PrompT prompt;
-    BaR bar;
-
-    sprintf(label,"%s %3d",title,value);
-    prompt=make_prompt(g,label);
-    Advance(g);
-
-    bar=ScrollBar(g, length, -1, SetProc);
-    CorrectBarPage(bar,(Int4)1,(Int4)1);
-    CorrectBarMax(bar,(Int4)max);
-    CorrectBarValue(bar,(Int4)value);
-
-    return prompt;
-}
-
-/*
- * Create a two-entry popup acting as a boolean toggle inside group `g`.
- *
- * An optional prompt (`title`, skipped when NULL) is followed by a popup
- * list whose first entry is `true_text` and whose second entry is
- * `false_text`.  The entry matching the current `*value` is pre-selected and
- * `SetProc` is installed as the popup callback.
- *
- * Returns the popup.
- */
-static PopuP make_toggle(GrouP g,CharPtr title,CharPtr true_text, CharPtr false_text, Boolean *value,PupActnProc SetProc)
-{
-    PopuP toggle;
-
-    if (title != NULL)
-    {
-        make_prompt(g, title);
-    }
-    /* the group is advanced whether or not a prompt was created */
-    Advance(g);
-
-    toggle=PopupList(g,TRUE,SetProc);
-    PopupItem(toggle,true_text);
-    PopupItem(toggle,false_text);
-    SetValue(toggle, (*value) ? 1 : 2);
-
-    return toggle;
-}
-
-/*
- * Switch the main window between multiple-alignment and profile mode
- * according to the current value of aln_mode.
- *
- * MULTIPLEM: the two profile displays are hidden, the menu items tagged
- * PROFILEM are disabled, the single sequence panel is reloaded with all
- * sequences and shown.
- * PROFILEM: the sequence display is hidden, all current sequences become
- * profile 1 and profile 2 is marked empty, the menu items tagged MULTIPLEM
- * are disabled, default profile names are derived from the path of seqname
- * and both profile panels are reloaded and shown.
- */
-void switch_mode(void)
-{
- char path[FILENAMELEN];
-
- if(aln_mode==MULTIPLEM)
- {
- Hide(prf1_display);
- Hide(prf2_display);
- Hide(pscrolltext);
- SetValue(modetext,1);
- resize_multi_window();
- profile_no=0;
- check_menus(file_item,PROFILEM); /* check_menus disables the items whose mode equals its 2nd argument */
- check_menus(align_item,PROFILEM);
- check_menus(edit_item,PROFILEM);
- check_menus(tree_item,PROFILEM);
- check_menus(color_item,PROFILEM);
- active_panel=seq_panel;
- fix_gaps();
- load_aln_data(seq_panel,0,nseqs-1,TRUE);
- Show(seq_display);
- }
- else if(aln_mode==PROFILEM)
- {
- Hide(seq_display);
- resize_prf_window(nseqs,0);
- SetValue(modetext,2);
- profile_no=1;
- profile1_nseqs=nseqs; /* everything loaded so far becomes profile 1 */
- if (profile1_nseqs > 0) profile1_empty = FALSE;
- profile2_empty = TRUE;
-
- check_menus(file_item,MULTIPLEM);
- check_menus(align_item,MULTIPLEM);
- check_menus(edit_item,MULTIPLEM);
- check_menus(tree_item,MULTIPLEM);
- check_menus(color_item,MULTIPLEM);
- active_panel=prf_panel[0];
- get_path(seqname,path);
- strcpy(profile1_name,path); /* profile names are the path followed by "1." / "2." */
- strcat(profile1_name,"1.");
- strcpy(profile2_name,path);
- strcat(profile2_name,"2.");
- fix_gaps();
- load_aln_data(prf_panel[0],0,profile1_nseqs-1,TRUE);
- load_aln_data(prf_panel[1],profile1_nseqs,nseqs-1,TRUE);
- Show(prf1_display);
- Show(prf2_display);
- Show(pscrolltext);
- }
-
-}
-
-/*
- * Create the pulldown menus of window `w` and store their handles in the
- * globals filem, editm, alignm, treem, colorm, scorem and helpmenu.  The
- * menus are created in the order listed below.
- */
-static void make_menu_headers(WindoW w)
-{
- filem = PulldownMenu (w,"File/FFF");
- editm = PulldownMenu (w,"Edit/EEE");
- alignm = PulldownMenu (w,"Alignment/AAA");
- treem = PulldownMenu(w,"Trees/TTT");
- colorm = PulldownMenu(w,"Colors/CCC");
- scorem = PulldownMenu(w,"Quality/UUU");
- helpmenu = PulldownMenu(w,"Help/HHH");
-
-}
-/*
- * Populate the File menu and record every item in file_item.
- *
- * Each item is tagged through file_item.mode[] with the alignment mode it
- * belongs to (MULTIPLEM or PROFILEM).  Once all items exist, the items of
- * the mode that is not currently active are disabled via check_menus().
- */
-static void make_file_menu(void)
-{
- int n=0; /* index of the next free slot in file_item */
-
- file_item.mode[n] = MULTIPLEM;
-
- file_item.i[n] = CommandItem (filem,"Load Sequences/OOO", OpenSeqFile); n++;
-
- file_item.mode[n] = MULTIPLEM;
- file_item.i[n] = CommandItem (filem,"Append Sequences", AppendSeqFile); n++;
- file_item.mode[n] = MULTIPLEM;
- file_item.i[n] = CommandItem (filem,"Save Sequences as.../SSS", SaveSeqFileWin); n++;
- file_item.mode[n] = PROFILEM;
- file_item.i[n] = CommandItem (filem,"Load Profile 1", OpenPrf1File); n++;
- file_item.mode[n] = PROFILEM;
- file_item.i[n] = CommandItem (filem,"Load Profile 2", OpenPrf2File); n++;
- file_item.mode[n] = PROFILEM;
- file_item.i[n] = CommandItem (filem,"Save Profile 1 as...", SavePrf1FileWin); n++;
- file_item.mode[n] = PROFILEM;
- file_item.i[n] = CommandItem (filem,"Save Profile 2 as...", SavePrf2FileWin); n++;
- file_item.mode[n] = MULTIPLEM;
- file_item.i[n] = CommandItem (filem,"Write Alignment as PostScript/PPP", SavePSSeqWin); n++;
- file_item.mode[n] = PROFILEM;
- file_item.i[n] = CommandItem (filem,"Write Profile 1 as PostScript", SavePSPrf1Win); n++;
- file_item.mode[n] = PROFILEM;
- file_item.i[n] = CommandItem (filem,"Write Profile 2 as PostScript", SavePSPrf2Win); n++;
- file_item.i[n] = CommandItem (filem,"Quit/QQQ", QuitWinI); n++; /* no mode is assigned for Quit here */
- file_item.num = n;
- if(aln_mode==MULTIPLEM)
- check_menus(file_item,PROFILEM); /* disables the PROFILEM items */
- else
- check_menus(file_item,MULTIPLEM); /* disables the MULTIPLEM items */
-}
-
-/*
- * Menu callback for File/Quit.
- *
- * Before quitting, every modified panel of the active mode (the alignment
- * in multiple-alignment mode; profile 1 and then profile 2 in profile mode)
- * triggers a yes/no confirmation.  Answering "no" to any of them cancels the
- * quit; otherwise the program is terminated.
- *
- * i: the menu item that triggered the callback (unused).
- */
-static void QuitWinI (IteM i)
-{
-    switch (aln_mode)
-    {
-    case MULTIPLEM:
-        if (seq_panel.modified &&
-            Message(MSG_YN,"Alignment has not been saved.\n"
-                           "Quit program anyway?")==ANS_NO)
-            return;
-        break;
-
-    case PROFILEM:
-        if (prf_panel[0].modified &&
-            Message(MSG_YN,"Profile 1 has not been saved.\n"
-                           "Quit program anyway?")==ANS_NO)
-            return;
-        if (prf_panel[1].modified &&
-            Message(MSG_YN,"Profile 2 has not been saved.\n"
-                           "Quit program anyway?")==ANS_NO)
-            return;
-        break;
-
-    default:
-        break;
-    }
-
-    QuitProgram ();
-}
-
-
-/*
- * Populate the Quality menu and record every item in score_item.
- *
- * The two status items are initialised from segment_exceptions and
- * residue_exceptions; the handle of the first one is also kept in
- * segment_item.  check_menus() is not called here.
- */
-static void make_score_menu(void)
-{
- int n=0; /* index of the next free slot in score_item */
-
- score_item.i[n] = CommandItem (scorem,"Calculate Low-Scoring Segments", calc_segment_exceptions); n++;
- segment_item=score_item.i[n]=StatusItem(scorem, "Show Low-Scoring Segments", set_show_segments);
- SetStatus(score_item.i[n],segment_exceptions); n++;
- score_item.i[n]=StatusItem(scorem, "Show Exceptional Residues", set_residue_exceptions);
- SetStatus(score_item.i[n],residue_exceptions); n++;
- score_item.i[n] = CommandItem (scorem,"Low-Scoring Segment Parameters",SegmentWin); n++;
- score_item.i[n] = CommandItem (scorem,"Column Score Parameters",ScoreWin); n++;
- score_item.mode[n] = MULTIPLEM; /* the only item in this menu that gets a mode tag */
- score_item.i[n]=CommandItem(scorem, "Save Column Scores to File", SaveScoresWin); n++;
-
-
- score_item.num = n;
-}
-
-/*
- * Populate the Help menu and record every item in help_item.
- *
- * For each topic the menu item handle is stored in help_item.i[] and the
- * one-character key that identifies the topic's section in the help file is
- * stored in help_item.ptr[] (HelpProc compares it with the character that
- * follows the ">>HELP" marker).  All items share the HelpProc callback.
- */
-static void make_help_menu(void)
-{
-    /* topic key / menu title pairs, in menu order */
-    static const struct {
-        char key;
-        char *title;
-    } topics[] = {
-        { 'G', "General" },
-        { 'F', "Input & Output Files" },
-        { 'E', "Editing Alignments" },
-        { 'M', "Multiple Alignments" },
-        { 'P', "Profile Alignments" },
-        { 'B', "Secondary Structures" },
-        { 'T', "Trees" },
-        { 'C', "Colors" },
-        { 'Q', "Alignment Quality" },
-        { '9', "Command Line Parameters" },
-        { 'R', "References" }
-    };
-    const int ntopics = (int)(sizeof(topics) / sizeof(topics[0]));
-    int n;
-
-    for (n = 0; n < ntopics; n++)
-    {
-        help_item.ptr[n] = topics[n].key;
-        help_item.i[n] = CommandItem (helpmenu,topics[n].title,HelpProc);
-    }
-
-    help_item.num = ntopics;
-}
-
-/*
- * Menu callback shared by all Help menu items: show the help section that
- * belongs to the selected item in a new window.
- *
- * The item is looked up in help_item to obtain its one-character topic key.
- * The help file (help_file_name, located via find_file) is then scanned for
- * a line containing ">>HELP" whose key character matches.  The lines that
- * follow, up to the next ">>HELP" line or the end of the file, are stripped
- * of trailing control/space characters and appended to the global helptext
- * with a platform-specific line terminator.  Lines starting with '<' are
- * skipped (they are for html output processing).
- *
- * Errors (unknown item, help file missing or unreadable, topic not found,
- * too many help windows) are reported through error() and no window is
- * opened.
- *
- * Changes from the previous version:
- *  - characters are cast to unsigned char before being passed to
- *    iscntrl()/isspace(); passing a negative char is undefined behaviour;
- *  - the size check now accounts for the line terminator and the NUL that
- *    are appended after each line, not only for the line itself;
- *  - the length of helptext is tracked instead of being recomputed with
- *    strlen() for every line.
- *
- * item: the Help menu item that was selected.
- */
-static void HelpProc(IteM item)
-{
-#ifdef WIN_MAC
-    static const char eol[] = "\r";
-#else
-#ifdef WIN32
-    static const char eol[] = "\r\n";
-#else
-    static const char eol[] = "\n";
-#endif
-#endif
-    static const char digits[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
-    static const char help_marker[] = ">>HELP";
-    const size_t marker_len = sizeof(help_marker) - 1;
-    const size_t eol_len = sizeof(eol) - 1;
-
-    int n, index = -1;
-    size_t i, len, used;
-    FILE *fd;
-    char temp[MAXLINE+1];
-    char token;
-    TexT htext;
-    char *help_file = NULL;
-
-    extern char *help_file_name;
-
-    helptext[0]='\0';
-    used = 0;
-
-    /* which help topic was selected? */
-    for(n=0;n<help_item.num;n++)
-        if (item==help_item.i[n])
-        {
-            index=n;
-            break;
-        }
-
-    if (index==-1)
-    {
-        error("Problem with HELP routines\n");
-        return;
-    }
-
-    /* NOTE(review): ownership of the string returned by find_file() is not
-       visible from here, so it is not freed - confirm against find_file. */
-    help_file=find_file(help_file_name);
-    if(help_file==NULL) {
-        error("Cannot find help file");
-        return;
-    }
-
-    if((fd=fopen(help_file,"r"))==NULL) {
-        error("Cannot open help file [%s]",help_file);
-        return;
-    }
-
-    while(fgets(temp,MAXLINE+1,fd) != NULL) {
-        if(strstr(temp,help_marker) == NULL)
-            continue;
-
-        /* the topic key is the first character from `digits` found between
-           the end of the marker and column 8 */
-        token = ' ';
-        for(i=marker_len; i<8 && temp[i]!='\0'; i++)
-            if(strchr(digits, temp[i])) {
-                token = temp[i];
-                break;
-            }
-        if(token != help_item.ptr[index])
-            continue;
-
-        /* found the section: collect its text */
-        while(fgets(temp,MAXLINE+1,fd)) {
-            if(strstr(temp, help_marker)) break;
-
-            /* strip trailing control and white space characters */
-            len = strlen(temp);
-            while(len > 0 &&
-                  (iscntrl((unsigned char)temp[len-1]) ||
-                   isspace((unsigned char)temp[len-1])))
-                temp[--len] = '\0';
-
-/* ignore lines starting with < - these are for html output processing */
-            if(temp[0]=='<')
-                continue;
-
-            /* Stop before the text, its terminator and the final NUL would
-               no longer fit.  Assumes helptext holds at least MAXHELPLENGTH
-               bytes - TODO confirm against its declaration. */
-            if(used + len + eol_len + 1 > MAXHELPLENGTH)
-                break;
-
-            memcpy(helptext + used, temp, len);
-            used += len;
-            memcpy(helptext + used, eol, eol_len + 1);   /* copies the NUL too */
-            used += eol_len;
-        }
-        fclose(fd);
-
-#ifdef WIN_MAC
-        if(numhelp>=1)
-#else
-        if(numhelp>=MAXHELPW)
-#endif
-        {
-            error("Too many help windows");
-            return;
-        }
-        /* NOTE(review): numhelp is incremented before it is used as an
-           index, so helpw[] is written at indices 1..MAXHELPW - confirm
-           that helpw has at least MAXHELPW+1 elements. */
-        numhelp++;
-
-        helpw[numhelp]=FixedWindow(-50, -33, -10, -10, "", QuitHelpW);
-        SelectFont(helpfont);
-#ifdef WIN_MAC
-        htext=ScrollText(helpw[numhelp], 60, 20, helpfont, TRUE, NULL);
-#else
-        htext=ScrollText(helpw[numhelp], 80, 30, helpfont, TRUE, NULL);
-#endif
-        Break(helpw[numhelp]);
-        PushButton(helpw[numhelp], "OK", QuitHelpB);
-        SetTitle(htext, helptext);
-        Show(helpw[numhelp]);
-        return;
-    }
-
-    /* end of file reached without finding the requested section */
-    error("No help found in help file [%s]",help_file);
-    fclose(fd);
-}
-
-/*
- * Callback of the "OK" button in a help window: remove the window that
- * contains button `b` and decrement the count of open help windows.
- */
-void QuitHelpB(ButtoN b)
-{
-    WindoW owner;
-
-    owner = ParentWindow(b);
-    Remove(owner);
-    --numhelp;
-}
-/*
- * Close callback of a help window: remove window `w` and decrement the
- * count of open help windows.
- */
-void QuitHelpW(WindoW w)
-{
-
- Remove(w);
- numhelp--; /* one help window fewer is open */
-}
-static void make_edit_menu(void) /* Populate the Edit menu and record every
-   item in edit_item.  Items tagged through edit_item.mode[] belong to one
-   alignment mode only; untagged items get no mode assignment here.  At the
-   end the items of the inactive mode are disabled via check_menus(). */
-{
- int n=0; /* index of the next free slot in edit_item */
-
- edit_item.i[n] = CommandItem (editm,"Cut Sequences/XXX", CutSequences); n++;
- edit_item.i[n] = CommandItem (editm,"Paste Sequences/VVV", PasteSequences); n++;
- edit_item.mode[n] = MULTIPLEM;
- edit_item.i[n] = CommandItem (editm,"Select All Sequences/AAA", SelectSeqs); n++;
- edit_item.mode[n] = PROFILEM;
- edit_item.i[n] = CommandItem (editm,"Select Profile 1", SelectPrf1); n++;
- edit_item.mode[n] = PROFILEM;
- edit_item.i[n] = CommandItem (editm,"Select Profile 2", SelectPrf2); n++;
- edit_item.mode[n] = PROFILEM;
- edit_item.i[n] = CommandItem (editm,"Add Profile 2 to Profile 1", MergeProfiles); n++;
- edit_item.i[n] = CommandItem (editm,"Clear Sequence Selection",ClearSeqs); n++;
- edit_item.i[n] = CommandItem (editm,"Clear Range Selection",ClearSeqRange); n++;
- SeparatorItem(editm); /* separators are not recorded in edit_item */
- edit_item.i[n] = CommandItem (editm,"Search for String/FFF", SearchStrWin); n++;
- SeparatorItem(editm);
- edit_item.i[n] = CommandItem (editm,"Remove All Gaps", RemoveGaps); n++;
- edit_item.i[n] = CommandItem (editm,"Remove Gap-Only Columns", RemoveGapPos); n++;
- edit_item.num = n;
- if(aln_mode==MULTIPLEM)
- check_menus(edit_item,PROFILEM); /* disables the PROFILEM items */
- else
- check_menus(edit_item,MULTIPLEM); /* disables the MULTIPLEM items */
-}
-/*
- * Populate the Alignment menu, including its "Alignment Parameters"
- * submenu, and record every item in align_item.
- *
- * Items tagged through align_item.mode[] belong to one alignment mode only.
- * The handles of the two "reset gaps" status items and of the "Save Log
- * File" item are additionally kept in new_gaps_item, all_gaps_item and
- * save_item1.  At the end the items of the inactive mode are disabled via
- * check_menus().
- */
-static void make_align_menu(void)
-{
- MenU parasm; /* the "Alignment Parameters" submenu */
- int n=0; /* index of the next free slot in align_item */
-
- align_item.mode[n] = MULTIPLEM;
- align_item.i[n] = CommandItem (alignm,"Do Complete Alignment/LLL",CAlignWin); n++;
- align_item.mode[n] = MULTIPLEM;
- align_item.i[n] = CommandItem (alignm,"Produce Guide Tree Only/GGG",SaveTreeWin); n++;
- align_item.mode[n] = MULTIPLEM;
- align_item.i[n] = CommandItem (alignm,"Do Alignment from Guide Tree",AlignFromTreeWin); n++;
- SeparatorItem(alignm);
- align_item.mode[n] = MULTIPLEM;
- align_item.i[n] = CommandItem (alignm,"Realign Selected Sequences",RealignSeqsWin); n++;
- align_item.mode[n] = MULTIPLEM;
- align_item.i[n] = CommandItem (alignm,"Realign Selected Residue Range",RealignSeqRangeWin); n++;
- align_item.mode[n] = PROFILEM;
- align_item.i[n] = CommandItem (alignm,"Align Profile 2 to Profile 1",PrfPrfAlignWin); n++;
- align_item.mode[n] = PROFILEM;
- align_item.i[n] = CommandItem (alignm,"Align Profiles from Guide Trees",PrfPrfTreeAlignWin); n++;
- align_item.mode[n] = PROFILEM;
- align_item.i[n] = CommandItem (alignm,"Align Sequences to Profile 1",SeqPrfAlignWin); n++;
- align_item.mode[n] = PROFILEM;
- align_item.i[n] = CommandItem (alignm,"Align Sequences to Profile 1 from Tree",SeqPrfTreeAlignWin); n++;
-
- SeparatorItem(alignm);
- parasm=SubMenu(alignm,"Alignment Parameters");
- new_gaps_item=align_item.i[n]=StatusItem(parasm, "Reset New Gaps before Alignment", set_reset_new_gaps);
- SetStatus(align_item.i[n],reset_alignments_new); n++;
- all_gaps_item=align_item.i[n]=StatusItem(parasm, "Reset All Gaps before Alignment", set_reset_all_gaps);
- SetStatus(align_item.i[n],reset_alignments_all); n++;
- align_item.i[n] = CommandItem (parasm,"Pairwise Alignment Parameters",PWParameters); n++;
- align_item.i[n] = CommandItem (parasm,"Multiple Alignment Parameters",MultiParameters); n++;
- align_item.i[n] = CommandItem (parasm,"Protein Gap Parameters",GapParameters); n++;
- align_item.mode[n] = PROFILEM;
- align_item.i[n] = CommandItem (parasm,"Secondary Structure Parameters",SSParameters); n++;
- align_item.i[n]=StatusItem(alignm, "Save Log File", set_save_log); /* added to the main menu again, not to the submenu */
- save_item1=align_item.i[n];
- SetStatus(save_item1,save_log); n++;
- align_item.i[n] = CommandItem (alignm,"Output Format Options",OutputParameters); n++;
- align_item.num = n;
- if(aln_mode==MULTIPLEM)
- check_menus(align_item,PROFILEM); /* disables the PROFILEM items */
- else
- check_menus(align_item,MULTIPLEM); /* disables the MULTIPLEM items */
-
-}
-/*
- * Callback of both "Save Log File" status items (Alignment and Trees menu).
- * Copies the state of the toggled item `i` into save_log and applies it to
- * both items so that the two check marks always agree.
- */
-void set_save_log(IteM i)
-{
- save_log=GetStatus(i);
- SetStatus(save_item1,save_log); /* item in the Alignment menu */
- SetStatus(save_item2,save_log); /* item in the Trees menu */
-}
-/*
- * Populate the Trees menu and record every item in tree_item.
- *
- * Every item is tagged MULTIPLEM.  The status items are initialised from
- * tossgaps, kimura and save_log; the handle of the "Save Log File" item is
- * also kept in save_item2.  At the end the items of the inactive mode are
- * disabled via check_menus().
- */
-static void make_tree_menu(void)
-{
- int n=0; /* index of the next free slot in tree_item */
-
- tree_item.mode[n] = MULTIPLEM;
- tree_item.i[n] = CommandItem (treem,"Draw N-J Tree/RRR",DrawTreeWin); n++;
- tree_item.mode[n] = MULTIPLEM;
- tree_item.i[n] = CommandItem (treem,"Bootstrap N-J Tree/BBB",BootstrapTreeWin); n++;
- SeparatorItem(treem);
- tree_item.mode[n] = MULTIPLEM;
- tree_item.i[n]=StatusItem(treem, "Exclude Positions with Gaps", set_tossgaps);
- SetStatus(tree_item.i[n],tossgaps); n++;
- tree_item.mode[n] = MULTIPLEM;
- tree_item.i[n]=StatusItem(treem, "Correct for Multiple Substitutions", set_kimura);
- SetStatus(tree_item.i[n],kimura); n++;
- SeparatorItem(treem);
- tree_item.mode[n] = MULTIPLEM;
- tree_item.i[n]=StatusItem(treem, "Save Log File", set_save_log);
- save_item2=tree_item.i[n];
- SetStatus(save_item2,save_log); n++;
- tree_item.mode[n] = MULTIPLEM;
- tree_item.i[n] = CommandItem (treem,"Output Format Options",OutputTreeParameters); n++;
- tree_item.mode[n] = MULTIPLEM; /* NOTE(review): tags slot n, which is past the last item added (num is set to n on the next line) */
- tree_item.num = n;
- if(aln_mode==MULTIPLEM)
- check_menus(tree_item,PROFILEM); /* disables the PROFILEM items */
- else
- check_menus(tree_item,MULTIPLEM); /* disables the MULTIPLEM items */
-
-}
-/*
- * Populate the Colors menu and record every item in color_item.
- *
- * All entries are status items initialised from inverted, usebw,
- * usedefcolors and useusercolors.  The handles of the last three are also
- * kept in bw_item, defcol_item and usercol_item.  No mode tags are assigned
- * and check_menus() is not called here.
- */
-static void make_color_menu(void)
-{
- int n=0; /* index of the next free slot in color_item */
-
- color_item.i[n]=StatusItem(colorm, "Background Coloring", set_inverted);
- SetStatus(color_item.i[n],inverted); n++;
- SeparatorItem(colorm);
- bw_item=color_item.i[n] = StatusItem (colorm,"Black and White",BlackandWhite);
- SetStatus(color_item.i[n],usebw); n++;
- defcol_item=color_item.i[n] = StatusItem (colorm,"Default Colors",DefColorPar);
- SetStatus(color_item.i[n],usedefcolors); n++;
- /*usercol_item=color_item.i[n] = StatusItem (colorm,"Load Color Parameter File",OpenColorParWin);*/
- usercol_item=color_item.i[n] = StatusItem (colorm,"Load Color Parameter File",OpenColorPar);
- SetStatus(color_item.i[n],useusercolors); n++;
- color_item.num = n;
-
-}
-
-/*
- * Enable or disable the entries of one menu according to an alignment mode.
- *
- * Every item of `m` whose mode tag equals `mode` is disabled; every other
- * item is enabled.  Callers therefore pass the mode that is NOT active.
- *
- * m:    the menu description (passed by value).
- * mode: the mode whose items are to be disabled.
- */
-void check_menus(menu_item m,int mode)
-{
-    int idx = 0;
-
-    while (idx < m.num)
-    {
-        if (m.mode[idx] != mode)
-        {
-            Enable(m.i[idx]);
-        }
-        else
-        {
-            Disable(m.i[idx]);
-        }
-        idx++;
-    }
-}
-
-
-
Deleted: trunk/packages/clustalw/trunk/xmenu.h
===================================================================
--- trunk/packages/clustalw/trunk/xmenu.h 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/xmenu.h 2007-08-12 15:08:40 UTC (rev 399)
@@ -1,278 +0,0 @@
-
-#define MAXHELPLENGTH 50000
-#define MAXHELPW 2
-
-#define DCOLS 60 /* sequence display width */
-#define DNAMES 15
-#define DNUMBER 6
-#define SCREENMARGIN 10
-#define SCOREHEIGHT 50
-
-#define MAXMENU 20
-#define MAXCOLORS 16
-
-#define MESSLENGTH 70
-#define MESSLINES 10
-#define MAXPROMPTLEN 300
-
-#define MULTIPLEM 1
-#define PROFILEM 2
-
-#define NEW 1
-#define OLD 0
-
-#define HIGHLIGHT 1
-#define NORMAL 0
-
-#define MARGIN 1
-
-#define A4 0
-#define A3 1
-#define USLETTER 2
-
-#define PORTRAIT 1
-#define LANDSCAPE 0
-
-#define MAXFINDSTR 20
-typedef struct aln_pos {
- int seq;
- int res;
-} aln_pos;
-
-typedef struct menu_item {
- int num;
- IteM i[MAXMENU];
- int mode[MAXMENU];
- char ptr[MAXMENU];
-} menu_item;
-
-typedef struct color {
- char name[20];
- Uint4 val;
- float r,g,b;
- float pr,pg,pb;
-} color;
-
-#define NAMES 0
-#define SEQS 1
-
-typedef struct panel_data {
- int type; /* = NAMES or SEQS */
- PaneL index;
- int prf_no;
- char **lines;
- char **header;
- char **footer;
- char **colormask;
- int nhead;
- int nfoot;
- PnlActnProc callback;
- int pixelheight;
- int pixelwidth;
- int vlines;
- int vseqs;
- int vcols;
- int nseqs;
- int ncols;
- int firstseq;
- int firstvline;
- int firstvcol;
- int lockoffset;
- int *selected;
- int firstsel;
- int lastsel;
- int lineheight, charwidth, ascent, descent;
- BaR vscrollbar,hscrollbar;
- int *seqweight;
- int *subgroup;
- int *colscore;
- char *consensus;
- Boolean **residue_exception;
- short **segment_exception;
-} panel_data;
-
-typedef struct spanel {
- PaneL names;
- PaneL seqs;
- Boolean modified;
-} spanel;
-
-typedef struct range {
- int first;
- int last;
-} range;
-
-/*
- PROTOTYPES - subroutines with capitalised names are defined in NCBI toolkit
- and cannot be modified!
-*/
-void x_menu(void);
-void ResizeWindowProc(WindoW w);
-
-void shift(Handle a, int dx, int dy);
-void swap(float *scores,int s1, int s2);
-void sort_scores(float *scores,int f,int l);
-void reload_alignment(void);
-
-void DrawPanel(PaneL p);
-
-void resize_multi_window(void);
-void resize_prf_window(int numseqs1,int numseqs2);
-void position_scrollbars(spanel p);
-
-void color_seqs(void);
-void color_prf1(void);
-void color_prf2(void);
-
-void select_panel(spanel p);
-void deselect_panel(spanel p);
-void correct_name_bars(Boolean reset);
-void correct_seq_bars(Boolean reset);
-void load_aln_data(spanel p, int fs, int ls, Boolean reset);
-void load_aln(spanel p, int fs, int ls, Boolean reset);
-void remove_gap_pos(int fseq,int lseq,int prf_no);
-
-GrouP make_scroll_area(GrouP w,int prf_no,int nwidth,int swidth,int height,int firstseq,int nseqs,spanel *p);
-
-void draw_seq_pointer(PaneL p,int pos,int format);
-void draw_names(PaneL p);
-void draw_seqs(PaneL p);
-void draw_header(PaneL p);
-void draw_footer(PaneL p);
-void draw_colscores(PaneL p);
-void draw_allseqs(PaneL p,int fseq,int lseq);
-void draw_nameline(PaneL p,int fseq,int lseq,int format);
-void draw_seqline(panel_data data,int seq,PoinT pt,int fcol,int lcol,int format);
-void draw_seqcol(PaneL p,int col,int format);
-void highlight_seqrange(PaneL p,int fcol,int lcol,int format);
-void make_ruler(int length,char *name,char *seq);
-void make_consensus(panel_data data,char *name,char *seq);
-void make_colscores(panel_data data);
-int calc_colscore(sint matrix[NUMRES][NUMRES],int col);
-void calc_seg_exceptions(void);
-int make_struct_data(int prf_no,int length,char *name,char *seq);
-int make_gp_data(int prf_no,int length,char *name,char *seq);
-void make_colormask(panel_data data);
-void init_color_parameters(char *filename);
-char *find_file(char *filename);
-void white_on_black(void);
-void black_on_white(void);
-void text_colors(void);
-void data_colors(void);
-void switch_mode(void);
-void show_segment_exceptions(void);
-void check_menus(menu_item m,int mode);
-FILE * open_input_file(char *file_name);
-void stripspace(char *str);
-
-void set_scorecutoff(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval);
-void set_lengthcutoff(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval);
-void set_scorescale(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval);
-void set_go_penalty(TexT t);
-void set_ge_penalty(TexT t);
-void set_gap_dist(TexT t);
-void set_ntrials(TexT t);
-void set_ran_seed(TexT t);
-void set_div_seq(TexT t);
-void set_pw_go_penalty(TexT t);
-void set_pw_ge_penalty(TexT t);
-void set_gp(TexT t);
-void set_ktuple(TexT t);
-void set_topdiags(TexT t);
-void set_window(TexT t);
-void set_hyd_res(TexT t);
-void set_matrix(GrouP g);
-void set_dnamatrix(GrouP g);
-void set_user_matrix(ButtoN but);
-void set_user_dnamatrix(ButtoN but);
-int get_user_matrixname(char *usermtrxname,short *usermat,short *aa_xref,int usermatnum,int *matnum,PrompT mattext);
-void set_format(GrouP g);
-void set_button(ButtoN l,Boolean *value);
-void set_toggle(PopuP l,Boolean *value);
-void set_pref_penalties(PopuP l);
-void set_hyd_penalties(PopuP l);
-void set_var_penalties(PopuP l);
-void set_endgaps(PopuP l);
-void set_align_endgappenalties(PopuP l);
-void set_realign_endgappenalties(PopuP l);
-void set_case(PopuP l);
-void set_snos(PopuP l);
-
-void setRange(PopuP l);
-
-void set_save_paras(PopuP l);
-void set_transitions(TexT t);
-void set_save_log(IteM i);
-void set_neg_matrix(PopuP l);
-void set_ambiguities(PopuP l);
-void set_aln_mode(PopuP g);
-void set_pscroll_mode(ButtoN l);
-void set_show_segments(IteM l);
-void set_font_size(PopuP g);
-void set_residue_exceptions(IteM i);
-void set_segment_exceptions(IteM i);
-void set_segment_dnascale(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval);
-void set_fs_toggle(PopuP l);
-void set_score_matrix(GrouP g);
-void set_segment_matrix(GrouP g);
-void set_score_user_matrix(ButtoN but);
-void set_segment_user_matrix(ButtoN but);
-void set_score_dnamatrix(GrouP g);
-void set_segment_dnamatrix(GrouP g);
-void set_score_user_dnamatrix(ButtoN but);
-void set_segment_user_dnamatrix(ButtoN but);
-void set_pagesize(PopuP g);
-void set_orientation(PopuP g);
-void set_header(PopuP l);
-void set_ruler(PopuP l);
-void set_curve(PopuP l);
-void set_resno(PopuP l);
-void set_resize(PopuP l);
-void set_fres(TexT t);
-void set_lres(TexT t);
-void set_fpres(TexT t);
-void set_lpres(TexT t);
-void set_blocklen(TexT t);
-void set_output_clustal(ButtoN l);
-void set_output_nbrf(ButtoN l);
-void set_output_phylip(ButtoN l);
-void set_output_gcg(ButtoN l);
-void set_output_gde(ButtoN l);
-void set_output_nexus(ButtoN l);
-
-void set_output_pim(ButtoN l); /* Ramu */
-
-void set_output_fasta(ButtoN l); /* Ramu */
-
-void set_pw_matrix(GrouP g);
-void set_pw_dnamatrix(GrouP g);
-void set_pw_user_matrix(ButtoN but);
-void set_pw_user_dnamatrix(ButtoN but);
-void set_output_order(PopuP g);
-void set_output_tree_clustal(ButtoN l);
-void set_output_tree_phylip(ButtoN l);
-void set_output_tree_distances(ButtoN l);
-void set_output_tree_nexus(ButtoN l);
-void set_inverted(IteM i);
-void set_tossgaps(IteM i);
-void set_kimura(IteM i);
-void set_boot_format(PopuP g);
-void set_use_ss1(PopuP l);
-void set_use_ss2(PopuP l);
-void set_helix_gp(TexT t);
-void set_strand_gp(TexT t);
-void set_loop_gp(TexT t);
-void set_terminal_gp(TexT t);
-void set_helix_minus(TexT t);
-void set_helix_plus(TexT t);
-void set_strand_minus(TexT t);
-void set_strand_plus(TexT t);
-void set_ss_output(ButtoN b);
-void set_gp_output(ButtoN b);
-void calc_segment_exceptions(IteM i);
-
-void write_ps_file(spanel p,char *ps_file,char *par_file,int pagesize,
-int orientation,Boolean header, Boolean ruler, Boolean resno, Boolean resize,
-int first_printres,int last_printres,
-int blocklen,Boolean show_curve);
-
Deleted: trunk/packages/clustalw/trunk/xscore.c
===================================================================
--- trunk/packages/clustalw/trunk/xscore.c 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/xscore.c 2007-08-12 15:08:40 UTC (rev 399)
@@ -1,1017 +0,0 @@
-#include <stdio.h>
-#include <stdarg.h>
-#include <string.h>
-
-#include <vibrant.h>
-#include <document.h>
-
-#include "clustalw.h"
-#include "xmenu.h"
-
-static void build_profile(int prf_length,int first_seq,int last_seq,sint matrix[NUMRES][NUMRES],
- sint *weight,sint **profile);
-static void calc_colscores(Boolean update_seqs,Boolean update_scores);
-static void calc_panel_segment_exceptions(PaneL p);
-static void calc_weights(PaneL p);
-static void remove_short_segments(PaneL p);
-
-
-extern Boolean aln_mode;
-extern Boolean profile1_empty,profile2_empty;
-
-extern int score_cutoff; /* cutoff for residue exceptions */
-extern int score_hwin; /* half window for summing alignment column scores */
-extern int score_scale;
-extern int segment_dnascale;
-extern int length_cutoff; /* cutoffs for segment exceptions */
-extern Boolean residue_exceptions;
-extern Boolean segment_exceptions;
-extern int score_matnum;
-extern char score_mtrxname[];
-extern int segment_matnum;
-extern char segment_mtrxname[];
-extern int score_dnamatnum;
-extern char score_dnamtrxname[];
-extern int segment_dnamatnum;
-extern char segment_dnamtrxname[];
-extern IteM segment_item;
-
-extern double **tmat;
-
-extern short score_matrix[];
-extern short score_aa_xref[];
-extern short segment_matrix[];
-extern short segment_aa_xref[];
-extern short score_dnamatrix[];
-extern short score_dna_xref[];
-extern short segment_dnamatrix[];
-extern short segment_dna_xref[];
-
-extern WindoW mainw;
-extern FonT datafont;
-extern short idmat[];
-extern short def_dna_xref[],def_aa_xref[];
-extern short swgapdnamt[],clustalvdnamt[]; /* used for alignment scores */
-extern short gon80mt[],gon120mt[],gon250mt[],gon350mt[];
-extern Boolean dnaflag;
-extern sint max_aa;
-extern sint *seqlen_array;
-extern char **seq_array;
-extern sint gap_pos1, gap_pos2;
-extern sint *output_index;
-extern spanel seq_panel; /* data for multiple alignment area */
-extern spanel prf_panel[]; /* data for profile alignment areas */
-
-extern PrompT residue_cutofftext;
-extern PrompT length_cutofftext;
-extern PrompT scorescaletext;
-extern PrompT segmentdnascaletext;
-extern PrompT scoremattext;
-extern PrompT segmentmattext;
-extern PrompT scorednamattext;
-extern PrompT segmentdnamattext;
-extern PopuP show_seg_toggle;
-
-extern GrouP score_matrix_list,seg_matrix_list;
-extern GrouP score_dnamatrix_list,seg_dnamatrix_list;
-
-static Char filename[FILENAMELEN]; /* used in temporary file selection window */
-
-
-
-void draw_colscores(PaneL p)
-{
- RecT block,r;
- int i, b, x, y;
- panel_data data;
-
- UseWindow(mainw);
- Select(p);
- SelectFont(datafont);
- GetPanelExtra(p, &data);
- if(data.nseqs == 0) return;
- if(data.colscore == NULL) return;
- if(data.vcols<=0) return;
-
- ObjectRect (p, &r);
- InsetRect(&r,1,1);
- block.bottom=r.bottom;
- block.top=block.bottom-SCOREHEIGHT-1;
- block.left=r.left;
- block.right=r.right;
- data_colors();
- EraseRect(&block);
-
- Gray();
- r.left=block.left+data.charwidth;
- r.bottom=b=block.bottom;
- for(i=data.firstvcol;i<data.firstvcol+data.vcols && i<data.ncols;i++)
- {
- x=r.left;
- MoveTo(x,b);
- r.right=r.left+data.charwidth;
- r.top=block.bottom-SCOREHEIGHT*(float)data.colscore[i]/100.0;
- PaintRect(&r);
- r.left+=data.charwidth;
- }
- black_on_white();
-}
-
-
-void make_colscores(panel_data data)
-{
-/* FILE *fd;*/
- int n,i,s,p,r,r1;
- short *mat_xref, *matptr;
- float median,mean;
- float t,q1,q3,ul;
- float *seqdist,*sorteddist,diff;
- sint maxres;
- sint *seqvector;
- sint *freq,**profile;
- sint matrix[NUMRES][NUMRES];
- Boolean include_gaps=FALSE;
- panel_data data1;
-
- if(dnaflag)
- {
- if (score_dnamatnum==1)
- {
- matptr = swgapdnamt;
- mat_xref = def_dna_xref;
- }
- else if (score_dnamatnum==2)
- {
- matptr = clustalvdnamt;
- mat_xref = def_dna_xref;
- }
- else
- {
- matptr = score_dnamatrix;
- mat_xref = score_dna_xref;
- }
- }
- else if (score_matnum==1)
- {
- matptr = idmat;
- mat_xref = def_aa_xref;
- }
- else if (score_matnum==2)
- {
- matptr = gon80mt;
- mat_xref = def_aa_xref;
- }
- else if (score_matnum==3)
- {
- matptr = gon120mt;
- mat_xref = def_aa_xref;
- }
- else if (score_matnum==4)
- {
- matptr = gon250mt;
- mat_xref = def_aa_xref;
- }
- else if (score_matnum==5)
- {
- matptr = gon350mt;
- mat_xref = def_aa_xref;
- }
- else
- {
- matptr = score_matrix;
- mat_xref = score_aa_xref;
- }
- maxres = get_matrix(matptr, mat_xref, matrix, FALSE, 100);
- if (maxres == 0)
- {
- error("matrix not found for aln score");
- return;
- }
-
- profile = (sint **) ckalloc( (data.ncols+2) * sizeof (sint *) );
- for(p=0; p<data.ncols; p++)
- profile[p] = (sint *) ckalloc( (max_aa+2) * sizeof(sint) );
- freq = (sint *) ckalloc( (max_aa+2) * sizeof (sint) );
-
- for(p=0;p<data.ncols;p++)
- {
- for(r=0;r<max_aa;r++)
- freq[r]=0;
- for(s=data.firstseq;s<data.firstseq+data.nseqs;s++)
- if(p<seqlen_array[s+1] && seq_array[s+1][p+1]>=0 && seq_array[s+1][p+1]<max_aa)
- {
- freq[seq_array[s+1][p+1]]++;
- }
- for(r=0;r<max_aa;r++)
- {
- profile[p][r]=0;
- for(r1=0;r1<max_aa;r1++)
- profile[p][r]+=freq[r1]*matrix[r1][r];
- profile[p][r]/=(float)data.nseqs;
- }
- }
-/*
-fprintf(fd,"Profile...\n");
-for(r=0;r<max_aa;r++)
-{
- for(p=0;p<data.ncols;p++)
- fprintf(fd,"%d\t",profile[p][r]);
- fprintf(fd,"\n");
-}
-*/
- seqvector = (sint *) ckalloc( (max_aa+2) * sizeof(sint) );
- seqdist=(float *)ckalloc((data.nseqs+1)*sizeof(float));
- sorteddist=(float *)ckalloc((data.nseqs+1)*sizeof(float));
-
- for(p=0; p<data.ncols; p++)
- {
- for(s=data.firstseq; s<data.firstseq+data.nseqs; s++)
- {
- if (p<seqlen_array[s+1])
- for (r=0;r<max_aa; r++)
- seqvector[r]=matrix[r][(int)seq_array[s+1][p+1]];
- else
- for (r=0;r<max_aa; r++)
- seqvector[r]=matrix[r][gap_pos1];
- seqdist[s-data.firstseq]=0.0;
- for(r=0;r<max_aa;r++)
- {
- diff=profile[p][r]-seqvector[r];
- diff/=1000.0;
- seqdist[s-data.firstseq]+=diff*diff;
- }
- seqdist[s-data.firstseq]=sqrt((double)seqdist[s-data.firstseq]);
- }
-/*
-fprintf(fd,"\n\nPosition %d:\n",p+1);
-fprintf(fd,"Sequence Distances...\n");
-for(s=0;s<data.nseqs;s++)
- fprintf(fd,"%.1f\t",seqdist[s]);
-*/
-/* calculate mean,median and rms of seq distances */
- mean=median=0.0;
- if(include_gaps)
- {
- for(s=0; s<data.nseqs; s++)
- mean+=seqdist[s];
- mean/=data.nseqs;
- n=data.nseqs;
- for(s=0; s<data.nseqs; s++)
- sorteddist[s]=seqdist[s];
- }
- else
- {
- n=0;
- for(s=data.firstseq; s<data.firstseq+data.nseqs; s++)
- if(p<seqlen_array[s+1] && seq_array[s+1][p+1]>=0 && seq_array[s+1][p+1]<max_aa)
- {
- mean+=seqdist[s-data.firstseq];
- n++;
- }
- if(n>0) mean/=n;
- for(s=data.firstseq,i=0; s<data.firstseq+data.nseqs; s++)
- if(p<seqlen_array[s+1] && seq_array[s+1][p+1]>=0 && seq_array[s+1][p+1]<max_aa)
- sorteddist[i++]=seqdist[s-data.firstseq];
- }
- sort_scores(sorteddist,0,n-1);
-/*
-fprintf(fd,"\nSorted:\n");
-for(s=0;s<n;s++)
- fprintf(fd,"%.1f ",sorteddist[s]);
-*/
-
- if(n == 0)
- median = 0;
- else if(n % 2 == 0)
- median=(sorteddist[n/2-1]+sorteddist[n/2])/2.0;
- else
- median=sorteddist[n/2];
- if(score_scale<=5)
- data.colscore[p]=exp((double)(-mean*(6-score_scale)/4.0))*100.0*n/data.nseqs;
- else
- data.colscore[p]=exp((double)(-mean/(4.0*(score_scale-4))))*100.0*n/data.nseqs;
-/*
-fprintf(fd,"\nMean %.1f Median %.1f Score %.1f\n",mean,median,data.colscore[p]);
-*/
- if(n==0)
- {
- ul=0;
- }
- else
- {
- t = n/4.0 + 0.5;
- if(t - (int)t == 0.5)
- {
- q3=(sorteddist[(int)t]+sorteddist[(int)t+1])/2.0;
- q1=(sorteddist[n-(int)t]+sorteddist[n-(int)t-1])/2.0;
- }
- else if(t - (int)t > 0.5)
- {
- q3=sorteddist[(int)t+1];
- q1=sorteddist[n-(int)t-1];
- }
- else
- {
- q3=sorteddist[(int)t];
- q1=sorteddist[n-(int)t];
- }
- if (n<4)ul=sorteddist[0];
- else ul=q3+(q3-q1)*((float)score_cutoff/2.0);
- }
-/*
-fprintf(fd,"\nMedian %.1f Q1 %.1f Q3 %.1f UL %.1f\n",median,q1,q3,ul);
-fprintf(fd,"\nExceptions: ");
-for(s=0;s<data.nseqs;s++)
- if(seqdist[s]>ul) fprintf(fd,"%d ",s+1);
-*/
- for(s=data.firstseq;s<data.firstseq+data.nseqs;s++)
- if(seqdist[s-data.firstseq]>ul && p<seqlen_array[s+1] && seq_array[s+1][p+1]>=0 && seq_array[s+1][p+1]<max_aa)
- data.residue_exception[s-data.firstseq][p]=TRUE;
- else
- data.residue_exception[s-data.firstseq][p]=FALSE;
- }
-/*
-fclose(fd);
-*/
- for(p=0;p<data.ncols;p++)
- ckfree(profile[p]);
- ckfree(profile);
- ckfree(freq);
- ckfree(seqvector);
- ckfree(seqdist);
- ckfree(sorteddist);
-
-
-}
-
-
-void sort_scores(float *scores,int f,int l)
-{
- int i,last;
-
- if(f>=l) return;
-
- swap(scores,f,(f+l)/2);
- last=f;
- for(i=f+1;i<=l;i++)
- {
- if(scores[i]>scores[f])
- swap(scores,++last,i);
- }
- swap(scores,f,last);
- sort_scores(scores,f,last-1);
- sort_scores(scores,last+1,l);
-
-}
-
-void swap(float *scores,int s1, int s2)
-{
- float temp;
-
- temp=scores[s1];
- scores[s1]=scores[s2];
- scores[s2]=temp;
-}
-
-
-void set_scorescale(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval)
-{
- char str[FILENAMELEN];
- panel_data data;
-
- score_scale = newval+1;
-
- calc_colscores(FALSE,TRUE);
-
- sprintf(str,"Score Plot Scale: %2d",score_scale);
- SetTitle(scorescaletext,str);
-}
-
-void set_scorecutoff(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval)
-{
- char str[FILENAMELEN];
- int temp;
- panel_data data;
- temp=newval+1;
-
- score_cutoff = temp;
-
- calc_colscores(residue_exceptions,FALSE);
- sprintf(str,"Residue Exception Cutoff: %2d",score_cutoff);
- SetTitle(residue_cutofftext,str);
-}
-
-
-
-
-void calc_segment_exceptions(IteM i)
-{
- WatchCursor();
- segment_exceptions=TRUE;
- calc_seg_exceptions();
- show_segment_exceptions();
- SetValue(show_seg_toggle,1);
- SetStatus(segment_item,segment_exceptions);
- ArrowCursor();
-}
-
-void set_lengthcutoff(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval)
-{
- char str[100];
- int temp;
-
- temp=newval+1;
-
- length_cutoff = temp;
- sprintf(str,"Minimum Length of Segments: %2d",length_cutoff);
-
- if(aln_mode==MULTIPLEM)
- {
- remove_short_segments(seq_panel.seqs);
- }
- else
- {
- remove_short_segments(prf_panel[0].seqs);
- remove_short_segments(prf_panel[1].seqs);
- }
- if(segment_exceptions) show_segment_exceptions();
- SetTitle(length_cutofftext,str);
-
-}
-
-
-void set_score_user_matrix(ButtoN but)
-{
-
- if(get_user_matrixname(score_mtrxname,score_matrix,score_aa_xref,6,&score_matnum,scoremattext))
- {
- calc_colscores(residue_exceptions,TRUE);
- SetValue(score_matrix_list,score_matnum);
- }
-}
-
-void set_score_matrix(GrouP g)
-{
- int tmp;
-
- tmp = GetValue(g);
- if(tmp>0 && tmp<6)
- {
- score_matnum=tmp;
- }
- else
- {
- if (score_mtrxname[0]=='\0')
- {
- get_user_matrixname(score_mtrxname,score_matrix,score_aa_xref,6,&score_matnum,scoremattext);
- }
- else score_matnum=6;
- }
- calc_colscores(residue_exceptions,TRUE);
-
- SetValue(score_matrix_list,score_matnum);
-}
-
-void set_segment_user_matrix(ButtoN but)
-{
-
- if(get_user_matrixname(segment_mtrxname,segment_matrix,segment_aa_xref,5,&segment_matnum,segmentmattext))
- {
- calc_seg_exceptions();
- if(segment_exceptions) show_segment_exceptions();
- SetValue(seg_matrix_list,segment_matnum);
- }
-}
-
-
-void set_segment_matrix(GrouP g)
-{
- int tmp;
-
- tmp = GetValue(g);
- if(tmp>0 && tmp<5)
- {
- segment_matnum=tmp;
- }
- else
- {
- if (segment_mtrxname[0]=='\0')
- {
- get_user_matrixname(segment_mtrxname,segment_matrix,segment_aa_xref,5,&segment_matnum,segmentmattext);
- }
- else segment_matnum=5;
- }
-
- calc_seg_exceptions();
- if(segment_exceptions) show_segment_exceptions();
-
- SetValue(seg_matrix_list,segment_matnum);
-}
-
-
-void set_score_user_dnamatrix(ButtoN but)
-{
-
- if(get_user_matrixname(score_dnamtrxname,score_dnamatrix,score_dna_xref,3,&score_dnamatnum,scorednamattext))
- {
- calc_colscores(residue_exceptions,TRUE);
- SetValue(score_dnamatrix_list,score_dnamatnum);
- }
-}
-
-void set_score_dnamatrix(GrouP g)
-{
- int tmp;
-
- tmp = GetValue(g);
- if(tmp>0 && tmp<3)
- {
- score_dnamatnum=tmp;
- }
- else
- {
- if (score_dnamtrxname[0]=='\0')
- {
- get_user_matrixname(score_dnamtrxname,score_dnamatrix,score_dna_xref,3,&score_dnamatnum,scorednamattext);
- }
- else score_dnamatnum=3;
- }
- calc_colscores(residue_exceptions,TRUE);
-
- SetValue(score_dnamatrix_list,score_dnamatnum);
-}
-
-void set_segment_user_dnamatrix(ButtoN but)
-{
-
- if(get_user_matrixname(segment_dnamtrxname,segment_dnamatrix,segment_dna_xref,3,&segment_dnamatnum,segmentdnamattext))
- calc_seg_exceptions();
- if(segment_exceptions) show_segment_exceptions();
-
- SetValue(seg_dnamatrix_list,segment_dnamatnum);
-}
-
-void set_segment_dnamatrix(GrouP g)
-{
- int tmp;
-
- tmp = GetValue(g);
- if(tmp>0 && tmp<3)
- {
- segment_dnamatnum=tmp;
- }
- else
- {
- if (segment_dnamtrxname[0]=='\0')
- {
- get_user_matrixname(segment_dnamtrxname,segment_dnamatrix,segment_dna_xref,3,&segment_dnamatnum,segmentdnamattext);
- }
- else segment_dnamatnum=3;
- }
-
- calc_seg_exceptions();
- if(segment_exceptions) show_segment_exceptions();
-
- SetValue(seg_dnamatrix_list,segment_dnamatnum);
-}
-
-static void calc_colscores(Boolean update_seqs,Boolean update_scores)
-{
- panel_data data;
-
- if(aln_mode==MULTIPLEM)
- {
- GetPanelExtra(seq_panel.seqs,&data);
- make_colscores(data);
- SetPanelExtra(seq_panel.seqs,&data);
- if(update_seqs) draw_seqs(seq_panel.seqs);
- if(update_scores) draw_colscores(seq_panel.seqs);
- }
- else
- {
- GetPanelExtra(prf_panel[0].seqs,&data);
- make_colscores(data);
- SetPanelExtra(prf_panel[0].seqs,&data);
- if(update_seqs) draw_seqs(prf_panel[0].seqs);
- if(update_scores) draw_colscores(prf_panel[0].seqs);
- GetPanelExtra(prf_panel[1].seqs,&data);
- make_colscores(data);
- SetPanelExtra(prf_panel[1].seqs,&data);
- if(update_seqs) draw_seqs(prf_panel[1].seqs);
- if(update_scores) draw_colscores(prf_panel[1].seqs);
- }
-}
-
-void calc_seg_exceptions(void)
-{
- if(aln_mode==MULTIPLEM)
- {
- calc_panel_segment_exceptions(seq_panel.seqs);
- }
- else
- {
- calc_panel_segment_exceptions(prf_panel[0].seqs);
- calc_panel_segment_exceptions(prf_panel[1].seqs);
- }
-}
-
-void show_segment_exceptions(void)
-{
- if(aln_mode==MULTIPLEM)
- {
- draw_seqs(seq_panel.seqs);
- }
- else
- {
- draw_seqs(prf_panel[0].seqs);
- draw_seqs(prf_panel[1].seqs);
- }
-}
-
-static void remove_short_segments(PaneL p)
-{
- int i,j,k,start;
- panel_data data;
-
- GetPanelExtra(p,&data);
- if(data.nseqs<=0) return;
-
-/* Reset all the exceptions - a value of 1 indicates an exception that
-will be displayed. A value of -1 is used to remember exceptions that
-are temporarily hidden in the display */
- for(i=0;i<data.nseqs;i++)
- for(j=0;j<data.ncols;j++)
- if(data.segment_exception[i][j] == -1)
- data.segment_exception[i][j] = 1;
-
- for(i=0;i<data.nseqs;i++)
- {
- start = -1;
- for(j=0;j<=data.ncols;j++)
- {
- if(start == -1)
- {
- if(data.segment_exception[i][j]==1)
- start=j;
- }
- else
- {
- if(j==data.ncols || data.segment_exception[i][j]==0)
- {
- if(j-start<length_cutoff)
- for(k=start;k<j;k++)
- data.segment_exception[i][k] = -1;
- start = -1;
- }
- }
-
- }
- }
-
- SetPanelExtra(p,&data);
-}
-
-static void calc_weights(PaneL p)
-{
- int i,j;
- int status;
- sint *weight;
- float dscore;
- FILE *tree;
- panel_data data;
-
-#ifdef UNIX
- char tree_name[FILENAMELEN]=".score.ph";
-#else
- char tree_name[FILENAMELEN]="tmp.ph";
-#endif
-
- GetPanelExtra(p,&data);
- if(data.nseqs<=0) return;
-
-/* if sequence weights have been calculated before - don't bother
-doing it again (it takes too long). data.seqweight is set to NULL when
- new sequences are loaded. */
- if(data.seqweight!=NULL) return;
-
- WatchCursor();
- info("Calculating sequence weights...");
-/* count pairwise percent identities to make a phylogenetic tree */
- if(data.nseqs>=2)
- {
- for (i=1;i<=data.nseqs;i++) {
- for (j=i+1;j<=data.nseqs;j++) {
- dscore = countid(i+data.firstseq,j+data.firstseq);
- tmat[i][j] = (100.0 - dscore)/100.0;
- tmat[j][i] = tmat[i][j];
- }
- }
-
- if((tree = open_explicit_file(tree_name))==NULL) return;
-
- guide_tree(tree,data.firstseq+1,data.nseqs);
-
- status = read_tree(tree_name, data.firstseq, data.firstseq+data.nseqs);
- if (status == 0) return;
-
- }
-
- weight = (sint *) ckalloc( (data.firstseq+data.nseqs+1) * sizeof(sint) );
-/* get the sequence weights */
- calc_seq_weights(data.firstseq, data.firstseq+data.nseqs,weight);
- if(data.seqweight==NULL) data.seqweight=(sint *)ckalloc((data.nseqs+1) * sizeof(sint));
- for(i=data.firstseq;i<data.firstseq+data.nseqs;i++)
- data.seqweight[i-data.firstseq]=weight[i];
-
-/* clear the memory for the phylogenetic tree */
- if (data.nseqs >= 2)
- {
- clear_tree(NULL);
- remove(tree_name);
- }
- ckfree(weight);
- SetPanelExtra(p,&data);
- info("Done.");
- ArrowCursor();
-}
-
-static void calc_panel_segment_exceptions(PaneL p)
-{
- int i,j;
- float sum,prev_sum;
- float gscale;
- sint **profile;
- sint *weight,sweight;
- sint *gaps;
- sint maxres;
- int max=0,offset;
- short *mat_xref, *matptr;
- sint matrix[NUMRES][NUMRES];
- float *fsum;
- float *bsum;
- float *pscore;
- panel_data data;
-
-/* First, calculate sequence weights which will be used to build the
-profile */
- calc_weights(p);
-
- GetPanelExtra(p,&data);
- if(data.nseqs<=0) return;
-
- WatchCursor();
- info("Calculating profile scores...");
-
- for(i=0;i<data.nseqs;i++)
- for(j=0;j<data.ncols;j++)
- data.segment_exception[i][j]=0;
-
-/* get the comparison matrix for building the profile */
- if(dnaflag)
- {
- if (segment_dnamatnum==1)
- {
- matptr = swgapdnamt;
- mat_xref = def_dna_xref;
- }
- else if (segment_dnamatnum==2)
- {
- matptr = clustalvdnamt;
- mat_xref = def_dna_xref;
- }
- else
- {
- matptr = segment_dnamatrix;
- mat_xref = segment_dna_xref;
- }
-/* get a positive matrix - then adjust it according to scale */
- maxres = get_matrix(matptr, mat_xref, matrix, FALSE, 100);
-/* find the maximum value */
- for(i=0;i<=max_aa;i++)
- for(j=0;j<=max_aa;j++)
- if(matrix[i][j]>max) max=matrix[i][j];
-/* subtract max*scale/2 from each matrix value */
- offset=(float)(max*segment_dnascale)/20.0;
-
- for(i=0;i<=max_aa;i++)
- for(j=0;j<=max_aa;j++)
- matrix[i][j]-=offset;
- }
- else
- {
- if (segment_matnum==1)
- {
- matptr = gon80mt;
- mat_xref = def_aa_xref;
- }
- else if (segment_matnum==2)
- {
- matptr = gon120mt;
- mat_xref = def_aa_xref;
- }
- else if (segment_matnum==3)
- {
- matptr = gon250mt;
- mat_xref = def_aa_xref;
- }
- else if (segment_matnum==4)
- {
- matptr = gon350mt;
- mat_xref = def_aa_xref;
- }
- else
- {
- matptr = segment_matrix;
- mat_xref = segment_aa_xref;
- }
-/* get a negative matrix */
- maxres = get_matrix(matptr, mat_xref, matrix, TRUE, 100);
- }
-
- profile = (sint **) ckalloc( (data.ncols+2) * sizeof (sint *) );
- for(i=0; i<data.ncols+1; i++)
- profile[i] = (sint *) ckalloc( (LENCOL+2) * sizeof(sint) );
-
-/* calculate the profile */
- gaps = (sint *) ckalloc( (data.ncols+1) * sizeof (sint) );
- for (j=1; j<=data.ncols; j++)
- {
- gaps[j-1] = 0;
- for(i=data.firstseq+1;i<data.firstseq+data.nseqs;i++)
- if (j<seqlen_array[i])
- if ((seq_array[i][j] < 0) || (seq_array[i][j] > max_aa))
- gaps[j-1]++;
- }
- weight = (sint *) ckalloc( (data.firstseq+data.nseqs+1) * sizeof(sint) );
- for(i=data.firstseq;i<data.firstseq+data.nseqs;i++)
- weight[i]=data.seqweight[i-data.firstseq];
-
- build_profile(data.ncols,data.firstseq,data.firstseq+data.nseqs,matrix,weight,profile);
-
- sweight=0;
- for(i=data.firstseq;i<data.firstseq+data.nseqs;i++)
- sweight+=weight[i];
-
-/*Now, use the profile scores to mark segments of each sequence which score
-badly. */
-
- fsum = (float *) ckalloc( (data.ncols+2) * sizeof (float) );
- bsum = (float *) ckalloc( (data.ncols+2) * sizeof (float) );
- pscore = (float *) ckalloc( (data.ncols+2) * sizeof (float) );
- for(i=data.firstseq+1;i<data.firstseq+data.nseqs+1;i++)
- {
-/* In a forward phase, sum the profile scores. Mark negative sums as exceptions.
-If the sum is positive, then it gets reset to 0. */
- sum=0.0;
- for(j=1;j<=seqlen_array[i];j++)
- {
- gscale = (float)(data.nseqs-gaps[j-1]) / (float)data.nseqs;
- if(seq_array[i][j]<0 || seq_array[i][j]>=max_aa)
- {
- pscore[j-1]=0.0;
- sum=0.0;
- }
- else
- pscore[j-1]=(profile[j][seq_array[i][j]]-
- weight[i-1]*matrix[seq_array[i][j]][seq_array[i][j]])*gscale/sweight;
- sum+=pscore[j-1];
- if(sum>0.0) sum=0.0;
- fsum[j-1]=sum;
- }
-/* trim off any positive scoring residues from the end of the segments */
- prev_sum=0;
- for(j=seqlen_array[i]-1;j>=0;j--)
- {
- if(prev_sum>=0.0 && fsum[j]<0.0 && pscore[j]>=0.0)
- fsum[j]=0.0;
- prev_sum=fsum[j];
- }
-
-/* Now, in a backward phase, do the same summing process. */
- sum=0.0;
- for(j=seqlen_array[i];j>=1;j--)
- {
- if(seq_array[i][j]<0 || seq_array[i][j]>=max_aa)
- sum=0;
- else
- sum+=pscore[j-1];
- if(sum>0.0) sum=0.0;
- bsum[j-1]=sum;
- }
-/* trim off any positive scoring residues from the start of the segments */
- prev_sum=0;
- for(j=0;j<seqlen_array[i];j++)
- {
- if(prev_sum>=0.0 && bsum[j]<0.0 && pscore[j]>=0.0)
- bsum[j]=0.0;
- prev_sum=bsum[j];
- }
-/*Mark residues as exceptions if they score negative in the forward AND backward directions. */
- for(j=1;j<=seqlen_array[i];j++)
- if(fsum[j-1]<0.0 && bsum[j-1]<0.0)
- if(seq_array[i][j]>=0 && seq_array[i][j]<max_aa)
- data.segment_exception[i-data.firstseq-1][j-1]=-1;
-/*
-if(i==5) {
-fprintf(stderr,"%4d ",j);
-fprintf(stderr,"\n");
-for(j=0;j<seqlen_array[i];j++)
-fprintf(stderr,"%4d ",(int)fsum[j]);
-fprintf(stderr,"\n");
-}
-*/
- }
- for(i=0; i<data.ncols+1; i++)
- ckfree(profile[i]);
- ckfree(profile);
- ckfree(weight);
- ckfree(gaps);
- ckfree(pscore);
- ckfree(fsum);
- ckfree(bsum);
-
- SetPanelExtra(p,&data);
-
-/* Finally, apply the length cutoff to the segments - removing segments shorter
-than the cutoff */
- remove_short_segments(p);
-
- info("Done.");
- ArrowCursor();
-}
-
-
-void set_segment_dnascale(BaR bar, GraphiC p, Nlm_Int2 newval, Nlm_Int2 oldval)
-{
- char str[FILENAMELEN];
- panel_data data;
-
- segment_dnascale = newval+1;
- calc_seg_exceptions();
- if(segment_exceptions) show_segment_exceptions();
- sprintf(str,"DNA Marking Scale: %2d",segment_dnascale);
- SetTitle(segmentdnascaletext,str);
-}
-
-
-static void build_profile(int prf_length,int first_seq,int last_seq,sint matrix[NUMRES][NUMRES],sint *weight,sint **profile)
-{
- sint **weighting, d, i, res;
- sint r, pos;
- int f;
-
- weighting = (sint **) ckalloc( (NUMRES+2) * sizeof (sint *) );
- for (i=0;i<NUMRES+2;i++)
- weighting[i] = (sint *) ckalloc( (prf_length+2) * sizeof (sint) );
-
- for (r=0; r<prf_length; r++)
- {
- for (d=0; d<=max_aa; d++)
- {
- weighting[d][r] = 0;
- for (i=first_seq; i<last_seq; i++)
- if (r+1<seqlen_array[i+1])
- if (d == seq_array[i+1][r+1]) weighting[d][r] += weight[i];
- }
- weighting[gap_pos1][r] = 0;
- for (i=first_seq; i<last_seq; i++)
- if (r+1<seqlen_array[i+1])
- if (gap_pos1 == seq_array[i+1][r+1]) weighting[gap_pos1][r] += weight[i];
- weighting[gap_pos2][r] = 0;
- for (i=first_seq; i<last_seq; i++)
- if (r+1<seqlen_array[i+1])
- if (gap_pos2 == seq_array[i+1][r+1]) weighting[gap_pos2][r] += weight[i];
- }
-
- for (pos=0; pos< prf_length; pos++)
- {
- for (res=0; res<=max_aa; res++)
- {
- f = 0;
- for (d=0; d<=max_aa; d++)
- f += (weighting[d][pos] * matrix[d][res]);
- f += (weighting[gap_pos1][pos] * matrix[gap_pos1][res]);
- f += (weighting[gap_pos2][pos] * matrix[gap_pos2][res]);
- profile[pos+1][res] = f;
- }
- f = 0;
- for (d=0; d<=max_aa; d++)
- f += (weighting[d][pos] * matrix[d][gap_pos1]);
- f += (weighting[gap_pos1][pos] * matrix[gap_pos1][gap_pos1]);
- f += (weighting[gap_pos2][pos] * matrix[gap_pos2][gap_pos1]);
- profile[pos+1][gap_pos1] = f;
- f = 0;
- for (d=0; d<=max_aa; d++)
- f += (weighting[d][pos] * matrix[d][gap_pos2]);
- f += (weighting[gap_pos1][pos] * matrix[gap_pos1][gap_pos2]);
- f += (weighting[gap_pos2][pos] * matrix[gap_pos2][gap_pos2]);
- profile[pos+1][gap_pos2] = f;
- }
-
- for (i=0;i<=max_aa;i++)
- weighting[i]=ckfree((void *)weighting[i]);
- weighting=ckfree((void *)weighting);
-
-}
-
-
Deleted: trunk/packages/clustalw/trunk/xutils.c
===================================================================
--- trunk/packages/clustalw/trunk/xutils.c 2007-08-12 13:21:13 UTC (rev 398)
+++ trunk/packages/clustalw/trunk/xutils.c 2007-08-12 15:08:40 UTC (rev 399)
@@ -1,1340 +0,0 @@
-#include <stdio.h>
-#include <stdarg.h>
-#include <string.h>
-
-#include <vibrant.h>
-
-#include "clustalw.h"
-#include "xmenu.h"
-
-char fontbuf[80];
-FonT tmpFont=NULL;
-static void VSeqMgrFontProc ();
-
-static int get_series_matrixname(char *usermtrxname, short *usermat,short *aa_xref,int usermatnum,int *matnum,PrompT mattext);
-
-extern Boolean x_menus;
-extern WindoW mainw;
-extern GrouP matrix_list,pw_matrix_list;
-extern GrouP dnamatrix_list,pw_dnamatrix_list;
-extern GrouP seg_matrix_list,seg_dnamatrix_list;
-extern GrouP score_matrix_list,score_dnamatrix_list;
-extern Boolean interactive;
-extern Boolean dnaflag;
-extern char hyd_residues[];
-extern sint gap_dist;
-extern Boolean no_var_penalties, no_hyd_penalties, no_pref_penalties;
-extern Boolean use_endgaps;
-extern Boolean realign_endgappenalties;
-extern Boolean align_endgappenalties;
-extern sint divergence_cutoff;
-extern Boolean lowercase; /* Flag for GDE output - set on comm. line*/
-extern Boolean cl_seq_numbers;
-
-extern Boolean seqRange;
-
-extern sint output_order;
-extern Boolean save_log;
-extern Boolean quick_pairalign;
-extern Boolean neg_matrix;
-extern Boolean output_clustal, output_nbrf, output_phylip, output_gcg, output_gde, output_nexus;
-extern Boolean output_fasta;
-extern Boolean save_parameters;
-extern Boolean output_tree_clustal, output_tree_phylip, output_tree_distances, output_tree_nexus, output_pim;
-extern char seqname[];
-extern float transition_weight;
-extern float gap_open, gap_extend;
-extern float dna_gap_open, dna_gap_extend;
-extern float prot_gap_open, prot_gap_extend;
-extern float pw_go_penalty, pw_ge_penalty;
-extern float dna_pw_go_penalty, dna_pw_ge_penalty;
-extern float prot_pw_go_penalty, prot_pw_ge_penalty;
-extern sint wind_gap,ktup,window,signif;
-extern sint dna_wind_gap, dna_ktup, dna_window, dna_signif;
-extern sint prot_wind_gap,prot_ktup,prot_window,prot_signif;
-extern Boolean tossgaps; /* Ignore places in align. where ANY seq. has a gap*/
-extern Boolean kimura; /* Use correction for multiple substitutions */
-extern sint boot_ntrials; /* number of bootstrap trials */
-extern unsigned sint boot_ran_seed; /* random number generator seed */
-extern sint bootstrap_format;
-extern sint struct_penalties,struct_penalties1,struct_penalties2;
-extern sint output_struct_penalties;
-extern sint profile1_nseqs;
-extern sint nseqs;
-extern Boolean use_ss1, use_ss2;
-extern int inverted;
-extern char mtrxname[], pw_mtrxname[];
-extern char usermtrxname[], pw_usermtrxname[];
-extern sint matnum,pw_matnum;
-extern short usermat[], pw_usermat[];
-extern short aa_xref[], pw_aa_xref[];
-extern char dnamtrxname[], pw_dnamtrxname[];
-extern char dnausermtrxname[], pw_dnausermtrxname[];
-extern sint dnamatnum,pw_dnamatnum;
-extern short userdnamat[], pw_userdnamat[];
-extern short dna_xref[], pw_dna_xref[];
-extern Boolean use_ambiguities;
-
-extern MatMenu matrix_menu;
-extern MatMenu dnamatrix_menu;
-extern MatMenu pw_matrix_menu;
-
-extern sint helix_penalty;
-extern sint strand_penalty;
-extern sint loop_penalty;
-extern sint helix_end_minus;
-extern sint helix_end_plus;
-extern sint strand_end_minus;
-extern sint strand_end_plus;
-extern sint helix_end_penalty;
-extern sint strand_end_penalty;
-
-extern TexT savealntext;
-extern GrouP slow_para,fast_para;
-
-extern PrompT message; /* used in temporary message window */
-extern Boolean mess_output;
-extern FILE *save_log_fd;
-extern color color_lut[];
-extern spanel seq_panel; /* data for multiple alignment area */
-extern spanel prf_panel[]; /* data for profile alignment areas */
-extern Boolean aln_mode;
-extern Boolean fixed_prf_scroll;
-extern Boolean output_ss;
-extern Boolean output_gp;
-extern PrompT mattext,pwmattext;
-extern PrompT dnamattext,pwdnamattext;
-extern int save_format;
-extern Boolean residue_exceptions;
-extern Boolean segment_exceptions;
-extern int font_size;
-extern FonT datafont;
-extern int av_font[];
-extern TexT blocklentext;
-extern IteM segment_item;
-
-extern int pagesize;
-extern int orientation;
-extern Boolean ps_ruler,ps_header,resize,ps_curve,ps_resno;
-extern int first_printres,last_printres,blocklen;
-extern int firstres,lastres;
-
-/* Reads the text of t, converts it with atof() and, unless the value is
-* below 0 or above 100, stores it in gap_open and in dna_gap_open or
-* prot_gap_open according to dnaflag. */
-void set_go_penalty(TexT t)
-{
-    char field[10];
-    float penalty;
-
-    GetTitle(t,field,10);
-    penalty = atof(field);
-    if (penalty < 0 || penalty > 100)
-        return;
-
-    gap_open = penalty;
-    if (!dnaflag)
-        prot_gap_open = gap_open;
-    else
-        dna_gap_open = gap_open;
-}
-
-/* Reads the text of t, converts it with atof() and, unless the value is
-* below 0 or above 100, stores it in gap_extend and in dna_gap_extend or
-* prot_gap_extend according to dnaflag. */
-void set_ge_penalty(TexT t)
-{
-    char field[10];
-    float penalty;
-
-    GetTitle(t,field,10);
-    penalty = atof(field);
-    if (penalty < 0 || penalty > 100)
-        return;
-
-    gap_extend = penalty;
-    if (!dnaflag)
-        prot_gap_extend = gap_extend;
-    else
-        dna_gap_extend = gap_extend;
-}
-
-/* Reads the text of t as an integer and stores it in gap_dist when it
-* lies in 0..100; any other value is ignored. */
-void set_gap_dist(TexT t)
-{
-    char field[10];
-    int value;
-
-    GetTitle(t,field,10);
-    value = atoi(field);
-    if (value >= 0 && value <= 100)
-        gap_dist = value;
-}
-
-/* Reads the text of t as an integer and stores it in boot_ntrials when it
-* lies in 0..10000; any other value is ignored.  The text buffer is a local
-* array, so it needs no NULL test. */
-void set_ntrials(TexT t)
-{
-    char str[10];
-    int temp;
-
-    GetTitle(t,str,10);
-    temp = atoi(str);
-    if (temp < 0 || temp > 10000)
-        return;
-    boot_ntrials = temp;
-}
-
-/* Reads the text of t as an integer and stores it in boot_ran_seed when it
-* lies in 0..1000; any other value is ignored. */
-void set_ran_seed(TexT t)
-{
-    char field[10];
-    int seed;
-
-    GetTitle(t,field,10);
-    seed = atoi(field);
-    if (seed >= 0 && seed <= 1000)
-        boot_ran_seed = seed;
-}
-
-/* Reads the text of t as an integer and stores it in divergence_cutoff
-* when it lies in 0..100; any other value is ignored. */
-void set_div_seq(TexT t)
-{
-    char field[10];
-    int cutoff;
-
-    GetTitle(t,field,10);
-    cutoff = atoi(field);
-    if (cutoff >= 0 && cutoff <= 100)
-        divergence_cutoff = cutoff;
-}
-
-/* Reads the text of t, converts it with atof() and, unless the value is
-* below 0 or above 100, stores it in pw_go_penalty and in dna_pw_go_penalty
-* or prot_pw_go_penalty according to dnaflag. */
-void set_pw_go_penalty(TexT t)
-{
-    char field[10];
-    float penalty;
-
-    GetTitle(t,field,10);
-    penalty = atof(field);
-    if (penalty < 0 || penalty > 100)
-        return;
-
-    pw_go_penalty = penalty;
-    if (!dnaflag)
-        prot_pw_go_penalty = pw_go_penalty;
-    else
-        dna_pw_go_penalty = pw_go_penalty;
-}
-
-/* Reads the text of t, converts it with atof() and, unless the value is
-* below 0 or above 100, stores it in pw_ge_penalty and in dna_pw_ge_penalty
-* or prot_pw_ge_penalty according to dnaflag. */
-void set_pw_ge_penalty(TexT t)
-{
-    char field[10];
-    float penalty;
-
-    GetTitle(t,field,10);
-    penalty = atof(field);
-    if (penalty < 0 || penalty > 100)
-        return;
-
-    pw_ge_penalty = penalty;
-    if (!dnaflag)
-        prot_pw_ge_penalty = pw_ge_penalty;
-    else
-        dna_pw_ge_penalty = pw_ge_penalty;
-}
-
-/* Reads the text of t as an integer; a value in 0..100 is stored in
-* wind_gap and in dna_wind_gap or prot_wind_gap according to dnaflag.
-* Any other value is ignored. */
-void set_gp(TexT t)
-{
-    char field[10];
-    int value;
-
-    GetTitle(t,field,10);
-    value = atoi(field);
-    if (value >= 0 && value <= 100)
-    {
-        wind_gap = value;
-        if (!dnaflag)
-            prot_wind_gap = wind_gap;
-        else
-            dna_wind_gap = wind_gap;
-    }
-}
-
-/* Reads the text of t as an integer; a value in 0..100 is stored in ktup
-* and in dna_ktup or prot_ktup according to dnaflag.  Any other value is
-* ignored. */
-void set_ktuple(TexT t)
-{
-    char field[10];
-    int value;
-
-    GetTitle(t,field,10);
-    value = atoi(field);
-    if (value >= 0 && value <= 100)
-    {
-        ktup = value;
-        if (!dnaflag)
-            prot_ktup = ktup;
-        else
-            dna_ktup = ktup;
-    }
-}
-
-/* Reads the text of t as an integer; a value in 0..100 is stored in
-* signif and in dna_signif or prot_signif according to dnaflag.  Any other
-* value is ignored. */
-void set_topdiags(TexT t)
-{
-    char field[10];
-    int value;
-
-    GetTitle(t,field,10);
-    value = atoi(field);
-    if (value >= 0 && value <= 100)
-    {
-        signif = value;
-        if (!dnaflag)
-            prot_signif = signif;
-        else
-            dna_signif = signif;
-    }
-}
-
-/* Reads the text of t as an integer; a value in 0..100 is stored in
-* window and in dna_window or prot_window according to dnaflag.  Any other
-* value is ignored. */
-void set_window(TexT t)
-{
-    char field[10];
-    int value;
-
-    GetTitle(t,field,10);
-    value = atoi(field);
-    if (value >= 0 && value <= 100)
-    {
-        window = value;
-        if (!dnaflag)
-            prot_window = window;
-        else
-            dna_window = window;
-    }
-}
-
-/* Replaces hyd_residues with the alphabetic characters found in the text
-* of t.
-*
-* At most strlen(hyd_residues) input characters are examined, so the new
-* string is never longer than the one it replaces.  The allocated size of
-* hyd_residues is not visible here, so its current length is kept as the
-* write limit.  Scanning stops at the end of the entered text; bytes of
-* tstr beyond the terminator are never inspected. */
-void set_hyd_res(TexT t)
-{
-    size_t i, j = 0;
-    size_t limit = strlen(hyd_residues);
-    char tstr[27] = "";
-
-    GetTitle(t,tstr,27);
-    for (i=0; i<limit && i<sizeof tstr && tstr[i]!='\0'; i++)
-    {
-        /* ctype functions need an unsigned char value */
-        if (isalpha((unsigned char)tstr[i]))
-            hyd_residues[j++] = tstr[i];
-    }
-    hyd_residues[j]='\0';
-}
-
-/* Stores TRUE in *value when GetStatus(l) equals TRUE, FALSE otherwise. */
-void set_button(ButtoN l,Boolean *value)
-{
-    int state = GetStatus(l);
-
-    *value = (state == TRUE) ? TRUE : FALSE;
-}
-
-/* Stores TRUE in *value when GetValue(l) is 1, FALSE for any other value. */
-void set_toggle(PopuP l,Boolean *value)
-{
-    int choice = GetValue(l);
-
-    *value = (choice == 1) ? TRUE : FALSE;
-}
-
-void set_pref_penalties(PopuP l) /* no_pref_penalties = TRUE when GetValue(l) is 1, else FALSE (via set_toggle) */
-{
- set_toggle(l,&no_pref_penalties);
-}
-
-void set_hyd_penalties(PopuP l) /* no_hyd_penalties = TRUE when GetValue(l) is 1, else FALSE (via set_toggle) */
-{
- set_toggle(l,&no_hyd_penalties);
-}
-void set_var_penalties(PopuP l) /* no_var_penalties = TRUE when GetValue(l) is 1, else FALSE (via set_toggle) */
-{
- set_toggle(l,&no_var_penalties);
-}
-void set_endgaps(PopuP l) /* use_endgaps = TRUE when GetValue(l) is 1, else FALSE (via set_toggle) */
-{
- set_toggle(l,&use_endgaps);
-}
-void set_align_endgappenalties(PopuP l) /* align_endgappenalties = TRUE when GetValue(l) is 1, else FALSE (via set_toggle) */
-{
- set_toggle(l,&align_endgappenalties);
-}
-void set_realign_endgappenalties(PopuP l) /* realign_endgappenalties = TRUE when GetValue(l) is 1, else FALSE (via set_toggle) */
-{
- set_toggle(l,&realign_endgappenalties);
-}
-void set_case(PopuP l) /* lowercase = TRUE when GetValue(l) is 1, else FALSE (via set_toggle) */
-{
- set_toggle(l,&lowercase);
-}
-void set_snos(PopuP l) /* cl_seq_numbers = TRUE when GetValue(l) is 1, else FALSE (via set_toggle) */
-{
- set_toggle(l,&cl_seq_numbers);
-}
-
-
-void setRange(PopuP l) /* seqRange = TRUE when GetValue(l) is 1, else FALSE (via set_toggle) */
-{
- set_toggle(l, &seqRange);
-}
-
-void set_save_paras(PopuP l) /* save_parameters = TRUE when GetValue(l) is 1, else FALSE (via set_toggle) */
-{
- set_toggle(l,&save_parameters);
-}
-/* Reads the text of t, converts it with atof() and stores it in
-* transition_weight unless the value is below 0 or above 100. */
-void set_transitions(TexT t)
-{
-    char field[10];
-    float weight;
-
-    GetTitle(t,field,10);
-    weight = atof(field);
-    if (!(weight < 0 || weight > 100))
-        transition_weight = weight;
-}
-
-void set_ambiguities(PopuP l) /* use_ambiguities = TRUE when GetValue(l) is 1, else FALSE (via set_toggle) */
-{
- set_toggle(l,&use_ambiguities);
-}
-
-void set_neg_matrix(PopuP l) /* neg_matrix = TRUE when GetValue(l) is 1, else FALSE (via set_toggle) */
-{
- set_toggle(l,&neg_matrix);
-}
-
-void set_output_nbrf(ButtoN l) /* output_nbrf = TRUE when GetStatus(l) is TRUE, else FALSE (via set_button) */
-{
- set_button(l,&output_nbrf);
-}
-void set_output_phylip(ButtoN l) /* output_phylip = TRUE when GetStatus(l) is TRUE, else FALSE (via set_button) */
-{
- set_button(l,&output_phylip);
-}
-void set_output_gcg(ButtoN l) /* output_gcg = TRUE when GetStatus(l) is TRUE, else FALSE (via set_button) */
-{
- set_button(l,&output_gcg);
-}
-
-/* Sets output_order to INPUT when GetValue(g) is 1, to ALIGNED otherwise. */
-void set_output_order(PopuP g)
-{
-    output_order = (GetValue(g) == 1) ? INPUT : ALIGNED;
-}
-
-/* Maps GetValue(g) to pagesize (1 -> A4, 2 -> A3, anything else ->
-* USLETTER), recomputes blocklen from pagesize and the current orientation,
-* and writes the new blocklen into the blocklentext field. */
-void set_pagesize(PopuP g)
-{
-    char digits[10];
-    int landscape;
-
-    switch (GetValue(g))
-    {
-    case 1:
-        pagesize = A4;
-        break;
-    case 2:
-        pagesize = A3;
-        break;
-    default:
-        pagesize = USLETTER;
-        break;
-    }
-
-    landscape = (orientation == LANDSCAPE);
-    if (pagesize == A4)
-        blocklen = landscape ? 150 : 80;
-    else if (pagesize == A3)
-        blocklen = landscape ? 250 : 150;
-    else
-        blocklen = 150;
-
-    sprintf(digits,"%d",blocklen);
-    SetTitle(blocklentext,digits);
-}
-/* Sets orientation to LANDSCAPE when GetValue(g) is 1 and to PORTRAIT
-* otherwise, recomputes blocklen from the current pagesize and the new
-* orientation, and writes the new blocklen into the blocklentext field. */
-void set_orientation(PopuP g)
-{
-    char digits[10];
-    int landscape;
-
-    orientation = (GetValue(g) == 1) ? LANDSCAPE : PORTRAIT;
-
-    landscape = (orientation == LANDSCAPE);
-    if (pagesize == A4)
-        blocklen = landscape ? 150 : 80;
-    else if (pagesize == A3)
-        blocklen = landscape ? 250 : 150;
-    else
-        blocklen = 150;
-
-    sprintf(digits,"%d",blocklen);
-    SetTitle(blocklentext,digits);
-}
-void set_resno(PopuP l) /* ps_resno = TRUE when GetValue(l) is 1, else FALSE (via set_toggle) */
-{
- set_toggle(l,&ps_resno);
-}
-void set_curve(PopuP l) /* ps_curve = TRUE when GetValue(l) is 1, else FALSE (via set_toggle) */
-{
- set_toggle(l,&ps_curve);
-}
-void set_ruler(PopuP l) /* ps_ruler = TRUE when GetValue(l) is 1, else FALSE (via set_toggle) */
-{
- set_toggle(l,&ps_ruler);
-}
-void set_header(PopuP l) /* ps_header = TRUE when GetValue(l) is 1, else FALSE (via set_toggle) */
-{
- set_toggle(l,&ps_header);
-}
-void set_resize(PopuP l) /* resize = TRUE when GetValue(l) is 1, else FALSE (via set_toggle) */
-{
- set_toggle(l,&resize);
-}
-/* Reads the text of t as an integer and stores it in firstres when it
-* lies in 0..100000; any other value is ignored.  The text buffer is a
-* local array, so it needs no NULL test. */
-void set_fres(TexT t)
-{
-    char str[10];
-    int temp;
-
-    GetTitle(t,str,10);
-    temp = atoi(str);
-    if (temp < 0 || temp > 100000)
-        return;
-    firstres = temp;
-}
-/* Reads the text of t as an integer and stores it in lastres when it lies
-* in 0..100000; any other value is ignored.  The text buffer is a local
-* array, so it needs no NULL test. */
-void set_lres(TexT t)
-{
-    char str[10];
-    int temp;
-
-    GetTitle(t,str,10);
-    temp = atoi(str);
-    if (temp < 0 || temp > 100000)
-        return;
-    lastres = temp;
-}
-/* Reads the text of t as an integer and stores it in first_printres when
-* it lies in 0..10000; any other value is ignored.  The text buffer is a
-* local array, so it needs no NULL test. */
-void set_fpres(TexT t)
-{
-    char str[10];
-    int temp;
-
-    GetTitle(t,str,10);
-    temp = atoi(str);
-    if (temp < 0 || temp > 10000)
-        return;
-    first_printres = temp;
-}
-/* Reads the text of t as an integer and stores it in last_printres when
-* it lies in 0..10000; any other value is ignored.  The text buffer is a
-* local array, so it needs no NULL test. */
-void set_lpres(TexT t)
-{
-    char str[10];
-    int temp;
-
-    GetTitle(t,str,10);
-    temp = atoi(str);
-    if (temp < 0 || temp > 10000)
-        return;
-    last_printres = temp;
-}
-/* Reads the text of t as an integer and stores it in blocklen when it
-* lies in 0..10000; any other value is ignored.  The text buffer is a
-* local array, so it needs no NULL test. */
-void set_blocklen(TexT t)
-{
-    char str[10];
-    int temp;
-
-    GetTitle(t,str,10);
-    temp = atoi(str);
-    if (temp < 0 || temp > 10000)
-        return;
-    blocklen = temp;
-}
-
-void set_output_tree_nexus(ButtoN l) /* output_tree_nexus = TRUE when GetStatus(l) is TRUE, else FALSE (via set_button) */
-{
- set_button(l,&output_tree_nexus);
-}
-
-
-void set_output_pim(ButtoN l) /* output_pim = TRUE when GetStatus(l) is TRUE, else FALSE (via set_button) */
-{
- set_button(l,&output_pim);
-}
-
-void set_output_tree_clustal(ButtoN l) /* output_tree_clustal = TRUE when GetStatus(l) is TRUE, else FALSE (via set_button) */
-{
- set_button(l,&output_tree_clustal);
-}
-void set_output_tree_phylip(ButtoN l) /* output_tree_phylip = TRUE when GetStatus(l) is TRUE, else FALSE (via set_button) */
-{
- set_button(l,&output_tree_phylip);
-}
-void set_output_tree_distances(ButtoN l) /* output_tree_distances = TRUE when GetStatus(l) is TRUE, else FALSE (via set_button) */
-{
- set_button(l,&output_tree_distances);
-}
-void set_tossgaps(IteM i) /* copies the status of menu item i into tossgaps */
-{
- tossgaps=GetStatus(i);
-}
-void set_kimura(IteM i) /* copies the status of menu item i into kimura */
-{
- kimura=GetStatus(i);
-}
-/* Sets bootstrap_format to BS_NODE_LABELS when GetValue(g) is 1 and to
-* BS_BRANCH_LABELS otherwise. */
-void set_boot_format(PopuP g)
-{
-    bootstrap_format = (GetValue(g) == 1) ? BS_NODE_LABELS : BS_BRANCH_LABELS;
-}
-
-/*
-* prompt_for_yes_no()
-*
-* Shows "<title>.\n<prompt>?" through Message(MSG_YN, ...) and returns 'n'
-* when the answer is ANS_NO, 'y' for any other answer.
-*
-* When x_menus is not set no dialog is shown and 'y' is returned, the same
-* value the function yields for every reply other than ANS_NO.
-* NOTE(review): this path previously returned without a value; confirm
-* that 'y' is the default callers want.
-*
-* The message is built with snprintf(), so an over-long title/prompt is
-* truncated instead of overrunning the buffer.
-*/
-char prompt_for_yes_no(char *title,char *prompt)
-{
-    char lin2[MESSLENGTH*MESSLINES];
-
-    if(!x_menus) return('y');
-
-    snprintf(lin2,sizeof lin2,"%s.\n%s?",title,prompt);
-    if (Message(MSG_YN,lin2)==ANS_NO)
-        return('n');
-    else
-        return('y');
-
-}
-
-
-/*
-* fatal()
-*
-* Formats msg and its variadic arguments, prefixes the result with
-* "FATAL ERROR: " and passes it to Message() with MSG_FATAL.
-* (Presumably Message() ends the program for MSG_FATAL; that is not
-* visible in this file.)
-*
-* The formatted text is limited to the size of the local buffer; longer
-* output is truncated.
-*
-* Return values:
-* none
-*/
-
-void fatal( char *msg,...)
-{
-    va_list ap;
-    char istr[MESSLENGTH*MESSLINES] = "FATAL ERROR: ";
-    char vstr[1000];
-
-
-    va_start(ap,msg);
-    /* bounded formatting: vsprintf() could write past vstr */
-    vsnprintf(vstr,sizeof vstr,msg,ap);
-    va_end(ap);
-    strncat(istr,vstr,MESSLENGTH*MESSLINES-20);
-    Message(MSG_FATAL,istr);
-}
-
-/*
-* error()
-*
-* Formats msg and its variadic arguments and prefixes the result with
-* "ERROR: ".  When `interactive` is not set the text is written to stdout
-* (no newline is appended); otherwise it is passed to Message() with
-* MSG_ERROR.
-*
-* The formatted text is limited to the size of the local buffer; longer
-* output is truncated.
-*
-* Return values:
-* none
-*/
-
-void error( char *msg,...)
-{
-    va_list ap;
-    char istr[MESSLENGTH*MESSLINES] = "ERROR: ";
-    char vstr[1000];
-
-
-    va_start(ap,msg);
-    /* bounded formatting: vsprintf() could write past vstr */
-    vsnprintf(vstr,sizeof vstr,msg,ap);
-    va_end(ap);
-    strncat(istr,vstr,MESSLENGTH*MESSLINES-10);
-    if (!interactive)
-        fprintf(stdout,"%s",istr);
-    else
-        Message(MSG_ERROR,istr);
-}
-
-/*
-* warning()
-*
-* Formats msg and its variadic arguments and prefixes the result with
-* "WARNING: ".  When `interactive` is not set the text is written to
-* stdout (no newline is appended); otherwise it is passed to Message()
-* with MSG_ERROR.
-*
-* The formatted text is limited to the size of the local buffer; longer
-* output is truncated.
-*
-* Return values:
-* none
-*/
-
-void warning( char *msg,...)
-{
-    va_list ap;
-    char istr[MESSLENGTH*MESSLINES] = "WARNING: ";
-    char vstr[1000];
-
-    va_start(ap,msg);
-    /* bounded formatting: vsprintf() could write past vstr */
-    vsnprintf(vstr,sizeof vstr,msg,ap);
-    va_end(ap);
-    strncat(istr,vstr,MESSLENGTH*MESSLINES-10);
-    if (!interactive)
-        fprintf(stdout,"%s",istr);
-    else
-        Message(MSG_ERROR,istr);
-}
-
-/*
-* info()
-*
-* Does nothing unless mess_output is set.  Otherwise formats msg and its
-* variadic arguments (at most MESSLENGTH characters are kept) and
-*  - when `interactive` is not set, prints the text plus a newline on stdout;
-*  - otherwise shows it as the title of the `message` prompt in the main
-*    window and, when save_log is set and save_log_fd is open, also appends
-*    it to the log file.
-*
-* Return values:
-* none
-*/
-
-void info( char *msg,...)
-{
-    va_list ap;
-    char istr[MESSLENGTH+10] = "";
-    char vstr[1000];
-
-    if (!mess_output) return;
-
-    va_start(ap,msg);
-    /* bounded formatting: vsprintf() could write past vstr */
-    vsnprintf(vstr,sizeof vstr,msg,ap);
-    va_end(ap);
-    strncat(istr,vstr,MESSLENGTH);
-    if (!interactive)
-        fprintf(stdout,"%s\n",istr);
-    else
-    {
-        UseWindow(mainw);
-        SelectFont(systemFont);
-        SetTitle(message,istr);
-        if(save_log && save_log_fd!=NULL)
-            fprintf(save_log_fd,"%s\n",istr);
-        Update();
-    }
-
-}
-
-
-/* Reads the text of t as an integer and stores it in helix_penalty when
-* it lies in 0..9; any other value is ignored. */
-void set_helix_gp(TexT t)
-{
-    char field[10];
-    int value;
-
-    GetTitle(t,field,10);
-    value = atoi(field);
-    if (value >= 0 && value <= 9)
-        helix_penalty = value;
-}
-
-/* Reads the text of t as an integer and stores it in strand_penalty when
-* it lies in 0..9; any other value is ignored. */
-void set_strand_gp(TexT t)
-{
-    char field[10];
-    int value;
-
-    GetTitle(t,field,10);
-    value = atoi(field);
-    if (value >= 0 && value <= 9)
-        strand_penalty = value;
-}
-
-/* Reads the text of t as an integer and stores it in loop_penalty when it
-* lies in 0..9; any other value is ignored. */
-void set_loop_gp(TexT t)
-{
-    char field[10];
-    int value;
-
-    GetTitle(t,field,10);
-    value = atoi(field);
-    if (value >= 0 && value <= 9)
-        loop_penalty = value;
-}
-
-/* Reads the text of t as an integer and stores it in helix_end_penalty
-* when it lies in 0..9; any other value is ignored. */
-void set_terminal_gp(TexT t)
-{
-    char field[10];
-    int value;
-
-    GetTitle(t,field,10);
-    value = atoi(field);
-    if (value >= 0 && value <= 9)
-        helix_end_penalty = value;
-}
-
-/* Reads the text of t as an integer and stores it in helix_end_minus when
-* it lies in 0..9; any other value is ignored. */
-void set_helix_minus(TexT t)
-{
-    char field[10];
-    int value;
-
-    GetTitle(t,field,10);
-    value = atoi(field);
-    if (value >= 0 && value <= 9)
-        helix_end_minus = value;
-}
-
-/* Reads the text of t as an integer and stores it in helix_end_plus when
-* it lies in 0..9; any other value is ignored. */
-void set_helix_plus(TexT t)
-{
-    char field[10];
-    int value;
-
-    GetTitle(t,field,10);
-    value = atoi(field);
-    if (value >= 0 && value <= 9)
-        helix_end_plus = value;
-}
-
-/* Reads the text of t as an integer and stores it in strand_end_plus when
-* it lies in 0..9; any other value is ignored. */
-void set_strand_plus(TexT t)
-{
-    char field[10];
-    int value;
-
-    GetTitle(t,field,10);
-    value = atoi(field);
-    if (value >= 0 && value <= 9)
-        strand_end_plus = value;
-}
-
-/* Reads the text of t as an integer and stores it in strand_end_minus
-* when it lies in 0..9; any other value is ignored. */
-void set_strand_minus(TexT t)
-{
-    char field[10];
-    int value;
-
-    GetTitle(t,field,10);
-    value = atoi(field);
-    if (value >= 0 && value <= 9)
-        strand_end_minus = value;
-}
-
-
-/* Copies the status of menu item i into `inverted` and rewrites entry 0 of
-* color_lut: name "BLACK" with r=g=b=0.4 when inverted is FALSE, name
-* "WHITE" with r=g=b=1.0 otherwise.  The entry's val is refreshed through
-* SelectColor()/GetColor(), then the sequence panel (MULTIPLEM mode) or
-* both profile panels are redrawn. */
-void set_inverted(IteM i)
-{
-    inverted=GetStatus(i);
-    if (inverted!=FALSE)
-    {
-        strcpy(color_lut[0].name,"WHITE");
-        color_lut[0].r=1.0;
-        color_lut[0].g=1.0;
-        color_lut[0].b=1.0;
-    }
-    else
-    {
-        strcpy(color_lut[0].name,"BLACK");
-        color_lut[0].r=0.4;
-        color_lut[0].g=0.4;
-        color_lut[0].b=0.4;
-    }
-    /* common to both cases: select the colour and remember its value */
-    SelectColor(color_lut[0].r*255, color_lut[0].g*255, color_lut[0].b*255);
-    color_lut[0].val=GetColor();
-
-    if(aln_mode!=MULTIPLEM)
-    {
-        DrawPanel(prf_panel[0].seqs);
-        DrawPanel(prf_panel[1].seqs);
-    }
-    else
-        DrawPanel(seq_panel.seqs);
-
-}
-
-/* Sets output_ss from the status of button b and recomputes
-* output_struct_penalties from output_ss/output_gp:
-* both -> 2, ss only -> 0, gp only -> 1, neither -> 3. */
-void set_ss_output(ButtoN b)
-{
-    output_ss = GetStatus(b) ? TRUE : FALSE;
-
-    if (output_ss)
-        output_struct_penalties = output_gp ? 2 : 0;
-    else
-        output_struct_penalties = output_gp ? 1 : 3;
-}
-/* Sets output_gp from the status of button b and recomputes
-* output_struct_penalties from output_ss/output_gp:
-* both -> 2, ss only -> 0, gp only -> 1, neither -> 3. */
-void set_gp_output(ButtoN b)
-{
-    output_gp = GetStatus(b) ? TRUE : FALSE;
-
-    if (output_ss)
-        output_struct_penalties = output_gp ? 2 : 0;
-    else
-        output_struct_penalties = output_gp ? 1 : 3;
-}
-
-void set_user_matrix(ButtoN but) /* loads a user matrix series; `but` is not used */
-{
- if(get_series_matrixname(usermtrxname,usermat,aa_xref,5,&matnum,mattext)) /* on success matnum becomes 5 */
- strcpy(mtrxname,usermtrxname);
- SetValue(matrix_list,matnum); /* done whether or not a matrix was loaded */
-}
-
-
-void set_pw_user_matrix(ButtoN but) /* loads a user pairwise matrix; `but` is not used */
-{
- if(get_user_matrixname(pw_usermtrxname,pw_usermat,pw_aa_xref,5,&pw_matnum,pwmattext)) /* on success pw_matnum becomes 5 */
- strcpy(pw_mtrxname,pw_usermtrxname);
- SetValue(pw_matrix_list,pw_matnum); /* done whether or not a matrix was loaded */
-}
-
-/* Applies the selection of group g to the pairwise matrix settings.
-* A value strictly between 0 and pw_matrix_menu.noptions picks the menu
-* entry of that number.  Otherwise the user matrix is used: if no user
-* matrix name is stored yet one is requested via get_user_matrixname(),
-* else pw_matnum is set to pw_matrix_menu.noptions.  Finally pw_matnum is
-* pushed back to pw_matrix_list. */
-void set_pw_matrix(GrouP g)
-{
-    int choice = GetValue(g);
-    int is_menu_entry = (choice>0 && choice<pw_matrix_menu.noptions);
-
-    if (!is_menu_entry)
-    {
-        if (pw_usermtrxname[0]!='\0')
-            pw_matnum=pw_matrix_menu.noptions;
-        else if (get_user_matrixname(pw_usermtrxname,pw_usermat,pw_aa_xref,pw_matrix_menu.noptions,&pw_matnum,pwmattext))
-            strcpy(pw_mtrxname,pw_usermtrxname);
-    }
-    else
-    {
-        pw_matnum = choice;
-        strcpy(pw_mtrxname,pw_matrix_menu.opt[choice-1].string);
-    }
-    SetValue(pw_matrix_list,pw_matnum);
-}
-/* Applies the selection of group g to the multiple-alignment matrix
-* settings.  A value strictly between 0 and matrix_menu.noptions picks the
-* menu entry of that number.  Otherwise the user matrix series is used: if
-* no user matrix name is stored yet one is requested via
-* get_series_matrixname(), else matnum is set to matrix_menu.noptions.
-* Finally matnum is pushed back to matrix_list. */
-void set_matrix(GrouP g)
-{
-    int tmp;
-
-    tmp = GetValue(g);
-    if (tmp>0 && tmp<matrix_menu.noptions)
-    {
-        matnum = tmp;
-        strcpy(mtrxname,matrix_menu.opt[tmp-1].string);
-    }
-    else if(usermtrxname[0]=='\0')
-    {
-        if(get_series_matrixname(usermtrxname,usermat,aa_xref,matrix_menu.noptions,&matnum,mattext))
-            strcpy(mtrxname,usermtrxname);
-    }
-    else matnum=matrix_menu.noptions;
-
-    SetValue(matrix_list,matnum);
-}
-
-/* Asks for an input file name and tries to load it with user_mat_series().
-* On success the file name is copied to usermtrxname, *matnum is set to
-* usermatnum, the name is shown in mattext and 1 is returned.  Returns 0
-* when no file name was obtained or loading failed; nothing is changed in
-* that case apart from the static file name buffer. */
-static int get_series_matrixname(char *usermtrxname, short *usermat,short *aa_xref,int usermatnum,int *matnum,PrompT mattext)
-{
-    static Char filename[FILENAMELEN];
-
-    if (!GetInputFileName(filename,FILENAMELEN,"",""))
-        return 0;
-    if (!user_mat_series(filename, usermat, aa_xref))
-        return 0;
-
-    strcpy(usermtrxname,filename);
-    *matnum=usermatnum;
-    SetTitle(mattext,usermtrxname);
-    return 1;
-}
-/* Asks for an input file name and tries to load it with user_mat().
-* On success the file name is copied to usermtrxname, *matnum is set to
-* usermatnum, the name is shown in mattext and 1 is returned.  Returns 0
-* when no file name was obtained or loading failed; nothing is changed in
-* that case apart from the static file name buffer. */
-int get_user_matrixname(char *usermtrxname, short *usermat,short *aa_xref,int usermatnum,int *matnum,PrompT mattext)
-{
-    static Char filename[FILENAMELEN];
-
-    if (!GetInputFileName(filename,FILENAMELEN,"",""))
-        return 0;
-    if (!user_mat(filename, usermat, aa_xref))
-        return 0;
-
-    strcpy(usermtrxname,filename);
-    *matnum=usermatnum;
-    SetTitle(mattext,usermtrxname);
-    return 1;
-}
-
-void set_user_dnamatrix(ButtoN but) /* loads a user DNA matrix; `but` is not used */
-{
- if(get_user_matrixname(dnausermtrxname,userdnamat,dna_xref,3,&dnamatnum,dnamattext)) /* on success dnamatnum becomes 3 */
- strcpy(dnamtrxname,dnausermtrxname);
- SetValue(dnamatrix_list,dnamatnum); /* done whether or not a matrix was loaded */
-}
-
-
-void set_pw_user_dnamatrix(ButtoN but) /* loads a user pairwise DNA matrix; `but` is not used */
-{
- if(get_user_matrixname(pw_dnausermtrxname,pw_userdnamat,pw_dna_xref,3,&pw_dnamatnum,pwdnamattext)) /* on success pw_dnamatnum becomes 3 */
- strcpy(pw_dnamtrxname,pw_dnausermtrxname);
- SetValue(pw_dnamatrix_list,pw_dnamatnum); /* done whether or not a matrix was loaded */
-}
-
-/* Applies the selection of group g to the pairwise DNA matrix settings.
-* A value strictly between 0 and dnamatrix_menu.noptions picks the menu
-* entry of that number.  Otherwise the user matrix is used: if no user
-* matrix name is stored yet one is requested via get_user_matrixname(),
-* else pw_dnamatnum is set to dnamatrix_menu.noptions.  Finally
-* pw_dnamatnum is pushed back to pw_dnamatrix_list. */
-void set_pw_dnamatrix(GrouP g)
-{
-    int choice = GetValue(g);
-    int is_menu_entry = (choice>0 && choice<dnamatrix_menu.noptions);
-
-    if (!is_menu_entry)
-    {
-        if (pw_dnausermtrxname[0]!='\0')
-            pw_dnamatnum=dnamatrix_menu.noptions;
-        else if (get_user_matrixname(pw_dnausermtrxname,pw_userdnamat,pw_dna_xref,dnamatrix_menu.noptions,&pw_dnamatnum,pwdnamattext))
-            strcpy(pw_dnamtrxname,pw_dnausermtrxname);
-    }
-    else
-    {
-        pw_dnamatnum = choice;
-        strcpy(pw_dnamtrxname,dnamatrix_menu.opt[choice-1].string);
-    }
-    SetValue(pw_dnamatrix_list,pw_dnamatnum);
-}
-/* Applies the selection of group g to the DNA matrix settings.
-* A value strictly between 0 and dnamatrix_menu.noptions picks the menu
-* entry of that number.  Otherwise the user matrix is used: if no user
-* matrix name is stored yet one is requested via get_user_matrixname(),
-* else dnamatnum is set to dnamatrix_menu.noptions.  Finally dnamatnum is
-* pushed back to dnamatrix_list. */
-void set_dnamatrix(GrouP g)
-{
-    int choice = GetValue(g);
-    int is_menu_entry = (choice>0 && choice<dnamatrix_menu.noptions);
-
-    if (!is_menu_entry)
-    {
-        if (dnausermtrxname[0]!='\0')
-            dnamatnum=dnamatrix_menu.noptions;
-        else if (get_user_matrixname(dnausermtrxname,userdnamat,dna_xref,dnamatrix_menu.noptions,&dnamatnum,dnamattext))
-            strcpy(dnamtrxname,dnausermtrxname);
-    }
-    else
-    {
-        dnamatnum = choice;
-        strcpy(dnamtrxname,dnamatrix_menu.opt[choice-1].string);
-    }
-    SetValue(dnamatrix_list,dnamatnum);
-}
-
-/* Opens file_name for reading and returns the stream.  An empty name or a
-* failing fopen() is reported through error() and yields NULL.  On VMS the
-* file is opened with the extra "rat=cr","rfm=var" arguments. */
-FILE * open_input_file(char *file_name)
-{
-    FILE * fp;
-
-    if (*file_name == EOS) {
-        error("Bad input file [%s]",file_name);
-        return NULL;
-    }
-
-#ifdef VMS
-    fp = fopen(file_name,"r","rat=cr","rfm=var");
-#else
-    fp = fopen(file_name,"r");
-#endif
-    if (fp == NULL)
-        error("Cannot open input file [%s]",file_name);
-
-    return fp;
-}
-
-
-void set_use_ss1(PopuP l) /* updates use_ss1 from popup l, then calls load_aln() for both profile panels */
-{
- set_toggle(l,&use_ss1);
- load_aln(prf_panel[0],0,profile1_nseqs-1,FALSE);
- load_aln(prf_panel[1],profile1_nseqs,nseqs-1,FALSE);
-}
-void set_use_ss2(PopuP l) /* updates use_ss2 from popup l, then calls load_aln() for both profile panels */
-{
- set_toggle(l,&use_ss2);
- load_aln(prf_panel[0],0,profile1_nseqs-1,FALSE);
- load_aln(prf_panel[1],profile1_nseqs,nseqs-1,FALSE);
-}
-
-
-void set_output_clustal(ButtoN l) /* output_clustal = TRUE when GetStatus(l) is TRUE, else FALSE (via set_button) */
-{
- set_button(l,&output_clustal);
-}
-void set_output_gde(ButtoN l) /* output_gde = TRUE when GetStatus(l) is TRUE, else FALSE (via set_button) */
-{
- set_button(l,&output_gde);
-}
-void set_output_nexus(ButtoN l) /* output_nexus = TRUE when GetStatus(l) is TRUE, else FALSE (via set_button) */
-{
- set_button(l,&output_nexus);
-}
-
-
-void set_output_fasta(ButtoN l) /* output_fasta = TRUE when GetStatus(l) is TRUE, else FALSE (via set_button) */
-{
- set_button(l,&output_fasta);
-}
-
-/* Applies the selection of group g to save_format and rewrites the file
-* name shown in savealntext so that it carries the matching extension:
-* 1 CLUSTAL ".aln", 2 PIR ".pir", 3 MSF ".msf", 4 PHYLIP ".phy",
-* 5 GDE ".gde", 6 NEXUS ".nxs", 7 FASTA ".fasta".  For any other value
-* save_format is left alone and only the old extension is removed.
-*
-* The name is shortened when necessary so that name + extension always fit
-* into the local buffer. */
-void set_format(GrouP g)
-{
-    int i;
-    char path[FILENAMELEN];
-    const char *ext = NULL;
-
-    get_path(seqname,path);
-    /* the field text replaces whatever get_path() stored in path */
-    GetTitle(savealntext, path,FILENAMELEN);
-
-    /* remove the current extension: cut at the last '.' in the string.
-     * NOTE(review): the scan does not stop at a directory separator, so a
-     * dot inside a directory name is treated as the extension start. */
-    for(i=strlen(path)-1;i>=0;i--)
-        if(path[i]=='.')
-        {
-            path[i]='\0';
-            break;
-        }
-
-    switch (GetValue(g))
-    {
-    case 1: save_format=CLUSTAL; ext=".aln";   break;
-    case 2: save_format=PIR;     ext=".pir";   break;
-    case 3: save_format=MSF;     ext=".msf";   break;
-    case 4: save_format=PHYLIP;  ext=".phy";   break;
-    case 5: save_format=GDE;     ext=".gde";   break;
-    case 6: save_format=NEXUS;   ext=".nxs";   break;
-    case 7: save_format=FASTA;   ext=".fasta"; break;
-    default: break;
-    }
-
-    if (ext != NULL)
-    {
-        /* longest base name that still leaves room for ext and the NUL */
-        size_t room = sizeof path - 1 - strlen(ext);
-
-        if (strlen(path) > room)
-            path[room] = '\0';
-        strcat(path,ext);
-    }
-
-    SetTitle(savealntext, path);
-}
-
-/* Flips residue_exceptions (FALSE becomes TRUE, anything else becomes
-* FALSE) and redraws the sequence panel in MULTIPLEM mode or both profile
-* panels otherwise.  The menu item argument is not used. */
-void set_residue_exceptions(IteM i)
-{
-    residue_exceptions = (residue_exceptions==FALSE) ? TRUE : FALSE;
-
-    if (aln_mode!=MULTIPLEM)
-    {
-        DrawPanel(prf_panel[0].seqs);
-        DrawPanel(prf_panel[1].seqs);
-    }
-    else
-        DrawPanel(seq_panel.seqs);
-}
-
-
-/* Updates quick_pairalign from popup l and shows the matching parameter
-* group: fast_para when quick_pairalign is set, slow_para otherwise.  The
-* other group is hidden first. */
-void set_fs_toggle(PopuP l)
-{
-    set_toggle(l,&quick_pairalign);
-    if (!quick_pairalign)
-    {
-        Hide(fast_para);
-        Show(slow_para);
-        return;
-    }
-    Hide(slow_para);
-    Show(fast_para);
-}
-
-/* Applies the selection of popup g as the new font size: font_size becomes
-* GetValue(g)-1, datafont is rebuilt as "courier,<av_font[font_size]>,m",
-* the visible panels are redrawn and the scroll bars corrected.
-*
-* A value below 1 would index av_font[] with a negative subscript, so such
-* a value is ignored and nothing is changed.
-* NOTE(review): the upper end of av_font[] is not visible here and is
-* therefore not checked.
-*
-* (A path for an entry 6, "choose a different font" through
-* VSeqMgrFontProc(), existed only as disabled code and is not active.) */
-void set_font_size(PopuP g)
-{
-    int tmp;
-    char font[30];
-
-    tmp = GetValue(g);
-    if (tmp < 1)
-        return;
-
-    font_size=tmp-1;
-    sprintf(font, "%s,%d,%c", "courier", av_font[font_size], 'm');
-    datafont=ParseFont(font);
-
-    if (aln_mode==MULTIPLEM)
-    {
-        DrawPanel(seq_panel.names);
-        DrawPanel(seq_panel.seqs);
-    }
-    else
-    {
-        DrawPanel(prf_panel[0].names);
-        DrawPanel(prf_panel[0].seqs);
-
-        DrawPanel(prf_panel[1].names);
-        DrawPanel(prf_panel[1].seqs);
-    }
-    correct_name_bars(FALSE);
-    correct_seq_bars(FALSE);
-}
-
-/* Updates fixed_prf_scroll from button l, then for each of the two
-* profile sequence panels sets the panel's lockoffset to its firstvcol
-* when fixed_prf_scroll is set and to 0 otherwise.  Finishes by calling
-* correct_seq_bars(FALSE). */
-void set_pscroll_mode(ButtoN l)
-{
-    panel_data data;
-    int p;
-
-    set_button(l,&fixed_prf_scroll);
-    for (p=0; p<2; p++)
-    {
-        GetPanelExtra(prf_panel[p].seqs,&data);
-        if(!fixed_prf_scroll)
-            data.lockoffset=0;
-        else
-            data.lockoffset=data.firstvcol;
-        SetPanelExtra(prf_panel[p].seqs,&data);
-    }
-    correct_seq_bars(FALSE);
-}
-
-/* Sets aln_mode to MULTIPLEM when GetValue(g) is 1 and to PROFILEM
-* otherwise, then calls switch_mode(). */
-void set_aln_mode(PopuP g)
-{
-    aln_mode = (GetValue(g) == 1) ? MULTIPLEM : PROFILEM;
-    switch_mode();
-}
-
-/* Flips segment_exceptions (FALSE becomes TRUE, anything else becomes
-* FALSE), calls calc_seg_exceptions(), copies the new state to the status
-* of segment_item and calls show_segment_exceptions().  The argument is not
-* used. */
-void set_show_segments(IteM l)
-{
-    segment_exceptions = (segment_exceptions==FALSE) ? TRUE : FALSE;
-    calc_seg_exceptions();
-    SetStatus(segment_item,segment_exceptions);
-    show_segment_exceptions();
-}
-
-
-void shift(Handle a, int dx, int dy) /* moves the "next position" of a by (dx,dy) */
-{
- PoinT pt;
-
- GetNextPosition (a, &pt); /* fetch the current next position of a */
- pt.x+=dx;
- pt.y+=dy;
- SetNextPosition(a, pt); /* store the shifted point back */
-}
-
-/* Removes every whitespace character (as classified by isspace()) from str
-* in place, leading, embedded and trailing alike.  An empty or
-* all-whitespace string becomes the empty string.
-*
-* The function does nothing when UNIX is not defined. */
-void stripspace(char *str)
-{
-    char *src, *dst;
-
-#ifndef UNIX
-    return;
-#endif
-    /* compact in place: dst never runs ahead of src, so no scratch copy
-     * is needed and the string is never read before its first byte */
-    for (src = dst = str; *src != EOS; src++)
-        if (!isspace((unsigned char)*src))   /* ctype needs unsigned char */
-            *dst++ = *src;
-    *dst = EOS;
-
-}
-
-/* extra code */
-
-/* Fills a font spec from tmpFont and passes it to ChooseFont(); if
-* ChooseFont() returns zero nothing else is done.  Otherwise a font is
-* created from the spec, remembered in tmpFont and selected; the spec is
-* written to fontbuf as a string and datafont is rebuilt from that
-* string.  Progress messages are printed on stdout. */
-static void VSeqMgrFontProc ()
-{
-    Nlm_FontSpec spec;
-
-    printf(" before getfontspec \n");
-    GetFontSpec(tmpFont, &spec);
-    printf(" done getfontspec \n");
-
-    if (!ChooseFont(&spec, CFF_READ_FSP, NULL))
-        return ;
-
-    tmpFont = CreateFont(&spec);
-    SelectFont(tmpFont);
-    FontSpecToStr(&spec, fontbuf, 80);
-    datafont = ParseFont(fontbuf);
-    printf(" font info \n ( %s ) \n", fontbuf);
-}
More information about the debian-med-commit
mailing list