[med-svn] [fasta3] 08/10: New upstream version 36.3.8f
Andreas Tille
tille at debian.org
Tue Dec 5 16:21:48 UTC 2017
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository fasta3.
commit 2ee98997730aa71d5298843512acf174ad04f950
Author: Andreas Tille <tille at debian.org>
Date: Tue Dec 5 17:20:06 2017 +0100
New upstream version 36.3.8f
---
COPYRIGHT | 19 +
FASTA_LIST | 17 +
LICENSE | 202 ++
README | 77 +
README.md | 77 +
bin/README | 1 +
conf/README | 33 +
conf/fast_libs_e.www | 23 +
conf/fast_new | 38 +
conf/fastlibs | 42 +
data/VTML_10.mat | 35 +
data/VTML_120.mat | 34 +
data/VTML_160.mat | 34 +
data/VTML_20.mat | 34 +
data/VTML_200.mat | 34 +
data/VTML_40.mat | 34 +
data/VTML_80.mat | 34 +
data/blosum45.mat | 30 +
data/blosum50.mat | 29 +
data/blosum62.mat | 30 +
data/blosum80.mat | 30 +
data/dna.mat | 19 +
data/idn_aa.mat | 24 +
data/md_10.mat | 24 +
data/md_20.mat | 24 +
data/md_40.mat | 24 +
data/pam120.mat | 34 +
data/pam250.mat | 34 +
data/rna.mat | 19 +
data/vtml160.mat | 38 +
debian/changelog | 5 -
debian/compat | 1 -
debian/control | 75 -
debian/copyright | 51 -
debian/docs | 5 -
debian/fasta3-doc.doc-base | 15 -
debian/fasta3.install | 1 -
debian/fasta3.manpages | 6 -
debian/patches/Makefile.patch | 47 -
debian/patches/series | 1 -
debian/rules | 35 -
debian/source/format | 1 -
debian/upstream/metadata | 20 -
debian/watch | 6 -
doc/INSTALL | 17 +
doc/README.versions | 48 +
doc/README_v36.3.8d.md | 37 +
doc/changes_v34.html | 351 +++
doc/changes_v35.html | 212 ++
doc/changes_v36.html | 467 ++++
doc/fasta.defaults | 17 +
doc/fasta.history.tex | 180 ++
doc/fasta.options | 55 +
doc/fasta36.1 | 467 ++++
doc/fasta_func.doc | 300 +++
doc/fasta_guide.bib | 265 +++
doc/fasta_guide.fg1.tex | 60 +
doc/fasta_guide.fg2.tex | 25 +
doc/fasta_guide.pdf | Bin 0 -> 265293 bytes
doc/fasta_guide.tex | 2115 ++++++++++++++++++
doc/fasta_versions.html | 101 +
doc/fastf3.1 | 176 ++
doc/fasts3.1 | 169 ++
doc/map_db.1 | 45 +
doc/prss3.1 | 170 ++
doc/ps_lav.1 | 20 +
doc/readme.v30 | 38 +
doc/readme.v30t6 | 74 +
doc/readme.v30t7 | 175 ++
doc/readme.v31t0 | 160 ++
doc/readme.v31t1 | 113 +
doc/readme.v32t0 | 407 ++++
doc/readme.v33t0 | 1268 +++++++++++
doc/readme.v34t0 | 1683 +++++++++++++++
doc/readme.v35 | 535 +++++
doc/readme.v36 | 2213 +++++++++++++++++++
doc/readme.w32 | 67 +
make/Makefile | 53 +
make/Makefile.NetBSD | 40 +
make/Makefile.cray_pvp | 41 +
make/Makefile.fcom | 344 +++
make/Makefile.freebsd | 72 +
make/Makefile.hpux_it | 56 +
make/Makefile.ibm | 37 +
make/Makefile.linux | 1 +
make/Makefile.linux32 | 63 +
make/Makefile.linux32_sse2 | 68 +
make/Makefile.linux64 | 1 +
make/Makefile.linux64_sse2 | 65 +
make/Makefile.linux_icc | 58 +
make/Makefile.linux_icc_sse2 | 55 +
make/Makefile.linux_mysql | 57 +
make/Makefile.linux_pgsql | 58 +
make/Makefile.linux_sql | 58 +
make/Makefile.linux_sse2 | 1 +
make/Makefile.mp_com2 | 116 +
make/Makefile.mpi_icc_sse2 | 55 +
make/Makefile.nm_fcom | 304 +++
make/Makefile.nm_pcom | 217 ++
make/Makefile.nmk_icl | 35 +
make/Makefile.os_x | 69 +
make/Makefile.os_x86 | 61 +
make/Makefile.os_x86_64 | 61 +
make/Makefile.os_x86_clang | 59 +
make/Makefile.os_x86_icc | 61 +
make/Makefile.pLinux | 83 +
make/Makefile.pLinux_sql | 81 +
make/Makefile.pcom | 229 ++
make/Makefile.pcom_s | 162 ++
make/Makefile.pcom_t | 184 ++
make/Makefile.sgi | 58 +
make/Makefile.sse_alt | 23 +
make/Makefile.sun | 52 +
make/Makefile.sun_x86 | 51 +
make/Makefile35.common | 45 +
make/Makefile35.common_sql | 50 +
make/Makefile35.nmk_com | 30 +
make/Makefile35m.common_mysql | 49 +
make/Makefile35m.common_pgsql | 49 +
make/Makefile35m.common_sql | 48 +
make/Makefile36.nmk_com | 30 +
make/Makefile36m.common | 51 +
make/Makefile36mpi.common | 43 +
make/Makefile36t.common | 43 +
make/README | 44 +
make/make_osx_univ.sh | 31 +
misc/README | 14 +
misc/parse_m9.pl | 139 ++
misc/res2R.pl | 22 +
misc/shuffle_embed.pl | 148 ++
psisearch2/README.md | 92 +
psisearch2/m89_btop_msa2.pl | 927 ++++++++
psisearch2/psisearch2_msa.pl | 453 ++++
psisearch2/psisearch2_msa.py | 368 ++++
scripts/README | 108 +
scripts/README.scripts | 84 +
scripts/acc_examples | 5 +
scripts/ann_exons_ens.pl | 287 +++
scripts/ann_exons_ncbi.pl | 243 +++
scripts/ann_exons_up_www.pl | 239 ++
scripts/ann_feats2ipr.pl | 526 +++++
scripts/ann_feats2ipr_e.pl | 544 +++++
scripts/ann_feats_up_sql.pl | 463 ++++
scripts/ann_feats_up_www2.pl | 455 ++++
scripts/ann_ipr_www.pl | 467 ++++
scripts/ann_pdb_cath.pl | 345 +++
scripts/ann_pdb_vast.pl | 320 +++
scripts/ann_pfam27.pl | 656 ++++++
scripts/ann_pfam28.pl | 782 +++++++
scripts/ann_pfam30.pl | 859 ++++++++
scripts/ann_pfam30_tmptbl.pl | 875 ++++++++
scripts/ann_pfam_www.pl | 687 ++++++
scripts/ann_script_list | 9 +
scripts/ann_upfeats_pfam_www_e.pl | 801 +++++++
scripts/annot_blast_btop2.pl | 1306 +++++++++++
scripts/blastp_cmd.sh | 31 +
scripts/color_defs.pl | 170 ++
scripts/exp_up_ensg.pl | 145 ++
scripts/expand_links.pl | 100 +
scripts/expand_uniref50.pl | 83 +
scripts/lav2plt.pl | 349 +++
scripts/lavplt_ps.pl | 540 +++++
scripts/lavplt_svg.pl | 461 ++++
scripts/links2sql.pl | 61 +
scripts/m8_btop_msa.pl | 412 ++++
scripts/m9B_btop_msa.pl | 654 ++++++
scripts/plot_domain2t.cgi | 667 ++++++
scripts/summ_domain_ident.pl | 97 +
scripts/test_ann_scripts.sh | 30 +
seq/bovgh.seq | 38 +
seq/bovprl.seq | 17 +
seq/dna_test_s.nlib | 47 +
seq/dyr_human.aa | 4 +
seq/egmsmg.aa | 19 +
seq/grou_drome.pseg | 14 +
seq/gst.nlib | 284 +++
seq/gst.seq | 20 +
seq/gstm1_human.vaa | 2 +
seq/gstm1b_human.nt | 17 +
seq/gstm1b_human_fs.nt | 17 +
seq/gstt1_drome.aa | 4 +
seq/gstt1_pssm.asn1 | Bin 0 -> 56931 bytes
seq/gtm1_human.aa | 4 +
seq/gtt1_drome.aa | 4 +
seq/h10_human.aa | 4 +
seq/hahu.aa | 4 +
seq/hsgstm1b.gcg | 214 ++
seq/hsgstm1b.seq | 40 +
seq/humgstd.seq | 20 +
seq/lcbo.aa | 5 +
seq/m1r.aa | 5 +
seq/m2.aa | 5 +
seq/mchu.aa | 3 +
seq/mgstm1.3nt | 60 +
seq/mgstm1.aa | 5 +
seq/mgstm1.aaa | 8 +
seq/mgstm1.e05 | 20 +
seq/mgstm1.eeq | 20 +
seq/mgstm1.esq | 20 +
seq/mgstm1.gcg | 13 +
seq/mgstm1.lc | 8 +
seq/mgstm1.nt | 12 +
seq/mgstm1.nt1 | 12 +
seq/mgstm1.nt12r | 26 +
seq/mgstm1.nt13 | 36 +
seq/mgstm1.nt13r | 35 +
seq/mgstm1.nt1r | 13 +
seq/mgstm1.nts | 9 +
seq/mgstm1.raa | 5 +
seq/mgstm1.rev | 16 +
seq/mgstm1.seq | 20 +
seq/mgstm1_genclone.seq | 2088 ++++++++++++++++++
seq/mgtt2_x.seq | 12 +
seq/ms1.aa | 6 +
seq/mu.lib | 50 +
seq/musplfm.aa | 9 +
seq/mwkw.aa | 31 +
seq/mwrtc1.aa | 8 +
seq/myosin_bp.aa | 20 +
seq/n0.aa | 4 +
seq/n1.aa | 5 +
seq/n2.aa | 28 +
seq/n2_fs.lib | 84 +
seq/n2s.aa | 8 +
seq/n2t.aa | 16 +
seq/n_fs.lib | 20 +
seq/ngt.aa | 20 +
seq/ngts.aa | 7 +
seq/oohu.aa | 6 +
seq/oohu.raa | 7 +
seq/prio_atepa.aa | 5 +
seq/prot_test.lib | 51 +
seq/prot_test.lseg | 66 +
seq/prot_test_s.lseg | 25 +
seq/qrhuld.aa | 15 +
seq/titin_hum.aa | 431 ++++
seq/titin_hum.seq | 1174 ++++++++++
seq/xurt8c.aa | 5 +
seq/xurt8c.lc | 5 +
seq/xurtg.aa | 5 +
sql/README | 26 +
sql/create_seq_demo.sql | 30 +
sql/join_up50.pl | 99 +
sql/mysql_demo1.sql | 6 +
sql/mysql_demo_pv.sql | 6 +
sql/nr_to_sql.pl | 103 +
sql/pirpsd.sql | 8 +
sql/psql_demo.sql | 7 +
sql/psql_demo1.sql | 6 +
sql/psql_demo_pv.sql | 7 +
src/a_mark.h | 51 +
src/aamap.h | 17 +
src/ag_stats.c | 129 ++
src/aln_structs.h | 61 +
src/alt_parms.h | 399 ++++
src/altlib.h | 142 ++
src/apam.c | 502 +++++
src/best_stats.h | 52 +
src/build_ares.c | 248 +++
src/c_dispn.c | 573 +++++
src/cal_cons.c | 1226 +++++++++++
src/cal_cons2.c | 1164 ++++++++++
src/cal_consf.c | 591 +++++
src/comp_lib9.c | 3052 ++++++++++++++++++++++++++
src/compacc2.c | 4119 +++++++++++++++++++++++++++++++++++
src/compacc2e.c | 4316 +++++++++++++++++++++++++++++++++++++
src/dec_pthr_subs.c | 246 +++
src/dec_pthr_subs.h | 42 +
src/defs.h | 171 ++
src/doinit.c | 975 +++++++++
src/drop_func.h | 185 ++
src/dropff2.c | 1394 ++++++++++++
src/dropfs2.c | 1681 +++++++++++++++
src/dropfx.c | 4072 ++++++++++++++++++++++++++++++++++
src/dropfx2.c | 3892 +++++++++++++++++++++++++++++++++
src/dropfz2.c | 3969 ++++++++++++++++++++++++++++++++++
src/dropfz3.c | 3864 +++++++++++++++++++++++++++++++++
src/dropgsw2.c | 1128 ++++++++++
src/dropgsw2.h | 46 +
src/dropnfa.c | 2250 +++++++++++++++++++
src/dropnfa.h | 84 +
src/dropnnw2.c | 900 ++++++++
src/dropnsw.c | 424 ++++
src/dyn_string.h | 30 +
src/faatran.c | 445 ++++
src/getenv.c | 56 +
src/getopt.c | 64 +
src/getseq.c | 313 +++
src/global_sse2.c | 547 +++++
src/global_sse2.h | 41 +
src/glocal_sse2.c | 596 +++++
src/glocal_sse2.h | 41 +
src/h_altlib.h | 28 +
src/htime.c | 43 +
src/initfa.c | 3183 +++++++++++++++++++++++++++
src/karlin.c | 519 +++++
src/last_tat.c | 155 ++
src/last_thresh.c | 62 +
src/lav_defs.h | 44 +
src/lib_sel.c | 341 +++
src/list_db.c | 245 +++
src/llgetaa.c | 497 +++++
src/lsim4.c | 998 +++++++++
src/lsim4.h | 145 ++
src/map_db.c | 600 ++++++
src/mm_file.h | 153 ++
src/mmgetaa.c | 1116 ++++++++++
src/mrandom.c | 97 +
src/msg.h | 57 +
src/mshowalign2.c | 999 +++++++++
src/mshowbest.c | 665 ++++++
src/mw.h | 49 +
src/mysql_lib.c | 636 ++++++
src/ncbl2_head.h | 35 +
src/ncbl2_mlib.c | 2442 +++++++++++++++++++++
src/ncbl_head.h | 33 +
src/ncbl_lib.c | 491 +++++
src/nmgetlib.c | 2254 +++++++++++++++++++
src/param.h | 251 +++
src/pcomp_bufs.h | 33 +
src/pcomp_subs2.c | 686 ++++++
src/pgsql_lib.c | 631 ++++++
src/print_pssm.c | 793 +++++++
src/pssm_asn_subs.c | 1756 +++++++++++++++
src/pthr_subs.h | 49 +
src/pthr_subs2.c | 377 ++++
src/randtest.c | 38 +
src/re_getlib.c | 146 ++
src/res_stats.c | 703 ++++++
src/rstruct.h | 16 +
src/sc_to_e.c | 71 +
src/scaleswn.c | 3136 +++++++++++++++++++++++++++
src/scaleswt.c | 1566 ++++++++++++++
src/showrss.c | 82 +
src/smith_waterman_altivec.c | 3086 ++++++++++++++++++++++++++
src/smith_waterman_altivec.h | 26 +
src/smith_waterman_sse2.c | 432 ++++
src/smith_waterman_sse2.h | 43 +
src/structs.h | 196 ++
src/tatstats.c | 583 +++++
src/tatstats.h | 160 ++
src/thr_buf_structs.h | 119 +
src/thr_bufs2.h | 41 +
src/uascii.h | 59 +
src/upam.h | 872 ++++++++
src/url_subs.c | 383 ++++
src/uthr_subs.h | 52 +
src/wm_align.c | 581 +++++
src/work_thr2.c | 492 +++++
test/results/README | 1 +
test/test.bat | 73 +
test/test.sh | 79 +
test/test2.sh | 53 +
test/test2G.sh | 79 +
test/test2V.sh | 28 +
test/test_mpi.pbs | 59 +
test/test_mpi1.pbs | 27 +
test/test_mpi2.pbs | 33 +
test/test_s.sh | 47 +
test/test_z.sh | 22 +
360 files changed, 116366 insertions(+), 269 deletions(-)
diff --git a/COPYRIGHT b/COPYRIGHT
new file mode 100644
index 0000000..0c5a7b4
--- /dev/null
+++ b/COPYRIGHT
@@ -0,0 +1,19 @@
+ Copyright (c) 1996, 1997, 1998, 1999, 2002, 2014, 2015 by William R. Pearson
+ and The Rector & Visitors of the University of Virginia
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+
+ Code in the smith_waterman_sse2.c and smith_waterman_sse2.h files
+ is copyright (c) 2006 by Michael Farrar. Code in the
+ global_sse2.c, global_sse2.h, glocal_sse2.c, and glocal_sse2.h
+ files is copyright (c) 2010 by Michael Farrar.
diff --git a/FASTA_LIST b/FASTA_LIST
new file mode 100644
index 0000000..a7b6bf7
--- /dev/null
+++ b/FASTA_LIST
@@ -0,0 +1,17 @@
+
+4 Aug 2010
+
+If you regularly install the latest version of the FASTA package from
+http://faculty.virginia.edu/wrpearson/fasta, you may want to join the
+fasta_list SYMPA mailing list. I use this list to announce new
+releases and solicit bug reports.
+
+To join the mailing list, go to the WWW page at:
+
+ lists.virginia.edu/sympa/info/fasta_list
+
+Select the "Subscribe" option on the lower left, and at the linked
+page, enter your email address, and click "submit". You will be asked
+to confirm your membership in the mailing list.
+
+Bill Pearson
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..e06d208
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,202 @@
+Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "{}"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright {yyyy} {name of copyright owner}
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
diff --git a/README b/README
new file mode 100644
index 0000000..950ae70
--- /dev/null
+++ b/README
@@ -0,0 +1,77 @@
+
+July, 2015
+
+This version of the FASTA programs is fasta-36.3.8. Since March, 2011
+(fasta-36.3.4), the FASTA programs are no longer interactive. Typing
+bin/fasta36 (or any of the other programs) provides a help message.
+The "classic" interactive mode is available by typing "fasta36 -I".
+In addition, there is only one version of the programs, "fasta36",
+"ssearch36", etc., which is threaded by default on Unix/Linux/MacOSX.
+
+As of November, 2014, the FASTA program code is available under the
+Apache 2.0 open source license.
+
+Up to date release notes are available in the file doc/readme.v36
+
+Documentation on the fasta3 version programs is available in the files:
+
+ doc/fasta36.1 (unix man page)
+
+ doc/changes_v36.html (short descriptions of enhancements to
+ FASTA programs)
+
+ doc/readme.v36 (text descriptions of bug fixes and version history)
+
+ doc/fasta_guide.tex (Latex file which describes fasta-36,
+ and provides an introduction to the FASTA programs,
+ their use and installation.)
+
+ doc/fasta_guide.pdf (printable/viewable description of fasta-36)
+
+The latter two files provide background information on installing the
+fasta programs (in particular, the FASTLIBS file), that new users of
+the fasta3 package may find useful.
+
+================================================================
+
+The FASTA distribution directory (this directory) has been
+substantially re-organized to make it easier to find things. However,
+some documentation has not yet been completely updated to reflect the
+re-organization, so some things may not make sense.
+
+Files can now be found in several sub-directories
+ bin/ (pre-compiled binaries for some architectures)
+ conf/ example fastlibs files
+ data/ scoring matrices
+ doc/ documentation files
+ make/ make files
+ misc/ perl scripts to reformat -m 9 output, convert -R search.res files for 'R', and embed domains in shuffled sequences
+ scripts/ perl scripts for -V (annotate alignments) and -E (expand library) options
+ seq/ test sequences
+ src/ source code
+ sql/ sql files and scripts for using the sql database access
+ test/ test scripts
+
+For some binary distributions, only the doc/, data/, seq/, and bin/
+directories are provided.
+
+================
+
+To make the standard FASTA programs:
+
+ cd src
+ make -f ../make/Makefile.linux_sse2 all
+
+where "../make/Makefile.linux_sse2" is the appropriate file for your system.
+
+The executable programs will then be found in ../bin
+(e.g. ../bin/fasta36, etc.)
+
+For a simple test of a program, try (from the src directory)
+
+ ../bin/fasta36 -q ../seq/mgstm1.aa ../seq/prot_test.lseg
+
+================================================================
+
+Bill Pearson
+wrp at virginia.edu
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..30b64a3
--- /dev/null
+++ b/README.md
@@ -0,0 +1,77 @@
+
+## The FASTA package - protein and DNA sequence similarity searching and alignment programs
+
+The **FASTA** (pronounced FAST-Aye, not FAST-Ah) programs are a
+comprehensive set of similarity searching and alignment programs for
+searching protein and DNA sequence databases. Like the **BLAST** programs `blastp` and `blastn`, the `fasta` program itself uses a rapid heuristic strategy for finding similar regions in protein and DNA sequences. But in
+addition to heuristic similarity searching, the FASTA package provides
+programs for rigorous local (`ssearch`) and global (`ggsearch`)
+similarity searching, as well as a program for finding non-overlapping
+sequence similarities (`lalign`). Like BLAST, the FASTA package also
+includes programs for aligning translated DNA sequences against
+proteins (`fastx`, `fasty` are equivalent to `blastx`, `tfastx`,
+`tfasty` are similar to `tblastn`).
+
+#### September, 2016
+
+The current FASTA version is fasta-36.3.8e.
+
+The fasta-36.3.8e version includes a new directory, `psisearch2`, with
+scripts to run iterative PSSM (PSI-BLAST or SSEARCH36) searches using
+an improved strategy for reducing PSSM contamination due to alignment
+over-extension.
+
+As of November, 2014, the FASTA program code is available under the
+Apache 2.0 open source license.
+
+Up-to-date release notes are available in the file `doc/readme.v36`.
+
+Documentation on the FASTA programs is available in the files:
+
+dir/file | description
+----------|------------
+`doc/fasta36.1` | (unix man page)
+`doc/changes_v36.html` | (short descriptions of enhancements to FASTA programs)
+`doc/readme.v36` | (text descriptions of bug fixes and version history)
+`doc/fasta_guide.tex` | (Latex file which describes fasta36, and provides an introduction to the FASTA programs, their use and installation.)
+`doc/fasta_guide.pdf` | (printable/viewable description of fasta-36)
+
+`fasta_guide.pdf` provides background information on installing the
+fasta programs (in particular, the `FASTLIBS` file), that new users of
+the fasta3 package may find useful.
+
+Parts of the FASTA package are distributed across several sub-directories
+
+dir | description
+----|------------
+`bin/` | (pre-compiled binaries for some architectures)
+`conf/` | example `FASTLIBS` files (files for finding libraries)
+`data/` | scoring matrices
+`doc/` | documentation files
+`make/` | make files
+`misc/` | perl scripts to reformat -m 9 output, convert -R search.res files for 'R', and embed domains in shuffled sequences
+`psisearch2/` | perl/python scripts implementing the new `psisearch2_msa` iterative PSSM search
+`scripts/` | perl scripts for -V (annotate alignments) and -E (expand library) options
+`seq/` | test sequences
+`src/` | source code
+`sql/` | sql files and scripts for using the sql database access
+`test/` | test scripts
+
+For some binary distributions, only the `doc/`, `data/`, `seq/`, and `bin/`
+directories are provided.
+
+To make the standard FASTA programs:
+```
+ cd src
+ make -f ../make/Makefile.linux_sse2 all
+```
+where `../make/Makefile.linux_sse2` is the appropriate Makefile for your system.
+
+The executable programs will then be found in `../bin`
+(e.g. `../bin/fasta36`, etc.)
+
+For a simple test of a program, try (from the src directory)
+```
+ ../bin/fasta36 -q ../seq/mgstm1.aa ../seq/prot_test.lseg
+```
+
diff --git a/bin/README b/bin/README
new file mode 100644
index 0000000..da233d3
--- /dev/null
+++ b/bin/README
@@ -0,0 +1 @@
+Placeholder file to create destination for program binaries.
diff --git a/conf/README b/conf/README
new file mode 100644
index 0000000..5758016
--- /dev/null
+++ b/conf/README
@@ -0,0 +1,33 @@
+
+22-Jan-2014
+
+fasta36/conf
+
+================
+
+Files that allow FASTA programs to find libraries using abbreviations.
+
+For example, if the fast_libs_e.www file has the line:
+
+Swissprot (NCBI)$0Q${SLIB2}/fa_dbs/swissprot.lseg
+
+and export SLIB2=/slib2
+
+then:
+
+fasta36 ../seq/mgstm1.aa q
+
+is equivalent to:
+
+fasta36 ../seq/mgstm1.aa /slib2/fa_dbs/swissprot.lseg
+
+================
+
+fastlibs -- the original library abbreviation file
+
+fast_new -- allows abbreviations longer than one letter by using "+abbrev+"
+ NBRF PIR1 Annotated Protein Database (rel 56)$0+pir1+/slib2/fa_dbs/pir1.lseg
+
+fast_libs_e.www -- use environment variables in library file name
+
+(+long+ abbreviations and ${SLIB2} environment variables can be combined)
diff --git a/conf/fast_libs_e.www b/conf/fast_libs_e.www
new file mode 100644
index 0000000..d615eaa
--- /dev/null
+++ b/conf/fast_libs_e.www
@@ -0,0 +1,23 @@
+PIR1 Annotated (rel. 66) $0A${SLIB2}/fa_dbs/pir1.lseg
+Swissprot (NCBI)$0Q${SLIB2}/fa_dbs/swissprot.lseg
+NCBI Refseq NP only$0P${SLIB2}/fa_dbs/refseq_np.lseg
+NCBI Refseq proteins$0S${SLIB2}/fa_dbs/refseq_protein.lseg
+NCBI PDB structures$0D${SLIB2}/fa_dbs/pdbaa.lseg
+NCBI NR non-redundant$0N${SLIB2}/fa_dbs/nr.lseg
+Human/Refseq proteins$0H${SLIB2}/genomes/hum_refseq.lseg
+Mouse/Refseq proteins$0M${SLIB2}/genomes/mus_refseq.lseg
+Rat/Refseq proteins$0R${SLIB2}/genomes/rat_refseq.lseg
+Drosophila/RefSeq proteins$0F${SLIB2}/genomes/d_melanogaster.lseg
+C. elegans/RefSeq proteins$0W${SLIB2}/genomes/c_elegans.lseg
+Arabidopsis/RefSeq proteins$0L${SLIB2}/genomes/a_thaliana.lseg
+Yeast (S. cerevisiae)$0Y${SLIB2}/genomes/s_cerevisiae.lseg
+E. coli proteins$0E${SLIB2}/genomes/ecoli_k12.lseg
+GB170.0 Primate$1P@${RDLIB2}/gb_asn/gbpri.nam
+GB170.0 Rodent$1R@${RDLIB2}/gb_asn/gbrod.nam
+GB170.0 other Mammal$1M@${RDLIB2}/gb_asn/gbmam.nam
+GB170.0 verteBrates$1B@${RDLIB2}/gb_asn/gbvrt.nam
+GB170.0 Invertebrates$1I@${RDLIB2}/gb_asn/gbinv.nam
+GB170.0 Bacteria$1T@${RDLIB2}/gb_asn/gbbct.nam
+GB170.0 pLants$1L@${RDLIB2}/gb_asn/gbpln.nam
+GB171.0 Viral$1V@${RDLIB2}/gb_asn/gbvrl.nam
+GB171.0 Phage$1G@${RDLIB2}/gb_asn/gbphg.nam
diff --git a/conf/fast_new b/conf/fast_new
new file mode 100644
index 0000000..f5871a8
--- /dev/null
+++ b/conf/fast_new
@@ -0,0 +1,38 @@
+NBRF PIR1 Annotated Protein Database (rel 56)$0+pir1+/slib2/fa_dbs/pir1.lseg
+NBRF Protein database (complete)$0+nbrf+@/seqlib/lib/NBRF.nam
+NRL_3d structure database$0D/seqlib/lib/nrl_3d.seq 5
+NCBI/Blast non-redundant proteins$0+nr+/slib2/fa_dbs/nr.lseg
+NCBI/Blast Swissprot$0+sp+/slib2/fa_dbs/swissprot.lseg
+GENPEPT Translated Protein Database (rel 106.0)$0G/slib2/fa_dbs/genpept.fsa
+Swiss-Prot Release 34$0S/slib0/lib/swiss.seq 5
+Yeast proteins$0Y/slib0/genomes/yeast_nr.pep
+C. elegans blast server$0W/slib2/fa_dbs/C.elegans_blast.fa
+E. coli proteome$0E/slib0/genomes/ecoli.npep
+H. influenzae proteome$0I/slib0/genomes/hinf.npep
+H. pylori proteome$0L/slib0/genomes/hpyl.npep
+NCBI Entrez Human proteins$0H/slib2/fa_dbs/human.aa
+M. pneumococcus proteome$0M/slib0/genomes/mpneu.npep
+M. jannaschii proteome$0J/slib0/genomes/mjan.npep
+Synechocystis proteome$0C/slib0/genomes/synecho.npep
+GB108.0 Invertebrates$1I/seqlib2/gcggenbank/gb_in.seq 6
+GB108.0 Bacteria$1T@/slib0/lib/gb_ba.nam 6
+GB108.0 Primate$1P@/slib0/lib/gb_pri.nam
+GB108.0 Rodent$1R/seqlib2/gcggenbank/gb_ro.seq 6
+GB108.0 other Mammal$1M/seqlib2/gcggenbank/gb_om.seq 6
+GB108.0 verteBrates$1B/seqlib2/gcggenbank/gb_ov.seq 6
+GB108.0 Expressed Seq. Tags$1E@/slib0/lib/gb_est.nam
+GB108.0 High throughput genomic$1h/seqlib2/gcggenbank/gb_htg.seq 6
+GB108.0 pLants$1L@/slib0/lib/gb_pl.nam 6
+GB108.0 genome Survey sequences$1S@/slib0/lib/gb_gss.nam 6
+GB108.0 Viral$1V/seqlib2/gcggenbank/gb_vi.seq 6
+GB108.0 Phage$1G/seqlib2/gcggenbank/gb_ph.seq 6
+GB108.0 Unannotated$1D/seqlib2/gcggenbank/gb_un.seq 6
+GB108.0 New$1u/seqlib2/gcggenbank/gb_new.seq 6
+GB108.0 All sequences (long)$1A@/slib0/lib/genbank.nam
+Yeast genome$1Y@/seqlib/yeast/yeast_chr.nam
+E. coli genome$1D/slib0/genomes/ecoli.gbk 1
+Blast Human ESTs$1F/slib2/fa_dbs/est_human
+TIGR Human Gene Index$1K/slib2/fa_dbs/HGI.nr.031898
+Blast Mouse ESTs$1C/slib2/fa_dbs/est_mouse
+TIGR Mouse Gene Index$1J/slib2/fa_dbs/MGI.nr.022498
+NCBI/BLAST NR DNA$1n/slib2/fa_dbs/nt
diff --git a/conf/fastlibs b/conf/fastlibs
new file mode 100644
index 0000000..6b94a66
--- /dev/null
+++ b/conf/fastlibs
@@ -0,0 +1,42 @@
+NBRF PIR1 Annotated Protein Database (rel 56)$0A/seqlib/lib/pir1.seq 5
+NBRF PIR1 Annotated (seg) (rel 56)$0B/slib2/fa_dbs/pir1.seg
+NBRF Protein database (complete)$0P@/seqlib/lib/NBRF.nam
+NRL_3d structure database$0D/seqlib/lib/nrl_3d.seq 5
+NCBI/Blast non-redundant proteins$0N/slib2/fa_dbs/nr
+NCBI/Blast non-redundant proteins (seg)$0K/slib2/fa_dbs/nr.seg
+NCBI/Blast Swissprot$0Q/slib2/fa_dbs/swissprot
+NCBI/Blast Swissprot (seg)$0R/slib2/fa_dbs/swissprot.seg
+OWL 30.1 non-redundant protein database$0O/slib2/OWL/owl.seq 5
+GENPEPT Translated Protein Database (rel 106.0)$0G/slib2/fa_dbs/genpept.fsa
+Swiss-Prot Release 34$0S/slib0/lib/swiss.seq 5
+Yeast proteins$0Y/slib0/genomes/yeast_nr.pep
+C. elegans blast server$0W/slib2/fa_dbs/C.elegans_blast.fa
+E. coli proteome$0E/slib0/genomes/ecoli.npep
+H. influenzae proteome$0I/slib0/genomes/hinf.npep
+H. pylori proteome$0L/slib0/genomes/hpyl.npep
+NCBI Entrez Human proteins$0H/slib2/fa_dbs/human.aa
+M. pneumococcus proteome$0M/slib0/genomes/mpneu.npep
+M. jannaschii proteome$0J/slib0/genomes/mjan.npep
+Synechocystis proteome$0C/slib0/genomes/synecho.npep
+GB108.0 Invertebrates$1I/seqlib2/gcggenbank/gb_in.seq 6
+GB108.0 Bacteria$1T@/slib0/lib/gb_ba.nam 6
+GB108.0 Primate$1P@/slib0/lib/gb_pri.nam
+GB108.0 Rodent$1R/seqlib2/gcggenbank/gb_ro.seq 6
+GB108.0 other Mammal$1M/seqlib2/gcggenbank/gb_om.seq 6
+GB108.0 verteBrates$1B/seqlib2/gcggenbank/gb_ov.seq 6
+GB108.0 Expressed Seq. Tags$1E@/slib0/lib/gb_est.nam
+GB108.0 High throughput genomic$1h/seqlib2/gcggenbank/gb_htg.seq 6
+GB108.0 pLants$1L@/slib0/lib/gb_pl.nam 6
+GB108.0 genome Survey sequences$1S@/slib0/lib/gb_gss.nam 6
+GB108.0 Viral$1V/seqlib2/gcggenbank/gb_vi.seq 6
+GB108.0 Phage$1G/seqlib2/gcggenbank/gb_ph.seq 6
+GB108.0 Unannotated$1D/seqlib2/gcggenbank/gb_un.seq 6
+GB108.0 New$1u/seqlib2/gcggenbank/gb_new.seq 6
+GB108.0 All sequences (long)$1A@/slib0/lib/genbank.nam
+Yeast genome$1Y@/seqlib/yeast/yeast_chr.nam
+E. coli genome$1D/slib0/genomes/ecoli.gbk 1
+Blast Human ESTs$1F/slib2/fa_dbs/est_human
+TIGR Human Gene Index$1K/slib2/fa_dbs/HGI.nr.031898
+Blast Mouse ESTs$1C/slib2/fa_dbs/est_mouse
+TIGR Mouse Gene Index$1J/slib2/fa_dbs/MGI.nr.022498
+NCBI/BLAST NR DNA$1n/slib2/fa_dbs/nt
diff --git a/data/VTML_10.mat b/data/VTML_10.mat
new file mode 100644
index 0000000..9646f64
--- /dev/null
+++ b/data/VTML_10.mat
@@ -0,0 +1,35 @@
+#
+# VTML_10
+#
+# This matrix was produced from: vtml_10qij.mat using vtml_P.mat background frequencies
+#
+# VTML_10 substitution matrix, Units = bits/2.0
+# Expected score = -3.896435 bits; Entropy = 3.467957 bits
+# Target fraction identity = 0.9105
+# Lowest Score = -20, Highest Score= 12
+#
+ A R N D C Q E G H I L K M F P S T W Y V B Z X *
+A 7 -8 -8 -8 -5 -7 -7 -6 -9 -9 -9 -8 -7 -10 -6 -4 -5 -11 -10 -5 -8 -7 0 -7
+R -8 8 -7 -16 -9 -4 -14 -9 -5 -10 -10 -2 -8 -12 -9 -8 -8 -10 -9 -11 -11 -9 0 -7
+N -8 -7 9 -3 -10 -5 -7 -7 -4 -11 -11 -5 -9 -12 -10 -4 -5 -12 -8 -11 3 -6 0 -7
+D -8 -16 -3 8 -18 -6 -3 -8 -6 -15 -19 -7 -11 -20 -8 -7 -8 -12 -17 -11 2 -4 0 -7
+C -5 -9 -10 -18 12 -17 -18 -9 -8 -7 -16 -17 -6 -17 -11 -5 -7 -19 -6 -5 -14 -17 0 -7
+Q -7 -4 -5 -6 -17 9 -3 -10 -3 -12 -8 -4 -6 -10 -7 -6 -7 -19 -16 -9 -5 3 0 -7
+E -7 -14 -7 -3 -18 -3 8 -8 -8 -12 -10 -4 -10 -18 -8 -6 -7 -20 -9 -9 -5 2 0 -7
+G -6 -9 -7 -8 -9 -10 -8 7 -9 -19 -13 -9 -12 -13 -10 -6 -10 -11 -12 -12 -7 -9 0 -7
+H -9 -5 -4 -6 -8 -3 -8 -9 10 -11 -9 -7 -16 -7 -8 -6 -7 -8 -3 -10 -5 -5 0 -7
+I -9 -10 -11 -15 -7 -12 -12 -19 -11 8 -3 -11 -3 -7 -13 -11 -7 -8 -10 -1 -13 -12 0 -7
+L -9 -10 -11 -19 -16 -8 -10 -13 -9 -3 7 -10 -2 -5 -9 -10 -9 -8 -8 -5 -15 -9 0 -7
+K -8 -2 -5 -7 -17 -4 -4 -9 -7 -11 -10 8 -7 -18 -8 -7 -6 -10 -10 -10 -6 -4 0 -7
+M -7 -8 -9 -11 -6 -6 -10 -12 -16 -3 -2 -7 10 -4 -12 -10 -6 -16 -15 -5 -10 -8 0 -7
+F -10 -12 -12 -20 -17 -10 -18 -13 -7 -7 -5 -18 -4 9 -11 -9 -10 -5 -1 -8 -16 -14 0 -7
+P -6 -9 -10 -8 -11 -7 -8 -10 -8 -13 -9 -8 -12 -11 9 -6 -8 -11 -19 -9 -9 -7 0 -7
+S -4 -8 -4 -7 -5 -6 -6 -6 -6 -11 -10 -7 -10 -9 -6 8 -3 -10 -8 -10 -5 -6 0 -7
+T -5 -8 -5 -8 -7 -7 -7 -10 -7 -7 -9 -6 -6 -10 -8 -3 8 -19 -10 -6 -6 -7 0 -7
+W -11 -10 -12 -12 -19 -19 -20 -11 -8 -8 -8 -10 -16 -5 -11 -10 -19 12 -4 -17 -12 -19 0 -7
+Y -10 -9 -8 -17 -6 -16 -9 -12 -3 -10 -8 -10 -15 -1 -19 -8 -10 -4 10 -10 -12 -12 0 -7
+V -5 -11 -11 -11 -5 -9 -9 -12 -10 -1 -5 -10 -5 -8 -9 -10 -6 -17 -10 7 -11 -9 0 -7
+B -8 -11 3 2 -14 -5 -5 -7 -5 -13 -15 -6 -10 -16 -9 -5 -6 -12 -12 -11 8 -4 0 -7
+Z -7 -9 -6 -4 -17 3 2 -9 -5 -12 -9 -4 -8 -14 -7 -6 -7 -19 -12 -9 -4 8 0 -7
+X 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 -7
+* -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 1
\ No newline at end of file
diff --git a/data/VTML_120.mat b/data/VTML_120.mat
new file mode 100644
index 0000000..61fbf77
--- /dev/null
+++ b/data/VTML_120.mat
@@ -0,0 +1,34 @@
+#
+# VTML_120
+#
+# This matrix was produced from: vtml_120qij.mat using vtml_P.mat background frequencies
+#
+# VTML_120 substitution matrix, Units = bits/2.0
+# Expected score = -0.712191 bits; Entropy = 0.933608 bits
+# Target fraction identity = 0.3740
+# Lowest Score = -7, Highest Score= 11
+#
+ A R N D C Q E G H I L K M F P S T W Y V B Z X
+A 4 -2 -1 -1 0 -1 -1 0 -2 -2 -2 -1 -1 -3 -1 1 0 -4 -3 0 -1 -1 0
+R -2 6 -1 -3 -3 1 -2 -3 0 -4 -3 3 -2 -4 -2 -1 -2 -3 -3 -3 -2 0 0
+N -1 -1 6 2 -3 0 0 -1 1 -4 -4 0 -3 -4 -2 1 0 -5 -2 -3 4 0 0
+D -1 -3 2 6 -5 0 2 -1 -1 -5 -6 -1 -4 -7 -2 -1 -1 -6 -5 -4 4 1 0
+C 0 -3 -3 -5 10 -4 -5 -2 -2 -1 -4 -4 -1 -4 -3 0 -1 -6 -1 0 -4 -4 0
+Q -1 1 0 0 -4 5 2 -2 1 -3 -2 1 -1 -3 -1 -1 -1 -6 -4 -2 0 3 0
+E -1 -2 0 2 -5 2 5 -2 -1 -4 -4 1 -3 -5 -2 -1 -1 -6 -3 -3 1 3 0
+G 0 -3 -1 -1 -2 -2 -2 6 -2 -6 -5 -2 -4 -5 -3 -1 -2 -4 -5 -4 -1 -2 0
+H -2 0 1 -1 -2 1 -1 -2 7 -3 -2 -1 -3 -1 -2 -1 -1 -2 2 -3 0 0 0
+I -2 -4 -4 -5 -1 -3 -4 -6 -3 5 2 -3 2 0 -4 -3 -1 -2 -2 3 -4 -3 0
+L -2 -3 -4 -6 -4 -2 -4 -5 -2 2 5 -3 2 1 -3 -3 -2 -2 -1 1 -5 -3 0
+K -1 3 0 -1 -4 1 1 -2 -1 -3 -3 5 -2 -5 -1 -1 -1 -4 -3 -3 0 1 0
+M -1 -2 -3 -4 -1 -1 -3 -4 -3 2 2 -2 7 1 -4 -3 -1 -4 -3 1 -3 -2 0
+F -3 -4 -4 -7 -4 -3 -5 -5 -1 0 1 -5 1 7 -4 -3 -3 1 4 -1 -5 -4 0
+P -1 -2 -2 -2 -3 -1 -2 -3 -2 -4 -3 -1 -4 -4 7 0 -1 -4 -5 -3 -2 -1 0
+S 1 -1 1 -1 0 -1 -1 -1 -1 -3 -3 -1 -3 -3 0 4 2 -3 -2 -2 0 -1 0
+T 0 -2 0 -1 -1 -1 -1 -2 -1 -1 -2 -1 -1 -3 -1 2 5 -6 -3 0 0 -1 0
+W -4 -3 -5 -6 -6 -6 -6 -4 -2 -2 -2 -4 -4 1 -4 -3 -6 11 2 -4 -5 -6 0
+Y -3 -3 -2 -5 -1 -4 -3 -5 2 -2 -1 -3 -3 4 -5 -2 -3 2 7 -3 -3 -3 0
+V 0 -3 -3 -4 0 -2 -3 -4 -3 3 1 -3 1 -1 -3 -2 0 -4 -3 4 -3 -2 0
+B -1 -2 4 4 -4 0 1 -1 0 -4 -5 0 -3 -5 -2 0 0 -5 -3 -3 6 1 0
+Z -1 0 0 1 -4 3 3 -2 0 -3 -3 1 -2 -4 -1 -1 -1 -6 -3 -2 1 5 0
+X 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
diff --git a/data/VTML_160.mat b/data/VTML_160.mat
new file mode 100644
index 0000000..6f155ae
--- /dev/null
+++ b/data/VTML_160.mat
@@ -0,0 +1,34 @@
+#
+# VTML_160
+#
+# This matrix was produced from: vtml_160qij.mat using vtml_P.mat background frequencies
+#
+# VTML_160 substitution matrix, Units = bits/3.0
+# Expected score = -0.493659 bits; Entropy = 0.617215 bits
+# Target fraction identity = 0.2884
+# Lowest Score = -8, Highest Score= 16
+#
+ A R N D C Q E G H I L K M F P S T W Y V B Z X
+A 5 -2 -1 -1 1 -1 -1 0 -2 -2 -2 -1 -1 -3 0 1 1 -5 -4 0 -1 -1 0
+R -2 8 -1 -3 -3 2 -2 -3 1 -4 -4 4 -2 -5 -2 -1 -2 -4 -3 -4 -2 0 0
+N -1 -1 7 3 -3 0 0 0 1 -5 -5 0 -3 -5 -2 1 0 -6 -2 -4 5 0 0
+D -1 -3 3 8 -6 0 3 -1 0 -6 -7 0 -5 -8 -2 0 -1 -7 -6 -5 5 1 0
+C 1 -3 -3 -6 13 -5 -5 -3 -2 -1 -4 -5 -1 -4 -4 1 -1 -7 -1 1 -4 -5 0
+Q -1 2 0 0 -5 6 3 -3 2 -4 -3 2 -1 -4 -1 0 -1 -7 -4 -3 0 4 0
+E -1 -2 0 3 -5 3 6 -2 -1 -5 -4 1 -4 -6 -1 0 -1 -8 -4 -3 1 4 0
+G 0 -3 0 -1 -3 -3 -2 8 -3 -7 -7 -3 -5 -6 -3 0 -3 -5 -6 -5 0 -2 0
+H -2 1 1 0 -2 2 -1 -3 10 -4 -3 0 -4 0 -2 -1 -1 -2 3 -4 0 0 0
+I -2 -4 -5 -6 -1 -4 -5 -7 -4 6 3 -4 2 0 -5 -4 -1 -2 -2 4 -5 -4 0
+L -2 -4 -5 -7 -4 -3 -4 -7 -3 3 6 -4 4 2 -3 -4 -2 -2 -1 2 -6 -3 0
+K -1 4 0 0 -5 2 1 -3 0 -4 -4 6 -2 -6 -1 -1 -1 -5 -4 -3 0 1 0
+M -1 -2 -3 -5 -1 -1 -4 -5 -4 2 4 -2 8 1 -4 -3 -1 -4 -3 1 -4 -2 0
+F -3 -5 -5 -8 -4 -4 -6 -6 0 0 2 -6 1 9 -5 -3 -3 3 6 -1 -6 -5 0
+P 0 -2 -2 -2 -4 -1 -1 -3 -2 -5 -3 -1 -4 -5 10 0 -1 -5 -6 -3 -2 -1 0
+S 1 -1 1 0 1 0 0 0 -1 -4 -4 -1 -3 -3 0 5 2 -4 -2 -2 0 0 0
+T 1 -2 0 -1 -1 -1 -1 -3 -1 -1 -2 -1 -1 -3 -1 2 6 -7 -3 0 0 -1 0
+W -5 -4 -6 -7 -7 -7 -8 -5 -2 -2 -2 -5 -4 3 -5 -4 -7 16 4 -5 -6 -7 0
+Y -4 -3 -2 -6 -1 -4 -4 -6 3 -2 -1 -4 -3 6 -6 -2 -3 4 10 -3 -4 -4 0
+V 0 -4 -4 -5 1 -3 -3 -5 -4 4 2 -3 1 -1 -3 -2 0 -5 -3 5 -4 -3 0
+B -1 -2 5 5 -4 0 1 0 0 -5 -6 0 -4 -6 -2 0 0 -6 -4 -4 7 1 0
+Z -1 0 0 1 -5 4 4 -2 0 -4 -3 1 -2 -5 -1 0 -1 -7 -4 -3 1 6 0
+X 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
diff --git a/data/VTML_20.mat b/data/VTML_20.mat
new file mode 100644
index 0000000..4ad6d45
--- /dev/null
+++ b/data/VTML_20.mat
@@ -0,0 +1,34 @@
+#
+# VTML_20
+#
+# This matrix was produced from: vtml_20qij.mat using vtml_P.mat background frequencies
+#
+# VTML_20 substitution matrix, Units = bits/2.0
+# Expected score = -2.916179 bits; Entropy = 2.912514 bits
+# Target fraction identity = 0.8307
+# Lowest Score = -16, Highest Score= 12
+#
+ A R N D C Q E G H I L K M F P S T W Y V B Z X
+A 7 -7 -6 -6 -3 -5 -5 -4 -7 -7 -7 -6 -5 -8 -4 -2 -3 -9 -8 -3 -6 -5 0
+R -7 8 -5 -12 -7 -2 -10 -7 -3 -8 -8 0 -6 -10 -7 -6 -6 -8 -7 -9 -8 -6 0
+N -6 -5 8 -1 -8 -4 -5 -5 -3 -9 -9 -3 -7 -10 -8 -2 -4 -10 -6 -9 3 -4 0
+D -6 -12 -1 8 -14 -4 -1 -6 -4 -12 -15 -5 -9 -16 -6 -5 -6 -10 -14 -9 3 -2 0
+C -3 -7 -8 -14 12 -13 -14 -7 -6 -5 -12 -13 -4 -13 -9 -3 -5 -15 -4 -3 -11 -13 0
+Q -5 -2 -4 -4 -13 9 -1 -8 -2 -9 -6 -2 -4 -8 -5 -4 -5 -15 -12 -7 -4 4 0
+E -5 -10 -5 -1 -14 -1 7 -6 -6 -10 -8 -2 -8 -14 -6 -5 -6 -16 -7 -7 -3 3 0
+G -4 -7 -5 -6 -7 -8 -6 7 -7 -15 -11 -7 -10 -11 -8 -4 -8 -9 -10 -10 -5 -7 0
+H -7 -3 -3 -4 -6 -2 -6 -7 10 -9 -7 -5 -12 -5 -6 -5 -5 -6 -1 -8 -3 -4 0
+I -7 -8 -9 -12 -5 -9 -10 -15 -9 7 -2 -9 -2 -5 -10 -9 -5 -6 -8 1 -10 -9 0
+L -7 -8 -9 -15 -12 -6 -8 -11 -7 -2 6 -8 0 -3 -7 -8 -7 -6 -6 -3 -12 -7 0
+K -6 0 -3 -5 -13 -2 -2 -7 -5 -9 -8 7 -5 -14 -6 -5 -4 -9 -8 -8 -4 -2 0
+M -5 -6 -7 -9 -4 -4 -8 -10 -12 -2 0 -5 10 -3 -10 -8 -4 -13 -11 -3 -8 -6 0
+F -8 -10 -10 -16 -13 -8 -14 -11 -5 -5 -3 -14 -3 9 -9 -7 -8 -3 0 -6 -13 -11 0
+P -4 -7 -8 -6 -9 -5 -6 -8 -6 -10 -7 -6 -10 -9 9 -4 -6 -9 -15 -7 -7 -5 0
+S -2 -6 -2 -5 -3 -4 -5 -4 -5 -9 -8 -5 -8 -7 -4 7 -1 -8 -6 -8 -3 -4 0
+T -3 -6 -4 -6 -5 -5 -6 -8 -5 -5 -7 -4 -4 -8 -6 -1 8 -15 -8 -4 -5 -5 0
+W -9 -8 -10 -10 -15 -15 -16 -9 -6 -6 -6 -9 -13 -3 -9 -8 -15 12 -2 -13 -10 -15 0
+Y -8 -7 -6 -14 -4 -12 -7 -10 -1 -8 -6 -8 -11 0 -15 -6 -8 -2 9 -8 -10 -9 0
+V -3 -9 -9 -9 -3 -7 -7 -10 -8 1 -3 -8 -3 -6 -7 -8 -4 -13 -8 7 -9 -7 0
+B -6 -8 3 3 -11 -4 -3 -5 -3 -10 -12 -4 -8 -13 -7 -3 -5 -10 -10 -9 8 -2 0
+Z -5 -6 -4 -2 -13 4 3 -7 -4 -9 -7 -2 -6 -11 -5 -4 -5 -15 -9 -7 -2 8 0
+X 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
diff --git a/data/VTML_200.mat b/data/VTML_200.mat
new file mode 100644
index 0000000..25a8f4a
--- /dev/null
+++ b/data/VTML_200.mat
@@ -0,0 +1,34 @@
+#
+# VTML_200
+#
+# This matrix was produced from: vtml_200qij.mat using vtml_P.mat background frequencies
+#
+# VTML_200 substitution matrix, Units = bits/3.0
+# Expected score = -0.358430 bits; Entropy = 0.412084 bits
+# Target fraction identity = 0.2295
+# Lowest Score = -6, Highest Score= 15
+#
+ A R N D C Q E G H I L K M F P S T W Y V B Z X
+A 4 -2 -1 -1 1 -1 -1 0 -2 -1 -2 -1 -1 -3 0 1 1 -4 -3 0 -1 -1 0
+R -2 7 0 -2 -3 2 -1 -2 1 -3 -3 4 -2 -4 -1 -1 -1 -3 -2 -3 -1 0 0
+N -1 0 6 3 -2 1 1 0 1 -4 -4 1 -3 -4 -2 1 0 -5 -2 -3 4 1 0
+D -1 -2 3 6 -4 1 3 -1 0 -5 -5 0 -4 -6 -1 0 -1 -6 -4 -4 4 2 0
+C 1 -3 -2 -4 12 -3 -4 -2 -2 0 -3 -4 -1 -3 -3 1 0 -6 0 1 -3 -3 0
+Q -1 2 1 1 -3 5 2 -2 2 -3 -2 2 -1 -3 -1 0 0 -6 -3 -2 1 3 0
+E -1 -1 1 3 -4 2 5 -1 0 -4 -4 1 -3 -5 -1 0 -1 -6 -3 -3 2 3 0
+G 0 -2 0 -1 -2 -2 -1 8 -2 -6 -5 -2 -4 -5 -2 0 -2 -5 -5 -4 0 -1 0
+H -2 1 1 0 -2 2 0 -2 8 -3 -2 0 -3 0 -2 0 -1 -1 3 -3 0 1 0
+I -1 -3 -4 -5 0 -3 -4 -6 -3 5 3 -3 2 0 -4 -3 -1 -2 -2 4 -4 -3 0
+L -2 -3 -4 -5 -3 -2 -4 -5 -2 3 5 -3 3 2 -3 -3 -2 -1 -1 2 -4 -3 0
+K -1 4 1 0 -4 2 1 -2 0 -3 -3 5 -2 -5 -1 0 0 -4 -3 -3 0 1 0
+M -1 -2 -3 -4 -1 -1 -3 -4 -3 2 3 -2 6 1 -3 -2 -1 -3 -2 2 -3 -2 0
+F -3 -4 -4 -6 -3 -3 -5 -5 0 0 2 -5 1 8 -4 -3 -3 3 5 -1 -5 -4 0
+P 0 -1 -2 -1 -3 -1 -1 -2 -2 -4 -3 -1 -3 -4 9 0 -1 -4 -5 -3 -1 -1 0
+S 1 -1 1 0 1 0 0 0 0 -3 -3 0 -2 -3 0 4 2 -4 -2 -2 0 0 0
+T 1 -1 0 -1 0 0 -1 -2 -1 -1 -2 0 -1 -3 -1 2 4 -5 -3 0 0 0 0
+W -4 -3 -5 -6 -6 -6 -6 -5 -1 -2 -1 -4 -3 3 -4 -4 -5 15 4 -4 -5 -6 0
+Y -3 -2 -2 -4 0 -3 -3 -5 3 -2 -1 -3 -2 5 -5 -2 -3 4 9 -2 -3 -3 0
+V 0 -3 -3 -4 1 -2 -3 -4 -3 4 2 -3 2 -1 -3 -2 0 -4 -2 4 -3 -2 0
+B -1 -1 4 4 -3 1 2 0 0 -4 -4 0 -3 -5 -1 0 0 -5 -3 -3 6 2 0
+Z -1 0 1 2 -3 3 3 -1 1 -3 -3 1 -2 -4 -1 0 0 -6 -3 -2 2 5 0
+X 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
diff --git a/data/VTML_40.mat b/data/VTML_40.mat
new file mode 100644
index 0000000..0a9e637
--- /dev/null
+++ b/data/VTML_40.mat
@@ -0,0 +1,34 @@
+#
+# VTML_40
+#
+# This matrix was produced from: vtml_40qij.mat using vtml_P.mat background frequencies
+#
+# VTML_40 substitution matrix, Units = bits/2.0
+# Expected score = -1.991667 bits; Entropy = 2.267456 bits
+# Target fraction identity = 0.6960
+# Lowest Score = -12, Highest Score= 12
+#
+ A R N D C Q E G H I L K M F P S T W Y V B Z X
+A 6 -5 -4 -4 -1 -3 -3 -2 -5 -5 -5 -4 -4 -6 -3 0 -1 -7 -6 -1 -4 -3 0
+R -5 8 -3 -8 -5 -1 -7 -5 -2 -7 -6 2 -4 -8 -5 -4 -4 -6 -5 -7 -5 -4 0
+N -4 -3 8 0 -6 -2 -3 -3 -1 -7 -7 -2 -5 -8 -6 -1 -2 -8 -4 -7 4 -2 0
+D -4 -8 0 8 -10 -3 1 -4 -3 -10 -11 -3 -7 -12 -4 -3 -4 -9 -10 -7 4 -1 0
+C -1 -5 -6 -10 11 -9 -10 -5 -5 -3 -9 -9 -3 -9 -7 -2 -3 -12 -3 -2 -8 -9 0
+Q -3 -1 -2 -3 -9 8 1 -6 0 -7 -4 0 -3 -6 -3 -2 -3 -11 -8 -5 -2 4 0
+E -3 -7 -3 1 -10 1 7 -5 -4 -8 -7 -1 -6 -11 -4 -3 -4 -12 -5 -6 -1 4 0
+G -2 -5 -3 -4 -5 -6 -5 7 -5 -12 -9 -5 -8 -9 -6 -3 -6 -7 -8 -8 -3 -5 0
+H -5 -2 -1 -3 -5 0 -4 -5 10 -7 -5 -3 -8 -3 -4 -3 -4 -4 0 -6 -2 -2 0
+I -5 -7 -7 -10 -3 -7 -8 -12 -7 7 0 -7 0 -3 -8 -7 -3 -4 -5 2 -8 -7 0
+L -5 -6 -7 -11 -9 -4 -7 -9 -5 0 6 -6 1 -1 -5 -6 -5 -4 -4 -1 -9 -5 0
+K -4 2 -2 -3 -9 0 -1 -5 -3 -7 -6 7 -4 -10 -4 -3 -3 -7 -6 -6 -2 0 0
+M -4 -4 -5 -7 -3 -3 -6 -8 -8 0 1 -4 9 -1 -8 -6 -3 -9 -8 -2 -6 -4 0
+F -6 -8 -8 -12 -9 -6 -11 -9 -3 -3 -1 -10 -1 8 -7 -5 -6 -1 2 -4 -10 -8 0
+P -3 -5 -6 -4 -7 -3 -4 -6 -4 -8 -5 -4 -8 -7 8 -2 -4 -7 -11 -6 -5 -3 0
+S 0 -4 -1 -3 -2 -2 -3 -3 -3 -7 -6 -3 -6 -5 -2 7 1 -6 -4 -6 -2 -2 0
+T -1 -4 -2 -4 -3 -3 -4 -6 -4 -3 -5 -3 -3 -6 -4 1 7 -11 -6 -2 -3 -3 0
+W -7 -6 -8 -9 -12 -11 -12 -7 -4 -4 -4 -7 -9 -1 -7 -6 -11 12 0 -10 -8 -11 0
+Y -6 -5 -4 -10 -3 -8 -5 -8 0 -5 -4 -6 -8 2 -11 -4 -6 0 9 -6 -7 -6 0
+V -1 -7 -7 -7 -2 -5 -6 -8 -6 2 -1 -6 -2 -4 -6 -6 -2 -10 -6 6 -7 -5 0
+B -4 -5 4 4 -8 -2 -1 -3 -2 -8 -9 -2 -6 -10 -5 -2 -3 -8 -7 -7 8 0 0
+Z -3 -4 -2 -1 -9 4 4 -5 -2 -7 -5 0 -4 -8 -3 -2 -3 -11 -6 -5 0 7 0
+X 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
diff --git a/data/VTML_80.mat b/data/VTML_80.mat
new file mode 100644
index 0000000..c4202ff
--- /dev/null
+++ b/data/VTML_80.mat
@@ -0,0 +1,34 @@
+#
+# VTML_80
+#
+# This matrix was produced from: vtml_80qij.mat using vtml_P.mat background frequencies
+#
+# VTML_80 substitution matrix, Units = bits/2.0
+# Expected score = -1.134601 bits; Entropy = 1.427882 bits
+# Target fraction identity = 0.5015
+# Lowest Score = -9, Highest Score= 11
+#
+ A R N D C Q E G H I L K M F P S T W Y V B Z X
+A 5 -3 -2 -2 0 -2 -2 -1 -3 -3 -3 -2 -2 -4 -1 1 0 -5 -4 0 -2 -2 0
+R -3 7 -2 -5 -4 1 -3 -3 0 -5 -4 3 -3 -6 -3 -2 -3 -4 -3 -5 -3 -1 0
+N -2 -2 7 1 -4 -1 -1 -1 0 -5 -5 0 -4 -5 -4 1 -1 -6 -2 -5 4 -1 0
+D -2 -5 1 7 -7 -1 2 -2 -1 -7 -8 -2 -5 -9 -2 -1 -2 -7 -7 -5 4 0 0
+C 0 -4 -4 -7 10 -6 -7 -3 -3 -2 -5 -6 -1 -6 -4 0 -2 -8 -1 0 -5 -6 0
+Q -2 1 -1 -1 -6 7 2 -4 1 -5 -3 1 -2 -4 -2 -1 -2 -8 -5 -3 -1 4 0
+E -2 -3 -1 2 -7 2 6 -3 -2 -5 -5 0 -4 -7 -2 -1 -2 -8 -4 -4 0 4 0
+G -1 -3 -1 -2 -3 -4 -3 7 -3 -8 -7 -3 -6 -6 -4 -1 -4 -5 -6 -5 -1 -3 0
+H -3 0 0 -1 -3 1 -2 -3 9 -5 -3 -1 -5 -1 -3 -1 -2 -2 1 -4 0 0 0
+I -3 -5 -5 -7 -2 -5 -5 -8 -5 6 1 -5 1 -1 -6 -5 -2 -3 -3 3 -6 -5 0
+L -3 -4 -5 -8 -5 -3 -5 -7 -3 1 5 -4 2 0 -4 -4 -3 -2 -2 0 -6 -4 0
+K -2 3 0 -2 -6 1 0 -3 -1 -5 -4 6 -2 -7 -2 -2 -1 -5 -4 -4 -1 0 0
+M -2 -3 -4 -5 -1 -2 -4 -6 -5 1 2 -2 8 0 -5 -4 -1 -6 -4 0 -4 -3 0
+F -4 -6 -5 -9 -6 -4 -7 -6 -1 -1 0 -7 0 8 -5 -3 -4 1 3 -2 -7 -5 0
+P -1 -3 -4 -2 -4 -2 -2 -4 -3 -6 -4 -2 -5 -5 8 -1 -2 -5 -7 -4 -3 -2 0
+S 1 -2 1 -1 0 -1 -1 -1 -1 -5 -4 -2 -4 -3 -1 5 1 -4 -3 -3 0 -1 0
+T 0 -3 -1 -2 -2 -2 -2 -4 -2 -2 -3 -1 -1 -4 -2 1 6 -7 -4 -1 -1 -2 0
+W -5 -4 -6 -7 -8 -8 -8 -5 -2 -3 -2 -5 -6 1 -5 -4 -7 11 1 -6 -6 -8 0
+Y -4 -3 -2 -7 -1 -5 -4 -6 1 -3 -2 -4 -4 3 -7 -3 -4 1 8 -4 -4 -4 0
+V 0 -5 -5 -5 0 -3 -4 -5 -4 3 0 -4 0 -2 -4 -3 -1 -6 -4 5 -5 -3 0
+B -2 -3 4 4 -5 -1 0 -1 0 -6 -6 -1 -4 -7 -3 0 -1 -6 -4 -5 7 0 0
+Z -2 -1 -1 0 -6 4 4 -3 0 -5 -4 0 -3 -5 -2 -1 -2 -8 -4 -3 0 6 0
+X 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
diff --git a/data/blosum45.mat b/data/blosum45.mat
new file mode 100644
index 0000000..07b8f7a
--- /dev/null
+++ b/data/blosum45.mat
@@ -0,0 +1,30 @@
+# Matrix made by matblas from blosum45.iij
+# BLOSUM Clustered Scoring Matrix in 1/3 Bit Units
+# Blocks Database = /data/blocks_5.0/blocks.dat
+# Cluster Percentage: >= 45
+# Entropy = 0.3795, Expected = -0.2789
+ A R N D C Q E G H I L K M F P S T W Y V B Z X
+A 5 -2 -1 -2 -1 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -2 -2 0 -1 -1 0
+R -2 7 0 -1 -3 1 0 -2 0 -3 -2 3 -1 -2 -2 -1 -1 -2 -1 -2 -1 0 -1
+N -1 0 6 2 -2 0 0 0 1 -2 -3 0 -2 -2 -2 1 0 -4 -2 -3 4 0 -1
+D -2 -1 2 7 -3 0 2 -1 0 -4 -3 0 -3 -4 -1 0 -1 -4 -2 -3 5 1 -1
+C -1 -3 -2 -3 12 -3 -3 -3 -3 -3 -2 -3 -2 -2 -4 -1 -1 -5 -3 -1 -2 -3 -2
+Q -1 1 0 0 -3 6 2 -2 1 -2 -2 1 0 -4 -1 0 -1 -2 -1 -3 0 4 -1
+E -1 0 0 2 -3 2 6 -2 0 -3 -2 1 -2 -3 0 0 -1 -3 -2 -3 1 4 -1
+G 0 -2 0 -1 -3 -2 -2 7 -2 -4 -3 -2 -2 -3 -2 0 -2 -2 -3 -3 -1 -2 -1
+H -2 0 1 0 -3 1 0 -2 10 -3 -2 -1 0 -2 -2 -1 -2 -3 2 -3 0 0 -1
+I -1 -3 -2 -4 -3 -2 -3 -4 -3 5 2 -3 2 0 -2 -2 -1 -2 0 3 -3 -3 -1
+L -1 -2 -3 -3 -2 -2 -2 -3 -2 2 5 -3 2 1 -3 -3 -1 -2 0 1 -3 -2 -1
+K -1 3 0 0 -3 1 1 -2 -1 -3 -3 5 -1 -3 -1 -1 -1 -2 -1 -2 0 1 -1
+M -1 -1 -2 -3 -2 0 -2 -2 0 2 2 -1 6 0 -2 -2 -1 -2 0 1 -2 -1 -1
+F -2 -2 -2 -4 -2 -4 -3 -3 -2 0 1 -3 0 8 -3 -2 -1 1 3 0 -3 -3 -1
+P -1 -2 -2 -1 -4 -1 0 -2 -2 -2 -3 -1 -2 -3 9 -1 -1 -3 -3 -3 -2 -1 -1
+S 1 -1 1 0 -1 0 0 0 -1 -2 -3 -1 -2 -2 -1 4 2 -4 -2 -1 0 0 0
+T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -1 -1 2 5 -3 -1 0 0 -1 0
+W -2 -2 -4 -4 -5 -2 -3 -2 -3 -2 -2 -2 -2 1 -3 -4 -3 15 3 -3 -4 -2 -2
+Y -2 -1 -2 -2 -3 -1 -2 -3 2 0 0 -1 0 3 -3 -2 -1 3 8 -1 -2 -2 -1
+V 0 -2 -3 -3 -1 -3 -3 -3 -3 3 1 -2 1 0 -3 -1 0 -3 -1 5 -3 -3 -1
+B -1 -1 4 5 -2 0 1 -1 0 -3 -3 0 -2 -3 -2 0 0 -4 -2 -3 4 2 -1
+Z -1 0 0 1 -3 4 4 -2 0 -3 -2 1 -1 -3 -1 0 -1 -2 -2 -3 2 4 -1
+X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 0 0 -2 -1 -1 -1 -1 -1
+
diff --git a/data/blosum50.mat b/data/blosum50.mat
new file mode 100644
index 0000000..513e4f2
--- /dev/null
+++ b/data/blosum50.mat
@@ -0,0 +1,29 @@
+# Matrix made by matblas from blosum50.iij
+# BLOSUM Clustered Scoring Matrix in 1/3 Bit Units
+# Blocks Database = /data/blocks_5.0/blocks.dat
+# Cluster Percentage: >= 50
+# Entropy = 0.4808, Expected = -0.3573
+ A R N D C Q E G H I L K M F P S T W Y V B Z X
+A 5 -2 -1 -2 -1 -1 -1 0 -2 -1 -2 -1 -1 -3 -1 1 0 -3 -2 0 -2 -1 -1
+R -2 7 -1 -2 -4 1 0 -3 0 -4 -3 3 -2 -3 -3 -1 -1 -3 -1 -3 -1 0 -1
+N -1 -1 7 2 -2 0 0 0 1 -3 -4 0 -2 -4 -2 1 0 -4 -2 -3 4 0 -1
+D -2 -2 2 8 -4 0 2 -1 -1 -4 -4 -1 -4 -5 -1 0 -1 -5 -3 -4 5 1 -1
+C -1 -4 -2 -4 13 -3 -3 -3 -3 -2 -2 -3 -2 -2 -4 -1 -1 -5 -3 -1 -3 -3 -2
+Q -1 1 0 0 -3 7 2 -2 1 -3 -2 2 0 -4 -1 0 -1 -1 -1 -3 0 4 -1
+E -1 0 0 2 -3 2 6 -3 0 -4 -3 1 -2 -3 -1 -1 -1 -3 -2 -3 1 5 -1
+G 0 -3 0 -1 -3 -2 -3 8 -2 -4 -4 -2 -3 -4 -2 0 -2 -3 -3 -4 -1 -2 -2
+H -2 0 1 -1 -3 1 0 -2 10 -4 -3 0 -1 -1 -2 -1 -2 -3 2 -4 0 0 -1
+I -1 -4 -3 -4 -2 -3 -4 -4 -4 5 2 -3 2 0 -3 -3 -1 -3 -1 4 -4 -3 -1
+L -2 -3 -4 -4 -2 -2 -3 -4 -3 2 5 -3 3 1 -4 -3 -1 -2 -1 1 -4 -3 -1
+K -1 3 0 -1 -3 2 1 -2 0 -3 -3 6 -2 -4 -1 0 -1 -3 -2 -3 0 1 -1
+M -1 -2 -2 -4 -2 0 -2 -3 -1 2 3 -2 7 0 -3 -2 -1 -1 0 1 -3 -1 -1
+F -3 -3 -4 -5 -2 -4 -3 -4 -1 0 1 -4 0 8 -4 -3 -2 1 4 -1 -4 -4 -2
+P -1 -3 -2 -1 -4 -1 -1 -2 -2 -3 -4 -1 -3 -4 10 -1 -1 -4 -3 -3 -2 -1 -2
+S 1 -1 1 0 -1 0 -1 0 -1 -3 -3 0 -2 -3 -1 5 2 -4 -2 -2 0 0 -1
+T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 2 5 -3 -2 0 0 -1 0
+W -3 -3 -4 -5 -5 -1 -3 -3 -3 -3 -2 -3 -1 1 -4 -4 -3 15 2 -3 -5 -2 -3
+Y -2 -1 -2 -3 -3 -1 -2 -3 2 -1 -1 -2 0 4 -3 -2 -2 2 8 -1 -3 -2 -1
+V 0 -3 -3 -4 -1 -3 -3 -4 -4 4 1 -3 1 -1 -3 -2 0 -3 -1 5 -4 -3 -1
+B -2 -1 4 5 -3 0 1 -1 0 -4 -4 0 -3 -4 -2 0 0 -5 -3 -4 5 2 -1
+Z -1 0 0 1 -3 4 5 -2 0 -3 -3 1 -1 -4 -1 0 -1 -2 -2 -3 2 5 -1
+X -1 -1 -1 -1 -2 -1 -1 -2 -1 -1 -1 -1 -1 -2 -2 -1 0 -3 -1 -1 -1 -1 -1
diff --git a/data/blosum62.mat b/data/blosum62.mat
new file mode 100644
index 0000000..6174a66
--- /dev/null
+++ b/data/blosum62.mat
@@ -0,0 +1,30 @@
+# Matrix made by matblas from blosum62.iij
+# BLOSUM Clustered Scoring Matrix in 1/2 Bit Units
+# Blocks Database = /data/blocks_5.0/blocks.dat
+# Cluster Percentage: >= 62
+# Entropy = 0.6979, Expected = -0.5209
+ A R N D C Q E G H I L K M F P S T W Y V B Z X
+A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 0
+R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0 -1
+N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 3 0 -1
+D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 1 -1
+C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2
+Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 3 -1
+E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 4 -1
+G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -2 -1
+H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 0 -1
+I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 -3 -1
+L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 -3 -1
+K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 1 -1
+M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 -1 -1
+F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 -3 -1
+P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -1 -2
+S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 0 0
+T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 0
+W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 -3 -2
+Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -2 -1
+V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 -2 -1
+B -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1 -1
+Z -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 1 4 -1
+X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1 -1 -1 -1 -1
+
diff --git a/data/blosum80.mat b/data/blosum80.mat
new file mode 100644
index 0000000..23191a3
--- /dev/null
+++ b/data/blosum80.mat
@@ -0,0 +1,30 @@
+# Matrix made by matblas from blosum80_3.iij
+# BLOSUM Clustered Scoring Matrix in 1/3 Bit Units
+# Blocks Database = /data/blocks_5.0/blocks.dat
+# Cluster Percentage: >= 80
+# Entropy = 0.9868, Expected = -0.7442
+ A R N D C Q E G H I L K M F P S T W Y V B Z X
+A 7 -3 -3 -3 -1 -2 -2 0 -3 -3 -3 -1 -2 -4 -1 2 0 -5 -4 -1 -3 -2 -1
+R -3 9 -1 -3 -6 1 -1 -4 0 -5 -4 3 -3 -5 -3 -2 -2 -5 -4 -4 -2 0 -2
+N -3 -1 9 2 -5 0 -1 -1 1 -6 -6 0 -4 -6 -4 1 0 -7 -4 -5 5 -1 -2
+D -3 -3 2 10 -7 -1 2 -3 -2 -7 -7 -2 -6 -6 -3 -1 -2 -8 -6 -6 6 1 -3
+C -1 -6 -5 -7 13 -5 -7 -6 -7 -2 -3 -6 -3 -4 -6 -2 -2 -5 -5 -2 -6 -7 -4
+Q -2 1 0 -1 -5 9 3 -4 1 -5 -4 2 -1 -5 -3 -1 -1 -4 -3 -4 -1 5 -2
+E -2 -1 -1 2 -7 3 8 -4 0 -6 -6 1 -4 -6 -2 -1 -2 -6 -5 -4 1 6 -2
+G 0 -4 -1 -3 -6 -4 -4 9 -4 -7 -7 -3 -5 -6 -5 -1 -3 -6 -6 -6 -2 -4 -3
+H -3 0 1 -2 -7 1 0 -4 12 -6 -5 -1 -4 -2 -4 -2 -3 -4 3 -5 -1 0 -2
+I -3 -5 -6 -7 -2 -5 -6 -7 -6 7 2 -5 2 -1 -5 -4 -2 -5 -3 4 -6 -6 -2
+L -3 -4 -6 -7 -3 -4 -6 -7 -5 2 6 -4 3 0 -5 -4 -3 -4 -2 1 -7 -5 -2
+K -1 3 0 -2 -6 2 1 -3 -1 -5 -4 8 -3 -5 -2 -1 -1 -6 -4 -4 -1 1 -2
+M -2 -3 -4 -6 -3 -1 -4 -5 -4 2 3 -3 9 0 -4 -3 -1 -3 -3 1 -5 -3 -2
+F -4 -5 -6 -6 -4 -5 -6 -6 -2 -1 0 -5 0 10 -6 -4 -4 0 4 -2 -6 -6 -3
+P -1 -3 -4 -3 -6 -3 -2 -5 -4 -5 -5 -2 -4 -6 12 -2 -3 -7 -6 -4 -4 -2 -3
+S 2 -2 1 -1 -2 -1 -1 -1 -2 -4 -4 -1 -3 -4 -2 7 2 -6 -3 -3 0 -1 -1
+T 0 -2 0 -2 -2 -1 -2 -3 -3 -2 -3 -1 -1 -4 -3 2 8 -5 -3 0 -1 -2 -1
+W -5 -5 -7 -8 -5 -4 -6 -6 -4 -5 -4 -6 -3 0 -7 -6 -5 16 3 -5 -8 -5 -5
+Y -4 -4 -4 -6 -5 -3 -5 -6 3 -3 -2 -4 -3 4 -6 -3 -3 3 11 -3 -5 -4 -3
+V -1 -4 -5 -6 -2 -4 -4 -6 -5 4 1 -4 1 -2 -4 -3 0 -5 -3 7 -6 -4 -2
+B -3 -2 5 6 -6 -1 1 -2 -1 -6 -7 -1 -5 -6 -4 0 -1 -8 -5 -6 6 0 -3
+Z -2 0 -1 1 -7 5 6 -4 0 -6 -5 1 -3 -6 -2 -1 -2 -5 -4 -4 0 6 -1
+X -1 -2 -2 -3 -4 -2 -2 -3 -2 -2 -2 -2 -2 -3 -3 -1 -1 -5 -3 -2 -3 -1 -2
+
diff --git a/data/dna.mat b/data/dna.mat
new file mode 100644
index 0000000..914cac4
--- /dev/null
+++ b/data/dna.mat
@@ -0,0 +1,19 @@
+# Sample dna matrix
+ A C G T U R Y M W S K D H V B N X
+A 5 -4 -4 -4 -4 2 -1 2 2 -1 -1 1 1 1 -2 -1 -1
+C -4 5 -4 -4 -4 -1 2 2 -1 2 -1 -2 1 1 1 -1 -1
+G -4 -4 5 -4 -4 2 -1 -1 -1 2 2 1 -2 1 1 -1 -1
+T -4 -4 -4 5 5 -1 2 -1 2 -1 2 1 1 -2 1 -1 -1
+U -4 -4 -4 5 5 -1 2 -1 2 -1 2 1 1 -2 1 -1 -1
+R 2 -1 2 -1 -1 2 -2 -1 1 1 1 1 -1 1 -1 -1 -1
+Y -1 2 -1 2 2 -2 2 -1 1 1 1 -1 1 -1 1 -1 -1
+M 2 2 -1 -1 -1 -1 -1 2 1 1 -1 -1 1 1 -1 -1 -1
+W 2 -1 -1 2 2 1 1 1 2 -1 1 1 1 -1 -1 -1 -1
+S -1 2 2 -1 -1 1 1 1 -1 2 1 -1 -1 1 1 -1 -1
+K -1 -1 2 2 2 1 1 -1 1 1 2 1 -1 -1 1 -1 -1
+D 1 -2 1 1 1 1 -1 -1 1 -1 1 1 -1 -1 -1 -1 -1
+H 1 1 -2 1 1 -1 1 1 1 -1 -1 -1 1 -1 -1 -1 -1
+V 1 1 1 -2 -2 1 -1 1 -1 1 -1 -1 -1 1 -1 -1 -1
+B -2 1 1 1 1 -1 1 -1 -1 1 1 -1 -1 -1 1 -1 -1
+N -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
+X -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
diff --git a/data/idn_aa.mat b/data/idn_aa.mat
new file mode 100644
index 0000000..f972612
--- /dev/null
+++ b/data/idn_aa.mat
@@ -0,0 +1,24 @@
+ A R N D C Q E G H I L K M F P S T W Y V B Z X
+A 4 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10
+R -10 4 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10
+N -10 -10 4 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10
+D -10 -10 -10 4 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10
+C -10 -10 -10 -10 4 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10
+Q -10 -10 -10 -10 -10 4 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10
+E -10 -10 -10 -10 -10 -10 4 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10
+G -10 -10 -10 -10 -10 -10 -10 4 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10
+H -10 -10 -10 -10 -10 -10 -10 -10 4 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10
+I -10 -10 -10 -10 -10 -10 -10 -10 -10 4 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10
+L -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 4 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10
+K -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 4 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10
+M -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 4 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10
+F -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 4 -10 -10 -10 -10 -10 -10 -10 -10 -10
+P -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 4 -10 -10 -10 -10 -10 -10 -10 -10
+S -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 4 -10 -10 -10 -10 -10 -10 -10
+T -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 4 -10 -10 -10 -10 -10 -10
+W -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 4 -10 -10 -10 -10 -10
+Y -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 4 -10 -10 -10 -10
+V -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 4 -10 -10 -10
+B -10 -10 2 2 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 4 -10 -10
+Z -10 -10 -10 -10 -10 2 2 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 4 -10
+X -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 -10 0
diff --git a/data/md_10.mat b/data/md_10.mat
new file mode 100644
index 0000000..918e604
--- /dev/null
+++ b/data/md_10.mat
@@ -0,0 +1,24 @@
+ A R N D C Q E G H I L K M F P S T W Y V B Z X
+A 11 -13 -12 -11 -13 -13 -10 -8 -15 -13 -15 -14 -13 -18 -7 -5 -4 -20 -19 -6 -12 -11 -1
+R -12 12 -13 -18 -10 -5 -15 -9 -5 -17 -14 -2 -14 -22 -11 -10 -12 -9 -17 -17 -15 -10 -1
+N -12 -13 13 -3 -14 -11 -12 -11 -5 -13 -19 -6 -15 -20 -17 -4 -7 -21 -12 -17 5 -11 -1
+D -11 -18 -3 12 -20 -13 -2 -9 -10 -19 -21 -15 -18 -23 -18 -12 -14 -24 -13 -15 5 -7 -1
+C -13 -10 -14 -20 17 -19 -22 -12 -12 -18 -16 -21 -15 -11 -18 -7 -14 -9 -7 -12 -17 -21 -1
+Q -13 -5 -11 -13 -19 13 -5 -15 -3 -19 -12 -6 -14 -22 -8 -13 -13 -17 -16 -17 -12 4 -1
+E -10 -15 -12 -2 -22 -5 12 -9 -15 -19 -20 -8 -17 -23 -17 -15 -15 -20 -21 -14 -7 3 -1
+G -8 -9 -11 -9 -12 -16 -9 11 -16 -21 -21 -15 -18 -22 -16 -7 -14 -13 -21 -13 -10 -13 -1
+H -16 -5 -5 -10 -12 -3 -15 -16 16 -17 -13 -13 -15 -14 -10 -11 -13 -20 -3 -19 -7 -9 -1
+I -13 -17 -14 -19 -17 -20 -19 -21 -18 12 -7 -17 -4 -11 -19 -14 -7 -20 -15 -1 -16 -19 -1
+L -15 -14 -19 -21 -16 -12 -20 -21 -13 -7 10 -18 -4 -6 -10 -13 -15 -13 -16 -8 -20 -16 -1
+K -14 -2 -6 -15 -21 -6 -8 -15 -13 -17 -18 12 -12 -24 -17 -13 -10 -19 -20 -18 -11 -7 -1
+M -13 -14 -15 -18 -15 -14 -18 -19 -15 -4 -4 -12 16 -14 -17 -15 -7 -16 -18 -5 -16 -16 -1
+F -18 -22 -19 -22 -11 -22 -23 -22 -14 -11 -6 -23 -14 14 -17 -11 -18 -13 -3 -12 -21 -22 -1
+P -7 -12 -17 -18 -18 -8 -17 -16 -10 -19 -10 -16 -17 -17 13 -6 -9 -22 -20 -16 -17 -13 -1
+S -5 -10 -4 -12 -7 -13 -15 -7 -11 -14 -13 -13 -15 -11 -6 11 -4 -15 -12 -14 -8 -14 -1
+T -4 -12 -7 -14 -14 -13 -15 -14 -13 -7 -16 -10 -7 -19 -9 -4 12 -19 -17 -10 -10 -14 -1
+W -21 -9 -21 -21 -10 -17 -21 -13 -21 -21 -13 -21 -17 -13 -21 -15 -18 18 -12 -16 -21 -19 -1
+Y -20 -17 -12 -13 -7 -16 -21 -20 -3 -15 -16 -20 -17 -3 -20 -12 -17 -12 15 -18 -13 -19 -1
+V -6 -17 -17 -15 -12 -17 -14 -13 -19 -1 -8 -18 -5 -12 -16 -14 -10 -16 -18 11 -16 -15 -1
+B -12 -15 5 5 -17 -12 -7 -10 -7 -16 -20 -11 -17 -21 -17 -8 -10 -22 -13 -16 13 -9 -1
+Z -16 -18 -17 -8 -32 1 9 -17 -17 -29 -26 -11 -24 -34 -21 -21 -21 -29 -29 -22 -9 13 -1
+X -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
diff --git a/data/md_20.mat b/data/md_20.mat
new file mode 100644
index 0000000..d4f7ab9
--- /dev/null
+++ b/data/md_20.mat
@@ -0,0 +1,24 @@
+ A R N D C Q E G H I L K M F P S T W Y V B Z X
+A 10 -10 -9 -8 -10 -10 -7 -5 -12 -10 -12 -11 -9 -15 -5 -2 -1 -17 -16 -3 -9 -8 -1
+R -10 12 -10 -14 -7 -3 -11 -6 -3 -14 -12 0 -11 -18 -9 -7 -9 -6 -14 -14 -12 -7 -1
+N -9 -10 13 -1 -11 -8 -9 -8 -2 -11 -15 -4 -12 -16 -13 -1 -4 -18 -9 -14 6 -8 -1
+D -8 -14 -1 12 -16 -9 1 -6 -7 -16 -18 -11 -15 -20 -15 -9 -11 -20 -11 -12 6 -4 -1
+C -10 -7 -11 -16 17 -16 -19 -9 -9 -14 -13 -17 -12 -8 -14 -4 -11 -7 -4 -10 -14 -17 -1
+Q -10 -3 -8 -9 -16 13 -3 -12 0 -16 -9 -3 -11 -18 -5 -10 -10 -14 -12 -14 -9 5 -1
+E -7 -11 -9 1 -19 -3 11 -7 -12 -16 -17 -5 -14 -20 -14 -12 -12 -17 -18 -11 -4 4 -1
+G -5 -6 -8 -6 -9 -12 -7 11 -13 -17 -18 -12 -15 -19 -12 -5 -11 -10 -17 -11 -7 -9 -1
+H -12 -3 -2 -7 -9 0 -12 -13 15 -14 -10 -9 -12 -11 -7 -8 -10 -16 0 -15 -4 -6 -1
+I -10 -14 -11 -16 -14 -16 -16 -17 -14 12 -4 -14 -1 -8 -15 -11 -4 -16 -12 2 -13 -16 -1
+L -12 -11 -15 -18 -13 -9 -17 -18 -10 -4 10 -15 -2 -4 -7 -10 -12 -10 -13 -5 -17 -13 -1
+K -11 0 -4 -12 -17 -3 -5 -12 -9 -14 -15 12 -9 -21 -13 -10 -7 -16 -17 -15 -8 -4 -1
+M -9 -11 -12 -15 -12 -11 -15 -16 -12 -1 -2 -9 15 -10 -14 -12 -4 -13 -14 -3 -13 -13 -1
+F -15 -19 -16 -19 -8 -18 -20 -19 -11 -8 -4 -19 -10 13 -14 -8 -15 -10 0 -9 -17 -19 -1
+P -5 -9 -13 -15 -14 -5 -14 -12 -7 -15 -7 -13 -14 -14 12 -3 -7 -18 -16 -13 -14 -10 -1
+S -2 -8 -1 -9 -4 -10 -12 -5 -8 -11 -10 -10 -12 -8 -3 10 -1 -12 -9 -11 -5 -11 -1
+T -1 -9 -4 -11 -10 -10 -12 -11 -10 -4 -12 -7 -4 -15 -7 -1 11 -16 -14 -7 -7 -11 -1
+W -17 -6 -18 -18 -7 -14 -18 -10 -17 -17 -10 -17 -14 -10 -18 -12 -15 18 -9 -13 -18 -16 -1
+Y -16 -14 -9 -11 -4 -12 -18 -17 0 -12 -12 -17 -14 0 -16 -9 -13 -9 14 -15 -10 -15 -1
+V -3 -14 -14 -12 -9 -14 -11 -11 -15 2 -5 -15 -2 -9 -13 -11 -7 -13 -14 11 -13 -12 -1
+B -9 -12 6 6 -14 -9 -4 -7 -4 -13 -17 -8 -13 -18 -14 -5 -7 -19 -10 -13 12 -6 -1
+Z -12 -13 -13 -4 -27 4 10 -13 -12 -24 -21 -6 -20 -29 -17 -17 -17 -24 -24 -18 -6 12 -1
+X -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
diff --git a/data/md_40.mat b/data/md_40.mat
new file mode 100644
index 0000000..ff34bd3
--- /dev/null
+++ b/data/md_40.mat
@@ -0,0 +1,24 @@
+ A R N D C Q E G H I L K M F P S T W Y V B Z X
+A 9 -7 -6 -6 -7 -7 -5 -3 -10 -6 -9 -8 -7 -11 -2 0 1 -13 -12 -1 -6 -6 -1
+R -7 11 -6 -10 -5 0 -8 -4 0 -10 -9 3 -8 -14 -6 -5 -6 -4 -10 -11 -8 -4 -1
+N -6 -6 12 2 -8 -5 -5 -5 0 -8 -12 -1 -9 -13 -9 1 -2 -16 -6 -10 7 -5 -1
+D -6 -10 2 11 -13 -6 3 -4 -5 -12 -15 -8 -11 -16 -11 -6 -7 -15 -8 -9 6 -1 -1
+C -6 -5 -8 -13 16 -12 -15 -7 -6 -11 -11 -13 -9 -6 -11 -2 -7 -4 -2 -7 -11 -13 -1
+Q -7 0 -5 -6 -12 12 0 -9 2 -13 -6 0 -8 -14 -3 -7 -7 -11 -9 -11 -6 6 -1
+E -5 -8 -5 3 -15 0 10 -4 -8 -12 -13 -3 -11 -16 -10 -8 -8 -13 -14 -8 -1 5 -1
+G -3 -4 -5 -4 -7 -9 -4 10 -10 -13 -14 -9 -12 -15 -9 -2 -8 -7 -15 -8 -5 -7 -1
+H -10 0 0 -5 -6 2 -8 -10 14 -11 -7 -6 -9 -7 -4 -6 -7 -12 2 -12 -2 -3 -1
+I -6 -10 -8 -12 -11 -13 -12 -13 -11 11 -1 -11 1 -6 -11 -8 -2 -12 -9 4 -10 -12 -1
+L -9 -9 -12 -14 -11 -6 -13 -14 -7 -1 9 -12 1 -1 -5 -7 -9 -7 -9 -2 -13 -10 -1
+K -8 3 -1 -8 -13 0 -3 -9 -6 -11 -12 11 -7 -18 -10 -7 -5 -12 -13 -12 -5 -2 -1
+M -7 -8 -9 -11 -8 -8 -11 -12 -9 1 1 -7 14 -7 -10 -8 -2 -11 -11 0 -10 -10 -1
+F -11 -14 -12 -16 -6 -14 -16 -15 -7 -6 -1 -17 -7 13 -11 -5 -11 -7 2 -6 -14 -15 -1
+P -2 -6 -9 -12 -11 -3 -10 -9 -4 -11 -5 -10 -10 -11 12 -1 -4 -14 -12 -9 -11 -7 -1
+S 0 -5 1 -6 -2 -7 -8 -2 -6 -8 -7 -7 -8 -5 -1 9 1 -10 -7 -7 -3 -8 -1
+T 1 -6 -2 -7 -7 -7 -8 -8 -7 -2 -9 -5 -2 -11 -4 1 10 -14 -10 -4 -5 -8 -1
+W -14 -4 -17 -15 -4 -12 -13 -7 -11 -12 -7 -13 -11 -7 -14 -10 -14 18 -6 -11 -16 -12 -1
+Y -12 -9 -6 -8 -2 -9 -14 -14 2 -9 -9 -13 -11 2 -12 -7 -11 -6 14 -11 -7 -11 -1
+V -1 -11 -10 -9 -7 -11 -8 -8 -12 4 -2 -12 0 -6 -10 -7 -4 -10 -11 10 -10 -9 -1
+B -6 -8 7 6 -11 -6 -1 -5 -2 -10 -13 -5 -10 -14 -10 -3 -5 -16 -7 -10 11 -3 -1
+Z -8 -8 -8 0 -21 6 10 -9 -7 -18 -16 -3 -15 -23 -12 -12 -12 -19 -18 -14 -3 11 -1
+X -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
diff --git a/data/pam120.mat b/data/pam120.mat
new file mode 100644
index 0000000..322256a
--- /dev/null
+++ b/data/pam120.mat
@@ -0,0 +1,34 @@
+#
+# This matrix was produced by "pam" Version 1.0.6 [28-Jul-93]
+#
+# PAM 120 substitution matrix, scale = ln(2)/2 = 0.346574
+#
+# Expected score = -1.64, Entropy = 0.979 bits
+#
+# Lowest score = -8, Highest score = 12
+#
+ A R N D C Q E G H I L K M F P S T W Y V B Z X
+A 3 -3 -1 0 -3 -1 0 1 -3 -1 -3 -2 -2 -4 1 1 1 -7 -4 0 0 -1 -1
+R -3 6 -1 -3 -4 1 -3 -4 1 -2 -4 2 -1 -5 -1 -1 -2 1 -5 -3 -2 -1 -2
+N -1 -1 4 2 -5 0 1 0 2 -2 -4 1 -3 -4 -2 1 0 -4 -2 -3 3 0 -1
+D 0 -3 2 5 -7 1 3 0 0 -3 -5 -1 -4 -7 -3 0 -1 -8 -5 -3 4 3 -2
+C -3 -4 -5 -7 9 -7 -7 -4 -4 -3 -7 -7 -6 -6 -4 0 -3 -8 -1 -3 -6 -7 -4
+Q -1 1 0 1 -7 6 2 -3 3 -3 -2 0 -1 -6 0 -2 -2 -6 -5 -3 0 4 -1
+E 0 -3 1 3 -7 2 5 -1 -1 -3 -4 -1 -3 -7 -2 -1 -2 -8 -5 -3 3 4 -1
+G 1 -4 0 0 -4 -3 -1 5 -4 -4 -5 -3 -4 -5 -2 1 -1 -8 -6 -2 0 -2 -2
+H -3 1 2 0 -4 3 -1 -4 7 -4 -3 -2 -4 -3 -1 -2 -3 -3 -1 -3 1 1 -2
+I -1 -2 -2 -3 -3 -3 -3 -4 -4 6 1 -3 1 0 -3 -2 0 -6 -2 3 -3 -3 -1
+L -3 -4 -4 -5 -7 -2 -4 -5 -3 1 5 -4 3 0 -3 -4 -3 -3 -2 1 -4 -3 -2
+K -2 2 1 -1 -7 0 -1 -3 -2 -3 -4 5 0 -7 -2 -1 -1 -5 -5 -4 0 -1 -2
+M -2 -1 -3 -4 -6 -1 -3 -4 -4 1 3 0 8 -1 -3 -2 -1 -6 -4 1 -4 -2 -2
+F -4 -5 -4 -7 -6 -6 -7 -5 -3 0 0 -7 -1 8 -5 -3 -4 -1 4 -3 -5 -6 -3
+P 1 -1 -2 -3 -4 0 -2 -2 -1 -3 -3 -2 -3 -5 6 1 -1 -7 -6 -2 -2 -1 -2
+S 1 -1 1 0 0 -2 -1 1 -2 -2 -4 -1 -2 -3 1 3 2 -2 -3 -2 0 -1 -1
+T 1 -2 0 -1 -3 -2 -2 -1 -3 0 -3 -1 -1 -4 -1 2 4 -6 -3 0 0 -2 -1
+W -7 1 -4 -8 -8 -6 -8 -8 -3 -6 -3 -5 -6 -1 -7 -2 -6 12 -2 -8 -6 -7 -5
+Y -4 -5 -2 -5 -1 -5 -5 -6 -1 -2 -2 -5 -4 4 -6 -3 -3 -2 8 -3 -3 -5 -3
+V 0 -3 -3 -3 -3 -3 -3 -2 -3 3 1 -4 1 -3 -2 -2 0 -8 -3 5 -3 -3 -1
+B 0 -2 3 4 -6 0 3 0 1 -3 -4 0 -4 -5 -2 0 0 -6 -3 -3 4 2 -1
+Z -1 -1 0 3 -7 4 4 -2 1 -3 -3 -1 -2 -6 -1 -1 -2 -7 -5 -3 2 4 -1
+X -1 -2 -1 -2 -4 -1 -1 -2 -2 -1 -2 -2 -2 -3 -2 -1 -1 -5 -3 -1 -1 -1 -2
+
diff --git a/data/pam250.mat b/data/pam250.mat
new file mode 100644
index 0000000..9c79415
--- /dev/null
+++ b/data/pam250.mat
@@ -0,0 +1,34 @@
+#
+# This matrix was produced by "pam" Version 1.0.6 [28-Jul-93]
+#
+# PAM 250 substitution matrix, scale = ln(2)/3 = 0.231049
+#
+# Expected score = -0.844, Entropy = 0.354 bits
+#
+# Lowest score = -8, Highest score = 17
+#
+ A R N D C Q E G H I L K M F P S T W Y V B Z X
+A 2 -2 0 0 -2 0 0 1 -1 -1 -2 -1 -1 -3 1 1 1 -6 -3 0 0 0 0
+R -2 6 0 -1 -4 1 -1 -3 2 -2 -3 3 0 -4 0 0 -1 2 -4 -2 -1 0 -1
+N 0 0 2 2 -4 1 1 0 2 -2 -3 1 -2 -3 0 1 0 -4 -2 -2 2 1 0
+D 0 -1 2 4 -5 2 3 1 1 -2 -4 0 -3 -6 -1 0 0 -7 -4 -2 3 3 -1
+C -2 -4 -4 -5 12 -5 -5 -3 -3 -2 -6 -5 -5 -4 -3 0 -2 -8 0 -2 -4 -5 -3
+Q 0 1 1 2 -5 4 2 -1 3 -2 -2 1 -1 -5 0 -1 -1 -5 -4 -2 1 3 -1
+E 0 -1 1 3 -5 2 4 0 1 -2 -3 0 -2 -5 -1 0 0 -7 -4 -2 3 3 -1
+G 1 -3 0 1 -3 -1 0 5 -2 -3 -4 -2 -3 -5 0 1 0 -7 -5 -1 0 0 -1
+H -1 2 2 1 -3 3 1 -2 6 -2 -2 0 -2 -2 0 -1 -1 -3 0 -2 1 2 -1
+I -1 -2 -2 -2 -2 -2 -2 -3 -2 5 2 -2 2 1 -2 -1 0 -5 -1 4 -2 -2 -1
+L -2 -3 -3 -4 -6 -2 -3 -4 -2 2 6 -3 4 2 -3 -3 -2 -2 -1 2 -3 -3 -1
+K -1 3 1 0 -5 1 0 -2 0 -2 -3 5 0 -5 -1 0 0 -3 -4 -2 1 0 -1
+M -1 0 -2 -3 -5 -1 -2 -3 -2 2 4 0 6 0 -2 -2 -1 -4 -2 2 -2 -2 -1
+F -3 -4 -3 -6 -4 -5 -5 -5 -2 1 2 -5 0 9 -5 -3 -3 0 7 -1 -4 -5 -2
+P 1 0 0 -1 -3 0 -1 0 0 -2 -3 -1 -2 -5 6 1 0 -6 -5 -1 -1 0 -1
+S 1 0 1 0 0 -1 0 1 -1 -1 -3 0 -2 -3 1 2 1 -2 -3 -1 0 0 0
+T 1 -1 0 0 -2 -1 0 0 -1 0 -2 0 -1 -3 0 1 3 -5 -3 0 0 -1 0
+W -6 2 -4 -7 -8 -5 -7 -7 -3 -5 -2 -3 -4 0 -6 -2 -5 17 0 -6 -5 -6 -4
+Y -3 -4 -2 -4 0 -4 -4 -5 0 -1 -1 -4 -2 7 -5 -3 -3 0 10 -2 -3 -4 -2
+V 0 -2 -2 -2 -2 -2 -2 -1 -2 4 2 -2 2 -1 -1 -1 0 -6 -2 4 -2 -2 -1
+B 0 -1 2 3 -4 1 3 0 1 -2 -3 1 -2 -4 -1 0 0 -5 -3 -2 3 2 -1
+Z 0 0 1 3 -5 3 3 0 2 -2 -3 0 -2 -5 0 0 -1 -6 -4 -2 2 3 -1
+X 0 -1 0 -1 -3 -1 -1 -1 -1 -1 -1 -1 -1 -2 -1 0 0 -4 -2 -1 -1 -1 -1
+
diff --git a/data/rna.mat b/data/rna.mat
new file mode 100644
index 0000000..b759092
--- /dev/null
+++ b/data/rna.mat
@@ -0,0 +1,19 @@
+# Sample rna matrix with +2 for G:A, TU:C
+ A C G T U R Y M W S K D H V B N X
+A 5 -4 2 -4 -4 2 -1 1 1 -1 -1 1 1 1 -2 -1 -1
+C -4 5 -4 2 2 -1 1 1 -1 1 -1 -2 1 1 1 -1 -1
+G -4 -4 5 -4 -4 1 -1 -1 -1 1 1 1 -2 1 1 -1 -1
+T -4 -4 -4 5 5 -1 2 -1 1 -1 1 1 1 -2 1 -1 -1
+U -4 -4 -4 5 5 -1 2 -1 1 -1 1 1 1 -2 1 -1 -1
+R 2 -1 2 -1 -1 2 -2 -1 1 1 1 1 -1 1 -1 -1 -1
+Y -1 2 -1 2 2 -2 2 -1 1 1 1 -1 1 -1 1 -1 -1
+M 1 1 -1 -1 -1 -1 -1 2 1 1 -1 -1 1 1 -1 -1 -1
+W 1 -1 -1 1 1 1 1 1 1 -1 1 1 1 -1 -1 -1 -1
+S -1 1 1 -1 -1 1 1 1 -1 2 1 -1 -1 1 1 -1 -1
+K -1 -1 1 1 1 1 1 -1 1 1 2 1 -1 -1 1 -1 -1
+D 1 -2 1 1 1 1 -1 -1 1 -1 1 1 -1 -1 -1 -1 -1
+H 1 1 -2 1 1 -1 1 1 1 -1 -1 -1 1 -1 -1 -1 -1
+V 1 1 1 -2 -2 1 -1 1 -1 1 -1 -1 -1 1 -1 -1 -1
+B -2 1 1 1 1 -1 1 -1 -1 1 1 -1 -1 -1 1 -1 -1
+N -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
+X -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
diff --git a/data/vtml160.mat b/data/vtml160.mat
new file mode 100644
index 0000000..e525883
--- /dev/null
+++ b/data/vtml160.mat
@@ -0,0 +1,38 @@
+#
+# VTML160
+#
+# This matrix was produced with scripts written by
+# Tobias Mueller and Sven Rahmann [June-2001].
+#
+# VTML160 substitution matrix, Units = Third-Bits
+# Expected Score = -1.297840 Third-Bits
+# Lowest Score = -7, Highest Score = 16
+#
+# Entropy H = 0.562489 Bits
+#
+# 30-Jun-2001
+ A R N D C Q E G H I L K M F P S T W Y V B Z X *
+A 5 -2 -1 -1 1 -1 -1 0 -2 -1 -2 -1 -1 -3 0 1 1 -5 -3 0 -1 -1 0 -7
+R -2 7 0 -3 -3 2 -1 -3 1 -4 -3 4 -2 -5 -2 -1 -1 -4 -3 -4 -2 0 0 -7
+N -1 0 7 3 -3 0 0 0 1 -4 -4 0 -3 -5 -2 1 0 -5 -2 -4 5 0 0 -7
+D -1 -3 3 7 -5 1 3 -1 0 -6 -6 0 -5 -7 -1 0 -1 -7 -5 -4 6 3 0 -7
+C 1 -3 -3 -5 13 -4 -5 -2 -2 -1 -4 -4 -1 -4 -3 1 0 -7 -1 1 -4 -5 0 -7
+Q -1 2 0 1 -4 6 2 -3 2 -4 -2 2 -1 -4 -1 0 -1 -6 -4 -3 0 4 0 -7
+E -1 -1 0 3 -5 2 6 -2 -1 -5 -4 1 -3 -6 -1 0 -1 -7 -3 -3 2 5 0 -7
+G 0 -3 0 -1 -2 -3 -2 8 -3 -7 -6 -2 -5 -6 -3 0 -2 -5 -5 -5 -1 -2 0 -7
+H -2 1 1 0 -2 2 -1 -3 9 -4 -3 0 -3 0 -2 -1 -1 -1 3 -3 0 0 0 -7
+I -1 -4 -4 -6 -1 -4 -5 -7 -4 6 3 -4 2 0 -4 -3 -1 -2 -2 4 -5 -4 0 -7
+L -2 -3 -4 -6 -4 -2 -4 -6 -3 3 6 -3 4 2 -3 -3 -2 -1 -1 2 -5 -3 0 -7
+K -1 4 0 0 -4 2 1 -2 0 -4 -3 5 -2 -5 -1 -1 -1 -5 -3 -3 0 2 0 -7
+M -1 -2 -3 -5 -1 -1 -3 -5 -3 2 4 -2 8 1 -4 -3 -1 -4 -2 1 -4 -3 0 -7
+F -3 -5 -5 -7 -4 -4 -6 -6 0 0 2 -5 1 9 -5 -3 -3 3 6 -1 -6 -5 0 -7
+P 0 -2 -2 -1 -3 -1 -1 -3 -2 -4 -3 -1 -4 -5 9 0 -1 -5 -6 -3 -2 -1 0 -7
+S 1 -1 1 0 1 0 0 0 -1 -3 -3 -1 -3 -3 0 4 2 -4 -2 -2 1 0 0 -7
+T 1 -1 0 -1 0 -1 -1 -2 -1 -1 -2 -1 -1 -3 -1 2 5 -6 -3 0 0 -1 0 -7
+W -5 -4 -5 -7 -7 -6 -7 -5 -1 -2 -1 -5 -4 3 -5 -4 -6 16 4 -5 -6 -7 0 -7
+Y -3 -3 -2 -5 -1 -4 -3 -5 3 -2 -1 -3 -2 6 -6 -2 -3 4 10 -3 -3 -4 0 -7
+V 0 -4 -4 -4 1 -3 -3 -5 -3 4 2 -3 1 -1 -3 -2 0 -5 -3 5 -4 -3 0 -7
+B -1 -2 5 6 -4 0 2 -1 0 -5 -5 0 -4 -6 -2 1 0 -6 -3 -4 5 2 0 -7
+Z -1 0 0 3 -5 4 5 -2 0 -4 -3 2 -3 -5 -1 0 -1 -7 -4 -3 2 5 0 -7
+X 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -7
+* -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 -7 1
diff --git a/debian/changelog b/debian/changelog
deleted file mode 100644
index 9dfbcc0..0000000
--- a/debian/changelog
+++ /dev/null
@@ -1,5 +0,0 @@
-fasta3 (36.3.8f-1) UNRELEASED; urgency=low
-
- * Initial release (Closes: #coming)
-
- -- Steffen Moeller <moeller at debian.org> Tue, 05 Dec 2017 17:19:26 +0100
diff --git a/debian/compat b/debian/compat
deleted file mode 100644
index ec63514..0000000
--- a/debian/compat
+++ /dev/null
@@ -1 +0,0 @@
-9
diff --git a/debian/control b/debian/control
deleted file mode 100644
index ca445e7..0000000
--- a/debian/control
+++ /dev/null
@@ -1,75 +0,0 @@
-Source: fasta3
-Section: non-free/science
-Priority: optional
-Maintainer: Debian Med Packaging Team <debian-med-packaging at lists.alioth.debian.org>
-Uploaders: Steffen Moeller <moeller at debian.org>
-Build-Depends: debhelper (>= 9)
-Standards-Version: 3.9.6
-Vcs-Browser: http://anonscm.debian.org/viewvc/debian-med/trunk/packages/fasta3/trunk/
-Vcs-Svn: svn://anonscm.debian.org/debian-med/trunk/packages/fasta3/trunk/
-Homepage: http://fasta.bioch.virginia.edu
-
-Package: fasta3
-Architecture: any
-Depends: ${shlibs:Depends}, ${misc:Depends}
-Description: tools for searching collections of biological sequences
- The FASTA programs find regions of local or global similarity between
- Protein or DNA sequences, either by searching Protein or DNA databases,
- or by identifying local duplications within a sequence. Other
- programs provide information on the statistical significance of an
- alignment. Like BLAST, FASTA can be used to infer functional and
- evolutionary relationships between sequences as well as help identify
- members of gene families.
- .
- * Protein
- .
- - Protein-protein FASTA
- - Protein-protein Smith-Waterman (ssearch)
- - Global Protein-protein (Needleman-Wunsch) (ggsearch)
- - Global/Local protein-protein (glsearch)
- - Protein-protein with unordered peptides (fasts)
- - Protein-protein with mixed peptide sequences (fastf)
- .
- * Nucleotide
- .
- - Nucleotide-Nucleotide (DNA/RNA fasta)
- - Ordered Nucleotides vs Nucleotide (fastm)
- - Un-ordered Nucleotides vs Nucleotide (fasts)
- .
- * Translated
- .
- - Translated DNA (with frameshifts, e.g. ESTs)
- vs Proteins (fastx/fasty)
- - Protein vs Translated DNA (with frameshifts)
- (tfastx/tfasty)
- - Peptides vs Translated DNA (tfasts)
- .
- * Statistical Significance
- .
- - Protein vs Protein shuffle (prss)
- - DNA vs DNA shuffle (prss)
- - Translated DNA vs Protein shuffle (prfx)
- .
- * Local Duplications
- .
- - Local Protein alignments (lalign)
- - Plot Protein alignment "dot-plot" (plalign)
- - Local DNA alignments (lalign)
- - Plot DNA alignment "dot-plot" (plalign)
-
-Package: fasta3-doc
-Architecture: all
-Depends: ${shlibs:Depends}, ${misc:Depends}
-Description: user guide for FASTA tools
- The FASTA programs find regions of local or global similarity between
- Protein or DNA sequences, either by searching Protein or DNA databases,
- or by identifying local duplications within a sequence. Other
- programs provide information on the statistical significance of an
- alignment. Like BLAST, FASTA can be used to infer functional and
- evolutionary relationships between sequences as well as help identify
- members of gene families.
- .
- The use of the package's many binaries and the equally representated
- conceptual approaches towards sequence analyses are summarised in
- this PDF.
-
diff --git a/debian/copyright b/debian/copyright
deleted file mode 100644
index 08c2bf4..0000000
--- a/debian/copyright
+++ /dev/null
@@ -1,51 +0,0 @@
-Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
-Upstream-Name: FASTA
-Source: http://faculty.virginia.edu/wrpearson/fasta/fasta3/
-
-Files: *
-Copyright: 1996, 1997, 1998, 1999, 2014 by William R. Pearson and
- The Rector & Visitors of the University of Virginia
-License: Apache-2.0
-
-Files: src/wm_align.c
-Copyright: William R. Pearson, University of Virginia
- Webb Miller, Penn State University
-License: Apache-2.0
-
-Files: src/karlin.c
-Copyright: 1990, 1993 Stephen Altschul, NCBI
-License: Apache-2.0
-
-Files: src/smith_waterman_altivec.c
-Copyright: 2004, Erik Lindahl <lindahl at sbc.su.se>
- Stockholm Bioinformatics Center, 2004
-License: Apache-2.0
-
-Files: src/glo[bc]al_sse2.[ch]
- src/smith_waterman_sse2.c
-Copyright: 2006,2010 Michael Farrar <farrar.michael at gmail.com>
-License: Academics-only
- This program may not be sold or incorporated into a commercial product,
- in whole or in part, without written consent of Michael Farrar. For
- further information regarding permission for use or reproduction, please
- contact: Michael Farrar at farrar.michael at gmail.com.
-
-Files: debian/*
-Copyright: 2015 Steffen Moeller <moeller at debian.org>
-License: Apache-2.0
-
-License: Apache-2.0
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- .
- http://www.apache.org/licenses/LICENSE-2.0
- .
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- .
- On Debian systems, the complete text of the Apache version 2.0 license
- can be found in "/usr/share/common-licenses/Apache-2.0".
diff --git a/debian/docs b/debian/docs
deleted file mode 100644
index ab8aa33..0000000
--- a/debian/docs
+++ /dev/null
@@ -1,5 +0,0 @@
-README
-doc/readme*
-doc/README*
-doc/changes*
-doc/fasta_guide.pdf
diff --git a/debian/fasta3-doc.doc-base b/debian/fasta3-doc.doc-base
deleted file mode 100644
index 80cb0da..0000000
--- a/debian/fasta3-doc.doc-base
+++ /dev/null
@@ -1,15 +0,0 @@
-Document: fasta3
-Title: The FASTA program package
-Author: William R. Pearson
-Abstract: This documentation describes the version 36 of the FASTA program
- package (see W. R. Pearson and D. J. Lipman (1988), "Improved Tools for
- Biological Sequence Analysis", PNAS 85:2444-2448. W. R. Pearson (1996)
- "Effective protein sequence comparison" Meth. Enzymol. 266:227-258;
- and Pearson et. al. (1997) Genomics 46:24-36 [18]. Version 3 of the
- FASTA packages contains many programs for searching DNA and protein
- databases and for evaluating statistical significance from randomly
- shuffled sequences.
-Section: Science/Biology
-
-Format: PDF
-Files: /usr/share/doc/fasta3/fasta_guide.pdf
diff --git a/debian/fasta3.install b/debian/fasta3.install
deleted file mode 100644
index eb6cf92..0000000
--- a/debian/fasta3.install
+++ /dev/null
@@ -1 +0,0 @@
-bin/[a-z]* usr/bin
diff --git a/debian/fasta3.manpages b/debian/fasta3.manpages
deleted file mode 100644
index d71b3a8..0000000
--- a/debian/fasta3.manpages
+++ /dev/null
@@ -1,6 +0,0 @@
-doc/fasta36.1
-doc/fastf3.1
-doc/fasts3.1
-doc/map_db.1
-doc/prss3.1
-doc/ps_lav.1
diff --git a/debian/patches/Makefile.patch b/debian/patches/Makefile.patch
deleted file mode 100644
index ac20e6b..0000000
--- a/debian/patches/Makefile.patch
+++ /dev/null
@@ -1,47 +0,0 @@
-Index: fasta3-36.3.7a/make/Makefile
-===================================================================
---- fasta3-36.3.7a.orig/make/Makefile
-+++ fasta3-36.3.7a/make/Makefile
-@@ -34,6 +34,7 @@
- THR_LIBS = -lpthread
- THR_CC =
-
-+BIN = ../bin
- XDIR = /seqprg/bin
-
- DROPGSW_NA_O = dropgsw2.o wm_align.o calcons_sw.o
-Index: fasta3-36.3.7a/make/Makefile.linux64_sse2
-===================================================================
---- fasta3-36.3.7a.orig/make/Makefile.linux64_sse2
-+++ fasta3-36.3.7a/make/Makefile.linux64_sse2
-@@ -12,7 +12,7 @@
-
- SHELL=/bin/bash
-
--CC = gcc -g -O -msse2
-+CC = gcc -g -O -msse2 $(CPPFLAGS)
- #CC= gcc -pg -g -O -msse2 -ffast-math
- #CC = gcc -g -DDEBUG -msse2
-
-@@ -25,7 +25,7 @@
-
- # standard options
-
--CFLAGS= -DSHOW_HELP -DSHOWSIM -DUNIX -DTIMES -DHZ=100 -DMAX_WORKERS=8 -DTHR_EXIT=pthread_exit -DPROGRESS -DM10_CONS -DFASTA_HOST='"your_fasta_host_here"' -D_REENTRANT -DHAS_INTTYPES -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DUSE_FSEEKO -DSAMP_STATS -DPGM_DOC -DUSE_MMAP -D_LARGEFILE64_SOURCE -DBIG_LIB64
-+CFLAGS+= -DSHOW_HELP -DSHOWSIM -DUNIX -DTIMES -DHZ=100 -DMAX_WORKERS=8 -DTHR_EXIT=pthread_exit -DPROGRESS -DM10_CONS -DFASTA_HOST='"your_fasta_host_here"' -D_REENTRANT -DHAS_INTTYPES -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DUSE_FSEEKO -DSAMP_STATS -DPGM_DOC -DUSE_MMAP -D_LARGEFILE64_SOURCE -DBIG_LIB64
- # -I/usr/include/mysql -DMYSQL_DB
- # -DSUPERFAMNUM -DSFCHAR="'|'"
-
-Index: fasta3-36.3.7a/make/Makefile36m.common
-===================================================================
---- fasta3-36.3.7a.orig/make/Makefile36m.common
-+++ fasta3-36.3.7a/make/Makefile36m.common
-@@ -34,7 +34,7 @@
- # and "-L/usr/lib64/mysql -lmysqlclient -lz" in LIB_M
- # some systems may also require a LD_LIBRARY_PATH change
-
--LIB_M= -lm -lz
-+LIB_M= $(LDFLAGS) -lm -lz
- #LIB_M= -L/usr/lib64/mysql -lmysqlclient -lz -lm
- NCBL_LIB=ncbl2_mlib.o
- #NCBL_LIB=ncbl2_mlib.o mysql_lib.o
diff --git a/debian/patches/series b/debian/patches/series
deleted file mode 100644
index 5b1c0a4..0000000
--- a/debian/patches/series
+++ /dev/null
@@ -1 +0,0 @@
-Makefile.patch
diff --git a/debian/rules b/debian/rules
deleted file mode 100755
index a60608a..0000000
--- a/debian/rules
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/usr/bin/make -f
-#DH_VERBOSE = 1
-
-DPKG_EXPORT_BUILDFLAGS = 1
-include /usr/share/dpkg/default.mk
-
-ARCH=`dpkg-architecture -qDEB_TARGET_GNU_CPU`
-
-#MAKEFILE="../make/Makefile"
-#
-#ifeq (x86_64,$(ARCH))
-MAKEFILE="../make/Makefile.linux64"
-#endif
-
-export DEB_BUILD_MAINT_OPTIONS = hardening=+all
-CFLAGS+=-flto
-LDFLAGS+=-flto
-
-# package maintainers to append CFLAGS
-#export DEB_CFLAGS_MAINT_APPEND = -Wall -pedantic
-# package maintainers to append LDFLAGS
-#export DEB_LDFLAGS_MAINT_APPEND = -Wl,--as-needed
-
-%:
- dh $@
-
-override_dh_auto_build:
- cd src && $(MAKE) -f $(MAKEFILE)
-
-override_dh_auto_clean:
- if [ -d src ]; then cd src && $(MAKE) -f $(MAKEFILE) clean-up; fi
-
-override_dh_compress:
- dh_compress --exclude=.pdf
-
diff --git a/debian/source/format b/debian/source/format
deleted file mode 100644
index 163aaf8..0000000
--- a/debian/source/format
+++ /dev/null
@@ -1 +0,0 @@
-3.0 (quilt)
diff --git a/debian/upstream/metadata b/debian/upstream/metadata
deleted file mode 100644
index be6be7b..0000000
--- a/debian/upstream/metadata
+++ /dev/null
@@ -1,20 +0,0 @@
-Reference:
- - Author: William R. Pearson and D. J. Lipman
- Title: "Improved tools for biological sequence comparison"
- Journal: Proc Natl Acad Sci U S A
- Year: 1988
- Volume: 85
- Number: 8
- Pages: 2444-8
- PMID: 3162770
- URL: http://www.ncbi.nlm.nih.gov/pmc/articles/PMC280013/
- eprint: http://www.ncbi.nlm.nih.gov/pmc/articles/PMC280013/pdf/pnas00260-0036.pdf
- - Author: William R. Pearson
- Title: "Effective protein sequence comparison"
- Journal: Methods Enzymol.
- Year: 1996
- Volume: 266
- Pages: 227-58
- DOI: 10.1016/S0076-6879(96)66017-0
- PMID: 8743688
- URL: http://www.sciencedirect.com/science/article/pii/S0076687996660170
diff --git a/debian/watch b/debian/watch
deleted file mode 100644
index 6fdeb49..0000000
--- a/debian/watch
+++ /dev/null
@@ -1,6 +0,0 @@
-version=3
-http://faculty.virginia.edu/wrpearson/fasta/fasta3/fasta-(3.*)\.tar\.gz
-
-# This is also at Github but
-# https://github.com/wrpearson/fasta36/releases
-# is lagging begind a bit
\ No newline at end of file
diff --git a/doc/INSTALL b/doc/INSTALL
new file mode 100644
index 0000000..285bbcd
--- /dev/null
+++ b/doc/INSTALL
@@ -0,0 +1,17 @@
+
+22-Jan-2014
+
+fasta36/doc/INSTALL
+
+To compile the FASTA programs, change to the fasta36/src directory:
+
+ cd ~/fasta36/src
+
+and type:
+
+ make -f ../make/Makefile.linux64_sse2 all
+
+The file fasta36/make/README gives more information about the
+different Makefiles available. The code is routinely compiled and
+tested under Linux (64-bit, sse2) and MacOS X (64-bit, sse2).
+
diff --git a/doc/README.versions b/doc/README.versions
new file mode 100644
index 0000000..e7a7ae1
--- /dev/null
+++ b/doc/README.versions
@@ -0,0 +1,48 @@
+
+ $Id: README.versions 120 2010-01-31 19:42:09Z wrp $
+ $Revision: 210 $
+
+January, 2010
+
+This directory contains the newest version of FASTA, version 36.
+FASTA36 is a major update to FASTA35 that provides the ability to
+display multiple significant alignments to a query sequence. Previous
+versions of FASTA displayed only the best alignment between the query
+and library sequence; if the library sequence was long, with multiple
+similar regions, only the best was shown. This contrasts with BLAST,
+which has always displayed multiple "HSPs" when they are present.
+
+FASTA36 provides some additional improvements; like BLAST, it now uses
+statistical estimates to set thresholds for band optimization, which
+can increase search speed as much as 2-fold, and it provides much more
+flexibility in specifying the files that are searched (indirect files
+of filenames can include additional indirection). But the main
+improvement is the display of multiple HSPs.
+
+All of the traditional alignment programs: ssearch36, fasta36,
+[t]fast[xy]36 and glsearch36 display multiple HSPs. The peptide and
+mixed peptide alignment programs ([t]fasts36, fastf36, fastm36) still
+show a single HSP.
+
+Currently, the PVM/MPI parallel versions of the programs still display
+a single HSP.
+
+As of late 2007, there is almost no reason to use the fasta2 programs;
+the major programs present in fasta2 that were not present in fasta3
+(version 34) -- align (global alignments) and lalign (non-overlapping
+local alignments) are now available in fasta version 36.
+
+For more information about the programs in the current FASTA v36
+package, see the "changes_v36.html" and "readme.v36" files.
+
+There are still a very few programs in the fasta2 package that are not
+available in the fasta3 package - programs for global alignments
+without end-gap penalties, the "grease" Kyte-Doolittle plot, and
+"garnier" and "chofas" for classic (but inaccurate) secondary
+structure prediction. You should not use the fasta2 programs for
+library searching; the fasta3 programs are more sensitive and have
+better statistics.
+
+Precompiled versions of the programs for Windows and MacOS are available in the
+executables directory.
+
diff --git a/doc/README_v36.3.8d.md b/doc/README_v36.3.8d.md
new file mode 100644
index 0000000..474cea8
--- /dev/null
+++ b/doc/README_v36.3.8d.md
@@ -0,0 +1,37 @@
+
+
+## The FASTA package - protein and DNA sequence similarity searching and alignment programs
+
+Changes in **fasta-36.3.8d** released 13-April-2016:
+
+1. Various bug fixes to `pssm_asn_subs.c` that avoid coredumps when
+ reading NCBI PSSM ASN.1 binary files. `pssm_asn_subs.c` can now read
+ UUPACAA sequences.
+
+2. default gap penalties for VT40 (from -14/-2 to -13/-1), VT80 (from
+ -14/-2 to -11/-1), and VT120 (from -10/-1 to -11/-1) have changed
+ slightly.
+
+3. Introduction of `scripts/m9B_btop_msa.pl` and
+ `scripts/m8_btop_msa.pl`, which use the BTOP (`-m 9B` or `-m 8CB`)
+ encoded alignment strings to produce a query-driven multiple
+ sequence alignment (MSA) in ClustalW format. This MSA can be used
+ as input to `psiblast` to produce an ASN.1 PSSM.
+
+4. The `scripts/annot_blast_btop2.pl` script replaces
+ `scripts/annot_blast_btop.pl` and allows annotation of both the query
+ and subject sequences.
+
+5. Various domain annotation scripts have been renamed for clarity.
+ For example, `ann_feats_up_sql.pl` uses an SQL implementation of
+ Uniprot features tables to annotate domains. Likewise,
+ `ann_pfam_www.pl` gets domain information from the Pfam web site,
+ while `ann_pfam27.pl` gets the information from the downloaded
+ Pfam27 mySQL tables, and `ann_pfam28.pl` uses the Pfam28 mySQL
+ tables.
+
+6. percent identity in sub-alignment scores is calculated like a BLAST
+ percent identity -- gaps are not included in the denominator.
+
+For more detailed information, see `doc/readme.v36`.
+
diff --git a/doc/changes_v34.html b/doc/changes_v34.html
new file mode 100644
index 0000000..54a820a
--- /dev/null
+++ b/doc/changes_v34.html
@@ -0,0 +1,351 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+
+<html xmlns="http://www.w3.org/1999/xhtml">
+<title>ChangeLog - FASTA v34</title>
+<style type="text/css">
+body { margin-left: 6px; }
+.sidebar {
+font-size: 12px; font-family: sans-serif; text-decoration:none; background-color: #FFFFCC; }
+.fasta { font-family: sans-serif; }
+.fasta h3 { font-size: 14px; color: #000000 }
+.fasta td {background-color: #FFFFCC }
+.fasta a { text-decoration: none; }
+.fasta li { font-size: 12px; margin-left:-1em }
+</style>
+<head>
+<body>
+<div class=fasta>
+<h3>ChangeLog - FASTA v34</h3>
+<pre>
+ $Id: changes_v34.html 120 2010-01-31 19:42:09Z wrp $
+ $Revision: 210 $
+</pre>
+<h3>May 28, 2007</h3>
+
+Small modification for GCG ASCII (libtype=5) header line.
+
+<hr>
+<h3>October 6, 2006 CVS fa34t26b3</h3>
+
+New Windows programs available using Intel C++ compiler. First
+threaded programs for Windows; first SSE2 acceleration of SSEARCH for
+Windows.
+
+<h3>July 18, 2006 CVS fa34t26b2</h3>
+
+More powerful environment variable substitutions for FASTLIBS files.
+
+The library file name parsing programs now provide the option for
+environment variable substitutions. For example, SLIB2=/slib2 as an
+environment variable (e.g. export SLIB2=/slib2 for ksh and bash), then
+<pre>
+fasta34 -q query.aa '${SLIB2}/swissprot.fa' expands as expected.
+</pre>
+While this is not important for command lines, where the Unix shell
+would expand things anyway, it is very helpful for various
+configuration files, such as files of file names, where:
+<pre>
+<${SLIB2}/blast
+swissprot.fa
+</pre>
+now expands properly, and in FASTLIBS files the line:
+<pre>
+NCBI/Blast Swissprot$0S${SLIB2}/blast/swissprot.fa
+</pre>
+expands properly. Currently, environment variable expansion only
+takes place for library file names, and the &lt;directory in a file of
+file names.
+
+<h3>July 2, 2006 fa34t26b0</h3>
+
+This release provides an extremely efficient SSE2 implementation of
+the Smith-Waterman algorithm for the SSE2 vector instructions written
+by Michael Farrar (farrar.michael at gmail.com). The SSE code speeds up
+Smith-Waterman 8 - 10-fold in my tests, making it comparable to Eric
+Lindahl's Altivec code for the Apple/IBM G4/G5 architecture.
+
+<h3>May 24, 2006 fa34t25d8</h3>
+
+In addition, support for ASN.1 PSSM:2 files provided by the NCBI
+PSI-BLAST WWW site is included. This code will not work with
+iteration 0 PSSM's (which have no PSSM information). For ASN.1
+PSSM's, which provide the matrix name (and in some cases the gap
+penalties), the scoring matrix and gap penalties are set appropriately
+if they were not specified on the command line. ASN.1 PSSM's are type 2:
+<pre>
+ssearch34 -P "pssm.asn1 2" .....
+</pre>
+
+<h3>May 18, 2006</h3>
+
+Support for NCBI Blast formatdb databases has been expanded. The
+FASTA programs can now read some NCBI *.pal and *.nal files, which are
+used to specify subsets of databases. Specifically, the
+swissprot.00.pal and pdbaa.00.pal files are supported. FASTA supports
+files that refer to *.msk files (i.e. swissprot.00.pal refers to
+swissprot.00.msk); it does not currently support .pal files that
+simply list other .pal or database files (e.g. FASTA does not support
+nr.pal or swissprot.pal).
+
+<hr>
+<h3>Nov 20, 2005</h3>
+
+Changes to support asymmetric matrices - a scoring matrix read in from
+a file can be asymmetric. Default matrices are all symmetric.
+
+<h3>Sept 2, 2005</h3>
+
+The prss34 program has been modified to use the same display routines
+as the other search programs. To be more consistent with the other
+programs, the old "-w shuffle-window-size" is now "-v window-size".
+
+<tt><b>prss34/prfx34</b></tt> will also show the optimal alignment for which the
+significance is calculated by using the "-A" option.
+
+Since the new program reports results exactly like other
+<tt><b>fasta/ssearch/fastxy34</b></tt> programs, parsing for statistical significance
+is considerably different. The old format program can be made using
+"make prss34o".
+
+<h3>May 5, 2005 CVS fa34t25d1</h3>
+
+Modification to the -x option, so that both an "X:X" match score and
+an "X:not-X" mismatch score can be specified. (This score is also used
+to give a positive score to a "*:*" match - the end of a reading frame,
+while giving a negative score to "*:not-*".)
+
+<h3>Jan 24, 2005</h3>
+
+Include a new program, "print_pssm", which reads a blastpgp binary
+checkpoint file and writes out the frequency values as text. These
+values can be used with a new option with ssearch34(_t) and prss34,
+which provides the ability to read a text PSSM file. To specify a
+text PSSM, use the option -P "query.ckpt 1" where the "1" indicates a
+text, rather than a binary checkpoint file. "initfa.c" has also been
+modified to work with PSSM files with zeros in the frequency
+table. Presumably these positions (at the ends) do not provide
+information. (Jan 26, 2005) blastpgp actually uses BLOSUM62 values
+when zero frequencies are provided, so read_pssm() has been modified
+to use scoring matrix values for zero frequencies as well.
+
+<hr>
+<h3>Nov 4-8, 2004</h3>
+
+Incorporation of Erik Lindahl "anti-diagonal" Altivec code for
+Smith-Waterman, only. Altivec SSEARCH is now faster than FASTA for
+
+<h3>Aug 25,26, 2004 CVS fa34t24b3</h3>
+
+Small change in output format for <tt>p34comp*</tt> programs in
+">>>query_file#1 string" line before alignments. This line is not present
+in the non-parallel versions - it would be better for them to be consistent.
+
+<hr>
+<h3>Dec 10, 2003 CVS fa34t23b3</h3>
+
+Cause default ktup to drop for short query sequences. For protein queries < 50, <i>ktup=1</i>;
+for DNA queries < 20, 50, 100 <i>ktup</i> = 1, 2, 3, respectively.
+
+<h3>Dec 7, 2003</h3>
+
+A new option, "-U" is available for RNA sequence comparison. "-U"
+functions like "-n", indicating that the query is an RNA sequence. In
+addition, to account for "G:U" base pairs, "-U" modifies the scoring
+matrices so that a "G:A" match has the same score as a "G:G" match,
+and "T:C" match has the same score as a "T:T" match.
+
+<h3>Nov 2, 2003</h3>
+
+Support for more sophisticated display options. Previously, one could
+have only one "-m #" option, even though several of the options were
+orthogonal (-m 9c is independent of -m 1 and -m2, which is independent
+of -m 6 (HTML)). In particular -m 9c can be combined with -m 6, which
+can be very helpful for runs that need HTML output but can also
+exploit the encoding provided by -m 9c.
+
+The "-m 9" option now also allows "-m 9i", which shows the standard
+best score information, plus percent identity and alignment length.
+
+<h3>Sept 25, 2003</h3>
+
+A new option is available for annotating alignments. -V '@#?!'
+can be used to annotate sites in a sequence, e.g:
+<pre>
+>GTM1_HUMAN ...
+PMILGYWDIRGLAHAIRLLLEYTDS<b>@</b>S<b>?</b>YEEKKYT@MG
+DAPDYDRS@QWLNEKFKLGLDFPNLPYLIDGAHKIT
+</pre>
+might mark known and expected (S,T) phosphorylation sites. These
+symbols are then displayed on the query coordinate line:
+<pre>
+ 10 20 <b>@?</b> 30 @ 40 @ 50 60
+GTM1_H PMILGYWDIRGLAHAIRLLLEYTDSSYEEKKYTMGDAPDYDRSQWLNEKFKLGLDFPNLP
+ ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
+gtm1_h PMILGYWDIRGLAHAIRLLLEYTDSSYEEKKYTMGDAPDYDRSQWLNEKFKLGLDFPNLP
+ 10 20 30 40 50 60
+</pre>
+This annotation is mostly designed to display post-translational
+modifications detected by MassSpec with FASTS, but is also available
+with FASTA and SSEARCH.
+
+<h3>June 16, 2003 version: fasta34t22</h3>
+
+ssearch34 now supports PSI-BLAST PSSM/profiles. Currently, it only
+supports the "checkpoint" file produced by blastall, and only on
+certain architectures where byte-reordering is unnecessary. It has not
+been tested extensively with the -S option.
+<pre>
+ssearch34 -P blast.ckpt -f -11 -g -1 -s BL62 query.aa library
+</pre>
+Will use the frequency information in the blast.ckpt file to do a
+position specific scoring matrix (PSSM) search using the
+Smith-Waterman algorithm. Because ssearch34 calculates scores for
+each of the sequences in the database, we anticipate that PSSM
+ssearch34 statistics will be more reliable than PSI-Blast statistics.
+
+The Blast checkpoint file is mostly double precision frequency
+numbers, which are represented in a machine specific way. Thus, you
+must generate the checkpoint file on the same machine that you run
+ssearch34 or prss34 -P query.ckpt. To generate a checkpoint file,
+run:
+<pre>
+blastpgp -j 2 -h 1e-6 -i query.fa -d swissprot -C query.ckpt -o /dev/null
+</pre>
+(This searches swissprot for 2 iterations ("-j 2") using an E()
+threshold of 1e-6, saving the resulting position specific frequencies in
+query.ckpt. Note that the original query.fa and query.ckpt must
+match.)
+
+<h3>Apr 11, 2003 CVS fa34t21b3</h3>
+
+Fixes for "-E" and "-F" with ssearch34, which were inadvertently disabled.
+<p>
+A new option, "-t t", is available to specify that all the protein
+sequences have implicit termination codons "*" at the end. Thus, all
+protein sequences are one residue longer, and full length matches are
+extended one extra residue and get a higher score. For
+fastx34/tfastx34, this helps extend alignments to the very end in
+cases where there may be a mismatch at the C-terminal residues.</p>
+<p>
+<tt>-m 9c</tt> has also been modified to indicate locations of termination
+codons ( *1).</p>
+
+<h3>Mar 17, 2003 CVS fa34t21b2</h3>
+
+A new option on scoring matrices "-MS" (e.g. "BL50-MS") can be used to
+turn the I/L, K/Q identities on or off. Thus, to make "fastm34" use
+the isobaric identities, use "-s M20-MS". To turn them off for "fasts34",
+use "-s M20".
+
+<h3>Jan 25, 2003</h3>
+
+Add option "-J start:stop" to pv34comp*/mp34comp*. "-J x" used to
+allow one to start at query sequence "x"; now both start and stop can
+be specified.
+
+<hr>
+<h3>Nov 14-22, 2002 CVS fa34t20b6</h3>
+
+Include compile-time define (-DPGM_DOC) that causes all the fasta
+programs to provide the same command line echo that is provided by the
+PVM and MPI parallel programs. Thus, if you run the program:
+<pre>
+fasta34_t -q -S gtt1_drome.aa /slib/swissprot 12
+</pre>
+the first lines of output from FASTA will be:
+<pre>
+# fasta34_t -q gtt1_drome.aa /slib/swissprot
+ FASTA searches a protein or DNA sequence data bank
+ version 3.4t20 Nov 10, 2002
+ Please cite:
+ W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448
+</pre>
+This has been turned on by default in most FASTA Makefiles.
+
+<h3>Aug 27, 2002</h3>
+
+Modifications to mshowbest.c and drop*.c (and p2_workcomp.c,
+compacc.c, doinit.c, etc.) to provide more information about the
+alignment with the -m 9 option. There is now a "-m 9c" option, which
+displays an encoded alignment after the -m 9 alignment information.
+The encoding is a string of the form: "=#mat+#ins=#mat-#del=#mat".
+Thus, an alignment over 218 amino acids with no gaps (not necessarily
+100% identical) would be =218. The alignment:
+<pre>
+ 10 20 30 40 50 60 70
+GT8.7 NVRGLTHPIRMLLEYTDSSYDEKRYTMGDAPDFDRSQWLNEKFKL--GLDFPNLPYL-IDGSHKITQ
+ :.:: . :: :: . .::: : .: ::.: .: : ..:.. ::: :..:
+XURTG NARGRMECIRWLLAAAGVEFDEK---------FIQSPEDLEKLKKDGNLMFDQVPMVEIDG-MKLAQ
+ 20 30 40 50 60
+</pre>
+would be encoded: "=23+9=13-2=10-1=3+1=5". The alignment encoding is
+with respect to the beginning of the alignment, not the beginning of
+either sequence. The beginning of the alignment in either sequence is
+given by the an0/an1 values. This capability is particularly useful
+for [t]fast[xy], where it can be used to indicate frameshift positions
+"/#\#" compactly. If "-m 9c" is used, the "The best scores" title
+line includes "aln_code".
+
+<h3>Aug 14, 2002 CVS tag fa34t20</h3>
+
+Changes to nmgetlib.c to allow multiple query searches coming from
+STDIN, either through pipes or input redirection. Thus, the command
+<pre>
+cat prot_test.lseg | fasta34 -q -S @ /seqlib/swissprot
+</pre>
+produces 11 searches. If you use the multiple query functions, the
+query subset applies only to the first sequence.
+
+Unfortunately, it is not possible to search against a STDIN library,
+because the FASTA programs do not keep the entire library in memory
+and need to be able to re-read high-scoring library sequences. Since
+it is not possible to fseek() against STDIN, searching against a STDIN
+library is not possible.
+
+<h3>Aug 5, 2002</h3>
+
+<tt><b>fasts34(_t)</b></tt> and <tt><b>fastm34(_t)</b></tt> have been modified to allow searches with
+DNA sequences. This gives a new capability to search for DNA motifs,
+or to search for ordered or unordered DNA sequences spaced at
+arbitrary distances.
+
+<h3>June 25, 2002</h3>
+
+Modify the statistical estimation strategy to sample all the sequences
+in the database, not just the first 60,000. The histogram is still
+based only on the first 60,000 scores and lengths, though all scores
+and lengths are shown. The fit to the data may be better than the
+histogram indicates, but it should not be worse.
+
+<h3>June 19, 2002</h3>
+
+Added "-C #" option, where 6 <= # <= MAX_UID (20), to specify the
+length of the sequence name display on the alignment labels. Until
+now, only 6 characters were ever displayed. Now, up to MAX_UID
+characters are available.
+
+<h3>Mar 16, 2002</h3>
+
+Added create_seq_demo.sql, nt_to_sql.pl to show how to build an SQL
+protein sequence database that can be used with the mySQL
+versions of the fasta34 programs. Once the mySQL seq_demo database
+has been installed, it can be searched using the command:
+<pre>
+fasta34 -q mgstm1.aa "seq_demo.sql 16"
+</pre>
+mysql_lib.c has been modified to remove the restriction that mySQL
+protein sequence unique identifiers be integers. This allows the
+program to be used with the PIRPSD database. The RANLIB() function
+call has been changed to include "libstr", to support SQL text keys.
+Due to the size of libstr[], unique ID's must be < MAX_UID (20)
+characters.
+<p>
+A "pirpsd.sql" file is available for searching the mySQL distribution
+of the PIRPSD database. PIRPSD is available from
+ftp://nbrfa.georgetown.edu/pir_databases/psd/mysql.
+
+</div>
+</body>
+</html>
diff --git a/doc/changes_v35.html b/doc/changes_v35.html
new file mode 100644
index 0000000..5feaa93
--- /dev/null
+++ b/doc/changes_v35.html
@@ -0,0 +1,212 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+
+<html xmlns="http://www.w3.org/1999/xhtml">
+<title>ChangeLog - FASTA v35</title>
+<style type="text/css">
+body { margin-left: 6px; }
+.sidebar {
+font-size: 12px; font-family: sans-serif; text-decoration:none; background-color: #FFFFCC; }
+.fasta { font-family: sans-serif; }
+.fasta h2 { font-size: 16px; color: #000000 }
+.fasta h3 { font-size: 14px; color: #000000 }
+.fasta td {background-color: #FFFFCC }
+.fasta a { text-decoration: none; }
+.fasta li { margin-left:-1em }
+</style>
+<head>
+<body>
+<div class=fasta>
+<h2>ChangeLog - FASTA v35</h2>
+<pre><small>
+ $Id: changes_v35.html 120 2010-01-31 19:42:09Z wrp $
+ $Revision: 210 $
+</small>
+</pre>
+<hr>
+<h2>Summary - Major Changes in FASTA version 35 (August, 2007)</h2>
+<ol>
+<li>Accurate shuffle based statistics for searches of small libraries (or pairwise comparisons).
+<p/>
+<li>
+Inclusion of <b>lalign35</b> (SIM) into FASTA3. Accurate statistics for
+<b>lalign35</b> alignments. <b>plalign</b> has been replaced by
+<b>lalign35</b> and <b>lav2ps</b>.
+</li>
+<p/>
+<li>
+Two new global alignment programs: <b>ggsearch35</b> and <b>glsearch35</b>.
+</li>
+</ol>
+<hr>
+<h3>February 7, 2008</h3>Allow annotations in library, as well as
+query sequences. Currently, annotations are only available within
+sequences (i.e., they are not read from the feature table), but they
+should be available in FASTA format, or any of the other ascii text
+formats (EMBL/Swissprot, Genbank, PIR/GCG). If annotations are
+present in a library and the annotation characters include '*', then
+the -V '*' option MUST be used. However, special characters other
+than '*' are ignored, so annotations of '@', '%', or '@' should be
+transparent.
+<p>
+In translated sequence comparisons, annotations are only available for
+the protein sequence.
+<p>
+<h3>January 25, 2008</h3> Support protein queries and sequence
+libraries that contain 'O' (pyrrolysine) and 'U' (selenocysteine).
+('J' was supported already). Currently, 'O' is mapped automatically to
+'K' and 'U' to 'C'.
+<p />
+<h3>Dec. 13, 2007 CVS fa35_03_02m</h3>
+<p>
+Add ability to search a subset of a library using a file name and a
+list of accession/gi numbers. This version introduces a new filetype,
+10, which consists of a first line with a target filename, format, and
+accession number format-type, and optionally the accession number
+format in the database, followed by a list of accession numbers. For
+example:
+<pre>
+ &lt;/slib2/blast/swissprot.lseg 0:2 4|
+ 3121763
+ 51701705
+ 7404340
+ 74735515
+ ...
+</pre>
+Tells the program that the target database is swissprot.lseg, which is
+in FASTA (library type 0) format.
+<p>
+The accession format comes after the ":". Currently, there are four
+accession formats, two that require ordered accessions (:1, :2), and
+two that hash the accessions (:3, :4) so they do not need to be
+ordered. The number and character after the accession format
+(e.g. "4|") indicate the offset of the beginning of the accession and
+the character that terminates the accession. Thus, in the typical
+NCBI Fasta definition line:
+<pre>
+ >gi|1170095|sp|P46419|GSTM1_DERPT Glutathione S-transferase (GST class-mu)
+</pre>
+The offset is 4 and the termination character is '|'. For databases
+distributed in FASTA format from the European Bioinformatics
+Institute, the offset depends on the name of the database, e.g.
+<pre>
+ >SW:104K_THEAN Q4U9M9 104 kDa microneme/rhoptry antigen precursor (p104).
+</pre>
+and the delimiter is ' ' (space, the default).
+<p>
+Accession formats 1 and 3 expect strings; accession formats 2 and 4
+work with integers (e.g. gi numbers).
+<p />
+<h3>December 10, 2007</h3>
+Provide encoded annotation information with
+-m 9c alignment summaries. The encoded alignment information makes it
+much simpler to highlight changes in critical residues.
+<p />
+<h3>August 22, 2007</h3> <a name="lav2svg" /> A new program is
+available, <tt>lav2svg</tt>, which creates SVG (Scalable Vector
+Graphics) output. In addition, <a href="#ps_lav"><tt>ps_lav</tt></a>,
+which was introduced May 30, 2007, has been replaced
+by <tt>lav2ps</tt>. SVG files are more easily edited with Adobe
+Illustrator than postscript (<tt>lav2ps</tt>) files.
+<p>
+<h3>July 25, 2007 CVS fa35_02_02</h3>
+Change default gap penalties for OPTIMA5 matrix to -20/-2 from -24/-4.
+<p>
+<h3>July 23, 2007</h3>
+Add code to support sub-sequence ranges for "library"
+sequences - necessary for fully functional prss (ssearch35) and
+lalign35. For all programs, it is now possible to specify a subset of
+both the query and the library, e.g.
+<pre>
+lalign35 -q mchu.aa:1-74 mchu.aa:75-148
+</pre>
+Note, however, that the subset range applied to the library will be
+applied to every sequence in the library - not just the first - and
+that the same subset range is applied to each sequence. This probably
+makes sense only if the library contains a single sequence (this is
+also true for the query sequence file).
+<p>
+<h3>July 3, 2007 CVS fa35_02_01</h3>
+
+Merge of previous <tt><b>fasta34</b></tt> with development version <tt><b>fasta35</b></tt>.
+
+<h3>June 26, 2007</h3>
+
+Add amino-acid 'J' for 'I' or 'L'.
+<p>
+Add Mueller and Vingron (2000) J. Comp. Biol. 7:761-776 VT160 matrix,
+"-s VT160", and OPTIMA_5 (Kann et al. (2000) Proteins 41:498-503).
+<h3>June 7, 2007</h3>
+
+<tt><b>ggsearch35(_t)</b></tt>, <tt><b>glsearch35(_t)</b></tt> can now use PSSMs.
+
+<h3>May 30, 2007 CVS fa35_01_04</h3>
+
+<a name="ps_lav" /> Addition of <tt><b>ps_lav</b></tt>
+(now <a href="#lav2svg">lav2ps</a> or <a href="#lav2svg">lav2svg</a>) -- which can be used to plot the lav
+output of
+<tt><b>lalign35 -m 11</b></tt>.
+<pre>lalign35 -m 11 | lav2ps</pre> replaces <tt><b>plalign</b></tt>
+(from <tt><b>FASTA2</b></tt>).
+
+<h3>May 2, 2007</h3>
+
+The labels on the alignment scores are much more informative (and more
+diverse). In the past, alignment scores looked like:
+<pre>
+>>gi|121716|sp|P10649|GSTM1_MOUSE Glutathione S-transfer (218 aa)
+ s-w opt: 1497 Z-score: 1857.5 bits: 350.8 E(): 8.3e-97
+Smith-Waterman score: 1497; 100.0% identity (100.0% similar) in 218 aa overlap (1-218:1-218)
+^^^^^^^^^^^^^^
+</pre>
+where the highlighted text was either: "Smith-Waterman" or "banded
+Smith-Waterman". In fact, scores were calculated in other ways,
+including global/local for <tt><b>fasts</b></tt> and <tt><b>fastf</b></tt>. With the addition of
+<tt><b>ggsearch35,</b></tt> <tt><b>glsearch35,</b></tt> and <tt><b>lalign35,</b></tt> there are many more ways to
+calculate alignments: "Smith-Waterman" (ssearch and protein fasta),
+"banded Smith-Waterman" (DNA fasta), "Waterman-Eggert",
+"trans. Smith-Waterman", "global/local", "trans. global/local",
+"global/global (N-W)". The last option is a global global alignment,
+but with the affine gap penalties used in the Smith-Waterman
+algorithm.
+
+<h3>April 19, 2007 CVS fa34t27br_lal_3</h3>
+
+Two new programs, <tt><b>ggsearch35(_t)</b></tt> and <tt><b>glsearch35(_t)</b></tt> are now available.
+<tt><b>ggsearch35(_t)</b></tt> calculates an alignment score that is global in the
+query and global in the library; <tt><b>glsearch35(_t)</b></tt> calculates an alignment
+that is global in the query and local in the library
+sequence. The latter program is designed for global alignments to domains.
+
+Both programs assume that scores are normally distributed. This
+appears to be an excellent approximation for ggsearch35 scores, but
+the distribution is somewhat skewed for global/local (glsearch)
+scores. <tt><b>ggsearch35(_t)</b></tt> only compares the query to library sequences
+that are between 80% and 125% of the length of the query; glsearch
+limits comparisons to library sequences that are longer than 80% of
+the query. Initial results suggest that there is relatively little
+length dependence of scores over this range (scores go down
+dramatically outside these ranges).
+
+<h3>March 29, 2007 CVS fa34t27br_lal_1</h3>
+
+At last, the <tt><b>lalign</b></tt> (SIM) algorithm has been moved from <b>FASTA21</b> to
+<b>FASTA35</b>. A <tt><b><a href="#ps_lav">plalign</a></b></tt>
+equivalent is also available using <tt>lalign -m 11 | lav2ps</tt>
+or <tt>| lav2svg</tt>.
+
+The statistical estimates for <tt>lalign35</tt> should be much more accurate
+than those from the earlier lalign, because lambda and K are estimated
+from shuffles.
+
+In addition, all programs can now generate accurate statistical
+estimates with shuffles if the library has fewer than 500 sequences.
+If the library contains more than 500 sequences and the sequences are
+related, then the -z 11 option should be used.
+<hr>
+<a href="changes_v34.html">FASTA v34 Change Log</a>
+<p> </p>
+</div>
+</body>
+</html>
diff --git a/doc/changes_v36.html b/doc/changes_v36.html
new file mode 100644
index 0000000..e45529a
--- /dev/null
+++ b/doc/changes_v36.html
@@ -0,0 +1,467 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head><title>ChangeLog - FASTA v36</title>
+<style type="text/css">
+body { margin-left: 6px; }
+.sidebar {
+font-size: 12px; font-family: sans-serif; text-decoration:none; background-color: #FFFFCC; }
+.fasta { font-family: sans-serif; }
+.fasta h2 { font-size: 16px; color: #000000 }
+.fasta h3 { font-size: 14px; color: #000000 }
+.fasta td {background-color: #FFFFCC }
+.fasta a { text-decoration: none; }
+.fasta li { margin-left:-1em }
+</style>
+</head>
+<body>
+<div class=fasta>
+<h2>ChangeLog - FASTA v36</h2>
+<pre><small>
+ $Id: changes_v36.html $
+</small>
+</pre>
+<hr>
+<h2>Latest Updates - FASTA version 36.3.8d (April, 2016)</h2>
+<ol>
+<li>
+The <tt>fasta-36.3.8d/scripts/</tt> directory now provides a
+script, <tt>annot_blast_btop2.pl</tt> that allows annotations and
+sub-alignment scoring on BLAST alignments that use the tabular format
+with BTOP alignment encoding.
+<p>
+ <li>
+ Bug fixes for overlapping domain scoring. v36.3.7 was not thread-safe.
+ <li>
+ Annotation scripts accessing the Pfam domain database can now use
+ the <tt>--vdoms</tt> option to highlight missing parts of a Pfam
+ domain model. In addition, domains from clans are labeled as clans
+ unless <tt>--no-clans</tt> is specified.
+ </ol>
+<h2>Updates - FASTA version 36.3.7 (November, 2014)</h2>
+<ol>
+ <li>The FASTA programs have been released under the Apache2.0 Open
+ Source License. The COPYRIGHT file, and copyright notices in
+ program files, have been updated to reflect this change.
+ <p>
+<li>Alignment sub-scoring scripts have been extended to allow
+overlapping domains. This requires a modified annotation file format.
+ The "classic" format placed the beginning and end of a domain on different lines:
+<pre>
+ 1 [ - GST_N
+ 88 ] -
+ 90 [ - GST_C
+ 208 ] -
+</pre>
+Since the closing "]" was associated with the previous "[", domains could not overlap.
+<p>
+The new format is:
+<pre>
+ 1 - 88 GST_N
+ 90 - 208 GST_C
+</pre>
+which allows annotations of the form:
+<pre>
+ 1 - 88 GST_N
+ 75 - 123 GST-middle
+ 90 - 208 GST_C
+</pre>
+<p>
+ <li> New annotation scripts are available in
+ the <tt>fasta-36.3.7/scripts</tt> directory,
+ e.g. <tt>ann_pfam_www_e.pl</tt> (Pfam) and <tt>ann_up_www2_e.pl</tt>
+ (Uniprot) to support this new format. If the domain annotations
+ provided by Pfam or Uniprot overlap, then overlapping domains are
+ provided. The <tt>_e.pl</tt> new scripts can be directed to provide
+ non-overlapping domains, using the boundary averaging strategy in
+ the older scripts, by specifying the <tt>--no-over</tt> option.
+</ol>
+
+<h2>Updates - FASTA version 36.3.6f (August, 2014)</h2>
+<p>
+FASTA version 36.3.6f extends previous versions in several ways:
+<ol>
+<li>
+There is a new command line option, <tt>-XI</tt>, that causes the
+alignment programs to report 100% identity only when there are no
+mismatches. In previous versions, one mismatch in 10,000 would round
+up to 100.0% identity; with <tt>-XI</tt>, the identity will be
+reported as 99.9%.
+<li>
+The option to provide alignment encodings (-m 9c, or -m 9C for CIGAR
+strings) has been extended to provide mis-match information in the
+alignment encoding using the -m 9d (classic FASTA alignment encoding)
+or -m 9D (CIGAR string). For protein alignments, which are often &lt;
+40% identity, enabling mismatch encoding produces very long CIGAR
+strings.
+<li>
+Provide more scripts for annotating proteins using either UniProt or
+Pfam web resources.
+</ol>
+<p>
+Additional bug fixes are documented in <tt>fasta-36.3.6f/doc/readme.v36</tt>
+<p>
+<h2>Updates - FASTA version 36.3.6 (July, 2013)</h2>
+<p>
+FASTA version 36.3.6 provides two new features:
+<ol>
+<li>
+A new script-based strategy for including annotation information.
+<li>
+Domain annotation information can be used to partition the
+alignment, and partition the scores of the alignment (sub-alignment
+scores). Sub-alignment scores can be used to identify regions of
+alignment over-extension, where a homologous domain aligns, but the
+alignment extends beyond the homologous region into an adjacent
+non-homologous domain.
+</ol>
+Several scripts are provided (e.g. scripts/ann_feats_up_www.pl) that
+can be used to add Uniprot feature and domain annotations to searches
+of SwissProt and Uniprot.
+
+<p><i>(fasta-36.3.5 January 2013)</i>
+The NCBI's transition from BLAST to BLAST+ several years ago broke the
+ability of <tt>ssearch36</tt> to use PSSMs, because <tt>psiblast</tt>
+did not produce the binary ASN.1 PSSMs that <tt>ssearch36</tt> could
+parse. With the January 2013 <tt>fasta-36.3.5f</tt>
+release, <tt>ssearch36</tt> can read binary ASN.1 PSSM files produced
+by the NCBI <tt>datatool</tt> utility.
+See <a href='fasta_guide.pdf'>fasta_guide.pdf</a> for more information
+(look for the <tt>-P</tt> option).
+<hr>
+<h2>Summary - Major Changes in FASTA version 36.3.5 (May, 2011)</h2>
+<ol>
+<li>
+ By default, the FASTA36 programs are no longer interactive. Typing
+<tt>fasta36</tt> presents a short help message, and
+<tt>fasta36 -help</tt> presents a complete list of options. To see the interactive prompts, use
+<tt>fasta36 -I</tt>.
+<p>Likewise, the score histogram is no longer shown by default; use
+ the <tt>-H</tt> option to show the histogram (or compile with
+ -DSHOW_HIST for previous behavior).
+<p>
+The <tt>_t</tt> (<tt>fasta36_t</tt>) versions of the programs are
+built automatically on Linux/MacOSX machines and
+named <tt>fasta36</tt>, etc. (the programs are threaded by default,
+and only one program version is built).
+<p>
+Documentation has been significantly revised and updated.
+See <tt>doc/fasta_guide.pdf</tt> for a description of the programs and options.
+<p>
+<li>
+ Display of all significant alignments between query and library
+ sequence. BLAST has always displayed multiple high-scoring
+ alignments (HSPs) between the query and library sequence; previous
+ versions of the FASTA programs displayed only the best alignment,
+ even when other high-scoring alignments were present. This is the
+ major change in FASTA36. For most programs
+ (<tt>fasta36</tt>, <tt>ssearch36</tt>,
+ <tt>[t]fast[xy]36</tt>), if the library sequence contains additional
+ significant alignments, they will be displayed with the alignment
+ output, and as part of <tt>-m 9</tt> output (the initial list of high
+ scores).
+<p>
+ By default, the statistical threshold for alternate alignments
+ (HSPs) is the E()-threshold / 10.0. For proteins, the default
+ expect threshold is E() &lt; 10.0; the secondary threshold for showing
+ alternate alignments is thus E() &lt; 1.0. For translated
+ comparisons, the E()-thresholds are 5.0/0.5; for DNA:DNA 2.0/0.2.
+<p>
+ Both the primary and secondary E()-thresholds are set with the
+ -E "prim sec" command line option. If the secondary
+ value is between zero and 1.0, it is taken as the actual
+ threshold. If it is > 1.0, it is taken as a divisor for the primary
+ threshold. If it is negative, alternative alignments are disabled
+ and only the best alignment is shown.
+<p>
+<li>
+ New statistical options, <tt>-z 21, 22, 26</tt>, provide a second E()-value
+ estimate based on shuffles of the highest scoring sequences.
+<p>
+<li>
+New output options. <tt>-m 8</tt> provides the same output format as
+tabular BLAST; <tt>-m 8C</tt> mimics tabular blast with comment
+lines. <tt>-m 9C</tt> provides CIGAR encoded alignments.
+<p>
+(fasta-36.3.4) Alignment option <tt>-m B</tt> provides BLAST-like alignments (no context, coordinates at the beginning and end of the alignment line, <tt>Query/Sbjct</tt>).
+<p>
+<li>
+ Improved performance using statistics based thresholds for
+ gap-joining and band-optimization in the heuristic FASTA local
+ alignment programs (<tt>fasta36</tt>, <tt>[t]fast[xy]36</tt>). By
+ default (fasta36.3) <tt>fasta36</tt>, <tt>[t]fast[xy]36</tt> can use
+ a similar strategy to BLAST to set the thresholds for combining
+ ungapped regions and performing band alignments. This dramatically
+ reduces the number of band alignments performed, for a speed increase
+ of 2 - 3X. The original statistical thresholds can be enabled with
+ the <tt>-c O</tt> (upper-case letter 'O') command line option.
+ Protein and translated protein alignment programs can also use ktup=3
+ for increased speed, though ktup=2 is still the default.
+<p>
+ Statistical thresholds can dramatically reduce the number of
+ "optimized" scores, from which statistical estimates are calculated.
+ To address this problem, the statistical estimation procedure has
+ been adjusted to correct for the fraction of scores that were
+ optimized. This process can dramatically improve statistical accuracy
+ for some matrices and gap penalties, e.g. BLOSUM62 -11/-1.
+<p>
+ With the new joining thresholds, the
+<tt>-c "E-opt E-join"</tt> options have expanded meanings. <tt>-c "E-opt E-join"</tt>
+ calculates a threshold designed (but not guaranteed) to do band
+ optimization and joining for that fraction of sequences. Thus, <tt>-c
+ "0.02 0.1"</tt> seeks to do band optimization (E-opt) on 2% of alignments,
+ and joining on 10% of alignments. <tt>-c "40 10"</tt> sets the gap
+ threshold as in earlier versions.
+<p>
+<li>
+A new option (<tt>-e expand_script.sh</tt>) is available that allows
+the set of sequences that are aligned to be larger than the set of
+sequences searched. When the <tt>-e expand_script.sh</tt> option is
+used, the <tt>expand_script.sh</tt> script is run with an input
+argument that is a file of accession numbers and E()-values; this
+information can be used to produce a fasta-formatted list of
+additional sequences, which will then be compared and aligned (if they
+are significant), and included in the list of high scoring sequences
+and the alignments. The expanded set of sequences does not change the
+database size or statistical parameters; it simply expands the set of
+high-scoring sequences.
+<p>
+<li>
+The <tt>-m F</tt> option can be used to produce multiple output formats in different files from the same search. For example, <tt>-m "F9c,10 m9c10.output" -m "FBB blastBB.output"</tt> produces two output files in addition to the normally formatted output sent to <tt>stdout</tt>. The <tt>m9c10.output</tt> file contains <tt>-m 9c</tt> score descriptions and <tt>-m 10</tt> alignments, while <tt>blastBB.output</tt> contains BLAST-like output (<tt>-m BB</tt>).
+<p>
+<li>
+ Scoring matrices can vary with query sequence length. In large-scale
+ searches with metagenomics reads, some reads may be too short to
+ produce statistically significant scores against comprehensive
+ databases (e.g. a DNA read of 90 nt is translated into 30 aa, which
+ would require a scoring matrix with at least 1.3 bits/position to
+ produce a 40 bit score). fasta-36.3.* includes the option to specify
+ a "variable" scoring matrix by including '?' as the first letter of
+ the scoring matrix abbreviation, e.g. fasta36_t -q -s '?BP62' would
+ use BP62 for sequences long enough to produce significant alignment
+ scores, but would use scoring matrices with more information content
+ for shorter sequences. The FASTA programs include BLOSUM50 (0.49
+ bits/pos) and BLOSUM62 (0.58 bits/pos) but can range to MD10 (3.44
+ bits/position). The variable scoring matrix option searches down the
+ list of scoring matrices to find one with information content high
+ enough to produce a 40 bit alignment score. (Several bugs in the
+ process are fixed in fasta-36.3.2.)
+<p>
+
+<li>
+Several less-used options
+(<tt>-1</tt>, <tt>-B</tt>, <tt>-o</tt>, <tt>-x</tt>, <tt>-y</tt>) have
+become <i>extended</i> options, available via the <tt>-X</tt> (upper case X) option.
+The old <tt>-X off1,off2</tt> option is now <tt>-o off1,off2</tt>.
+<p>
+By default, the program will read up to 2 GB (32-bit systems) or 12 GB
+(64-bit systems) of the database into memory for multi-query searches.
+The amount of memory available for databases can be set with
+the <tt>-XM4G</tt> option.
+<p>
+<li>
+ Much greater flexibility in specifying combinations of library files
+ and subsets of libraries. It has always been possible to search a
+ list of libraries specified by an indirect (@) file; the FASTA36
+ programs can include indirect files of library names inside of
+ indirect files of library names.
+<p>
+<li>
+ <tt>fasta-36.3.2</tt> <b>ggsearch36</b> (global/global)
+ and <b>glsearch36</b> now incorporate SSE2 accelerated global
+ alignment, developed by Michael Farrar. These programs are now about
+ 20-fold faster.
+<p>
+<li>
+<tt>fasta-36.2.1</tt> (and later versions) are fully threaded, both for
+searches, and for alignments. The programs routinely run 12 - 15X
+faster on dual quad-core machines with "hyperthreading".
+</ol>
+<hr>
+<h2>Summary - Major Changes in FASTA version 35 (August, 2007)</h2>
+<ol>
+<li>Accurate shuffle based statistics for searches of small libraries (or pairwise comparisons).
+<p/>
+<li>
+Inclusion of <b>lalign35</b> (SIM) into FASTA3. Accurate statistics for
+<b>lalign35</b> alignments. <b>plalign</b> has been replaced by
+<b>lalign35</b> and <b>lav2ps</b>.
+</li>
+<p/>
+<li>
+Two new global alignment programs: <b>ggsearch35</b> and <b>glsearch35</b>.
+</li>
+</ol>
+<hr>
+<h3>February 7, 2008</h3>Allow annotations in library, as well as
+query sequences. Currently, annotations are only available within
+sequences (i.e., they are not read from the feature table), but they
+should be available in FASTA format, or any of the other ascii text
+formats (EMBL/Swissprot, Genbank, PIR/GCG). If annotations are
+present in a library and the annotation characters includes '*', then
+the -V '*' option MUST be used. However, special characters other
+than '*' are ignored, so annotations of '@', '%', or '@' should be
+transparent.
+<p>
+In translated sequence comparisons, annotations are only available for
+the protein sequence.
+<p>
+<h3>January 25, 2007</h3> Support protein queries and sequence
+libraries that contain 'O' (pyrrolysine) and 'U' (selenocysteine).
+('J' was supported already). Currently, 'O' is mapped automatically to
+'K' and 'U' to 'C'.
+<p />
+<h3>Dec. 13, 2007 CVS fa35_03_02m</h3>
+<p>
+Add ability to search a subset of a library using a file name and a
+list of accession/gi numbers. This version introduces a new filetype,
+10, which consists of a first line with a target filename, format, and
+accession number format-type, and optionally the accession number
+format in the database, followed by a list of accession numbers. For
+example:
+<pre>
+ &lt;/slib2/blast/swissprot.lseg 0:2 4|
+ 3121763
+ 51701705
+ 7404340
+ 74735515
+ ...
+</pre>
+Tells the program that the target database is swissprot.lseg, which is
+in FASTA (library type 0) format.
+<p>
+The accession format comes after the ":". Currently, there are four
+accession formats, two that require ordered accessions (:1, :2), and
+two that hash the accessions (:3, :4) so they do not need to be
+ordered. The number and character after the accession format
+(e.g. "4|") indicate the offset of the beginning of the accession and
+the character that terminates the accession. Thus, in the typical
+NCBI Fasta definition line:
+<pre>
+ >gi|1170095|sp|P46419|GSTM1_DERPT Glutathione S-transferase (GST class-mu)
+</pre>
+The offset is 4 and the termination character is '|'. For databases
+distributed in FASTA format from the European Bioinformatics
+Institute, the offset depends on the name of the database, e.g.
+<pre>
+ >SW:104K_THEAN Q4U9M9 104 kDa microneme/rhoptry antigen precursor (p104).
+</pre>
+and the delimiter is ' ' (space, the default).
+<p>
+Accession formats 1 and 3 expect strings; accession formats 2 and 4
+work with integers (e.g. gi numbers).
+<p />
+<h3>December 10, 2007</h3>
+Provide encoded annotation information with
+-m 9c alignment summaries. The encoded alignment information makes it
+much simpler to highlight changes in critical residues.
+<p />
+<h3>August 22, 2007</h3> <a name="lav2svg" /> A new program is
+available, <tt>lav2svg</tt>, which creates SVG (Scalable Vector
+Graphics) output. In addition, <a href="#ps_lav"><tt>ps_lav</tt></a>,
+which was introduced May 30, 2007, has been replaced
+by <tt>lav2ps</tt>. SVG files are more easily edited with Adobe
+Illustrator than postscript (<tt>lav2ps</tt>) files.
+<p>
+<h3>July 25, 2007 CVS fa35_02_02</h3>
+Change default gap penalties for OPTIMA5 matrix to -20/-2 from -24/-4.
+<p>
+<h3>July 23, 2007</h3>
+Add code to support sub-sequence ranges for "library"
+sequences - necessary for fully functional prss (ssearch35) and
+lalign35. For all programs, it is now possible to specify a subset of
+both the query and the library, e.g.
+<pre>
+lalign35 -q mchu.aa:1-74 mchu.aa:75-148
+</pre>
+Note, however, that the subset range applied to the library will be
+applied to every sequence in the library - not just the first - and
+that the same subset range is applied to each sequence. This probably
+makes sense only if the library contains a single sequence (this is
+also true for the query sequence file).
+<p>
+<h3>July 3, 2007 CVS fa35_02_01</h3>
+
+Merge of previous <tt><b>fasta34</b></tt> with development version <tt><b>fasta35</b></tt>.
+
+<h3>June 26, 2007</h3>
+
+Add amino-acid 'J' for 'I' or 'L'.
+<p>
+Add Mueller and Vingron (2000) J. Comp. Biol. 7:761-776 VT160 matrix,
+"-s VT160", and OPTIMA_5 (Kann et al. (2000) Proteins 41:498-503).
+<h3>June 7, 2007</h3>
+
+<tt><b>ggsearch35(_t)</b></tt>, <tt><b>glsearch35(_t)</b></tt> can now use PSSMs.
+
+<h3>May 30, 2007 CVS fa35_01_04</h3>
+
+<a name="ps_lav" /> Addition of <tt><b>ps_lav</b></tt>
+(now <a href="#lav2svg">lav2ps</a> or <a href="#lav2svg">lav2svg</a>) -- which can be used to plot the lav
+output of
+<tt><b>lalign35 -m 11</b></tt>.
+<pre>lalign35 -m 11 | lav2ps</pre> replaces <tt><b>plalign</b></tt>
+(from <tt><b>FASTA2</b></tt>).
+
+<h3>May 2, 2007</h3>
+
+The labels on the alignment scores are much more informative (and more
+diverse). In the past, alignment scores looked like:
+<pre>
+>>gi|121716|sp|P10649|GSTM1_MOUSE Glutathione S-transfer (218 aa)
+ s-w opt: 1497 Z-score: 1857.5 bits: 350.8 E(): 8.3e-97
+Smith-Waterman score: 1497; 100.0% identity (100.0% similar) in 218 aa overlap (1-218:1-218)
+^^^^^^^^^^^^^^
+</pre>
+where the highlighted text was either: "Smith-Waterman" or "banded
+Smith-Waterman". In fact, scores were calculated in other ways,
+including global/local for <tt><b>fasts</b></tt> and <tt><b>fastf</b></tt>. With the addition of
+<tt><b>ggsearch35,</b></tt> <tt><b>glsearch35,</b></tt> and <tt><b>lalign35,</b></tt> there are many more ways to
+calculate alignments: "Smith-Waterman" (ssearch and protein fasta),
+"banded Smith-Waterman" (DNA fasta), "Waterman-Eggert",
+"trans. Smith-Waterman", "global/local", "trans. global/local",
+"global/global (N-W)". The last option is a global/global alignment,
+but with the affine gap penalties used in the Smith-Waterman
+algorithm.
+
+<h3>April 19, 2007 CVS fa34t27br_lal_3</h3>
+
+Two new programs, <tt><b>ggsearch35(_t)</b></tt> and <tt><b>glsearch35(_t)</b></tt> are now available.
+<tt><b>ggsearch35(_t)</b></tt> calculates an alignment score that is global in the
+query and global in the library; <tt><b>glsearch35(_t)</b></tt> calculates an alignment
+that is global in the query, while local in the library
+sequence. The latter program is designed for global alignments to domains.
+
+Both programs assume that scores are normally distributed. This
+appears to be an excellent approximation for ggsearch35 scores, but
+the distribution is somewhat skewed for global/local (glsearch)
+scores. <tt><b>ggsearch35(_t)</b></tt> only compares the query to library sequences
+that are between 80% and 125% of the length of the query; glsearch
+limits comparisons to library sequences that are longer than 80% of
+the query. Initial results suggest that there is relatively little
+length dependence of scores over this range (scores go down
+dramatically outside these ranges).
+
+<h3>March 29, 2007 CVS fa34t27br_lal_1</h3>
+
+At last, the <tt><b>lalign</b></tt> (SIM) algorithm has been moved from <b>FASTA21</b> to
+<b>FASTA35</b>. A <tt><b><a href="#ps_lav">plalign</a></b></tt>
+equivalent is also available using <tt>lalign -m 11 | lav2ps</tt>
+or <tt>| lav2svg</tt>.
+
+The statistical estimates for <tt>lalign35</tt> should be much more accurate
+than those from the earlier lalign, because lambda and K are estimated
+from shuffles.
+
+In addition, all programs can now generate accurate statistical
+estimates with shuffles if the library has fewer than 500 sequences.
+If the library contains more than 500 sequences and the sequences are
+related, then the -z 11 option should be used.
+<hr>
+<a href="changes_v34.html">FASTA v34 Change Log</a>
+<p> </p>
+</div>
+</body>
+</html>
diff --git a/doc/fasta.defaults b/doc/fasta.defaults
new file mode 100644
index 0000000..7404c57
--- /dev/null
+++ b/doc/fasta.defaults
@@ -0,0 +1,17 @@
+#pgm mol matrix g_open g_ext fr_shft e_cut ktup
+# -n/-p -s -f -g -h/-j -E argv[3]
+fasta prot BL50 -10 -2 - 10.0 2
+fasta dna +5/-4 -14 -4 - 2.0 6
+ssearch prot bl50 -10 -2 - 10.0 -
+ssearch dna +5/-4 -14 -4 - 2.0 -
+fastx prot BL50 -12 -2 -20 5.0 2
+fasty prot BL50 -12 -2 -20/-24 5.0 2
+tfastx dna BL50 -14 -2 -20 5.0 2
+tfasty dna BL50 -14 -2 -20/-24 5.0 2
+fasts prot MD20-MS - - - 5.0 -
+tfasts prot MD10-MS - - - 2.0 -
+fastf prot MD20 - - - 5.0 -
+tfastf prot MD10 - - - 2.0 -
+fastm prot MD20 - - - 5.0 -
+tfastm prot MD10 - - - 2.0 -
+lalign prot BL50 -12 -2 - 10.0 -
diff --git a/doc/fasta.history.tex b/doc/fasta.history.tex
new file mode 100644
index 0000000..015f37c
--- /dev/null
+++ b/doc/fasta.history.tex
@@ -0,0 +1,180 @@
+\begin{longtable}{p{0.75 in}p{5.25 in}}
+\multicolumn{2}{c}{\textbf{FASTA version history (cont.)}} \\
+\hline\\[-1.0ex]
+% \textbf{Date} & {\bf Improvements} \\[0.5ex] \hline \\[-1.5ex]
+\endhead
+\multicolumn{2}{l}{{\Large {\bf B FASTA version history}}} \\[2 ex]
+\hline\\[-1.0ex]
+% {\bf Date} & {\bf Improvements} \\[0.5ex] \hline \\[-1.5ex]
+\endfirsthead
+\hline\\
+& \\
+\endfoot
+\hline\\
+& \\
+\endlastfoot
+
+\multicolumn{2}{c}{ \FASTA v33, Oct, 1999 -- Dec, 2000 } \\[1 ex]
+\hline \\[-0.5 ex]
+
+Oct 1999 & Add support for NCBI Blast2.0 formatted libraries, and
+memory mapped databases. \FASTA now reads both \texttt{BLAST1.4} and
+\texttt{BLAST2.0} formatted databases. (version 3.2t08)\\ & Include
+Maximum Likelihood Estimates for Lambda and K ( -z 2) \\
+
+ & Include a new strategy for searching with low
+complexity regions. The \texttt{pseg} program can produce libraries
+with low complexity regions as lower case characters, which can be
+ignored during the initial \texttt{FASTA}/\texttt{SSEARCH} scan, but are considered when
+producing the final alignments. (3.3t01)\\
+
+ & Change output to report bit scores, which are also used by BLAST. \\
+
+Mar 2000 & Another new statistics option, -z 6, uses Mott's
+approach \cite{mot921} for calculating a
+composition dependent Lambda for each sequence. (3.3t05) \\
+
+Dec 2000 & Automatically change the gap penalties when alternate
+(known) scoring matrices are used using Reese and Pearson gap
+penalties \cite{wrp022}. First implementation to read from MySQL
+databases. \\ May 2001 & change all \FASTA gap penalties from
+first-residue, additional residue to the gap-open, gap-extend values
+used by BLAST. \\[0.5ex]
+
+\hline \\[-0.5 ex]
+\multicolumn{2}{c}{ \FASTA v34, Jan, 2001 -- Jan, 2007 } \\[1 ex]
+\hline \\[-0.5 ex]
+
+Jun 2002 & Modify statistical estimation strategy to sample all the
+sequences in the database, not just the first 60,000. (3.4t11) \\
+
+Jan 2003 & Implementation of vector-accelerated (Altivec) code for
+Smith-Waterman ({\tt SSEARCH}) and banded Smith-Waterman (\FASTA)
+using the Rognes and Seeberg \cite{rog003} algorithm. This code was
+removed in Sept, 2003, because of possible conflict with a patent
+application, but was restored using a different algorithm in
+Nov. 2004. \\
+
+Jun 2003 & Provide \texttt{PSI-SEARCH} --- an implementation of
+\texttt{SSEARCH} that can search with \texttt{PSI-BLAST} PSSM profile
+files. \texttt{PSI-SEARCH} estimates statistical significance from
+the distribution of actual alignment scores; thus the estimates are
+much more reliable than \texttt{PSI-BLAST} estimates. Also, change
+the similarity display to work with profiles. (3.4t22) \\
+
+July 2003 & Provide ASN.1 definition line parsing for \texttt{BLAST}
+{\tt formatdb} v.4 libraries. Restructure the programs to use a table-driven
+approach to parameter setting. Two tables now define the algorithm,
+query sequence type, library type, scoring matrix, and gap penalties for
+all programs. \\
+
+Sept 2003 & A new option {\tt -V} for annotating alignments
+provided. Designed for highlighting post-translational modifications
+with {\tt fasts}, it can also be used to highlight active sites and
+other conserved residues. (3.4t23) \\
+
+Dec 2003 & Addition of {\tt -U} option for RNA sequence
+comparison. {\tt G:A} matches score like {\tt G:G} matches to account
+for {\tt G:U} basepairs. Change default {\it ktup} for short query
+sequences. Increase band-width for DNA banded final alignments. \\
+
+July 2004 & Allow searching of \texttt{Postgres}, as well as
+\texttt{MySQL} database queries. \\
+
+Nov 2004 & (\texttt{fa34t24}) Incorporation of Erik Lindahl "anti-diagonal" Altivec
+implementation of \cite{woz974} for Smith-Waterman only. Altivec
+{\tt ssearch34} is now faster than {\tt fasta34} for query sequences $<$ 250 amino acids. \\
+
+Jan 2005 & Change {\tt FASTS} to accommodate very large numbers of
+peptides ($>$100) for full coverage on long proteins \\
+
+Jun. 2006 & (\texttt{fa34t26}) Incorporation of Smith-Waterman
+algorithm for the SSE2 vector instructions written by Michael Farrar
+\cite{farrar2007}. The SSE code speeds up Smith-Waterman 8 --
+16-fold. \\[1.0 ex]
+
+\hline \\[-0.5 ex]
+\multicolumn{2}{c}{ \FASTA v35, March, 2007 -- March, 2010 } \\[1 ex]
+\hline \\[-0.5 ex]
+
+Mar. 2007 & fasta v35 -- Accurate shuffle-based $E()$-values for all searches and alignments; statistics from searches against small libraries are supplemented with shuffled alignments.\\[1 ex]
+
+ & More efficient threading strategies on multi-core computers, for 12X speedup on 16-core machines.\\[1 ex]
+
+ & Inclusion of \texttt{lalign} (\texttt{SIM}) local domain alignments. \texttt{lalign} alignments now have accurate shuffle-based $E()$-values.\\[1 ex]
+
+Apr. 2007 & Introduction of \texttt{ggsearch}, for global alignment searches, and \texttt{glsearch}, for searches with scores that are global in the query and local in the library. \texttt{ggsearch} and \texttt{glsearch} calculate $E()$-values using the normal distribution. Both programs can search with \texttt{PSI-BLAST} PSSMs.\\[1 ex]
+
+Dec. 2007 & Efficient strategy for searching subsets of databases (lists of GI or accession numbers) \\[1 ex]
+
+Feb. 2008 & Annotations in either query or library sequences can be highlighted in the alignment, and the state of annotated residues is compactly summarized with \texttt{-m 9c}. \\[1 ex]
+
+Oct. 2008 & Modification of \texttt{lsim4.c} (\texttt{lalign35}) provided by Xiaoqiu Huang to ensure
+that self-alignments do not cross the identity diagonal. \\[1ex]
+%\pagebreak
+\hline \\[-0.5 ex]
+\multicolumn{2}{c}{ \FASTA v36, March, 2010 -- } \\[1 ex]
+\hline \\[-0.5 ex]
+
+Mar. 2010 & \FASTA v36 displays all significant alignments between
+query and library sequence. BLAST has always displayed multiple
+high-scoring alignments (HSPs) between the query and library sequence;
+previous versions of the FASTA programs displayed only the best
+alignment, even when other high-scoring alignments were present.\\[1
+ ex]
+
+& New statistical options, \texttt{-z 21, 22, 26}, provide a second $E2()$-value
+estimate based on shuffles of the highest scoring sequences. \\[1 ex]
+
+ & Improved performance using statistics-based thresholds for
+ gap-joining and band-optimization in the heuristic FASTA local
+ alignment programs, increasing speed 2 - 3X. \\[1 ex]
+
+ & Greater flexibility in specifying combinations of library files
+ and subsets of libraries. \FASTA v36
+ programs can include indirect files of library names inside of
+ indirect files of library names. \\[1 ex]
+
+ & \FASTA36 programs are fully threaded, both for
+ searches, and for alignments. The programs routinely run 12 - 15X
+ faster on 8-core machines with "hyperthreading" (effectively 16 cores).
+ \\[1 ex]
+
+ & \texttt{-z 21} .. \texttt{26} E2() statistical estimates from
+ shuffled best scores.\\[1.0ex]
+
+Sep. 2010 & \texttt{-m 8}, \texttt{-m 8C} BLAST tabular output. \\[1.0ex]
+
+Nov, 2010 & Variable scoring matrices (\texttt{-s ?BP62}).\\[1.0ex]
+
+Dec, 2010 & (\texttt{fasta-36.3.1}) SSE2 vectorized \texttt{ggsearch36}, \texttt{glsearch36} (Michael Farrar).\\[1.0ex]
+
+Jan, 2011 & (\texttt{fasta-36.3.2}) MPI versions implemented and tested.\\[1ex]
+
+Feb, 2011 & Introduce \texttt{-m B}, \texttt{-m BB} BLAST-like output.\\[1.0ex]
+
+Mar, 2011 & (\texttt{fasta-36.3.4}) Program is no longer interactive by
+default. \texttt{fasta36 -h} and \texttt{fasta36 -help} provide
+common/complete options, with many defaults. \texttt{doc/fasta\_guide.pdf} available.\\[1.0ex]
+
+May, 2011 & (\texttt{fasta-36.3.5}) Introduce (1) \texttt{-e
+ expand.sh} scripts to extend the effective size of the database
+searched, based on significant hits; (2) \texttt{-m "F\# output.file"}
+to send different output formats to different files; and (3)
+\texttt{-X} expanded options, \texttt{-o} replaces the old \texttt{-X}
+and \texttt{-Xo} replaces \texttt{-o}. \\[1.0ex]
+
+Jan, 2012 & Include \texttt{.fastq} files as library type 7 \\[1.0ex]
+
+May, 2012 & allow reverse-complement alignments with \texttt{ggsearch} and \texttt{glsearch} \\[1.0ex]
+
+Jun, 2012 & Introduce \texttt{-V !script.pl} driven alignments, and variant scoring.\\[1.0ex]
+
+Aug, 2012 & Introduce \texttt{-V !ann\_feats.pl} sub-alignment (region-based) scoring.\\[1.0ex]
+
+Apr, 2013 & Extend \texttt{ENV} options to introduce a domain-plotting option for FASTA web sites.\\[1.0ex]
+
+Nov, 2014 & (\texttt{fasta-36.3.7}) Allow overlapping domains in annotation scripts.\\[1.0ex]
+
+\hline
+\end{longtable}
diff --git a/doc/fasta.options b/doc/fasta.options
new file mode 100644
index 0000000..bfc7b17
--- /dev/null
+++ b/doc/fasta.options
@@ -0,0 +1,55 @@
+doinit.c
+ case 'B': m_msg->z_bits = 0;
+ case 'C': m_msg->nmlen
+ case 'D': ppst->debug_lib = 1;
+ case 'F': m_msg->e_low
+ case 'H': m_msg->nohist = 0
+ case 'i': m_msg->revcomp = 1
+ case 'l': m_msg->flstr
+ case 'L': m_msg->long_info = 1
+ case 'm': m_msg->markx
+ case 'N': m_msg->maxn
+ case 'O': m_msg->outfile
+ case 'q':
+ case 'Q': m_msg->quiet = 1;
+ case 'R': m_msg->dfile
+ case 'T': max_workers
+ PCOMPLIB: worker_1,worker_n
+ case 'v': ppst->zs_win
+ case 'w': m_msg->aln.llen
+ case 'W': m_msg->aln.llcntx
+ case 'X': m_msg->sq0off,&m_msg->sq1off
+ case 'z': ppst->zsflag
+ case 'v': ppst->zs_win
+ case 'V': m_msg->ann_arr
+ case 'Z': ppst->zdb_size
+
+initfa.c
+ case '1': ppst->param_u.fa.iniflag=1;
+ case '3': m_msg->nframe = 3; /* TFASTA */
+ m_msg->nframe = 1; /* for TFASTXY */
+ m_msg->qframe = 1; /* for FASTA, FASTX */
+ case 'a': m_msg->aln.showall = 1;
+ case 'A': ppst->sw_flag= 1;
+ case 'b': m_msg->mshow
+ case 'c': ppst->param_u.fa.optcut
+ case 'd': m_msg->ashow;
+ case 'E': m_msg->e_cut, m_msg->e_cut_r
+ case 'f': ppst->gdelval
+ case 'g': ppst->ggapval
+ case 'h': help /ppst->gshift (-USHOW_HELP)
+ case 'I': m_msg->self = 1
+ case 'j': ppst->gshift, ppst->gsubs
+ case 'k': m_msg->shuff_max
+ case 'K': ppst->max_repeat
+ case 'M': m_msg->n1_low,&m_msg->n1_high
+ case 'n': m_msg->qdnaseq = SEQT_DNA (1)
+ case 'o': ppst->param_u.fa.optflag = 0;
+ case 'p': m_msg->qdnaseq = SEQT_PROT (0);
+ case 'r': ppst->p_d_mat,&ppst->p_d_mis
+ case 's': standard_pam(smstr); ppst->pamoff=atoi(bp+1);
+ case 'S': ppst->ext_sq_set = 1;
+ case 't': ppst->tr_type
+ case 'x': ppst->pam_x
+ case 'y': ppst->param_u.fa.optwid
+ case 'z': ppst->zsflag
diff --git a/doc/fasta36.1 b/doc/fasta36.1
new file mode 100644
index 0000000..4f4d52f
--- /dev/null
+++ b/doc/fasta36.1
@@ -0,0 +1,467 @@
+.TH fasta36/ssearch36/[t]fast[x,y]36/lalign36 1 local
+.SH NAME
+fasta36 \- scan a protein or DNA sequence library for similar
+sequences
+
+fastx36 \- compare a DNA sequence to a protein sequence
+database, comparing the translated DNA sequence in forward and
+reverse frames.
+
+tfastx36 \- compare a protein sequence to a DNA sequence
+database, calculating similarities with frameshifts to the forward and
+reverse orientations.
+
+fasty36 \- compare a DNA sequence to a protein sequence
+database, comparing the translated DNA sequence in forward and reverse
+frames.
+
+tfasty36 \- compare a protein sequence to a DNA sequence
+database, calculating similarities with frameshifts to the forward and
+reverse orientations.
+
+fasts36 \- compare unordered peptides to a protein sequence database
+
+fastm36 \- compare ordered peptides (or short DNA sequences)
+to a protein (DNA) sequence database
+
+tfasts36 \- compare unordered peptides to a translated DNA
+sequence database
+
+fastf36 \- compare mixed peptides to a protein sequence database
+
+tfastf36 \- compare mixed peptides to a translated DNA
+sequence database
+
+ssearch36 \- compare a protein or DNA sequence to a
+sequence database using the Smith-Waterman algorithm.
+
+ggsearch36 \- compare a protein or DNA sequence to a
+sequence database using a global alignment (Needleman-Wunsch)
+
+glsearch36 \- compare a protein or DNA sequence to a
+sequence database with alignments that are global in the query and
+local in the database sequence (global-local).
+
+lalign36 \- produce multiple non-overlapping alignments for protein
+and DNA sequences using the Huang and Miller sim algorithm for the
+Waterman-Eggert algorithm.
+
+prss36, prfx36 \- discontinued; all the FASTA programs will estimate
+statistical significance using 500 shuffled sequence scores if two
+sequences are compared.
+
+.SH DESCRIPTION
+
+Release 3.6 of the FASTA package provides a modular set of sequence
+comparison programs that can run on conventional single processor
+computers or in parallel on multiprocessor computers. More than a
+dozen programs \- fasta36, fastx36/tfastx36, fasty36/tfasty36,
+fasts36/tfasts36, fastm36, fastf36/tfastf36, ssearch36, ggsearch36,
+and glsearch36 \- are currently available.
+
+All the comparison programs share a set of basic command line options;
+additional options are available for individual comparison functions.
+
+Threaded versions of the FASTA programs (built by default under
+Unix/Linux/MacOS) run in parallel on modern Linux and Unix multi-core
+or multi-processor computers. Accelerated versions of the
+Smith-Waterman algorithm are available for architectures with the
+Intel SSE2 or Altivec PowerPC architectures, which can speed-up
+Smith-Waterman calculations 10 - 20-fold.
+
+In addition to the serial and threaded versions of the FASTA programs,
+MPI parallel versions are available as \fCfasta36_mpi\fP,
+\fCssearch36_mpi\fP, \fCfastx36_mpi\fP, etc. The MPI parallel versions
+use the same command line options as the serial and threaded versions.
+
+.SH Running the FASTA programs
+.LP
+By default, the FASTA programs are no longer interactive; they are run
+from the command line by specifying the program, query.file, and
+library.file. Program options \fImust\fP precede the
+query.file and library.file arguments:
+.sp
+.ti 0.5i
+\fCfasta36 -option1 -option2 -option3 query.file library.file > fasta.output\fP
+.sp
+The "classic" interactive mode, which prompts for a query.file and
+library.file, is available with the \fC-I\fP option. Typing a program
+name without any arguments (\fCssearch36\fP) provides a short help
+message; \fCprogram_name -help\fP provides a complete set of program
+options.
+.LP
+Program options \fIMUST\fP precede the query.file and library.file arguments.
+
+.SH FASTA program options
+.LP
+The default scoring matrix and gap penalties used by each of the
+programs have been selected for high sensitivity searches with the
+various algorithms. The default program behavior can be modified by
+providing command line options \fIbefore\fP the query.file and
+library.file arguments. Command line options can also be used in
+interactive mode.
+
+Command line arguments come in several classes.
+
+(1) Commands that specify the comparison type. FASTA, FASTS, FASTM,
+SSEARCH, GGSEARCH, and GLSEARCH can compare either protein or DNA
+sequences, and attempt to recognize the comparison type by looking at the
+residue composition. \fC-n\fP, \fC-p\fP specify DNA (nucleotide) or
+protein comparison, respectively. \fC-U\fP specifies RNA comparison.
+
+(2) Commands that limit the set of sequences compared: \fC-1\fP,
+\fC-3\fP, \fC-M\fP.
+
+(3) Commands that modify the scoring parameters: \fC-f gap-open penalty\fP, \fC-g
+gap-extend penalty\fP, \fC-j inter-codon frame-shift, within-codon frameshift\fP,
+\fC-s scoring-matrix\fP, \fC-r
+match/mismatch score\fP, \fC-x X:X score\fP.
+
+(4) Commands that modify the algorithm (mostly FASTA and [T]FASTX/Y):
+\fC-c\fP, \fC-w\fP, \fC-y\fP, \fC-o\fP. The \fC-S\fP can be used to
+ignore lower-case (low complexity) residues during the initial score
+calculation.
+
+(5) Commands that modify the output: \fC-A\fP, \fC-b number\fP, \fC-C
+width\fP, \fC-d number\fP, \fC-L\fP, \fC-m 0-11,B\fP, \fC-w
+line-width\fP, \fC-W context-width\fP, \fC-o offset1,offset2\fP
+
+(6) Commands that affect statistical estimates: \fC-Z\fP, \fC-k\fP.
+.SH Option summary:
+.TP
+\-1
+Sort by "init1" score (obsolete)
+.TP
+\-3
+([t]fast[x,y] only) use only forward frame translations
+.TP
+\-a
+Displays the full length (including unaligned regions) of both
+sequences with fasta36, ssearch36, glsearch36, and fasts36.
+.TP
+\-A (fasta36 only) For DNA:DNA, force Smith-Waterman alignment for
+output. Smith-Waterman is the default for FASTA protein alignment and
+[t]fast[x,y], but not for DNA comparisons with FASTA. For
+protein:protein, use band-alignment algorithm.
+.TP
+\-b #
+number of best scores/descriptions to show (must be <
+expectation cutoff if -E is given). By default, this option is no
+longer used; all scores better than the expectation (E()) cutoff are
+listed. To guarantee the display of # descriptions/scores, use \fC-b
+=#\fP, i.e. \fC-b =100\fP ensures that 100 descriptions/scores will be
+displayed. To guarantee at least 1 description, but possibly many
+more (limited by \fC-E e_cut\fP), use \fC-b >1\fP.
+.TP
+\-c "E-opt E-join"
+threshold for gap joining (E-join) and band optimization (E-opt) in
+FASTA and [T]FASTX/Y. FASTA36 now uses BLAST-like statistical
+thresholds for joining and band optimization. The default statistical
+thresholds for protein and translated comparisons are E-opt=0.2,
+E-join=0.5; for DNA, E-join = 0.1 and E-opt= 0.02. The actual number
+of joins and optimizations is reported after the E-join and E-opt
+scoring parameters. Statistical thresholds improve search speed 2 -
+3X, and provide much more accurate statistical estimates for matrices
+other than BLOSUM50. The "classic" joining/optimization thresholds
+that were the default in fasta35 and earlier programs are available
+using -c O (upper case O), possibly followed by a value > 1.0 to set
+the optcut optimization threshold.
+.TP
+\-C #
+length of name abbreviation in alignments, default = 6. Must be less
+than 20.
+.TP
+\-d #
+number of best alignments to show ( must be < expectation (-E) cutoff
+and <= the -b description limit).
+.TP
+\-D
+turn on debugging mode. Enables checks on sequence alphabet that
+cause problems with tfastx36, tfasty36 (only available after compile
+time option). Also preserves temp files with -e expand_script.sh option.
+.TP
+\-e expand_script.sh
+Run a script to expand the set of sequences displayed/aligned based on
+the results of the initial search. When the -e expand_script.sh
+option is used, after the initial scan and statistics calculation, but
+before the "Best scores" are shown, expand_script.sh with a single
+argument, the name of a file that contains the accession information
+(the text on the fasta description line between > and the first space)
+and the E()-value for the sequence. expand_script.sh then uses this
+information to send a library of additional sequences to stdout. These
+additional sequences are included in the list of high-scoring
+sequences (if their scores are significant) and aligned. The
+additional sequences do not change the statistics or database size.
+.TP
+\-E e_cut e_cut_r
+expectation value upper limit for score and alignment display.
+Defaults are 10.0 for FASTA36 and SSEARCH36 protein searches, 5.0 for
+translated DNA/protein comparisons, and 2.0 for DNA/DNA
+searches. FASTA version 36 now reports additional alignments between
+the query and the library sequence, the second value sets the
+threshold for the subsequent alignments. If not given, the threshold
+is e_cut/10.0. If given and value > 1.0, e_cut_r = e_cut / value; for
+value < 1.0, e_cut_r = value; If e_cut_r < 0, then the additional
+alignment option is disabled.
+.TP
+\-f #
+penalty for opening a gap.
+.TP
+\-F #
+expectation value lower limit for score and alignment display.
+-F 1e-6 prevents library sequences with E()-values lower than 1e-6
+from being displayed. This allows the user to focus on more distant
+relationships.
+.TP
+\-g #
+penalty for additional residues in a gap
+.TP
+\-h
+Show short help message.
+.TP
+\-help
+Show long help message, with all options.
+.TP
+\-H
+show histogram (with fasta-36.3.4, the histogram is not shown by default).
+.TP
+\-i
+(fasta DNA, [t]fast[x,y]) compare against
+only the reverse complement of the library sequence.
+.TP
+\-I
+interactive mode; prompt for query filename, library.
+.TP
+\-j # #
+([t]fast[x,y] only) penalty for a frameshift between two codons,
+([t]fasty only) penalty for a frameshift within a codon.
+.TP
+\-J
+(lalign36 only) show identity alignment.
+.TP
+\-k
+specify number of shuffles for statistical parameter estimation (default=500).
+.TP
+\-l str
+specify FASTLIBS file
+.TP
+\-L
+report long sequence description in alignments (up to 200 characters).
+.TP
+\-m 0,1,2,3,4,5,6,8,9,10,11,B,BB,"F# out.file" alignment display
+options. \fC-m 0, 1, 2, 3\fP display different types of alignments.
+\fC-m 4\fP provides an alignment "map" on the query. \fC-m 5\fP
+combines the alignment map and a \fC-m 0\fP alignment. \fC-m 6\fP
+provides an HTML output.
+.TP
+\fC-m 8\fP seeks to mimic BLAST -m 8 tabular output. Only query and
+library sequence names, and identity, mismatch, starts/stops,
+E()-values, and bit scores are displayed. \fC-m 8C\fP mimics BLAST
+tabular format with comment lines. \fC-m 8\fP formats do not show
+alignments.
+.TP
+\fC-m 9\fP does not change the alignment output, but provides
+alignment coordinate and percent identity information with the best
+scores report. \fC-m 9c\fP adds encoded alignment information to the
+\fC-m 9\fP; \fC-m 9C\fP adds encoded alignment information as a CIGAR
+formatted string. To accommodate frameshifts, the CIGAR format has been
+supplemented with F (forward) and R (reverse). \fC-m 9i\fP provides
+only percent identity and alignment length information with the best
+scores. With current versions of the FASTA programs, independent
+\fC-m\fP options can be combined; e.g. \fC-m 1 -m 9c -m 6\fP.
+.TP
+\-m 11 provides \fClav\fP format output from lalign36. It does not
+currently affect other alignment algorithms. The \fClav2ps\fP and
+\fClav2svg\fP programs can be used to convert \fClav\fP format output
+to postscript/SVG alignment "dot-plots".
+.TP
+\-m B provides \fCBLAST\fP-like alignments. Alignments are labeled as
+"Query" and "Sbjct", with coordinates on the same line as the
+sequences, and \fCBLAST\fP-like symbols for matches and
+mismatches. \fC-m BB\fP extends BLAST similarity to all the output,
+providing an output that closely mimics BLAST output.
+.TP
+\-m "F# out.file" allows one search to write different alignment
+formats to different files. The 'F' indicates separate file output;
+the '#' is the output format (1-6,8,9,10,11,B,BB, multiple compatible
+formats can be combined separated by commas -',').
+.TP
+\-M #-#
+molecular weight (residue) cutoffs. -M "101-200" examines only
+library sequences that are 101-200 residues long.
+.TP
+\-n
+force query to nucleotide sequence
+.TP
+\-N #
+break long library sequences into blocks of # residues. Useful for
+bacterial genomes, which have only one sequence entry. -N 2000 works
+well for bacterial genomes. (This option was required when
+FASTA only provided one alignment between the query and library
+sequence. It is not as useful, now that multiple alignments are
+available.)
+.TP
+\-o "#,#"
+offsets query, library sequence for numbering alignments
+.TP
+\-O file
+send output to file.
+.TP
+\-p
+force query to protein alphabet.
+.TP
+\-P pssm_file
+(ssearch36, ggsearch36, glsearch36 only). Provide blastpgp checkpoint
+file as the PSSM for searching. Two PSSM file formats are available,
+which must be provided with the filename. 'pssm_file 0' uses a binary
+format that is machine specific; 'pssm_file 1' uses the "blastpgp -u 1
+-C pssm_file" ASN.1 binary format (preferred).
+.TP
+\-q/-Q
+quiet option; do not prompt for input (on by default)
+.TP
+\-r "+n/-m"
+(DNA only) values for match/mismatch for DNA comparisons. \fC+n\fP is
+used for the maximum positive value and \fC-m\fP is used for the
+maximum negative value. Values between max and min, are rescaled, but
+residue pairs having the value -1 continue to be -1.
+.TP
+\-R file
+save all scores to statistics file (previously -r file)
+.TP
+\-s name
+specify substitution matrix. BLOSUM50 is used by default; PAM250,
+PAM120, and BLOSUM62 can be specified by setting -s P120, P250, or
+BL62. Additional scoring matrices include: BLOSUM80 (BL80), and
+MDM10, MDM20, MDM40 (Jones, Taylor, and Thornton, 1992 CABIOS
+8:275-282; specified as -s MD10, -s MD20, -s MD40), OPTIMA5 (-s OPT5,
+Kann and Goldstein, (2002) Proteins 48:367-376), and VTML160 (-s
+VT160, Mueller and Vingron (2002) J. Comp. Biol. 19:8-13). Each
+scoring matrix has associated default gap penalties. The BLOSUM62
+scoring matrix and -11/-1 gap penalties can be specified with -s BP62.
+.IP
+Alternatively, a BLASTP format scoring matrix file can be specified,
+e.g. -s matrix.filename. DNA scoring matrices can also be specified
+with the "-r" option.
+.IP
+With fasta36.3, variable scoring matrices can
+be specified by preceding the scoring matrix abbreviation with '?',
+e.g. -s '?BP62'. Variable scoring matrices allow the FASTA programs to
+choose an alternative scoring matrix with higher information content
+(bit score/position) when short queries are used. For example, a 90
+nucleotide FASTX query can produce only a 30 amino-acid alignment, so
+a scoring matrix with 1.33 bits/position is required to produce a 40
+bit score. The FASTA programs include BLOSUM50 (0.49 bits/pos) and
+BLOSUM62 (0.58 bits/pos) but can range to MD10 (3.44
+bits/position). The variable scoring matrix option searches down the
+list of scoring matrices to find one with information content high
+enough to produce a 40 bit alignment score.
+.TP
+\-S
+treat lower case letters in the query or database as low complexity
+regions that are equivalent to 'X' during the initial database scan,
+but are treated as normal residues for the final alignment display.
+Statistical estimates are based on the 'X'ed out sequence used during
+the initial search. Protein databases (and query sequences) can be
+generated in the appropriate format using John Wootton's "pseg"
+program, available from ftp://ftp.ncbi.nih.gov/pub/seg/pseg. Once you
+have compiled the "pseg" program, use the command:
+.IP
+\fCpseg database.fasta -z 1 -q > database.lc_seg\fP
+.TP
+\-t #
+Translation table - [t]fastx36 and [t]fasty36 support the BLAST
+translation tables. See
+\fChttp://www.ncbi.nih.gov/htbin-post/Taxonomy/wprintgc?mode=c/\fP.
+.TP
+\-T #
+(threaded, parallel only) number of threads or workers to use (on
+Linux/MacOS/Unix, the default is to use as many processors as are
+available; on Windows systems, 2 processors are used).
+.TP
+\-U
+Do RNA sequence comparisons: treat 'T' as 'U', allow G:U base pairs (by
+scoring "G-A" and "T-C" as score(G:G)-3). Search only one strand.
+.TP
+\-V "?$%*"
+Allow special annotation characters in query sequence. These characters
+will be displayed in the alignments on the coordinate number line.
+.TP
+\-w # line width for similarity score, sequence alignment, output.
+.TP
+\-W # context length (default is 1/2 of line width -w) for alignment programs,
+like fasta and ssearch, that provide additional sequence context.
+.TP
+\-X extended options. Less used options. Other options include
+\fC-XB\fP, \fC-XM4G\fP, \fC-Xo\fP, \fC-Xx\fP, and \fC-Xy\fP; see
+\fBfasta_guide.pdf\fP.
+.TP
+\-z 1, 2, 3, 4, 5, 6
+Specify the statistical calculation. Default is -z 1 for local
+similarity searches, which uses regression against the length of the
+library sequence. -z -1 disables statistics. -z 0 estimates
+significance without normalizing for sequence length. -z 2 provides
+maximum likelihood estimates for lambda and K, censoring the 250
+lowest and 250 highest scores. -z 3 uses Altschul and Gish's
+statistical estimates for specific protein BLOSUM scoring matrices and
+gap penalties. -z 4,5: an alternate regression method. \-z 6 uses a
+composition based maximum likelihood estimate based on the method of
+Mott (1992) Bull. Math. Biol. 54:59-75.
+.TP
+\-z 11,12,14,15,16
+compute the regression against scores of randomly
+shuffled copies of the library sequences. Twice as many comparisons
+are performed, but accurate estimates can be generated from databases
+of related sequences. -z 11 uses the -z 1 regression strategy, etc.
+.TP
+\-z 21, 22, 24, 25, 26
+compute two E()-values. The standard (library-based) E()-value is
+calculated in the standard way (-z 1, 2, etc), but a second E2()
+value is calculated by shuffling the high-scoring sequences (those
+with E()-values less than the threshold). For "average" composition
+proteins, these two estimates will be similar (though the
+best-shuffle estimates are always more conservative). For biased
+composition proteins, the two estimates may differ by 100-fold or
+more. A second -z option, e.g. -z "21 2", specifies the estimation
+method for the best-shuffle E2()-values. Best-shuffle E2()-values
+approximate the estimates given by PRSS (or in a pairwise SSEARCH).
+.TP
+\-Z db_size
+Set the apparent database size used for expectation value calculations
+(used for protein/protein FASTA and SSEARCH, and for [T]FASTX/Y).
+.SH Reading sequences from STDIN
+.LP
+The FASTA programs can accept a query sequence from
+the unix "stdin" data stream. This makes it much easier to use
+fasta36 and its relatives as part of a WWW page. To indicate that
+stdin is to be used, use "@" as the query sequence file name. "@" can
+also be used to specify a subset of the query sequence to be used,
+e.g:
+.sp
+.ti 0.5i
+cat query.aa | fasta36 @:50-150 s
+.sp
+would search the 's' database with residues 50-150 of query.aa. FASTA
+cannot automatically detect the sequence type (protein vs DNA) when
+"stdin" is used and assumes protein comparisons by default; the '-n'
+option is required for DNA for STDIN queries.
+.SH Environment variables:
+.TP
+FASTLIBS
+location of library choice file (-l FASTLIBS)
+.TP
+SRCH_URL1, SRCH_URL2
+format strings used to define options to re-search the
+database.
+.TP
+REF_URL
+the format string used to define the option to lookup the library
+sequence in entrez, or some other database.
+
+.SH AUTHOR
+Bill Pearson
+.br
+wrp at virginia.EDU
+
+Version: $ Id: $
+Revision: $Revision: 210 $
diff --git a/doc/fasta_func.doc b/doc/fasta_func.doc
new file mode 100644
index 0000000..b9330e5
--- /dev/null
+++ b/doc/fasta_func.doc
@@ -0,0 +1,300 @@
+Overall structure of the fasta3 program. (Some functions
+are different for translated comparisons FASTX, FASTY, TFASTX, TFASTY.)
+
+main() { /* complib.c structure */
+
+ /* get command line arguments, set up initial parameter values */
+ initenv (argc, argv, &m_msg, &pst,&aa0[0],outtty);
+
+ /* allocate space for sequence arrays */
+ /* get the query file name if not on command line */
+ /* get query */
+ m_msg.n0 = getseq (m_msg.tname,aa0[0], MAXTOT, m_msg.libstr,&pst.dnaseq,
+ &m_msg.sq0off);
+
+ /* reset some parameters if DNA */
+ resetp (aa0[0], m_msg.n0, &m_msg, &pst);
+
+ /* get a library name if not on command line */
+ libchoice(m_msg.lname,sizeof(m_msg.lname),&m_msg);
+ /* use library name to build list of library files */
+ libselect(m_msg.lname, &m_msg);
+
+ /* get additional options (ktup, prss-window) if not specified */
+ query_parm (&m_msg, &pst);
+
+ /* do final parameter initializations */
+ last_init(&m_msg, &pst);
+
+ /* set up structures for saved scores[20000], statistics[50000] */
+ nbest = 0;
+
+ /* initialize the comparison function */
+ init_work (aa0[0], m_msg.n0, &pst, &f_str[0]);
+
+ /* open the library */
+ for (iln = 0; iln < m_msg.nln; iln++) {
+ if (openlib(m_msg.lbnames[iln],m_msg)!=1) {continue;}
+ }
+
+ /* get the library sequence and do the comparison */
+ while ((n1=GETLIB(aa1ptr,maxt,libstr,&lmark,&lcont))>0) {
+ do_work (aa0[itt], m_msg.n0, aa1, n1, itt, &pst, f_str[itt], &rst);
+
+ /* save the scores */
+ /* save the scores for statistics */
+ }
+
+ /* all done with all libraries */
+ process_hist(stats,nstats,pst);
+
+ /* sort the scores by z-value */
+ sortbestz (bptr, nbest);
+
+ /* sort the scores by E-value */
+ sortbeste (bptr, nbest);
+
+ /* print the histogram */
+ prhist (stdout,m_msg,pst,gstring2);
+
+ /* show the high scoring sequences */
+ showbest (stdout, aa0, aa1, maxn, bptr, nbest, qlib, &m_msg, pst,
+ f_str, gstring2);
+
+ /* show the high-scoring alignments */
+ showalign(outfd, aa0, aa1, maxn, bptr, nbest, qlib, m_msg, pst,
+ f_str, gstring2);
+
+ /* thats all folks !!! */
+}
+
+================
+complib.c /* version set as mp_verstr */
+
+main()
+printsum() /* prints summary of run (residues, entries, time) */
+void fsigint() /* sets up interrupt handler for HUP not used */
+
+================
+compacc.c
+
+void selectbest() /* select best 15000/20000 based on raw score */
+void selectbestz() /* select best 15000/20000 based on z-score */
+void sortbest() /* sort based on raw score */
+void sortbestz() /* sort based on z-score */
+void sortbeste() /* sort based on E() score - different from z-score for DNA */
+
+prhist() /* print histogram */
+
+shuffle() /* shuffle sequence (prss) */
+wshuffle() /* window shuffle */
+
+================
+showbest.c
+
+void showbest() /* present list of high scoring sequences */
+
+================
+showalign.c
+
+void showalign() /* show list of high-scoring alignments */
+void do_show() /* show an individual alignment */
+void initseq() /* setup seqc0/seqc1 which contain alignment characters */
+void freeseq() /* free them up */
+
+================
+htime.c
+
+time_t s_time() /* get the time in usecs */
+void ptime() /* print elapsed time */
+
+================
+apam.c
+
+initpam () /* read in PAM matrix or change default array */
+void mk_n_pam() /* make DNA pam from +5/-3 values */
+================
+doinit.c
+
+void initenv() /* read environment variables, general options */
+================
+initfa.c /* version set as "verstr" */
+
+alloc_pam() /* allocate 2D pam array */
+initpam2() /* fill it up from 1D pam triangle */
+f_initenv() /* function-specific environment variables */
+f_getopt() /* function-specific options */
+f_getarg() /* function specific argument - ktup */
+resetp() /* reset scoring matrix, optional parameters for DNA-DNA */
+reseta() /* reset scoring matrix, optional parameters for prot-DNA */
+query_parm() /* ask for additional program arguments (ktup) */
+last_init() /* last chance to set up parameters based on query,lib,parms */
+f_initpam() /* not used - could set parameters from pam matrix */
+
+================
+scaleswn.c
+
+process_hist() /* do statistics calculations */
+
+ proc_hist_r() /* regression fit z=1, also used by z=5 */
+ float find_z() /* gives z-score for score, length, mu, rho, var */
+ float find_zr() /* gives z-score for score, length, mu, rho, var */
+ fit_llen() /* first estimate of mu, rho, var */
+ fit_llens() /* second estimate of mu, rho, var, mu2, rho2 */
+
+ proc_hist_r2() /* regression_i fit z=4 */
+ float find_zr2() /* gives z-score for score, length, mu, rho, mu2, rho2 */
+ fit_llen2() /* iterative estimate of mu, rho, var */
+
+ proc_hist_ln() /* ln()-scaled z=2 */ /* no longer used */
+ float find_zl() /* gives z-score from ln()-scaled scores */
+
+ proc_hist_ml() /* estimate lambda, K using Maximum Likelihood */
+ float find_ze() /* z-score from lambda, K */
+
+ proc_hist_n() /* no length-scaling z=0 */
+ float find_zn() /* gives z-score from mu, var (no scaling) */
+
+ proc_hist_a() /* Altschul-Gish params z= 3 */
+ ag_parm() /* match pst.pamfile name, look_p() */
+ look_p() /* lookup Lambda, K, H given param struct */
+ float find_za()
+
+eq_s() /* returns (double)score (available for length correction) */
+ln_s() /* returns (double)score * ln(200)/ln(length) */
+
+proc_hist_r() /* regression fit z=1, also used by z=5 */
+alloc_hist() /* set up arrays for score vs length */
+free_hist() /* free them */
+inithist() /* calls alloc_hist(), sets some other globals */
+addhist() /* update score vs length hist */
+inithistz() /* initialize displayed (z-score) histogram hist[]*/
+addhistz() /* add to hist[], increment num_db_entries */
+addhistzp() /* add to hist[], don't change num_db_entries */
+prune_hist() /* remove scores from score vs length */
+update_db_size() /* num_db_entries = nlib - ntrimmed */
+set_db_size() /* -Z db_size; set nlib */
+
+double z_to_E() /* z-value to E() (extreme value distribution) */
+double zs_to_E() /* z-score (mu=50, sigma=10) to E() */
+double zs_to_bit() /* z-score to BLAST2 bit score */
+
+float E_to_zs() /* E() to z-score */
+double zs_to_Ec() /* z-score to num_db_entries*(1 - P(zs)) */
+
+summ_stats() /* put stat summary in string */
+vsort() /* not used, does shell sort */
+calc_ks() /* does Kolmogorov-Smirnov calculation for histogram */
+================
+dropnfa.c /* contains worker comparison functions */
+
+init_work() /* set up struct f_struct fstr - hash query */
+get_param() /* actually prints parameters to string */
+close_work() /* clean up fstr */
+do_work() /* do a comparison */
+ do_fasta() /* use the fasta() function */
+ savemax() /* save the best region during scan */
+ spam() /* rescan the best regions */
+ sconn() /* try to connect the best regions for initn */
+ kssort() /* sort by score */
+ kpsort() /* sort by left end pos */
+ shscore() /* best self-score */
+ dmatch() /* do band alignment for opt score */
+ FLOCAL_ALIGN() /* fast band score-only */
+
+do_opt() /* do an "optimized" comparison */
+
+do_walign() /* put an alignment into res[] for calcons() */
+ sw_walign() /* SW alignment driver - find boundaries */
+ ALIGN() /* actual alignment driver */
+ nw_align() /* recursive global alignment */
+ CHECK_SCORE() /* double check */
+ DISPLAY() /* Miller's display routine */
+
+ bd_walign() /* band alignment driver for DNA */
+ LOCAL_ALIGN() /* find boundaries in band */
+ B_ALIGN() /* produce band alignment */
+ bg_align() /* recursively produce band alignment */
+ BCHECK_SCORE() /* double check */
+
+calcons() /* calculate ascii alignment seqc0,seqc1 from res[]*/
+calc_id() /* calculate % identity with no alignment */
+================
+nxgetaa.c
+
+getseq() /* get a query (prot or DNA) */
+getntseq() /* get a nt query (for fastx, fasty) */
+gettitle() /* get a description */
+
+int openlib() /* open a library */
+closelib() /* close it */
+GETLIB() /* get a fasta-format next library entry */
+RANLIB() /* jump back in, get description, position for getlib() */
+
+lgetlib() /* get a Genbank flat-file format next library entry */
+lranlib() /* jump back in, get description, position for lgetlib() */
+
+pgetlib() /* get CODATA format next library entry */
+pranlib() /* jump back in, get description, position for pgetlib() */
+
+egetlib() /* get EMBL format next library entry */
+eranlib() /* jump back in, get description, position for egetlib() */
+
+igetlib() /* get Intelligenetics format next library entry */
+iranlib() /* jump back in, get description, position for igetlib() */
+
+vgetlib() /* get PIR/VMS/GCG format next library entry */
+vranlib() /* jump back in, get description, position for vgetlib() */
+
+gcg_getlib() /* get GCG binary format next library entry */
+gcg_ranlib() /* jump back in, get description, position for gcg_getlib() */
+
+int scanseq() /* find %ACGT */
+
+revcomp() /* do reverse complement */
+sf_sort() /* sort superfamily numbers */
+================
+c_dispn.c
+
+discons() /* display alignment from seqc0, seqc1 */
+disgraph() /* display graphical representation, -m 4,5 */
+aancpy() /* copy a binary sequence to ascii */
+r_memcpy()
+l_memcpy()
+iidex() /* lookup ascii-encoding of residue */
+cal_coord() /* calculate coordinates of alignment ends */
+
+================
+ncbl_lib.c
+
+ncbl_openlib()
+ncbl_closelib()
+ncbl_getliba()
+ncbl_getlibn()
+ncbl_ranlib()
+src_ulong_read()
+src_long_read()
+src_char_read()
+src_fstr_read()
+newname()
+
+================
+lib_sel.c
+
+getlnames()
+libchoice()
+libselect()
+addfile()
+ulindex()
+
+================
+nrand48.c
+
+irand(time) /* initialize random number generator */
+nrand(n) /* get a number 0 - n */
+
+================
+url_subs.c
+
+void do_url1() /* setup search links */
+
diff --git a/doc/fasta_guide.bib b/doc/fasta_guide.bib
new file mode 100644
index 0000000..4d722e6
--- /dev/null
+++ b/doc/fasta_guide.bib
@@ -0,0 +1,265 @@
+
+ at article( WRP881,
+ author = {W. R. Pearson
+ and D. J. Lipman},
+ title = {Improved tools for biological sequence comparison},
+ year = 1988,
+ journal = {Proc. Natl. Acad. Sci. USA},
+ volume = 85,
+ pages = {2444-2448},
+ annote = 88190088 )
+
+ at incollection( day787,
+ author = {M. Dayhoff
+ and R. M. Schwartz
+ and B. C. Orcutt},
+ title = {A model of evolutionary change in proteins},
+ year = 1978,
+ volume = {5, supplement 3},
+ booktitle = {Atlas of Protein Sequence and Structure},
+ editor = {M. Dayhoff},
+ publisher = {National Biomedical Research Foundation},
+ pages = {345-352},
+ address = {Silver Spring, MD} )
+
+ at article( WRP960,
+ author = {W. R. Pearson},
+ title = {Effective protein sequence comparison},
+ year = 1996,
+ journal = {Methods Enzymol.},
+ volume = 266,
+ pages = {227-258},
+ annote = 97422296 )
+
+ at article( wrp971,
+ author = {Z. Zhang
+ and W. R. Pearson
+ and W. Miller},
+ title = {Aligning a {DNA} sequence with a protein sequence},
+ year = 1997,
+ journal = {J. Computational Biology},
+ volume = 4,
+ pages = {339-349},
+ annote = 97422296 )
+
+ at article( wrp973,
+ author = {W. R. Pearson
+ and T. C. Wood
+ and Z. Zhang
+ and W. Miller},
+ title = {Comparison of {DNA} sequences with protein sequences},
+ year = 1997,
+ journal = {Genomics},
+ volume = 46,
+ pages = {24-36},
+ annote = 98066759 )
+
+ at article( wrp951,
+ author = {W. R. Pearson},
+ title = {
+Comparison of methods for searching protein sequence databases},
+ year = 1995,
+ journal = {Prot. Sci.},
+ volume = 4,
+ pages = {1145-1160},
+ annote = 97422296 )
+
+ at article( wrp981,
+ author = {W. R. Pearson},
+ title = {
+Empirical statistical estimates for sequence similarity searches},
+ year = 1998,
+ journal = {J. Mol. Biol.},
+ volume = 276,
+ pages = {71-84},
+ annote = 98179551 )
+
+ at article( tay925,
+ author = {D. T. Jones
+ and W. R. Taylor
+ and J. M. Thornton},
+ title = {
+The rapid generation of mutation data matrices from protein sequences},
+ year = 1992,
+ journal = {Comp. Appl. Biosci.},
+ volume = 8,
+ pages = {275-282} )
+
+ at article( woo935,
+ author = {J. C. Wootton
+ and S. Federhen},
+ title = {
+Statistics of local complexity in amino acid sequences and sequence databases},
+ year = 1993,
+ journal = {Comput. Chem.},
+ volume = 17,
+ pages = {149-163} )
+
+ at article( alt960,
+ author = {S. F. Altschul
+ and W. Gish},
+ title = {Local alignment statistics},
+ year = 1996,
+ journal = {Methods Enzymol.},
+ volume = 266,
+ pages = {460-480} )
+
+ at article( alt915,
+ author = {S. F. Altschul},
+ title = {
+Amino acid substitution matrices from an information theoretic
+perspective},
+ year = 1991,
+ journal = {J. Mol. Biol.},
+ volume = 219,
+ pages = {555-65} )
+
+ at article( WAT815,
+ author = {T. F. Smith
+ and M. S. Waterman},
+ title = {Identification of common molecular subsequences},
+ year = 1981,
+ journal = {J. Mol. Biol.},
+ volume = 147,
+ pages = {195-197},
+ annote = 81267385 )
+
+ at article( wrp021,
+ author = {A. J. Mackey
+ and T. A. J. Haystead
+ and W. R. Pearson},
+ title = {
+Getting more From Less: Algorithms for Rapid Protein Identification
+with Multiple Short Peptide Sequences},
+ year = 2002,
+ journal = {Mol. Cell. Proteomics},
+ volume = 1,
+ pages = {139-147} )
+
+ at article( farrar2007,
+ author = {M. Farrar},
+ title = {
+Striped {S}mith-{W}aterman speeds database searches six times over
+ other SIMD implementations},
+ year = 2007,
+ journal = {Bioinformatics},
+ volume = 23,
+ pages = {156-161},
+ annote = 17110365 )
+
+ at article{kan023,
+author = {Maricel G Kann and Richard A Goldstein},
+journal = {Proteins},
+title = {Performance evaluation of a new algorithm for the detection of remote homologs with sequence comparison},
+pages = {367--76},
+volume = {48},
+year = {2002},
+month = {Aug},
+pmid = {12112703}
+}
+
+ at article{Muller2002,
+author = {Tobias Muller and Rainer Spang and Martin Vingron},
+journal = {Mol Biol Evol},
+title = {Estimating amino acid substitution models: a comparison of Dayhoff's estimator, the resolvent approach and a maximum likelihood method},
+pages = {8--13},
+volume = {19},
+year = {2002},
+date-added = {2011-03-14 22:15:08 -0400},
+date-modified = {2011-03-14 22:15:08 -0400},
+pmid = {11752185},
+URL = {http://mbe.oxfordjournals.org/content/19/1/8.long}
+}
+
+ at article( hen929,
+ author = {S. Henikoff
+ and J. G. Henikoff},
+ title = {Amino acid substitution matrices from protein blocks},
+ year = 1992,
+ journal = {Proc. Natl. Acad. Sci. USA},
+ volume = 89,
+ pages = {10915-10919} )
+
+ at article( WAT875,
+ author = {M. S. Waterman
+ and M. Eggert},
+ title = {
+A new algorithm for best subsequence alignments with application to
+t{RNA}-r{RNA} comparisons},
+ year = 1987,
+ journal = {J. Mol. Biol.},
+ volume = 197,
+ pages = {723-728} )
+
+ at article( mil908,
+ author = {X. Huang
+ and R. C. Hardison
+ and W. Miller},
+ title = {A space-efficient algorithm for local similarities},
+ year = 1990,
+ journal = {Comp. Appl. Biosci.},
+ volume = 6,
+ pages = {373-381} )
+
+
+ at article( uniprot11,
+ author = {UniProt Consortium},
+ title = {
+Ongoing and future developments at the Universal Protein Resource.},
+ year = 2011,
+ journal = {Nucleic Acids Res},
+ volume = 39,
+ pages = {D214-D219},
+ annote = 21051339 )
+
+ at article( wrp022,
+ author = {J. T. Reese
+ and W. R. Pearson},
+ title = {
+Empirical determination of effective gap penalties for sequence
+comparison},
+ year = 2002,
+ journal = {Bioinformatics},
+ volume = 18,
+ pages = {1500-1507},
+ annote = 22310732 )
+
+ at article( rog003,
+ author = {T. Rognes
+ and E. Seeberg},
+ title = {
+Six-fold speed-up of Smith-Waterman sequence database searches using
+parallel processing on common microprocessors},
+ year = 2000,
+ journal = {Bioinformatics},
+ volume = 16,
+ pages = {699-706},
+ annote = 20551510 )
+
+ at article( mot921,
+ author = {R. Mott},
+ title = {
+Maximum-likelihood estimation of the statistical distribution of
+Smith-Waterman local sequence similarity scores},
+ year = 1992,
+ journal = {Bull. Math. Biol.},
+ volume = 54,
+ pages = {59-75} )
+
+ at article( woz974,
+ author = {A. Wozniak},
+ title = {
+Using video-oriented instructions to speed up sequence comparison},
+ year = 1997,
+ journal = {Comput Appl Biosci},
+ volume = 13,
+ pages = {145-150},
+ annote = 97292450 )
+
+ at article{wrp136,
+ Author = {L. J. Mills and W. R. Pearson},
+ Journal = {Bioinformatics},
+ Pages = {3007-3013},
+ Title = {Adjusting scoring matrices to correct overextended alignments.},
+ Volume = 29,
+ Year = 2013}
diff --git a/doc/fasta_guide.fg1.tex b/doc/fasta_guide.fg1.tex
new file mode 100644
index 0000000..bb223d9
--- /dev/null
+++ b/doc/fasta_guide.fg1.tex
@@ -0,0 +1,60 @@
+\begin{footnotesize}
+\begin{quote}
+\begin{verbatim}
+# ../bin/ssearch36 -q -w 80 ../seq/mgstm1.aa a
+SSEARCH performs a Smith-Waterman search
+ version 36.3.6 June, 2013(preload9)
+Please cite:
+ T. F. Smith and M. S. Waterman, (1981) J. Mol. Biol. 147:195-197;
+ W.R. Pearson (1991) Genomics 11:635-650
+Query: ../seq/mgstm1.aa
+ 1>>>mGSTM1 mouse glutathione transferase M1 - 218 aa
+Library: PIR1 Annotated (rel. 66)
+ 5121825 residues in 13143 sequences
+
+Statistics: Expectation_n fit: rho(ln(x))= 7.4729+/-0.000484; mu= 2.0282+/- 0.027
+ mean_var=56.9651+/-10.957, 0's: 9 Z-trim(119.4): 17 B-trim: 67 in 1/62
+ Lambda= 0.169930
+ statistics sampled from 13135 (13143) to 13135 sequences
+Algorithm: Smith-Waterman (SSE2, Michael Farrar 2006) (7.2 Nov 2010)
+Parameters: BL50 matrix (15:-5), open/ext: -10/-2
+ Scan time: 3.820
+The best scores are: s-w bits E(13143)
+sp|P08010|GSTM2_RAT Glutathione S-transferase Mu 2; GST 4-4; GT ( 218) 1248 312.0 7.7e-86
+sp|P04906|GSTP1_RAT Glutathione S-transferase P; Chain 7; GST - ( 210) 344 90.4 3.8e-19
+sp|P00502|GSTA1_RAT Glutathione S-transferase alpha-1; GST 1-1 ( 222) 237 64.1 3.2e-11
+sp|P14942|GSTA4_RAT Glutathione S-transferase alpha-4; GST 8-8 ( 222) 179 49.9 6.1e-07
+sp|P12653|GSTF1_MAIZE Glutathione S-transferase 1; GST class-pi ( 214) 120 35.4 0.013
+sp|P04907|GSTF3_MAIZE Glutathione S-transferase 3; GST class-pi ( 222) 115 34.2 0.032
+sp|P20432|GSTT1_DROME Glutathione S-transferase 1-1; DDT-dehydr ( 209) 100 30.5 0.38
+sp|P11277|SPTB1_HUMAN Spectrin beta chain, erythrocytic; Beta- (2137) 108 31.6 1.9
+... (alignments deleted) ...
+>>sp|P14942|GSTA4_RAT Glutathione S-transferase alpha-4; GST 8-8; (222 aa)
+ s-w opt: 179 Z-score: 231.0 bits: 49.9 E(13143): 6.1e-07
+Smith-Waterman score: 179; 25.6% identity (54.5% similar) in 211 aa overlap (5-207:7-207)
+ 10 20 30 40 50 60 70
+mGSTM MPMILGYWNVRGLTHPIRMLLEYTDSSYDEKRYTMGDAPDFDRSQWLNEKF-KLG-LDFPNLPYL-IDGSHKITQSNA
+ : :.. :: . :: :: . ..: .: ... ::. : : : : ..: . ::: .::. :
+sp|P14 MEVKPKLYYFQGRGRMESIRWLLATAGVEFEE---------EFLETREQYEKLQKDGCLLFGQVPLVEIDG-MLLTQTRA
+ 10 20 30 40 50 60 70
+ 80 90 100 110 120 130 140 150
+mGSTM ILRYLARKHHLDGETEEERIRADIVENQVMDTRMQLIMLCYNPDFEKQKPEFLKTIPEKMKLYSEF--LGK---RPWFAG
+ :: ::: :..: :. .::.: :. . ..: :..: .. ::.. : . : . . : . : . ...:
+sp|P14 ILSYLAAKYNLYGKDLKERVRIDMYADGTQDLMMMIIGAPFKAPQEKEESLALAVKRAKNRYFPVFEKILKDHGEAFLVG
+ 80 90 100 110 120 130 140 150
+ 160 170 180 190 200 210
+mGSTM DKVTYVDFLAYDILDQYRMFEPKCLDAFPNLRDFLARFEGLKKISAYMKSSRYIATPIFSKMAHWSNK
+ ......:. . . . . :. :: :. : .:. .. :. ... . :
+sp|P14 NQLSWADIQLLEAILMVEEVSAPVLSDFPLLQAFKTRISNIPTIKKFLQPGSQRKPPPDGHYVDVVRTVLKF
+ 160 170 180 190 200 210 220
+... (alignments deleted) ...
+218 residues in 1 query sequences
+5121825 residues in 13143 library sequences
+ Tcomplib [36.3.6 May, 2013(preload9)] (4 proc in memory [0G])
+ start: Thu Jun 6 11:23:28 2013 done: Thu Jun 6 11:23:30 2013
+ Total Scan time: 3.820 Total Display time: 0.130
+Function used was SSEARCH [36.3.6 May, 2013(preload9)]
+\end{verbatim}
+\end{quote}
+\end{footnotesize}
+\vspace{-4.0ex}
diff --git a/doc/fasta_guide.fg2.tex b/doc/fasta_guide.fg2.tex
new file mode 100644
index 0000000..d636b7d
--- /dev/null
+++ b/doc/fasta_guide.fg2.tex
@@ -0,0 +1,25 @@
+\begin{footnotesize}
+\begin{verbatim}
+>>GST26_SCHMA Glutathione S-transferase class-mu (218 aa)
+ initn: 422 init1: 359 opt: 407 Z-score: 836.8 bits: 162.0 E(437847): 3.7e-39
+Smith-Waterman score: 451; 42.4% identity (73.4% similar) in 203 aa overlap (6-208:6-203)
+
+ 10 20 30 40 50 60 70 80
+mGSTM1 MPMILGYWNVRGLTHPIRMLLEYTDSSYDEKRYTMGDAPDFDRSQWLNEKFKLGLDFPNLPYLIDGSHKITQSNAILRYL
+ :::.:.::..: :.:::. ...:.:. : : ..: : :.::::::.:::::: :::. :.::: ::.::.
+GST26_ MAPKFGYWKVKGLVQPTRlllehleetyeeRAY---DRNEIDA--WSNDKFKLGLEFPNLPYYIDGDFKLTQSMAIIRYI
+ 10 20 30 40 50 60 70
+
+ 90 100 110 120 130 140 150 160
+mGSTM1 ARKHHLDGETEEERIRADIVENQVMDTRMQLIMLCYNPDFEKQKPEFLKTIPEKMKLYSEFLGKRPWFAGDKVTYVDFLA
+ : ::.. : .:: . ...:. :.: :: .. ..:: ..: : .::. .: ..:.... :... .. :. ::. ::.
+GST26_ ADKHNMLGACPKERAEISMLEGAVLDIRMGVLRIAYNKEYETLKVDFLNKLPGRLKMFEDRLSNKTYLNGNCVTHPDFML
+ 80 90 100 110 120 130 140 150
+
+ 170 180 190 200 210
+mGSTM1 YDILDQYRMFEPKCLDAFPNLRDFLARFEGLKKISAYMKSSRYIATPIFSKMAHWSNK
+ :: :: .. .::. ::.: .: .: : .:. :..::::: :.
+GST26_ YDALDVVLYMDSQCLNEFPKLVSFKKCIEDLPQIKNYLNSSRYIKWPLQGWDATFGGGDTPPK
+ 160 170 180 190 200 210
+\end{verbatim}
+\end{footnotesize}
diff --git a/doc/fasta_guide.pdf b/doc/fasta_guide.pdf
new file mode 100644
index 0000000..0e04c65
Binary files /dev/null and b/doc/fasta_guide.pdf differ
diff --git a/doc/fasta_guide.tex b/doc/fasta_guide.tex
new file mode 100644
index 0000000..b88568d
--- /dev/null
+++ b/doc/fasta_guide.tex
@@ -0,0 +1,2115 @@
+\documentclass[11pt]{article}
+\RequirePackage{helvet}
+\newcommand{\CURRENT}{fasta-36.3.8}
+\renewcommand{\familydefault}{\sfdefault}
+\usepackage{cite}
+\usepackage{url}
+\usepackage{fancyhdr}
+\usepackage{url}
+\usepackage{needspace}
+\usepackage{longtable}
+\addtolength{\oddsidemargin}{-0.75in}
+\addtolength{\evensidemargin}{-0.75in}
+\addtolength{\textwidth}{1.5in}
+\addtolength{\topmargin}{-0.75in}
+\addtolength{\textheight}{1.5in}
+
+\lhead{\CURRENT}
+\rhead{\today}
+\cfoot{\thepage}
+\pagestyle{fancy}
+\newcommand{\FASTA}{\texttt{FASTA }}
+\hyphenation{Swiss-Prot}
+
+\parskip 0.5ex
+
+\begin{document}
+%% .he 'FASTA3.DOC''Release 3.6, March, 2011'
+
+\section*{\Large{The FASTA program package}}
+
+%% \begin{quote}
+%% \emph{This document is undergoing extensive revision.
+%% Some parts of it are old; while still accurate, they are less
+%% relevant. Recent improvements to the FASTA programs are not well
+%% documented, particularly with respect to options for selecting
+%% databases and database sequences.}
+%% \end{quote}
+
+\section*{Introduction}
+
+This documentation describes version 36 of the FASTA program
+package (see W. R. Pearson and D. J. Lipman (1988), ``Improved Tools
+for Biological Sequence Comparison'', PNAS 85:2444-2448 \cite{wrp881};
+W. R. Pearson (1996) ``Effective protein sequence comparison''
+Meth. Enzymol. 266:227-258 \cite{wrp960}; and Pearson et al. (1997)
+Genomics 46:24-36 \cite{wrp973}). Version 3 of the FASTA package
+contains many programs for searching DNA and protein databases and for
+evaluating statistical significance from randomly shuffled sequences.
+
+This document is divided into four sections: (1) A summary overview of
+the programs in the FASTA3 package; (2) A guide to using the FASTA
+programs; (3) A guide to installing the programs and
+databases. Section (4) provides answers to some Frequently Asked
+Questions (FAQs). In addition to this document, the
+\texttt{changes\_v36.html}, \texttt{changes\_v35.html} and
+\texttt{changes\_v34.html} files list functional changes to the programs.
+The \texttt{readme.v30..v36} files provide a more complete revision
+history of the programs, including bug fixes.
+
+The programs are easy to use; if you are using them on a machine that
+is administered by someone else, you can focus on sections (1) and (2)
+to learn how to use the programs. If you are installing the programs
+on your own machine, you will need to read section (3) carefully.
+
+\emph{FASTA and BLAST} -- FASTA and BLAST have the same goal: to
+identify statistically significant sequence similarity that can be
+used to infer homology. The FASTA programs offer several advantages
+over BLAST:
+\begin{enumerate}
+\item
+Rigorous algorithms unavailable in BLAST (Table I). Smith-Waterman
+(\texttt{ssearch36}), global: global (\texttt{ggsearch36}), and
+global:local (\texttt{glsearch36}) programs are available, and these
+programs can be used with \texttt{psiblast} PSSM profiles.
+\item
+Better translated alignments. \texttt{fastx36}, \texttt{fasty36},
+\texttt{tfastx36}, and \texttt{tfasty36} allow frame-shifts in
+alignments; because frame-shifts are treated like gap-penalties,
+alignments tend to be longer in error-prone reads.
+\item
+Better statistics. BLAST calculates very accurate statistics for
+protein:protein alignments, but its model-based strategy is less
+robust for translated-DNA:protein and DNA:DNA scores. FASTA uses an
+empirical estimation strategy, and now provides both search-based, and
+high-scoring shuffle-based statistics (\texttt{-z 21}).
+\item
+More flexible library sequence formats. The FASTA programs can read
+FASTA, NCBI/ \texttt{formatdb}, and several other sequence formats, and can
+directly query MySQL and Postgres databases. The programs offer
+several strategies for specifying subsets of databases.
+\item
+A very efficient threaded implementation. The FASTA programs are
+fully threaded; both similarity scores and alignments can be
+calculated in parallel on multi-core hardware. On multi-core
+machines, FASTA can be faster than BLAST while producing better
+alignments with more accurate statistical estimates.
+\item
+ A powerful annotation facility. The FASTA programs can incorporate
+ functional site annotations, site variation, and domain-based
+ sub-alignment scoring using annotations from sequence libraries.
+ Scripts are available to download site and domain information from
+ Uniprot, and domain information from Pfam and CATH. Domain
+ information can be used to sub-divide alignment scores to ensure
+ that the aligned domain is homologous.
+
+\end{enumerate}
+
+In addition, the FASTA programs from \texttt{fasta-36.3.4} on provide
+an option to produce very BLAST-like output (\texttt{-m BB}), so that
+analysis pipelines require minimal modification.
+
+\section{An overview of the \texttt{FASTA} programs}
+
+\begin{table}
+\caption{\label{table1} Comparison programs in the FASTA36 package}
+\vspace{0.5ex}
+\begin{tabular}{ p{0.8in} p{0.6in} p{4.6 in}}
+\hline \\[-1.0ex]
+FASTA \mbox{program} & BLAST equiv. & Description \\[1.2ex]
+\hline \\[-1.0ex]
+\texttt{fasta36} & \texttt{blastp}/ \texttt{blastn} &
+Compare a protein sequence to a protein sequence
+database or a DNA sequence to a DNA sequence database using the FASTA
+algorithm \cite{wrp881,wrp960}. Search speed and selectivity are
+controlled with the \emph{ktup}(wordsize) parameter. For protein
+comparisons, \emph{ktup} = 2 by default; \emph{ktup} =1 is more sensitive
+but slower. For DNA comparisons, \emph{ktup}=6 by default; \emph{ktup}=3 or
+\emph{ktup}=4 provides higher sensitivity.\\[1 ex]
+
+\texttt{ssearch36} & & Compare a protein sequence to a protein sequence
+database or a DNA sequence to a DNA sequence database using the
+Smith-Waterman algorithm \cite{wat815}. \texttt{ssearch36} uses SSE2
+acceleration, and is only 2 - 5X slower than \texttt{fasta36} \cite{farrar2007}. \\[1 ex]
+
+\texttt{ggsearch36}/ \texttt{glsearch36} & & Compare a protein sequence to a protein sequence
+database or a DNA sequence to a DNA sequence database using
+an optimal global:global (\texttt{ggsearch36}) or global:local
+(\texttt{glsearch36}) algorithm.\\[1 ex]
+
+\texttt{fastx36}/ \texttt{fasty36} & \texttt{blastx} &
+Compare a DNA sequence to a protein
+sequence database, by comparing the translated DNA sequence in three
+frames and allowing gaps and frameshifts. \texttt{fastx36} uses a
+simpler, faster algorithm for alignments that allows frameshifts only
+between codons; \texttt{fasty36} is slower but can produce better alignments
+because frameshifts are allowed within codons \cite{wrp971}.\\[1 ex]
+
+\texttt{tfastx36}/ \texttt{tfasty36}& \texttt{tblastn} &
+Compare a protein sequence to a DNA sequence
+database, calculating similarities with frameshifts to the forward and
+reverse orientations \cite{wrp971}.\\[1 ex]
+
+\texttt{fastf36/ tfastf36} & &
+Compares an ordered peptide mixture, as would be obtained by
+Edman degradation of a CNBr cleavage of a protein, against a protein
+(\texttt{fastf}) or DNA (\texttt{tfastf}) database \cite{wrp021}.\\[1 ex]
+
+\texttt{fasts36/ tfasts36} & &
+Compares a set of short peptide fragments, as would be obtained
+from mass-spec. analysis of a protein, against a
+protein (\texttt{fasts}) or DNA (\texttt{tfasts}) database \cite{wrp021}.\\[1 ex]
+
+\texttt{lalign36} & & Calculate multiple, non-intersecting alignments
+using the sim2 implementation of the Waterman-Eggert
+algorithm\cite{wat875} developed by Xiaoqiu Huang and Webb
+Miller\cite{mil908}. Statistical estimates are calculated from
+Smith-Waterman scores of shuffled sequences. \\[1 ex]
+
+\hline \\
+\end{tabular}
+\end{table}
+
+Although there are a large number of programs in this package, they
+belong to three groups: (1) Traditional similarity searching programs:
+\texttt{fasta36}, \texttt{fastx36}, \texttt{fasty36},
+\texttt{tfastx36}, \texttt{tfasty36}, \texttt{ssearch36},
+\texttt{ggsearch36}, and \texttt{glsearch36}; (2) Programs for
+searching with short fragments: \texttt{fasts36}, \texttt{fastf36},
+\texttt{tfasts36}, \texttt{tfastf36}, and \texttt{fastm36}; (3) A
+program for finding non-overlapping local alignments: \texttt{lalign36}.
+Programs that start with \texttt{fast} search protein databases, while
+\texttt{tfast} programs search translated DNA databases. Table I
+gives a brief description of the programs.
+
+In addition, there are several programs included. \texttt{map\_db} is
+used to index FASTA format sequence databases for more efficient
+scanning. \texttt{scripts/lav2plt.pl} can plot the \texttt{.lav}
+files produced by \texttt{lalign -m 11} as postscript
+(\texttt{lav2plt.pl --dev ps}) or SVG (\texttt{lav2plt.pl --dev svg}) output.
+
+\section{Using the FASTA Package}
+\subsection{Introduction/Overview}
+
+All the FASTA sequence comparison programs use similar command line
+options and arguments. The simplest command line arguments are (in
+order): the name of a query sequence file, a library file, and
+(possibly) the \emph{ktup} parameter. If command line options are
+provided, they \emph{must} precede the standard query-file and
+library-file arguments. Thus:
+\begin{quote}
+\texttt{fasta36 -s BP62 query.file library.file}
+\end{quote}
+will compare the sequences in \texttt{query.file} with those in
+\texttt{library.file} using the \texttt{BLOSUM62} scoring matrix with
+BLASTP gap penalties (\texttt{-11/-1}).
+
+The program can also be run by typing:
+\begin{quote}
+\texttt{fasta36 -I}
+\end{quote}
+which presents the ``classic'' interactive mode (this was
+the default behavior before version \texttt{36.3.4}).
+In interactive mode,
+you will be prompted for: (1) the name of the test sequence file; (2)
+the name of the library file; (3) whether you want ktup = 1 or 2
+(1 -- 6 for DNA sequences).
+
+Current versions of the FASTA programs expect a query file and library; if you simply type ``\texttt{fasta36}'', you will see a short help message:
+\begin{footnotesize}
+\begin{quote}
+\begin{verbatim}
+% ssearch36
+USAGE
+ ssearch36 [-options] query_file library_file
+ ssearch36 -help for a complete option list
+
+DESCRIPTION
+ SSEARCH performs a Smith-Waterman search
+ version: 36.3.4 Mar, 2011
+
+COMMON OPTIONS (options must precede query_file library_file)
+ -s: [BL50] scoring matrix;
+ -f: [-10] gap-open penalty;
+ -g: [-2] gap-extension penalty;
+ -S filter lowercase (seg) residues;
+ -b: high scores reported (limited by -E by default);
+ -d: number of alignments shown (limited by -E by default);
+ -I interactive mode;
+\end{verbatim}
+\end{quote}
+\end{footnotesize}
+``\texttt{fasta36 -help}'' (or any of the other program names in Table
+I) provides a complete listing of the options available for the program
+and their default values.
+
+The package includes several test files. To check to make certain
+that everything is working, you can try:
+\begin{quote}
+\begin{verbatim}
+fasta36 ../seq/musplfm.aa ../seq/prot_test.lib
+or
+tfastx36 ../seq/mgstm1.aa ../seq/gst.nlib
+\end{verbatim}
+\end{quote}
+
+\subsection{Sequence files}
+
+The \texttt{fasta36} programs can read query and library files in many
+standard formats (Section \ref{fastlibs}). The default file format for query and library files
+-- the format that will be used if no additional file format
+information is provided -- is \texttt{FASTA} format. Like
+\texttt{BLAST}, version 36 can compare a query file with multiple
+query sequences to a sequence database, performing an independent
+search with each sequence in the query file.
+
+FASTA format files consist of a description line, beginning
+with a '$>$' character, followed by the sequence itself:
+\begin{quote}
+\begin{verbatim}
+>sequence name and description 1
+A F A S Y T .... actual sequence.
+F S S .... second line of sequence.
+>sequence name and description 2
+PMILTYV ... sequence 2
+\end{verbatim}
+\end{quote}
+All of the characters of the description line are read, and special
+characters can be used to indicate additional information about the
+sequence. In general, non-amino-acid/non-nucleotide characters in the
+sequence lines are ignored.
+
+FASTA format files from major sequence distributors, like the NCBI and
+EBI, have specially formatted description lines, e.g.:\\
+\indent
+\texttt{
+>gi|54321|ref|np\_12345| example NCBI refseq sequence\\
+}
+or\\
+\indent
+\texttt{
+>sw:gstm1\_human P01234 glutathione transferase GSTM1 - human\\
+}
+
+Several sample test files are included with the FASTA distribution:
+\texttt{seq/*.aa} and \texttt{seq/*.seq}, as well as two small sequence
+libraries, \texttt{seq/prot\_test.lib} and \texttt{seq/gst.nlib}.
+
+You can build your own library by concatenating several sequence
+files. Just be sure that each sequence is preceded by a line
+beginning with a '\texttt{>}' followed by a sequence name/description. Sequences
+entered with word processors should use a ``text'' mode, e.g. ``Save as
+text'' with MS-WORD, with end of line characters and no special
+formatting characters in the file. The FASTA program cannot read
+Microsoft Word .DOC files, or rich text (.RTF) files; query and
+library sequence files should contain only sequence descriptions,
+sequences, and end-of-line characters.
+
+\subsection{Running the programs}
+As mentioned earlier, the FASTA programs can be run either
+interactively, by typing the name of a FASTA program (and possibly
+command line options), followed by \texttt{-I} (\texttt{fasta36 -I})
+or from the command line, entering command line options, and the
+query and library file names. For searches of large databases that
+may take several minutes (or longer), it is more convenient
+to run searches from the command line, e.g.:
+\begin{quote}
+\begin{verbatim}
+fasta36 query.file library.file > output.file
+\end{verbatim}
+\end{quote}
+The command line shown above could be typed in a Unix or MacOSX
+terminal window, or from the MS-Windows command line interface
+(cmd.exe). The command line syntax shown above works for all
+the FASTA programs, e.g.:
+\begin{quote}
+\begin{verbatim}
+lalign36 mchu.aa mchu.aa > mchu.laln
+fastx36 mgstm1.seq prot_test.lseg > mgstm1.fx_out
+ssearch36 mgstm1.aa xurtg.aa > mgstm1_xurtg.ss
+\end{verbatim}
+\end{quote}
+
+\emph{Command line options} -- The FASTA programs provide a variety of
+command line options that modify the default scoring matrix
+(\texttt{-sBL62}) and gap penalties (\texttt{-f -11}, \texttt{-g -1}), other
+algorithm parameters, the output options (\texttt{-E 0.1}, \texttt{-d 20},
+\texttt{-m 9i}), and statistical procedures (\texttt{-z 2}). A complete
+list of command line options is shown near the end of this document.
+Unlike the \texttt{BLAST} programs, all \texttt{FASTA} command line options
+must precede the query file name and library file name (and there are
+no command line options available to specify the query and library
+file names). Thus, you should type:
+\begin{quote}
+\begin{verbatim}
+ssearch36 -s BL62 -f -11 -g -1 query.file library.file > output.file
+\end{verbatim}
+\end{quote}
+If you include \texttt{-I} as one of the options, you can provide
+command line options (e.g. to change the scoring matrix or gap
+penalties) without a query file or library file, and the program will
+use the options but prompt for the necessary files.
+
+\subsection{Interpreting the results}
+
+Fig. \ref{ssearch_run} shows the output from a typical FASTA program
+(\texttt{ssearch36}). The output file can be
+viewed as four parts: (a) the initial command line and description of
+the query sequence used (mgstm1.aa, 218 aa) and library (PIR1, 13,143
+entries); (b) a description of the search statistics, algorithm
+(Smith-Waterman, SSE2 accelerated), and search parameters (BLOSUM50
+matrix, gap penalties: -10 to open a gap, -2 for each residue in a
+gap); (c) a list of high scoring library sequences, descriptions, similarity scores, and statistical significance; (d) the alignments that produced the scores.
+
+\begin{figure}
+\include{fasta_guide.fg1}
+\caption{\label{ssearch_run}\texttt{ssearch36} results}
+\vspace{1.0ex}
+Comparison of \texttt{seq/mgstm1.aa} against a small protein database
+(\texttt{pir1.lseg}). Some high-scoring sequences and all but one
+alignment were removed to reduce the output size.
+\end{figure}
+
+\subsubsection{Identifying homologs}
+In the description section (which starts: \texttt{The best scores
+ are:}), four numbers after the description of each library sequence
+are shown: (i) (in parentheses) the length of the library sequence;
+(ii) the raw Smith-Waterman score for the alignment (\texttt{s-w}; for
+the \texttt{fasta36}, \texttt{[t]fast[x,y]36} programs, this column
+would be labeled \texttt{opt}, for the \emph{opt}imized -- banded
+Smith-Waterman -- score), (iii) the \emph{bit} score, and (iv) the
+expectation (E()), or statistical significance, of the alignment
+score. The E()-value depends on the size of the database searched, in
+this case, 13,143 sequences, so the database size is given at the top
+of the list.
+
+The bit score is equivalent to a BLAST bit score; together with the
+length of query and library sequences, it can be used to calculate the
+significance of the alignment.\footnote{$E(D) = D m n 2^{-b}$,
+where $D$ is the number of sequences in the database, $m, n$ are the
+lengths of the two sequences, and $b$ is the bit score.} Bit scores
+are convenient because they provide a matrix independent score that
+can be compared with other searches performed with other matrices and
+gap penalties against other databases. However, the E()-value, or
+expectation, provides the most direct measure of the statistical
+significance of the match.
+
+In this example, the \texttt{GSTP1\_RAT}, \texttt{GSTA1\_RAT}, and
+\texttt{GSTA4\_RAT} proteins share strong significant similarity
+($E() \le 6.1 \times 10^{-7}$), while the
+\texttt{GSTF1\_MAIZE}, \texttt{GSTF3\_MAIZE}, and
+\texttt{GSTT1\_DROME} sequences do not share significant similarity
+($E() > 0.001$). However, \texttt{GSTF1\_MAIZE},
+\texttt{GSTF3\_MAIZE}, and \texttt{GSTT1\_DROME} are all glutathione
+transferase homologs; they simply do not share statistically
+significant similarity with this particular \texttt{mGSTM1} query.
+Statistically significant sequence similarity scores \emph{can} be
+used to infer \emph{homology} (common ancestry), but non-significant
+scores \emph{cannot} be used to infer \emph{non-homology}.
+
+While percent identity is often used to characterize the quality of an
+alignment and the likelihood that it reflects homology, the E()-value
+is a much more reliable value for homology inference (once homology is
+established, the percent identity is much more useful for estimating
+evolutionary distance). Often sequences that share less than 30\%
+identity will share very significant similarity (in the example above
+\texttt{mgstm1.aa} and \texttt{GSTA4\_RAT}, with E() $<$ 6.1E-07 are
+25.6\% identical). The expectation value captures information about
+conservative replacements, identities, and alignment length to provide
+a \emph{single} value that captures the significance of the alignment.
+
+For protein searches, library sequences with E()-values $<$ 0.001 for
+searches of a 10,000 entry protein database are almost always
+homologous. Some sequences with E()-values from 1 - 10 may also be
+related, but unrelated sequences ( 1--10 per search) will have scores
+in this range as well.
+
+E()-values $<$ 0.001 can reliably be used to infer homology, assuming
+that the statistical estimates are accurate. The two most common
+causes of statistical problems are low-complexity regions and
+amino-acid composition bias. Low-complexity regions can be
+identified using the \texttt{pseg} program \cite{woo935}, and filtered
+out using the \texttt{-S} option. Composition bias rarely produces
+highly significant E()-values, but can cause unrelated sequences to
+have E()-values between 0.01 and 0.001. The FASTA programs offer two
+shuffle-based strategies for evaluating composition bias; calculating
+similarity scores for random sequences with the same length and amino
+acid composition (\texttt{-z 11} $..$ \texttt{16}),\footnote{Random
+ shuffles are performed for pairwise alignments and \texttt{lalign36}
+ by default.} and calculating statistical estimates derived from
+shuffles of the high-scoring sequences (\texttt{-z 21, 22, 24, 25,
+ 26}).
+
+When \texttt{-z 21 .. 26} shuffles are performed, the FASTA36 programs
+present two E()-values in the list of high scoring sequences and the
+alignments; the traditional one based on the library search, and a
+second \texttt{E2()} value, based on the shuffles of the high scoring
+sequences. \texttt{-z 21 .. 26} shuffles are most useful for
+evaluating the significance of translated-DNA:protein searches like
+\texttt{fastx36}. Out-of-frame translations can produce cryptic low
+complexity regions, which are most apparent when the high-scoring
+sequences are shuffled. \texttt{-z 21} shuffles are more efficient
+than individual sequence shuffles, because the set of high scoring
+sequences is shuffled 500 -- 1,000 times, rather than 500 shuffles for
+each of 50 -- 100 high scoring library sequences. It is almost as
+effective, because homologous sequences share similar amino-acid
+composition.
+
+The statistical routines assume that the library contains a large
+sample of unrelated sequences. If the library contains fewer than
+500 sequences (\texttt{MAX\_RSTATS}), then the library sequences are
+shuffled to produce 500 random scores, from which lambda and K
+statistical parameters are estimated. If the library contains a large
+number of \emph{related} sequences, then the statistical parameters
+should be estimated by using the \texttt{-z 11-15} options.
+\texttt{-z} options greater than 10 calculate a shuffled similarity
+score for each library sequence, in addition to the unshuffled score,
+and estimate the statistical parameters from the scores of the
+shuffled sequences.
+
+\subsubsection{Looking at alignments}
+
+The description section described above contains the critical
+information for inferring homology, the \texttt{E()}-value. The
+alignment section shows the actual alignments that produced the
+similarity score and statistical estimates. In
+Fig. \ref{ssearch_run}, the alignment display reports the percent
+identity, percent similarity (number of aligned residues with BLOSUM50
+values $\ge$ 0), and the boundaries of the alignment. Note that for
+\texttt{ssearch36} and \texttt{fasta36}, the alignment shown can
+include residues that are not part of the best local alignment
+(e.g. residues 1--5 and 207--218 in \texttt{mGSTM1} in
+Fig. \ref{ssearch_run}). The amount of additional sequence context
+shown is the alignment line length (60 residues, set by \texttt{-w
+ len}) divided by 2 by default, but can be adjusted with the
+\texttt{-W context} option.
+
+\begin{figure}
+\include{fasta_guide.fg2}
+\vspace{-3.0ex}
+\caption{\label{seg-aln}Alignment with \texttt{-S} filtered sequence}
+\end{figure}
+
+Fig. \ref{seg-aln} shows an example of a \texttt{fasta36} alignment
+produced using the \texttt{-S} option to filter out lower-case (low
+complexity) residues. Here, additional scores (\texttt{initn},
+\texttt{init1}) are shown, in addition to the \texttt{opt} score which
+is used to rank the sequences and calculate statistical significance.
+The \texttt{init1} score is the highest scoring alignment without
+gaps; \texttt{initn} is a score that combines consistent
+(non-overlapping) runs without gaps, and \texttt{opt} is the score of
+a banded Smith-Waterman of width 16 for \emph{ktup=2} that is applied
+to sequences with \texttt{initn} scores over the optimization
+threshold. In Fig. \ref{seg-aln}, the \texttt{init1} score is
+based on the long, un-gapped region from residues 46--208 in
+\texttt{mGSTM1}, while the \texttt{initn} and \texttt{opt} scores
+include the other regions joined by gaps. The \texttt{initn} score is
+higher than the \texttt{opt} score, because it uses a simpler,
+length-independent, gap penalty.
+
+The \texttt{init1} and \texttt{initn} scores are shown for
+historical reasons, and can be used to illustrate the FASTA algorithm.
+But the \texttt{opt} score is the most reliable and sensitive score
+for inferring homology; the others can be ignored.
+
+For \texttt{fasta36} with proteins, the final alignment and score are
+calculated with the Smith-Waterman algorithm. For DNA sequences, a
+banded Smith-Waterman is used. (The \texttt{-A} option produces banded
+Smith-Waterman alignments for proteins, and full Smith-Waterman for
+DNA.) In Fig. \ref{seg-aln}, the \texttt{opt} score and
+\texttt{Smith-Waterman} scores are calculated on exactly the same
+alignment, but the \texttt{opt} score excludes the contribution from
+the ``low-complexity'' region between 19--30 in \texttt{GST26\_SCHMA}.
+
+\subsubsection{Results without alignments}
+
+While sequence alignments are very informative, it is often not
+practical to examine all the statistically significant alignments in
+large-scale searches. The \texttt{-m 9} and \texttt{-m 8} options
+present summaries of each alignment (alignment boundaries, percent
+identity, and other information) in a much more compact
+form. \texttt{-m 9i} adds three columns to the summary report line,
+the fraction identical, fraction similar, and alignment length (in
+addition, variants are reported if they affect identity). \texttt{-m
+ 9I} is similar to \texttt{-m 9i}, but also reports domain content if
+annotations are used. \texttt{-m 9c} or \texttt{-m 9C} (see options
+below) provide a detailed encoding of the alignment, that allows it to
+be reconstructed. For large-scale searches, we routinely use
+\texttt{-m 8} with the \texttt{-d 0} option, which sets the number of
+alignments shown to 0 (thus none are shown). Alternatively, the
+\texttt{-m 8} and \texttt{-m 8C} output options produce BLAST-format
+tabular results summaries (\texttt{-m 8C} provides commented tabular
+results). \texttt{-m 8CC} adds an alignment CIGAR string and
+annotation string to the BLAST tabular format. \texttt{-m BB}
+produces an output that mimics BLAST output (with alignments).
+
+\subsubsection{Alignments with annotations}
+
+The command line \texttt{-V} option (described below) causes the FASTA
+programs to ``decorate'' their sequence alignments with annotation
+information, such as functional sites, variants, and domain-based
+sub-alignment scores. For example, a comparison of the
+\texttt{seq/gstm1\_human.vaa} sequence with SwissProt using the
+\texttt{scripts/ann\_feats\_up\_www2.pl} script:
+\begin{footnotesize}
+\begin{quote}
+\begin{verbatim}
+ssearch36 -m 9i -V \!../scripts/ann_feats_up_www2.pl ../seq/gstm1_human.vaa /slib/swissprot.fa
+\end{verbatim}
+\end{quote}
+\end{footnotesize}
+Produces the following additional output:
+\begin{footnotesize}
+\begin{verbatim}
+Annotation symbols:
+ = : Active site
+ * : Modified
+ # : Substrate binding
+ ^ : Metal binding
+ @ : Site
+
+The best scores are: s-w bits E(458668) %_id %_sim alen
+sp|P09488.3|GSTM1_HUMAN Glutathione ( 218) 1500 375.2 2.4e-103 1.000 1.000 218 |Var: K173N;S210T;
+sp|Q03013.3|GSTM4_HUMAN Glutathione ( 218) 1375 344.5 4.4e-94 0.904 0.963 218 |Var: S2P;A160V;L208V;Y209F;...
+sp|Q5R8E8.3|GSTM2_PONAB Glutathione ( 218) 1310 328.5 2.9e-89 0.862 0.959 218
+sp|Q9TSM5.3|GSTM1_MACFA Glutathione ( 218) 1308 328.0 4e-89 0.858 0.954 218
+sp|Q9TSM4.3|GSTM2_MACFA Glutathione ( 218) 1307 327.7 4.8e-89 0.862 0.954 218
+sp|P28161.2|GSTM2_HUMAN Glutathione ( 218) 1306 327.5 5.7e-89 0.853 0.959 218 |Var: S173N;
+sp|P46439.3|GSTM5_HUMAN Glutathione ( 218) 1305 327.2 6.7e-89 0.876 0.954 218 |Var: L179P;
+\end{verbatim}
+\end{footnotesize}
+This summary of high scoring hits shows one of the effects of
+annotation---substitution of variant residues to increase the score.
+In this case, the query sequence \texttt{gstm1\_human.vaa} is a known
+variant of the canonical \texttt{GSTM1\_HUMAN}/\texttt{P09488} UniProt
+sequence. Without the \texttt{-V} annotation option,
+\texttt{gstm1\_human.vaa} would be 99\% identical to
+\texttt{GSTM1\_HUMAN}, but because UniProt documents the variant
+residues in the feature table, the \texttt{K173N} and \texttt{S210T}
+substitutions are made in the library (subject) sequence, producing a
+perfect match.
+
+In addition to the variant substitution shown above, the alignments
+provide a more complete view of the annotations available on the
+library (subject) proteins. Below is the report for the
+\texttt{GSTM4\_HUMAN} alignment:
+\begin{footnotesize}
+\begin{quote}
+\begin{verbatim}
+>>sp|Q03013.3|GSTM4_HUMAN Glutathione S-trans (218 aa)
+ Variant: 2P=2P : S2P : UniProtKB FT ID: VAR_033979
+ Site:@ : 7Y=7Y : Site: Glutathione binding
+ Site:@ : 46W=46W : Site: Glutathione binding
+ Site:@ : 59N=59N : Site: Glutathione binding
+ Site:@ : 72Q=72Q : Site: Glutathione binding
+ Region: 2-88:2-88 : score=599; bits=150.1; Id=0.989; Q=415.2 : GST N-terminal :1
+ Site:# : 116Y=116Y : Substrate binding: Substrate
+ Variant: 160V=160V : A160V : UniProtKB FT ID: VAR_033980
+ Variant: 208V=208V : L208V : UniProtKB FT ID: VAR_049487
+ Region: 90-208:90-208 : score=699; bits=175.1; Id=0.833; Q=489.3 : GST C-terminal :2
+ Variant: 209F=209F : Y209F : UniProtKB FT ID: VAR_049488
+ Variant: 211K=211K : R211K : UniProtKB FT ID: VAR_049489
+ Variant: 212M=212M : V212M : UniProtKB FT ID: VAR_049490
+ s-w opt: 1375 Z-score: 1823.2 bits: 344.5 E(458668): 4.4e-94
+Smith-Waterman score: 1375; 90.4% identity (96.3% similar) in 218 aa overlap (1-218:1-218)
+\end{verbatim}
+\end{quote}
+\end{footnotesize}
+This report can be broken into three parts: (1) information on
+variants, described above, (2) information on annotated sites, and (3)
+information on annotated domains. For each annotated site, the
+coordinate and amino-acid residue in the query and library (subject)
+sequence is shown, as well as the conservation state (\texttt{=} in
+all these examples). For annotated domains, the overall alignment
+score is broken into pieces, based on the boundaries of the domains.
+In this case, the full alignment extends from residues 1--218,
+producing a raw Smith-Waterman score of 1375 and a bit score of
+344.5. The GST N-terminal domain is annotated from residue 2--88 on
+\texttt{GSTM4\_HUMAN}, and the 2--88 region of the alignment produces
+a score of 599 and bit score of 150.1. This region is 98.9\%
+identical, and the probability of that similarity score is
+$10^{-41.52}$ (the Q-value score is $-10 \log_{10} P$). The
+alignment associated with GST C-terminal domain is slightly less well
+conserved (83.3\% identical), but longer, so it produces a higher
+Smith-Waterman score (699), bit score (175.1) and Q-value.
+
+Sub-alignment scores can be used to identify cases of alignment
+over-extension \cite{wrp136}, where an alignment extends well beyond
+the homologous domain. In this case, the homologous region will
+produce the vast majority of the score, and the non-homologous
+over-extension will produce very little score. For example, when
+\texttt{SRC8\_HUMAN} aligns with \texttt{LASP1\_MOUSE}, the alignment
+spans 200 residues, but only about 49 of those residues, an
+\texttt{SH3} domain, are homologous:
+
+\begin{footnotesize}
+\begin{quote}
+\begin{verbatim}
+>>sp|Q61792.1|LASP1_MOUSE LIM and SH3 domain protein 1; LASP-1; (263 aa)
+ Region: 369-398:66-95 : score=20; bits=13.7; Id=0.200; Q=0.0 : Nebulin_repeat InterPro
+ Region: 400-434:97-131 : score=-8; bits=8.7; Id=0.150; Q=0.0 : Nebulin_repeat InterPro
+ Region: 435-499:132-203 : score=13; bits=11.4; Id=0.197; Q=0.0 : NODOM :0
+ Region: 499-547:204-261 : score=124; bits=47.8; Id=0.474; Q=92.1 : SH3 InterPro
+ s-w opt: 148 Z-score: 253.4 bits: 55.6 E(459565): 1.2e-06
+Smith-Waterman score: 159; 26.3% identity (55.6% similar) in 205 aa overlap (369-547:66-261)
+\end{verbatim}
+\end{quote}
+\end{footnotesize}
+The 150 aligned residues outside the \texttt{SH3} homology produce
+less than 20\% of the alignment score, while spanning two
+non-homologous Nebulin repeat domains.
+
+\subsection{Program Options}
+
+Command line options are available to change the scoring parameters
+and output display. Unlike the NCBI BLAST programs, command line
+options \emph{must} precede the query file name and library file name
+arguments. To see the command-line options for a program and their
+defaults, type \texttt{program\_name -help}, e.g. \texttt{fasta36
+ -help} or \texttt{ssearch36 -help}. For a quick list of the most
+common options, just type the program name without any options
+(e.g. \texttt{fasta36$<$ret$>$}).
+
+\subsubsection{Command line options}
+\begin{description}
+\item[\texttt{-a}] (\texttt{fasta36}, \texttt{ssearch36},
+ \texttt{glsearch36}, \texttt{fasts36}) show both sequences in their
+ entirety.
+\item[\texttt{-A}] force Smith-Waterman alignments for
+ \texttt{fasta36} DNA sequences. By default, only \texttt{fasta36}
+ protein sequence comparisons use Smith-Waterman alignments.
+ Likewise, for proteins, use band alignments (Smith-Waterman is used
+ by default).
+\item[\texttt{-b \#}] Number of sequence scores to be shown on output.
+ In the absence of this option, \texttt{fasta36} (and
+ \texttt{ssearch36}) display all library sequences obtaining
+ similarity scores with expectations less than the expectation (-E)
+ threshold, 10.0 for proteins, and 2.0 for DNA:DNA and
+ protein:translated DNA. The \texttt{-b \#} option can limit the
+ display further. There are two ``sub-modes'' of \texttt{-b}.
+ \texttt{-b =100} will force 100 high scores to be displayed,
+ regardless of the expectation (\texttt{-E}) threshold, and
+ \texttt{-b >1} will show at least \texttt{1}, but is otherwise
+ limited by \texttt{-E}. Thus, \texttt{-b 10} will show \emph{no
+ more than} 10 results, limited by \texttt{-E}; \texttt{-b =10}
+ will always show \emph{exactly} \texttt{10} results, and \texttt{-b
+ >5} will show \emph{at least} \texttt{5} results, but could show
+ many more if more results have E()-values $\le$ \texttt{-E e\_cut}.
+\item[\texttt{-c \#,\#}] (\texttt{fasta36}, \texttt{[t]fast[x,y]36}
+ only) Fraction of alignments optimized (second value is fraction of
+ sequences joined). FASTA36 uses a statistical threshold strategy
+ that joins and optimizes only the fraction of the alignments with an
+ \texttt{initn} score expected \texttt{-c} times. Thus, \texttt{-c
+ 0.05} should optimize about 5\% of sequences. The actual number
+ of sequences optimized (and joined) is displayed in the scoring
+ parameters line. Thus:
+\begin{quote}
+\begin{verbatim}
+Parameters: BL50 matrix (15:-5), open/ext: -10/-2
+ ktup: 2, E-join: 1 (0.687), E-opt: 0.2 (0.294), width: 16
+\end{verbatim}
+\end{quote}
+reports that 20\% of the sequences in the database should have been
+band-optimized, and 29.4\% were. Reducing the \texttt{-c opt} fraction
+improves performance, but dropping the fraction below 0.02 can
+reduce the accuracy of the statistical estimates.
+
+\texttt{-c O} (letter 'O') sets the joining/optimization
+thresholds as they were prior to \texttt{fasta-36.3.3} (original thresholds). Positive
+values set the thresholds to specific score values, as was the case
+in older versions of \texttt{fasta}.
+
+\item[\texttt{-C}]
+length of the sequence name printed at the beginning of alignment
+lines (default 6 characters).
+\item[\texttt{-d \#}]
+Maximum number of alignments to be displayed (must be less than or equal to the number of descriptions, \texttt{-b \#}).
+\item[\texttt{-D}]
+ Provide some debugging output. Used in conjunction
+ with the \texttt{-e expand\_script.sh}, the \texttt{link\_acc\_file}
+ and \texttt{link\_lib\_file} are not deleted during the run; so that
+ \texttt{expand\_script.sh} scripts can be tested.
+
+\item[\texttt{-e expand\_script.sh}]
+
+ Expand the set of sequences that
+ are aligned to beyond the set of sequences searched. When the
+ \texttt{-e expand\_script.sh} option is used, the
+ \texttt{expand\_script.sh} script is run after the initial search
+ scan but before the list of high-scoring sequences is displayed.
+ \texttt{expand\_script.sh} is given a single argument, the name of a
+ file that contains a list of accession strings (the text between the
+ \texttt{>} and the first space ('\textvisiblespace') character)
+ followed by the E()-value for the sequence (separated by a
+ \texttt{<tab>} character), e.g.:
+\begin{quote}
+\begin{verbatim}
+gi|121719|sp|P08010|GSTM2_RAT<tab>2.69e-86
+gi|121746|sp|P09211|GSTP1_HUMAN<tab>1.51e-20
+gi|121749|sp|P04906|GSTP1_RAT<tab>1.16e-19
+gi|62822551|sp|P00502|GSTA1_RAT<tab>9.5e-12
+\end{verbatim}
+\end{quote}
+The script should produce a fasta-formatted list of additional
+sequences printed to \texttt{stdout}. The script is run with the command:
+\begin{quote}
+\begin{verbatim}
+expand_script.sh link_acc.tmp_file > link_lib.tmp_file
+\end{verbatim}
+\end{quote}
+The sequences in \texttt{link\_lib.tmp\_file} (a temporary file name is
+actually used, and the file is deleted unless the \texttt{-D} option
+is used) are then compared and, if they are significant, included in
+the list of high scoring sequences and the alignments. The expanded
+set of sequences does not change the database size or statistical
+parameters, it simply expands the set of high-scoring sequences.
+
+The \texttt{fasta36/misc} directory contains
+\texttt{expand\_uniref50.pl} that uses a mySQL table based on the
+\texttt{uniref50} clusters. Using the script and the
+\texttt{uniref50} cluster information, one can search
+\texttt{uniref50.fasta}, but then expand the hits so that
+\texttt{uniprot} appears to be searched.
+
+\item[\texttt{-E e\_cut [e\_cut\_r]}] Limit the number of scores and
+ alignments shown based on the expected number of scores. Used to
+ override the expectation value of 10.0 (protein:protein; 5.0
+ translated-DNA:protein; 2.0 DNA:DNA) used by default. \texttt{-E
+ 2.0} will show all library sequences with scores with an
+ expectation value $\le$ 2.0. With \texttt{fasta-36}, a second
+ value, \texttt{e\_cut\_r} is available to limit the E()-values of
+ additional sequence alignments between the query and library
+ sequences. If not given, the threshold is \texttt{e\_cut}/10.0. If
+ given with a value $>$ 1.0, \texttt{e\_cut\_r} = \texttt{e\_cut} /
+ value; for a value $<$ 1.0, \texttt{e\_cut\_r} = value. If
+ \texttt{e\_cut\_r} $<$ 0, then the additional alignment option is
+ disabled.
+\item[\texttt{-f \#}]
+Gap open penalty (-10 by default for proteins,
+-12 for DNA, -12 for \texttt{[t]fast[xy]}).
+\item[\texttt{-F \#}]
+Limit the number of scores and alignments shown based on the expected
+number of scores. \texttt{-E \#} sets the highest E()-value shown; \texttt{-F \#} sets
+the lowest E()-value displayed. Thus, \mbox{\texttt{-F 0.0001}} will not show any matches or
+alignments with E() $<$ 0.0001. This allows one to skip over close
+relationships to search for more distant relationships.
+\item[\texttt{-g \#}]
+Penalty per residue in a gap (-2 by default for proteins,
+-4 for DNA, -2 for \texttt{[t]fast[xy]}). A single residue gap costs \texttt{f} $+$ \texttt{g}.
+\item[\texttt{-h}]
+Short help message. Help options with \texttt{':'}, e.g. \texttt{-s:},
+require an argument (\texttt{-s BP62}). Defaults are shown in square
+brackets, e.g.: \texttt{-s:\ [BL50]}.
+\item[\texttt{-help}]
+Long help message
+\item[\texttt{-H}]
+Show histogram.
+\item[\texttt{-i}]
+DNA queries - search with reverse complement. For
+\texttt{tfastx36/y36}, search the reverse complement of the library sequence
+only (complement of \texttt{-3} option).
+\item[\texttt{-I}]
+Interactive mode (the default for versions older than \texttt{fasta-36.3.4}).
+\item[\texttt{-j \#}]
+Penalty for frameshift between codons (\texttt{[t]fastx36}, \texttt{[t]fasty36}) and within a codon (\texttt{fasty36}/ \texttt{tfasty36} only).
+\item[\texttt{-J}]
+(\texttt{lalign36} only) show the identity alignment (normally
+ suppressed, \texttt{-I} in versions before \texttt{fasta-36.3.4}).
+\item[\texttt{-k \#}]
+number of shuffles for statistical estimates from shuffling.
+\item[\texttt{-l file}]
+Location of library menu file (FASTLIBS).
+\item[\texttt{-L}]
+Display longer library sequence description.
+\item[\texttt{-M low-high}]
+Range of amino acid sequence lengths to be included in the search.
+\item[\texttt{-m \#}]
+Specify alignment type: 0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, B, BB, ``F\# out\_file''
+\begin{small}
+\begin{verbatim}
+ -m 0 -m 1 -m 2 -m 3 -m 4
+MWRTCGPPYT MWRTCGPPYT MWRTCGPPYT MWRTCGPPYT
+::..:: ::: xx X ..KS..Y... MWKSCGYPYT ----------
+MWKSCGYPYT MWKSCGYPYT
+\end{verbatim}
+\end{small}
+If the \texttt{-V '*@\%'} annotation option has been used,
+annotations can be included in either the coordinate line (the default) or the
+middle alignment line (\texttt{-m 0M}, \texttt{-m 1M}), or both
+(\texttt{-m 0B}, \texttt{-m 1B}). See \texttt{-V} for more details.
+
+\indent \texttt{-m 5}: a combination of \texttt{-m 4} and \texttt{-m
+ 0}. \texttt{-m 6} provides \texttt{-m 5} plus HTML formatting. In
+addition, independent \texttt{-m} options can be combined. Thus, one
+can use \texttt{-m 1 -m 6 -m 9}.
+
+\item[\texttt{-m 8}] provides BLAST tabular format output (a tab
+ delimited line with the query name, library name, percent identity,
+ and other alignment information). ``\texttt{-m 8C}'' provides the
+ additional information provided by the BLAST tabular format with
+ comment lines. BLAST tabular format has been extended to include
+ either a CIGAR string alignment encoding (\texttt{-m 8CC} with BLAST
+ comments, \texttt{-m 8XC} without comments) and, if available, an
+ annotation encoding matching FASTA \texttt{-m 9C} output. All the
+ \texttt{-m 9c/C/d/D} encodings are available with BLAST tabular
+ output using \texttt{-m 8C[c/C/d/D]}.
+
+\item[\texttt{-m 9}] display alignment coordinates and scores with the
+ best score information. \texttt{-m 9i} provides alignment length,
+ percent identity, and percent similarity only. \texttt{-m 9i} also
+ provides variation information if it improves the score. \texttt{-m
+ 9I} provides both identity and domain information on the summary
+ line.
+
+ \texttt{ -m 9, -m 9c} and \texttt{-m 9C} extend the normal best score information:
+\begin{footnotesize}
+\begin{verbatim}
+The best scores are: opt bits E(14548)
+XURTG4 glutathione transferase (EC 2.5.1.18) 4 - ( 219) 1248 291.7 1.1e-79
+\end{verbatim}
+\end{footnotesize}
+
+to include the additional information (on the same line, separated by
+$<$tab$>$ characters):
+\begin{footnotesize}
+\begin{verbatim}
+%_id %_gid sw alen an0 ax0 pn0 px0 an1 ax1 pn1 px1 gapq gapl fs
+0.771 0.771 1248 218 1 218 1 218 1 218 1 219 0 0 0
+\end{verbatim}
+\end{footnotesize}
+
+The first two values are fraction identical and fraction similar
+(score $\ge 0$), followed by the Smith-Waterman alignment score (\texttt{sw}), the
+alignment length (\texttt{alen}), and the coordinates of the beginning
+and end of the alignment in the query and target (library) sequences
+(\texttt{an0} beginning, \texttt{ax0} end in query; \texttt{an1}
+beginning, \texttt{ax1} end in target/library), and the coordinate
+system for the beginning and end of the query and target/library
+sequence (\texttt{pn0} is the displayed coordinate of the first
+residue of the query sequence, \texttt{px0} is the displayed
+coordinate of the last residue, \texttt{pn1},\texttt{px1} provide the
+coordinates for the target/library sequence). \texttt{gapq},
+\texttt{gapl} report the number of gaps in the query and library
+sequence; \texttt{fs} reports the number of frameshifts.
+
+\texttt{ -m 9c} provides additional information: an encoded alignment string. For example, the alignment:
+\begin{footnotesize}
+\begin{verbatim}
+ 10 20 30 40 50 60 70
+GT8.7 NVRGLTHPIRMLLEYTDSSYDEKRYTMGDAPDFDRSQWLNEKFKL--GLDFPNLPYL-IDGSHKITQ
+ :.:: . :: :: . .::: : .: ::.: .: : ..:.. ::: :..:
+XURTG NARGRMECIRWLLAAAGVEFDEK---------FIQSPEDLEKLKKDGNLMFDQVPMVEIDG-MKLAQ
+ 20 30 40 50 60
+\end{verbatim}
+\end{footnotesize}
+would be encoded:
+\begin{footnotesize}
+\texttt{=23+9=13-2=10-1=3+1=5}
+\end{footnotesize}.
+The numbers in the alignment encoding are with respect to the beginning
+of the alignment, not the sequences. The beginning coordinate of the
+alignment is given earlier in the \texttt{-m 9c} line. \texttt{-m 9C}
+provides the alignment encoding in CIGAR format:
+\begin{footnotesize}
+\texttt{23M9D13M2I10M1I3M1D5M}
+\end{footnotesize}.
+
+(June, 2014) The \texttt{-m 9c/C} option has been extended to
+\texttt{-m 9d/D}, which encodes the positions of mismatches as well as
+insertions and deletions. For the example above, the \texttt{-m 9d}
+encoding would be:
+\begin{footnotesize}
+\texttt{=1x1=2x4=2x1=2x7=3-9=1x2=1x4=2x1=1x1+2x1=1x1=1x3=1x2+1=3-1x1=1x2=1}
+\end{footnotesize}
+while \texttt{-m 9D} would be:
+\begin{footnotesize}
+\texttt{1M1X2M4X2M1X2M7X3M9D1M2X1M4X2M1X1M1X2I1X1M1X1M3X1M2X1I3M1D1X1M2X1M}
+\end{footnotesize}
+\item[\texttt{-m 10}]
+a parseable format for use with other programs.
+\item[\texttt{-m 11}]
+Provide \texttt{lav}-like output (used by \texttt{lalign}) for graphical output.
+\begin{quote}
+\texttt{lalign36 -m 11 mchu.aa mchu.aa | lav2plt.pl --dev ps > mchu\_laln.ps}
+\end{quote}
+Produces a postscript plot of the local alignments. Likewise,
+\texttt{lav2plt.pl --dev svg} produces SVG output.
+
+\item[\texttt{-m BB}] Format output to mimic BLAST format. \texttt{-m
+ B} formats alignments to look like BLAST alignments (Query/Sbjct),
+ but is FASTA output otherwise. \texttt{-mBB} imitates BLAST as much
+ as possible, and cannot be used with other \texttt{-m} options.
+
+\item[\texttt{-m "F\# out.file"}] Send an alternate result format to \texttt{out.file}.
+Normally, the \texttt{-m out\_fmt} option applies to the default output
+file, which is either \texttt{stdout}, or specified with \texttt{-O out\_file} (or within
+the program in interactive mode). With \texttt{-m F}, an output format can be
+associated with a separate output file, which will contain a complete
+FASTA program output. Thus,
+\begin{quote}
+\begin{small}
+\begin{verbatim}
+ ssearch36 -m 9c -m "FBB blast.out" -m "F9c,10 m9c_10.out" query library
+\end{verbatim}
+\end{small}
+\end{quote}
+Sends the \texttt{-m 9c} output to \texttt{stdout}, but will also send
+\texttt{-m BB} output to the \texttt{blast.out} file, and \texttt{-m 9c -m
+ 10} output to \texttt{m9c\_10.out}. Consistent \texttt{-m out\_fmt}
+commands can be sent to the same file by separating them with ','.
+Producing alternative format alignments in different files has little
+additional computational cost.
+
+Because a space (\textvisiblespace) is used to separate the output
+format (\texttt{-m}) values from the file name, the \texttt{-m F}
+argument must typically be surrounded by quotation marks (\texttt{"}).
+
+One of the shortcomings of this approach is that it affects only the
+output format, not the other options that modify the amount of output.
+Thus, if you specify \texttt{-E 0.001}; that expect threshold will be
+used for all the output files. When a \texttt{-m} option does modify
+the output (e.g. \texttt{-m 8} sets \texttt{-d 0}), that modification
+is specific to the output file.
+
+\item[\texttt{-M low-high}]
+Include library sequences with lengths between low and
+high.
+\item[\texttt{-n}]
+Force the query sequence to be treated as a DNA sequence.
+Useful when query sequences contain a large number of
+ambiguous residues, e.g. transcription factor binding sites.
+\item[\texttt{-N \#}]
+break long library sequences into blocks of \# residues. Useful for
+bacterial genomes, which have only one sequence entry. \texttt{-N 2000} works
+well for bacterial genomes. (This option was required when
+FASTA only provided one alignment between the query and library
+sequence. It is not as useful, now that multiple alignments are
+available.)
+
+\item[\texttt{-o off1,off2}]
+(Previously \texttt{-X}.) Specifies offsets for the beginning of the query and library sequence.
+For example, if you are comparing upstream regions for two genes, and
+the first sequence contains 500 nt of upstream sequence while the
+second contains 300 nt of upstream sequence, you might try:
+\begin{quote}
+\texttt{fasta -o "-500 -300" seq1.nt seq2.nt}
+\end{quote}
+If the \texttt{-o} option is not used, FASTA assumes numbering starts with 1.
+(You should double check to be certain the negative numbering works
+properly.)
+
+\item[\texttt{-O}] Send a copy of results to \texttt{filename}.
+ Helpful for environments without STDOUT, but should be avoided (use
+ \texttt{> filename} instead).
+
+\item[\texttt{-p}]
+Force query to be treated as protein sequence.
+
+\item[\texttt{-P PSSM\_file}]
+Specify a PSI-BLAST format PSSM (Position Specific Scoring Matrix)
+file. \texttt{ssearch36}, \texttt{ggsearch36}, and
+\texttt{glsearch36} can use a PSSM file to improve the sensitivity of
+a search. The FASTA programs accept two PSSM file formats:\\[2ex]
+\begin{tabular}{l l l}
+\hline\\[-1.5ex]
+format & \texttt{blastpgp} & option \\[0.5ex]
+\hline\\[-1.5ex]
+0 & \texttt{blastpgp -C pssm.chk -u 0} & byte-encoded \\
+2 & \texttt{blastpgp -C pssm.asnb -u 2} & binary ASN.1 \\
+% & \texttt{psiblast -out\_pssm\_text} \\[1.5ex]
+\hline\\[-0.5ex]
+\end{tabular}\\
+which can be specified after the file name, e.g.:
+\begin{quote}
+\texttt{ssearch36 -P 'pssm.asnb 2' pssm\_query.aa +sp+}
+\end{quote}
+Searches with a PSI-BLAST PSSM still require a query sequence
+file, and the query sequence file must match the PSSM seed sequence.
+The format 0 byte-encoded PSSM is machine dependent; it must be
+created by \texttt{blastpgp} on the same architecture as
+\texttt{ssearch36}. In general, you should use the binary ASN.1 (format 2) file.
+
+With the release of \texttt{NCBI-BLAST+}, \texttt{psiblast} replaces
+\texttt{blastpgp}, and \texttt{psiblast} does not produce the binary
+ASN.1 PSSM checkpoint data. However, the text ASN.1 PSSM checkpoint
+file (produced with the \texttt{psiblast} option \texttt{-out\_pssm})
+can be converted to a binary ASN.1 format that \texttt{ssearch36} can
+read using the NCBI \texttt{datatool} program (available from
+\url{ftp://ftp.ncbi.nlm.nih.gov/toolbox/ncbi_tools++/BIN/CURRENT/datatool})
+together with
+\url{http://www.ncbi.nlm.nih.gov/data_specs/asn/NCBI_all.asn}. More
+information about \texttt{datatool} is available from
+\url{http://www.ncbi.nlm.nih.gov/data_specs/NCBI_data_conversion.html}.
+The NCBI BLASTP/PSI-BLAST website provides the same PSSM text ASN.1
+file with the downloads link. A text ASN.1 PSSM file can be converted
+to a binary ASN.1 file using the command:
+\begin{quote}
+\texttt{datatool -m NCBI\_all.asn -v pssm.asn\_txt -e pssm.asnb}
+\end{quote}
+The \texttt{pssm.asnb} can then be used with
+\texttt{ssearch36} with the \texttt{-P 'pssm.asnb 2'}
+option shown above.
+
+\item[\texttt{-Q,-q}]
+Quiet - does not prompt for any input. Writes scores and alignments
+to the terminal or standard output file (on by default, turned off
+with \texttt{-I}).
+\item[\texttt{-r +n/-m}]
+Specify match/mismatch scores for DNA comparisons. The default is
+\texttt{+5/-4}. \texttt{+3/-2} can perform better in some cases.
+\item[\texttt{-R file}]
+Save a results summary line for every sequence in the sequence
+library. The summary line includes the sequence identifier,
+superfamily number (if available), position
+in the library, and the similarity scores calculated. This option can
+be used to evaluate the sensitivity and selectivity of different
+search strategies \cite{wrp951,wrp981}.
+\item[\texttt{-s file}] Specify the scoring matrix file.
+ \texttt{fasta36} uses the same scoring matrix format as Blast.
+ Several scoring matrix files are included in the standard
+ distribution in the \texttt{data/} directory. For protein
+ sequences: \texttt{codaa.mat} - based on minimum mutation matrix;
+ \texttt{idnaa.mat} - identity matrix; \texttt{pam250.mat} - the
+ PAM250 matrix \cite{day787} (\texttt{-s P250}), and
+ \texttt{pam120.mat} - a PAM120 matrix (\texttt{-s P120}). The
+ default scoring matrix is BLOSUM50 (\texttt{-s BL50}). Other
+ matrices include a series of modern PAM-based matrices
+ \cite{tay925}: MDM40/\texttt{-s MD40}, MDM20/\texttt{-s MD20}, and
+ MDM10/\texttt{-s MD10}, and a selection from the BLOSUM series
+ \cite{hen929} BLOSUM50, 62, and 80/\texttt{-s BL50}, \texttt{-s
+ BL62}, \texttt{-s BL80}. \texttt{-s BP62} sets the scoring matrix
+ to BLOSUM62 and the gap penalties to -11/-1, identical to
+ \texttt{BLASTP}. In addition, the VTML160 matrix (\texttt{-s
+ VT160}) \cite{muller2002} and OPTIMA\_5 (\texttt{-s OPT5})
+ \cite{kan023} are available.
+
+If the scoring matrix is prefaced by a question mark,
+e.g. \texttt{?BP62}, then the scoring matrix is adjusted for each
+query to ensure that a 100\% identical match can produce a score of at
+least 40 bits. This is designed for \texttt{fastx36} searches with
+potentially short DNA queries; a 120 nt DNA query can only produce a
+40 amino-acid alignment, which, with BLOSUM62 -11/-1, cannot produce
+more than 23 bits of score. A scoring matrix with a higher information
+content is required; in the set available by default, MD40, with 2.22
+bits/position, would be used. For more information about alignment
+length and information content, see \cite{alt915}.
+
+\item[\texttt{-S}] Filter out lower-case characters in the query or
+ library sequences for the initial score calculation (used to filter
+ low-complexity -- \texttt{seg}-ed -- residues). The \texttt{pseg}
+ program \cite{woo935} can be used to lower-case mask low complexity
+ regions in protein sequences. With the \texttt{-S} option, lower
+ case characters in the query or database sequences are treated as
+ \texttt{X}'s during the initial scan, but are treated as normal
+ residues during the final alignment display. Since statistical
+ significance is calculated from the similarity score calculated
+ during the library search, the lower case residues do not contribute
+ to the score. However, if a significant alignment contains low
+ complexity regions, the residues are shown (as lower
+ case characters, Fig. \ref{seg-aln}).
+
+The \texttt{pseg} program can be used to produce databases (or query
+sequences) with lower case residues indicating low complexity regions
+using the command:
+\begin{verbatim}
+pseg ./swissprot.fasta -z 1 -q > swissprot.lseg
+\end{verbatim}
+
+The \texttt{-S} option should always be used with \texttt{FASTX/Y} and
+\texttt{TFASTX/Y} because out-of-frame translations often generate
+low-complexity protein sequences. However, only lower case characters
+in the protein sequence (or protein database) are masked; lower case
+DNA sequences are translated into upper case protein sequences, and
+not treated as low complexity by the translated alignment
+programs. (There is an option in the \texttt{Makefile},
+\texttt{-DDNALIB\_LC}, to enable preserving case in DNA sequences.)
+
+\item[\texttt{-t \#}]
+Translation table - fastx36, tfastx36, fasty36, and
+tfasty36 now support the BLAST translation tables. See
+\url{http://www.ncbi.nih.gov/Taxonomy/Utils/wprintgc.cgi}.
+
+\texttt{-t t} or \texttt{-t t\#} enables the addition of
+an implicit termination codon to a protein:translated DNA match. That
+is, each protein sequence implicitly ends with \texttt{*}, which
+matches the termination codons for the appropriate genetic code.
+\texttt{-t t\#} sets implicit termination and a different genetic
+code.
+\item[\texttt{-T \#}]
+set number of threads/workers. Normally on a multi-core machine, the maximum
+number of processors/cores is used.
+\item[\texttt{-U}]
+Treat the query sequence as an RNA sequence. In addition to selecting a
+DNA/RNA alphabet, this option causes changes to the scoring matrix so
+that \texttt{G:A} , \texttt{T:C} or \texttt{U:C} are scored as \mbox{\texttt{G:G -3}}.
+\item[\texttt{-v \#}]
+Do window shuffles with the window size specified.
+\item[\texttt{-V str}] Specify annotation characters that can be
+ included (and will be ignored), in the query sequence file, but are
+ displayed in the alignments. If a query file contains
+ \texttt{"ACVS*ITRLFT?"}, where \texttt{"*"} and \texttt{"?"} are
+ used to indicate phosphorylation, giving the option \mbox{\texttt{-V
+ '*?'}}, the annotated characters in the query (\texttt{S*},
+ \texttt{F?}) will be highlighted in the alignment (on the number
+ line). A \texttt{fasts36} alignment of \texttt{seq/ngts.aa} compared
+ to \texttt{seq/mgstm1.aa} with \texttt{-V '*?'} produces:
+\begin{footnotesize}
+\begin{verbatim}
+ * 10??
+GT8.7 ILGYWN------------EYTDSSYDEKR----------------------------
+ :::::: :::::::::::
+GT8.7 MPMILGYWNVRGLTHPIRMLLEYTDSSYDEKRYTMGDAPDFDRSQWLNEKFKLGLDFPNL
+ 10 20 30 40 50 60
+\end{verbatim}
+\end{footnotesize}
+In addition to showing the alignments of post-translationally modified
+sites, the \texttt{-V} option can be used to highlight active sites in
+library sequences. In the \texttt{-m 9c} output, the state of the
+annotated sites is summarized when \texttt{-V} is used.
+
+(fasta-36.3.6 June 2012) The \texttt{-V} option has been extended to:
+(1) allow feature descriptions to be specified in a file,
+e.g. \texttt{-V =annot.defs} where \texttt{annot.defs} contains:
+\begin{footnotesize}
+\begin{verbatim}
+*:phosphorylation
+@:active site
+^:binding site
+\end{verbatim}
+\end{footnotesize}
+The annotation character is left of the ':', the definition is on the
+right. The \texttt{annot.defs} file can also be specified by setting
+the \texttt{FA\_ANNOT\_DEF} environment variable to the file name;
+
+(2) to include an optional annotation file, e.g. \texttt{-V
+ '<features.annot'}, or script, e.g. \texttt{-V '!features.pl'} for
+library annotations and \texttt{-V 'q!features.pl'} for query annotations. (Some shells require \texttt{\textbackslash!features.pl}.) Similar to the library expansion script, the
+\texttt{features.pl} script is run against a temporary file containing
+the list of high scoring sequence accessions (the text before the
+first space), e.g.
+\begin{footnotesize}
+\begin{verbatim}
+gi|121735|sp|P09488.3|GSTM1_HUMAN
+gi|1170096|sp|Q03013.3|GSTM4_HUMAN
+gi|67461004|sp|Q5R8E8.3|GSTM2_PONAB
+...
+\end{verbatim}
+\end{footnotesize}
+The \texttt{features.pl} script then produces a file of annotations on
+those sequences, in the format:
+\begin{verbatim}
+>accession1
+position label value
+>accession2
+...
+\end{verbatim}
+For example:
+\begin{footnotesize}
+\begin{verbatim}
+>gi|121735|sp|P09488.3|GSTM1_HUMAN
+23 *
+33 *
+34 *
+116 ^
+173 V N
+210 V T
+>gi|1170096|sp|Q03013.3|GSTM4_HUMAN
+2 V P
+116 ^
+160 V V
+208 V V
+209 V F
+211 V K
+212 V M
+>gi|67461004|sp|Q5R8E8.3|GSTM2_PONAB
+...
+\end{verbatim}
+\end{footnotesize}
+The same format is used for the \texttt{-V '<feature.annot'} file.
+
+The \texttt{V} label is special; it indicates that the feature is a
+variant residue and specifies the alternative residue in the label
+field. Thus, \texttt{GSTM4\_HUMAN} can have a \texttt{M} at position
+2. Unlike modification or active site annotations, variant residues
+can change the sequence of the library sequence if replacing the
+canonical library residue with the variant residue improves the score.
+Thus, without the \texttt{-V '!feature.pl'} script, the human
+\texttt{GSTM1B} variant with dbSNP:rs449856 would align to
+\texttt{GSTM1\_HUMAN} (\texttt{P09488}) like this (the \texttt{-m 1}
+format option was used to highlight differences) :
+\begin{footnotesize}
+\begin{verbatim}
+The best scores are: opt bits E(1)
+sp|P09488.3|GSTM1_HUMAN Glutathione S-transferase Mu 1; GST HB subuni ( 218) 1490 335.9 3.7e-97
+
+>>sp|P09488.3|GSTM1_HUMAN Glutathione S-transferase Mu 1; GST HB s (218 aa)
+ initn: 1490 init1: 1490 opt: 1490 Z-score: 1776.8 bits: 335.9 E(1): 3.7e-97
+Smith-Waterman score: 1490; 99.1% identity (100.0% similar) in 218 aa overlap (1-218:1-218)
+
+...
+ 170 180 190 200 210
+gtm1_h YDVLDLHRIFEPNCLDAFPNLKDFISRFEGLEKISAYMKSSRFLPRPVFTKMAVWGNK
+ x x
+sp|P09 YDVLDLHRIFEPKCLDAFPNLKDFISRFEGLEKISAYMKSSRFLPRPVFSKMAVWGNK
+ 170 180 190 200 210
+\end{verbatim}
+\end{footnotesize}
+
+With a \texttt{-V '!feature.pl'} script to annotate the variants,
+the alignment becomes:
+\begin{footnotesize}
+\begin{verbatim}
+The best scores are: opt bits E(1)
+sp|P09488.3|GSTM1_HUMAN Glutathione S-transferase Mu 1; GST HB s ( 218) 1500 338.1 8e-98
+
+>>sp|P09488.3|GSTM1_HUMAN Glutathione S-transferase Mu 1; GST HB s (218 aa)
+ Variant: K173N;S210T;
+ initn: 1500 init1: 1500 opt: 1500 Z-score: 1788.7 bits: 338.1 E(1): 8e-98
+Smith-Waterman score: 1500; 100.0% identity (100.0% similar) in 218 aa overlap (1-218:1-218)
+...
+ 170 180 190 200 210
+gtm1_h YDVLDLHRIFEPNCLDAFPNLKDFISRFEGLEKISAYMKSSRFLPRPVFTKMAVWGNK
+
+sp|P09 YDVLDLHRIFEPNCLDAFPNLKDFISRFEGLEKISAYMKSSRFLPRPVFTKMAVWGNK
+ 170 V 180 190 200 21V
+\end{verbatim}
+\end{footnotesize}
+In addition to removing the two differences at residues 173 and 210,
+which produces a 100\% identical alignment, inclusion of the variant
+library sequence also improves the raw similarity score and $E()$-value.
+An example script (\texttt{misc/up\_feats.pl}) that extracts
+annotations from a mysql database of Uniprot features is provided.
+
+If the annotation script produces lines beginning with '=', then these
+lines are taken as annotation definitions, similar to the
+\texttt{annot.defs} file described above. Thus:
+\begin{footnotesize}
+\begin{verbatim}
+=*:phosphorylation
+=@:active site
+=^:binding site
+>gi|121735|sp|P09488.3|GSTM1_HUMAN
+23 *
+33 *
+34 *
+116 ^
+173 V N
+210 V T
+\end{verbatim}
+\end{footnotesize}
+will produce the same annotation descriptions as the
+\texttt{annot.defs} file.
+
+Scripts to produce annotations are available in the \texttt{scripts/}
+directory as \texttt{scripts/ann\_feats*.pl}. Scripts with
+\texttt{www} in the name,
+e.g. \texttt{scripts/ann\_feats\_up\_www2.pl} and
+\texttt{scripts/ann\_pfam\_www.pl} download annotation information
+from Uniprot or Pfam web services, respectively. Scripts lacking
+\texttt{www} require a MySQL database that associates features
+or domains with sequence identifiers (accessions). With \CURRENT,
+domain annotations are allowed to overlap each other (which often
+happens in Pfam and UniProt); FASTA 36.3.6 did not support overlapping
+domains. Scripts that can produce overlapping domain annotations have
+\texttt{\_e} in their names, but will produce non-overlapping domain
+annotations with the \texttt{--no-over} option. Thus:
+\texttt{scripts/ann\_pfam\_www\_e.pl --acc sp|P43553|ALR2\_YEAST}
+produces:
+\begin{quote}
+\begin{verbatim}
+>sp|P43553|ALR2_YEAST
+451 - 683 PF01544 :1
+667 - 799 PF01544 :1
+\end{verbatim}
+\end{quote}
+While \texttt{scripts/ann\_pfam\_www\_e.pl --acc --no-over
+ sp|P43553|ALR2\_YEAST} produces:
+\begin{quote}
+\begin{verbatim}
+>sp|P43553|ALR2_YEAST
+451 - 675 PF01544 :1
+676 - 799 PF01544 :1
+\end{verbatim}
+\end{quote}
+
+\item[\texttt{-w \#}]
+ Display width value ($<$200). Sets the approximate width of the
+ high-score descriptions and the length of residue
+ alignments. \texttt{-w 60} by default.
+
+\item[\texttt{-W \#}] context length (default is 1/2 of line width -w)
+ for alignment, for programs like \texttt{fasta36} and
+ \texttt{ssearch36}, that provide additional sequence context.
+
+\item[\texttt{-X extended\_option}]
+A number of rarely used options are now only available as extended options:
+
+\begin{description}
+
+\item[\texttt{X1}] sort output by \texttt{init1} score (for
+ compatibility with FASTP; obsolete).
+
+\item[\texttt{XB}] (Previously \texttt{-B}.) Show the z-score, rather
+ than the bit-score in the list of best scores (rarely used, provided
+ for backward compatibility).
+
+\item[\texttt{XI}] Modify rounding used in percent identity/percent
+ similarity display to ensure that sequences that have a mismatch are
+ not shown as 100.0\% identical. Without this option, a single
+ mismatch in a 10,000 residue alignment would be shown as 100.0\%
+ identical; with this option, it would be shown as 99.9\%
+ identical.
+
+\item[\texttt{Xo}] (\texttt{fasta36}, \texttt{[t]fast[x/y]36} only)
+ (Previously \texttt{-o}.) Turn off the default \texttt{opt} score
+ calculation and sort results by \texttt{initn} scores (reduces
+ sensitivity and statistical accuracy, obsolete).
+
+\item[\texttt{XM}] The maximum amount of memory available for storing
+ the library in multi-sequence searches. The value is specified in
+ MBytes (\texttt{-XM16}) or GBytes (\texttt{-XM4G}) and can also be
+ set using the \texttt{LIB\_MEMK} environment variable
+ (\texttt{LIB\_MEMK=4G}). Negative values remove the memory
+ restriction. By default (set as a compile-time option,
+ \texttt{-DMAX\_MEMK=2}), set to 2 GBytes in 32-bit environments and
+ 12 GBytes in 64-bit environments.
+
+\item[\texttt{XN/XX}] Alter the treatment of N:N (DNA) or X:X
+ (protein) alignments for counts of identities and similarities. By
+ default the \texttt{FASTA} programs count N:N or X:X as identical,
+ but not similar, because their alignment scores are typically
+ negative. \texttt{-XNS}, \texttt{-XN+}, \texttt{-XXS}, and
+ \texttt{-XX+} treat N:N and X:X alignments as ``similar'' , even
+ though their alignment scores are negative, when calculating percent
+ similarity. \texttt{-XND}, \texttt{-XN-}, \texttt{-XXD}, and
+ \texttt{-XX-} treat N:N and X:X alignments as non-identical for
+ calculating percent identity.
+
+\item[\texttt{Xx}] (Previously \texttt{-x}) Specify the penalty for a
+ match to an \texttt{X}, and mismatch to \texttt{X}, independently of
+ the PAM matrix. Particularly useful for \texttt{fastx36/fasty36},
+ where termination codons are encoded as \texttt{X}. For example,
+ \texttt{-Xx=0,-1} scores an \texttt{X:X} match as 0, and
+ \texttt{X:not-X} as -1.
+
+\item[\texttt{Xy}] (Previously \texttt{-y}.) Set the width of the band
+ used for calculating "optimized" scores. For proteins and ktup=2,
+ the width is 16. For proteins with ktup=1, the width is 32 by
+ default. For DNA the width is 16.
+
+\end{description}
+
+\item[\texttt{-z -1,0,1,2,3,4,5,6}]\hfill\\
+\texttt{-z -1} turns off statistical calculations. \texttt{-z 0} estimates
+the significance of the match from the mean and standard deviation of
+the library scores, without correcting for library sequence length.
+\texttt{-z 1} (the default) uses a weighted regression of average score
+vs library sequence length; \texttt{-z 2} uses maximum likelihood
+estimates of $\lambda$
+and $K$; \texttt{-z 3} uses Altschul-Gish parameters \cite{alt960};
+\texttt{-z 4 - 5} uses two variations on the \texttt{-z 1}
+strategy. \texttt{-z 1} and \texttt{-z 2} are the best methods, in
+general.
+\item[\texttt{-z 11,12,14,15,16}]\hfill\\
+estimate the statistical parameters from shuffled copies of each
+library sequence. This allows accurate statistics to be estimated for libraries comprised of a single protein family.
+
+\item[\texttt{-z 21,22,24,25,26}]\hfill\\
+estimate the statistical parameters from shuffled copies of the
+highest scoring sequences reported in the search.
+This shuffling strategy is much more like
+\texttt{prss}, since the sequences shuffled share compositional
+similarity to the query.
+\item[\texttt{-Z db\_size}]
+sets the apparent size of the database to be used when calculating
+expectation E()-values. If you searched a database with 1,000
+sequences, but would like to have the E()-values calculated in the
+context of a 100,000 sequence database, use \texttt{-Z 100000}.
+\item[\texttt{-3}]
+translate only three forward frames or search with only the forward
+strand (complement of \texttt{-i}).
+\end{description}
+
+Thus, to tell \texttt{fasta36} to align \texttt{seq1.aa} with \texttt{seq2.aa} showing the entirety of both sequences, with 80 characters per line, one would type:
+\begin{verbatim}
+fasta36 -w 80 -s BP62 -a seq1.aa seq2.aa
+\end{verbatim}
+The \texttt{-w 80} and \texttt{-a} options must precede the file
+names. If you just enter the options on the command line followed by
+\texttt{-I}, the program will prompt for the file names.
+
+In addition, the FASTA programs can accept query sequence data from
+\texttt{STDIN}. To specify that stdin be used as the query or library
+file, the file name should be specified as \texttt{@}. Thus:
+\begin{quote}
+\texttt{cat query.aa | fasta36 @:25-75 /slib/swissprot }
+\end{quote}
+would take residues 25-75 from \texttt{query.aa} and search the
+\texttt{/slib/swissprot} library.
+
+\subsubsection{Environment variables}
+
+FASTA allows virtually every option to be set on the command line
+(except the \emph{ktup}, which must be set as the third command line
+argument), but it is often convenient to set the \texttt{FASTLIBS}
+environment variable to specify the location of the \texttt{fastlibs}
+database description file.
+
+\texttt{FASTLIBS} -- \texttt{FASTLIBS}
+specifies the location of the file that contains the list of library
+descriptions, locations, and library types (see section on finding
+library files).
+
+\texttt{LIB\_MEMK} -- Set the maximum amount of memory (MBytes) to be
+available for library buffering (equivalent to \texttt{-XM\#}, see
+above). By default, \texttt{2GB} is available on 32-bit systems
+(\texttt{LIB\_MEMK=2G}); \texttt{8GB} on 64-bit systems.
+
+\texttt{REF\_URL}, \texttt{SRCH\_URL} and \texttt{SRCH\_URL1} -- These
+environment variables are used in HTML mode (\texttt{-m 6}) to provide
+links from the sequence alignment (see the links at
+\url{http://fasta.bioch.virginia.edu/fasta_www2/}). \texttt{REF\_URL}
+is associated with the \texttt{Entrez Lookup} link; \texttt{SRCH\_URL}
+with the \texttt{Re-search database} link, and \texttt{SRCH\_URL1}
+with the \texttt{General re-search} link. In each case, the text
+corresponds to a HTML URL, but with positions containing the
+\texttt{\%s} or \texttt{\%ld} (for numbers) part of a 'C'
+\texttt{sprintf()} call for specific variables. \texttt{REF\_URL} uses
+the database (\texttt{protein} or \texttt{nucleotide}), together with
+a query term (typically the \texttt{gi} number). \texttt{SRCH\_URL}
+and \texttt{SRCH\_URL1} use \texttt{db}, \texttt{query} (\texttt{gi}),
+\texttt{pgm} (\texttt{fa}, \texttt{ss}, \texttt{fx}, etc.), and
+\texttt{start}, \texttt{stop}, and \texttt{n1} (library sequence
+length), where \texttt{start} and \texttt{stop} are the boundaries of
+the alignment, for sub-sequence searches. The values of these
+environment variables are used with \texttt{sprintf} to build a new
+URL that is linked in the output.
+
+\texttt{TMP\_DIR} -- Location (if defined) of the temporary files used
+by the \texttt{-e expand\_script.sh} option.
+
+In addition, environment variables can be used inside both the
+\texttt{fastlibs} file and in the \texttt{@db.nam} files of file
+names. The \texttt{fasta36/conf/fast\_libs\_e.www} file, included with
+the distribution, shows an example, as do the descriptions of files of
+file names shown below. Whenever a word of the form
+\texttt{\$\{WORD\}} is found in \texttt{fastlibs} or a file of file
+names, the \texttt{\$\{WORD\}} environment variable is expanded and
+inserted in the string. Thus, if \texttt{<\$\{SLIB\}/blast\_dbs/}
+describes where a list of files will be found and \texttt{\$\{SLIB\}}
+is \texttt{"/seqdata"}, then the resulting substitution yields:
+\texttt{</seqdata/blast\_dbs/}.
+
+\section{Installing FASTA and the sequence databases}
+
+\subsection{Obtaining/preparing the sequence libraries}
+
+The FASTA program package does not include any protein or DNA sequence
+libraries. Protein and DNA sequence databases are available via
+anonymous FTP from the NCBI (\url{ftp://ftp.ncbi.nih.gov/blast/db}),
+UniProt
+(\url{ftp://ftp.uniprot.org/pub/databases/uniprot}), and the EBI
+(\texttt{ftp.ebi.ac.uk/pub/databases}).
+
+\emph{Protein Sequence Databases} -- Protein sequence databases are
+available from the NCBI, UniProt, and the EBI. The NCBI provides a
+``raw'' database, \texttt{nr}, and a well-curated, less redundant
+database, \texttt{refseq\_protein}, and a copy of the very well
+annotated \texttt{swissprot} database. Protein sequence databases can
+also be downloaded from UniProt and the EBI; both sites provide the
+same UniProt\cite{uniprot11} database.
+
+Protein libraries, particularly those used for translated-DNA:protein
+comparisons with \texttt{fastx36} or \texttt{fasty36}, should be scanned
+to remove low-complexity regions. Matches between low complexity
+regions can violate the composition assumptions used by the FASTA
+statistical estimates. The \texttt{pseg} program (\cite{woo935},
+\url{ftp://ftp.ncbi.nih.gov/pub/seg/pseg}) can be used to lower-case
+low complexity regions, which then can be ignored during the initial
+database search by using the \texttt{-S} option. To lower-case low
+complexity regions, run the \texttt{pseg} program against the protein sequence database:
+\begin{quote}
+\begin{verbatim}
+pseg /seqdata/swissprot.fa -z 1 -q > /seqdata/swissprot.lseg
+\end{verbatim}
+\end{quote}
+And then you can run most FASTA programs with \texttt{-S}:
+\begin{quote}
+\begin{verbatim}
+ssearch36 -S mgstm1.aa /seqdata/swissprot.lseg
+\end{verbatim}
+\end{quote}
+
+Fig. \ref{seg-aln} shows the effect of including the \texttt{-S}
+option with lower-cased low-complexity sequences. The \texttt{opt}
+score (407), which is used to sort the results and calculate
+statistics, is lower than the Smith-Waterman score (451), even though
+exactly the same residues are aligned for each score. The \texttt{opt}
+score excludes residues 19-30, because they were marked as
+low-complexity by \texttt{pseg}; thus they are shown as lower-case.
+The Smith-Waterman score includes the contribution from that part of
+the alignment.
+
+Out-of-frame translated DNA sequences often produce low-complexity
+regions \cite{wrp973}, so it is particularly important to avoid
+low-complexity alignments when using \texttt{fastx36} and
+\texttt{fasty36}.
+
+\subsection{Searching taxonomic subsets}
+
+Because increasing database size reduces search sensitivity (an
+alignment with an $E()$-value of $0.001$ in a search of a 100,000
+entry database will have an $E()$-value of 0.1, not significant, if
+found in a database of 10,000,000 sequences), it is much more
+effective to search smaller, less redundant databases (you can always
+search the larger database later). Thus, the \texttt{refseq\_protein}
+database from the NCBI is preferred over \texttt{nr}; even better are
+databases that reflect a limited phylogenetic range
+(e.g. \texttt{refseq\_human} for vertebrate sequences).
+
+While the NCBI provides organism-specific \texttt{refseq} subsets on
+their FTP site, they can be difficult to find. Alternatively, you can
+use the NCBI \texttt{Entrez} web site to download a list of
+\texttt{gi} numbers specific to a particular organism or taxonomic
+range. The FASTA programs can search a subset of a large sequence
+database that is specified by a list of \texttt{gi} numbers by using
+library format 10. For example, given a list of \texttt{gi} numbers
+for the human proteins in \texttt{swissprot.lseg}, the file
+\texttt{sp\_human.db}, with the content:
+\begin{quote}
+\begin{verbatim}
+<${SLIB}/swissprot.lseg 0:2 4|
+3121763
+51701705
+7404340
+205831112
+74735515
+...
+\end{verbatim}
+\end{quote}
+could be used to search the human subset of
+\texttt{swissprot.lseg}. The \texttt{gi} numbers for the SwissProt
+entries begin with the second line. The first line specifies the
+location of the file where the sequences containing the \texttt{gi}
+numbers can be found (\texttt{\$\{SLIB\}/swissprot.lseg}), the
+\texttt{libtype} of that file (\texttt{0:fasta}), the character offset
+to the beginning of the sequence identifier in that file (\texttt{2}),
+the identifier type (\texttt{4}), and the character
+that separates the fields in the FASTA descriptor (\texttt{|}). The
+identifier type can take four formats:
+
+\begin{tabular}{l l}
+\hline\\[-1.5ex]
+1 & ordered accession strings (letters or numbers)\\
+2 & ordered numbers (digits only) \\
+3 & un-ordered accession strings \\
+4 & un-ordered numbers \\
+\hline\\
+\end{tabular}\\
+(Ordered accession strings/numbers are ordered in both the library and the subset file.)
+
+Thus, given the \texttt{0:2 4|} specification above, the line:
+\begin{quote}
+\texttt{>gi|3121763|sp|O15143.3|ARC1B\_HUMAN Actin-related protein 2/3 ...}
+\end{quote}
+would be parsed, looking for a number starting at column 4 (the first
+column is numbered 0), and ending with \texttt{|}. The order of
+sequences in the library does not have to correspond to the order in the
+\texttt{sp\_human.db} file (un-ordered). Given the
+\texttt{sp\_human.db} file, a file \texttt{swissprot.lseg} in the
+directory specified by the environment variable \texttt{\$\{SLIB\}},
+and a command of the form:
+\begin{quote}
+\texttt{fasta36 -S mgstm1.aa 'sp\_human.db 10'}
+\end{quote}
+would use the \texttt{sp\_human.db} file to search the subset of
+\texttt{swissprot.lseg} that contained the specified \texttt{gi}
+numbers.
+
+\subsection{DNA sequence libraries}
+
+Because of the large size of DNA databases, you will probably want to
+keep DNA databases in only one format. The FASTA3 programs that
+search DNA databases --- \texttt{fasta36}, \texttt{fastm36}, and
+\texttt{tfastx/y36} --- can read DNA databases in Genbank flatfile (not
+ASN.1), FASTA, and BLAST2.0 (\texttt{formatdb}) formats, as well as
+EMBL format. BLAST2.0 format is preferred for DNA sequence libraries,
+because the files are considerably more compact than GenBank format.
+The NCBI does not provide software for converting from Genbank flat
+files to Blast2.0 DNA databases, but you can use the Blast
+\texttt{formatdb} program to convert ASN.1 formatted Genbank files,
+which are available from the NCBI \texttt{ftp} site.
+
+The NCBI also provides the comprehensive \texttt{nt} DNA database, and
+several EST databases in Blast2.0/\texttt{formatdb} format from
+\texttt{ftp://ncbi.nih.gov/blast/db}.
+
+
+\subsection{Finding the library files}
+
+All the FASTA comparison programs have the command line syntax:
+\begin{quote}
+\texttt{fasta36 query.file /seqdata/library}
+\end{quote}
+However, in addition to simply specifying the location of the database
+to be searched
+(\texttt{/seqdata/library}), the FASTA programs
+provide several methods for referring to sequence databases without specifying a specific file. These methods can be used to provide abbreviations for sequence libraries, e.g.:
+\begin{quote}
+\texttt{fasta36 query.file s}
+or
+\texttt{fasta36 query.file +sp+}
+\end{quote}
+To use abbreviations like \texttt{'s'} or \texttt{'+sp+'} to reference a
+sequence database, a \texttt{FASTLIBS} file must be used, see section
+\ref{fastlibs}.
+
+Large DNA and protein databases are often distributed across several
+files. For example, the NCBI \texttt{nr} protein database is found in
+5 files, \texttt{nr.00} ... \texttt{nr.04}. To search databases in
+multiple files, the names of the files are specified in a file of
+filenames, \texttt{nr.nam}:
+\begin{quote}
+\begin{verbatim}
+<${SLIB}/blast_dbs/
+nr.00 12
+nr.01 12
+nr.02 12
+nr.03 12
+nr.04 12
+\end{verbatim}
+\end{quote}
+In this file, the first line \texttt{<\$\{SLIB\}/blast\_dbs/},
+beginning with \texttt{<}, specifies the location and format (Blast2.0
+\texttt{formatdb}) of the data files. Text of the form
+\texttt{\$\{SLIB\}} refers to Unix/MacOSX/Windows environment
+variables; the value of \texttt{\$\{SLIB\}} is set by a Unix/MacOSX
+shell environment command. Thus, if the value of \texttt{\$\{SLIB\}}
+is \texttt{/seqdata}, then the first sequence library file to be read
+will be \texttt{/seqdata/blast\_dbs/nr.00}, in format 12 (Blast2.0
+\texttt{formatdb}).
+
+To refer to the \texttt{nr.nam} file as a file of file names, it must
+be prefixed by a \texttt{@} character, e.g.
+\begin{quote}
+\texttt{fasta36 query.file \textbf{@}nr.nam}
+\end{quote}
+Files of file names can contain references to other files of file names:
+\begin{quote}
+\begin{verbatim}
+<${SLIB}/fasta_dbs/
+@pdb.nam
+@swissprot.nam
+\end{verbatim}
+\end{quote}
+The FASTA file of file names is similar to the NCBI
+\texttt{prot\_db.pal} and \texttt{dna\_db.nal} files, but
+unfortunately they are different, and currently FASTA cannot read NCBI
+\texttt{.pal} or \texttt{.nal} files that contain a \texttt{DBLIST}
+line. FASTA can read NCBI \texttt{.pal} or \texttt{.nal} files that do
+not contain a \texttt{DBLIST} line.
+
+FASTA version \texttt{fasta-36.3.6} provides an alternative way to
+generate a database to be searched: the \texttt{!script.sh} file.
+Like the \texttt{-e expand\_file.sh} script, a shell script or program
+can be used to produce a database to a temporary file, which is then
+searched. For example, if the file \texttt{cat\_db.sh} contains the
+command \texttt{echo /seqdb/swissprot.lseg}, the command:
+\begin{quote}
+\begin{verbatim}
+fasta36 query.aa \!@cat_db.sh
+\end{verbatim}
+\end{quote}
+will cause \texttt{cat\_db.sh} to produce a temporary file with the
+line \texttt{swissprot.lseg}, which is interpreted as an indirect file
+of filenames; thus, because of the \texttt{@}, the file will be
+interpreted as an indirect file, and the \texttt{swissprot.lseg} file
+will be searched. Note that on Unix systems, the \texttt{'!'} must be
+preceded by a \texttt{'\textbackslash'} so that it is not interpreted by the
+shell, as shown above.
+
+\subsection{\texttt{FASTLIBS}}
+\label{fastlibs}
+
+All the search programs in the FASTA3 package can use the environment
+variable \texttt{FASTLIBS} to find the protein and DNA sequence
+libraries. (Alternatively, you can specify the \texttt{FASTLIBS} file
+with the \texttt{-l fastlibs.file} option.) The \texttt{FASTLIBS}
+variable contains the name of a file that has the actual filenames of
+the libraries. The \texttt{fastlibs} file included with the
+distribution is an example of a file that can be referred to by
+FASTLIBS. To use the \texttt{fastlibs} file, type:
+\begin{quote}
+\texttt{setenv FASTLIBS /seqdata/info/fastgbs} (csh/tcsh)\\
+ or\\
+\texttt{export FASTLIBS=/seqdata/info/fastgbs} (bash/ksh)
+\end{quote}
+ Then edit the \texttt{fastlibs} file to indicate the location of the
+ protein and DNA sequence libraries. If the protein sequence library is
+ kept in the file \texttt{/seqdata/aa/swissprot.lseg} and your Genbank
+ DNA sequence library is kept in the directory:
+ \texttt{/seqdata/genbank}, then the \texttt{fastlibs} file might
+ contain:
+%%\pagebreak
+\begin{verbatim}
+SwissProt$0P/seqdata/aa/swissprot.lseg 0
+UniProt$0+uniprot+@/seqdata/aa/uniprot.nam
+GB Primate$1P@/seqdata/genbank/gpri.nam
+GB Rodent$1R@/seqdata/genbank/grod.nam
+GB Mammal$1M@/seqdata/genbank/gmammal.nam
+^ 1 ^^^^ 4 ^ ^
+ 23 (5)
+\end{verbatim}
+The first line of this file says that there is a copy of the SwissProt
+sequence database (a protein database) that can be selected by typing
+"P" on the command line or when the database menu is presented in
+interactive mode.
+
+Note that there are 4 (or 5) fields in the lines in the
+\texttt{fastlibs} file. The first field describes the library and is
+displayed by the FASTA program; it ends with the '\$'. The second field
+(1 character), is a 0 if the library is a protein library and 1 if it
+is a DNA library. The third field can either be a single character
+(\texttt{P}) or a word surrounded by the \texttt{+} symbol
+(\texttt{+uniprot+}), and can be used to specify the library on the command line or in interactive mode.
+
+The fourth field is the name of the library file. In the example
+above, the \texttt{/seqdata/aa/swissprot.lseg} file contains the
+entire protein sequence library. Alternatively,
+\texttt{/seqdata/aa/uniprot.nam} is a file of file names, which
+contains a list of one or more library files. Likewise, the DNA
+library files are files of file names.
+
+In addition, an optional fifth field can be used to specify the format
+of the library file. Alternatively, you can specify the library
+format in a file of file names. This field must be separated from the
+file name by a space character ('\ '). FASTA can
+read the libraries in the following formats:\\
+
+\begin{tabular}{r l}
+0 & FASTA (\texttt{>SEQID} - comment/sequence) \\
+1 & Uncompressed Genbank (LOCUS/DEFINITION/ORIGIN)\\
+2 & NBRF CODATA (ENTRY/SEQUENCE) (obsolete)\\
+3 & EMBL/SWISS-PROT (ID/DE/SQ)\\
+4 & Intelligenetics (;comment/SEQID/sequence) (obsolete)\\
+5 & NBRF/PIR VMS (\texttt{>P1;SEQID}/comment/sequence) (obsolete)\\
+6 & GCG (version 8.0) Unix Protein and DNA (compressed)\\
+7 & FASTQ (sequence only, quality ignored)\\
+10 & subset format (</slib2/swissprot.lseg 0:2 4|) \\
+11 & NCBI Blast1.3.2 format (unix only) (obsolete)\\
+12 & NCBI Blast2.0 format\\
+16 & MySQL (requires special compilation) \\
+17 & Postgres (requires special compilation) \\
+\end{tabular}
+
+Today, the most popular formats are \texttt{FASTA}, type \texttt{'0'},
+the default, and the NCBI Blast2.0 \texttt{formatdb} formats (type
+\texttt{'12'}). The FASTA programs cannot read NCBI ASN.1 formatted databases.
+If a library format is not specified, for example, because
+you are just comparing two sequences, FASTA (format 0) is used by
+default. To specify a library type on the command line, add it to the
+library filename and surround the filename and library type in quotes:
+\begin{quote}
+\begin{verbatim}
+fasta36 query.file "/seqdb/genbank/gbmam 12"
+\end{verbatim}
+\end{quote}
+NCBI \texttt{formatdb} databases are built from multiple files,
+e.g. \texttt{gbmam.nsq}, \texttt{gbmam.nhr}, \texttt{gbmam.nin}; to
+refer to the complete set of files, simply use the name before the
+suffixes, e.g. \texttt{gbmam}. When NCBI databases are distributed across
+several files, e.g. \texttt{gbbct.00}, \texttt{gbbct.01}, etc, those
+files must be included in a \texttt{gbbct.nam} file of file names.
+
+FASTA subset format ({\tt 10}) allows users to search a subset of a
+sequence database, by specifying a list of {\tt gi} numbers or accessions in
+a larger database. The format begins with a line naming the
+sequence file, followed by information about how to
+extract the {\tt gi} number or accession. Thus, the line is:
+\begin{quote}
+\texttt{<library\_file lib\_fmt:id\_fmt id\_loc}
+\end{quote}
+where {\tt lib\_fmt} is the library format (0), {\tt id\_fmt} is the
+format of the sequence identifier (:1, :2 - ordered strings or
+numbers; :3, :4 - unordered strings or numbers),
+and {\tt id\_loc} is the location of the sequence identifier. For example,
+\begin{quote}
+\begin{verbatim}
+</slib2/blast/swissprot.lseg 0:2 4|
+3121763
+51701705
+7404340
+74735515
+...
+\end{verbatim}
+\end{quote}
+specifies the file containing all the sequences and the file is in
+FASTA format ('0:'), the sequence identifier is a number (':2'),
+and the identifier starts at character 4 and ends with the \texttt{'|'} symbol.
+
+The major problem that most new users of the FASTA package have is in
+setting up the program to find the databases and their library type.
+In general, if you cannot get \texttt{fasta36} to read a sequence
+database, there is probably something wrong with the \texttt{FASTLIBS}
+file. A common problem is that the database file is found, but either
+no sequences are read, or an incorrect number of entries is read.
+This is almost always because the library format (\texttt{libtype}) is
+incorrect.
+
+Test the setup by running FASTA. Enter the sequence
+file '\texttt{mgstm1.aa}' when the program requests it (this file is
+included with the programs). The program should then ask you to
+select a protein sequence library. Alternatively, if you run the
+\texttt{tfastx36 -I} program and use the mgstm1.aa query sequence, the program
+should show you a selection of DNA sequence libraries.
+Once the \texttt{fastlibs} file has been set up correctly, you can
+set FASTLIBS=fastgbs in your AUTOEXEC.BAT file, and you will not need to
+remember where the libraries are kept or how they are named.
+
+%%\pagebreak
+\section{Frequently Asked Questions (FAQs)}
+
+{\noindent}\textbf{Where can I get FASTA?} --
+\url{http://faculty.virginia.edu/wrpearson/fasta} has the latest
+versions of the FASTA programs. This document describes
+\texttt{\CURRENT}, which is available from
+\url{http://faculty.virginia.edu/wrpearson/fasta/fasta3.tar.gz}.
+In addition, pre-compiled versions of the programs are available for
+MacOSX and Windows.
+
+\needspace{4\baselineskip}
+{\noindent}\textbf{Which program should I use?} -- See Table I, also:\\
+
+\begin{tabular}{l l l l l }
+\hline \\[-1.0ex]
+Query & Library & FASTA pgm. & BLAST pgm. & \\[1.2ex]
+\hline \\[-1.0ex]
+Prot. & Prot. & \texttt{fasta36} & \texttt{blastp} & heuristic local similarity \\
+ & & \texttt{ssearch36} & & optimal local sim.\\
+ & & \texttt{ggsearch36} & & global:global sim. \\
+ & & \texttt{glsearch36} & & global:local sim.\\
+DNA & DNA & \texttt{fasta36}$^*$ & \texttt{blastn} & \\[1.2ex]
+\hline \\[-1.0ex]
+Prot. & Prot. & \texttt{lalign36} & & multiple non-intersecting \\
+DNA & DNA & & & alignments \\[1.2ex]
+\hline \\[-1.0ex]
+DNA & Prot. & \texttt{fastx36} & \texttt{blastx} & trans. DNA:protein sim. \\
+ & & \texttt{fasty36} & & \\[1.2ex]
+\hline \\[-1.0ex]
+Prot. & DNA & \texttt{tfastx36} & \texttt{tblastn} & protein:trans. DNA \\
+ & & \texttt{tfasty36} & & \\[1.2ex]
+\hline \\[-1.0ex]
+Prot. & Prot. & \texttt{fasts36} & & Unordered peptides \\
+Prot. & DNA & \texttt{tfasts36} & & Unordered peptides \\
+DNA & DNA & \texttt{fasts36} & & Unordered oligonucleotides \\
+Prot. & Prot. & \texttt{fastm36} & & Ordered peptides \\
+DNA & DNA & \texttt{fastm36} & & Ordered oligos \\[1.2 ex]
+\hline \\[-1.0ex]
+\multicolumn{5}{l}{$^*$\texttt{ssearch36} can also be used for DNA:DNA, but is much slower and no more sensitive.}\\[0.2ex]
+\hline \\
+\end{tabular}
+
+\needspace{4 ex}
+{\noindent}\textbf{How do I make FASTA act/look like BLAST}? --
+\vspace{-0.5ex}
+\begin{quote}
+\texttt{fasta36 -s BP62 -m BB query.file library.file}
+\end{quote}
+\vspace{-0.5ex}
+\texttt{-s BP62} sets the same scoring matrix (BLOSUM62) and
+gap-penalties (-11/-1) as BLAST (FASTA uses BLOSUM50 by
+default). \texttt{-m BB} produces very BLAST-like output.
+
+In addition, the \texttt{-m 8} and \texttt{-m 8C} options provide
+BLAST tabular output, optionally with comments (\texttt{-m 8C}). This
+compact output is effective for analysis pipelines. In addition,
+\texttt{-m 8XC} (no comments) or \texttt{-m 8CC} provides two
+additional blast-tabular fields, a CIGAR alignment string and, if
+available, an annotation string.
+
+{\noindent}\textbf{When I search Genbank - the program reports:} \texttt{0 residues in 0
+sequences}? This typically happens because the program does not
+know that you are searching a Genbank flatfile database and is looking
+for a FASTA format database. Be certain to specify the library type
+("1" for Genbank flatfile) with the database name.
+
+{\noindent}\textbf{The search seemed to work, but I do not see any results.} -- In
+command line mode (the default), all the FASTA programs limit the
+number of high scoring sequences shown using an expectation value
+cutoff ($E()<10$ for proteins; $E()<2$ for DNA). Sometimes, a search
+will complete successfully (you see the message \texttt{XXXX residues
+ in YYY sequences}) but display the message: \texttt{!! No sequences with E()
+ < 10} instead of \texttt{The best scores are:}. Typically, this
+happens because of a problem with the statistical estimation process;
+in particular, if the library contains only related sequences and
+\texttt{-z 11} was not used, none of the hits may be ``significant''.
+To troubleshoot this problem, you can search with \texttt{-z -1},
+which turns off all the statistical estimation procedures, and will
+show the 20 highest scoring sequences (\texttt{-b \#} sets the default
+number of sequences shown).
+
+{\noindent}\textbf{What is the difference between} \texttt{fastx3} and
+\texttt{fasty3}? (or \texttt{tfastx3} and \texttt{tfasty3})? --
+\texttt{[t]fastx3} uses a simpler codon based model for alignments
+that does not allow frameshifts in some codon positions (see
+ref. \cite{wrp971}). \texttt{fastx3} is about 30\% faster, but
+\texttt{fasty3} can produce higher quality alignments in some cases.
+
+\vspace{0.5ex}
+{\noindent}\textbf{What is ktup}? -- All of the programs with \texttt{fast} in their
+name use a computer science method called a lookup table to speed the
+search. For proteins with \emph{ktup}=2, this means that the program
+does not look at any sequence alignment that does not involve matching
+two identical residues in both sequences. Likewise with DNA and
+\emph{ktup} = 6, the initial alignment of the sequences looks for 6
+identical adjacent nucleotides in both sequences. Because it is less
+likely that two identical amino-acids will line up by chance in two
+unrelated proteins, this speeds up the comparison. But very distantly
+related sequences may never have two identical residues in a row but
+will have single aligned identities. In this case, \emph{ktup} = 1 may
+find alignments that \emph{ktup}=2 misses.
+
+\vspace{0.5ex} {\noindent}\textbf{How do I turn off statistics}? --
+The FASTA programs are designed to identify homologs based on
+statistically significant similarity; to infer homology you need
+accurate statistical estimates. Sometimes, however, you know the
+sequences are related, and searching against libraries of related
+sequences can confuse FASTA if you do not use \texttt{-z 11}. If all
+you want are scores and alignments, use \texttt{-z -1} to turn off
+statistical estimates.
+
+\vspace{0.5ex}
+{\noindent}\textbf{Where are} \texttt{prss} {\noindent}\textbf{and} \texttt{prfx}? -- Earlier FASTA3
+releases included \texttt{prss3} and \texttt{prfx3}. With FASTA
+version 35 and 36, these programs have been incorporated into
+\texttt{ssearch36} and \texttt{fastx36}. FASTA version 35 and 36
+programs now automatically estimate statistical parameters by
+shuffling - the function of \texttt{prss} and \texttt{prfx}, when
+searching libraries with fewer than 500 members.
+
+\vspace{0.5ex}
+{\noindent}\textbf{Where is} \texttt{tfasta}? -- Although it is possible to make
+\texttt{tfasta36}, it is not compiled by default. \texttt{tfastx36}
+and \texttt{tfasty36} allow frame-shifts to be joined into a single
+alignment; \texttt{tfasta} did not. \texttt{tfastx36} produces better
+alignments with better statistics.
+
+\vspace{0.5ex}
+{\noindent}\textbf{Can I run the FASTA programs on a cluster}? -- With version
+36.3.4, almost all of the FASTA programs can be run on clusters of
+computers using MPI (Message Passing Interface). The programs can
+be compiled using \texttt{make -f ../make/Makefile.mpi\_sse2} from the
+\texttt{fasta36/src} directory. Except for \texttt{lalign36}, all the
+programs in Table I are available as \texttt{fasta36\_mpi},
+\texttt{ssearch36\_mpi}, etc.
+
+Unfortunately, the current MPI implementation involves substantially
+more communications overhead than the threaded versions. The FASTA
+programs are very efficient on threaded machines; if the preload
+option is used (edit \texttt{make/Makefile36m.common} to use
+\texttt{comp\_lib8.c}), the FASTA programs can obtain more than 40-fold speedup on a 48-core machine (the largest I have tested).
+
+\vspace{0.5ex}
+{\noindent}\textbf{Sometimes, in the list of best scores, the same sequence is
+ shown twice with exactly the same score. Sometimes, the sequence is
+ there twice, but the scores are slightly different}? -- When any of
+the FASTA programs searches a long sequence, it breaks the sequence up
+into \emph{overlapping} pieces. If the highest scoring alignment is
+at the end of one piece, it will be scored again at the beginning of
+the next piece. If the alignment is not completely included in the
+overlap region, one of the pieces will give a higher score than the
+other. These duplications can be detected by looking at the
+coordinates of the alignment. If either the beginning or end
+coordinate is identical in two alignments, the alignments are at least
+partially duplicates.
+
+\vspace{2ex}
+As always, please inform me of bugs as soon as possible.
+
+\begin{quote}
+William R. Pearson\\
+Department of Biochemistry\\
+Jordan Hall Box 800733\\
+U. of Virginia\\
+Charlottesville, VA\\
+wrp at virginia.EDU
+\end{quote}
+
+\bibliographystyle{plain}
+\bibliography{fasta_guide}
+
+\appendix
+\section*{Appendix}
+
+\section{FASTA Makefile compile time options}
+
+\begin{table}
+\caption{\label{make-defs}FASTA \texttt{Makefile} compile time \texttt{\#defines}}
+\vspace{1.0ex}
+\begin{tabular}{l l p{1.00 in} p{3.0 in}}
+\hline\\[-1.2ex]
+\texttt{\#define} & Status$^*$ & Target file(s) & Function \\[1.0ex]
+\hline\\[-1.5ex]
+\texttt{ALLOCN0} & obs & \texttt{dropnfa.c}, \texttt{dropfx.c}, \texttt{dropfz2.c} & allows FASTA algorithm to use memory $\sim$ query length (n0), not query $+$ library (n0+n1). \\
+\texttt{DNALIB\_LC} & undef & \texttt{initfa.c} & enable lower case masking for DNA libraries \\
+\texttt{HTML\_HEAD} & undef & \texttt{comp\_lib5e.c}, \texttt{comp\_lib8.c} & wrap \texttt{-m 6} HTML output with \texttt{<html> <body> </body> </html>} \\
+\texttt{M10\_CONS} & def & \texttt{c\_dispn.c} & show consensus line (\texttt{:. }) with \texttt{-m 10} output. \\
+\texttt{OLD\_FASTA\_GAP} & undef & \texttt{drop*.c} & use first-residue/additional residue penalties, not open/extend. \\
+\texttt{PGM\_DOC} & def & \texttt{comp\_lib5e.c}, \texttt{comp\_lib8.c} & provide \texttt{\#pgm\_name -opt1 -opt2 query file} copy of command line \\
+\texttt{PROGRESS} & def & \texttt{comp\_lib5e.c}, \texttt{comp\_lib8.c} & provide progress symbols in interactive mode \\
+\texttt{SAMP\_STATS} & def & \texttt{comp\_lib5e.c}, \texttt{comp\_lib8.c} & scores are sampled for statistical estimates \\
+\texttt{SAMP\_STATS\_LESS} & def & \texttt{compacc.c} & a slower sampling strategy is used \\
+\texttt{SHOW\_ALIGN\_SCORE} & undef & \texttt{wm\_align.c} & print score, cumulative score, during alignment (for teaching) \\
+\texttt{SHOW\_HELP} & def & \texttt{comp\_lib5e.c}, \texttt{comp\_lib8.c}, \texttt{initfa.c}, \texttt{doinit.c} & print out help information with '-help', or no arguments given. Undef \texttt{SHOW\_HELP} reverts to pre-\texttt{fasta-35.4.4}.\\
+\texttt{SHOW\_HIST} & undef & \texttt{doinit.c} & inverts current meaning of \texttt{-H} (shows by default for non-PCOMPLIB (MPI) programs). \\
+\texttt{SHOWSIM} & def & \texttt{mshowbest.c} \texttt{mshowalign2.c} & display percent similarity \\
+\texttt{USE\_LNSTATS} & obs & \texttt{scaleswn.c} & use $ln()$-scaling for scores, removed in \texttt{fasta2.0}.\\
+\hline \\
+\end{tabular}
+$^*$Status: def: \#defined in standard \texttt{Makefiles}; undef: undefined; obs: obsolete, provided for backwards compatibility with FASTA2.0 or earlier.
+\end{table}
+
+The \texttt{fasta-36/make} directory includes \texttt{Makefile}s
+appropriate for a broad range of environments, including Linux/Unix,
+BSD, MacOSX, and Windows. Makefiles are regularly tested against
+MacOSX, Linux, and Windows. Table \ref{make-defs} summarizes the
+Makefile options that can be modified.
+
+As distributed, the \texttt{Makefiles} in \texttt{fasta36/make}, build
+a version of the FASTA programs that is optimized for single searches
+against arbitrary sized databases, using bit scores, efficient sampled
+statistics, and gap-open/extend penalties. The default compilation
+configuration can be changed either by changing the compile time
+defines (Table \ref{make-defs}) in the main \texttt{Makefile},
+e.g. \texttt{make/Makefile.linux64\_sse2}, or by editing
+\texttt{make/Makefile36m.common}.
+
+\emph{High-performance searches with many queries} -- By default, the
+\texttt{comp\_lib5e.c} program specified in
+\texttt{Makefile36m.common} builds FASTA programs that re-read the
+library sequence database for every query sequence. This has the
+advantage that sequence comparison begins almost immediately, but if
+thousands of searches are being performed, the database is re-read
+thousands of times. \texttt{Makefile36m.common} can be edited to use
+\texttt{comp\_lib8.c} in place of \texttt{comp\_lib5e.c} and the
+database is read only once, then held in memory for additional
+searches. Of course, if \texttt{comp\_lib8.c} is used, the computer
+must have enough memory to store the complete database. Keeping the
+database in memory allows the FASTA programs to very efficiently use
+large, multicore computers.
+
+\needspace{5\baselineskip}
+\emph{Parallel searches with MPI} -- By default under
+Unix/Linux/MacOSX, the FASTA programs are threaded; they will spawn as
+many threads as CPU cores are available (this can be limited with the
+\texttt{-t n-threads} option). Using \texttt{comp\_lib8.c}, we see
+almost 48-fold speedup on a 48-core machine. The FASTA programs can
+also be run in parallel in the MPI environment on clusters of
+computers. To build the MPI versions of the programs, use
+\texttt{make -f ../make/Makefile.mpi\_sse2 ssearch36\_mpi},
+\texttt{fastx36\_mpi}, etc. The MPI programs currently have substantially
+more communications overhead than the threaded versions, so they may
+not scale as well to large clusters.
+
+\include{fasta.history}
+\end{document}
diff --git a/doc/fasta_versions.html b/doc/fasta_versions.html
new file mode 100644
index 0000000..14b186f
--- /dev/null
+++ b/doc/fasta_versions.html
@@ -0,0 +1,101 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head><title>Major FASTA versions</title>
+<style type="text/css">
+body { margin-left: 6px; }
+.sidebar {
+font-size: 12px; font-family: sans-serif; text-decoration:none; background-color: #FFFFCC; }
+.fasta { font-family: sans-serif; }
+.fasta h2 { font-size: 16px; color: #000000 }
+.fasta h3 { font-size: 14px; color: #000000 }
+.fasta td {background-color: #FFFFCC }
+.fasta a { text-decoration: none; }
+.fasta li { margin-left:-1em }
+</style>
+</head>
+<body>
+<div class=fasta>
+<h2>FASTA version summary</h2>
+<code> $Id: fasta_versions.html 256 2010-03-29 13:41:48Z wrp $</code>
+<hr>
+<b>March, 2010</b>
+<p />
+Currently, there are five different versions of the FASTA programs that can be downloaded from the <a href="http://faculty.virginia.edu/wrpearson/fasta">FASTA Software WWW site</a>. I recommend that you use either <a href="fasta36/">fasta-36</a>, the very latest version, or <a href="CURRENT/">fasta-35</a>, the "classic" version that has been available since 2007.
+<p />
+Here I try to explain the major differences.
+<p />
+<hr />
+<table>
+<tr><th>Version</th><th>New Programs</th><th>Other changes</th></tr>
+<tr>
+<td><a href="fasta36/">fasta-36</a></td><td> None </td><td>
+The major improvement in FASTA v36 is the ability to calculate and
+display multiple significant alignments (multiple HSP's) between a
+query sequence and a library sequence. Previous FASTA versions had
+the serious shortcoming of only showing the best
+alignment. A <tt>tfastx36</tt> alignment between a protein and its
+exon-containing gene will show all of the exon alignments that are
+long enough to be significant.
+<p>
+FASTA v36 also provides more flexible strategies for searching lists
+of database files, and versions after fasta-36.2 are fully threaded, so
+both searches and alignments can be distributed among multiple
+processors with efficient speedup.
+<p>
+PVM/MPI parallel versions of FASTA v36 are under development.
+</td>
+</tr>
+<tr><td colspan=3><hr /></td></tr>
+<tr>
+<td><a href="fasta3/">fasta-35</a></td>
+<td><code>+lalign35</code><br /><code>+ggsearch35</code><br /><code>+glsearch35</code><br /><code>-prss3</code><br /><code>-prfx3</code>
+</td>
+<td>FASTA v35 provides significant improvements in statistical accuracy and program efficiency in threaded (multi-CPU, multi-core) environments. The program now automatically produces 500 random shuffles when small libraries are searched. Thus, <code>prss</code> and <code>prfx</code> are no longer required; <code>ssearch35</code> and <code>fastx35</code> provide the same function.</td>
+</tr>
+<tr><td colspan=3><hr /></td></tr>
+<tr>
+<td><a href="fasta3/">fasta34</a></td>
+<td><code>+ssearch3</code><br /><code>+fasts3/tfasts3</code><br /><code>+fastf3/tfastf3</code><br /><code>-lfasta</code><br /><code>-lalign</code><br /><code>-tfasta</code>
+</td>
+<td>FASTA v34 is the last version of the FASTA3 series before the
+ significant changes introduced with FASTA v35. The FASTA v3
+ programs were first introduced in 1996, with threaded code
+ for multi-processors, more accurate statistical estimates, and
+ optimal Smith-Waterman alignments with <code>ssearch</code>. More
+ recent versions (v34) provided accelerated Smith-Waterman searches
+ using Altivec and SSE2 vector processors. FASTA v34 is no longer
+ updated for bug fixes. </td>
+</tr>
+<tr><td colspan=3><hr /></td></tr>
+<tr>
+<td><a href="fasta2/">fasta2</a></td>
+<td><code>+lfasta</code><br /><code>+tfasta</code>
+</td>
+<td>FASTA v2 was introduced in 1995, improving sensitivity by
+ calculating gapped alignment scores. Modern (V34 and V35) versions of
+ FASTA have significantly more robust statistical estimates, threaded
+ code, and vectorized Smith-Waterman, so FASTA v2 should not be used
+ for database searching. However, until <code>lalign</code> was added
+ to FASTA v35 in March, 2007, FASTA v2 was the only source for
+ the <code>lalign</code> program. Today, the only programs provided in
+ FASTA v2 that are not provided in FASTA v35 are the Kyte-Doolittle
+ hydropathy plotter <code>grease</code> and the classic (though very
+ inaccurate) secondary structure prediction programs <code>chofas</code>
+ and <code>garnier</code></td>
+</tr>
+</table>
+<p />
+The current stable version of the FASTA programs is version 35,
+and older releases of version 35
+are <a href="fasta3/">available</a>
+to make it easier for software packagers to work with a consistent
+version of the software. However,
+the <a href="CURRENT/">CURRENT</a>
+version of FASTA should be used whenever possible. Many bug reports
+reflect older versions of the software.
+<hr /> <p /> Description of minor changes: <a href="changes_v35.html">changes_v35.html</a> <p />
+<hr />
+</div></body></html>
diff --git a/doc/fastf3.1 b/doc/fastf3.1
new file mode 100644
index 0000000..91bd7ff
--- /dev/null
+++ b/doc/fastf3.1
@@ -0,0 +1,176 @@
+.TH FASTF/TFASTFv3 1 local
+.SH NAME
+fastf3, fastf3_t \- compare a mixed peptide sequence against a protein
+database using a modified fasta algorithm.
+
+tfastf3, tfastf3_t \- compare a mixed peptide sequence against a
+translated DNA database.
+
+.SH DESCRIPTION
+
+.B fastf3
+and
+.B tfastf3
+are designed to compare a sequence of mixed peptides to a protein
+(fastf3) or translated DNA (tfastf3) database. Unlike the traditional
+.B fasta3
+search, which uses a protein or DNA sequence,
+.B fastf3
+and
+.B tfastf3
+work with a query sequence of the form:
+.in +5
+.nf
+>testf from mgstm1
+MGCEN,
+MIDYP,
+MLLAY,
+MLLGY
+.fi
+.in 0
+This sequence indicates that a mixture of four peptides has been
+found, with 'M' in the first position of each one (as from a CNBr
+cleavage), in the second position 'G', 'I', or 'L' (twice), at the
+third position 'C', 'D', or 'L' (twice), at the fourth position 'E',
+'Y', 'A', or 'G', etc. When this sequence is compared against mgstm1.aa
+(included with the distribution), the mixture is deconvolved to form:
+.nf
+.ft C
+.in +5
+testf MILGY-----------MLLEY-----------MGDAP-----------
+ ::::: ::::: :::::
+GT8.7 MPMILGYWNVRGLTHPIRMLLEYTDSSYDEKRYTMGDAPDFDRSQWLNEK
+ 10 20 30 40 50
+
+testf --------------------------------------------------
+
+GT8.7 FKLGLDFPNLPYLIDGSHKITQSNAILRYLARKHHLDGETEEERIRADIV
+ 60 70 80 90 100
+
+ 20
+testf ------------MLCYN
+ :::::
+GT8.7 ENQVMDTRMQLIMLCYNPDFEKQKPEFLKTIPEKMKLYSEFLGKRPWFAG
+ 110 120 130 140 150
+.in 0
+.ft P
+.fi
+.SH Options
+.LP
+.B fastf3
+and
+.B tfastf3
+can accept a query sequence from the unix "stdin" data stream. This makes it much
+easier to use fasta3 and its relatives as part of a WWW page. To
+indicate that stdin is to be used, use "-" or "@" as the query
+sequence file name.
+.TP
+\-b #
+number of best scores to show (must be < -E cutoff)
+.TP
+\-d #
+number of best alignments to show ( must be < -E cutoff)
+.TP
+\-D
+turn on debugging mode. Enables checks on sequence alphabet that
+cause problems with tfastx3, tfasty3, tfasta3.
+.TP
+\-E #
+Expectation value limit for displaying scores and
+alignments. Expectation values for
+.B fastf3
+and
+.B tfastf3
+are not as accurate as those for the other
+.B fasta3
+programs.
+.TP
+\-H
+turn off histogram display
+.TP
+\-i
+compare against only the reverse complement of the library sequence.
+.TP
+\-L
+report long sequence description in alignments
+.TP
+\-m 0,1,2,3,4,5,6,10
+alignment display options
+.TP
+\-n
+force query to nucleotide sequence
+.TP
+\-N #
+break long library sequences into blocks of # residues. Useful for
+bacterial genomes, which have only one sequence entry. -N 2000 works
+well for bacterial genomes.
+.TP
+\-O file
+send output to file
+.TP
+\-q/-Q
+quiet option; do not prompt for input
+.TP
+\-R file
+save all scores to statistics file
+.TP
+\-S #
+offset substitution matrix values by a constant #
+.TP
+\-s name
+specify substitution matrix. BLOSUM50 is used by default;
+PAM250, PAM120, and BLOSUM62 can be specified by setting -s P120,
+P250, or BL62. With this version, many more scoring matrices are
+available, including BLOSUM80 (BL80), and MDM_10, MDM_20, MDM_40 (M10,
+M20, M40). Alternatively, BLASTP1.4 format scoring matrix files can be
+specified.
+.TP
+\-T #
+(threaded, parallel only) number of threads or workers to use (set by
+default to 4 at compile time).
+.TP
+\-t #
+Translation table - tfastf3 can use the BLAST translation tables. See
+\fChttp://www.ncbi.nih.gov/htbin-post/Taxonomy/wprintgc?mode=c/\fP.
+.TP
+\-w #
+line width for similarity score, sequence alignment, output.
+.TP
+\-x "#,#"
+offsets query, library sequence for numbering alignments
+.TP
+\-z #
+Specify statistical calculation. Default is -z 1, which uses
+regression against the length of the library sequence. -z 0 disables
+statistics. -z 2 uses the ln() length correction. -z 3 uses Altschul
+and Gish's statistical estimates for specific protein BLOSUM scoring
+matrices and gap penalties. -z 4: an alternate regression method.
+.TP
+\-Z db_size
+Set the apparent database size used for expectation value calculations.
+.TP
+\-1
+Sort by "init1" score.
+.TP
+\-3
+(TFASTF3 only) use only forward frame translations
+.SH Environment variables:
+.TP
+FASTLIBS
+location of library choice file (-l FASTLIBS)
+.TP
+SMATRIX
+default scoring matrix (-s SMATRIX)
+.TP
+SRCH_URL
+the format string used to define the option to re-search the
+database.
+.TP
+REF_URL
+the format string used to define the option to lookup the library
+sequence in entrez, or some other database.
+
+.SH AUTHOR
+Bill Pearson
+.br
+wrp at virginia.EDU
diff --git a/doc/fasts3.1 b/doc/fasts3.1
new file mode 100644
index 0000000..74af8f2
--- /dev/null
+++ b/doc/fasts3.1
@@ -0,0 +1,169 @@
+.TH FASTS/TFASTSv3 1 local
+.SH NAME
+fasts3, fasts3_t \- compare several short peptide sequences against a protein
+database using a modified fasta algorithm.
+
+tfasts3, tfasts3_t \- compare short peptides against a
+translated DNA database.
+
+.SH DESCRIPTION
+
+.B fasts3
+and
+.B tfasts3
+are designed to compare a set of (presumably non-contiguous) peptides to
+a protein (fasts3) or translated DNA (tfasts3) database.
+fasts3/tfasts3 are designed particularly for short peptide data from
+mass-spec analysis of protein digests. Unlike the traditional
+.B fasta3
+search, which uses a protein or DNA sequence,
+.B fasts3
+and
+.B tfasts3
+work with a query sequence of the form:
+.in +5
+.nf
+>tests from mgstm1
+MLLE,
+MILGYW,
+MGADP,
+MLCYNP
+.fi
+.in 0
+This sequence indicates that four peptides are to be used. When this
+sequence is compared against mgstm1.aa (included with the
+distribution), the result is:
+.nf
+.ft C
+.in +5
+testf MILGYW----------MLLE------------MGDAP-----------
+ :::::: :::: :::::
+GT8.7 MPMILGYWNVRGLTHPIRMLLEYTDSSYDEKRYTMGDAPDFDRSQWLNEK
+ 10 20 30 40 50
+
+testf --------------------------------------------------
+
+GT8.7 FKLGLDFPNLPYLIDGSHKITQSNAILRYLARKHHLDGETEEERIRADIV
+ 60 70 80 90 100
+
+ 20
+testf ------------MLCYNP
+ ::::::
+GT8.7 ENQVMDTRMQLIMLCYNPDFEKQKPEFLKTIPEKMKLYSEFLGKRPWFAG
+ 110 120 130 140 150
+.in 0
+.ft P
+.fi
+.SH Options
+.LP
+.B fasts3
+and
+.B tfasts3
+can accept a query sequence from the unix "stdin" data stream. This makes it much
+easier to use fasta3 and its relatives as part of a WWW page. To
+indicate that stdin is to be used, use "-" or "@" as the query
+sequence file name.
+.TP
+\-b #
+number of best scores to show (must be < -E cutoff)
+.TP
+\-d #
+number of best alignments to show ( must be < -E cutoff)
+.TP
+\-D
+turn on debugging mode. Enables checks on sequence alphabet that
+cause problems with tfastx3, tfasty3, tfasta3.
+.TP
+\-E #
+Expectation value limit for displaying scores and
+alignments. Expectation values for
+.B fasts3
+and
+.B tfasts3
+are not as accurate as those for the other
+.B fasta3
+programs.
+.TP
+\-H
+turn off histogram display
+.TP
+\-i
+compare against only the reverse complement of the library sequence.
+.TP
+\-L
+report long sequence description in alignments
+.TP
+\-m 0,1,2,3,4,5,6,9,10
+alignment display options
+.TP
+\-N #
+break long library sequences into blocks of # residues. Useful for
+bacterial genomes, which have only one sequence entry. -N 2000 works
+well for bacterial genomes.
+.TP
+\-O file
+send output to file
+.TP
+\-q/-Q
+quiet option; do not prompt for input
+.TP
+\-R file
+save all scores to statistics file
+.TP
+\-S #
+offset substitution matrix values by a constant #
+.TP
+\-s name
+specify substitution matrix. BLOSUM50 is used by default;
+PAM250, PAM120, and BLOSUM62 can be specified by setting -s P120,
+P250, or BL62. With this version, many more scoring matrices are
+available, including BLOSUM80 (BL80), and MDM_10, MDM_20, MDM_40 (M10,
+M20, M40). Alternatively, BLASTP1.4 format scoring matrix files can be
+specified.
+.TP
+\-T #
+(threaded, parallel only) number of threads or workers to use (set by
+default to 4 at compile time).
+.TP
+\-t #
+Translation table - tfasts3 can use the BLAST translation tables. See
+\fChttp://www.ncbi.nih.gov/htbin-post/Taxonomy/wprintgc?mode=c/\fP.
+.TP
+\-w #
+line width for similarity score, sequence alignment, output.
+.TP
+\-x "#,#"
+offsets query, library sequence for numbering alignments
+.TP
+\-z #
+Specify statistical calculation. Default is -z 1, which uses
+regression against the length of the library sequence. -z 0 disables
+statistics. -z 2 uses the ln() length correction. -z 3 uses Altschul
+and Gish's statistical estimates for specific protein BLOSUM scoring
+matrices and gap penalties. -z 4: an alternate regression method.
+.TP
+\-Z db_size
+Set the apparent database size used for expectation value calculations.
+.TP
+\-3
+(TFASTS3 only) use only forward frame translations
+.SH Environment variables:
+.TP
+FASTLIBS
+location of library choice file (-l FASTLIBS)
+.TP
+SMATRIX
+default scoring matrix (-s SMATRIX)
+.TP
+SRCH_URL
+the format string used to define the option to re-search the
+database.
+.TP
+REF_URL
+the format string used to define the option to lookup the library
+sequence in entrez, or some other database.
+
+.SH AUTHOR
+Bill Pearson
+.br
+wrp at virginia.EDU
diff --git a/doc/map_db.1 b/doc/map_db.1
new file mode 100644
index 0000000..173f119
--- /dev/null
+++ b/doc/map_db.1
@@ -0,0 +1,45 @@
+.TH MAP_DB "September, 1999"
+.SH NAME
+.B map_db
+\- read a FASTA (0), GENBANK flat file (1) PIR/VMS (5) or GCG binary
+(6) sequence database and produce the offsets necessary for efficient
+memory mapping.
+.SH SYNOPSIS
+.B map_db
+[-n] filename | "filename libtype"
+.SH DESCRIPTION
+.B map_db
+.I filename
+reads the sequence database in
+.I filename
+and produces a new file
+.I filename.xin
+with the offset information necessary for efficient memory mapping.
+.LP
+The programs in fasta version 32t08 can use memory mapped i/o to load
+sequence database files and read them efficiently. Memory mapping is
+used only if a "\c
+.I .xin\c
+\&" file is available. The "\c
+.I .xin\c
+\&" file is created by
+.B map_db\c
+\&.
+.LP
+In addition to
+.B map_db\c
+\&,
+.B list_db
+is available to display the database size, etc, and set of offsets calculated
+by
+.B map_db\c
+\&.
+.SH OPTIONS
+.TP
+\-n
+Read file as DNA database.
+.SH BUGS
+.SH AUTHOR
+Bill Pearson
+.br
+wrp at virginia.EDU
diff --git a/doc/prss3.1 b/doc/prss3.1
new file mode 100644
index 0000000..dd407ce
--- /dev/null
+++ b/doc/prss3.1
@@ -0,0 +1,170 @@
+.TH PRSS3 1 local
+.SH NAME
+prss \- test a protein sequence similarity for significance
+.SH SYNOPSIS
+.B prss34
+\&[-Q -A -f # -g # -H -O file -s SMATRIX -w # -Z #
+.I -k # -v #
+]
+sequence-file-1 sequence-file-2
+[
+.I #-of-shuffles
+]
+
+.B prfx34
+\&[-Q -A -f # -g # -H -O file -s SMATRIX -w # -z 1,3 -Z #
+.I -k # -v #
+]
+sequence-file-1 sequence-file-2
+[
+.I ktup
+]
+[
+.I #-of-shuffles
+]
+
+.B prss34(_t)/prfx34(_t)
+[-AfghksvwzZ]
+\- interactive mode
+
+.SH DESCRIPTION
+.B prss34
+and
+.B prfx34
+are used to evaluate the significance of a protein:protein, DNA:DNA
+(
+.B prss34
+), or translated-DNA:protein (
+.B prfx34
+) sequence similarity score
+by comparing two sequences and calculating optimal similarity scores,
+and then repeatedly shuffling the second sequence, and calculating
+optimal similarity scores using the Smith-Waterman algorithm. An
+extreme value distribution is then fit to the shuffled-sequence
+scores. The characteristic parameters of the extreme value
+distribution are then used to estimate the probability that each of
+the unshuffled sequence scores would be obtained by chance in one
+sequence, or in a number of sequences equal to the number of shuffles.
+This program is derived from
+.B rdf2\c
+\&, described by Pearson and Lipman, PNAS (1988) 85:2444-2448, and
+Pearson (Meth. Enz. 183:63-98). Use of the extreme value
+distribution for estimating the probabilities of similarity scores was
+described by Altschul and Karlin, PNAS (1990) 87:2264-2268. The
+'z-values' calculated by rdf2 are not as informative as the P-values
+and expectations calculated by prdf.
+.B prss34
+calculates optimal scores using the same rigorous Smith-Waterman
+algorithm (Smith and Waterman, J. Mol. Biol. (1983) 147:195-197) used by the
+.B ssearch34
+program.
+.B prfx34
+calculates scores using the FASTX algorithm (Pearson et al. (1997) Genomics 46:24-36).
+.PP
+.B prss34
+and
+.B prfx34
+also allow a more sophisticated shuffling method: residues can be shuffled
+within a local window, so that the order of residues 1-10, 11-20, etc,
+is destroyed but a residue in the first 10 is never swapped with a residue
+outside the first ten, and so on for each local window.
+.SH EXAMPLES
+.TP
+(1)
+.B prss34
+\& -v 10 musplfm.aa lcbo.aa
+.PP
+Compare the amino acid sequence in the file musplfm.aa with that
+in lcbo.aa, then shuffle lcbo.aa 200 times using a local shuffle with
+a window of 10. Report the significance of the
+unshuffled musplfm/lcbo comparison scores with respect to the shuffled
+scores.
+.TP
+(2)
+.B prss34
+musplfm.aa lcbo.aa 1000
+.PP
+Compare the amino acid sequence in the file musplfm.aa with the sequences
+in the file lcbo.aa, shuffling \fClcbo.aa\fP 1000 times. Shuffles can also be specified with the -k # option.
+.TP
+(3)
+.B prfx34
+mgstm1.esq xurt8c.aa 2 1000
+.PP
+Translate the DNA sequence in the \fCmgstm1.esq\fP file in all six
+frames and compare it to the amino acid sequence in the file
+\fCxurt8c.aa\fP, using ktup=2 and shuffling \fCxurt8c.aa\fP 1000
+times. Each comparison considers the best forward or reverse
+alignment with frameshifts, using the fastx algorithm (Pearson et al
+(1997) Genomics 46:24-36).
+.TP
+(4)
+.B prss34/prfx34
+.PP
+Run prss in interactive mode. The program will prompt for the file
+name of the two query sequence files and the number of shuffles to be
+used.
+.SH OPTIONS
+.PP
+.B prss34/prfx34
+can be directed to change the scoring matrix, gap penalties, and
+shuffle parameters by entering options on the command line (preceded
+by a `\-'). All of the options should precede the file names and number of
+shuffles.
+.TP
+\-A
+Show unshuffled alignment.
+.TP
+\-f #
+Penalty for opening a gap (-10 by default for proteins).
+.TP
+\-g #
+Penalty for additional residues in a gap (-2 by default) for proteins.
+.TP
+\-H
+Do not display histogram of similarity scores.
+.TP
+\-k #
+Number of shuffles (200 is the default)
+.TP
+\-Q -q
+"quiet" - do not prompt for filename.
+.TP
+\-O filename
+send copy of results to "filename."
+.TP
+\-s str
+specify the scoring matrix. BLOSUM50 is used by default for proteins;
++5/-4 is used by default for DNA.
+.B prss34
+recognizes the same scoring matrices as fasta34, ssearch34, fastx34, etc;
+e.g. BL50, P250, BL62, BL80, MD10, MD20, and other matrices in BLAST1.4
+matrix format.
+.TP
+\-v #
+Use a local window shuffle with a window size of #.
+.TP
+\-z #
+Calculate statistical significance using the mean/variance
+(moments) approach used by fasta34/ssearch or from maximum likelihood
+estimates of lambda and K.
+.TP
+\-Z #
+Present statistical significance as if a '#' entry database had
+been searched (e.g. "-Z 50000" presents statistical significance as if
+50,000 sequences had been compared).
+.SH ENVIRONMENT VARIABLES
+.PP
+.B (SMATRIX)
+the filename of an alternative scoring matrix file. For protein
+sequences, BLOSUM50 is used by default; PAM250 can be used with the
+command line option
+.B -s P250\c
+(or with -s pam250.mat). BLOSUM62 (-s BL62) and PAM120 (-s P120) are also available.
+.SH "SEE ALSO"
+ssearch3(1), fasta3(1).
+.SH AUTHOR
+Bill Pearson
+.br
+wrp at virginia.EDU
+
diff --git a/doc/ps_lav.1 b/doc/ps_lav.1
new file mode 100644
index 0000000..c3bd02f
--- /dev/null
+++ b/doc/ps_lav.1
@@ -0,0 +1,20 @@
+.TH PS_LAV 1 local
+.SH NAME
+ps_lav [-B] [-Z db_size] \- plot an "lav" file in postscript.
+.SH DESCRIPTION
+ps_lav is a simple program to take "lav" format output from "lalign35"
+and produce postscript alignment plots (which look like "dot-plots").
+It was designed to work with the output of the \fClalign35\fP program,
+but should work with other \fClav\fP output as well.
+.TP
+\-B
+color alignment lines using the "bit" score.
+.TP
+\-Z db_size
+set the effective database size for an E()-value calculation to db_size,
+and color alignment lines using E()-values. Requires bit scores in
+the \fClav\fP output.
+.SH AUTHOR
+Bill Pearson
+.br
+wrp at virginia.EDU
diff --git a/doc/readme.v30 b/doc/readme.v30
new file mode 100644
index 0000000..f445ec0
--- /dev/null
+++ b/doc/readme.v30
@@ -0,0 +1,38 @@
+
+Because of interdependencies in the Makefile, sometimes you must
+type "make" a second time to get everything built.
+
+June 12, 1996 - fasta30t1
+
+ Fixed bug in reading blast-format DNA sequence files.
+ Fixed core-dump for some large libraries on some machines.
+
+June 19, 1996 - fasta30t2
+
+ Fixed a serious bug in the Smith-Waterman alignment routines used
+ by both fasta3 (dropnfa.c) and ssearch3 (dropgsw.c) that caused
+ the amount of memory required to depend on the library sequence
+ size, rather than the query sequence size.
+
+ Fixed some memory-overwrite errors in showalign.c
+
+June 27, 1996 - fasta30t3
+
+ Found and fixed bugs in comp_thr.c and nxgetaa.c that caused core
+ dumps when reading DNA libraries with long sequences in fasta
+ format.
+
+July 6, 1996 - fasta30t4
+
+ ibm_pthread_subs.c available, Makefile.ibm for multiprocessor
+ IBM RS/6000 AIX systems.
+
+ Finally (?) fixed the previous bug that caused core dumps when
+ reading DNA libraries in fasta format.
+
+ Corrections to the fastx algorithm.
+
+July 10, 1996
+
+ Fixed reading of compressed GCG DNA format.
+
diff --git a/doc/readme.v30t6 b/doc/readme.v30t6
new file mode 100644
index 0000000..bb35f83
--- /dev/null
+++ b/doc/readme.v30t6
@@ -0,0 +1,74 @@
+
+>>August 24, 1996
+
+New programs - tfastx3, tfastx3_t, compare a protein sequence to
+forward and reverse translations of a DNA sequence database. An excellent
+replacement for tfasta3.
+
+Sun multiprocessing - change in thr_create() to use all CPU's if available.
+
+GCG formats - now can search with simple GCG-format query sequences and
+results with GCG format Swissprot and Genpept are more readable.
+
+>>August 26, 1996
+
+Fixed bugs in tfastx3(_t) and fastx3(_t) including an ancient problem
+with aatran(). Less redundancy in gcg_ranlib().
+
+
+>>August 31, 1996
+
+Included support for BLOSUM62 (-s BL62) as per documentation.
+
+Rearranged Makefile's so that they would make everything in one pass.
+
+>>September 6, 1996
+
+Corrected yet another problem with the fastx/tfastx code.
+
+Noticed that searching without optimized scores gave no optimized
+scores on the final list of scores - fixed this.
+
+The pvm version now does alignments - not thoroughly tested.
+
+>>September 13, 1996
+
+Fixed display of best scores to stdout.
+
+Fixed problem with alignments when -o flag used.
+
+pvcompfa/pvcompsw have now been tested on DEC Alpha, Solaris X86, and
+SGI PVM implementations. Several bugs were corrected.
+
+>>September 18, 1996
+
+Fixed bug in selectbestz() that caused core dumps in pvcomplib.c
+(changes to pvcomplib.c, comp_thr.c, complib.c).
+
+>>September 23, 1996
+
+Corrected showalign.c/pvm_showalign.c addressing bug found and fixed
+by Erik Wallin. (erikw at biokemi.su.se).
+
+>>October 15, 1996
+
+Corrected bug so alternative scoring matrices are used.
+
+>>October 22, 1996
+
+Remove singularities from regression routine.
+
+-z 0 now means no statistics (same as -z -1).
+
+No longer show alignment for 0 score.
+
+>>October 26, 1996
+
+Fix problem with -b, -d when Z-values disabled.
+
+>>November 1, 1996
+
+Altschul-Gish statistical estimates (-z 3) now work properly.
+
+Fix problem with mean_var==0.0.
+
diff --git a/doc/readme.v30t7 b/doc/readme.v30t7
new file mode 100644
index 0000000..42682d1
--- /dev/null
+++ b/doc/readme.v30t7
@@ -0,0 +1,175 @@
+>> October 30, 1996
+
+A new program, sc_to_e, can be used to calculate expectation values
+from the regression coefficients reported from a search. The
+expectation value is based on similarity score, sequence length, and
+database size.
+
+>> November 8, 1996
+
+fasta30t7 differs from fasta30t6 in the amount of information provided
+with the -m 10 option.
+
+(1) The query and library sequence identifiers are no longer abbreviated.
+
+(2) New information about the program and program version are provided:
+
+The new information provided is:
+
+ mp_name: program name (actually argv[0])
+ mp_ver: main program version (can be different from function version)
+ mp_argv: command line arguments (duplicates argv[0])
+
+ Some statistical information is provided as well:
+ mp_extrap: XXX YYY - statistics extrapolated from XXX to YYY
+ mp_stats: indicates type of statistics used for E() value
+ mp_KS: Kolmogorov-Smirnov statistic
+
+The "mp_" (main program) information is function independent, while the "pg_"
+information is produced by a particular comparison function (ssearch,
+fastx, fasta, etc). "pg_" should probably be called "fn_", and "mp_"
+called "pg_", but I remain backwards compatible.
+
+(3) The end of the "parseable" records is denoted with:
+
+ >>><<<
+
+(4) There is now a compile-time option -DM10_CONS, that allows you to
+display a final alignment summary:
+
+;al_cons:
+ .::.:- .:: .. :. .:.---: : .--.:. :
+.. .--- ..: :: ... :..: .::.:. . .---. . .:
+ : . . . : .. . :..: .--. . : .:. .. : .
+ .:.::: ..:. :
+
+or, if M10_CONS_L is defined (in addition to M10_CONS), the output is:
+;al_cons:
+ p==p=-mmmp==mpzmm=pmmmmz=p---=mmm=mmp--p=zm=m
+pzmmp---mmzp=m==mzzzm=zp=mz==z=pmzmmz---pmmpmmmp=m
+m=mzmmzmpm=mmmmppmmmpmmmm=pp=mp--pmpm=mp=pmzzm=mmp
+mp=z===mmpz=zm=
+
+where '=' indicates identical residues, '-' a gap in one or the other
+sequence, 'p' indicates a positive pam value, 'm' indicates a negative
+pam value, and 'z' indicates a zero pam value.
+
+A typical run now looks like:
+
+>>>gtm1_mouse.aa, 217 aa vs s library
+; mp_name: fasta3_t
+; mp_ver: version 3.0t7 November, 1996
+; mp_argv: fasta3_t -q -m 10 gtm1_mouse.aa s
+; pg_name: FASTA
+; pg_ver: 3.06 Sept, 1996
+; pg_matrix: BL50
+; pg_gap-pen: -12 -2
+; pg_ktup: 2
+; pg_optcut: 24
+; pg_cgap: 36
+; mp_extrap: 50000 51933
+; mp_stats: Expectation fit: rho(ln(x))= 5.8855+/-0.000527; mu= 1.5386+/- 0.029; mean_var=73.0398+/-15.283
+; mp_KS: 0.0133 (N=29) at 42
+>>GTM1_MOUSE GLUTATHIONE S-TRANSFERASE GT8.7 (EC 2.5.1.18) (GST 1-1) (CLASS-MU).
+; fa_initn: 1490
+; fa_init1: 1490
+; fa_opt: 1490
+; fa_z-score: 1754.6
+; fa_expect: 0
+; sw_score: 1490
+; sw_ident: 1.000
+; sw_overlap: 217
+>GTM1_MOUSE ..
+; sq_len: 217
+; sq_type: p
+; al_start: 1
+; al_stop: 217
+; al_display_start: 1
+PMILGYWNVRGLTHPIRMLLEYTDSSYDEKRYTMGDAPDFDRSQWLNEKF
+KLGLDFPNLPYLIDGSHKITQSNAILRYLARKHHLDGETEEERIRADIVE
+NQVMDTRMQLIMLCYNPDFEKQKPEFLKTIPEKMKLYSEFLGKRPWFAGD
+KVTYVDFLAYDILDQYRMFEPKCLDAFPNLRDFLARFEGLKKISAYMKSS
+RYIATPIFSKMAHWSNK
+>GTM1_MOUSE ..
+; sq_len: 217
+; sq_type: p
+; al_start: 1
+; al_stop: 217
+; al_display_start: 1
+PMILGYWNVRGLTHPIRMLLEYTDSSYDEKRYTMGDAPDFDRSQWLNEKF
+KLGLDFPNLPYLIDGSHKITQSNAILRYLARKHHLDGETEEERIRADIVE
+NQVMDTRMQLIMLCYNPDFEKQKPEFLKTIPEKMKLYSEFLGKRPWFAGD
+KVTYVDFLAYDILDQYRMFEPKCLDAFPNLRDFLARFEGLKKISAYMKSS
+RYIATPIFSKMAHWSNK
+>>GTM1_RAT GLUTATHIONE S-TRANSFERASE YB1 (EC 2.5.1.18) (CHAIN 3) (CLASS-MU).
+; fa_initn: 1406
+; fa_init1: 1406
+; fa_opt: 1406
+; fa_z-score: 1656.3
+; fa_expect: 0
+; sw_score: 1406
+; sw_ident: 0.931
+; sw_overlap: 217
+>GTM1_MOUSE ..
+; sq_len: 217
+; sq_type: p
+; al_start: 1
+; al_stop: 217
+; al_display_start: 1
+PMILGYWNVRGLTHPIRMLLEYTDSSYDEKRYTMGDAPDFDRSQWLNEKF
+KLGLDFPNLPYLIDGSHKITQSNAILRYLARKHHLDGETEEERIRADIVE
+NQVMDTRMQLIMLCYNPDFEKQKPEFLKTIPEKMKLYSEFLGKRPWFAGD
+KVTYVDFLAYDILDQYRMFEPKCLDAFPNLRDFLARFEGLKKISAYMKSS
+RYIATPIFSKMAHWSNK
+>GTM1_RAT ..
+; sq_len: 217
+; sq_type: p
+; al_start: 1
+; al_stop: 217
+; al_display_start: 1
+PMILGYWNVRGLTHPIRLLLEYTDSSYEEKRYAMGDAPDYDRSQWLNEKF
+KLGLDFPNLPYLIDGSRKITQSNAIMRYLARKHHLCGETEEERIRADIVE
+NQVMDNRMQLIMLCYNPDFEKQKPEFLKTIPEKMKLYSEFLGKRPWFAGD
+KVTYVDFLAYDILDQYHIFEPKCLDAFPNLKDFLARFEGLKKISAYMKSS
+RYLSTPIFSKLAQWSNK
+;al_cons:
+:::::::::::::::::.:::::::::.::::.::::::.::::::::::
+::::::::::::::::.::::::::.::::::::: ::::::::::::::
+:::::.::::::::::::::::::::::::::::::::::::::::::::
+::::::::::::::::..::::::::::::.:::::::::::::::::::
+::..::::::.:.::::
+>>><<<
+
+
+217 residues in 1 query sequences
+18531385 residues in 52205 library sequences
+ Tcomplib (4 proc)[version 3.0t7 November, 1996]
+ start: Fri Nov 8 18:20:26 1996 done: Fri Nov 8 18:20:41 1996
+ Scan time: 38.434 Display time: 2.166
+
+Function used was FASTA
+
+================================================================
+
+>> November 11, 1996
+
+ --> v30t71
+
+Made changes to complib.c, comp_thr.c, nxgetaa.c to allow scoring
+matrix to be modified in fastx3, fastx3_t.
+
+================================================================
+
+>> November 15, 1996
+
+ --> v30t72
+
+nxgetaa.c now accepts query sequences from "stdin" by using "-" as the
+input file name. If DNA sequences are read in this mode, the "-n"
+option must be used.
+
+>> November 23, 1996
+
+Included code in nxgetaa.c and Makefile.sgi to get around a bug in SGI's
+sscanf() that prevented compressed GCG databases from being read properly.
+
diff --git a/doc/readme.v31t0 b/doc/readme.v31t0
new file mode 100644
index 0000000..0018ded
--- /dev/null
+++ b/doc/readme.v31t0
@@ -0,0 +1,160 @@
+
+>>November 1, 1997
+
+ --> v31t0
+
+version 31t of the fasta program package uses a more modular
+structure for comparison functions. In addition to modular functions
+to initialize, calculate and align sequences, v31 provides a modular
+function for creating the alignment display. This was required for
+fasty and fastf, which have very different alignment strategies from
+the other search programs.
+
+>>February 13, 1998
+
+modified nascii[] so that 0, 1, 2 are no longer end of sequence
+characters.
+
+prss3 added. Unlike prss, prss3 uses -d # to specify the number of
+shuffles.
+
+>>March 18, 1998
+
+First public release. Corrected problems with dropfz.c (which is
+used in fasty3, tfasty3). Makefile is well tested, but other Makefile's
+are not. PVM versions not tested.
+
+>>March 19, 1998
+
+Problem with unthreaded tfastx3, tfasty3 caused by bug in complib.c
+fixed. All Makefiles (Makefile.alpha Makefile.sun, Makefile.sgi,
+Makefile.linux) have been tested and work properly. Threaded versions
+do not work on linux (yet). Function labeling problems with fasty3,
+tfasty3 corrected.
+
+>>March 20, 1998
+
+ --> v31t02
+
+Fixed problem with inconsistent openlib() calls that broke BLAST databases
+on some platforms.
+
+>>March 27, 1998
+
+ --> v31t04
+
+Fixed a long standing problem with fastx/tfastx and fasty/tfasty that
+caused various memory allocation problems and core dumps.
+
+The PVM version works again, but cannot produce alignments. The
+change in the location of the modular display functions will require
+significant changes in the pvm display functions. For the moment,
+showalign() has been commented out.
+
+Code tested on Macintosh without changes.
+
+Added some additional information in the results file.
+
+
+Please report bugs to wrp at virginia.edu
+
+>>April 3, 1998
+
+Removed some debugging code in faatran.c now that fastx/fasty bugs
+seem corrected.
+
+ FASTA --> v3.14
+
+Corrected uninitialized array elements in dropnfa.c.
+
+>>April 10, 1998
+
+Added facility for specifying SRCH_URL (the URL string that will be
+used to re-search the database) and REF_URL (the URL string that
+will be used to look up the sequence) in url_subs.c. This allows perl
+scripts to provide different databases for re-searching dynamically.
+
+>>April 16, 1998
+
+ --> v31t05
+
+Corrected problem with ignoring ','s in databases (','s are found in
+PIR).
+
+>>April 18, 1998
+
+Corrected some problems with sequence names for Entrez lookups and
+re-searching databases.
+
+Made minor modifications to nxgetaa.c and compacc.c for compatibility
+with Borland 'C' compiler for Win32 systems. Included makefile.tc,
+fasta.rsp, prss.rsp, and test.bat for Borland 'C'/win32.
+
+>>April 24, 1998
+
+ --> v31t06
+
+Fixed another bug in fasty3/tfasty3 alignment routines.
+
+Added additional information to the do_url1() (url_subs.c) function.
+The re-search URL can now reference the start, stop, and length of the
+library sequence to be re-searched with. For DNA library sequences,
+these values are always in nucleotides, even with tfasta/x/y.
+
+
+>>May 12, 1998
+
+(no version change as v31t06 was not released prior to this)
+
+Correct nxgetaa.c GETLIB to deal correctly with BLAST NR database
+sequences with exceptionally long title lines.
+
+Fix bug with long -O results files.
+
+>>May 18, 1998
+
+ --> v31t07
+
+Corrected some bugs in information string lengths (e.g. gstring1,
+stat_str), disabling statistics with -z 0, translation of 'X' by
+saatran() (faatran.c) that caused problems with FASTX.
+
+A serious bug has been fixed in the FASTX alignment routines.
+For some pathological sequences, % identity increases from < 10%
+to 40%. The version number of the main program has not changed,
+but the version number of the fastx function has changed to 3.2.
+
+>>June 19, 1998
+
+ --> v31t08
+
+Corrected some problems with alignments with -m 10.
+
+Added -Z db_size option to modify apparent database size for
+expectation value calculation (used only for protein/protein FASTA and
+SSEARCH, FASTX, FASTY, TFASTX, and TFASTY).
+
+>>July 1, 1998
+
+ (no version change)
+
+Corrected size of lbnames[], lb_size[] in structs.h to accommodate MAX_LF
+files.
+
+>>July 13, 1998
+
+ --> v31t09
+
+Corrected problem in nxgetaa.c encountered when reading long sequences
+(that must be split) in fasta format.
+
+Corrected problem in statistics calculation encountered with a small number
+of very long DNA sequences.
+
+>>July 17, 1998
+
+ (no version change, date change for ssearch3)
+
+Corrected default expectation cutoff (it was 10, now it is 2.0) for
+DNA with ssearch3.
+
diff --git a/doc/readme.v31t1 b/doc/readme.v31t1
new file mode 100644
index 0000000..dd9fc7d
--- /dev/null
+++ b/doc/readme.v31t1
@@ -0,0 +1,113 @@
+>>July 22, 1998
+
+ --> v31t10
+
+Corrected problem with histogram when unscaled statistics used (e.g. prss3).
+
+Corrected problems with prss3 shuffled sequence prompt. Provided option
+to enter number of shuffles, window size, for prss3. Number of shuffles
+for prss3 can be entered as an option (-d #) or as the third argument
+on the command line (prss3 query lib 1000).
+
+Modified nrand.c, nrand48.c to use time to set random number.
+
+Corrected problems reading GCG formatted files with prss3.
+
+Corrected various problems with pvcomp* programs, but they still do
+not produce alignments with version 3.1.
+
+Two new programs, fastf3(_t) and tfastf3(_t) are available. These
+programs compare a set of mixed peptide sequences from an Edman
+sequencer to a protein (fastf3) or DNA (tfastf3) database, using
+the database sequences to de-convolve the peptide mixture.
+
+See fastf3.1
+
+>>August 11, 1998
+
+(no version change)
+
+Modified initfa.c so that using '-n' on the fastx/fasty command line
+would not cause problems.
+
+Changed labeling of query sequence length for fastx/fasty from 'aa' to 'nt'.
+
+>>August 18, 1998
+
+(no version change)
+
+Modified complib.c, comp_thr.c scaleswn.c, to report E()-value for only
+one related sequence if -z 3 is used.
+
+>>August 23, 1998
+
+ -->v31t11
+
+Some serious problems with prss3 have been corrected:
+
+(1) use dropnsw.c rather than dropgsw.c for more accurate low scores
+
+(2) modify estimation program; use scaleswe.c rather than scaleswn.c.
+ scaleswe.c has some improvements for estimation by moments and can
+ use MLE as well as mu/var (-z 3).
+
+(3) add p() estimate.
+
+(4) correct bugs in nrand48, which caused bad sequences for llgetaa.c
+
+(5) -Z number works properly for prss3 and other programs (fixed histogram).
+
+(6) a new program, ssearch3e, is available that uses the same scaling
+ routines as prss3 (scaleswe.c). prss3 will save the random
+ sequences it generates when the -r file option is given; the
+ sequences are in file_rlib. ssearch3e (or ssearch3 or fasta) can
+ then do a search on exactly the same sequences that were used by prss3.
+
+A bug reading GCG format compressed DNA databases was fixed.
+
+Fixed a bug that caused query sequence not to be displayed with -m 10.
+
+Simple optimization in dropnfa.c improves performance 10%.
+
+>>Sept. 1, 1998
+
+(no version change)
+
+Modified nxgetaa.c to recognize "ACGTX" as nucleotides.
+
+>>Sept. 7, 1998
+
+ --> v31t12
+
+Added -z 11 - 15, which use shuffled sequences, rather than real
+sequences to calculate statistical estimates. Because a shuffled
+sequence score is calculated for each sequence score, the search
+process takes twice as long. In this first version, codons are not
+preserved during shuffles, so tfasta/x/y shuffles may not be as
+informative as they should be.
+
+Also fix a problem with prss3 shuffles.
+
+>>Sept. 14, 1998
+
+ (no version change; previous version not released)
+
+Corrected bugs in tfastx3/tfasty3 caused by using the -3 option with
+or without -i. With the bug fixes; "-3" and "-3 -i" work as expected;
+"-3" gives the forward three frames, while "-3 -i" gives the reverse
+three frames.
+
+In addition, tfasta3/tfasta3_t was upgraded to perform the same way
+that tfastx/y3 does - i.e. a search with "-i -3" searches only frames
+4,5, and 6, while "-3" searches only frames 1, 2, and 3.
+
+>>Sept. 29, 1998
+
+ --> v31t13
+
+Corrected bugs in dropfx.c that were corrected in fasta30 last May,
+but lingered in fasta31. Also included code to ensure that tfastx/y
+alignments against long introns would not overrun the alignment
+buffer. Instead of overrunning the buffer, the message: ***aligment
+truncated *** is displayed.
+
diff --git a/doc/readme.v32t0 b/doc/readme.v32t0
new file mode 100644
index 0000000..4535a89
--- /dev/null
+++ b/doc/readme.v32t0
@@ -0,0 +1,407 @@
+
+FASTX/Y and FASTA (DNA) are now half as fast, because the programs now
+search both the forward and reverse strands by default.
+
+The documentation in fasta3x.me/fasta3x.doc has been substantially
+revised.
+
+>>October 9, 1999
+ --> v32t08 (no version number change)
+
+Added "-M low-high" option, where low and high are inclusion limits
+for library sequences. If a library sequence is shorter than "low" or
+longer than "high", it will not be considered in the search. Thus,
+"-M 200-250" limits the database search to proteins between 200 and
+250 residues in length. This should be particularly useful for fasts3
+and fastf3. This limit applies only to protein sequences.
+
+Modified scaleswn.c to fall back to maximum likelihood estimates of
+lambda, K rather than mean/variance estimates. (This allows MLE
+estimation to be used instead of proc_hist_n when a limited range of
+scores is examined.)
+
+>>October 20, 1999
+(no version change)
+
+Modify nxgetaa.c/nmgetaa.c to recognize 'N' as a possible DNA character.
+
+>>October 9, 1999
+ --> v32t08 (no version number change)
+
+Added "-M low-high" option, where low and high are inclusion limits
+for library sequences. If a library sequence is shorter than "low" or
+longer than "high", it will not be considered in the search. Thus,
+"-M 200-250" limits the database search to proteins between 200 and
+250 residues in length. This should be particularly useful for fasts3
+and fastf3. -M -500 searches library sequences < 500; -M 200 -
+searches sequences > 200. This limit applies only to protein
+sequences.
+
+Modified scaleswn.c to fall back to maximum likelihood estimates of
+lambda, K rather than mean/variance estimates. (This allows MLE
+estimation to be used instead of proc_hist_n when a limited range of
+scores is examined.)
+
+>>October 2, 1999
+ --> v32t08
+
+Many changes:
+
+(1) memory mapped (mmap()ed) database reading - other database reading fixes
+(2) BLAST2 databases supported
+(3) true maximum likelihood estimates for Lambda, K
+(4) Misc. minor fixes
+
+(1) (Sept. 26 - Oct. 2, 1999) Memory mapped database access.
+It is now possible to use mmap()ed access to FASTA format databases,
+if the "map_db" program has been used to produce an ".xin" file. If
+USE_MMAP is defined at compile time and a ".xin" file is present, the
+".xin" will be used to access sequences directly after the file is
+mmap()ed. On my 4-processor Alpha, this can reduce elapsed time by
+50%. It is not quite as efficient as BLAST2 format, but it is close.
+
+Currently, memory mapping is supported for type 0 (FASTA), 5
+(PIR/GCG ascii), and 6 (GCG binary). Memory mapping is used if a
+".xin" file is present. ".xin" files are created by the new program
+"map_db". The syntax for "map_db" is:
+
+ map_db [-n] "/dir/database.fa"
+
+which creates the file /dir/database.fa.xin. Library types can be
+included in the filename; thus:
+
+ map_db -n "/gcggenbank/gb_om.seq 6"
+
+would be used for a type 6 GCG binary file.
+
+The ".xin" file must be updated each time the database file changes.
+map_db writes the size of the database file into the ".xin" file, so
+that if the database file changes, making the ".xin" offset
+information invalid, the ".xin" file is not used. "list_db" is
+provided to print out the offset information in the ".xin" file.
+
+(Oct 2, 1999) The memory mapping routines have been changed to
+allow several files to be memory mapped simultaneously. Indeed, once a
+database has been memory mapped, it will not be unmap()ed until the
+program finishes. This fixes a problem under Digital Unix, and should
+make re-access to mmap()ed files (as when displaying high scores and
+alignments) much more efficient. If no more memory is available for
+mmap()ing, the file will be read using conventional fread/fgets.
+
+(Oct 2, 1999) The names of the database reading functions have been
+changed to allow both Blast1.4 and Blast2.0 databases to be read. In
+addition, Makefile.common now includes an option to link both
+ncbl_lib.o and ncbl2_lib.o, which provides support for both libraries.
+However, Blast1.4 support has not been tested.
+
+The Makefile structure has been improved. Each architecture specific
+Makefile (Makefile.alpha, Makefile.linux, etc) now includes
+Makefile.common. Thus, changes to the program structure should be
+correct for all platforms. "map_db" and "list_db" are not made with
+"make all".
+
+The database reading functions in nxgetaa.c can now return a database
+length of 0, which indicates that no residues were read. Previously,
+0-length sequences returned a length of 1, which were ignored.
+Complib.c and comp_thr.c have changed to accommodate this
+modification. This change was made to ensure that each residue,
+including the last, of each sequence is read.
+
+Corrected bug in nxgetaa.c with FASTA format files with very long
+(>512 char) definition lines.
+
+(2) (September 20, 1999) BLAST2 format databases supported
+
+This release supports NCBI Blast2.0 format databases, using either
+conventional file reading or memory mapped files. The Blast2.0 format
+can be read very efficiently, so there is only a modest improvement in
+performance with memory mapping. The decision to use mmap()'ed files
+is made at compile time, by defining USE_MMAP. My thanks to Eamonn
+O'Toole of DEC/Compaq, and Daryl Madura of Sun Microsystems, for
+providing mmap()'ed modifications to fasta3. On my machines, Blast2.0
+format reduces search time by about 30%. At the moment, ambiguous DNA
+sequences are not decoded properly.
+
+(3) (September 30, 1999) A new statistical estimation option is
+available. -z 2 has been changed from ln()-scaling, which never
+should have been used, to scaling using Maximum Likelihood Estimates
+(MLEs) of Lambda and K. The MLE estimation routines were written by
+Aaron Mackey, based on a discussion of MLE estimates of Lambda and K
+written by Sean Eddy. The MLE estimation examines the middle 95% of
+scores, if there are fewer than 10000 sequences in the database;
+otherwise it excludes (censors) the top 250 scores and the bottom 250
+scores. This approach seems to effectively prevent related sequences
+from contaminating the estimation process. As with -z 1, -z 12 causes
+the program to generate a shuffled sequence score for each of the
+library sequences; in this case, no censoring is done. If the
+estimation process is reliable, Lambda and K should not vary much with
+different queries or query lengths. Lambda appears not to vary much
+with the comparison algorithm, although K does.
+
+(4) Minor changes include fixes to some of the alignment display routines,
+individual copies of the pstruct structure for each thread, and some
+changes to ensure that every last residue in a library is available
+for matching (sometimes the last residue could be ignored). This
+version has undergone extensive testing with high-throughput sequences
+to confirm that long sequences are read properly. Problems with
+fastf3/fasts3 alignment display have also been addressed.
+
+>>August 26, 1999 (no version change - not released)
+
+Corrected problem in "apam.c" that prevented scoring matrices from
+being imported for [t]fasts3/[t]fastf3.
+
+>>August 17, 1999
+ --> v32t07
+
+Corrected problem with opt_cut initialization that only appeared
+with pvcomp* programs.
+
+Improved calculation of FASTA optcut threshold for DNA sequence
+comparison for match scores much less than +5 (e.g. +3). The previous
+optcut threshold was too high when the match penalty was < 4 and
+ktup=6; it is now scaled more appropriately.
+
+Optcut thresholds have also been raised slightly for
+fastx/y3/tfastx/y3. This should improve performance with minimal
+effects on sensitivity.
+
+>>July 29, 1999
+(no version change - date change)
+
+Corrected various uninitialized variables and buffer overruns
+detected.
+
+>>July 26, 1999 - new distribution
+(no version change - v32t06, previous version not released)
+
+Changed the location of "(reverse complement)" label in tfasta/x/y/s/f
+programs.
+
+Statistical calculations for tfasta/x/y in unthreaded version
+corrected. Statistical estimates for threaded and unthreaded versions
+of the tfasta/x/y/s/f programs should be much more consistent.
+
+Substantial modifications in alignment coordinate calculation/
+presentation. Minor error in fastx/y/tfastx/y end of alignment
+corrected. Major problems with tfasta alignment coordinates
+corrected. tfasta and tfastx/y coordinates should now be consistent.
+
+Corrected problem with -N 5000 in tfasta/x/y3(_t) searches encountered
+with long query sequences.
+
+Updated pthr_subs.c/Makefile.linux to increase the pthreads stacksize
+to try to avoid "cannot allocate diagonal arrays" error message.
+Pthreads stacksize can be changed with RedHat 6.0, but not RedHat 5.2,
+so Makefile.linux uses -DLINUX5 for RedHat5.* (no pthreads stack size).
+I am still getting this message, so it has not been completely
+successful. Makefile.linux now uses -DALLOCN0 to avoid this problem,
+at some cost in speed.
+
+The pvcomp* programs have been updated to work properly with
+forward/reverse DNA searches. See readme.pvm_3.2.
+
+>>July 7, 1999 - not released
+ --> v32t06
+
+Corrected bug in complib.c (fasta3, fastx3, etc) that caused core
+dumps with "-o" option.
+
+Corrected a subtle bug in fastx/y/tfastx/y alignment display.
+
+>>June 30, 1999 - new distribution
+(no version change)
+
+Corrected doinit.c to allow DNA substitution matrices with -s matrix
+option.
+
+Changed ".gbl" files to ".h" files.
+
+>>June 2 - 9, 1999 - new distribution
+(no version change)
+
+Added additional DNA lambda/K/H to alt_param.h. Corrected some
+other problems with those tables for the case where (inf,inf)
+gap penalties were not included.
+
+Fixed complib.c/comp_thr.c error message to properly report filename
+when library file is not found.
+
+Included approximate Lambda/K/H for BL80 in alt_parms.h.
+BL80 scoring matrix changed from 1/3 bit to 1/2 bit units.
+
+Included some additional perl files for searchfa.cgi, searchnn.cgi
+in the distribution (my-cgi.pl, cgi-lib.pl).
+
+>>May 30, 1999, June 2, 1999 - new distribution
+(no version number change)
+
+Added Makefile.NetBSD, if !defined(__NetBSD__) for values.h. Changed
+zs_to_E() and z_to_E() in scaleswn.c to correctly calculate E() value
+when only one sequence is compared and -z 3 is used.
+
+>>May 27, 1999
+(no version number change)
+
+Corrected bug in alignment numbering on the % identity line
+ 27.4% identity in 234 aa (101-234:110-243)
+for reverse complements with offset coordinates (test.aa:101-250)
+
+>>May 23, 1999
+(no version number change)
+
+Correction to Makefile.linux (tgetaa.o : failed to -DTFAST).
+
+>>May 19, 1999
+(no version number change)
+
+Minor changes to pvm_showalign.c to allow #define FIRSTNODE 1.
+Changes to showsum.c to change off-end reporting. (Neither of these
+changes is likely to affect anyone outside my research group.)
+
+>>May 12, 1999
+ --> v32t05
+
+Fixed a serious bug in the fastx3/tfastx3 alignment display which
+caused t/fastx3 to produce incorrect alignments (and incorrectly low
+percent identities). The scores were correct, but the alignment
+percent identities were too low and the alignments were wrong.
+
+Numbering errors were also corrected in fastx3/tfastx3 and
+fasty3/tfasty3 and when partial query sequences were used.
+
+>>May 7, 1999
+
+Fixed a subtle bug in dropgsw.c that caused do_work() to calculate
+incorrect Smith-Waterman scores after do_walign() had been called.
+This affected only pvcompsw searches with the "-m 9" option.
+
+>>May 5, 1999
+
+Modified showalign.c to provide improved alignment information that
+includes explicitly the boundaries of the alignment. Default
+alignments now say:
+
+Smith-Waterman score: 175; 24.645% identity in 211 aa overlap (5:207-7:207)
+
+>>May 3, 1999
+
+Modified nxgetaa.c, showsum.c, showbest.c, manshowun.c to allow a
+"not" superfamily annotation for the query sequence only. The
+goal is to be able to specify that certain superfamily numbers be
+ignored in some of the search summaries. Thus, a description line
+of the form:
+
+>GT8.7 | 40001 ! 90043 | transl. of pa875.con, 19 to 675
+
+says that GT8.7 belongs to superfamily 40001, but any library
+sequences with superfamily number 90043 should be ignored in any
+listing or summary of best scores.
+
+In addition, it is now possible to make a fasta3r/prcompfa, which is
+the converse of fasta3u/pucompfa. fasta3u reports the highest scoring
+unrelated sequences in a search using the superfamily annotation.
+fasta3r shows only the scores of related sequences. This might be
+used in combination with the -F e_val option to show the scores
+obtained by the most distantly related members of a family.
+
+>>April 25, 1999
+
+ -->v32t04 (not distributed)
+
+Modified nxgetaa.c to remove the dependence of tgetaa.o on TFASTA
+(necessary for a more rational Makefile structure). No code changes.
+
+>>April 19, 1999
+
+Fixed a bug in showalign.c that displayed incorrect alignment coordinates.
+(no version number change).
+
+>>April 17, 1999
+
+ --> v32t03
+
+A serious bug in DNA alignments when the sequence has been broken into
+multiple segments that was introduced in version fasta32 has been
+fixed. In addition, several minor problems with -z 3 statistics on
+DNA sequences were fixed.
+
+Added -m 9 option, which unfortunately does different things in
+pvcompfa/sw and fasta3/ssearch3. In both programs, -m 9 provides the
+id's of the two sequences, length, E(), %_ident, and start and end of
+the alignment in both sequences. pvcompfa/sw provides this
+information with the list of high scoring sequences. fasta3/ssearch3
+provides the information in lieu of an alignment.
+
+>>March 18, 1999
+
+ --> v32t02
+
+Added information on the algorithm/parameter description line to
+report the range of the pam matrices. Useful for matrices like
+MD_10, _20, and _40 which require much higher gap penalties.
+
+>>March 13, 1999 (not distributed)
+
+ --> v32t01
+
+ -r results.file has been changed to -R results.file to accommodate
+ DNA match/mismatch penalties of the form: -r "+1/-3".
+
+>>February 10, 1999
+
+Modify functions in scalesw*.c to prevent underflow after exp() on
+Alpha Linux machines. The Alpha/LINUX gcc compiler is buggy and
+doesn't behave properly with "denormalized" numbers, so "gcc -g -m
+ieee" is recommended.
+
+Add "Display alignments also (y/n)[n] "
+
+pvcomplib.c again provides alignments!! In addition, there is a
+new "-m 9" option, which reports alignments as:
+
+>>>/home/wrp/slib/hlibs/hum0.aa#5>HS5 gi:1280326 T-cell receptor beta chain 30 aa, 30 aa vs /home/wrp/slib/hlibs/hum0.seg library
+HS5 30 HS5 30 1.873e-11 1.000 30 1 30 1 30
+HS5 30 HS2249 40 1.061e-07 0.774 31 1 30 7 37
+HS5 30 HS2221 38 1.207e-07 0.833 30 1 30 7 35
+HS5 30 HS2283 40 1.455e-07 0.774 31 1 30 7 37
+HS5 30 HS2239 38 1.939e-07 0.800 30 1 30 7 35
+
+where the columns are:
+
+query-name q-len lib-name lib-len E() %id align-len q-start q-end l-start l-end
+
+>>February 9, 1999
+
+Corrected bug in showalign.c that offset reverse complement alignments
+by one.
+
+>>February 2, 1999
+
+Changed the formatting slightly in showbest.c to have columns line up better.
+
+>>January 11, 1999
+
+Corrected some bugs introduced into fastf3(_t) in the previous version.
+
+>>December 28, 1998
+
+Corrected various problems in dropfz.c affecting alignment scores
+and coordinates.
+
+Introduced a new program, fasts3(_t), for searching with peptide
+sequences.
+
+>>November 11, 1998
+
+ --> v32t0
+
+Added code to correct problems with coordinate number in long library
+sequences with tfastx/tfasty. With this release, sequences should be
+numbered properly, and sequence numbers count down with reverse
+complement library sequences.
+
+In addition, with this release, fastx/y and tfastx/y translated
+protein alignments are numbered as nucleotides (increasing by 3,
+labels every 30 nucleotides) rather than codons.
+
diff --git a/doc/readme.v33t0 b/doc/readme.v33t0
new file mode 100644
index 0000000..013453b
--- /dev/null
+++ b/doc/readme.v33t0
@@ -0,0 +1,1268 @@
+
+ $Id: readme.v33t0 342 2010-06-28 19:57:56Z wrp $
+ $Revision: $
+
+================ readme.v33t0 ================
+
+This release includes an MPI implementation of the parallel
+library-vs-library comparison code. See readme.mpi_3.3 and
+readme.pvm_3.3 for more information.
+
+=====
+>>July 9, 2001
+
+Considerable changes to support no-global library functions.
+
+(1) Separate ascii/sequence mapping arrays are used by the
+ query-reading (qascii), library-reading (lascii), and sequence
+ comparison function (pascii) routines. As a result, there is no
+ longer a need for tgetlib.o/lgetlib.o - lgetlib.o can serve both
+ functions.
+
+(2) This also allows us to remove all #ifdef TFAST/FASTX conditionals
+ from complib.c/comp_thr.c/p2_complib.c. We no longer need
+ tcomp_thr.o, comp_thrx.o, etc. We still have a variety of
+ p2_complib.o variations to support the different c34.work* files.
+
+(3) Because non-global openlib/getlib functions are available, exactly
+ the same open/get functions are available for reading both the
+ query and reference libraries in pv34comp* programs. The
+ host-specific openlib/getlib functions in hxgetaa.c are now
+ provided by nmgetlib.c, etc. This has two effects:
+
+ (a) it is now possible to compare a query database generated by an
+ SQL query to a library database generated by a different SQL
+ query.
+
+ (b) pv34comp* has lost (at least in this version) the ability to
+ automatically detect the query sequence type. To search with a
+ DNA query, you MUST use "-n".
+
+(4) the resetp() function is now responsible for almost all of the
+ function specific (TFAST/FASTX/etc) initializations. All of the
+ function specific code has been removed from complib.c/comp_thr.c
+ and most of it has been moved to initfa.c/resetp().
+
+(5) manageacc.c has been merged into compacc.c (mostly prhist()).
+
+(6) Although it may reflect a subtle bug in my code, it is not
+ possible to reliably run threaded/memory mapped versions of the
+ fasta34_t code. I have spent considerable time tracking down the
+ problem, and have determined that, in threaded code, something
+ happens during the thread initialization to corrupt the
+ description offset information used when files are memory mapped.
+ This never occurs when the unthreaded versions of the code are
+ used. And it does not occur under MacOSX, Compaq Tru64Unix, Sun
+ Solaris/Sparc, or SGI IRIX.
+
+ Thus, I cannot recommend using the threaded code versions (_t)
+ under Linux (RH6.2 or 7.1).
+
+=====
+>>June 1, 2001
+
+Many changes to accommodate a new - no global variable - strategy for
+reading sequence databases. Every time a file is opened, a struct
+lmf_str is allocated which can be used for memory mapped files, ncbl2
+files, and mysql files.
+
+In addition, an open'ed file has a default sequence type: DNA or
+protein, or one can open a file in a mode that will allow the sequence
+type to be changed.
+
+=====
+>>May 18, 2001 CVS: fa33t09d0
+
+A new compile time parameter - -DGAP_OPEN, is available to change the
+definition of the "-f gap-open" parameter from the penalty for the
+first residue in a gap to a true gap-open penalty, as is used in BLAST
+and many other comparison algorithms. This will probably become the
+default for fasta in version 3.4.
+
+Fixes to conflicts between "-S" and "-s matrix". When a scoring
+matrix file was specified, lower-case alignments were not displayed
+with -S (although the scores were calculated properly).
+
+More extensive testing of mysql_lib.c (mySQL query-libraries) with
+the pv4comp* and mp4comp* programs.
+
+=====
+>>April 5, 2001 CVS: fa33t08d4b3
+
+Changes in nmgetlib.c and ncbl2_mlib.c to return long sequence
+descriptions for PCOMPLIB (pv4/mp3comp*). Also fix p2_complib.c to
+request DNA library for translated comparisons.
+
+Fix for prss33(_t) to read both sequences from stdin.
+
+=====
+>>March 27, 2001 CVS: fa33t08d4
+ --> fa33t08d4
+
+Problems in ncbl2_mlib.c found searching NCBI non-redundant nucleotide
+database "nt" were fixed. Testing revealed a minor memory leak, which
+was fixed by modifying showbest.c, showalign.c, comp_thr.c, complib.c,
+and p2_complib.c to remember the last opened database file more
+effectively.
+
+Modifications to allow 64-bit fseek/ftell on machines like Sun,
+Linux/Intel, that support -D_FILE_OFFSET_BITS=64, -D_LARGE_FILE_SOURCE
+off_t, and fseeko(), ftello() with the option -DUSE_FSEEKO. Machines
+with 64-bit long's do not need this option. Machines with 32-bit
+longs that allow files >2 Gb can do so with 64-bit file access
+functions, including fseeko() and ftello(), which work with off_t file
+offsets instead of long's.
+
+=====
+>>March 3, 2001 CVS: fa33t08d2
+
+Corrected problems in nmgetaa.c and mysql_lib.c with parallel
+programs, and one serious problem with alternate DNA scoring matrices
+(initfa.c, initsw.c) not being set properly. A subtle problem with
+the merge of scaleswn.c and scaleswg.c is fixed.
+
+>>February 17, 2001
+
+Modified mysql_lib.c to use "#", rather than "%ld", to indicate the
+position of the GID. This change was made because sprintf() cannot be
+used reliably to generate an SQL string, as '"' and '%' are used in
+such strings.
+
+=====
+>>January 17, 2001
+(no version change, date change)
+
+Minor fixes to initfa.c, initsw.c to deal with DNA scoring matrices
+properly. "-n -s dna.mat" is required for the sequence/matrix to be
+recognized as DNA.
+
+>>January 16, 2001
+-->v34t00
+
+Merge of the main CVS trunk - fa33t06 with the latest release branch,
+fa33t08.
+
+In addition, PCOMPLIB mods have been made to mysql_lib.c. Because
+p2_complib.c gets sequence description information during the first
+read of the database, the mysql_query must be changed to return:
+result[0]=GID, result[1]=description, result[2]=sequence. In the
+PCOMPLIB case, the other SQL queries (for GID description, sequence)
+are not necessary but must still be provided.
+
+=====
+>>January 16, 2001
+(no version change, previous version not released)
+
+changes to p2_complib.c to correct openlib() incompatibility.
+
+changes to nmgetaa.c, ncbl2_lib.c to incorporate PCOMPLIB. nxgetaa.c
+removed.
+
+=====
+>>January 12, 2001
+(no version change, previous version not released)
+
+Change to initfa.c to move ktup check from query_parm() to last_init().
+
+=====
+>>January 10, 2001
+--> v33t08
+
+Fixes to complib.c, comp_thr.c to deal properly with long query
+protein sequences when a short library chunk (e.g. -N 5000) was given.
+In the case where the chunk size is too short, it will be reset to a
+length which allows the search to proceed, by including an amount of
+new sequence that is equal to the amount of overlap sequence.
+
+scaleswn.c and scaleswg.c have been merged.
+
+v33t08 includes the initial implementation for mySQL described below
+for v33t07x.
+
+======
+>>Dec. 20, 2000
+--> v33t07x
+
+Initial implementation of a syntax for mySQL database queries. A new
+file, mysql_lib.c has been added, and changes have been made to
+nmgetaa.c (which should now replace nxgetaa.c) and altlib.h. A mySQL
+database search needs a file with 4 parts:
+
+(1) description of the database, user, password
+(2) a select statement that generates the set of protein sequences
+ as: UID, sequence
+(3) a select statement that generates a UID, description given a UID
+(4) a select statement that generates a single UID, sequence given a UID
+
+Each of the four parts should be separated by ';'. For example, in
+the database that we are using for testing, a file "demo.sql" that
+contains:
+
+================
+localhost taxonomy username secret;
+SELECT proteins.gid, proteins.sequence FROM proteins,swissprot WHERE proteins.gid=swissprot.gid AND swissprot.spid IS NOT NULL;
+select proteins.gid, concat(swissprot.spid," ",proteins.description) from proteins,swissprot where proteins.gid=%ld AND swissprot.gid=proteins.gid;
+select gid, sequence from proteins where gid=%ld;
+================
+
+will find all the proteins in the BLAST "nr" database that also have
+SwissProt ID's when given the command line:
+
+ fasta33 -q query.aa "demo.sql 16"
+
+At least for simple queries, there is surprisingly little overhead for the
+search. For more complex queries involving several tables, the overhead
+can be significant.
+
+At the moment, libraries that need the functions in mysql_lib.c will
+use library type 16. We may also use file type 17 for SQL queries
+that return binary sequences.
+
+This implementation of mysql_lib.c was written to require a minimal
+amount of change to the other programs. Only nmgetaa.c and altlib.h
+needed to be changed to incorporate this new capability. One result
+of this limitation is that one cannot mix mySQL databases queries with
+other databases in the same search. Eventually, I would like to make
+a mySQL database like any other, so that several mysql database
+queries could be searched in the same run, and mysql databases could
+be mixed with other (flat file) databases, but this will require some
+changes in the function calls throughout the code. (Right now, the
+various programs do not distinguish between an openlib() that is made
+before searching a large database, and one before retrieving a single
+sequence. This must be changed for a database query like mySQL to
+behave like other databases.)
+
+Several mySQL demo files have been provided: mysql_demo*.sql.
+
+(10 January 2001) The mySQL code has been tested on Intel Linux and
+Compaq/Alpha/Tru64 Unix.
+
+>>Dec. 9, 2000
+
+Changes to apam.c to tie different default gap penalties to
+alternate scoring matrices. In addition, changes to apam.c, to deal
+with user-specified matrices with or without '*'.
+
+>>Nov. 5, 2000 (date updated)
+
+pst.dnaseq can now have 3 values, -1, or 0-> protein, 1->DNA, and 2->other.
+This becomes important for things like init_karlin_a, which needs a
+background frequency of residues.
+
+>>Nov. 1, 2000
+
+Significant bug fixes for the -z 6/-z 16 option. An uninitialized
+variable was fixed in karlin.c, and comp_thr.c did not pass the
+correct composition argument type in find_zp(). The -z 6/16 option
+has now been tested and works correctly on Alphas, Linux x86, SGI, Sun
+and Mac OSX. Another problem was fixed in scaleswn.c (simplex()) that
+prevented the code from being reused by the pv4/mp4 complib programs.
+
+>>Oct. 9, 2000
+
+Several changes made to accommodate Mac OSX. Longer lists of superfamily
+numbers now supported in p[su]4comp/m[su]4comp programs.
+
+>>Sept 25, 2000
+
+All global variables have been removed from scaleswn.c. The last to
+go, db_struct db, required many edits, because until now, the fasta
+programs have kept two versions of the db_struct data (entries,
+length). One version was kept by the main program, which updated entry
+number and db length as sequences were read; a second copy of this
+information was kept by the statistical estimation routines. Now
+there is only one copy, which means that the E() values will be a
+function of the complete database, not the database with some high
+scoring sequences removed.
+
+>>Sept 23, 2000
+
+Continued removal of global variables from scaleswn.c. Only one
+global is left, db_struct db, which contains the number of entries in
+the database and the number of residues. It will be the next to go
+(changing all the zs_to_*() functions) and scaleswn.c will be free
+of globals. scaleswg.c is gone - scaleswn.c compiles to scaleswg.c
+with -DNORMAL_DIST.
+
+>>Sept 20, 2000
+
+Removal of histogram globals required changes in p2_complib.c as well.
+p_complib.c has not been updated. scaleswg.c has been modified to
+reflect the new histogram strategy.
+
+>>Sept 19, 2000
+
+Substantial changes to remove globals for printing histogram. m_msg
+now contains a hist_str, which keeps histogram information.
+
+>>Sept. 19, 2000
+(no version change, previous version not released)
+
+Correct bug introduced into scaleswn.c (inithist()) by changing
+score2_sums[], score_sums[] from int to double.
+
+Reporting of version numbers is more consistent between fasta33,
+fasta33_t, and pv4compfa/mp4compfa. The programs now report the same
+numbers/dates in similar places.
+
+>>Sept. 15, 2000
+--> v33t07
+
+Changes to fix problems with statistical estimates when a large
+fraction (but not all) of the database is related. Several users
+reported problems when searching with rRNA genes with version 33t06.
+In some cases, a 100% identical match over 1500 nt would not be
+statistically significant against a search of the bacterial division
+of Genbank. This problem was not seen with some releases of v33t05.
+
+The cause of the problem was a change between v33t05 and v33t06 to
+allow scoring matrices with unusual scaling to be used. In v33t05,
+there was a line that excluded all scores > 300 from the statistical
+estimation procedure. While 300 is a high score with any "normal"
+scoring matrix, some investigators were using matrices scaled 10X, so
+that a score of 300 was really a score of 30 with a conventional
+matrix, and should not be excluded. Unfortunately, removing the test
+to exclude scores > 300 meant that when a rRNA sequence was used to
+search the bacterial division, tens of thousands of high scoring
+related sequences were treated as if they were unrelated, with the
+result that the variance estimates were much too high, and thus high
+real scores had low z-scores, and thus were not statistically
+significant. (There appear to be more than 20,000 rRNA sequences in
+the bacterial division of Genbank, almost 25% of all sequences).
+
+The solution to the problem is a substantial enhancement in the
+strategies used to exclude high-scoring, related sequences, the -z 1,
+4, and 5 parameter estimation strategies. The programs now estimate
+the expected high scoring sequence by calculating an ungapped Lambda
+and K, and then use a relatively conservative threshold for excluding
+scores that are higher than would be expected 0.01 times by chance.
+By calculating Lambda and K, we can scale the cutoff thresholds to
+allow scoring matrices with unusual scales. For "normal" searches,
+there should be little change, but there should be an improvement for
+searches with large numbers of related sequences in the database.
+
+As a result of testing for this change, a bug in the karlin() function
+used with -z 6 was found and corrected.
+
+=======
+>>Sept. 9, 2000
+
+Changes to manshowbest.c to include correct display coordinates.
+
+Significant changes to structs.h, param.h, p2_complib.c,
+p2_workcomp.c, to store and use a reliable a_struct for alignment
+coordinates.
+
+Other cosmetic changes.
+
+>>Sept. 7, 2000
+
+Minor changes to complib.c, showrss.c, so that prss33 -q uses 200
+shuffles and prss33 provides bit scores, rather than z-scores.
+(no version number change).
+
+Modifications to p2_complib.c to include superfamily numbers for
+ps4comp* ms4comp*.
+
+>>Aug 22, 2000
+
+Changes to mmgetaa.c, ncbl2_mlib.c, dropfs.c to accommodate AIX.
+00README.1st updated to reflect the current version and correct
+outdated information on threads.
+
+>>Aug. 3, 2000
+
+Modifications to initpam2() in initsw.c to correct a problem with pam_x
+when the -S option is used.
+
+Modifications to compacc.c, scaleswn.c to ensure that residue numbers
+are calculated properly when more than 2 Gb of sequence is searched.
+
+>>July 12, 2000
+
+Modifications to dropnfa.c so that DNA matches to 'N' will be included
+in the "ungapped %identity". Thus, a sequence that is 100% identical
+for 100 nt on either side of a 100 nt region that has been masked to
+'NNNNN' will be reported as: "67% identical (100% ungapped)". This
+has been added to deal with masked BAC-end databases. It would be
+better if masking changed the letters to lowercase, but the mouse
+BAC-end sequences at TIGR use 'NNNNN'. This is currently available
+only for the fasta function, not [t]fast[x/y], etc, and only for DNA
+sequences.
+
+mk_n_pam() in apam.c modified to ensure that mismatch scores of -1
+remain -1.
+
+>>June 25, 2000
+
+Modification to nxgetaa.c, nmgetaa.c, mmgetaa.c to return Genbank Accession
+number as part of the descriptive string.
+
+>>June 11, 2000
+
+(no version change - not yet released)
+
+Modifications to calcons(), calc_id(), showbest(), p_workcomp.c to
+provide ngap_q (number of alignment gaps in query), ngap_l (number
+of gaps in library) information for -m 9 output.
+
+>>June 6, 2000
+
+(no version change - not yet released)
+
+Modified scaleswn.c to provide better support for unconventional
+scoring matrices, in particular, scoring matrices where every
+value is 50-times higher. Previous versions of the MLE estimator (-z
+2) started with lambda = 0.2, which is too high for a scoring matrix
+going from -500:+1500. The initial estimate for lambda is now
+calculated using the formula: lambda = pi/sqrt(6*variance). For the
+default -z 1, a restriction to limit scores to a maximum of 300 for
+the statistical analysis was removed.
+
+>>June 3, 2000
+
+Modified alignment output, and -m 9 and -m10, to report an "ungapped"
+identity as well as the traditional "gapped" identity. The
+traditional "gapped" identity reports the number of identities divided
+by the overall length of the alignment, including gaps. The
+"ungapped" identity does not include gaps in the length of the
+alignment. This new value is included for alignments that include
+introns; thus, a tfastx33 search might find the 100% identical genomic
+sequence but report the gapped percent identity if a short intron were
+included in the alignment (the alignment probably would not span a
+long exon) as 66%. The "ungapped" identity would remain 100%. The
+ungapped identity value is also shown in the "-m 9" output line after
+the "gapped" fraction identical.
+
+>>June 1, 2000
+
+Modified -m 9 output to provide fraction identical, alignment boundary
+information with the initial list of high scoring sequences, just as
+the pv3comp and mp_comp versions do. The -m 9 option now shows the
+same alignment display as -m 0, but the width of the alignment is
+increased by 40. Thus, by default, -m 9 will show the list of best
+hits, with percent identity, Smith-Waterman score, and alignment
+boundaries initially, and then show standard (-m 0)
+alignments with 100 residues/line.
+
+>>May 29, 2000
+
+Correct some problems with reading data files with <CR>'s under unix.
+
+nmgetaa.c/nxgetaa.c/mmgetaa.c have been modified to convert <TAB>
+('\t') to <SPC> (' ') in descriptive lines.
+
+=======
+
+>>May 3, 2000
+
+ Corrected problem with very low mean_var in fit_llen() in scaleswn.c.
+
+>>May 2, 2000
+ (no version number change - previous version not released)
+
+ Merged fasta33t05d2 with fasta33t06. Also removed restriction on
+"-M size-range" to proteins - the size range now can be applied to DNA
+as well.
+
+>>May 1, 2000
+ (changes to v33t05d merged into v33t06)
+
+Introduced changes to include '*' as a valid sequence character, which
+indicates termination. Thus, 'TGA', 'TAG', and 'TAA' are now
+translated to '*' rather than 'X', and the protein PAM matrices have
+been modified to provide a match score of approximately 1/2 the max
+identity score for a '*:*' match. Otherwise, '*' is the same as 'X'.
+This change only affects query sequences that include a '*' to
+indicate an end of sequence, the '*' is not there by default.
+
+The inclusion of '*' broke some things in tfasts33, tfastf33, fasty33,
+and tfasty33, which were fixed today.
+
+>>March 28, 2000/April 24, 2000
+ --> v33t06
+
+(a) -z 6 statistics that factor in composition
+(b) -smatrix-offset pam-offset parameter
+
+(a) This release provides a new statistics option, -z 6, which
+provides a more sophisticated model that accounts for sequence
+composition. When -z 6 is used (only for fasta33(_t) and
+ssearch33(_t)), the program calculates a composition parameter
+comp=1/lambda using a modified version of the Karlin-Altschul karlin()
+function. As a result, every sequence in the database has an
+associated length (n1) and composition (comp).
+
+The length n1 and composition comp are used in the maximum likelihood
+estimation described by Mott (1992) Bull. Math. Biol. 54:59-75. Four
+parameters are estimated, a0, a1, a2, and b1, and the probability of
+obtaining a score is then:
+
+p(s >= x) = 1-exp(-exp(-( a0 + a1*comp + a2*comp*log(n0*n1) + x)/(b1*comp)))
+
+The maximum likelihood estimates of a0, a1, a2, and b1 are calculated
+using the Nelder-Mead simplex search strategy.
+
+The average Lambda is reported for the search using Lambda =
+1/(b1*ave_comp). Where ave_comp is the geometric mean of the comp values
+calculated during the statistical estimates.
+
+The "lambda/comp" calculation can fail for sequences with very biased
+amino acid composition. When this occurs, 'comp' is set to -1.0 (as
+is 'H', the information content parameter) and the 'ave_comp' value is
+used to calculate statistical significance. (But obviously 'ave_comp'
+is not really appropriate, since if the sequence had an average 'comp'
+value, it would have been calculated.) When -z 6 is used, the
+alignment display shows the 'comp' and 'H' values for that library
+sequence.
+
+(b) Scoring matrix offsets - The main reason that the "lambda/comp"
+calculation fails is that, for the particular query/library sequence
+pair, the expected score is not < 0, instead, Sum {p_ij S_ij} >= 0.0.
+This problem is reported to 'stderr' when it occurs. The simplest
+solution to the problem is to provide an offset to the scoring matrix;
+for example, to use Blosum62 - 1, which ranges from +10 to -5, rather
+than the standard +11 to -4. This option used to be available with
+the -S offset option, but -S is now used to specify a lower-case
+seg-ed database. The offset can now be specified as part of the
+scoring matrix name. Thus, "-s BL62-1" uses Blosum62 reduced by 1 at
+each entry. The '-' character is used to indicate an offset, so
+scoring matrix files must not have a '-' in their name.
+Alternatively, "-s BL80+1" or "-s BL80--1" would add one to each value.
+
+nxgetaa.c, nmgetaa.c, and mmgetaa.c have been edited to avoid string
+run-off problems after strncpy().
+
+Fixed problem where positive gap extension penalties in ssearch33
+were not converted to negative values.
+
+>>April 8, 2000
+
+Fixed problem in calculating corrected sequence lengths for
+Altschul-Gish probabilities.
+
+>>March 30, 2000
+ (no version change, date updated to March 30, 2000)
+
+Corrected problem with -m 9 option.
+
+The '*' character is now available to allow translated alignments to
+extend through the termination codon. Thus, if a protein sequence ends
+with a '*', and matches in to a translated termination codon, the
+score will be increased. The *:* match score is set to 1/2 the max
+positive score for the matrix (see upam.h). This strategy can also be
+used to upweight a match that extends all the way to the end of a
+full-length sequence by putting '*' at the end of both the query and
+library protein sequences. Recognition of '*' will probably become a
+command line option.
+
+>>March 21, 2000
+ (no version change, previous version not distributed)
+
+Changes to map_db.c, list_db.c, and mmgetaa.c to accommodate large
+sequence files. Long (64-bit on some systems) variables are now used
+to specify file and memory position for the memory mapped functions.
+As a result, there are now two *.xin (memory mapped index) file
+formats: MP0, which uses 32-bit longs, and MP1, which uses 64-bit
+longs. On 64-bit machines, MP0 32-bit indices are read properly, but
+limit the database size to 2 or 4 Gb; MP1 64-bit indices allow very
+large databases. Blast2.0 formatdb databases are still limited to
+4Gb. To compile map_db.c to generate 64-bit index files, include the
+compile time option -DBIG_LIB64 in the Makefile. (Currently this
+option has been tested only on the DEC Alpha and SGI platforms, and
+will work only with Unix versions that provide 64-bit longs and 64-bit
+ftell()'s.)
+
+The -R results file now uses sfn_cmp() to report a matching
+superfamily number, if one exists, and '0' otherwise.
+
+>>March 12, 2000
+ (no version change, previous version not distributed)
+
+Provide new strategy for specifying library abbreviations. In
+addition to:
+
+ fasta33 query.aa %anr
+
+one can also specify:
+
+ fasta33 query.aa %pir1+sp+nr
+or
+ fasta33 query.aa +pir1+sp+nr
+or
+ fasta33 query.aa %+pir1+sp+nr
+
+where the + anywhere in the library name string indicates that
+variable length library names, separated by '+', are being used (the
+last '+' is optional). The FASTLIBS file then becomes:
+
+================
+PIR1 Annotated Protein Database (rel 56)$0+pir1+/slib2/blast/pir1.lseg
+NBRF Protein database (complete)$0+nbrf+@/seqlib/lib/NBRF.nam
+NRL_3d structure database$0D/seqlib/lib/nrl_3d.seq 5
+NCBI/Blast non-redundant proteins$0+nr+/slib2/blast/nr.lseg
+NCBI/Blast Swissprot$0+sp+/slib2/blast/swissprot.lseg
+================
+
+The two abbreviation types, single letter and +word+, cannot be
+intermixed, and at least initially, +word+ specifiers are
+case-sensitive (single letter abbreviations are not) and will not be
+available interactively, only on the command line.
+
+Removed 'K' estimate for Expectation_n, Expectation_i fits to the
+distribution of unrelated similarity scores. 'K' cannot be calculated
+from the data available. 'Lambda' can be calculated, it is
+1.28255/sqrt(mean_var), and is still available.
+
+>>March 3, 2000
+ (no version change)
+
+changed Makefile33.common, Makefile.common, to incorporate $(NRAND)
+rather than "rand48". Provide nrandom.c which uses random(), as
+replacement for nrand.c, which uses rand48().
+
+>>February 8, 2000
+ --> v33t05
+
+Fixes to scaleswn.c (proc_hist_ml) to set num_db_entries properly.
+Scaleswn.c also provides Lambda estimates for -z 1/11 (Expectation_n),
+and -z 1/14 (Expectation_i) statistical estimates.
+
+Modifications to calc_id() to correct bug in counting identities.
+Modified showalign() to use calc_id() with -m 9, for simpler
+debugging.
+
+Additional modifications to dropfa*.c files to deal properly with 'n's
+and 'x's.
+
+Added new option: -x #, which allows one to override the penalty for a
+match against 'x' (or 'N') provided by the scoring matrix. This
+option is particularly useful in fast[x/y] searches, where out of
+frame low complexity regions can generate high scores.
+
+The old function of '-x' - to specify an alternate coordinate system,
+is now available as '-X # #'.
+
+Updated scaleswn.c to provide window shuffle information for -z 12.
+
+Updated compacc.c, workacc.c, to fix serious bug in wshuffle()
+that destroyed aa1[n1]=0.
+
+>>January 25, 2000
+ --> v33t04
+
+ A serious bug in all of the fasta related programs has been
+corrected. The new code in fasta33 which ignores certain residues
+failed to initialize one of the arrays properly. As a result, in
+pathological situations, a very strong match could be missed.
+
+ Corrected minor bug in initsw.c that caused a misplaced "ktup" command
+line argument, which should be ignored by ssearch, to be read as -d
+ktup.
+
+ Improved error message for 0 length query sequence.
+
+>>January 17, 2000
+ --> no external version number change
+
+Modified mmgetaa.c, map_db.c, and nmgetaa.c to provide memory mapping
+of genbank flatfile (format=1) files. This format could be read much
+more efficiently, however.
+
+>>January 12, 2000
+ --> no external version number change
+
+Changed the behavior of the options that set the number of high scores
+(-b) and alignments (-d) that are displayed. Previously, fasta33 -E
+10.0 -d 10 would show 50 best scores, rather than all the scores with
+E() < 10.0. To get the -E threshold to limit, -E 10.0 -b 10000 -d 10
+was required. This is now fixed. Setting "-d 10" does not affect the
+number of best scores shown.
+
+Minor change in mw.h to remove unused defines.
+
+fasta3x.me (fasta3x.doc) updated.
+
+>>January 6, 2000
+ --> v33t03
+
+Corrected bug in memory mapped reads of gcg_binary format files
+that potentially caused the last 63 residues to be read improperly.
+
+Changes to comp_thr.c, pthr_subs.c, uthr_subs.c, ibm_pthr_subs.c to
+ensure that each thread has its own work_info structure. This solves
+some minor race conditions that sometimes caused some parameters
+not to be reported properly.
+
+Changes to most of the drop*.c files to correct some minor problems
+with sequence alphabets. Code in mmgetaa.c (memory mapped code for
+FASTA, GCG compressed files) reordered to prevent files from being
+memory mapped if appropriate index files are not available.
+
+See readme.pvm_3.3 for updates to the pvm programs.
+
+>>December 10, 1999
+ (no version change - modifications largely affect ps3comp*)
+
+Modifications to showsum.c to deal with 2 scores/sequence. Modifications
+to mmgetaa.c for superfamily numbers.
+
+>>December 7, 1999
+ (no version change, previous version not released)
+
+Corrected problem in mmgetaa.c that caused searches on a memory mapped
+single long sequence (e.g. Chr22) to fail. Corrected bug in map_db.c
+that caused it to crash on some architectures if a filename was not
+specified. Corrected off-by-three error in fasty/tfasty. Corrected
+indexing error in dropfz2.c.
+
+>>December 5, 1999
+ --> v33t02
+
+corrected some bugs in initfa.c/initsw.c/doinit.c that caused
+abbreviated function names to be lost.
+
+modify showbest.c, showalign.c to include information on position in
+library sequence (bbp->cont) to distinguish subsegment of very long
+sequences. Currently, the new label is available only with -m 6.
+
+>>November 29, 1999
+ [t]fastz33 uses v33t02 of fasty function.
+
+Replace dropfz.c with dropfz2.c. Dropfz2.c interprets any codons
+that include the nucleotide 'N' as the amino acid 'X'. Previously, 'N' was
+treated as 'A', so 'NNN' ended up 'K'. This modification, together
+with the -S option and lower-case pseg'ed databases, should ensure
+that DNA queries with large numbers of 'N's do not match low
+complexity regions.
+
+>>November 20, 1999
+ (no version change, previous version not released)
+
+Modify initfa.c to display initn, init1 scores for [t]fast[fs].
+Include "-B" option to show previous z-scores.
+
+>>November 17, 1999
+ (no version change, previous version not released)
+
+Modify dropfx.c to use saatran(), rather than aatran(). saatran
+translates any 'N' containing codon as 'X'. aatran() treats 'N' as
+an 'A'. Although more steps are required for translation, the program
+appears to run just as fast.
+
+>>November 7, 1999
+ --> v33t01
+
+Substantial changes to the output format in showbest.c (the list of
+high scoring sequences) and showalign.c (the alignments). The classic
+list of best scores:
+
+The best scores are: initn init1 opt z-sc E(82014)
+gi|121716|sp|P10649|GTM1_MOUSE GLUTATHIO ( 218) 1497 1497 1497 1761.1 2.3e-91
+gi|121717|sp|P04905|GTM1_RAT GLUTATHIONE ( 218) 1413 1413 1413 1662.9 6.7e-86
+
+has been replaced by:
+
+The best scores are: opt bits E(82138)
+gi|121716|sp|P10649|GTM1_MOUSE GLUTATHIONE S-TRAN ( 218) 1497 354 7.6e-98
+gi|121717|sp|P04905|GTM1_RAT GLUTATHIONE S-TRANSF ( 218) 1413 335 5.3e-92
+
+This display provides more information and removes the outdated initn
+and init1 scores, which are no longer used. The "bit" score is
+comparable to the blast2 bit score. It is calculated as: (lambda*S -
+ln K)/ln 2, where S is the raw similarity score, lambda and K are
+statistical parameters estimated from the distribution of unrelated
+sequence similarity scores. All of the similarity scores, including
+init1, initn, and z-scores are reported with the alignment data.
+Z-scores are displayed instead of bit scores in the list of high
+scores if the command line option "-B" is specified.
+
+In addition, the alignment score line has changed from:
+
+>>gi|2506495|sp|P20136|GTM2_CHICK GLUTATHIONE S-TRANSFER (220 aa)
+ initn: 954 init1: 954 opt: 958 Z-score: 1130.9 expect() 1.1e-56
+Smith-Waterman score: 958; 61.927% identity in 218 aa overlap (1-218:1-218)
+
+to:
+
+>>gi|2506495|sp|P20136|GTM2_CHICK GLUTATHIONE S-TRANSFER (220 aa)
+ initn: 954 init1: 954 opt: 958 Z-score: 1130.9 bits: 216.4 E(): 2.8e-56
+Smith-Waterman score: 958; 61.927% identity in 218 aa overlap (1-218:1-218)
+
+In addition to the addition of the "bits:" score, the "expect()" label
+has changed to "E()" to save some space.
+
+>>November 4,12, 1999
+(no version change)
+
+Fixed serious bug in -z 2 lambda/K calculation in scaleswn.c
+
+Fixed bugs in llgetaa.c (openlib()) and definition of superfamily
+numbers.
+
+>>October 21, 1999
+(no version change)
+
+Begin using CVS for version control. Correct faulty error message in
+dropfs.c. Corrected bad "goto loopl;" in dropfz.c. Corrected prss3.rsp
+for Makefile.tc (Win32 version).
+
+>>October 18, 1999
+ --> v33t0
+
+Corrected some serious bugs with the various fasta/x/y programs when
+the -DALLOCN0 was used to save memory. Improvements to fasta3x.me/.doc
+documentation.
+
+>>October 12, 1999
+ --> v33tx
+
+For this initial release of version 33 of the FASTA programs, the
+Makefile's have been modified to make "fasta33(_t)", "fastx33(_t)",
+etc, so that you can test fasta33 while retaining fasta3 (from release
+v32t08). The FASTA33 programs are somewhat slower than previous
+releases, but I believe the ability to handle low complexity regions
+without 'X'ing them out outweighs the slowdown. By (temporarily)
+changing the names of the programs slightly, it will be easier for you
+to judge the relative cost and benefit. To "make" the programs as
+"fasta3(_t)", etc, simply replace "Makefile33.common" with
+"Makefile.common" in the "Makefile" that you use.
+
+>>September 30, 1999
+
+ssearch3/fasta3/fastx3/fasty3 have been modified to search databases
+containing both upper and lower case letters, where lower case letters
+indicate low-complexity regions. With the modified programs, lower
+case letters are treated as 'X's' in the initial scan, but are then
+treated normally in the final alignment. In addition, alignments can
+contain lower case letters. Lower case letters are treated as
+low-complexity regions during the search phase of the program, but as
+"conventional" residues during the alignment phase, with the "-S"
+option. Currently, lower case letters are mapped to 'X's during the
+scan of the entire library. In the future, alternate weights will be
+available. This is a substantial improvement for very large scale
+comparison, where one seeks both accurate statistical estimates and
+accurate %identities and alignments, and for translated DNA:protein
+comparisons, like "fastx3" and "fasty3", where out-of-frame
+translations tend to match low complexity regions (see Pearson et
+al. (1997) Genomics 46:24-36).
+
+Protein databases (and query sequences) can be generated in the
+appropriate format using John Wootton's "pseg" program, available from
+ftp://ftp.ncbi.nih.gov/pub/seg/pseg. Once you have compiled the "pseg"
+program, use the command:
+
+ pseg database.fasta -z 1 -q > database.lc_seg
+
+Once you have database.lc_seg, run the command "map_db" to generate
+a ".xin" file that can be used to efficiently memory map the database.
+
+You can then search database.lc_seg with or without the "-S" option.
+Without "-S", the database is treated as any other FASTA format file -
+all the residues are present. With "-S", lower case residues will be
+treated as 'x's' during the initial scan but as normal residues when
+final alignments are displayed.
+
+When the -S option is used, the matrix information line is changed
+from: "BL50 matrix (15:-5)" to "BL50 matrix (15:-5)xS". The "-S"
+option is no longer available to provide a scoring matrix offset.
+
+Unfortunately, Blast2.0 format files cannot contain lower case
+letters. We have addressed this problem by providing efficient memory
+mapped access to Fasta and GCG/PIR, and GCG/compressed-binary files in
+the last release of fasta32t08. The memory mapped file I/O
+improvements are provided in fasta33 as well.
+
+================ readme.v32 ================
+
+FASTX/Y and FASTA (DNA) are now half as fast, because the programs now
+search both the forward and reverse strands by default.
+
+The documentation in fasta3x.me/fasta3x.doc has been substantially
+revised.
+
+>>October 20, 1999
+(no version change)
+
+Modify nxgetaa.c/nmgetaa.c to recognize 'N' as a possible DNA character.
+
+>>October 9, 1999
+ --> v32t08 (no version number change)
+
+Added "-M low-high" option, where low and high are inclusion limits
+for library sequences. If a library sequence is shorter than "low" or
+longer than "high", it will not be considered in the search. Thus,
+"-M 200-250" limits the database search to proteins between 200 and
+250 residues in length. This should be particularly useful for fasts3
+and fastf3. -M -500 searches library sequences < 500; -M 200 -
+searches sequences > 200. This limit applies only to protein
+sequences.
+
+Modified scaleswn.c to fall back to maximum likelihood estimates of
+lambda, K rather than mean/variance estimates. (This allows MLE
+estimation to be used instead of proc_hist_n when a limited range of
+scores is examined.)
+
+>>October 2, 1999
+ --> v32t08
+
+Many changes:
+
+(1) memory mapped (mmap()ed) database reading - other database reading fixes
+(2) BLAST2 databases supported
+(3) true maximum likelihood estimates for Lambda, K
+(4) Misc. minor fixes
+
+(1) (Sept. 26 - Oct. 2, 1999) Memory mapped database access.
+It is now possible to use mmap()ed access to FASTA format databases,
+if the "map_db" program has been used to produce an ".xin" file. If
+USE_MMAP is defined at compile time and a ".xin" file is present, the
+".xin" will be used to access sequences directly after the file is
+mmap()ed. On my 4-processor Alpha, this can reduce elapsed time by
+50%. It is not quite as efficient as BLAST2 format, but it is close.
+
+Currently, memory mapping is supported for type 0 (FASTA), 5
+(PIR/GCG ascii), and 6 (GCG binary). Memory mapping is used if a
+".xin" file is present. ".xin" files are created by the new program
+"map_db". The syntax for "map_db" is:
+
+ map_db [-n] "/dir/database.fa"
+
+which creates the file /dir/database.fa.xin. Library types can be
+included in the filename; thus:
+
+ map_db -n "/gcggenbank/gb_om.seq 6"
+
+would be used for a type 6 GCG binary file.
+
+The ".xin" file must be updated each time the database file changes.
+map_db writes the size of the database file into the ".xin" file, so
+that if the database file changes, making the ".xin" offset
+information invalid, the ".xin" file is not used. "list_db" is
+provided to print out the offset information in the ".xin" file.
+
+(Oct 2, 1999) The memory mapping routines have been changed to
+allow several files to be memory mapped simultaneously. Indeed, once a
+database has been memory mapped, it will not be unmap()ed until the
+program finishes. This fixes a problem under Digital Unix, and should
+make re-access to mmap()ed files (as when displaying high scores and
+alignments) much more efficient. If no more memory is available for
+mmap()ing, the file will be read using conventional fread/fgets.
+
+(Oct 2, 1999) The names of the database reading functions have been
+changed to allow both Blast1.4 and Blast2.0 databases to be read. In
+addition, Makefile.common now includes an option to link both
+ncbl_lib.o and ncbl2_lib.o, which provides support for both libraries.
+However, Blast1.4 support has not been tested.
+
+The Makefile structure has been improved. Each architecture specific
+Makefile (Makefile.alpha, Makefile.linux, etc) now includes
+Makefile.common. Thus, changes to the program structure should be
+correct for all platforms. "map_db" and "list_db" are not made with
+"make all".
+
+The database reading functions in nxgetaa.c can now return a database
+length of 0, which indicates that no residues were read. Previously,
+0-length sequences returned a length of 1, which were ignored.
+Complib.c and comp_thr.c have changed to accommodate this
+modification. This change was made to ensure that each residue,
+including the last, of each sequence is read.
+
+Corrected bug in nxgetaa.c with FASTA format files with very long
+(>512 char) definition lines.
+
+(2) (September 20, 1999) BLAST2 format databases supported
+
+This release supports NCBI Blast2.0 format databases, using either
+conventional file reading or memory mapped files. The Blast2.0 format
+can be read very efficiently, so there is only a modest improvement in
+performance with memory mapping. The decision to use mmap()'ed files
+is made at compile time, by defining USE_MMAP. My thanks to Eamonn
+O'Toole of DEC/Compaq, and Daryl Madura of Sun Microsystems, for
+providing mmap()'ed modifications to fasta3. On my machines, Blast2.0
+format reduces search time by about 30%. At the moment, ambiguous DNA
+sequences are not decoded properly.
+
+(3) (September 30, 1999) A new statistical estimation option is
+available. -z 2 has been changed from ln()-scaling, which never
+should have been used, to scaling using Maximum Likelihood Estimates
+(MLEs) of Lambda and K. The MLE estimation routines were written by
+Aaron Mackey, based on a discussion of MLE estimates of Lambda and K
+written by Sean Eddy. The MLE estimation examines the middle 95% of
+scores, if there are fewer than 10000 sequences in the database;
+otherwise it excludes (censors) the top 250 scores and the bottom 250
+scores. This approach seems to effectively prevent related sequences
+from contaminating the estimation process. As with -z 1, -z 12 causes
+the program to generate a shuffled sequence score for each of the
+library sequences; in this case, no censoring is done. If the
+estimation process is reliable, Lambda and K should not vary much with
+different queries or query lengths. Lambda appears not to vary much
+with the comparison algorithm, although K does.
+
+(4) Minor changes include fixes to some of the alignment display routines,
+individual copies of the pstruct structure for each thread, and some
+changes to ensure that every last residue in a library is available
+for matching (sometimes the last residue could be ignored). This
+version has undergone extensive testing with high-throughput sequences
+to confirm that long sequences are read properly. Problems with
+fastf3/fasts3 alignment display have also been addressed.
+
+>>August 26, 1999 (no version change - not released)
+
+Corrected problem in "apam.c" that prevented scoring matrices from
+being imported for [t]fasts3/[t]fastf3.
+
+>>August 17, 1999
+ --> v32t07
+
+Corrected problem with opt_cut initialization that only appeared
+with pvcomp* programs.
+
+Improved calculation of FASTA optcut threshold for DNA sequence
+comparison for match scores much less than +5 (e.g. +3). The previous
+optcut threshold was too high when the match penalty was < 4 and
+ktup=6; it is now scaled more appropriately.
+
+Optcut thresholds have also been raised slightly for
+fastx/y3/tfastx/y3. This should improve performance with minimal
+effects on sensitivity.
+
+>>July 29, 1999
+(no version change - date change)
+
+Corrected various uninitialized variables and buffer overruns
+detected.
+
+>>July 26, 1999 - new distribution
+(no version change - v32t06, previous version not released)
+
+Changed the location of "(reverse complement)" label in tfasta/x/y/s/f
+programs.
+
+Statistical calculations for tfasta/x/y in unthreaded version
+corrected. Statistical estimates for threaded and unthreaded versions
+of the tfasta/x/y/s/f programs should be much more consistent.
+
+Substantial modifications in alignment coordinate calculation/
+presentation. Minor error in fastx/y/tfastx/y end of alignment
+corrected. Major problems with tfasta alignment coordinates
+corrected. tfasta and tfastx/y coordinates should now be consistent.
+
+Corrected problem with -N 5000 in tfasta/x/y3(_t) searches encountered
+with long query sequences.
+
+Updated pthr_subs.c/Makefile.linux to increase the pthreads stacksize
+to try to avoid "cannot allocate diagonal arrays" error message.
+Pthreads stacksize can be changed with RedHat 6.0, but not RedHat 5.2,
+so Makefile.linux uses -DLINUX5 for RedHat5.* (no pthreads stack size).
+I am still getting this message, so it has not been completely
+successful. Makefile.linux now uses -DALLOCN0 to avoid this problem,
+at some cost in speed.
+
+The pvcomp* programs have been updated to work properly with
+forward/reverse DNA searches. See readme.pvm_3.2.
+
+>>July 7, 1999 - not released
+ --> v32t06
+
+Corrected bug in complib.c (fasta3, fastx3, etc) that caused core
+dumps with "-o" option.
+
+Corrected a subtle bug in fastx/y/tfastx/y alignment display.
+
+>>June 30, 1999 - new distribution
+(no version change)
+
+Corrected doinit.c to allow DNA substitution matrices with -s matrix
+option.
+
+Changed ".gbl" files to ".h" files.
+
+>>June 2 - 9, 1999 - new distribution
+(no version change)
+
+Added additional DNA lambda/K/H to alt_param.h. Corrected some
+other problems with those tables, for the case where (inf,inf)
+gap penalties were not included.
+
+Fixed complib.c/comp_thr.c error message to properly report filename
+when library file is not found.
+
+Included approximate Lambda/K/H for BL80 in alt_parms.h.
+BL80 scoring matrix changed from 1/3 bit to 1/2 bit units.
+
+Included some additional perl files for searchfa.cgi, searchnn.cgi
+in the distribution (my-cgi.pl, cgi-lib.pl).
+
+>>May 30, 1999, June 2, 1999 - new distribution
+(no version number change)
+
+Added Makefile.NetBSD, if !defined(__NetBSD__) for values.h. Changed
+zs_to_E() and z_to_E() in scaleswn.c to correctly calculate E() value
+when only one sequence is compared and -z 3 is used.
+
+>>May 27, 1999
+(no version number change)
+
+Corrected bug in alignment numbering on the % identity line
+ 27.4% identity in 234 aa (101-234:110-243)
+for reverse complements with offset coordinates (test.aa:101-250)
+
+>>May 23, 1999
+(no version number change)
+
+Correction to Makefile.linux (tgetaa.o : failed to -DTFAST).
+
+>>May 19, 1999
+(no version number change)
+
+Minor changes to pvm_showalign.c to allow #define FIRSTNODE 1.
+Changes to showsum.c to change off-end reporting. (Neither of these
+changes is likely to affect anyone outside my research group.)
+
+>>May 12, 1999
+ --> v32t05
+
+Fixed a serious bug in the fastx3/tfastx3 alignment display which
+caused t/fastx3 to produce incorrect alignments (and incorrectly low
+percent identities). The scores were correct, but the alignment
+percent identities were too low and the alignments were wrong.
+
+Numbering errors were also corrected in fastx3/tfastx3 and
+fasty3/tfasty3 and when partial query sequences were used.
+
+>>May 7, 1999
+
+Fixed a subtle bug in dropgsw.c that caused do_work() to calculate
+incorrect Smith-Waterman scores after do_walign() had been called.
+This affected only pvcompsw searches with the "-m 9" option.
+
+>>May 5, 1999
+
+Modified showalign.c to provide improved alignment information that
+includes explicitly the boundaries of the alignment. Default
+alignments now say:
+
+Smith-Waterman score: 175; 24.645% identity in 211 aa overlap (5:207-7:207)
+
+>>May 3, 1999
+
+Modified nxgetaa.c, showsum.c, showbest.c, manshowun.c to allow a
+"not" superfamily annotation for the query sequence only. The
+goal is to be able to specify that certain superfamily numbers be
+ignored in some of the search summaries. Thus, a description line
+of the form:
+
+>GT8.7 | 40001 ! 90043 | transl. of pa875.con, 19 to 675
+
+says that GT8.7 belongs to superfamily 40001, but any library
+sequences with superfamily number 90043 should be ignored in any
+listing or summary of best scores.
+
+In addition, it is now possible to make a fasta3r/prcompfa, which is
+the converse of fasta3u/pucompfa. fasta3u reports the highest scoring
+unrelated sequences in a search using the superfamily annotation.
+fasta3r shows only the scores of related sequences. This might be
+used in combination with the -F e_val option to show the scores
+obtained by the most distantly related members of a family.
+
+>>April 25, 1999
+
+ -->v32t04 (not distributed)
+
+Modified nxgetaa.c to remove the dependence of tgetaa.o on TFASTA
+(necessary for a more rational Makefile structure). No code changes.
+
+>>April 19, 1999
+
+Fixed a bug in showalign.c that displayed incorrect alignment coordinates.
+(no version number change).
+
+>>April 17, 1999
+
+ --> v32t03
+
+A serious bug in DNA alignments when the sequence has been broken into
+multiple segments that was introduced in version fasta32 has been
+fixed. In addition, several minor problems with -z 3 statistics on
+DNA sequences were fixed.
+
+Added -m 9 option, which unfortunately does different things in
+pvcompfa/sw and fasta3/ssearch3. In both programs, -m 9 provides the
+id's of the two sequences, length, E(), %_ident, and start and end of
+the alignment in both sequences. pvcompfa/sw provides this
+information with the list of high scoring sequences. fasta3/ssearch3
+provides the information in lieu of an alignment.
+
+>>March 18, 1999
+
+ --> v32t02
+
+Added information on the algorithm/parameter description line to
+report the range of the pam matrices. Useful for matrices like
+MD_10, _20, and _40 which require much higher gap penalties.
+
+>>March 13, 1999 (not distributed)
+
+ --> v32t01
+
+ -r results.file has been changed to -R results.file to accommodate
+ DNA match/mismatch penalties of the form: -r "+1/-3".
+
+>>February 10, 1999
+
+Modify functions in scalesw*.c to prevent underflow after exp() on
+Alpha Linux machines. The Alpha/LINUX gcc compiler is buggy and
+doesn't behave properly with "denormalized" numbers, so "gcc -g -m
+ieee" is recommended.
+
+Add "Display alignments also (y/n)[n] "
+
+pvcomplib.c again provides alignments!! In addition, there is a
+new "-m 9" option, which reports alignments as:
+
+>>>/home/wrp/slib/hlibs/hum0.aa#5>HS5 gi:1280326 T-cell receptor beta chain 30 aa, 30 aa vs /home/wrp/slib/hlibs/hum0.seg library
+HS5 30 HS5 30 1.873e-11 1.000 30 1 30 1 30
+HS5 30 HS2249 40 1.061e-07 0.774 31 1 30 7 37
+HS5 30 HS2221 38 1.207e-07 0.833 30 1 30 7 35
+HS5 30 HS2283 40 1.455e-07 0.774 31 1 30 7 37
+HS5 30 HS2239 38 1.939e-07 0.800 30 1 30 7 35
+
+where the columns are:
+
+query-name q-len lib-name lib-len E() %id align-len q-start q-end l-start l-end
+
+>>February 9, 1999
+
+Corrected bug in showalign.c that offset reverse complement alignments
+by one.
+
+>>February 2, 1999
+
+Changed the formatting slightly in showbest.c to have columns line up better.
+
+>>January 11, 1999
+
+Corrected some bugs introduced into fastf3(_t) in the previous version.
+
+>>December 28, 1998
+
+Corrected various problems in dropfz.c affecting alignment scores
+and coordinates.
+
+Introduced a new program, fasts3(_t), for searching with peptide
+sequences.
+
+>>November 11, 1998
+
+ --> v32t0
+
+Added code to correct problems with coordinate number in long library
+sequences with tfastx/tfasty. With this release, sequences should be
+numbered properly, and sequence numbers count down with reverse
+complement library sequences.
+
+In addition, with this release, fastx/y and tfastx/y translated
+protein alignments are numbered as nucleotides (increasing by 3,
+labels every 30 nucleotides) rather than codons.
+
diff --git a/doc/readme.v34t0 b/doc/readme.v34t0
new file mode 100644
index 0000000..49db859
--- /dev/null
+++ b/doc/readme.v34t0
@@ -0,0 +1,1683 @@
+
+ $Id: readme.v34t0 348 2010-07-20 21:33:22Z wrp $
+ $Revision: $
+
+>>May 28, 2007
+
+Small modification for GCG ASCII (libtype=5) header line.
+
+>>January 12, 2007 fasta-34_26_2
+
+Fix a problem with pssm_asn_subs.c reading strings (sequences) longer
+than 1024 bytes.
+
+Remove searchfa.cgi, searchnn.cgi, cgi-lib.pl, my-cgi.pl - this code
+was used for an ancient FASTA WWW implementation and has been replaced
+by the FASTA_WWW package.
+
+FASTA Version numbers are being modified to make releases easier to
+track, thus fa34t26b5 has become fasta-34_26_1. I would prefer to use
+decimal versions, but CVS does not allow '.' in tags.
+
+>>January 4, 2007 fasta-34_26_1
+
+Include scripts for building Mac OS X Universal binaries on a PPC
+machine. Programs are compiled first with Makefile.os_x (gcc-3.3 for
+PPC) and then installed into ./ppc/. Programs are next compiled with
+Makefile.os_x86 for i386, and the resulting executables installed into
+./i386/. Finally, the "make_osx_univ.sh" script is run to build the
+universal binaries from the two executables using "lipo".
+
+>>December 12, 2006
+
+Fix some problems with p2_workcomp.c: (1) no longer initialize pad
+characters for non-existent sequences. (2) deal with small libraries
+consistently with the serial versions.
+
+>>November 17, 2006 fa34t26b5
+
+Fixed a problem reading ASN.1 format 2 PSSM's. It is now possible to
+download a PSI-BLAST PSSM RID and search properly. Next, the query
+sequence from the PSSM should be used instead of the provided query
+sequence, so that the query sequence is ignored.
+
+>>October 19, 2006 fa34t26b4
+
+Fixed problem with SSE2 code when PSSM's are used.
+
+>>October 6, 2006 fa34t26b3
+
+A new set of WIN32 programs is now available that use the Intel C++
+9.1 compiler, rather than the much older Borland Turbo-C compiler. All
+of the unthreaded programs that are part of the Unix and MacOSX FASTA
+distributions are now available. Threaded (multiprocessor) versions
+of the program are available as well, as are sse2 accelerated versions
+of ssearch34 (ssearch34sse2.exe, ssearch34sse2_t.exe).
+
+The new WIN32 code also uses Microsoft's "nmake" program to build the
+programs, which allows much greater consistency between the Unix and
+Windows versions.
+
+
+>>September 18, 2006
+
+Static global alignment variables removed from dropnfa.c, dropfx.c,
+dropfz2.c. dropnfa.c, dropfx.c and dropfz2.c should be thread safe.
+Together with the earlier changes, all the FASTA functions should now
+be thread safe during the alignment process.
+
+>>August 17, 2006
+
+Begin removal of static variables from Smith-Waterman alignment
+functions. These variables kept the functions from being thread-safe.
+Now dropgsw.c and dropnsw.c are thread-safe.
+
+>>August 15, 2006 fa34t26b2
+
+Fixed a problem with pv34compfx/mp34compfx (and fy) producing
+improperly labeled alignments and de-allocating memory for the reverse
+complement.
+
+>>July 18, 2006
+
+The library file name parsing programs now provide the option for
+environment variable substitutions. For example, SLIB2=/slib2 as an
+environment variable (e.g. export SLIB2=/slib2 for ksh and bash), then
+
+ fasta34 -q query.aa '${SLIB2}/swissprot.fa' expands as expected.
+
+While this is not important for command lines, where the Unix shell
+would expand things anyway, it is very helpful for various
+configuration files, such as files of file names, where:
+
+ <${SLIB2}/blast
+ swissprot.fa
+
+now expands properly, and in FASTLIBS files the line:
+
+ NCBI/Blast Swissprot$0S${SLIB2}/blast/swissprot.fa
+
+expands properly. Currently, Environment variable expansion only
+takes place for library file names, and the <directory in a file of
+file names.
+
+>>July 14, 2006 fa34t26b1
+
+Updated Farrar smith_waterman_sse2.c code to address possible bug
+(code from Michael Farrar). Include <sunmedia_intrin.h> for
+compilation with Sun compiler with Makefile.sun_x86.
+
+>>July 2, 2006 fa34t26b0
+
+This release provides an extremely efficient SSE2 implementation of
+the Smith-Waterman algorithm for the SSE2 vector instructions written
+by Michael Farrar (farrar.michael at gmail.com). The SSE code speeds up
+Smith-Waterman 8 - 10-fold in my tests, making it comparable to Erik
+Lindahl's Altivec code for the Apple/IBM G4/G5 architecture.
+
+The Farrar code is largely confined to smith_waterman_sse2.c and
+smith_waterman_sse2.h, which are copyright (2006) by Michael Farrar,
+and cannot be redistributed without his permission. Mr. Farrar has
+agreed to provide his code under the same policy used by FASTA -
+e.g. the code can be used without permission, but not redistributed.
+
+The Farrar code uses GCC version 4.0 SSE2 intrinsic functions to avoid
+assembly language code. Unfortunately, in my hands, "gcc -O3" causes
+"out of memory" errors, and other problems, so "gcc -O" is used instead.
+
+>>June 23, 2006 fa34t25d10
+
+Modifications to comp_lib.c, compacc.c, and other files to ensure that
+function-specific MAXTOT values are used properly. MAXTOT is now
+available as m_msg.max_tot, which is set in initfa.c (m_msg.max_tot =
+MAXTOT) to ensure that functions that need very large MAXTOT values
+(e.g. TFASTX) can get them. tfastx can now search successfully with
+titin, a 27,000 residue protein.
+
+Other changes have been made to accommodate long query sequences.
+
+A serious bug was found in fastx34(_t) that caused alignment
+coordinates to be calculated improperly when the DNA sequence was much
+longer than the protein sequence.
+
+>>May 31, 2006 fa34t25d9
+
+Fixed some problems with fasts/fastf alignments when -m 9 options were
+used. Unlike the other algorithms, the a_res structure does not
+capture all the information to re-produce an alignment, so do_walign
+now sets bptr->have_ares to indicate whether the a_res structure is
+valid.
+
+Various problems with bad library names, and short query titles were
+also fixed.
+
+Updated version number/date on all drop*.c functions.
+
+>>May 24, 2006 fa34t25d8
+
+Revised code for NCBI *.pal/*.nal databases has been tested on all
+architectures, including Windows.
+
+In addition, support for ASN.1 PSSM:2 files provided by the NCBI
+PSI-BLAST WWW site is included. This code will not work with
+iteration 0 PSSM's (which have no PSSM information). For ASN.1
+PSSM's, which provide the matrix name (and in some cases the gap
+penalties), the scoring matrix and gap penalties are set appropriately
+if they were not specified on the command line. ASN.1 PSSM's are type 2:
+ ssearch34 -P "pssm.asn1 2" .....
+
+>>May 18, 2006
+
+Support for NCBI Blast formatdb databases has been expanded. The
+FASTA programs can now read some NCBI *.pal and *.nal files, which are
+used to specify subsets of databases. Specifically, the
+swissprot.00.pal and pdbaa.00.pal files are supported. FASTA supports
+files that refer to *.msk files (i.e. swissprot.00.pal refers to
+swissprot.00.msk); it does not currently support .pal files that
+simply list other .pal or database files (e.g. FASTA does not support
+nr.pal or swissprot.pal).
+
+In the process of providing this support, the routines used to read
+ASN.1 binary formatdb files were substantially improved. It is now
+possible to see multiple description lines for a single sequence.
+
+IS_BIG_ENDIAN has been removed from all of the Makefiles. The code
+now looks for the definition of __BIG_ENDIAN__ or _BIG_ENDIAN to
+decide whether the architecture IS_BIG_ENDIAN. If, for some reason,
+one of these macros is not defined on a BIG_ENDIAN architecture, then
+-DIS_BIG_ENDIAN is required.
+
+>>May 12, 2006 CVS fa34t25d7
+
+Corrected serious problem with coordinate display calculation for
+fasta34 and ssearch34 - in some cases the coordinates and alignment
+symbols were off by the length of the context (typically 30 residues).
+
+Added capability to read ASN.1 binary PSSM information. This
+information is provided (in an encoded form) from the NCBI PSI-BLAST
+WWW site. (What is actually provided from the WWW site is a bzip2-ed
+binary file that is converted to ASCII HEX. The ASCII HEX file must
+be converted to binary, and then bunzip'ed. This bunzip-ed file is
+binary ASN.1.) These files can also be generated by
+
+ blastpgp -J T -C pssm.asn1_bin -u 2
+
+I am parsing the ASN.1 binary manually, not using the NCBI toolkit, so
+there may be some files that are not parsed properly - if so, let me
+know.
+
+(May 12, 2006 - The NCBI changed the format of the psi-blast ASN.1
+PSSM - and has not yet provided documentation of the new structure, so
+this code does not work. It does work with blastpgp v 2.2.13, but not
+with the web site version 2.2.14. A fix was provided 24-May-2006)
+
+>>April 18, 2006
+
+Small modification in mshowbest.c to provide more consistent display
+widths with -m 9i in list of best hits.
+
+>>April 11, 2006 CVS fa34t25d6
+
+Corrected a problem introduced with the new, more efficient method for
+displaying alignments. For the tfast* programs, which must translate
+the library sequence, translations were not done when alignments were
+re-displayed.
+
+Corrected an older problem with tfastx34 against very long sequence
+databases - the code to more efficiently do the display alignment did
+not use the correct sequence coordinates.
+
+Modifications to dropfs2.c to ensure that exact peptide matches are
+captured more frequently.
+
+>>March 16, 2006 CVS fa34t25d5
+
+Change to initfa.c to allow lower case DNA libraries using the
+-DDNALIB_LC compile time option.
+
+Modify p2_complib.c, p2_worklib.c (and doinit.c, msg.h) to allow the
+-V annotation option for the parallel programs. Also modify to allow
+specification of the query range (but only for the first query, like
+fasta34) for the parallel programs.
+
+Modification of p2_workcomp.c to correct some problems presenting
+percent similarity. Also correct unreleased bugs in the alignment
+routines that allow more efficient alignment re-calculation.
+
+>>Nov 20, 2005
+
+Changes to support asymmetric matrices - a scoring matrix read in from
+a file can be asymmetric. Default matrices are all symmetric.
+
+>>Oct 24, 2005
+
+Modifications extended to p2_complib.c/p2_workcomp.c. Incorporation
+of drop_func.h into p2_workcomp.c greatly simplifies things. No
+changes in communication - struct a_res_str is internal to
+p2_workcomp.c.
+
+Additional changes to do_walign() so that aln_func_vals() must be
+called to set llfact, qlfact, etc in a_struct aln before or after
+do_walign is called. do_walign produces a_res_str a_res, which has
+all the information necessary to produce a calcons() or calc_code()
+alignment.
+
+>>Oct 19, 2005 CVS fa34t26b0
+
+Modifications to drop*.c and c_dispn.c to separate (and simplify) some
+of the alignment coordinate calculations. Before, the "a_struct" had
+the coordinates of the alignment used in the display (seqc0, seqc1)
+AND in the original sequences (aa0, aa1), as well as other information
+used to calculate alignment coordinates. In the new version, a_struct
+coordinates always refer to seqc0,1, while a new structure, a_res_str,
+has coordinates for aa0, aa1 as well as the alignment encoding in res[nres].
+Eventually, this should make it possible to display multiple local
+alignments from the same two sequences.
+
+In addition, the file "drop_func.h" has been added to the project, and
+is included by many of the files (all the drop*.c functions,
+mshowbest.c, mshowalign.c) to ensure that the various functions are
+declared and used consistently.
+
+>>Sept 19, 2005 CVS fa34t25d4
+
+Changes to support Mac OS 10.4 - Tiger (include sys/types.h in more
+files). Documentation update for prss34/prfx34. Modifications to
+comp_lib.c to support prss34_t/prfx34_t. Shuffle numbers for
+prss/prfx can now be specified by "-k #".
+
+>>Sept 2, 2005
+
+The prss34 program has been modified to use the same display routines
+as the other search programs. To be more consistent with the other
+programs, the old "-w shuffle-window-size" is now "-v window-size".
+
+prss34/prfx34 will also show the optimal alignment for which the
+significance is calculated by using the "-A" option.
+
+Since the new program reports results exactly like other
+fasta/ssearch/fastxy34 programs, parsing for statistical significance
+is considerably different. The old format program can be made using
+"make prss34o".
+
+>>Aug 26, 2005
+
+Modifications to save_best() in comp_lib.c to support prss34_t. It
+did not work before.
+
+>>July 25, 2005
+
+Modify mshowbest.c to suppress gi|12345 in HTML mode.
+
+>>July 18, 2005 CVS fa34t25d3
+
+Modifications to Makefile.tc to support NCBI formatdb formats under
+Windows.
+
+>>May 19, 2005 CVS fa34t25d2
+
+Modifications to dropfs2.c to fix an obscure bug that occurred when
+correctly ordered peptides aligned one residue apart.
+
+>>May 5, 2005 CVS fa34t25d1
+
+Modification to the -x option, so that both an "X:X" match score and
+an "X:not-X" mismatch score can be specified. (This score is also used
+
+to give a positive score to a "*:*" match - the end of a reading frame,
+while giving a negative score to "*:not-*".)
+
+>>March 14, 2005 CVS fa34t25b4
+
+Fixed some problems caused by padding characters required for
+Smith-Waterman ALTIVEC in the parallel (p2_complib.c, p2_workcomp.c)
+versions.
+
+>>Feb 24, 2005 CVS fa34t25b3
+
+Changes to comp_lib.c (and Makefile.pcom) to support prss34_t.
+
+>>Feb 12, 2005
+
+Modify dropfs.c to dynamically allocate space for alignments, so that
+queries with a large number of fragments can still place all the
+fragments on the alignment. Also fix a problem produced by removing
+-DBIGMEM from most of the Makefile's, but not fixing defs.h to use
+BIGMEM sizes by default.
+
+>>Jan 24, 2005
+
+Include a new program, "print_pssm", which reads a blastpgp binary
+checkpoint file and writes out the frequency values as text. These
+values can be used with a new option with ssearch34(_t) and prss34,
+which provides the ability to read a text PSSM file. To specify a
+text PSSM, use the option -P "query.ckpt 1" where the "1" indicates a
+text, rather than a binary checkpoint file. "initfa.c" has also been
+modified to work with PSSM files with zeros in the frequency
+table. Presumably these positions (at the ends) do not provide
+information. (Jan 26, 2005) blastpgp actually uses BLOSUM62 values
+when zero frequencies are provided, so read_pssm() has been modified
+to use scoring matrix values for zero frequencies as well.
+
+>>Jan 13, 2005
+
+Change to initfa.c to have fasts34 do a protein comparison by default,
+rather than an unknown sequence type. Automatic checking for fasts34
+does not work reliably, because queries can be very short. Likewise
+for fastm34. [Jan 26, 2005] Undo this change, which broke DNA
+comparison when "-n" was specified.
+
+>>Jan 7, 2005
+
+Changes to tatstats.h, dropfs2.c to allow larger numbers of peptides
+to match when fasts is used to show coverage on a proteomics
+experiment. Previously fasts could match no more than 30 peptides,
+that has been increased to 50. In addition, ktup=2 can be used
+to increase the likelihood that short exact matches trump longer
+mismatched regions.
+
+>>Nov 11, 2004 CVS fa34t25
+
+Finished merge of earlier fa34t24 branch with HEAD. Correct
+labeling of TFASTM.
+
+>>Nov 4-8, 2004
+
+Incorporation of Erik Lindahl "anti-diagonal" Altivec code for
+Smith-Waterman, only. Altivec SSEARCH is now faster than FASTA for
+query sequences < 250 amino acids.
+
+Small modifications to output score display to ensure that the correct
+scores are shown, and that they are correctly labeled.
+
+>>Aug 25,26, 2004 CVS fa34t24b3
+
+Small change in output format for p34comp* programs in
+">>>query_file#1 string" line before alignments. This line is not present
+in the non-parallel versions - it would be better for them to be consistent.
+
+Change in last_stats.c to properly label fasts statistics with -z != 1.
+
+Change in dropfs2.c to ensure that tatprobs are not precalculated with -z 4.
+
+Modify -m 9i output option to show in HTML output.
+
+Add "#ifdef NOOVERHANG" to dropfs2.c that causes overlapping
+alignments to score a 0, rather than the partial overlap score.
+Useful for SAGE alignments, because "fasts" requires global alignments
+(except for overhangs, unless NOOVERHANG is defined).
+
+>>Aug 23, 2004
+
+Fix problem with very long definition lines with formatdb version4
+ASN databases. Fix mshowalign.c to re-enable "-L" option.
+
+>>July 28, 2004
+
+Fix to re-enable -w window shuffle for PRSS. Modify comp_lib.c
+for PRSS to ensure that the unshuffled score and probability
+are shown, even for very high probability alignments.
+
+>>July 21, 2004
+
+Modifications to support PostgreSQL databases with the same commands
+as MySQL databases. MySQL database libraries are type 16, PostgreSQL
+are type 17. Makefile.linux_sql and Makefile.pvm4_sql support both
+database types simultaneously.
+
+>>June 23, 2004 CVS fa34t24b2
+
+Additional fixes to enable -n or -p with fasts34 and
+fastm34. Makefile.pcom was fixed for fastm34_t. A new file,
+mgstm1.nts, of DNA fragments from mgstm1.seq, is included for testing
+fasts34 and fastm34.
+
+>>May 4, 2004
+
+Fixes to initfa.c to allow DNA:DNA for FASTS, FASTM. This change
+introduced a bug that broke FASTS completely, but was fixed June 18,
+2004 (and retagged fa34t24b2).
+
+>>April 23, 2004 CVS fa34t24b1
+
+Fix bug in initfa.c that caused tfasts/tfastf not to examine all six
+frames.
+
+>>May 4, 2004
+
+Fixes to initfa.c to allow DNA:DNA for FASTS, FASTM.
+
+>>March 19, 2004 CVS fa34t24b0
+
+Modify all the drop*.c files, plus mshowbest.c and mshowalign.c, to
+display percent similarity, rather than percent ungapped. An
+alignment is counted as similar if the score is greater than or equal
+to zero (the same criterion used for placing "."). To disable this
+change, remove -DSHOWSIM from the appropriate Makefile.*.
+
+>>March 18, 2004 CVS fa34t23b8
+
+Fix bug in initfa.c tables that caused prss to generally compare
+proteins.
+
+>>March 15, 2004
+
+Fix bug in calls to revcomp(); make revcomp() guarantee NULL termination.
+
+>>March 2, 2004 CVS fa34t23b7
+
+Fix a very embarrassing and surprising bug that caused insertions
+in fasta alignments to appear in the wrong sequence.
+
+>>Feb 7, 2004 CVS fa34t23b6
+
+Change initfa.c to allow "-i" (reverse complement) and "-i -3" with
+"fastx34" and "prfx34". In addition, "prfx34" now examines both query
+DNA strands in calculating the shuffled statistical significance.
+
+>>Feb 5, 2004
+
+Reverse assignments for G:U base pairing in initfa.c.
+
+Fix memory allocation error caused by doubling DNA alignment width.
+
+>>Jan 7, 2004 CVS fa34t23b5
+
+Change in do_walign() in dropnfa.c to make final DNA alignments use a
+band that is 2X as large as the search band width.
+
+>>Dec 22, 2003 CVS fa34t23b4
+
+Fix typo in p2_complib.c that prevented compilation. Fix problem
+with karlin.c for asymmetrical matrices, such as used with -U.
+
+>>Dec 10, 2003 CVS fa34t23b3
+
+Fix problem in resetp()/initfa.c that disabled banded Smith-Waterman
+DNA alignments.
+
+Allow spam() to do extended alignments for DNA if one of the sequences
+is < 50 nt.
+
+Cause default ktup to drop for short sequences. For protein < 50, ktup=1;
+for DNA < 20, 50, 100 ktup = 1, 2, 3, respectively.
+
+>>Dec 7, 2003
+
+A new option, "-U" is available for RNA sequence comparison. "-U"
+functions like "-n", indicating that the query is an RNA sequence. In
+addition, to account for "G:U" base pairs, "-U" modifies the scoring
+matrices so that a "G:A" match has the same score as "G:G" match,
+and "T:C" match has the same score as a "T:T" match. (Corrected
+13-July-2010 -- the G:A/T:C scores are score(G:G)-3.) The asymmetric
+matrix required changes in dropnfa.c that were similar to the changes
+in dropgsw.c required for profiles. In addition, m_msg.qdnaseq and pst.dnaseq
+ can now be SEQT_DNA, SEQT_RNA, SEQT_PROT, SEQT_UNK, or SEQT_OTHER.
+m_msg.ldnaseq does not use SEQT_RNA, only SEQT_DNA. A new member of
+struct pstruct: int nt_align, is used to indicate nucleotide
+alignments.
+
+>>Nov 19, 2003
+
+Changes to Makefile's to distinguish between tatstats_fs.o and
+tatstats_ff.o.
+
+>>Nov 2, 2003
+
+Substantial changes to comp_lib.c, p2_complib.c, mshowbest.c, and
+mshowalign.c to support more sophisticated display options.
+Previously, one could have only one "-m #" option, even though several
+of the options were orthogonal (-m 9c is independent of -m 1 and -m 2,
+which is independent of -m 6 (HTML)). The programs now use a bitmask
+that allows independent options to be combined. In particular -m 9c
+can be combined with -m 6, which can be very helpful for runs that
+need HTML output but can also exploit the encoding provided by -m 9c.
+
+The "-m 9" option now also allows "-m 9i", which shows the standard
+best score information, plus percent identity and alignment length.
+
+>>Oct 26, 2003 CVS fa34t23b1
+
+Additional fixes to Makefiles to enable tfastf34(_t). Changes to
+support ossearch34 (a non-Phil Green optimized Smith-Waterman).
+
+>>Oct 8, 2003 CVS fa34t23b0
+
+Fixes to get DNA queries working in both directions, and to fix PCOMPLIB
+programs for "-V" option. Currently, the parallel programs cannot use
+the "-V" option.
+
+>>Sept 25, 2003
+
+A new option is available for annotating alignments. -V '@#?!'
+can be used to annotate sites in a sequence, e.g:
+ >GTM1_HUMAN ...
+ PMILGYWDIRGLAHAIRLLLEYTDS@S?YEEKKYT@MG
+ DAPDYDRS@QWLNEKFKLGLDFPNLPYLIDGAHKIT
+might mark known and expected (S,T) phosphorylation sites. These
+symbols are then displayed on the query coordinate line:
+
+ 10 20 @? 30 @ 40 @ 50 60
+GTM1_H PMILGYWDIRGLAHAIRLLLEYTDSSYEEKKYTMGDAPDYDRSQWLNEKFKLGLDFPNLP
+ ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
+gtm1_h PMILGYWDIRGLAHAIRLLLEYTDSSYEEKKYTMGDAPDYDRSQWLNEKFKLGLDFPNLP
+ 10 20 30 40 50 60
+
+This annotation is mostly designed to display post-translational
+modifications detected by MassSpec with FASTS, but is also available
+with FASTA and SSEARCH.
+
+>>Sept 22, 2003 CVS fa34t22b5
+
+The Altivec Smith-Waterman code has been removed.
+
+>>Sept 17, 2003 CVS fa34t22b4
+
+A variety of different bugs have been fixed. (1) All the functions in
+the old initsw.c are now in initfa.c; initsw.c will be removed.
+Specifically, the Profile/PSSM code is now in initfa.c. initfa.c is
+now fully table driven. (2) various problems with prss34 and prfx34
+have been fixed in initfa.c. (3) An additional ncbl2_mlib.c buffer
+overrun has been fixed. (4) fastf34 is now available in this package.
+Its performance is very similar to, but not identical to, fastf33. I
+am tracking down the differences. In general, the raw scores
+calculated by both programs are the same, but the statistical analysis
+seems to be slightly different.
+
+>>July 30, 2003 CVS fa34t22b3
+
+Fix bug in ncbl2_mlib.c that caused buffer overrun with blast/formatdb
+v3 description lines.
+
+>>July 28, 2003
+
+The initfa.c file has been substantially re-structured to use a
+table-driven approach to parameter setting, rather than the previous
+confusing combinations of #ifdef's. Two tables of parameters are
+used, pgm_def_arr[] and msg_def_arr[], which specify values like the
+program name, reference, scoring matrix, default gap penalties, etc.
+msg_def_arr[] has the sequence types for the query, library, and
+algorithm, as well as other parameters (qframe, nframe, nrelv, etc),
+which greatly simplifies the sequence recognition logic. ppst->pgm_id
+can be used to identify the program that is running. Eventually,
+almost all of the program specific #ifdef's will be removed from
+initfa.c. initfa.c now provides initsw.c functionality, so that
+initsw.c is no longer needed.
+
+>>July 25, 2003
+
+A new file is included - fasta.defaults - that lists the scoring
+matrix, gap penalty, and other defaults for all of the fasta34
+programs. This file will be used soon to simplify parameter setting
+for the FASTA programs, and should also be used by Javascript WWW
+interfaces to the FASTA programs.
+
+>>July 22, 2003 CVS fa34t22b2
+
+Fixes to dropfs2.c, tatprobs.c to ensure that negative probabilities
+cannot occur. Negative probabilities were never seen with standard
+matrices, but did occur with BL50. Another optimization in dropfs.c
+considerably improves fasts34 performance in some cases.
+
+Fix a problem with formatdb v4 ASN.1 format files.
+
+>>July 12, 2003
+
+Fix a bug that prevented "-L" (long sequence descriptions) from
+working.
+
+>>July 9, 2003
+
+Fix reverse complement (M:K) error. Fix off-by-one error for FASTA
+DNA alignments that caused the first aligned residue pair to be
+missed.
+
+>>July 4 - 8, 2003
+
+Incorporate blast-def-line ASN.1 parsing so that NCBI formatdb version
+4 files can be read.
+
+>>June 26, 2003
+
+The strategy for displaying the match/mismatch line (" .:" for -m 0)
+has been changed dramatically to accommodate more sophisticated
+strategies for indicating conservative replacements, e.g. because of
+PSSM's. In addition to seqc0 and seqc1, which hold the aligned
+sequences for display, there is also seqca, which holds the alignment
+symbol. calcons(), do_show(), and discons() have all changed to
+include seqca. calcons() is somewhat more complex; discons() is much
+simpler. (June 29, 2003 - dropgsw.c calcons() now displays profile
+similarity accurately - it is very very illuminating.)
+
+>>June 16, 2003 version: fasta34t22
+
+ssearch34 now supports PSI-BLAST PSSM/profiles. Currently, it only
+supports the "checkpoint" file produced by blastall, and only on
+certain architectures where byte-reordering is unnecessary. It has not
+been tested extensively with the -S option.
+
+ ssearch34 -P blast.ckpt -f -11 -g -1 -s BL62 query.aa library
+
+Will use the frequency information in the blast.ckpt file to do a
+position specific scoring matrix (PSSM) search using the
+Smith-Waterman algorithm. Because ssearch34 calculates scores for
+each of the sequences in the database, we anticipate that PSSM
+ssearch34 statistics will be more reliable than PSI-Blast statistics.
+
+The Blast checkpoint file is mostly double precision frequency
+numbers, which are represented in a machine specific way. Thus, you
+must generate the checkpoint file on the same machine that you run
+ssearch34 or prss34 -P query.ckpt. To generate a checkpoint file,
+run:
+
+blastpgp -j 2 -h 1e-6 -i query.fa -d swissprot -C query.ckpt -o /dev/null
+
+(This searches swissprot for 2 iterations ("-j 2") using an E()
+threshold of 1e-6, saving the resulting position specific frequencies in
+query.ckpt. Note that the original query.fa and query.ckpt must
+match.)
+
+>>June 5, 2003
+
+Fix to mshowbest.c to get -m 9 coordinates correct on reverse strand
+with pv34comp*. Some additional fixes for prfx34.
+
+>>May 22, 2003
+
+Changes to llgetaa.c, getseq.c, comp_lib.c to provide a different
+library residue lookup table (sascii) for queries and libraries. This
+allows one to make a prfx34 (like prss34, but using the fastx
+algorithm). prfx34 is now available.
+
+>>May 13,14 2003
+
+Fixes to most of the drop*.c files, and mshowbest.c, to ensure that
+coordinates displayed with -m 9(c) and the final alignment are
+consistent. They were consistent for fasta34/ssearch34/fasts34, but
+not for fastx34/fasty34. The alignment coordinate system has been
+revised for consistency in all the drop*.c programs (coordinates
+used to be off-by-one for some, but not other functions).
+
+Fixes to -m 9c for fasty34/pv34compfy. In addition, a problem was
+fixed with fastx34/fasty34 that appeared when a protein sequence was
+considerably longer than the DNA query, e.g. an EST vs titin (26K
+residues). This problem only appeared on pv34compfx/fy on Xserve's
+under OS_X; but it should improve fastx34/fasty34 performance with
+very long protein sequences on all platforms.
+
+>>May 7,8 2003
+
+Changes to p2_workcomp.c, compacc.c, and p_mw.h to fix persistent
+bugs in the -m 9c display. Previous pv34comp* programs would not
+return the correct coded alignment if more than 100 alignments came
+from the same node, or if an encoding was longer than 127 chars.
+
+Also, fixes to p2_complib.c, comp_lib.c, to allow long query sequences
+to be segmented. Previously, only the first 20,000 residues were
+used. The segmented queries are not overlapped; segmented library
+sequences are.
+
+>>May 5, 2003
+
+Changes to last_tat.c, scaleswt.c to ensure that all fasts alignments
+that are likely to have significant scores are displayed. In previous
+implementations, if the query had more than 10 fragments, only the 100
+best scores were shown. Now, we rescore up to 2500 alignments. The
+new approach allows large mixtures to be used for searches, where some
+of the fragments from the mixture match too many proteins
+(e.g. actins). Some differences between the fasts34 and pv34compfs
+implementations have been fixed. The two programs typically will not
+give exactly the same results, because of small differences in the
+sampling procedures, but the results are essentially equivalent.
+
+>>Apr 11, 2003 CVS fa34t21b3
+
+Fixes for "-E" and "-F" with ssearch34, which were inadvertently disabled.
+
+A new option, "-t t", is available to specify that all the protein
+sequences have implicit termination codons "*" at the end. Thus, all
+protein sequences are one residue longer, and full length matches are
+extended one extra residue and get a higher score. For
+fastx34/tfastx34, this helps extend alignments to the very end in
+cases where there may be a mismatch at the C-terminal residues.
+
+-m 9c has also been modified to indicate locations of termination
+codons ( *1).
+
+>>Mar 17, 2003 CVS fa34t21b2
+
+A new option on scoring matrices "-MS" (e.g. "BL50-MS") can be used to
+turn the I/L, K/Q identities on or off. Thus, to make "fastm34" use
+the isobaric identities, use "-s M20-MS". To turn them off for "fasts34",
+use "-s M20".
+
+More fixes for correct alignment coordinates. There was a conflict between
+-m 9 and -m 9c and subsequent alignment displays.
+
+>>Mar 13, 2003
+
+Various fixes to produce correct fastm34 alignments. Changes to all
+functions to correct potential problem with -m 9 alignment coordinates
+when both -m 9 and actual alignments are shown.
+
+>>Feb 25,27, 2003
+
+Modifications to re-activate showsum.c, which included corrections to
+the showbest() call in p2_complib.c.
+
+>>Feb 13, 2003 CVS fa34t21b1
+
+Modifications to dropfx.c to dramatically improve alignment speed for
+cases where the DNA sequence is considerably longer than the protein
+sequence. Previously, a 200 aa vs 5000 nt comparison would do a full
+200 x 5000 Smith-Waterman alignment; with this modification, no more
+than a 200 x 1200 (2x3x200) alignment is done. This optimization has
+not (yet) been applied to dropfz2.c (fasty/tfasty).
+
+>>Feb 11, 2003
+
+Small modifications to comp_lib.c, p2_complib.c, and nmgetlib.c to
+pass openlib() a possibly old lmf_str. This allows openlib() to
+re-use memory mapped files. closelib() no longer releases memory
+mapped file buffers. Under Linux, memory mapped file buffers were not
+really released, so when comparing a set of sequences against nr, the
+program could not mmap() the database after several searches. This
+will also speed up memory mapped multiple sequence searches.
+
+>>Jan 28-31, 2003 CVS fa34t21b0
+
+Fix another bug (all of v34t20) involved with overlapping long
+sequences. And another bug that occurred when using sampled
+statistics, but appeared only on the SGI platform - thanks to Dmitri
+Mikhailov. Several other issues have been addressed based on more
+instrumented runtime testing.
+
+Fix an old (all v34) bug that caused problems with -z 11-16 (shuffled
+sequence array was not allocated properly). Fixed another bug with -z
+6/16 when using threaded (_t) searches in fasta34_t.
+
+Restructure statistical analysis functions (scaleswn.c, scaleswt.c) to
+return the "final" statistical estimation routine done in pst.zsflag_f.
+This allows the program to cope with searches against a single sequence
+correctly.
+
+Corrected an error for DNA sequences needing Altschul-Gish statistics.
+
+>>Jan 25, 2003
+
+Add option "-J start:stop" to pv34comp*/mp34comp*. "-J x" used to
+allow one to start at query sequence "x"; now both start and stop can
+be specified.
+
+>>Jan 14, 2003
+
+Changes to apam.c to provide an error message on stderr when a scoring
+matrix cannot be found.
+
+Changes to dropfs2.c, initsw.c, initfa.c to provide -m9c information
+for fasts34 searches. Modify the alignment algorithm to use
+probabilistic scores properly.
+
+>>Dec 22, 2002
+
+Change to compacc.c (sortbeste()) to do a second sort on zscore when
+several sequences have E() == 0.
+
+>>Nov 27, 2002
+
+Change FSEEK_T to fseek_t to keep Borland BCC5 happy.
+
+>>Nov 14-22, 2002 CVS fa34t20b6
+
+Include compile-time define (-DPGM_DOC) that causes all the fasta
+programs to provide the same command line echo that is provided by the
+PVM and MPI parallel programs. Thus, if you run the program:
+
+ fasta34_t -q -S gtt1_drome.aa /slib/swissprot 12
+
+the first lines of output from FASTA will be:
+
+ # fasta34_t -q gtt1_drome.aa /slib/swissprot
+ FASTA searches a protein or DNA sequence data bank
+ version 3.4t20 Nov 10, 2002
+ Please cite:
+ W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448
+
+This has been turned on by default in most FASTA Makefiles.
+
+Fix p2_complib.c so that qstats[] is always allocated before it is used.
+
+Fix serious bug in non-threaded comp_lib.c that caused some high
+scoring sequences to be missed by fasts34. New tests are included in
+test.sh to detect this problem in the future.
+
+The shell sort algorithm in sortbeste(), sortbestz(), and sortbesto()
+has been modified to use an improved algorithm that will not go
+quadratic in pathological cases.
+
+nmgetlib.c and mmgetaa.c have been modified to remove "^A" in libstr
+when used with p2_complib.c.
+
+Fix problem with MAXSEG in tatstats.h with IBM/AIX.
+
+Changes to most Makefiles to use -DSAMP_STATS; fixes to p2_complib.c
+for SAMP_STATS.
+
+>>Oct 22, Nov 3, Nov 9, 2002 CVS tag fa34t20b5
+
+Fix problem in comp_lib.c that caused the query sequence length to be
+counted twice.
+
+Fixed problem with prss34 (updated find_zp in showrss.c).
+
+Correct shuffling function in several places.
+
+Add jitter back to addhistz() - improves appearance with prss34.
+
+Changes to fix problems with aln_code using -m 9c.
+
+Fix to serious bug in scaleswt.c (fasts34, etc) that caused sorts on
+the high scores to take much too long. The program is now 10X faster,
+and scales well on PVM/MPI.
+
+Fix to llgetaa.c to work with new getseq() API with automatic alphabet
+recognition.
+
+>>Oct 12, 2002 CVS tag fa34t20b4
+
+Several very obscure (and sometimes old) bugs that appeared in certain
+MPI environments have been fixed. This occurred because the pst.sq[]
+array did not always have a '\0' at the end. In addition,
+mshowalign.c/p2_workcomp.c sometimes failed to put the '\0' at the end
+of seqc0/seqc1. Correct bug introduced in fa34t20b3 for fasts34(_t).
+
+>>Oct 9, 2002 CVS tag fa34t20b3
+
+Fix to apam.c build_xascii() to not zero-out qascii[0]. Fix
+Makefile.pvm4. Fix problem with -m 9c in compacc.c.
+
+>>Sept 28, 2002
+
+Additional fixes to -m 9c in p2_complib.c/compacc.c/mshowbest.c.
+Remove restriction in fasts34(_t) to less than 30 peptides (though no
+more than 30 peptides can be aligned currently).
+
+>>Sept 24, 2002
+
+Fix p2_workcomp.c so that e_scores are delivered correctly when
+last_calc flag is set, and -m 9c provides alignments when only one
+best hit is present.
+
+Fix comp_lib.c to use different maxn and overlap for each different
+query sequence. fasta34 and fasta34_t now have identical results when
+a long sequence is searched.
+
+Add '@C:101' support to memory mapped FASTA format files.
+
+Fix mshowalign.c so that coordinates returned by cal_coord() use
+loffset+l_off.
+
+>>Sept 14, 2002 CVS tag fa34t20b2
+
+Changes to p2_complib.c, compacc.c to fix statistics problems with
+pv34compfs on query sequences with more than 10 fragments.
+
+>>Aug 27, 2002
+
+Modifications to mshowbest.c and drop*.c (and p2_workcomp.c,
+compacc.c, doinit.c, etc.) to provide more information about the
+alignment with the -m 9 option. There is now a "-m 9c" option, which
+displays an encoded alignment after the -m 9 alignment information.
+The encoding is a string of the form: "=#mat+#ins=#mat-#del=#mat".
+Thus, an alignment over 218 amino acids with no gaps (not necessarily
+100% identical) would be =218. The alignment:
+
+ 10 20 30 40 50 60 70
+GT8.7 NVRGLTHPIRMLLEYTDSSYDEKRYTMGDAPDFDRSQWLNEKFKL--GLDFPNLPYL-IDGSHKITQ
+ :.:: . :: :: . .::: : .: ::.: .: : ..:.. ::: :..:
+XURTG NARGRMECIRWLLAAAGVEFDEK---------FIQSPEDLEKLKKDGNLMFDQVPMVEIDG-MKLAQ
+ 20 30 40 50 60
+
+would be encoded: "=23+9=13-2=10-1=3+1=5". The alignment encoding is
+with respect to the beginning of the alignment, not the beginning of
+either sequence. The beginning of the alignment in either sequence is
+given by the an0/an1 values. This capability is particularly useful
+for [t]fast[xy], where it can be used to indicate frameshift positions
+"/#\#" compactly. If "-m 9c" is used, the "The best scores" title
+line includes "aln_code".
+
+>>Aug 14, 2002 CVS tag fa34t20
+
+Changes to nmgetlib.c to allow multiple query searches coming from
+STDIN, either through pipes or input redirection. Thus, the command
+
+ cat prot_test.lseg | fasta34 -q -S @ /seqlib/swissprot
+
+produces 11 searches. If you use the multiple query functions, the
+query subset applies only to the first sequence.
+
+Unfortunately, it is not possible to search against a STDIN library,
+because the FASTA programs do not keep the entire library in memory
+and need to be able to re-read high-scoring library sequences. Since
+it is not possible to fseek() against STDIN, searching against a STDIN
+library is not possible.
+
+>>Aug 5, 2002
+
+fasts34(_t) and fastm34(_t) have been modified to allow searches with
+DNA sequences. This gives a new capability to search for DNA motifs,
+or to search for ordered or unordered DNA sequences spaced at
+arbitrary distances.
+
+>>Aug 4, 2002
+
+comp_lib.c has been modified to provide comp_mlib.c function.
+comp_mlib.c is no longer used. comp_lib.c with the "mlib" function
+can now recognize protein or DNA sequences automatically, and reads
+from stdin can now detect DNA/protein sequence types automatically.
+Changes to compacc.c, getseq.c, doinit.c initfa.c, initsw.c, and
+nmgetlib.c to support automatic sequence type detection.
+
+>>July 28-31, 2002
+
+(1) The various Makefile's have been "normalized". The fast*34[_t]
+ (Makefile.34m.common[_sql]), Makefile.pvm4[_sql], and
+ Makefile.mpi4[_sql] make files all use a common set of filenames,
+ described in Makefile.fcom. This greatly simplifies adding
+ programs, but requires that all *.o files be deleted when moving
+ from fast*34* to pv34comp* to mp34comp*.
+
+(2) showalign.c/p_showalign.c have been merged into mshowalign.c
+ showbest.c/manshowbest.c have been merged into mshowbest.c. Some
+ of the related files (showun.c, manshowun.c) have not been merged
+ or tested.
+
+(3) Code for ranking scores with valid e_value's incorporated.
+
+(4) Bug fixes in p2_complib.c, so that fasts34/fasts34_t/pvcompfs
+ provide identical statistics.
+
+>>July 26, 2002
+
+Makefile.pvm4_sql and Makefile.pvm4 have been substantially simplified
+by providing the worker program name from the h_init() function in the
+initfa.c/initsw.c files.
+
+>>July 24, 2002
+
+Substantial modifications to param.h, structs.h to ensure that no
+sequence specific information is kept in struct pstruct. This
+structure now holds the pam[] matrix, and other scoring parameters,
+but nothing that is dependent on aa0. The aa0 dependent stuff (nm0,
+Lambda, K, etc) is now stored in struct mngmsg. This was mostly done
+to support the pv34comp* programs, which have separate mngmsg
+structures but the same pstructs.
+
+The fasts34, fasts34_t, and pv34compfs/c34.workfs have all been tested
+successfully.
+
+>>July 19, 2002
+
+Fix an old bug in the calculation of E()-values in DNA databases
+longer than 2147483647 residues on machines with 32-bit longs.
+
+
+>>July 28-31, 2002
+
+(1) The various Makefile's have been "normalized". The fast*34[_t]
+ (Makefile.34m.common[_sql]), Makefile.pvm4[_sql], and
+ Makefile.mpi4[_sql] make files all use a common set of filenames,
+ described in Makefile.fcom. This greatly simplifies adding
+ programs, but requires that all *.o files be deleted when moving
+ from fast*34* to pv34comp* to mp34comp*.
+
+(2) showalign.c/p_showalign.c have been merged into mshowalign.c
+ showbest.c/manshowbest.c have been merged into mshowbest.c. Some
+ of the related files (showun.c, manshowun.c) have not been merged
+ or tested.
+
+(3) Code for ranking scores with valid e_value's incorporated.
+
+(4) Bug fixes in p2_complib.c, so that fasts34/fasts34_t/pvcompfs
+ provide identical statistics.
+
+>>July 26, 2002
+
+Makefile.pvm4_sql and Makefile.pvm4 have been substantially simplified
+by providing the worker program name from the h_init() function in the
+initfa.c/initsw.c files.
+
+>>July 24, 2002
+
+Substantial modifications to param.h, structs.h to ensure that no
+sequence specific information is kept in struct pstruct. This
+structure now holds the pam[] matrix, and other scoring parameters,
+but nothing that is dependent on aa0. The aa0 dependent stuff (nm0,
+Lambda, K, etc) is now stored in struct mngmsg. This was mostly done
+to support the pv34comp* programs, which have separate mngmsg
+structures but the same pstructs.
+
+The fasts34, fasts34_t, and pv34compfs/c34.workfs have all been tested
+successfully.
+
+>>July 8, 2002
+
+Modifications to comp_lib.c, initfa.c and new scaleswt.c, tatstats.c
+to support FASTS with Tatusov statistics.
+
+last_params() has been introduced to allow aa0 dependent changes in m_msg/pstr.
+
+sortbest() has been moved into initfa.c/initsw.c to make it function specific.
+
+find_z() takes an additional parameter, escore.
+
+The do_work() results structure, beststr, and stat_str all accommodate
+escores as well as integer scores (stat_str also saves segn and segl
+but doesn't need them).
+
+In scaleswt.c, process_hist() now knows much more about Tatusov statistics.
+
+last_stats() provided to accommodate rank-based statistical corrections.
+
+scale_scores() is the last function to modify the beststr scores
+(final calculation of E-value).
+
+Some sortbest*() calls and some bptr[i]->zscore=find_zp() loops have
+been moved into scale_scores();
+
+>>July 3,5, 2002
+
+Modifications to allow mySQL comments (--) in "library.sql 16" files.
+Thus, a first line of:
+
+ --host seqdb user password;
+
+is read by FASTA as the login information to a mySQL server, but is
+ignored by mySQL. "DO" commands in FASTA mySQL files can also be
+rendered invisible to mySQL in this way. See "do.sql".
+
+Modifications to mysql_lib.c to allow very long SQL statements. The
+buffer is now dynamically reallocated in 4Kb chunks.
+
+The fasta3.1 man page has been updated and re-organized.
+
+>>June 26, 2002
+
+Minor modifications to nmgetaa.c (openlib()) to use the same arguments
+for searching and PRSS. PRSS needs access to all of m_msg, but
+searches do not. Other small fixes to comp_mlib.c, towards the goal
+of merging comp_mlib.c and comp_lib.c.
+
+>>June 25, 2002
+
+Modify the statistical estimation strategy to sample all the sequences
+in the database, not just the first 60,000. The histogram is still
+based only on the first 60,000 scores and lengths, though all scores
+and lengths are shown. The fit to the data may be better than the
+histogram indicates, but it should not be worse.
+
+Currently, this modification is available only if the -DSAMPLE_STATS
+option is defined.
+
+>>June 23, 2002 CVS fa34t11d4
+
+Fix a very long-standing bug in fasty/tfasty that caused 'NNN' to be
+translated as 'S', rather than 'X'. fastx/tfastx has done this
+correctly for many years, but the fasty/tfasty code that I received
+from Zheng Zhang was not implemented correctly (my fault, his code was
+fine).
+
+>>June 19, 2002
+
+Added "-C #" option, where 6 <= # <= MAX_UID (20), to specify the
+length of the sequence name display on the alignment labels. Until
+now, only 6 characters were ever displayed. Now, up to MAX_UID
+characters are available.
+
+>>May 30, 2002 CVS fa34t11d3
+
+Fixed problem with programs using the default -E cutoff when -b was
+provided. With this implementation, -E can override -b, but -b
+overrides the default -E.
+
+Fixed problem with 64-bit file offsets in param.h (change USE_FSEEK0
+-> USE_FSEEKO, include -D_LARGEFILE_SOURCE and -D_LARGEFILE64_SOURCE
+in Makefile.linux_sql). Put limits on alignment display length (200
+chars). More checks for null returns from SQL queries.
+
+>>Apr 17, 2002 CVS fa34t11d2
+
+Fixed bug in mm_file.h/ncbl2_mlib.c that caused the SGI version to be
+unable to read blast2 format files.
+
+Changed "mp_*" tags to "pg_*" for -m 10 option.
+
+>>Mar 30, 2002
+
+Fix embarrassing bug in revcomp() (getseq.c) that failed to complement
+the central nucleotide in a sequence with an odd number of residues.
+
+Small changes to dropfs.c for more segments.
+
+>>Mar 16, 2002
+
+Added create_seq_demo.sql, nt_to_sql.pl to show how to build an SQL
+protein sequence database that can be used with the mySQL
+versions of the fasta34 programs. Once the mySQL seq_demo database
+has been installed, it can be searched using the command:
+
+ fasta34 -q mgstm1.aa "seq_demo.sql 16"
+
+mysql_lib.c has been modified to remove the restriction that mySQL
+protein sequence unique identifiers be integers. This allows the
+program to be used with the PIRPSD database. The RANLIB() function
+call has been changed to include "libstr", to support SQL text keys.
+Due to the size of libstr[], unique ID's must be < MAX_UID (20)
+characters.
+
+A "pirpsd.sql" file is available for searching the mySQL distribution
+of the PIRPSD database. PIRPSD is available from
+ftp://nbrfa.georgetown.edu/pir_databases/psd/mysql.
+
+>>Mar 6, 2002
+
+Fix showbest.c showbest() to report pst.zdb_size as database size.
+Fix dropnfa.c spam() to address off-by-one on end of run, and double
+counting on backwards scan. Fix dropnfa.c do_fasta() to fix another
+problem introduced by -S. Changes to comp_lib.c to ensure that both
+the beginning and end of the query and library sequence have '\0'
+present. Changes to initfa.c, initsw.c to ensure that a match to a
+lower-case letter with -S gets exactly the same score as a match to an
+'X'. Changes to mmgetlib.c to work with 64-bit longs in *.xin files.
+
+>>Feb 26, 2002
+
+Fixes to doinit.c, initfa.c, initsw.c to allow DNA matrices using the
+"-s dna.mat" option. A new matrix, "d50ry.mat" is available that
+scores +5 for a match, -2 for a transition, and -5 for a
+transversion. "d50ry.mat" corresponds to DNA PAM50 with transitions
+twice as common as transversions. When "-s dna.mat" is used, "-n"
+MUST be used as well.
+
+Query sequence names ("aa", "nt") should be more accurate.
+
+>>Feb 22, 2002
+
+Fix to getseq.c to allow "plain" sequence files.
+
+>>Feb 12, 2002
+
+Minor fix to res_stats.c.
+
+>>Jan 28, 2002
+
+Fixes to resurrect res_stats.c. res_stats (cc -o res_stats
+res_stats.c scaleswn.c -lm) takes the output from a current "-R
+file.res" file and calculates statistical significance - this allows
+one to take exactly the same set of scores (and lengths) and calculate
+statistical estimates using different strategies.
+
+>>Jan 24, 2002
+
+modifications to mmgetlib.c, ncbl2_mlib.c to more robustly read memory
+mapped files (*.xin, map_db) on machines lacking "native" 64-bit
+longs. If the machine provides some definition for a 64-bit long
+(e.g. "long long", "int64_t"), things should work. 64-bit offsets into
+memory mapped files work properly on Alpha, SGI, i386 Linux, and
+MacOSX. The current implementation depends either on 64 bit longs
+(Compaq Alpha's pre 4.0G) or the <sys/inttype.h> file. Makefile,
+Makefile.alpha, and Makefile.linux have been modified.
+
+Modifications to nmgetlib.c, mmgetlib.c to provide GI numbers and
+Accession versions for Genbank searches. If the GI:123456 number is
+available, it will be used and the description line will be formatted:
+
+ gi|123456|gb|ACC1234.1|LOCUS description
+
+This should help FAST_PAN runs, where the version of a sequence
+changes frequently.
+
+>>Jan 10, 2002
+
+Modifications to p2_complib.c, p2_workcomp.c to more reliably allocate
+space for library sequence descriptions on the master and workers.
+
+>>Jan 2-3, 2002 CVS fa34t10c/fa34t10d3
+
+Fixes to comp_lib.c to support Macintosh and Windows/Turbo-C
+compilation. New Makefile.tc. Macintosh version supports both
+"Classic" and "Carbon" environments.
+
+"<values.h>" has been replaced with the more modern "<limits.h>"
+
+Fixes to p2_complib.c to support n_libstr (libstr length) in GETLIB().
+
+comp_thr.c, complib.c removed.
+
+>>Dec 16, 2001
+
+Complete integration of comp_mlib.c with both the unthreaded and
+threaded programs. Comp_mlib allows fasta34 and fasta34_t to compare
+a database with a second database, just as pv34compfa does. Using
+multiple queries with fasta34_t is not as efficient as pv34compfa (and
+it cannot use networks of Unix workstations), but it is much easier to
+use and install.
+
+With the comp_mlib.c option, fasta34 cannot automatically recognize
+DNA sequences, just as pv34compfa no longer recognizes DNA sequences.
+You must use the "-n" option to search with DNA sequences. The other
+programs (fastx34, tfastx34, etc) "know" the type of the query and
+database sequences, so "-n" is only required for fasta34(_t).
+
+>>Dec 14, 2001 CVS tag fa34t10b
+
+Fix problems reading DNA databases in blast2 format.
+
+>>Dec 11, 2001
+
+Changes to spam() in dropnfa.c so that, for DNA sequences, the
+boundaries of a local alignment region are found using the same
+algorithm as previous versions of fasta. For
+protein sequences, the algorithm will extend the local region beyond
+the "ktup" boundaries if a better score can be found. For DNA
+sequences, this raises the noise rather than increasing sensitivity,
+so it is turned off and "ktup" boundaries are respected. The old,
+"ktup" boundary algorithm is available with -DNOSPAM_EXT.
+
+This version also includes a working res_stats.c, which can be used to
+test various statistical estimates on exactly the same set of scores.
+
+Fixed problems with -m 9 percent identity for fastx/fasty/tfastx/tfasty.
+These errors have been present since -m 9 was implemented.
+
+>>Dec 10, 2001
+
+Fix to map_db.c to work correctly with files > 2 Gb when 64-bit longs
+are available. It is not yet designed to work with ftello() and other
+offset types.
+
+>>Nov 11,21, 2001 CVS tag fa34t10a, fa34t10d1
+
+Substantial changes to revcomp(), getseq(), and other functions to
+correct problems with -S on DNA sequences. Sequences with lower case
+nucleotides were not recognized or reverse complemented properly.
+
+Fix to dropnfa.c (v34t07, Nov 21, 2001) bg_align() to re-initialize
+static globals - this fixes a problem encountered with pv34compfa. A
+new main program, comp_mlib.c has been added to the CVS archive,
+although it is not referenced in any of the Makefiles. comp_mlib.c
+works like p2_complib.c and compares a library against another
+library.
+
+>>Nov 4, 2001
+
+Change to dropnfa.c spam () while(1) -> while(lpos <= dmax->stop).
+This fixes a problem with ktup=1 on Suns only, so far.
+
+>>Oct 4, 2001 CVS tag fa34t10
+
+Add comp_lib.c file, which merges complib.c (unthreaded) and
+comp_thr.c (threaded) code into one file.
+
+Modifications to nmgetlib.c, mmgetaa.c to allow Genbank flatfile
+format without DESCRIPTION or ACCESSION lines.
+
+Additional fix for -S with ktup=1.
+
+>>Sept. 24, 2001
+
+Fix to have correct gap-penalties for short scoring matrices with
+tfastx/fastx.
+
+>>Sept. 10, 2001 CVS tag fa34t05d6
+
+Fix a bug introduced by -S fix in fa34t05d5. Also, try to remove
+changes in p34compfa compared to pv4compfa output.
+
+>>Sept. 6, 2001 CVS tag fa34t05d5
+
+Fix the -S dropnfa/fx/fz2 bug that was not actually fixed in
+fa34t05d4. Incorporate the correct scaleswn.c referred to in
+fa34t05d4.
+
+>>Sept. 5, 2001 CVS tag fa34t05d4
+
+Fix problem with m_msg.quiet that prevented interactive prompts for
+ktup, file name, etc with threaded programs.
+
+Fix serious bug in dropnfa.c/dropfx.c/dropfz2.c that caused -S to work
+improperly on sequences with effective length of 3 or less.
+
+Change to scaleswn.c to make mle_cen(), mle_cen2() more robust to cases
+where the top and bottom scores are the same.
+
+Change p2_complib.c to avoid compiler complaints with (void *)wstage2p=NULL
+on some platforms.
+
+>>Aug. 30, 2001 CVS tag fa34t05d3
+
+Fixed problem with uthr_subs.c for Suns, but changed Makefile.sun to
+use pthreads rather than Sun Unix threads. Removed SQL stuff from
+Makefile.mpi4/pvm4 and added Makefile.mpi4_sql/pvm4_sql.
+
+fa34t05d2 - fix to map_db.c to provide *sascii.
+
+fa34t05d1 - fixes to ibm_pthr_subs.c and Makefile.ibm from IBM.
+
+>>Aug. 20, 2001 CVS tag fa34t05d0
+
+The pvm/mpi complib programs have been substantially updated with
+release 3.4. See readme.v34t0 for more information. With version
+3.4, the MPI programs are mp34comp*, mu34comp*, etc.
+
+A major effect of this change is to disable automatic sequence type
+(protein/DNA) recognition with pv34compfa/mp34compfa. By default,
+protein libraries are assumed. Thus, pv34compfa/mp34compfa require
+the "-n" command line option when running pv34compfa/mp34compfa on DNA
+sequence libraries. This issue does not occur with the other
+programs, which will recognize the appropriate sequence type, because
+it is determined by the program (e.g. pv34compfx requires
+DNA:protein).
+
+Fixed substantial problem with 64-bit file offsets for Linux in
+complib.c/comp_thr.c, p2_complib.c. This problem, solved by Doug
+Blair, was preventing the threaded versions from working properly in
+memory mapped mode.
+
+In all earlier versions of fasta, when very long sequences were
+searched, the sequence length reported was that of the "chunk" that
+was actually searched (typically 80,000-query_length) rather than the
+actual library sequence length. This peculiar behavior has now changed,
+and the full length of the library sequence, not the sequence chunk,
+is reported as the library sequence length. Note that chunks are
+still used, however, which can cause the same alignment to be shown
+twice. In addition, the "-m 9" output format has changed to report
+the coordinates of the query and library sequence (see below), which
+may be different from 1-sequence_length because the query and
+library sequences may have been extracted from larger sequences. Four
+additional fields have been added, "pn0", "px0", "pn1", "px1" that are
+the positions of the beginning (pn0/1) and end (px0/1) of the
+query/library sequence. pn0/1 would typically be changed with the
+"@C:#" directive, described below.
+
+Changes to doinit.c/initfa.c/initsw.c to provide a new function -
+f_lastenv() - that allows function-specific adjustments to parameters
+after the command line options have been read but before the first
+sequence is read. This change solved problems with "mp/pv34compfx -S".
+
+fasts34/tfasts34 now recognize that 'I/L' are the same, as are 'Q/K'
+(which are apparently indistinguishable by Mass-Spec). The latter
+identity is on by default, but can be turned off with "-h 0".
+
+The MPI/PVM versions of the programs have been tested extensively with
+compfa, compfx, and comptfx. Makefile.mpi4 now works properly.
+Changes to p2complib.c to support the PVM option "-T 1-4", which
+allows one to run on nodes 1-4 of a (presumably larger) PVM virtual
+machine. This option has no effect on the mp34comp* programs. The
+old "-T 4" to run on 4 nodes, is also available. If each node has 2
+cpu's, as indicated in the "pvmd hostfile", both CPU's will be used
+for a total, in this example, of 8 processes. This allows one to
+specify a large PVM machine and use separate parts of it
+independently.
+
+Changes to nmgetlib.c to fix problems with longer dates in GCG files
+(Y2K). Fixes to faatran.c for extended alphabets and 'X's. Various
+code clean-ups to make "gcc -Wall" a little bit (not much) happier.
+
+This is the first distributed fasta34 version.
+
+================
+>>Aug 9, 2001 CVS tag fa34t05
+
+Corrections to initfa.c to allow -S to work with tfastx/y.
+Fix to manshowbest.c for query position with -m 9.
+
+>>July 18, 2001 CVS tag fa34t04
+
+Various changes to complib.c, comp_thr.c, p2_complib.c, showbest.c,
+showalign.c to deal with overlapping alignments in long sequences that
+have been segmented. When long sequences are segmented (lcont>0), the
+eventual total length (n1tot_v) is saved at beststr->n1tot_p. If
+there was no lcont, then beststr->n1tot_p = NULL, and beststr->n1
+should be used as the sequence length. This has the advantage of
+requiring space only when long sequences are encountered, and
+requiring only one integer for several segments.
+
+m_msg.noshow has been removed.
+
+The -m 9 format has been changed - 5 fields have been added, 4
+(pmn0/pmx0/pmn1/pmx1) provide the beginning and end coordinates of the
+query and library sequence; the last (fs) reports the number of
+frameshifts. The names of the alignment boundaries have been changed
+from min0/max0/min1/max1 to amn0/amx0/amn1/amx1 (Alignment miN/maX).
+
+The SQL format has been extended to provide for statements that do
+things but do not generate results, such as creating and selecting into a temporary table, e.g.:
+================
+ do
+ create temporary table seq_pos (
+ id int unsigned not null auto_increment primary key,
+ prot_id int unsigned not null default 0,
+ start int unsigned not null default 0,
+ length int unsigned not null default 0,
+ )
+ ;
+ do
+ insert into seq_pos (prot_id, start, length)
+ select id, 11, len-10
+ from protein, annot
+ where len > 100
+ and annot.protein_id = protein.id
+ and annot.pref=1
+ ;
+ select seq_pos.id,
+ substring(protein.seq, start, length),
+ concat("@C:", start, " ", descr)
+ from protein, seq_pos, annot
+ where protein.id = annot.protein_id
+ and protein.id = seq_pos.prot_id
+ and annot.pref = 1
+ ;
+ select prot_id,
+ concat("@C:", start, " ", descr)
+ from seq_pos, annot
+ where annot.protein_id = seq_pos.prot_id
+ and seq_pos.id = #
+ and annot.pref = 1
+ ;
+================
+
+ In the current implementation, these statements must start with "DO"
+as the first two characters on the line, and come immediately after a
+line ending with ';'. The text from "DO" to the next ";", excluding
+the "DO", is executed when the database connection is made.
+
+===== >>July 12, 2001
+
+The allocation of the work_info data structure used to send
+information to the worker threads has been changed. The old method
+worked, possibly by accident.
+
+A bug in p2_complib.c that caused E()-values to be calculated
+improperly for the first query sequence has been fixed.
+
+>>July 11, 2001 --> fa34t02
+
+It is now possible to specify output coordinates in library sequences
+by including the string: "@C:number" on the description line, e.g.
+
+ >gtm1_human gi|12345 human glutathione transferase M1 @C:21
+
+would label the first residue in the library sequence "21" rather than
+"1". This capability has been included to provide accurate
+coordinates for searches done against subsequences generated by an SQL
+query. For example, one could use a query of the form:
+
+ SELECT protein.id, substring(protein.seq,11,length(protein.seq)-20),
+ concat(protein.name," @C:11 ",protein.descr)
+ FROM protein;
+
+to generate a sequence set with each sequence starting with residue
+11. Without the "@C:11" option on the description line, the program
+would number the alignment positions starting at 1, even though the
+first residue of the sequence really started at 11. "@C:11" allows
+one to correct the coordinate system.
+
+Currently, "@C:offset" is available only with library type 1 (fasta
+format) and 16 (mySQL).
+
+The SQL-generated database with "@C:offset" can be used with both the
+fast*34(_t) programs and with pv34comp*. However, the SQL syntax is
+used differently in the fasta34 and pv34compfa programs. fast*34(_t)
+requires three SQL statements during a search: (1) a statement to
+generate a large set of library sequences; (2) a statement to generate
+a description of a single sequence, given a unique identifier provided
+by (1); and (3) a statement to generate a single sequence given a
+unique identifier provided by (1). For fast*34 searches, the third
+(3) SQL statement must provide the "@C:offset" information in the
+third results field for the offset to be used. It is optional in (1)
+and (2).
+
+The pv34comp* programs only require one SQL statement, statement (1)
+above, which must provide three fields, a unique identifier, the
+sequence, and a complete description that must include "@C:offset" if
+substrings are used. If SQL queries (2) and (3) are provided, they
+are ignored. Thus, the same files can be used by both programs, but
+the "@C:offset" is required in different SQL queries by the fast*34
+and pv34comp* programs.
+
+Other changes:
+
+Re-incorporation of GAP_OPEN option; fix to Altschul-Gish stats when
+GAP_OPEN is used.
+
+Re-incorporation of A. Mackey's spam() improvement in dropnfa.
+
+Fixes to include file ordering to allow fast*34(_t) pv34comp* programs
+to compile.
+
+Fix to lascii[] for SQL database queries.
+
+Fix to an old bug in comp_thr.c to send individual worker_info
+structures to threads (does not fix LINUX threads problems, however).
+
+=====
+>>July 9, 2001
+
+Considerable changes to support no-global library functions.
+
+(1) Separate ascii/sequence mapping arrays are used by the
+ query-reading (qascii), library-reading (lascii), and sequence
+ comparison function (pascii) routines. As a result, there is no
+ longer a need for tgetlib.o/lgetlib.o - lgetlib.o can serve both
+ functions.
+
+(2) This also allows us to remove all #ifdef TFAST/FASTX conditionals
+ from complib.c/comp_thr.c/p2_complib.c. We no longer need
+ tcomp_thr.o, comp_thrx.o, etc. We still have a variety of
+ p2_complib.o variations to support the different c34.work* files.
+
+(3) Because non-global openlib/getlib functions are available, exactly
+ the same open/get functions are available for reading both the
+ query and reference libraries in pv34comp* programs. The
+ host-specific openlib/getlib functions in hxgetaa.c are now
+ provided by nmgetlib.c, etc. This has two effects:
+
+ (a) it is now possible to compare a query database generated by an
+ SQL query to a library database generated by a different SQL
+ query.
+
+ (b) pv34comp* has lost (at least in this version) the ability to
+ automatically detect the query sequence type. To search with a
+ DNA query, you MUST use "-n".
+
+(4) the resetp() function is now responsible for almost all of the
+ function-specific (TFAST/FASTX/etc) initializations. All of the
+ function specific code has been removed from complib.c/comp_thr.c
+ and most of it has been moved to initfa.c/resetp().
+
+(5) manageacc.c has been merged into compacc.c (mostly prhist()).
+
+=====
+>>June 1, 2001
+
+Many changes to accommodate a new - no global variable - strategy for
+reading sequence databases. Every time a file is opened, a struct
+lmf_str is allocated which can be used for memory mapped files, ncbl2
+files, and mysql files.
+
+In addition, an open'ed file has a default sequence type: DNA or
+protein, or one can open a file in a mode that will allow the sequence
+type to be changed.
+
+=====
+>>May 18, 2001 CVS: fa33t09d0
+
+A new compile time parameter - -DGAP_OPEN, is available to change the
+definition of the "-f gap-open" parameter from the penalty for the
+first residue in a gap to a true gap-open penalty, as is used in BLAST
+and many other comparison algorithms. This will probably become the
+default for fasta in version 3.4.
+
+Fixes to conflicts between "-S" and "-s matrix". When a scoring
+matrix file was specified, lower-case alignments were not displayed
+with -S (although the scores were calculated properly).
+
+More extensive testing of mysql_lib.c (mySQL query-libraries) with
+the pv4comp* and mp4comp* programs.
+
+=====
+>>April 5, 2001 CVS: fa33t08d4b3
+
+Changes in nmgetlib.c and ncbl2_mlib.c to return long sequence
+descriptions for PCOMPLIB (pv4/mp3comp*). Also fix p2_complib.c to
+request DNA library for translated comparisons.
+
+Fix for prss33(_t) to read both sequences from stdin.
+
+=====
+>>March 27, 2001 CVS: fa33t08d4
+
+Modifications to allow 64-bit fseek/ftell on machines like Sun,
+Linux/Intel, that support -D_FILE_OFFSET_BITS=64, -D_LARGEFILE_SOURCE,
+off_t, and fseeko(), ftello() with the option -DUSE_FSEEKO. Machines
+with 64-bit long's do not need this option. Machines with 32-bit
+longs that allow files >2 Gb can do so with 64-bit file access
+functions, including fseeko() and ftello(), which work with off_t file
+offsets instead of long's.
+
+=====
+>>March 3, 2001 CVS: fa33t08d2
+
+Corrected problems in nmgetaa.c and mysql_lib.c with parallel
+programs, and one serious problem with alternate DNA scoring matrices
+(initfa.c, initsw.c) not being set properly. A subtle problem with
+the merge of scaleswn.c and scaleswg.c is fixed.
+
+>>February 17, 2001
+
+Modified mysql_lib.c to use "#", rather than "%ld", to indicate the
+position of the GID. This change was made because sprintf() cannot be
+used reliably to generate an SQL string, as '"' and '%' are used in
+such strings.
+
+=====
+>>January 17, 2001
+(no version change, date change)
+
+Minor fixes to initfa.c, initsw.c to deal with DNA scoring matrices
+properly. "-n -s dna.mat" is required for the sequence/matrix to be
+recognized as DNA.
+
+>>January 16, 2001
+-->v34t00
+
+Merge of the main CVS trunk - fa33t06 with the latest release branch,
+fa33t08.
+
+In addition, PCOMPLIB mods have been made to mysql_lib.c. Because
+p2_complib.c gets sequence description information during the first
+read of the database, the mysql_query must be changed to return:
+result[0]=GID, result[1]=description, result[2]=sequence. In the
+PCOMPLIB case, the other SQL queries (for GID description, sequence)
+are not necessary but must still be provided.
diff --git a/doc/readme.v35 b/doc/readme.v35
new file mode 100644
index 0000000..25c52c4
--- /dev/null
+++ b/doc/readme.v35
@@ -0,0 +1,535 @@
+
+ $Id: readme.v35 120 2010-01-31 19:42:09Z wrp $
+ $Revision: 55 $
+
+>>Sep. 10, 2008
+
+Fix problem in init_ascii() call for p2_complib2.c.
+
+>>Sep. 9, 2008
+
+Fix bug in display of library name when written to an output file
+(rather than stdout).
+
+>>Aug. 28, 2008 fa35_04_02 SVN Revision: 45
+
+Fix serious bug in alignment generation that only occurred when large
+libraries were used as a query with [t]fast[x/y]. This bug often
+resulted in a core dump.
+
+Address some other issues with uninitialized variables with -m 9c.
+
+>>Jul. 15, 2008 fa35_04_01 SVN Revision: 38
+
+Correct problems with Makefiles. Add information on compiling to README.
+Address issue with mp_KS for -m 10 when searching small libraries.
+
+>>Jul. 7, 2008 fa35_04_01 SVN Revision: 35
+
+Fix problems that occurred when statistics are disabled with -z -1,
+both for a normal library search, and for searches of a small library.
+
+>>Jul. 3, 2008 SVN Revision: 33
+
+Continue to fix an issue with 'J' and -S.
+
+>>Jun. 29, 2008 SVN Revision: 29, 31
+
+Fix additional problems with Makefiles, some issues uncovered with
+Solaris 'C' compiler (Rev. 30).
+
+Discover serious bug when searching long, overlapping sequences, such
+as genomes. The length of the library sequence was not updated to
+reflect the length of the new region plus the overlap.
+
+Fix inconsistency in the value of 'J' between uascii.h/aascii[] and
+pascii[]. Add code to ensure that lascii[], qascii[], never return a
+value outside pam2[][] (all <= pst.nsq) (particularly for 'O' and 'U'
+amino-acids).
+
+exit(0) returns for map_db, list_db.
+
+>>Jun. 11, 2008
+
+Correct bug in scaleswn.c that prevented exact matches to queries < 10
+residues from being scored and displayed.
+
+>>Jun. 1, 2008
+
+Address various cosmetic issues in FASTA output:
+
+(1) Modify comp_lib2.c so that -O outfile works when multiple queries are
+compared in one run.
+
+(2) remove the duplicated query sequence length in the 1>>>query line.
+
+(3) in -m 10 output, the tags "pg_name" and "pg_ver" were duplicated, e.g.
+
+>>>K1HUAG, 109 aa vs a library
+; pg_name: fasta35_t
+; pg_ver: 35.03
+; pg_argv: fasta35_t -q -b 10 -d 5 -m 10 ../seq/prot_test.lseg a
+; pg_name: FASTA
+; pg_ver: 3.5 Sept 2006
+
+The ; pg_name and ; pg_ver produced by the get_param() functions in
+drop*.c have been renamed ; pg_name_alg and ; pg_ver_rel.
+
+>>>K1HUAG, 109 aa vs a library
+; pg_name: fasta35_t
+; pg_ver: 35.03
+; pg_argv: fasta35_t -q -b 10 -d 5 -m 10 ../seq/prot_test.lseg a
+; pg_name_alg: FASTA
+; pg_ver_rel: 3.5 Sept 2006
+
+Modify mshowbest.c, mshowalign.c to highlight E() values (<font
+color="dark red"></font>) in HTML output.
+
+>>Apr. 16, 2008 fa35_03_07
+
+Merge fa35_ann1_br, which allows annotations in library sequences.
+
+The PVM/MPI parallel version now supports query sequence annotations
+and -m 9c annotation encoding. It does not yet support library
+annotations. Tested with both PVM and MPI.
+
+>>Apr. 2, 2008 fa35_03_06
+
+Ensure that code in last_init() to modify ktup never increases ktup value.
+
+Add fasta_versions.html to more explicitly describe programs available.
+
+>>Mar. 4, 2008
+
+Fix parsing of parameters (matrix, gap open, gap ext) in ASN.1 PSSM
+files produced by blastpgp.
+
+>>Feb. 18, 2008 fa35_03_05
+
+Re-implement -M low-high sequence range options. Sequence range
+restriction has probably been missing since the introduction of
+ggsearch and glsearch, which use a new approach to limiting the
+sequence range.
+
+>>Feb. 7, 2008 fa35_ann1_br
+
+Add annotations to library sequences (they were already available in
+query sequences). Currently, annotations are only available within
+sequences, but they should be available in FASTA format, or any of the
+other ascii text formats (EMBL/Swissprot, Genbank, PIR/GCG). If
+annotations are present in a library and the annotation characters
+include '*', then the -V '*' option MUST be used. However, special
+characters other than '*' are ignored, so annotations of '@', '%', or
+'@' should be transparent.
+
+In translated sequence comparisons, annotations are only available for
+the protein sequence.
+
+The format for encoded annotations has changed to support annotations
+in both the query and library sequence. If the -m 9c flag is provided
+and annotations are present, then an annotated position in the
+alignment will be encoded as:
+
+ '|'q-pos':'l-pos':'q-symbol'l-symbol':'match-symbol'q-residue'l-residue'
+
+For example:
+
+ |7:7:@@:=YY|14:14:##:=TT
+
+In cases where the query or library sequence does not have an
+annotation, then the q-symbol or l-symbol will be 'X' (which is not a
+valid annotation symbol).
+
+>>Jan. 25, 2008 fa35_03_04
+
+Map 'O' (pyrrolysine) to 'K', 'U' (seleno-cysteine) to 'C' in uascii.h
+('J' is already recognized and mapped to the average of 'I' and 'L').
+Thus, 'J' will appear in alignments, but 'O' and 'U' are transformed
+to 'K' and 'C'.
+
+Because "Oo" and "Uo" are not (currently) part of aax[] ("Uu" is in
+ntx[]), apam.c/build_xascii() was extended to add characters from
+othx[] - "oth" for "other" so that they are not lost.
+
+Double check, and fix, some mappings for 'J/j' and 'Z/z'.
+
+>>Jan. 11, 2008 fa35_03_03
+
+Clean up some issues with -m 10 output; put "; mp_Algorithm", ";
+mp_Parameters" down with other -m 10 ";" lines. Also provide ";
+al_code" and "; al_code_ann" if -m 9c is specified. Remove duplicate
+">>>query" line.
+
+Add "; aln_code" and "; ann_code" to -m 10 -m 9c output. The
+alignment/annotation encoding is only produced once (in showbest()),
+and is then saved for -m 10 alignment.
+
+>>Dec. 13, 2007 fa35_03_02m (merge of fa35_03_02 and fa35_02_08_br)
+
+Add ability to search a subset of a library using a file name and a
+list of accession/gi numbers. This version introduces a new filetype,
+10, which consists of a first line with a target filename, format, and
+accession number format-type, and optionally the accession number
+format in the database, followed by a list of accession numbers. For
+example:
+
+ </slib2/blast/swissprot.lseg 0:2 4|
+ 3121763
+ 51701705
+ 7404340
+ 74735515
+ ...
+
+Tells the program that the target database is swissprot.lseg, which is
+in FASTA (library type 0) format.
+
+The accession format comes after the ":". Currently, there are four
+accession formats, two that require ordered accessions (:1, :2), and
+two that hash the accessions (:3, :4) so they do not need to be
+ordered. The number and character after the accession format
+(e.g. "4|") indicate the offset of the beginning of the accession and
+the character that terminates the accession. Thus, in the typical
+NCBI Fasta definition line:
+
+ >gi|1170095|sp|P46419|GSTM1_DERPT Glutathione S-transferase (GST class-mu)
+
+The offset is 4 and the termination character is '|'. For databases
+distributed in FASTA format from the European Bioinformatics
+Institute, the offset depends on the name of the database, e.g.
+
+ >SW:104K_THEAN Q4U9M9 104 kDa microneme/rhoptry antigen precursor (p104).
+
+and the delimiter is ' ' (space, the default).
+
+Accession formats 1 and 3 expect strings; accession formats 2 and 4
+work with integers (e.g. gi numbers).
+
+>>Dec. 12, 2007 fa35_02_08
+
+Correct bug in ssearch35 gapped scores that only occurred in
+non-accelerated code. This bug has been present since fa35_02_06.
+Modified the Makefiles so that accelerated (ssearch35(_t)) and
+non-accelerated (ssearch35s(_t)) are available. Edited Makefile's to
+provide accelerated ssearch35 more specifically.
+
+Modifications to provide information about annotated residues in the
+-m9c coded output. Previously, -m 9c output added a field:
+
+ =26+9=15-2=9-1=3+1=74-2=3-3=63
+
+after the standard -m 9 output information. With the new version, an
+annotated query sequence ( -V '*#' ) adds the field:
+
+ |14:16:#<TM|24:26:#>TA|44:37:*>ST|71:66:#=TT
+
+which indicates that residue 14 in the query sequence aligns with
+residue 16 in the target (library) with annotation symbol '#', the
+alignment score is '<' less than zero, and the residues are 'T'
+(query) and 'M' (library). (The '|' is used to separate each
+annotation entry.)
+
+>>Nov. 10, 2007
+
+Parts of p2_complib.c and p2_workcomp.c, and the pvm/mpi Makefiles,
+have been updated to be consistent with name changes in the param.h
+and structs.h directories.
+
+>>Nov. 20, 2007 fa35_02_08
+
+Parts of p2_complib.c and p2_workcomp.c, and the pvm/mpi Makefiles,
+have been updated to be consistent with name changes in the param.h
+and structs.h directories.
+
+>>Nov. 6, 2007 fa35_02_07
+
+Correct problems with asymmetric RNA matrices in initfa.c and rna.mat.
+
+>>Oct. 18, 2007
+
+Correct problem parsing ASN1 FastaDefLines when the database is local.
+
+Recovering from a misplaced cvs commit of code that was supposed to be
+on a branch, code has been recovered from earlier versions (fa35_02_05
+because fa35_02_06 has some branch contamination).
+
+>>Oct. 4, 2007 fa35_02_06
+
+Correct error in gap penalties in dropnnw.c. Due to an unfortunate
+inconsistency, the gap parameter in FLOCAL_ALIGN (in dropgsw2.c) had a
+different meaning than that in almost all the other programs (it was
+the sum of gap_open and gap_ext). The FLOCAL_ALIGN function call was
+copied for FGLOBAL_ALIGN, even though the FGLOBAL_ALIGN function
+used the more conventional gap_open, gap_ext parameters. Thus,
+FGLOBAL_ALIGN was wrong and the subsequent do_walign() in dropnnw.c
+were wrong. dropgsw2.c:FLOCAL_ALIGN has been modified to use the
+conventional gap_open parameter, and calls to dropnnw.c:
+FGLOBAL_ALIGN() and do_walign() have been fixed.
+
+>>Sept. 20, 2007
+
+Modify the logic used when saving a seq_record *seq_p into beststr
+*bbp to ensure that if the seq_record is replaced, it is replaced at
+all the places where it is referenced. This involves adding a linked
+list into beststr (*bbp->bbp_link). When making the link (and freeing
+it up), be certain that the linked seq_p is the same as the one being
+replaced.
+
+>>Sept. 18, 2007 fa35_02_05
+
+A relatively obscure problem was found on the SGI platform when
+searching a library smaller than 500 sequences (thus requiring some
+shuffles). Two bugs were found and corrected; one involved not
+allocating aa1shuff with COMP_THR and not doing a m_file_p->ranliba()
+before re_getlib(). The second involved destroying a pointer to the
+list of seq_records when a sequence was being shuffled. The bugs were
+confirmed with Insure, and have been fixed.
+
+>>Sept. 7, 2007 fa35_02_04
+
+Revamp the offset handling code to provide better uniformity between
+query and library offsets and coordinate systems.
+
+Fix a problem with load_mmap() to load 64-bit sequence locations
+properly on machines with 32-bit integers.
+
+>>Sept. 4, 2007
+
+Modify ncbl2_mlib.c slightly to check to see whether the amino-acid
+mapping in blast databases is identical to the FASTA mapping (it
+should be). If they are identical, do not re-map the blast amino acid
+sequences (potentially a small speed up).
+
+>>Aug. 22, 2007
+
+Change ps_lav.c to lav2ps.c, and add lav2svg.c. It is now possible to
+generate a lalign35 HTML output that has both SVG (lav2svg) and PNG
+(lav2ps | gs ), graphics.
+
+>>Aug. 10, 2007 CVS fa35_02_03
+
+Fix faatran.c:aacmap() bug.
+
+>>Aug. 6, 2007
+
+Extensive restructuring of pssm_asn_subs.c to parse PSSM:2 ASN.1's
+downloaded from NCBI WWW PSI-BLAST more robustly.
+
+>>July 25, 2007 CVS fa35_02_02
+
+Change default gap penalties for OPTIMA5 matrix to -20/-2 from -24/-4.
+
+>>July 24, 2007
+
+Correct bugs introduced by adding 'J' - 'J' was initially put before
+'X' and '*' in the alphabet, which led to problems because the
+one-dimensional lower-triangular pam[] matrices (abl50[], abl62[],
+etc) had entries for 'X', and '*', but not for 'J'. By placing 'J'
+after the other characters, the problem is resolved.
+
+Modify tatstats.c to accommodate 'J'.
+
+'*' is back in the aascii[] matrix, so that it is present by default
+(like fasta34).
+
+>>July 23, 2007
+
+Changes to support sub-sequence ranges for "library" sequences -
+necessary for fully functional prss (ssearch35) and lalign35. For all
+programs, it is now possible to specify a subset of both the query and
+the library, e.g.
+
+ lalign35 -q mchu.aa:1-74 mchu.aa:75-148
+
+Note, however, that the subset range applied to the library will be
+applied to every sequence in the library - not just the first - and
+that the same subset range is applied to each sequence. This probably
+makes sense only if the library contains a single sequence (this is
+also true for the query file).
+
+Correct bugs in the functions that produce lav output from lalign35 -m
+11 to properly report the begin and end coordinates of both sequences.
+Previously, coordinates always began with "1". Correct associated bug
+in ps_lav.c that assumed coordinates started with "1".
+
+>>June 29, 2007 CVS fa35_02_01
+
+Merge of HEAD with fasta35 branch.
+
+>>June 29, 2007 CVS fa35_01_06
+
+Add exit(0); to ps_lav.c for 0 return code.
+
+>>June 26, 2007
+
+Add amino-acid 'J' for 'I' or 'L'.
+
+Add Mueller and Vingron (2000) J. Comp. Biol. 7:761-776 VT160 matrix,
+"-s VT160", and OPTIMA_5 (Kann et al. (2000) Proteins 41:498-503).
+
+Changes to dropnnw.c documentation functions to remove #ifdef's from
+strncpy() - which apparently is a macro in some versions of gcc.
+
+>>June 7, 2007
+
+Modify initfa.c to allow ggsearch35(_t), glsearch35(_t) to use PSSMs.
+
+>>June 5, 2007 CVS fa35_01_05
+
+Modifications to p2_complib.c, p2_workcomp.c to support Intel C
+compiler. Fixed bug in p2_workcomp.c - gstring[2][MAX_STR] required -
+[MAX_SSTR] too short. mp35comp* programs now tested and working (as
+are pv35comp*, c35.work* programs).
+
+Fix problem with fasts/fastm/fastf last_tat.c with limited memory.
+
+Correct problem with lalign35.exe Makefile.nm_[fp]com.
+
+Add $(CFLAGS) to map_db to enable large file support.
+
+Address problem with PSSM's when '*' not defined (initfa.c:extend_pssm()).
+
+>>May 30, 2007 CVS fa35_01_04
+
+Complete work on ps_lav, which converts an lalign35 lav (-m 11) file
+into a postscript plot, which looks identical to the plots produced by
+plalign from fasta2. (ps_lav has been replaced by lav2ps and lav2svg).
+
+>>May 25,29, 2007
+
+Changes to defs.h, doinit.c mshowalign.c for -m 11, which produces lav
+output only for lalign35.
+
+Changes to comp_lib2.c to add m_msg.std_output, which provides all the
+standard print lines. This is turned off for -m 11 (lav) output.
+lalign35 -m 11 provides standard lav output, with the addition of
+#lalign35 -q ... .
+
+>>May 18, 2007
+
+Add m_msg.zsflag to preserve pst.zsflag when reset by global/global
+exclusion of many library sequences.
+
+>>May 9, 2007 CVS fa35_01_03
+
+Tested local database size determination with p2_complib2/p2_workcomp2.
+
+>>May 2, 2007 renamed fasta35, pv35comp, etc
+
+Separate thread buffer structures from param.h.
+
+Problems with incorrect alignments have been fixed by re-initializing the
+best_seqs and lib_buf2_list.buf2 structures after each query sequence.
+
+The labels on the alignment scores are much more informative (and more
+diverse). In the past, alignment scores looked like:
+
+>>gi|121716|sp|P10649|GSTM1_MOUSE Glutathione S-transfer (218 aa)
+ s-w opt: 1497 Z-score: 1857.5 bits: 350.8 E(): 8.3e-97
+Smith-Waterman score: 1497; 100.0% identity (100.0% similar) in 218 aa overlap (1-218:1-218)
+^^^^^^^^^^^^^^
+
+where the highlighted text was either: "Smith-Waterman" or "banded
+Smith-Waterman". In fact, scores were calculated in other ways,
+including global/local for fasts and fastf. With the addition of
+ggsearch35, glsearch35, and lalign35, there are many more ways to
+calculate alignments: "Smith-Waterman" (ssearch and protein fasta),
+"banded Smith-Waterman" (DNA fasta), "Waterman-Eggert",
+"trans. Smith-Waterman", "global/local", "trans. global/local",
+"global/global (N-W)". The last option is a global global alignment,
+but with the affine gap penalties used in the Smith-Waterman
+algorithm.
+
+>>April 24, 2007
+
+The new program structure has been migrated to the PVM and MPI
+versions. In addition, the new global algorithms (pv35compgg,
+pv35compgl) have been moved, though the PVM/MPI versions do not
+(yet) do the appropriate size filtering.
+
+>>April 19, 2007
+
+Two new programs, ggsearch35(_t) and glsearch35_t are now available.
+ggsearch35(_t) calculates an alignment score that is global in the
+query and global in the library; glsearch35_t calculates an alignment
+that is global in the query, while local in the library
+sequence. The latter program is designed for global alignments to domains.
+
+Both programs assume that scores are normally distributed. This
+appears to be an excellent approximation for ggsearch35 scores, but
+the distribution is somewhat skewed for global/local (glsearch)
+scores. ggsearch35(_t) only compares the query to library sequences
+that are between 80% and 125% of the length of the query; glsearch
+limits comparisons to library sequences that are longer than 80% of
+the query. Initial results suggest that there is relatively little
+length dependence of scores over this range (scores go down
+dramatically outside these ranges).
+
+A bug was found and fixed in showalign() and showbest() where the
+aa1save buffer was not preserved when some sequences needed to be
+re-read, while others were stored in the beststr.
+
+>>April 9, 2007
+
+Some of the drop*.c functions have been reconfigured to reduce the
+amount of duplicate code. For example, dropgsw.c, dropnsw.c, and
+dropnfa.c all used exactly the same code to produce global alignments
+(NW_ALIGN() and nw_align()), this code is now in wm_align.c.
+Likewise, those same files, as well as dropgw2.c, use identical code
+to produce consensus alignments (calcons(), calcons_a(), calc_id(),
+calc_code()). Rather than working with three or four copies of
+identical code, there is now one version.
+
+>>March 29, 2007
+
+At last, the lalign (SIM) algorithm has been moved from FASTA21 to
+FASTA35. Currently, only lalign35 is available. A plotting version
+will be available shortly (or perhaps a more general solution that
+produces lav output).
+
+The statistical estimates for lalign35 should be much more accurate
+than those from the earlier lalign, because lambda and K are estimated
+from shuffles.
+
+Many functions have been modified to reduce the number of times
+structures are passed as arguments, rather than pointers.
+
+>>February 23, 2007
+
+The threading strategy has been modified slightly to separate the end
+of the search phase (and a complete reading of all results buffers)
+from the termination phase. This will allow future threading of
+subsequent phases, including the Smith-Waterman alignments in
+showbest() and showalign() (though care will be required to ensure
+that the results are presented in the correct order).
+
+>>February 20, 2007 fasta-34_27_0 (released as fasta-35_1)
+
+The FASTA programs have been restructured to reduce the differences
+between the threaded and unthreaded versions (and ultimately the
+parallel versions) and to make more efficient use of modern large
+memory systems. This is the beginning of a move towards a more robust
+shuffling strategy when searching databases with modest numbers of
+related sequences.
+
+The major changes:
+
+ comp_lib.c -> comp_lib2.c - comp_lib.c will be removed
+ work_thr.c -> work_thr2.c - work_thr.c will be removed
+
+ mshowbest.c, mshowalign.c have been modified to remove aa1 as an
+ argument. They must allocate that space if they need it.
+
+ The system is set up to allocate a substantial amount of library
+ sequence memory, either to a single buffer (unthreaded) or to the
+ threaded buffer pool. For smaller databases, the library sequences
+ are read once, and then subsequently read from memory (this could be
+ extended for RANLIB(bline) as well).
+
+Soon, these changes will allow the program to re-read the beststr[]
+sequences and shuffle them to produce accurate lambda/K estimates.
+
+================================================================
+
+See readme.v34t0 for earlier changes.
+
+================================================================
diff --git a/doc/readme.v36 b/doc/readme.v36
new file mode 100644
index 0000000..5826024
--- /dev/null
+++ b/doc/readme.v36
@@ -0,0 +1,2213 @@
+
+Version 3.6 of the FASTA programs is a significant update over version
+3.5. It uses the same underlying structure as FASTA35 (specifically
+the strategies for ensuring accurate statistics), but it allows for
+multiple high-scoring alignments to be shown, rather than just one.
+This is the main functional difference between FASTA and BLAST -
+BLAST could show multiple HSPs, FASTA did not.
+
+>>May 23, 2017 [released as fasta-36.3.8f]
+[url_subs.c]
+A small, but major change in the output available to the $SRCH_URL and
+$SRCH_URL2 strings, which are used to enable re-searching, and now
+pairwise alignment. (It would be better to provide a json string of
+the information, rather than using fprintf().) An additional value,
+the name of the query sequence, is provided to these urls so that
+pairwise alignment becomes possible.
+
+>>May 23, 2017
+[scripts/ann_feats2ipr.pl,ann_feats_up_www2.pl,test_ann_scripts.sh src/defs.h]
+Changes to ensure that EBI format databases, which place the ID before
+the accession, e.g. SP:GSTM1_HUMAN P09388, can be processed properly
+by annotation scripts. This involved displaying more of the
+description line, so that the accession field is included, in the
+annot_XXXXX file.
+
+>>May 8, 2017
+[compacc2e.c]
+Address problem where initial domain annotation similarity
+score/identity not properly reset.
+
+[scripts/annot_blast_btop2.pl]
+Fix various problems with domain scores, particularly in gaps, and
+domain coordinates.
+
+Modify version string to May, 2017
+
+>>April 18, 2017
+[cal_cons2.c]
+Address problem where identity count not correctly assigned to
+N-terminal domain at the end of a domain.
+
+>>April 14, 2017
+[src/compacc2e.c, scripts/ann_exons_up_www.pl]
+
+Provide a new script to annotate exon positions in Uniprot Proteins
+(scripts/ann_exons_up_www.pl) that uses the EBI proteins/api/coordinate service.
+
+Provide additional error checking on annotations to ensure that domain
+start is always <= domain end.
+
+>>Jan 17, 2017
+[scripts/ann_pfam30_tmptbl.pl]
+ann_pfam30_tmptbl.pl is a modification of ann_pfam30.pl that loads a
+temporary tables of accessions to be annotated, rather than asking for
+one sequence at a time.
+
+>>Dec 14, 2016
+[initfa.c/scaleswn.c]
+Change required shuffle count (down to 100) and introduce a
+median/IQR strategy to robustly estimate mean and S.D. for ggsearch
+(normal) comparisons (-z 3, in place of Altschul-Gish statistics).
+
+Modify version string to Dec., 2016.
+
+>>Nov 18, 2016
+[build_ares.c]
+fix sequence encoding memory leak
+
+>>Sept 30, 2016 [released as fasta-36.3.8e]
+[psisearch2/]
+
+Added a new sub-directory, psisearch2/, which includes scripts and
+documentation for the new iterative psisearch2_msa.pl and
+psisearch2_msa.py programs. These programs perform iterative PSIBLAST
+(or SSEARCH) searches, but with an option (--query_seed) that
+dramatically reduces false-positives.
+
+Modified most of the scripts/ann_*.pl files to work with new NCBI
+Swissprot accession format. Modified scripts/ann_feats_up_www2.pl and
+scripts/ann_upfeats_pfam_e.pl to work with JSON format Uniprot
+descriptions.
+
+>>July 28, 2016
+[src/pssm_asn_subs.c]
+Fix another problem with binary ASN.1 file processing where the
+asnp->abp buffer was not refilled in time.
+
+>>July 12, 2016
+[src/mshowbest.c]
+Modified -m8/-m 8CB output to include "eval2" when a second E()-value
+is available (when -z > 20). "eval2" is shown after the bit score,
+but before BTOP and annotations.
+
+>>May 25, 2016
+[scripts/ann_pfam28.pl]
+Implement --split_over command option, which takes overlapping domains
+and produces virtual like domains from the overlap region.
+
+>>Apr. 12, 2016 [released as fasta-36.3.8d]
+[src/pssm_asn_subs.c]
+Fix another problem with binary ASN.1 file processing where the
+asnp->abp buffer was not refilled in time.
+
+[initfa.c] - version date updated to Apr, 2016
+
+[upam.h] - changes to default gap penalties for VT40 (from -14/-2 to
+-13/-1), VT80 (from -14/-2 to -11/-1), and VT120 (from -10/-1 to -11/-1).
+
+>>Mar. 30, 2016
+[scripts/m9B_btop_msa.pl]
+Provide --bound_file_only, --bound_file_in, --bound_file_out.
+Ensure that alignments outside boundaries are NOT included in MSA.
+
+>>Mar. 22, 2016
+[scripts/m8_btop_msa.pl, m9B_btop_msa.pl]
+Ensure that full length query sequence is included in MSA.
+[pssm_asn_subs.c]
+Fixes to allow IUPACAA sequences in ASN.1 PSSM. Other fixes to ensure
+that arrays not allocated are not freed when wfreqs2d[] is not available.
+
+>>Mar. 18, 2016
+[scripts/m8_btop_msa.pl, m9B_btop_msa.pl]
+scripts/m8_btop_msa.pl takes a fasta36 -m 8CB output file and produces
+a multiple sequence alignment that can be used with psi-blast.
+
+scripts/m9B_btop_msa.pl takes a fasta36 -m 9B output file and produces
+a multiple sequence alignment that can be used with psi-blast.
+
+>>Feb. 15, 2016
+[mshowbest.c, compacc2e.c, cal_cons2.c, dropfx2.c, dropfz3.c]
+Modify logic for calculating percent identity in sub-alignments to use
+the BLASTP strategy, which does not count gapped regions as part of
+the alignment length. Fix the -m 8 display (BLAST tabular output) to
+use ungapped alignment length for percent identity (as -m BB does).
+
+[initfa.c] - version date updated to Feb, 2016
+
+>>Feb. 12, 2016
+[compacc2e.c, cal_cons2.c, dropfx2.c, dropfz3.c]
+Modify display_push_features() to use both the rst.score[score_ix],
+which is used to calculate the zscore and bitscore, and also sw_score,
+which is the correct divisor for sub-alignment scores. Previously,
+only the rst.score[score_ix] was used, which caused some bit scores to
+be out of range, and produced erroneous Q-value scores for
+sub-alignments.
+
+>>Jan. 24, 2016
+[cal_cons2.c]
+Ensure left_domain_link[01] set to NULL before initialized.
+
+Rename ann_feats2l.pl to ann_feats_up_sql.pl for consistency with
+ann_feats_up_www2.pl. ann_feats_up_www2.pl no longer works because of
+changes at the EBI.
+
+>>Dec. 15, 2015 [re-released as fasta-36.3.8c]
+[pssm_asn_subs.c]
+Fixed another problem parsing ASN.1 because of reading past the end of
+the buffer.
+[cal_cons2.c]
+Fix a serious bug that prevented display of annotated sites using -m9c/-m8CC
+
+>>Nov. 24, 2015 [re-released as fasta-36.3.8c]
+[mshowalign2.c]
+Correct first_line logic to display >>seqid description on first
+alignment line, but >- on remaining lines.
+
+>>Nov. 23, 2015 [released as fasta-36.3.8c]
+[cal_cons2.c, mshowalign2.c, scripts/annot_blast_btop.pl, scripts/ann*_e.pl]
+Fix the problem that lalign36 no longer displayed the library/subject
+accession/description. Correct some problems introduced with BTOP
+alignment encoding.
+
+A new script, scripts/annot_blast_btop.pl, is available to provide -V
+type sub-alignment scoring to BLASTP BTOP alignments stored in tabular
+files. In addition, the scripts/ann*.pl scripts were modified to work
+as part of a unix pipe, and the ann*_e.pl scripts replace the older
+non "_e.pl" scripts, and were renamed without the "_e" (thus,
+ann_pfam_www.pl was removed, and ann_pfam_www_e.pl was renamed
+ann_pfam_www.pl).
+
+
+>>Nov. 6, 2015
+[cal_cons2.c, initfa.c, mshowbest.c, dropfx2.c, dropfz3.c]
+Implement BLAST+ BTOP alignment format, available with -m 8CB or -m 9B.
+Convert previously static calc_code alignment strings to dynamic strings.
+
+>>Oct. 13, 2015 [released as fasta-36.3.8b]
+[initfa.c, pssm_asn_subs.c]
+Fix problems encountered when reading in binary ASN.1 file produced by
+datatool. Previous versions did not use the final score data provided
+by the tool; this version now uses that information if it is
+available. If it is not available, the PSSM integer values are
+calculated from the frequency data.
+
+>>Oct. 8, 2015
+[pssm_asn_subs.c]
+Fix a rare condition where the pssm_asn parser reads past the asn
+buffer.
+
+>>Sep. 28, 2015
+[comp_lib9.c, scaleswn.c, dropnfa.c, dropfx2.c dropfz3.c]
+(1) [scaleswn.c] -- changes to drop back to Altschul-Gish statistics
+when other strategies fail. (2) Fix to ensure that adler32() is
+calculated correctly for 1-residue library sequences; definition of
+adler32() added to drop*.c files.
+
+>>Sep. 7, 2015
+[Makefile.nmk_icl, Makefile.nm_pcomp, doinit.c, readme.win32]
+Automatic detection of thread/core number on windows. Changes to
+readme.w32 documentation, Windows programs no longer require sse2 in
+name (since all modern x86 processors have it).
+
+>>Sep. 4, 2015
+[comp_lib9.c, cal_cons2.c, dropfx2.c, dropfz3.c]
+(1) Fix bug with overlapping domains when a domain ends exactly where
+the alignment starts. (2) provide command line in -m 8CC output with -DPGM_DOC
+
+>>Aug. 31, 2015 [git v36.3.8_30Jul15]
+[cal_cons2.c, dropfx2.c, dropfz3.c, mshowbest.c, build_ares.c, doinit.c, comp_lib9.c]
+Modifications to enhance the independence of annotation output to
+different files. Earlier, annotations could not be properly output to
+different files in different formats. For example, -m 9c prevented -m
+"F8CC output.m8CC" -m "F9I output.m9I". Annotation output formats
+are now more independent. They are not fully independent, however.
+Thus, if CIGAR format is used for one output, it will be used in all
+other alignment encoding outputs.
+
+>>Aug. 21, 2015
+[cal_cons.c, dropfx2.c, dropfz3.c, mshowbest.c, build_ares.c, doinit.c]
+Add -m 9I to -m 9i. -m 9i reports identity and variation (based on
+annotation scripts). -m 9I also reports domain content on the initial
+summary line.
+
+>>Aug. 20, 2015 [fasta-36.3.8a]
+[mshowalign2.c]
+Fixed bug in lalign36 E()-value, bit score calculations for highest
+scoring non-identical alignment by reverting to older code. This bug
+was introduced in fasta-36.3.6d in January, 2014.
+
+>>Jul. 21, 2015 [fasta-36.3.8]
+[compacc2e.c, cal_cons2.c, dropfx2.c dropfz3.c, param.h]
+Fixed a major bug in the annotation code that had been added to
+accommodate overlapping domains. The original implementation was not
+thread-safe, because the array of annotations was modified during the
+scoring, but was also shared by threads. The new version keeps
+independent scoring arrays.
+
+>>Jun. 23, 2015 [released as fasta-36.3.7b]
+[dropnnw2.c]
+Fix problem where glsearch reset (ignored) the -M sequence limit.
+
+>>Jun. 18, 2015
+[dropfx.c, dropgsw.c, dropfx.c, dropfx2.c, dropfz3.c]
+Fix problem in do_walign.c with comparison to score_thresh during
+recursive alignment.
+
+>>May. 21, 2015
+[compacc2e.c]
+Add additional checks to ensure that annotations are within the
+sequence boundaries.
+
+>>Jan. 26, 2015 [ re-released as fasta-36.3.7a]
+[compacc2e.c]
+Fix problem with domain boundary calculations for subsets of sequences.
+
+>>Jan. 21, 2015 [ released as fasta-36.3.7a]
+[calc_cons2.c, dropfx2.c, dropfy3.c]
+Fix problems with -m 9c / -m 9C alignment encodings in version
+36.3.7. Apparently, the Nov. 25, 2014 fix was not committed properly.
+In addition, make certain that the query sequence is ALWAYS the
+reference sequence, particularly in translated alignments. As a
+result, the insertion/deletion codes are now reversed for fast[xy]36
+and tfast[xy]36.
+
+>>Jan. 6, 2014
+[data/VTML_*.mat]
+Provided scoring matrix files for the VTML_10,20,40,80,120,160,200
+matrices available internally.
+
+>>Nov. 25, 2014 [ released as fasta-36.3.7]
+[cal_cons.c, dropfx2.c, dropfz3.c]
+Fix problem that prevented -m 9c and -m 8CC unless annotations were
+present.
+
+Added approved copyright notice and Apache 2.0 license to
+appropriate files.
+
+>>Nov. 19, 2014
+[mshowbest.c]
+Add alignment (CIGAR) string and annotation string to BLAST tabular
+(-m 8) alignments with -m 8C[cCdD]. To get alignment and annotation
+encoding without BLAST comments, use -m 8X[cCdD].
+
+>>Nov. 10, 2014
+[cal_cons2.c, dropfx2.c, dropfz3.c]
+Ensure that site annotations are shown when annotations are embedded
+in a sequence, not provided by a script.
+
+>>Oct. 27, 2014
+[cal_cons2.c]
+Fix a bug in the annotation alignment that put annotation symbols off
+by one (or more) in the coordinate lines. Add annotations that align
+in gaps.
+
+>>Oct. 6, 2014
+[most source files]
+The copyright notice for fasta-36.3.7 has been updated to include an
+open software license, Apache2.0, for redistribution.
+
+>>Sept. 28, 2014
+[url_subs.c]
+Substitute annot_p->s_annot_arr_p[] for annot_p->domain_arr_p[i] in
+display_domains(), encode_json_str(). Remove domain_arr_p from struct
+annot_entry. With domain_arr_p gone, n_domains is less useful, but it
+is still available, and used for checking for domain graphics.
+encode_json_domains() also now uses annot_p->n_annots, and skips over
+non-domains.
+
+>>Sept. 19, 2014
+[dropfx2.c, dropfz3.c]
+Fixes to produce correct coordinates with forward and reverse
+complement [t]fast[x,y].
+
+>>Sept. 17, 2014 [new version, fasta-36.3.7]
+[compacc2e.c, cal_cons2.c, dropfx2.c, dropfz3.c]
+The annotation domain scoring/plotting strategy has been extended to
+allow overlapping domains. To accommodate overlapping domain
+annotations, the annotation file format (e.g. gstm1_human.annot) has
+been extended to accept the form:
+
+>sp|P09388|GSTM1_HUMAN
+1 - 88 Glutathione_S-Trfase_N :1
+7 V F Mutagen: Reduces catalytic activity 100- fold.
+90 - 208 Glutathione-S-Trfase_C-like :2
+108 V Q Mutagen: Reduces catalytic activity by half.
+
+where a "-" in the second field indicates that the first and third
+fields specify the beginning and end of the domain. In previous
+versions, a '[' specified the beginning of a domain, and a ']' on a
+later line specified the end of the domain. '[' and ']' on separate
+lines required that domains not overlap (so that the '[' and ']' could
+be paired). fasta-36.3.7 will still read this format, but the "start -
+stop" format is both simpler and more flexible.
+
+Four new annotation scripts are available that use the new domain
+notation: ann_feats2ipr_e.pl, ann_feats_up_www2_e.pl, ann_pfam_e.pl,
+and ann_pfam_www_e.pl. All four scripts will report overlapping
+domains.
+
+Overlapping domains also allows domain annotations from different
+sources to be combined (e.g. InterPro Pfam, Panther, and Superfamily
+domain annotations), as well as domain annotations of different types,
+e.g. Uniprot domain and secondary structure annotations.
+
+>>Aug. 28, 2014 [re-released as fasta-36.3.6f]
+[ncbl2_mlib.c]
+The code used to parse blastfmtdb sequence description lines has not
+kept up with NCBI's use of ASN.1 in sequence descriptions. This code
+has been updated, and now works properly with the protein and DNA
+sequence databases.
+
+[comp_lib9.c]
+Fixed a seg-fault that occurred when an open-file error occurred.
+
+>>Aug. 22, 2014 [released as fasta-36.3.6f]
+[mshowbest.c]
+Change alignment summary display for lalign to not show identical
+alignment score unless '-J' option used. Add "The best non-identical
+alignments" when no "-J" is given.
+
+[ann_pfam_www.pl] Fix bugs.
+
+[ncbl2_mlib.c]
+modified to read NCBI ambiguity codes in
+blastdbfmt/formatdb nucleotide databases. Not extensively tested.
+
+>>Aug. 20, 2014
+[compacc2.c, cal_cons.c, dropfx.c, dropfz2.c]
+Modify sub-alignment score report to calculate bit-score by scaling
+the total alignment bit score by the ratio of the sub-alignment raw
+score to the total alignment raw score. This produces a bit score
+that is much more sensible than the previous strategy, which
+calculated a z-score from the sub-alignment.
+
+>>Aug. 18, 2014
+[compacc2.c, cal_cons.c]
+Undo removal of '[]' from aa0a/aa1a (they are required to visualize
+domain boundaries in alignment). cal_cons.c now uses PSSMs when they
+are available.
+
+>>Aug. 8, 2014
+[comp_lib9.c, compacc2.c]
+Move the call to get query annotations via scripts out of compacc2.c
+and into comp_lib9.c.
+
+>>July 29,2014
+[comp_lib9.c, mshowbest.c, mshowalign2.c]
+Enable high scoring alignment display (like high scoring sequences)
+with lalign36, when -m 9 (-m 9c/d/C/D) option is provided, or with -m
+8. This allows lalign36 to provide a compact, tabular list of
+non-overlapping local alignments.
+
+>>June 30, 2014
+[pssm_asn_subs.c]
+Update the code for parsing ASN.1 binary PSSM files produced by
+psiblast+. The new code reads more of the optional fields in
+pssm_intermediate_data(). The fields are not used, but broke the
+earlier parser.
+
+>>June 11, 2014
+[cal_cons.c, initfa.c, dropfx.c, dropfz2.c]
+Extend the match/mismatch encoding provided by -m 9c and -m 9C with -m
+9d and -m 9D. The -m 9d/D options provide mismatch locations as well
+as insertion/deletion locations. For -m 9d, the list of codes has
+expanded from '=\/*' to '=\/*x'; for -m 9D, 'MDIMX'. Current
+implementation works for all programs except [t]fast[fms]. Updated
+version strings to June, 2014.
+
+>>May 28, 2014
+[mshowalign2.c, mshowbest.c, initfa.c, structs.h]
+Add the command line option -XI. Changes the calculation of percent
+identity to ensure that a single mismatch in a long sequence with >
+99.9% identity is displayed as 99.9% (0.999) identity, rather than
+100.0% identity. Without this option, a single mismatch in 10,000
+residues displays 100% identity, with the option, 99.9% identity is
+displayed (even though the identity is 99.99%).
+
+[cal_consf.c]
+Fix the false error message "code begins with 0" in cal_consf.c.
+
+>>Feb. 12, 2014
+[compacc2.c]
+When providing "sequence length" to annotation scripts, add offsets.
+Also modify scripts to allow sequence lengths to increase.
+
+>>Jan. 28, 2014 (re-released as fasta-36.3.6d/Jan 2014)
+[dropfs2.c, calconsf.c, tatstats.c]
+The coordinate fix for fasts36/fastm36 (Dec 18, 2013) broke some
+fasts/fastm alignments. The alignment code has been reverted to the
+"classic" code that has been used for more than 10 years. However,
+that code always marked the first aligned residue as 1, even when the
+first part of the query did not align. The initial coordinate offset
+has been fixed; the coordinate is now the position in the first
+aligned fragment. This may be confusing, because with fasts, the
+first aligned fragment may not be the first fragment in the query
+list. The coordinate provided always provides the offset from the
+beginning of the first fragment in the alignment, not the first
+fragment in the list. This fix required changes to the definition of
+calc_astruct(), which required changes to build_ares.c, mshowalign.c,
+calc_cons.c, dropfx.c, and dropfz2.c.
+
+>>Jan. 24, 2014
+[mshowalign2.c]
+Add checks to assumption that '>gi|12345' is an NCBI library entry.
+[nmgetlib.c]
+Fix for nmgetlib.c with -DMYSQL_DB
+
+Some cleanup of old Makefiles.
+
+>>Jan. 1, 2014
+[url_subs.c]
+Fix off by one in domain coordinates in display_domains().
+
+>>Dec. 18, 2013
+[dropfs2.c, cal_consf.c]
+Fix problem with alignment display when query sequence is much longer than library sequence.
+
+>>Dec. 11, 2013
+[compacc2.c]
+Modified save_best2() to correctly exclude sequences outside
+-M n1_low-n1_high limits.
+
+>>Nov. 8, 2013 (re-released as fasta-36.3.6d)
+[ncbl2_mlib.c]
+Fix problem with src_long8_read() where int/uint64_t seems to cause
+problems with Linux intel icc. Using int/unsigned int solves the problem.
+
+>>Nov. 1, 2013
+[apam.c, ncbl2_mlib.c, map_db.c]
+[apam.c ] Fix problem with query sequences and libraries that do not
+end in newline ('\n'). [ncbl2_mlib.c, map_db.c] provide grouping for
+shifts for byte extraction in src_int4/long8_read() to remove compiler
+warnings. [map_db.c] Fix problem reading sequences for indexing that
+caused crash.
+
+>>Oct. 8, 2013 (released as fasta-36.3.6d)
+[comp_lib9.c, initfa.c]
+Modify initfa.c/re_ascii() function to avoid qascii[] characters that
+had been remapped for annotations.
+
+>>Oct. 4, 2013
+[nmgetlib.c, ncbl2_mlib.c]
+Modify nmgetlib.c/re_openlib() to re-use memory mapped file arrays.
+This had been the intention for some time, but a check for libf != 0
+prevented the memory mapped arrays from being reused. libf is no
+longer checked, just mm_flag.
+
+>>Sep. 26, 2013
+[ncbl2_mlib.c]
+Fix a bug in ncbl2_mlib.c/parse_fastadl_asn() that prevented
+accessions longer than 20 characters in description lines from BLAST
+formatted libraries.
+
+[compacc2.c]
+Fix a bug in compacc2.c/comment_var() that showed the wrong original
+sequence in qVariant changes.
+
+>>Sep. 2, 2013
+[dropfs2.c]
+Fix bug in dropfs2.c/init_work() that prevents correct tatusov
+statistics with -z >10.
+
+>>Aug. 21, 2013 (released as fasta-36.3.6c)
+[comp_lib9.c]
+Fix bug in comp_lib9.c/new_seqr_chain() that prevented memory from
+being allocated to the chain if a memory mapped database was followed
+by a non-memory mapped database.
+
+>>Aug. 9, 2013
+[scaleswn.c]
+Ensure shift to MLE_STATS if too many scores are excluded by trimming.
+
+>>July 31, 2013 (released as fasta-36.3.6b)
+[url_subs.c]
+Make JSON output for -m 6 (html) dependent on $ENV{JSON_HTML}. JSON
+output is not currently used.
+
+>>July 26, 2013
+[mshowalign2.c, scripts/lavplt_svg.pl]
+Correct offsets in -m 11 lav plots, and modify lav2plt.pl/
+lavplt_svg.pl/ lavplt_ps.pl to reflect the corrections.
+
+Move all perl scripts out of /src into /scripts.
+
+>>July 19, 2013 (released as fasta-36.3.6a)
+[compacc2.c, cal_cons.c, dropfx.c, dropfz2.c, build_ares.c]
+Provide dynamic string allocation/dyn_strcat for annotation string
+output. This fixes problems with long proteins with many domains or
+other annotations, which were too long for the fixed annotation output
+storage.
+
+Version date updated to July, 2013.
+Compiled and tested on Windows32.
+
+>>July 8, 2013
+[cal_cons.c, dropfx.c, dropfz2.c]
+Properly terminate annotations with offsets [cal_cons.c], and with
+domains beyond alignment [dropfx.c, dropfz2.c]
+
+>>July 5, 2013 (released as fasta-36.3.6)
+[comp_lib9.c, doinit.c, dropfx.c, dropfz2.c]
+Fix conflict between -m 9 and -z -1; fix annotation display using
+non-script annotations. Stop using calc_last_set in dropfx/fz2.c.
+
+>>June 24, 2013
+[scripts/ann_feats_up_www2.pl]
+Add script (ann_feats_up_www2.pl) for annotating UniProt sequences using:
+"http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/uniprotkb".
+
+>>June 6, 2013
+[compacc2.c, cal_cons.c, initfa.c, dropfx.c, dropfz2.c]
+Provide the -XNS/-XXS/-XN+/XX+ and -XND/-XXD/-XN-/-XX- options that
+specify how N:N and X:X alignments are counted for similarity and
+identity. By default, N:N (DNA) and X:X (protein) alignments are
+considered identical, but not similar (because their scores are
+typically negative to address statistical issues).
+-XNS/-XXS/-XN+/-XX+ cause N:N/X:X alignments to be counted as similar,
+even though their alignment scores are negative. Likewise,
+-XND/-XXD/-XN-/-XX- cause N:N and X:X alignments to be considered
+non-identical (and non-similar).
+
+>>May 28, 2013
+[url_subs.c]
+do_url1() has been modified to: (1) require env($REF_URL, $SRCH_URL,
+$SRCH_URL1) for these links to produce printout. (2) Link text is
+surrounded by <!-- LINK_START "lname" --> <!-- LINK_STOP -->. (3)
+do_url1() now produces <!-- JSON --> output automatically, which can
+be used to get all the information provided by earlier URL links.
+
+>>May 29, 2013
+[mshowalign2.c]
+Re-instate code in showalign() to ensure that original bbp->rst is
+used for first alignment, rather than that calculated by CHECK_SCORE
+(which is used for later sub-HSP's). The CHECK_SCORE -S alignment
+score is based on the non-S alignment, and is then re-scored with the
+low-complexity -S matrix. But the best alignment excluding
+low-complexity can have a higher score than the best all-complexity
+alignment rescored with -S.
+
+>>May 27, 2013
+[mshowalign2.c, url_subs.c]
+The plot_domain.cgi SVG code has been expanded to allow the domain
+structure of the entire query and library sequence, not just the
+aligned regions, to be displayed. Showing domains above the query or
+below the library takes an additional 18 px in each direction (36
+total); this size needs to be provided in the <object data=""
+width="660" height="54"> format string that is provided in
+$DOMAIN_PLOT_URL.
+
+Right now, the argument to $DOMAIN_PLOT_URL can get very long with
+lots of aligned domain (region), and query and library domain
+information. It would be better to provide this in some separate way.
+YAML might also be a more efficient strategy.
+
+>>May 9, 2013
+[dropfx.c, dropfz2.c, compacc2.c, url_subs.c]
+The web infrastructure for domain plots has been completed --
+plot_domain2.cgi which generates SVG for domain plots now understands
+reverse-complement cDNA fastx/y alignments, and plots coordinates
+accordingly. Testing with fastx36/fasty36 revealed some memory
+errors, which have been fixed. In addition, dropfz2.c has been
+updated to properly treat some region/alignment-boundary conditions;
+dropfx.c and dropfz2.c provide equivalent sub-alignment scores.
+
+[../scripts, ../misc]
+A new directory, ./scripts, has been created to collect the scripts
+used for sequence library expansion and domain/feature annotation.
+../scripts/README.scripts provides more information. Modify code to
+allow expansion scripts (-e) to start with '\!', like annotation
+scripts.
+
+>>Apr. 15, 2013
+(compacc2.c, cal_cons.c, dropfx.c, dropfz2.c, mshowalign2.c)
+Modifications to properly deal with sequence and coordinate offsets in
+annotation alignments. compacc2.c/get_annot_list() has been modified
+to only print/read an annotation once (the same sequence may appear
+twice with fastx/fasty). mshowalign2.c now includes <!--
+ANNOT_START/STOP --> and <!-- ALIGN_START/STOP --> in HTML mode. These
+comments are not on their own line, to save output space, so the
+remainder of the line should be captured.
+
+>>Apr. 5, 2013
+(doinit.c)
+Add the ability to specify HTML output using the -m '0H' option. This
+addresses the problem that -m "F6" does not fully specify the output
+format. In addition, -m 6 should probably explicitly set -m 0 (if it
+has not been set), rather than simply 'or'ing it, but right now we do
+not know when it is set.
+
+>>Mar. 17, 2013
+(compacc2.c, url_subs.c, plot_domain.cgi, ann_feats2l.pl)
+Modifications to url_subs.c to support SVG domain maps in HTML output.
+
+A new environment variable has been defined, DOMAIN_PLOT_URL, which can
+be used to plot (using SVG or PNG) a map of the domains on the library
+sequence. The argument to DOMAIN_PLOT_URL is the concatenated list of
+annotations provided by the -V options. All annotations (including
+sites) are passed; non-alpha-numeric characters are URL encoded.
+plot_domain.cgi is an example of a script that can be passed as
+DOMAIN_PLOT_URL. To use this script:
+
+$ENV{DOMAIN_PLOT_URL}="<object data=\"plot_domain.cgi?n0=%d&query=%s&db=%s&lib=%s&q_start=%ld&q_stop=%ld&l_start=%ld&l_stop=%ld&n1=%d&o_pgm=%s&doms=%s\" width=\"660\" height=\"72\"></object>\n";
+
+ann_feats2l.pl has been extended to allow the --neg
+(or --neg-dom) option, which puts a NODOM domain annotation
+between the domain annotations provided by the database.
+
+>>Mar. 7, 2013
+(cal_cons.c)
+Modify update code to properly begin global alignments that start with
+insertions or deletions.
+
+>>Feb. 20, 2013
+(compacc2.c)
+Annotation scripts (-V \!ann_feats.pl) were being inactivated if no
+annotations were returned, fixed.
+
+>>Feb. 2, 2013
+(comp_lib9.c)
+Prevent premature termination of query title in -m 9 mode (guarantees
+the full >accession text to first space is preserved).
+(compacc2.c)
+Provide domain information (;C=PF00016) in -m9 domain scoring.
+
+>>Jan 7-9, 2013
+(initfa.c, pssm_asn_subs.c)
+Modify pssm_asn_subs.c to properly parse binary PssmWithParameters
+produced by NCBI asntool from psiblast (blast+) text ASN.1 output.
+The text ASN.1 uses a binary encoded query sequence; get_lambda() in
+initfa.c was modified to work with a binary encoded query sequence
+(the query is used to find the p_i from rrcounts[query[i]]).
+
+Modify pssm_asn_subs.c to set query=NULL when PSSM does not include
+query sequence. Modify read_asn_pssm() to set query=aa0 if query==NULL.
+
+>>Dec. 14, 2012
+(cal_cons.c, dropfx.c, dropfz2.c)
+Enable percent identity calculation on domains. Merge
+cal_cons.c/calc_code() strategies into dropfx.c, dropfz2.c
+
+>>Dec. 6, 2012
+(comp_lib8.c, comp_lib9.c, nmgetlib.c)
+Fix code in close_lib_list() that did not properly re-initialize files
+for re-reading (not seen when library is in memory, or for single
+sequence search).
+
+>>Dec 2, 2012
+(wm_align.c, Makefiles)
+CHECK_SCORE() in wm_align.c must return different scores for local and
+global (#define GGSEARCH in wm_align.c). Requires modified Makefiles.
+
+>>Sep 24, 2012
+(doinit.c, compacc2.c, cal_cons.c)
+Fix bugs introduced with next_annot_entry() strategy for reallocating
+annot_arr[]; find a bug in cal_cons.c where i1_annot was indexing
+annot0_arr_p[]; ensure that m_msg.ann_arr_def[] is appropriately initialized.
+
+>>Sep 17, 2012
+(lav2plt.pl, lavplt_ps.pl, lavplt_svg.pl, lav_defs.pl, l_feat_dom.pl)
+Convert the lav*.c programs to perl. This simplifies adding the
+ability to script domain annotation. The format for domain
+annotations for the lav2plt.pl programs differs slightly from the
+current up_feats_dom.pl program, because it requires a beginning and
+end for each domain, e.g.:
+
+>sp|Q14247.2|SRC8_HUMAN
+80 [] 116 Cortactin 1.
+117 [] 153 Cortactin 2.
+154 [] 190 Cortactin 3.
+191 [] 227 Cortactin 4.
+228 [] 264 Cortactin 5.
+265 [] 301 Cortactin 6.
+302 [] 324 Cort. 7; trunc.
+492 [] 550 SH3.
+
+and takes a single accession from the command line, e.g.:
+"l_annot_dom.pl sp|P09488" rather than reading a file.
+
+>>Sep 4, 2012
+(doinit.c, compacc2.c, fasta_guide.tex)
+Annotations can now be provided within a sequence (-V '%#!'), by a
+script (-V '\!up_feats.pl'), or from a file (-V '<annot.file
+q<annot.file'). Annotation files make particular sense for query
+annotations, where the user may know much more about the query than
+the database does.
+
+(doinit.c, compacc2.c, comp_lib9.c, structs.h)
+Ensure that calc_code() is called if any -m 'F9c file' requires it.
+
+>>Aug 31, 2012
+(cal_cons.c, compacc2.c, dropfx.c, dropfz2.c)
+The region score calculations have been corrected to include regions
+that overlap alignment boundaries, and regions that start in gaps.
+
+>>Aug 10, 2012
+(cal_cons.c, compacc2.c, dropfx.c, dropfz2.c)
+
+Introduce a second kind of annotation feature, the "Region" (denoted
+by '[' and ']'), that specifies a region that should be scored
+separately. These regions cannot be nested, each residue can belong
+to only one region. However, the scores in these regions can be
+calculated (perhaps percent identity and length later), and are
+displayed:
+
+>>sp|P09488|gstm1_human GLUTATHIONE S-TRANSFERASE MU 1 ( (218 aa)
+ Site:* : 23Y=23Y : MOD_RES: Phosphotyrosine (By similarity).
+ Site:* : 33Y=33Y : MOD_RES: Phosphotyrosine (By similarity).
+ Site:* : 34T=34T : MOD_RES: Phosphothreonine (By similarity).
+ Region : 3-82 : score=547; bits=146.4 : GST_N
+ Site:^ : 116Y=116Y : BINDING: Substrate.
+ Region : 104-171 : score=465; bits=125.8 : GST_C
+
+All information about the region should be provided with the '['
+(start) symbol.
+
+>>Aug 1, 2012
+(dropfx.c, dropfz2.c, c_dispn.c)
+Fix some very old bugs that caused errors in coordinate displays of
+reverse-complement fastx/fasty alignments. Fix BLAST alignment
+display coordinates. Enable variant calculations for FASTY
+(dropfz2.c), and simplify calculations for dropfx.c
+
+>>Jul 29,2012
+(doinit.c, compacc2.c, comp_lib9.c)
+Allow annotation descriptions to be delivered by annotation script,
+denoted by '=' in first line, e.g.:
+=*:phosphorylation
+=^:binding site
+=@:active site
+>gi|121735|sp|P09488.3|GSTM1_HUMAN
+7 V F Mutagen: Reduces catalytic activity 100- fold.
+23 * - MOD_RES: Phosphotyrosine (By similarity).
+33 * - MOD_RES: Phosphotyrosine (By similarity).
+34 * - MOD_RES: Phosphothreonine (By similarity).
+
+remove requirement for leading space before annotation script: e.g.:
+-V '\!up_feats_c.pl'
+
+>>Jul 27, 2012
+(compacc2.c, cal_cons.c, dropfx.c)
+
+(1) Allow comments/descriptions on features other than type 'V' (variant)
+to be displayed with alignment. If a '@' SITE feature has a comment
+provided by the annotation script, the comment will be displayed in
+the alignment description, e.g.:
+
+>>sp|P28161.2|GSTM2_HUMAN Glutathione S-transf (218 aa)
+ ^ :116Y=116Y: BINDING: Substrate (By similarity).
+ @ :210S+210T: SITE: Important for substrate specificity.
+ initn: 632 init1: 632 opt: 632 Z-score: 1414.3 bits: 268.8 E(450603): 2.6e-71
+Smith-Waterman score: 945; 75.2% identity (93.6% similar) in 218 aa overlap (1-218:1-218)
+
+If no comment is provided, the annotation will only appear in the
+coordinate line. This provides a way to show annotation locations in
+BLAST output.
+
+(2) Also add code to ensure that symbols returned by annotation scripts
+are displayed on the coordinate line.
+
+(3) Add environment variable substitution to =${TMP_D}/annot.defs and
+\!${TMP_D}/up_feats_c.pl parsing.
+
+>>Jul 24, 2012
+(uascii.h, map_db.c)
+Modify NANN, a value one more than the largest amino-acid encoding
+value, increasing it from 50 (too small for NCBIStdaa_ext_n) to 60;
+ESS changed to 59.
+
+>>Jul 20, 2012
+(mshowalign2.c, mshowbest.c, compacc2.c, comp_lib8.c)
+(transferred from fasta-36.3.5)
+(a) Fix bug in mshowalign2.c that occurred because of re-use of the
+"tmp_len" variable when adding '\n' to -L long descriptions. This
+typically occurred with -m 10. (b) Modify logic used to capture if an
+alignment had been calculated, reducing dramatically the number of
+re-alignments with multiple -m "F" output files.
+
+>>Jun 30, 2012
+(mshowbest.c)
+Ensure that opt score and E()-value are based on initial scan score,
+not later alignment score. score_delta is used to increment initial
+scan score. However, currently the E()-value of the alignment score
+is displayed in the alignment list, so the -m 9 and showalign()
+E()-values can be inconsistent.
+
+>>Jun 29, 2012 (from fasta-36.3.5c)
+(pssm_asn_subs.c)
+Add chk_asn_buf() before getting RPSPARAMS_MATRIX.
+
+>>Jun. 27, 2012 (from fasta-36.3.5c)
+(nmgetlib.c, compacc2.c)
+Fix bug that allocated unnecessary space for re-loading sequences in
+pre_load_best() (compacc2.c). Ensure that closed/NULL memory mapped
+file descriptors are not returned.
+
+>>Jun. 18, 2012
+(compacc2.c)
+Modify pre_load_best() to allocate memory for sequences to be aligned
+only if the sequences are not already in memory. (Searches against
+hg18 with repetitive queries caused very large amounts of memory to be
+allocated in duplicate.)
+
+>>Jun. 12, 2012
+(compacc2.c, doinit.c, dropfx.c, cal_consf.c)
+Implement variant scoring for fastx36. Also address problems with
+annotation location when -m markx is not set. Check function
+definitions for other drop functions where variant scoring is not yet
+implemented.
+
+>>Jun. 9, 2012
+(defs.h, doinit.c, c_dispn.c)
+Add 'M' and 'B' options to -m 0,1 to specify annotation location. For
+example, -m 0M (-m1) causes the annotation to be inserted in the
+"middle" alignment line, rather than in the coordinate line (making
+the sequence with the annotated feature ambiguous). -m 0B, -m1B
+puts the annotation in both the middle (alignment) line and the
+coordinate line.
+
+>>Jun. 8, 2012
+(doinit.c, compacc2.c, build_ares.c, mshowbest.c, mshowalign2.c,
+structs.h and others)
+
+Implement a script-driven strategy for feature annotation in
+alignments. In addition to: fasta36 -V '*%^@', which extracts the
+annotation characters from the library sequences, we can also do:
+fasta36 -V '*%^@ \!feature_script.pl' which expects the same
+annotation characters ('*%^@'), but expects them from the script
+'feature_script.pl'. This script gets the sequence description line,
+e.g: "gi|121746|sp|P09211|GSTP1_HUMAN Glutathione S-transferase P (GST
+class-pi) (GSTP1-1)", and is expected to return a tab-delimited file:
+====
+pos label value
+23 *
+33 *
+34 *
+116 ^
+173 V N
+210 V T
+====
+
+Currently, the "value" is ignored unless the label is "V", for
+variant. If 'V' annotations are present, then the alternative
+amino-acid residue values are tested in alignments; if the variant
+residue improves the score, the score is updated and the variant
+sequence is displayed, and a 'V' indicates the variant in the
+coordinate line. Currently, variant annotations can only affect
+library sequences.
+
+By default, annotation symbols are shown in the coordinate line for -m
+0 (default) and -m 1 (difference) alignments, sometimes overwriting
+the coordinate. Annotation symbols (from either sequence) can be shown
+in the middle alignment line by specifying -m 0M or -m 1M, or in both
+the middle alignment line and the coordinate line with -m 0B, -m 1B.
+
+>>May 5, 2012
+(dropnnw2.c)
+Enable rev-comp for ggsearch/glsearch.
+
+>>Mar. 13, 2012
+(defs.h)
+Increase default file name length to 256 from 120 to accommodate long
+file names at the EBI. Also allow much longer command line arguments
+argv_line[MAX_LSTR=4096] to be reported.
+
+>>Jan. 30, 2012
+(nmgetlib.c, altlib.h)
+Read .fastq sequence libraries (ignoring quality information) as library type '7'.
+
+>>Dec. 21, 2011 (released as fasta-36.3.5c)
+(nmgetlib.c)
+Fixed a problem reading multiple library files that produced
+segmentation faults because a data buffer was free()ed and then
+re-used.
+
+>>Nov. 17, 2011
+(initfa.c, mshowalign.c) (from fasta-36.3.5b)
+Fix problem with ppst->e_cut_r for LALIGN DNA sequences (set
+improperly to 0.001). Add ':' to s_bits: in -m 10 output. Also
+remove "score" from "lsw_s-w opt" score description (not present in
+non-LALIGN -m 10).
+
+>>Nov. 9, 2011 (from fasta-36.3.5b)
+(lavplt_svg.c, lavplt_ps.c, ncbl2_mlib.c)
+Fix buffer overrun for lav legend. Fix old problem re-opening NCBI
+blastdbfmt indirect OID files.
+
+>>Oct. 30, 2011
+(comp_lib9.c)
+Correct re-initialization bug that prevented the second query sequence
+from seeing the entire library.
+
+[from fasta-36.3.5a_svn]
+(comp_lib9.c, comp_lib8.c, ncbl2_mlib.c, nmgetlib.c)
+Address out-of-memory problems when searching memory mapped, and fix
+problem using fopen()/fread() rather than mmap for NCBI DNA databases. On
+32-bit machines, NCBI database files cannot be left open, and are now
+more aggressively closed. However, searches that produce very large
+numbers of alignments may still run out of memory on low-memory 32-bit
+machines.
+
+(compacc2.c, comp_lib8.c, comp_lib9.c, htime.c)
+Correct problems that produce negative scan times.
+
+>>Oct. 21, 2011
+(pcomp_subs2.c, work_thr2.c, mshowalign2.c, make/Makefile.mp_com2, Makefile.fcom)
+Fixes to re-enable MPI compilation and execution.
+
+>>Oct. 18, 2011
+(compacc2.c, mshowbest.c, comp_lib8.c, comp_lib9.c, initfa.c)
+Fix the logic for specifying the number of alignments displayed with
+the -b 123, -b '>123', -b '=123', -b '$' options, particularly when
+statistics are not used.
+
+>>September 21, 2011
+(initfa.c, apam.c, scaleswn.c compacc2.c)
+Two major problems have been addressed (which also affect fasta-36.3.5
+and earlier versions): (a) specifying a -s dna.mat DNA matrix did not
+work properly; (b) too few shuffles, particularly with DNA sequences,
+were produced with pairwise comparisons. The problem with scoring
+matrix files was exacerbated by the use of fixed library alphabets.
+initfa.c has been modified to recognize that when a DNA scoring matrix
+is specified, the "-n" option is set. The shuffling problem appeared
+when, for pairwise DNA comparisons, fewer than 50 shuffles were
+reported. This occurred because the buffers used to communicate with
+threads no longer have a fixed amount of sequence buffer associated
+with them.
+
+>>August 23, 2011
+(tatstats.c, upam.h, apam.c)
+The remapping of the amino-acid encoding to NCBIstdaa broke some
+assumptions in tatstats.c, and elsewhere. In addition to the simple
+mapping problem, which changed the counts[] assignment in
+tatstats.c/calc_priors(), the fact that NCBIstdaa does not have
+contiguous real amino acids (e.g. B is at position 2), broke the
+generate_tatprobs() function because of a very old bug where priorptr
+was not always incremented.
+
+Some of the drop*.c functions have been updated to ensure that the
+space allocated for rapid pam[][] score lookup includes space for
+lower-case characters, which can be present in pseg'ed "map_db -b"
+libraries. In addition, binary format (currently all mmap'ed)
+libraries cannot include annotations, because common annotation values
+('*', '&') overlap the range of the NCBIstdaa_l (lowercase) mapping.
+
+>>August 1, 2011
+(map_db.c)
+map_db.c has been modified to provide a more efficient memory mapping
+for FASTA format files. map_db -b works like map_db, but, in addition
+to writing the .xin index file of descriptions and sequences in the
+FASTA library, it also produces a new protein_library.bsq file and
+protein_library.xin_b that contains binary encodings of the databases
+and an index for this file. The binary encoding can be memory mapped,
+so that database searches can proceed directly from memory. map_db -b
+.bsq files are very similar to the blastfmtdb files, except that they
+accommodate lower-case letters (masked) in the sequences. The
+implementation of blastfmtdb lower-case masking prevents it from being
+used in directly memory mapped files.
+
+map_db.c introduces a new memory mapped format encoding, MP2. I
+expect this format to be extended to allow not only directly memory
+mapped files, but also directly memory mapped lookup tables. A
+database can be hashed, and the hash and link files written to a
+library file, which can then be used for searches without the need to
+re-calculate the hash/link tables.
+
+(comp_lib9.c, mmgetaa.c, ncbl2_mlib.c, initfa.c, dropfz.c)
+Modifications to allow memory mapped files to be read and processed
+directly. Databases with lower-case characters can be memory mapped,
+which means that lower-case characters are coming into the alignment
+programs even when -S is not specified. As a result, all the protein
+scoring matrices must be built-out to allow lower-case
+characters. Likewise, the dropfz2.c matrices built by init_weights()
+must always be set for lower-case characters.
+
+>>July 20, 2011
+(mshowbest.c, mshowalign2.c)
+gi|12345 numbers are no longer shown in the list of best hits unless
+-m 8 or -m 9 are used. They are never shown in the alignments.
+(dropfz2.c)
+Modify MAX_UC, MAX_LC to be consistent with NCBIstdaa alphabet. Modify
+<= nsq for init_weights().
+
+>>July 16, 2011 fasta-36.3.6
+(comp_lib9.c, drop*.c, cal_cons*.c)
+The internal encoding of amino-acids has changed to NCBIstdaa
+throughout the programs. This allows the programs to use memory
+mapped NCBI blastdbfmt libraries directly, without re-encoding, but
+lower-case low-complexity mapping is not recognized. This allows
+substantial speedup in single query searching. However, to allow
+low-complexity searches, a new memory mapped format/encoding will be
+required.
+
+>>July 5, 2011 fasta-36.3.6
+(compacc2.c)
+Modify save_best2() logic for identifying scores to be used for
+statistics. An is_valid_stat is set for multi-frame results that
+specify which scores can be used for the stats[] and qstats[] arrays.
+Modifications to buf_do_work(), buf_shuf_work(), and buf_qshuf_work()
+to cause the calculation to be done in the thread, rather than the
+main program. Fix some bugs in the qshuffle code to ensure that all
+valid shuffles up to maxshuff are saved.
+
+(complib5e.c, complib7e.c, complib8.c)
+Fix -m 9c/C core dump with -z -1.
+
+(cal_cons.c, cal_consf.c)
+Reverse 'I', 'D' with CIGAR string.
+
+>>June 26, 2011
+(comp_lib8.c, compacc2.c)
+
+Added the ability to search a library produced/specified by a script.
+Like the "-e expand_script.sh", searching against a library that
+begins with a '!', e.g. '!library_script.sh', causes the
+library_script.sh to be executed, producing a temporary file from
+stdout, which is then scanned as the database. As with expansion
+files, all the standard library syntax can be included. Thus, if
+cat_db.sh contains the command 'echo /seqdb/swissprot.lseg', the
+command:
+
+ fasta36 query.aa '\!@cat_db.sh'
+
+will cause cat_db.sh to produce a temporary file with the line
+"swissprot.lseg"; the temporary file will be interpreted as an
+indirect file of filenames; and swissprot.lseg will be searched. Note
+that in Unix systems, the '!' must be preceded by a '\' as shown
+above, so that it is not interpreted by the shell.
+
+>>June 23,24 2011
+(compacc2.c, comp_lib8.c, mysql_lib.c)
+A new save_best2() function in compacc2.c has been designed to
+simplify the logic involved in saving best scores, with the goal of
+moving some of the save_best() calculations into individual threads.
+
+mysql_lib.c has a new command, close_tables, that allows a script to
+remove a table after it has been used. (It might make more sense to
+add this to the extension script option.)
+
+>>June 14, 2011 (released as fasta-36.3.5a June, 2011)
+(comp_lib7e.c, comp_lib8.c, compacc2.c)
+Fix a serious bug in next_sequence_p() that caused a portion of the library to
+be missed when long sequences filled the sequence buffer before the
+slots were filled.
+
+Make certain that thread buffers are cleared when running an expansion
+script.
+
+Return an extra '\n' before the final summary for consistency with
+earlier versions.
+
+>>June 2, 2011 (released as fasta-36.3.5 June, 2011)
+(comp_lib8.c, comp_lib5e.c, comp_lib7e.c)
+Fix a bug that indicated that linked expanded sequences were
+pre-loaded for alignment when they were not.
+
+>>May 24, 2011 (released as fasta-36.3.5)
+(comp_lib8.c, comp_lib7e.c, comp_lib5e.c, mshowalign2.c, compacc2.c,
+initfa.c, param.h, scaleswn.c)
+
+The in-memory versions of the program are allocating much more memory
+than they actually use, causing the memory limits to cut in too soon.
+Fix this by using a smaller MAXLIB_P (36000) for searches against
+protein libraries, and expanding/contracting the aa1b_size more
+sensibly. Also add lost_memK value to track lost memory. For protein
+searches, lost memory is now around 15% of allocated memory (down from
+40%).
+
+Numerous fixes to improve formatting of HTML output. Full statistics
+parameters are now available with the fdata output.
+
+Add fset_vars() to comp_lib8.c to set m_msg.max_memK properly.
+Parameters have been modified to ensure less memory waste (all buffers
+have 1000 sequences); Drop default 64-bit library memory limit to 8GB
+(-XM8G, LIB_MEMK=8G).
+
+>>May 25, 2011
+(comp_lib8.c, comp_lib7e.c, comp_lib5e.c, mshowbest.c)
+
+Add the '-b >1' option, which guarantees that at least 1 result is shown,
+but otherwise limits by E()-value. '-b =10' guarantees to show
+exactly 10 results (never more or less if the library is large
+enough), '-b 10' will show no more than 10 results, limited by -E
+e_cut, and '-b >1' will show at least 1 result, but is otherwise
+limited by -E e_cut.
+
+>>May 19, 2011
+(comp_lib8.c, compacc2.c, param.h)
+comp_lib8.c is a version of comp_lib7e.c that keeps sequences in
+memory over multiple searches, but returns seqr_chains of buffers of
+sequences as they are read, rather than waiting for everything to be
+read. comp_lib8.c will automatically allocate up to 2 GB (32-bit
+machines) or 8 GB (64-bit machines) to hold the sequence database in
+a multiple query search. This number can be increased or decreased
+using the -XM# (megabytes) or -XM#G (gigabytes) option, or by setting
+the LIB_MEMK environment variable. -XM4G (LIB_MEMK=4G) makes 4GB
+available for sequence libraries; -XM-1 makes all machine memory
+available.
+
+>>May 5 2011
+(mshowbest.c)
+Fix problems that prevented "-b align_number" from properly limiting output
+with "-z -1". "-z -1" also broke multiple HSPs (since no threshold
+could be calculated); fixed.
+(dropnfa.c)
+Fix some offset arithmetic that prevented FASTA alignments from
+extending to full length in do_walign().
+
+>>May 4, 2011
+(scaleswn.c)
+Provide additional checks for division by low numbers in fit_llen2()
+and fit_llens(). The similarities between fit_llen(), fit_llens(),
+and fit_llen2() have been highlighted, and their differences
+documented. scaleswn.c now provides pstat_info, which writes all the
+values required to re-calculate zscores or E()-values from raw scores.
+
+>>May 2, 2011
+(dropnfa.c)
+Fix a problem with the traditional cgap(join)/optcut(opt) thresholds
+(no longer used by default) caused by allowing ktup=3 for proteins.
+The ktup=3 modification increased the cgap/opt thresholds by 6.
+
+(comp_lib5e.c, comp_lib7e.c, comp_lib8.c)
+Confirm identity of -m # and -m "F3 file.out". Small differences fixed.
+
+(mshowbest.c, mshowalign2.c)
+Remove gi|12345 information from -m B, -m BB blast-like output. NCBI
+Blast does not display gi numbers.
+
+>>Apr. 22, 2011
+(doinit.c, initfa.c)
+Several of the less common options have been changed to expanded
+options, changing the meaning of -X (which now specifies expanded
+options), as well as -o, -1, -B, -x, and -y. -o now provides the
+offset coordinates previously specified with -X; -B is now -XB, -o
+-Xo, -x -Xx1,-1, and -y -Xy, e.g. -Xy32.
+
+>>Apr. 19, 2011
+(comp_lib7e.c, comp_lib5e.c, doinit.c, mshowbest.c)
+Test latest version with -I interactive mode. Modifications
+required to ensure that alignments go to outfd, not stdout, when
+filename is entered. In addition, in interactive mode there can be
+more scores shown than e_cut, so bbp->repeat_thresh must be set in
+showbest() not main() program.
+
+>>Apr. 17, 2011
+(comp_lib7e.c, doinit.c, compacc.c)
+
+The FASTA programs now support multiple output files with different -m
+out_fmt types using the -m "F# out_file" or -m "F#,#,# out_file"
+option. Normally, the -m out_fmt option applies to the default output
+file, which is either stdout, or specified with -O out_file (or within
+the program in interactive mode). With -m F, an output format can be
+associated with a separate output file, which will contain a complete
+FASTA program output. Thus,
+
+ ssearch36 -m 9c -m "FBB blast.out_file" -m "F10 m10.out_file" query library
+
+will send the -m 9c output to stdout, but will also send -m BB output
+to blast.out_file, and -m 10 output to m10.out_file. Consistent -m
+out_fmt commands can be set to the same file by separating them with
+','; e.g.:
+
+ ssearch36 -m 9c -m "F9c,10 m9c_10.out_file" query library.
+
+Producing alternative format alignments in different files has little
+additional computational cost.
+
+One of the shortcomings of this approach is that it affects only the
+output format, not the other options that modify the amount of output.
+Thus, if you specify -E 0.001, that expect threshold will be used for
+all the output files. When a -m option can modify the output (e.g. -m
+8 sets -d 0), that modification persists only for that file.
+
+>>Apr. 14, 2011
+(initfa.c)
+Fix bugs in e_cut_r calculation that made it much too low for
+lalign36, and used the >1.0 divisor improperly for all programs
+(change from e_cut_r = e_cut_r/divisor to e_cut_r = e_cut/divisor).
+
+>>Apr. 11, 2011
+(comp_lib5e.c, comp_lib7e.c, compacc.c)
+
+The non-preload version of FASTA (comp_lib5.c) has been extended to
+allow script expansion (comp_lib5e.c). To do this, the central score
+calculation loops have been moved to getlib_buf_work(), just as
+seqr_chain_work() was created for comp_lib7e.c. Moreover, the
+function used to build the link_file names, build_link_data(), is now
+in compacc.c. Differences between comp_lib5e.c and comp_lib7e.c have
+been reduced.
+
+>>Apr. 5, 2011
+(comp_lib7e.c)
+Fix issue with closing unopened link_lib_list_p when no results are
+found. Remove no-sequence error message for link library file.
+
+>>Apr. 1, 2011
+(comp_lib7e.c)
+The -e script.sh has been generalized to have all the capabilities of
+a library file, in particular '@' specifies an indirect file, and
+"script.sh #" allows a library type to be specified. Thus, the
+script.sh invoked by "@script.sh" should not produce a fasta file; it
+should produce a file that contains the name of a fasta file (or
+possibly some other format). If '@' is used, the link_lib file
+written to stdout will be prepended with '@', and treated as an
+indirect file of file names.
+
+(comp_lib5.c, comp_lib7.c, comp_lib7e.c)
+Fix problem with null refstr (no Please cite:).
+
+>>Mar. 31, 2011
+(comp_lib7.c, comp_lib7e.c)
+close_lib() was being called after each query. This is incorrect for
+versions (like comp_lib7) that keep the entire database in memory; the
+files must be kept open to allow ranlib() to get long descriptions
+(alternatively, a long description could be read initially).
+
+(comp_lib5.c, comp_lib7.c, comp_lib7e.c)
+Fix query offset coordinates for long queries that are broken up.
+Allow query library to have zero-length sequences without stopping
+(queries now stop when end-of-file is reached).
+
+(upam.h)
+Fix gap penalties for BLOSUM80 matrix (change from -14, -2 to -10, -2).
+
+>>Mar. 29, 2011
+(comp_lib7e.c, doinit.c)
+
+Add the ability to search an expanded set of sequences based on the
+accessions from the initial search using "-e expand.sh" option.
+If "-e expand_script.sh" is specified, the command:
+
+ expand.sh link_acc_file > link_lib_file
+
+is run by the program (fasta36, ssearch36, fastx36, etc), where
+link_acc_file and link_lib_file are temporary file names produced by
+the program. (The location of the temporary files can be specified
+with the $TMP_DIR environment variable.) link_acc_file contains a
+list of accession strings for the statistically significant hits - the
+information in the description line to the first space, e.g.
+
+gi|121719|sp|P08010|GSTM2_RAT
+gi|121746|sp|P09211|GSTP1_HUMAN
+
+from a search against my pir1.lseg library.
+
+"expand.sh" then reads that file, extracts the accession information,
+expands the accessions to a new set of accessions, extracts the
+expanded set of accessions from a database and writes them to
+standard output (which is saved in the temporary link_lib_file
+name). The sequences in expanded link_lib_file are then added to the
+initial search, and included in the list of best scores (and
+alignments) if their scores are statistically significant. The
+additional sequences do not change the initial library size.
+
+To test the expansion capability, use an expand.sh script that simply
+cat's a file of homologs to stdout (which will go to link_lib_file and
+be read), e.g. expand.sh contains "cat ../seq/gst.lib".
+
+Building a program that can take an arbitrary list of accessions and
+produce a library of homologs is more complicated (and slower), but
+will allow a smaller database to be searched yet produce results
+similar to those found from a larger database.
+
+>>Mar. 24, 2011 (released as fasta-36.3.4)
+(comp_lib7.c, dropfx.c, dropfz2.c, doinit.c)
+Fix a bug in the new help display; identify and correct various memory
+leaks and references to uninitialized data.
+
+>>Mar. 15, 2011
+(doc/fasta3x.me, fasta3x.tex)
+The ancient, rarely updated, fasta3x.me has been replaced with
+fasta3x.tex, with the goal of producing a more up-to-date, accurate,
+and comprehensive document describing the capabilities of the FASTA
+programs. In addition, fasta36.1 has been updated/corrected.
+
+(make/Makefile.os_x86_64)
+Mac OS X clang 2.0, distributed with Xcode4.0, does not properly
+optimize the smith_waterman_sse2_word() in smith_waterman_sse2.c when
+clang -O is used to compile.
+
+>>Mar. 4, 2011
+(doinit.c)
+Histograms are now turned off by default. -H shows histograms for all
+programs, not just the *_mpi (PCOMPLIB) programs.
+
+>>Feb. 27, 2011
+(make/Makefile36m.common, Makefile.pcom_t, Makefile.pcom_s)
+
+The threaded programs are now the default, and the *_t versions of
+programs have been removed from the Unix and unix-like (Mac OS X)
+distributions. Windows versions can have either threaded or
+non-threaded versions, since the threaded windows programs require an
+additional library. Serial versions of the programs can still be built
+by editing the make/Makefile36m.common file, and using
+include Makefile.pcom_s instead of include Makefile.pcom_t.
+
+The documentation has been edited to reflect these changes.
+
+>>Feb. 24, 2011 (comp_lib5.c, comp_lib7.c, doinit.c, initfa.c,
+structs.h) The FASTA programs have a much more informative help
+system. If the -DSHOW_HELP option is included in the Makefile, the
+following changes occur: (1) the program is no longer interactive by
+default. To get interaction, use the -I option (-I previously meant
+showing the identity alignment in lalign; that option is now available
+with -J). (2) fasta36 and fasta36 -h present a short help message. (3)
+fasta36 -help provides a complete list of options with a more complete
+set of options. The getopt() option strings are now built
+dynamically.
+
+>>Feb. 18-21, 2011
+(doinit.c)
+Fix missing -m 9i percent identity/alignment length. Fix issues with
+short sequence description in -m 6 (html) mode.
+
+>>Feb. 17, 2011
+(comp_lib5.c, comp_lib7.c, doinit.c)
+Implementation of -m BB which provides completely BLAST-like output
+(not just alignments).
+
+Modification of the -b ### option. Previously, -b 100 guaranteed 100
+alignments; now -b 100 limits to 100 alignments if more than 100
+alignments have E()-values less than the -E threshold. An '=' symbol
+before the number reverts to the previous behavior; e.g. -b =100
+guarantees 100 alignments, regardless of E()-value (-b =100 is
+equivalent to -b 100 -E 100000.0, and disables other setting of the
+E()-value threshold).
+
+>>Feb. 10, 2011
+(doinit.c, mshowalign2.c, c_dispn.c)
+The FASTA programs have a new alignment option, "-m B", which shows
+alignments in BLAST format (no context, coordinates on the same line,
+BLAST symbols for matches and mismatches.) This version does not
+change the descriptions of the alignments, which are still FASTA like,
+but the alignments themselves should look just like BLAST alignments.
+Option -m BB makes output even more blast-like, showing not only the
+alignments, but the initial set of high scoring sequences, and other
+initial information, like BLAST+.
+
+>>Feb. 9, 2011 released as fasta-36.3.3
+(dropfs2.c, initfa.c, comp_lib*.c)
+Modify fasts36/fastm36 to allow up to ktup=3 for proteins; ktup=6 for
+DNA (previously the max was ktup=2 for both).
+
+Modify version string to match release version number.
+
+>>Feb. 6, 2011
+(initfa.c)
+Fix bug that prevented fastm36 from working properly with DNA queries.
+
+>>Jan. 31, 2011
+(pcomp_subs2.c, work_thr2.c)
+Fixes to fasty36_mpi/tfastx36_mpi problem. Only fasty needs pascii[]
+for alignments, but it wasn't being sent to workers. Fixed. The MPI
+versions of the programs have now been tested much more thoroughly.
+
+>>Jan. 29, 2011
+(comp_lib5.c, comp_lib6.c, comp_lib7.c, work_thr2.c, initfa.c,
+param.h, dropfs2.c, scaleswt.c, dropfx.c)
+
+Translated DNA shuffles (tfastx36, tfasty36) now shuffle DNA as
+codons. (1) Modify param.h pstruct to include shuffle_dna3,
+initialized in resetp() [initfa.c] (2) modify buf_shuf_work() to use
+ppst->zs_win and ppst->shuffle_dna3. (3) Add ppst->zs_off=0 to
+scaleswt.c/process_hist(). (4) Fix some memory leaks in dropfx.c.
+(5) Fix some other memory leaks in dropfs2.c.
+
+>>Jan. 28, 2011
+(initfa.c, scaleswn.c, mshowalign2.c)
+Address crashes that occurred when novel scoring matrices and gap
+penalties were specified, particularly for DNA. Fix memory problem
+with long (-L) sequence descriptions.
+
+>>Jan. 23, 2011
+(comp_lib7.c)
+comp_lib7.c uses a more efficient strategy for reading chunks of
+sequences that ensures that sequence data is contiguous for *_mpi
+programs. comp_lib7.c replaces comp_lib6.c, which will be removed.
+
+>>Jan. 22, 2011
+(many files)
+Replace "mw.h" with "best_stats.h", a much more informative name.
+
+(drop*.c, p_mw.h, w_mw.h)
+Remove p_mw.h, w_mw.h from code base and update_params() from
+drop*.c. These files are left over from the old p2_complib.c parallel
+programs.
+
+>>Jan. 21, 2011 released as fasta-36.3.2
+(comp_lib5.c, comp_lib6.c, pcomp_subs2.c)
+Fixes for MPI version of programs. Earlier versions did not handle
+DNA/translated DNA comparisons properly, because duplicated sequences
+(forward/reverse strand) were not handled properly. The current code
+produces the correct scores and alignments, but probably is much less
+efficient than it should be.
+
+>>Jan. 11, 2011
+(initfa.c, scaleswn.c)
+Re-enable DNALIB_LC (read lower-case DNA sequences as lower case).
+
+Reset ktup to default after change for short query in multi-query
+searches.
+
+Address multiple issues associated with variable scoring matrices,
+i.e. -s '?BP62'. Introduce pst->pam_name for the actual scoring
+matrix, to distinguish it from pst->pam_file, which can correspond to
+the std_pam->abbrev, for values like BP62 (which encodes both a matrix
+and a specific set of gap penalties). Ensure that the new scoring
+matrix is initialized and extended correctly. Fix some issues with
+scoring matrix names in scaleswn.c
+
+>>Jan. 5, 2011
+(dropnnw2.c, dropgsw2.h, global_sse2.c,h, glocal_sse2.c,h)
+Include SSE2 optimization for global/global and global/local alignments
+provided by Michael Farrar. Global and glocal alignments are now 20X
+faster.
+
+>>Jan. 5, 2011 re-released as fasta-36.3.1
+(initfa.c, last_tat.c)
+Fix bug resetting pst.e_cut_r for DNA sequences. Modify last_tat.c
+code to use pre-loaded sequence if available. Remove last_tat.c
+PCOMPLIB code.
+
+>>Jan. 3, 2011 released as fasta-36.3.1
+(comp_lib5.c, comp_lib6.c)
+Add >>><<<, >>>/// to -m 9,10 output for separating multiple query
+searches. Also clean up extra >>>query line before alignments when no
+alignments are shown.
+
+>>Dec. 16, 2010
+(dropgsw2.c, dropnnw2.c, dropnsw.c, comp_lib5.c, comp_lib6.c)
+Fix bug that caused ssearch to not invert coordinates for
+reverse-complement DNA alignments (I never imagined using ssearch for
+DNA) in dropgsw2.c, dropnnw2.c, and dropnsw.c. Add SEQ_PAD to aa0[1]
+(rev-comp copy) in comp_lib5.c, comp_lib6.c.
+
+>>Dec. 14, 2010
+Modify CIGAR strings for frameshifts, including 1F and 1R for forward
+and reverse frameshifts. Extensive documentation updates.
+doc/fasta36.1 is the most comprehensive and accurate description of
+FASTA options.
+
+>>Dec. 1, 2010
+(drop*.c, comp_lib5.c, comp_lib6.c)
+Correct problems with copying for recursive sub-alignments. Correct
+bug in adler32_crc calculation that suggested a problem with continued
+library sequences that did not exist.
+
+(initfa.c, defs.h)
+Use MAXLIB, rather than MAXLIB+MAXTST for comp_lib6.c, which
+pre-allocates the sequence database. Increase MAXLIB.
+
+>>Nov. 24, 2010
+(drop*.c, drop_func.h)
+Modify drop*.c functions that do recursive sub-alignments to avoid
+modifying the aa1[] sequence array, which conceivably could be in use
+by other threads. do_walign() now has const *aa0 AND const *aa1. To
+prevent modification of aa1, sub-regions of aa1 are now copied into
+newly allocated arrays.
+
+>>Nov. 20, 2010
+(cal_cons.c, mshowbest.c, mshowalign2.c, doinit.c)
+The -m 9C option displays an alignment code in CIGAR format. (-m 9c
+shows the older alignment encoding.)
+
+>>Nov. 16, 2010 (beginning of fasta-36.3.*, verstr 36.07)
+(initfa.c, apam.c, upam.h, param.h)
+
+Provide the ability to adjust the scoring matrix based on the length
+of the query sequence for alignments using a protein alphabet (this
+could certainly be extended to DNA as well). By including a '?'
+before the scoring matrix, e.g. -s '?BP62', a shallower matrix will be
+chosen if the entropy of the selected matrix (i.e. bit score per
+aligned position) times the length of the protein query is
+<=DEF_MIN_BITS (defs.h; currently 40 -- this value should be set
+based on the library size). The FASTA programs include BLOSUM50 (0.49
+bits/pos) and BLOSUM62 (0.58 bits/pos) but can range to MD10 (3.44
+bits/position). The variable scoring matrix option searches down the
+list of scoring matrices to find one with information content high
+enough to produce a 40 bit alignment score. This option is included
+primarily for metagenomics scans, which can include relatively short
+DNA reads, and correspondingly short protein translations.
+
+Also correct the short-query modification to ktup, so that it works
+properly with translated FASTX/FASTY searches (ktup is set to 1 when
+the query_length/3 <= 20).
+
+(dropnfa.c, dropfx.c, dropfz2.c)
+Shuffled sequence alignment scores are calculated identically to
+library alignment scores. Previously, optimized scores were calculated
+for all shuffled sequences for FASTA type alignments, even though
+typically 20 - 40% of library sequences were optimized. Now the two
+sampling strategies are consistent, though this may cause problems
+when only a small fraction of sequences are optimized.
+
+Small changes to provide consistent dropnfa.c, dropfx.c, dropfz2.c
+parameter display, and fix display with -m 10.
+
+>>Nov. 15, 2010
+(initfa.c)
+Enable statistical thresholds by default (previously, they were
+enabled with -c -1 or -c 0.01 or anything < 1.0). The "classical"
+join/opt threshold behavior can be restored with -c O (upper case
+letter O), or by providing an optimization threshold >
+1.0. Statistical thresholds dramatically speed up searches (typically
+2-fold), and provide more accurate statistical estimates. The old
+join/optimization thresholds were optimized for BLOSUM50, and other
+1/3-bit scaled scoring matrices, and did not work well with BLOSUM62.
+Statistical thresholds have been tested extensively, particularly with
+-z 21, and produce much more reliable statistical estimates.
+
+>>Oct. 14, 2010
+(Makefile.fcom, cal_cons.c)
+Edits to re-enable compilation and successful execution of
+tfasta36(_t). tfasta36 has been superseded by tfastx36(_t), which is
+faster, and treats frameshifts as a different type of gap.
+
+>>Oct. 13, 2010
+(mshowbest.c)
+Make it more difficult to request more description/scores than are
+available.
+
+>>Sep. 30, 2010 (released as fasta-36.2.7)
+(comp_lib5.c, comp_lib6.c, dropnfa.c, dropfx.c, dropfz2.c)
+Fix bugs in DEBUG versions with adler32_crc calculations on
+overlapping sequences. Add more informative error messages when
+debugging. Fix a problem with hist2.hist_a != NULL with some
+compilers. Fix formats for some debugging error messages in dropnfa.c,
+dropfx.c, and dropfz2.c.
+
+Also fix repeat_threshold calculation for very short sequences, to
+guarantee that all matches as good as the best match with the sequence
+are found. Fix some problems that prevented FASTA from finding short
+repeats with short queries.
+
+This version of the FASTA36 package offers an alternate main program
+file, comp_lib6.c, which reads the entire database into memory before
+doing the search. Using comp_lib6.c can dramatically speed up
+searches with multiple queries (there is no advantage with single
+query sequences) on large multi-core computers, as each search is done
+without re-reading the database. On a 48-core processor, we see
+speedups greater than 40X with ssearch36_t and fastx36_t. To enable
+comp_lib6.c, edit the make/Makefile36m.common file to comment out
+lines referring to comp_lib5.c and un-comment lines referring to
+comp_lib6.c.
+
+>>Sep. 29, 2010
+(comp_lib5.c, comp_lib6.c, mshowbest.c)
+Added -m 8C option, which mimics BLAST+ tabular with comment lines
+format.
+
+>>Sep. 17, 2010
+(dropfx.c)
+
+Fix a bug in dropfx.c/do_walign() that modified library sequences.
+(This only caused a problem with comp_lib6.c, which reads the entire
+database into memory and re-uses sequence buffers.) Check sequence
+consistency with adler32 CRC calculation.
+
+>>Sep. 15, 2010
+(mshowbest.c, mshowalign2.c)
+Change the output format slightly. E2() expect values (-z 21+) no
+longer contain the library size (which is always the same as the
+E(library_size) value), and the -m 9 +- line no longer contains the
+frame information, since it is redundant. (The redundant rev-comp
+remains on the >-- HSP lines.)
+
+>>Sep. 14, 2010
+(comp_lib5.c, mshowbest.c, drop*.c, cal_cons[f].c, etc.)
+Implement BLAST -m 8 tabular output.
+
+>>Sep. 9, 2010
+
+(compacc.c) Fix a bug in pre_load_best() that disabled
+-L long sequence descriptions.
+
+(doinit.c) Fix a bug that prevented non-overlapping alignments from
+being displayed when the -E threshold was changed. Before -E 0.001
+would disable additional alignments. Now, -E "0.001 0" is required to
+disable the additional alignments.
+
+(drop*.c) The display of search parameters has changed to ensure that
+gap penalties are displayed on the same line as the scoring
+matrix. Previously, the FASTA "Parameters:" section looked like:
+
+Parameters: BL50 matrix (15:-5)xS ktup: 2
+ join: 42 (0.0944), opt: 30 (0.601), open/ext: -10/-2, width: 16
+ Scan time: 0.450
+
+With fasta-36.2.7 (and later), the Parameters: section is:
+
+Parameters: BL50 matrix (15:-5), open/ext: -10/-2
+ ktup: 2, join: 42 (0.102), opt: 30 (0.574), width: 16
+
+The [T]FAST[X/Y] Parameters: section includes the frameshift/substitution penalties (tfasty36):
+
+Parameters: BL50 matrix (15:-5) open/ext: -12/ -2 shift: -20, subs: -24
+ ktup: 2, E-join: 0.5 (0.224), E-opt: 0.1 (0.0536), width: 16
+
+>>Aug. 3, 2010 (released as fasta-36.2.6)
+(scaleswn.c)
+
+Modifications to calc_thresh(), proc_hist_ml(), to better accommodate
+search strategies (fast?? with statistical thresholds) that provide
+complete scores only for a high-scoring fraction of sequences. For
+some query sequences, the E()-values from the database were sometimes
+much "worse" than E2()-values, an observation that is
+counter-intuitive (if parameters are estimated against shuffled
+related sequences, the E()-values should get worse, not better). For
+some queries, the result was very dramatic (E() < 1E-80, E2() <
+1E-150). This error appears to occur because the z-trim or mle_cen
+thresholds are including many related sequences. -z 2 was modified to
+censor more sequences when only a subset are scored, and -z 1 was
+modified to adjust z-trim more carefully. As a result, z-trim was
+reduced, excluding more sequences. If too many sequences are excluded,
+then regression statistics do not work, and the program fails over to
+Altschul-Gish statistics.
+
+-z 21+ modified so that MLE statistics are used for shuffle E2()
+values if Altschul-Gish statistics are used for the library
+E()-values.
+
+>>July 30, 2010
+(comp_lib5.c, pcomp_subs2.c)
+
+Fix bug in buf_align_seq() that allowed buffer over-runs with long DNA
+sequences with MPI. Checks on buffer over-runs are now included in
+pcomp_subs2.c/put_rbuf(),get_wbuf(). Aug. 1, 2010, fixed similar bug
+in buf_shuf_seq(). -z 21 now works with long DNA sequences.
+
+>>July 28, 2010
+(mshowalign2.c)
+Fix lalign36/showalign() to show best sub-optimal E()-value, not
+bptr[0] E()-value (often identical).
+
+>>July 19, 2010 (released as fasta-36.2.5)
+(wm_align.c, dropfx.c,dropfz2.c)
+Fix some off-by-one boundary calculations to ensure that every query
+that can fit into a library is aligned correctly.
+
+>>May 18, 2010
+Implement comp_lib5.c, which simplifies the structure of
+comp_lib4.c by moving some calculations into functions.
+
+>>May 10, 2010
+Fix problem setting nshow with small library in interactive mode.
+
+>>May 5, 2010 fasta-36.2.3
+Fix bug that prevented shuffled scores from being used properly for small
+databases (prss capability was lost).
+
+>>May 2, 2010 fasta-36.2.2
+Fix problem with tat_score values from fasts and fastm. fasta35 did
+not re-calculate the z-score after last_stats(). fasta36 does, so it
+must ensure that the e-value (sometimes p-value) is used correctly.
+
+>>Apr. 29, 2010
+More extensive testing of the MPI-PCOMPLIB programs revealed some
+problems sending sequences when two (or more) frames for the same sequence
+were used. This problem has been addressed, and large scale testing of
+fastx36_mpi (with 100K sequence queries in a run) works.
+
+>>Apr. 16,19, 2010
+(pcomp_subs2.c, comp_lib4.c, work_thr2.c)
+The MPI-PCOMPLIB parallel version of the FASTA36 programs is
+working. This PCOMPLIB version takes a very different approach from
+the older PVM/MPI parallel programs (p2_complib2.c/p2_workcomp2.c) -
+it works virtually identically to the threaded programs (sharing the
+same work_thr2.c code and get_rbuf/put_rbuf() (manager) and
+get_wbuf/put_wbuf() (worker/thread) functions). As a result, in this
+initial version, the database is NOT distributed to the nodes. During
+multiple searches, the library is re-read each time. However, load is
+distributed to workers exactly the way it would be for the threaded
+system, so the workload should scale.
+
+To distinguish them from the earlier mp35compsw, mp35compfa, etc, the
+new versions are ssearch36_mpi, fasta36_mpi, etc.
+
+The programs work with multiple queries, and producing multiple
+sub-alignments, and work with -m 9c encodings.
+
+>>Apr. 7, 2010
+(various Makefiles, comp_lib4.c, pcomp_subs2.c, thr_bufs2.h,
+thr_buf_structs.h)
+
+The MPI version of the threaded programs, ssearch36_mp, now compiles.
+pcomp_subs2.c replaces pthr_subs2.c, and thr_bufs.h ->
+thr_buf_structs.h, thr.h -> thr_bufs2.h, and pcomp_bufs2.h has been
+added as the equivalent of thr_bufs2.h for PCOMPLIB.
+
+>>Apr. 2, 2010
+(comp_lib4.c, work_thr2.c, compacc.c)
+Implement init_aa0(), which isolates code that calls init_work and
+sets up aa0s, aa1s, f_str[1] (reverse complement) and qf_str so that
+the same code is used by the serial, threaded, and (future) PCOMP
+versions.
+
+(work_thr2.c)
+work_thr2.c now contains code for either threaded or PCOMPLIB
+processes. Threaded processes get stuff from work_info; PCOMPLIB
+processes get the same information via messages sent from init_thr()
+called by main().
+
+>>Mar. 30, 2010
+(comp_lib4.c, work_thr2.c, thr_bufs.c +pcomp_subs2.c)
+
+The data buffers used to communicate between workers and threads
+have been restructured to separate the old buf2_str, which contained
+sequence, score results, and alignment results, into three buffers,
+buf2_data_s, buf2_res_s, and buf2_ares_s, separating sequence data
+from scores and alignments. This was done to simplify communication
+in the MPI/PVM environment. Workers should be able to return results
+directly into the appropriate buffer.
+
+>>Mar. 25, 2010 fasta-36.2.1
+
+(dropfx.c, dropfz2.c)
+Found/removed two "static" declarations in small_global that caused problems
+with [t]fastx/y with threaded alignments.
+
+>>Mar. 24, 2010 (now version 36.06 with threaded alignments)
+(dropnfa.c)
+The DNA band aligner in dropnfa.c was not thread safe. This has been
+fixed.
+
+>>Mar. 23, 2010
+Code for pre-loading/threaded-aligning sequences has been
+significantly cleaned up. Checks are made before RANLIB() and
+re_getlib() in showbest() and showalign() that should be consistent
+with annotations AND functions that cannot encode alignments.
+
+Add mshowalign2.c (which does not do PCOMPLIB) to provide threaded
+alignments. build_ares_code() and buf_do_align() modified to ignore
+MX_M9SUMM so that alignments are produced whenever demanded (still
+does not do alignment if a_res is available).
+
+>>Mar. 22, 2010
+(comp_lib4.c, work_thr2.c, thr_bufs.h)
+
+comp_lib4.c has been modified to thread the alignment encoding
+(build_ares) for -m 9c. If m_msg.quiet and alignments are required for
+showbest(), then the program identifies the number of alignments
+required, reads the sequences (and annotations) into a buffer, and
+sends them to the threads to be encoded. Then, when showbest() is
+called, bbp->have_ares has been set, and the alignments are not
+re-calculated. This should be extended to thread actual alignment
+production, and additional work is required to clean-up the sequence
+and bline(description) buffers before a second search.
+
+>>Mar. 17, 2010
+(comp_lib4.c, dropnfa,fx,fz2.c)
+Modifications to provide more sensible E2() statistical estimates with
+threshold-heuristic comparison functions and -z 21. Also fixed bug
+that caused the wrong zs_off to be used with -z 21. dropnfa,fx,fz2.c
+now optimize all scores when shuff_flg is set.
+
+>>Mar. 16, 2010
+(comp_lib4.c, scaleswn.c, drop*.c)
+
+A new, relatively consistent, statistical estimation strategy has been
+introduced for the heuristic programs that optimize only a fraction of
+scores (fasta36, [t]fast[xy]36). Statistics-based heuristic
+thresholds can increase search speed 2 - 4-fold by doing band
+optimization on only a small fraction of library sequences (with the
+-c -1 option, about 10% of alignments are band-optimized, compared
+with more than 50% with the classic thresholds). However, optimizing
+only a small part of the library produces two classes of scores,
+optimized (10% or less) and non-optimized, with different statistical
+properties. fasta36 addresses this problem by calculating statistical
+estimates only for the optimized scores, and then correcting the
+significance of the score by accounting for the frequency of
+optimization. For example, sampling only 5% of scores increases the
+z-value (std. deviation above the mean) by -logE(0.05)*sqrt(6)/Pi =
+2.34 which offsets the z-score by 23.4. This effect is only seen when
+the -c option is used to specify statistical thresholds, and is most
+apparent when looking at the histogram, which will be offset by the
+appropriate z-score.
+
+This strategy appears to produce more accurate statistics in general,
+but can produce less accurate statistics for the heuristic programs when
+the -z 21 option is used.
+
+>>Mar. 3, 2010
+
+(comp_lib4.c)
+Fix the new stats[] sampling strategy to sample >60K sequences more
+uniformly. The old code massively over-sampled later sequences,
+because of several bugs. The new code works as expected. The first
+60K sequences are represented about 30% more than the rest, but after
+60K, sequences are sampled moderately uniformly. The older
+SAMP_STATS_MORE is uniform across all the scores.
+
+(build_ares.c)
+Move code to produce chains of alignments (a_res) produced by
+do_walign, followed by subsequent calls to calc_id, calc_code, into a
+new function, build_ares_code(), which is shared by the
+serial/threaded and parallel (p2_workcomp.c) programs. This is a
+first step towards having the parallel programs produce multiple HSP
+alignments.
+
+>>Feb. 27, 2010
+
+(lib_sel.c)
+Fix problem with new chained library access that prevented more than
+two files from being searched. Also, library name string has been
+lengthened to allow a list of libraries to be displayed.
+
+>>Feb. 26, 2010
+
+Parallel programs have been tested in both PVM and MPI versions, and
+some additional bugs have been fixed. Currently, the PVM/MPI versions
+are fully functional, but only with FASTA35 capabilities. The new
+multiple HSP alignments and best-shuffle E2() scores are not yet
+available.
+
+>>Feb. 24, 2010
+
+Fix some leaks, largely due to more complex alignment data structures
+for multiple alignments. Currently, all the major leaks are in data
+structures allocated in main(), and which I don't bother to
+de-allocate (mostly library buffer memory).
+
+Change zsflag > 10 to zsflag >= 10 && zsflag < 20 in three places.
+Too many shuffles were being done with zsflag==21.
+
+>>Feb. 22, 2010
+
+Begin conversion of p2_complib2.c/p2_workcomp.c. Very old code to
+allocate aln_d_base removed from v35 and v36. No code for best list
+shuffle, or multiple high-scoring alignments. However, the code now
+works properly with statistical thresholds. (Changes made to
+p2_complib2.c, p2_workcomp.c to update pst struct after last_param.()).
+
+>>Feb. 19, 2010 fasta-36x6
+
+Fix issues with -z 26 statistics. Add description of E2() statistics.
+
+Added option to specify statistics routine for best-shuffled
+statistics independently of library statistics by specifying a second
+-z option. Thus, -z "21 2" uses regression scaled statistics for the
+library estimate, and MLE statistics for the best-shuffled estimates.
+
+>>Feb. 17, 2010 fasta-36x5
+
+Some of the simplifications dealing with threads in comp_lib4.c failed
+on some compilers and architectures. The code for terminating threads
+has been modified to allow sequence buffers with zero entries, to
+simplify the empty_buffer logic. There is now an explicit option to
+terminate threads by setting lib_bhead_p->stop_thread. However, this
+flag is never set, as rbuf_done() stops the threads instead.
+
+Also fix problem with stats_idx being associated with wrong buf2_p in
+two frame searches.
+
+>>Feb. 15, 2010 fasta-36x4
+
+fasta36 can now calculate and display both "search" (E()) and
+"shuffled" (E2()) E()-values in the best scores and
+alignments. If the -z option is greater than 20, then two evalues are
+calculated, one from the search (e.g. -z 1 uses regression scaled
+scores) and a second derived from shuffling the high scoring
+sequences. The high-scoring sequence shuffled scores are
+approximately equivalent to doing a PRSS (pairwise shuffle), but more
+efficient. High-scoring shuffled E()-values (labeled E2()) are
+typically 2 - 5-fold more conservative for average composition
+proteins, and 10 - 20X more conservative for biased composition
+proteins.
+
+Fix another bug in -S alignment scores vs opt scores in ssearch36 (see
+Feb. 8).
+
+>>February 12, 2010
+(prev. version 142)
+
+Create comp_lib4.c (from comp_lib3.c), which simplifies some of the
+processes for handling buffers of results (no more empty_reader_bufs)
+and enables shuffles of high-scoring sequences to evaluate significance.
+
+>>February 8, 2010
+
+Fix a problem with scores and E()-values for SSEARCH sub-alignments
+when the -S option is used. When the -S option was used to ignore
+lower-case residues in query or library for the initial score, the
+final alignments include the lower-case masked residues. The
+SSEARCH36 was using the non-masked alignment score, rather than the
+original score (FASTA36, and [T]FAST[XY]36 used the masked score).
+This was incorrect, as the statistics are calculated for masked
+sequences. The corrected version calculates both a non-masked and a
+masked score, where the masked score (for subalignments) uses the
+non-masked alignment.
+
+[T]FAST[XY]36 had a related problem, which is that when multiple
+sequences are in the query with the same pam2p[0] (no -S) score, then
+the wrong alignment could be shown with the initial scores. Fixing
+this requires that the alignment routine only work on the region
+specified from the initial band (fixed in dropnfa.c, dropfx.c, and
+dropfz2.c).
+
+>>February 4, 2010
+
+The more efficient statistical thresholds in fasta36 have been
+disabled by default. They can be turned on with -c -1, or by setting
+thresholds (-c "0.05 0.2" would set E_band_opt to 0.05 - target 5% of
+sequences - and E_join at 20% target).
+
+My initial implementation produced very inaccurate statistics,
+presumably because only a small fraction of unrelated sequences were
+being band-optimized (fasta35 typically optimized about 60% of library
+sequences, fasta36 with statistical thresholds optimizes about 2%,
+which causes a 2 - 3X speed increase). The sampling strategy for
+fasta36, and [t]fast[xy]36 scores has been adjusted to provide
+relatively accurate scores for searches that optimize only a small
+fraction of sequences. On the cases I have tested, statistical
+accuracy is comparable to, or better than, the version 35 programs,
+but probably not as robust as ssearch estimates.
+
+>>January 29, 2010
+
+The logic to predetermine where scores went for shuffling breaks when
+some scores are not calculated (e.g. -M 200 - 300). Fix by using
+nstats as the index for nstats < MAX_STATS, and then use stats_idx
+afterwards.
+
+Provide more efficient score sampling logic. The old method (left
+over from fasta34 or earlier) generated a random number for every
+sequence after MAX_STATS; if it was less than MAX_STATS, the sample
+was used. This logic is still available with -DSAMP_STATS_MORE. The
+new logic samples every other sequence between MAX_STATS and
+2*MAX_STATS, every third between 2*MAX_STATS and 3*MAX_STATS, etc, and
+randomly replaces one of the stats scores. For 430K SwissProt, this
+reduces the number of samples from 178K to about 145K, and reduces the
+number of calls to the random number generator from 430K to 85K.
+
+>>January 28, 2010
+
+(comp_lib3.c, mrandom.c) Tests of ssearch36 statistical accuracy
+suggests that the default statistical estimates (-z 1) are not as
+accurate as they should be with BLOSUM62, -11/-1. Both -z 11 and -z 2
+work better. In FASTA35, -z 11 - 15 caused a 2X-slowdown (actually
+more) because EVERY library sequence was shuffled, even though only a
+fraction of the sequences (for libraries > 60,000) would be used for
+the statistical calculation. comp_lib3.c uses a more sophisticated
+strategy for sampling scores after 60,000 so that sequences are only
+shuffled and aligned if they will be used in the statistical
+calculation. Doing this on SwissProt, with 430,000 sequences, means
+that ~180,000 additional shuffle alignments are done, not 430,000
+additional.
+
+However, using -z 11 with the threaded program was much more than
+2X-slower -- random() is not re-entrant, and is designed to provide a
+consistent set of random numbers over threads, so threads were waiting
+on the random number generator, with a big performance penalty. Using
+code from WikiPedia, I implemented a random number generator
+(mrandom.c) that saves a local copy of state, so threaded -z 11 has
+the correct performance penalty.
+
+>>January 25, 2010 (initfa.c 36.04 January 2010)
+
+(dropfz2.c, aln_struct.h) At long last, tfasty36 correctly produces
+multiple alignments on the reverse strand. (Jan. 26, 2010) Fixed
+introduced bug in fasty36 that used wrong offset in recursion.
+
+>>January 17, 2010
+
+Extensive changes have been made to all the drop_* functions, so that
+multiple alignment results are properly sorted from highest to lowest
+sw_score. dropnfa.c, dropgsw2.c, dropfx.c and dropfz2.c now all use
+similar strategies to calculate non-overlapping alternative alignments.
+score_thresh thresholds are applied to rst.score[ppst->score_ix]
+appropriately for all recursive functions.
+
+>>August 24, 2009
+
+Statistical thresholds have been adjusted to produce more
+approximately the correct number of joins/band optimizations. The
+approximate fraction of joins/band optimizations is now shown in the
+results.
+
+>>August 21, 2009
+
+fasta/fastx/fasty/tfastx/tfasty now use statistically based thresholds
+for joining short segments and deciding to do a band optimization --
+similar to the threshold strategy used by BLAST.
+
+The statistical thresholds used are set with the
+-c option, which used to be used to set optcut. The -c option now has three ranges:
+
+-c < 0 -- use the old FASTA thresholds, calculated in the same way
+0 < -c < 1.0 -- use the statistical thresholds and set E_opt_cut.
+-c >= 1.0 -- use the old FASTA threshold, and specify it.
+
+For 0 < -c < 1.0, a second argument can be supplied (-c "0.02 0.1")
+for the joining E()-threshold. If this value is < 1.0, it is used as
+E_join; if it is > 1.0, E_opt_cut is multiplied by the value to get
+E_join.
+
+>>August 19, 2009
+
+Implement Lambda/K/H based c_gap, opt_cut in dropnfa.c, dropfx.c
+(fastx), and dropfz2.c (fasty). Add ELK_to_s() to scaleswn.c.
+
+>>August 11, 2009
+
+Fix bug in dropfx.c that used the wrong variables for calculating
+offsets into a long DNA sequence for subset alignments.
+
+Stop putting sw_score in score[0] when no score[0] was calculated.
+Use 0 instead.
+
+>>July 31, 2009
+
+(dropgsw2.c) Fix problems with dropgsw2.c that allowed poor
+sub-alignments to be shown. Consolidate merge_ares_acc() for all the
+functions. Add pst.do_rep to disable multiple alignments.
+
+>>July 6, 2009
+
+(initfa.c, apam.c, complib2.c, p2_complib.c) move changes for
+validate_novel_aa() from fasta35.
+
+(initfa.c) Enable checks for unusual characters ('Uu' in proteins) for
+many more programs with the -p option.
+
+>>June 16, 2009
+
+Modify statistical sampling strategy to greatly simplify the
+calculation.
+
+>>May 15, 2009
+
+Fix bug in lav2ps.c, lav2svg.c that occurred when displaying very long
+sequence alignments (e.g. genome alignments). The maximum coordinate
+is set properly now.
+
+>>May 5, 2009
+
+(initfa.c) Fix bug (int e_cut in pgm_def_arr[]) that prevented e_cut
+from being set properly for lalign for DNA.
+
+>>May 4, 2009
+
+The functions that return multiple sub-alignments (HSPs) after the
+best alignment have been modified to ensure that alignments are
+returned sorted by score, by merging the list of alignments found to
+the left and right of the best alignment.
+
+>>April 28, 2009
+
+(p2_complib2.c, p2_workcomp2.c, mshowbest.c, mshowalign.c) modified to
+support new coordinate system, preliminary work on multiple HSPs in
+parallel environment.
+
+>>April 14, 2009
+
+(comp_lib2.c, nmgetaa.c) Comprehensive restructuring of library file
+list from a fixed length array to a variable length linked list. The
+linked list allows library files to insert additional files into the
+list, so that, for example, a file of accession numbers can refer to a
+list of files for the accessions.
+
+Eventually, this should allow FASTA to support .pal/.nal files from
+the NCBI, and to support files of file names most places file names
+are allowed.
+
+>>April 2, 2009 (from fasta35)
+
+(structs.h, comp_lib2.c, doinit.c, mshowbest.c, mshowalign.c) The code
+that selects the number of high scores to display has been reorganized
+to support the -F e_low option (which was not implemented properly if
+-b and -d were specified). The code is simplified; m_msg.nshow is
+used to specify the number of best scores listed, and min(m_msg.nshow,
+m_msg.ashow) is used to specify the number of alignments shown.
+
+>>March 26, 2009 (from fasta35 - fa35_04_07)
+
+(initfa.c) Fix problems with 'U' recognition in DNA pam matrix,
+correct implementation of -r +mat/-mis. Previous versions of fasta35
+may not have used the correct DNA matrix when the -r +mat/-mis option
+was specified.
+
+>>March 23, 2009 (initfa.c verstr -> 36.02)
+
+(mshowbest.c, aln_structs.h) Add loop for displaying multiple aligned
+regions with -m 9, -m 9i, and -m 9c in mshowbest.c.
+
+>>March 22, 2009
+
+(dropgsw2.c, dropnnw2.c, wm_align.c) Rearrange code in dropgsw2.c,
+dropnnw2.c (which replaces dropnnw.c) so that a single function,
+wm_align.c:nsw_malign() is responsible for recursive alignments for
+both dropgsw2.c (sw_walign) and dropnnw2.c (nw_walign). The strategy
+for these (Smith-Waterman, Global-Local) alignments is
+identical. nsw_malign() uses a function pointer that calculates S-W or
+N-W that it gets from dropgsw2.c or dropnnw2.c
+
+It might make sense to use a similar strategy for the recursive
+translated alignments.
+
+>>March 19, 2009
+
+(map_db.c, mm_file.h) Fix another bug in map_db.c that appears for
+sequence files larger than 2Gb. MM_OFF is now consistently used in
+more of the places where an int64_t might be required.
+
+>>March 17, 2009
+
+(list_db.c) Fix a bug in list_db that caused it to misread the maximum
+sequence length, and then be off by 4-bytes for all the offsets.
+Include list_db with map_db in the list of auxiliary programs.
+
+>>Mar. 8, 2009 fa35_04_06
+
+(comp_lib2.c, pthr_subs2.c, pthr_subs.h, doinit.c, dec_pthr_subs.c)
+Dynamically allocate pthread_t *fa_threads, rather than limit it to
+MAX_WORKERS. MAX_WORKERS is no longer used in the Unix environment;
+it gets its value from sysconf(_SC_NPROCESSORS_CONF). If sysconf() is
+not available, MAX_WORKERS is used. The threaded programs should now
+automatically adjust the number of threads to the number of
+processors. Moreover, the number of threads can be set to more than
+the number of processors with -T #threads. Also, max_workers was
+renamed fa_max_workers, and pthread_t *threads is now *fa_threads.
+
+>>Mar. 6, 2009
+
+copied comp_lib2.c from v35 (fix for query offset coordinates)
+
+>>Oct. 22, 2008
+
+The programs that allow multiple alignments to be found include:
+
+ ssearch36(_t)
+ fasta36(_t)
+ fastx36(_t)
+ fasty36(_t)
+
+fasts and fastf will probably not be updated in this way, because of
+the difficulty in reconstructing alignments, but fastm may be.
+
+Right now, the pvm/mpi versions of the programs do not support
+multiple sub-alignments.
+
+>>Sep. 25, 2008
+
+Modify the syntax for the -E option to allow the repeat E()-value
+cutoff to be specified in either of two ways.
+
+ -E "e_cut e_rep"
+
+If the value of e_rep is less than one, it is taken as the absolute
+E()-value threshold for additional local domains, for example:
+
+ -E "1.0 0.05" says use 1.0 for the main E()-value threshold,
+ and 0.05 as the threshold for additional local alignments.
+
+Alternatively, if e_rep >= 1.0, it is taken as a divisor for the
+E()-value threshold, thus:
+
+ -E "1.0 10.0"
+
+Sets the E()-value threshold for additional local alignments to
+1.0/10.0 = 0.1.
+
+Finally, if e_rep <= 0.0, no multiple alignments are done (equivalent
+to previous versions of FASTA).
diff --git a/doc/readme.w32 b/doc/readme.w32
new file mode 100644
index 0000000..e98ace7
--- /dev/null
+++ b/doc/readme.w32
@@ -0,0 +1,67 @@
+September 7, 2015
+
+On windows machines, the threaded programs are now capable of
+automatically detecting the number of threads. The pthreadVC2.dll is
+still required, but it is now included in the program directory
+(bin/).
+
+October 6, 2006, updated September 7, 2015
+
+The FASTA programs for Windows32 environments (Windows7 and later)
+have undergone a major upgrade, so that now all the programs in the
+Unix/MacOSX distribution are available to Windows users. Moreover,
+Windows users with modern (SSE2 compatible) processors can run greatly
+accelerated versions of the Smith-Waterman ssearch program.
+
+Moreover, these programs work both with FASTA formatted files, and
+NCBI BLAST formatted files.
+
+The following programs are available:
+
+ fasta36.exe protein-protein or DNA-DNA database searches
+ fastf36.exe
+ fastm36.exe
+ fasts36.exe
+ fastx36.exe compare DNA query to protein library with frameshifts
+ fasty36.exe compare DNA query to protein library with frameshifts
+ ssearch36.exe Smith-Waterman for prot-prot or DNA-DNA searches,
+ accelerated with SSE2 extensions
+ tfastf36.exe
+ tfastm36.exe
+ tfasts36.exe
+ tfastx36.exe compare protein to DNA library with frameshifts
+ tfasty36.exe compare protein to DNA library with frameshifts
+
+Each of these programs also has a "threaded" version, which can run on
+multiple processors (or multiple cores) if they are available. However,
+they are built using the Unix pthreads API, so to use these programs,
+you must download the pthreadVC2.dll from:
+
+ftp://sources.redhat.com/pub/pthreads-win32/dll-latest/lib/pthreadVC2.dll
+
+see also http://sourceware.org/pthreads-win32/
+
+ fasta36_t.exe
+ fastf36_t.exe
+ fastm36_t.exe
+ fasts36_t.exe
+ fastx36_t.exe
+ fasty36_t.exe
+ ssearch36_t.exe
+ tfastf36_t.exe
+ tfasts36_t.exe
+ tfastx36_t.exe
+ tfasty36_t.exe
+
+Without that DLL, the threaded programs will not run at all. The
+current compilation supports two threads, and speeds up searches about
+2-fold on dual-core processors.
+
+The programs have been tested with protein and DNA databases in FASTA
+format, PIR/GCG-text format, and Genbank flatfile format. The program
+does not work properly with GCG binary format databases, but it seems
+unlikely that Windows users would need these.
+
+Please report bugs to:
+
+ wrp at virginia.edu
diff --git a/make/Makefile b/make/Makefile
new file mode 100644
index 0000000..940b482
--- /dev/null
+++ b/make/Makefile
@@ -0,0 +1,53 @@
+#
+# makefile for fasta3, fasta3_t. Use makefile.pvm for pvcompxx.
+#
+#
+# Dec 8, 2005 - with gcc4.0.2 (or .1) under Redhat Linux Fedora FC4 -O3 breaks the alignment code
+#
+
+CC= gcc -g -O2
+
+#CC=gcc -Wall -pedantic -ansi -g -O
+#CC = gcc -g -DDEBUG
+#CC= /usr/local/parasoft/bin.linux2/insure -g -DDEBUG
+
+# EBI uses the following with pgcc, -O3 does not work:
+# CC= pgcc -O2 -pipe -mcpu=pentiumpro -march=pentiumpro -fomit-frame-pointer
+
+# this file works for x86 LINUX
+
+# standard options (the commented lines that follow add the mySQL include path and -DMYSQL_DB)
+CFLAGS= -DSHOWSIM -DUNIX -DTIMES -DHZ=100 -DSFCHAR="':'" -DMAX_WORKERS=2 -DTHR_EXIT=pthread_exit -DPROGRESS -DUSE_MMAP -D_REENTRANT -DHAS_INTTYPES -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -DUSE_FSEEKO -DSAMP_STATS -DPGM_DOC
+# -I/usr/local/include/mysql -DMYSQL_DB
+#
+#(for mySQL databases) (also requires change to Makefile34.common)
+
+LIB_M = -lm
+#LIB_M = -L/usr/local/lib/mysql -lmysqlclient -lm
+# for mySQL databases
+
+HFLAGS= -o
+NFLAGS= -o
+
+# for Linux: threading via pthr_subs2 and -lpthread
+THR_SUBS = pthr_subs2
+THR_LIBS = -lpthread
+THR_CC =
+
+XDIR = /seqprg/bin
+
+DROPGSW_NA_O = dropgsw2.o wm_align.o calcons_sw.o
+DROPGSW_SSE_O = dropgsw2_sse.o smith_waterman_sse2.o wm_align.o calcons_sw.o
+DROPGSW_ALT_O = dropgsw2_alt.o smith_waterman_altivec.o wm_align.o calcons_sw.o
+DROPGSW_O = $(DROPGSW_SSE_O)
+
+DROPLAL_NA_O = droplal2.o lsim4.o calcons_la.o
+DROPLAL_SSE_O = droplal2_sse.o smith_waterman_sse2.o lsim4.o calcons_la.o
+DROPLAL_ALT_O = droplal2_sse.o smith_waterman_altivec.o lsim4.o calcons_la.o
+DROPLAL_O = $(DROPLAL_SSE_O)
+
+# renamed (fasta36) programs: build rules come from the included common makefile
+include ../make/Makefile36m.common
+# conventional (fasta3) names
+# include ../make/Makefile.common
+
diff --git a/make/Makefile.NetBSD b/make/Makefile.NetBSD
new file mode 100644
index 0000000..a9c08d5
--- /dev/null
+++ b/make/Makefile.NetBSD
@@ -0,0 +1,40 @@
+#
+# this file works for NetBSD
+#
+# provided by Marc Baudoin <babafou at babafou.eu.org>
+#
+
+CC= cc -O
+#CC= cc -g -DDEBUG
+#CC= gcc -g -Wall
+#
+# standard line for normal searching
+CFLAGS= -DM10_CONS -DUNIX -DTIMES -DHZ=60 -DMAX_WORKERS=4 -DTHR_EXIT=pthread_exit -DFASTA_HOST='"your.host.here/fasta/cgi"' -DUSE_MMAP
+
+# special options for SUPERFAMILIES
+#CFLAGS= -DM10_CONS -DUNIX -DTIMES -DHZ=60 -DSFCHAR="'|'" -DMAX_WORKERS=4 -DTHR_EXIT=pthread_exit -DPROGRESS -DSUPERFAMNUM -DUSE_MMAP
+
+LIB_M= -lm
+HFLAGS= -o
+NFLAGS= -o
+
+# for NetBSD: pthreads headers and library are taken from /usr/pkg/pthreads
+THR_SUBS = pthr_subs2
+THR_LIBS = -L/usr/pkg/pthreads/lib -lpthread
+THR_CC = -I/usr/pkg/pthreads/include
+
+BIN = ../bin
+XDIR = /seqprg/slib/bin
+
+# set up files for SSE2/Altivec acceleration
+#
+include ../make/Makefile.sse_alt
+
+# no acceleration: after the include, select the _NA object lists
+#
+DROPGSW_O = $(DROPGSW_NA_O)
+DROPLAL_O = $(DROPLAL_NA_O)
+DROPGNW_O = $(DROPGNW_NA_O)
+DROPLNW_O = $(DROPLNW_NA_O)
+
+include ../make/Makefile36m.common
diff --git a/make/Makefile.cray_pvp b/make/Makefile.cray_pvp
new file mode 100644
index 0000000..0b8f185
--- /dev/null
+++ b/make/Makefile.cray_pvp
@@ -0,0 +1,41 @@
+#
+# makefile for fasta35
+#
+# for more information on FASTA on CRAY's, see:
+#
+# http://home.cray.com/~cpsosa/ChemApps/BioInf/fasta/fasta.html
+# provided by: Carlos P. Sosa, cpsosa at cray.com
+#
+
+CC= cc -h inline1,scalar3,task0,vector2
+
+HFLAGS= -o
+NFLAGS= -o
+
+LIB_M=
+#
+
+CFLAGS= -DUNIX -DTIMES -DSFCHAR="':'" -DMAX_WORKERS=4 -DTHR_EXIT=pthread_exit -DIS_BIG_ENDIAN
+
+THR_SUBS = pthr_subs
+THR_LIBS = -lpthread
+THR_CC =
+
+BIN = ../bin
+XDIR = /seqprg/slib/bin
+
+# set up files for SSE2/Altivec acceleration
+#
+include ../make/Makefile.sse_alt
+
+# no acceleration: select the non-accelerated (_NA) object lists, not SSE2 ones
+#
+DROPGSW_O = $(DROPGSW_NA_O)
+DROPLAL_O = $(DROPLAL_NA_O)
+DROPGNW_O = $(DROPGNW_NA_O)
+DROPLNW_O = $(DROPLNW_NA_O)
+
+# renamed (fasta35) programs - NOTE(review): other platform makefiles include Makefile36m.common; confirm Makefile33.nommap is still current
+include ../make/Makefile33.nommap
+# conventional (fasta3) names
+# include ../make/Makefile.common
diff --git a/make/Makefile.fcom b/make/Makefile.fcom
new file mode 100644
index 0000000..75af885
--- /dev/null
+++ b/make/Makefile.fcom
@@ -0,0 +1,344 @@
+
+#================ common .o files: doinit.o plus init_*.o variants, each built from initfa.c with different -D flags
+
+doinit.o : doinit.c defs.h param.h rstruct.h upam.h structs.h uascii.h aln_structs.h
+ $(CC) $(THR_CC) $(CFLAGS) -c doinit.c
+
+init_sw.o : initfa.c defs.h param.h rstruct.h upam.h structs.h aln_structs.h
+ $(CC) $(THR_CC) $(CFLAGS) -c -DSSEARCH initfa.c -o init_sw.o
+
+init_sw_sse.o : initfa.c defs.h param.h rstruct.h upam.h structs.h aln_structs.h
+ $(CC) $(THR_CC) $(CFLAGS) -c -DSW_SSE2 -DSSEARCH initfa.c -o init_sw_sse.o
+
+init_sw_alt.o : initfa.c defs.h param.h rstruct.h upam.h structs.h aln_structs.h
+ $(CC) $(THR_CC) $(CFLAGS) -c -DSW_ALTIVEC -DSSEARCH initfa.c -o init_sw_alt.o
+
+init_lal.o : initfa.c defs.h param.h rstruct.h upam.h structs.h aln_structs.h
+ $(CC) $(THR_CC) $(CFLAGS) -c -DLALIGN initfa.c -o init_lal.o
+
+init_lnw.o : initfa.c defs.h param.h rstruct.h upam.h structs.h aln_structs.h
+ $(CC) $(THR_CC) $(CFLAGS) -c -DGLSEARCH initfa.c -o init_lnw.o
+
+init_lnw_sse.o : initfa.c defs.h param.h rstruct.h upam.h structs.h aln_structs.h
+ $(CC) $(THR_CC) $(CFLAGS) -c -DSW_SSE2 -DGLSEARCH initfa.c -o init_lnw_sse.o
+
+init_gnw.o : initfa.c defs.h param.h rstruct.h upam.h structs.h aln_structs.h
+ $(CC) $(THR_CC) $(CFLAGS) -c -DGGSEARCH initfa.c -o init_gnw.o
+
+init_gnw_sse.o : initfa.c defs.h param.h rstruct.h upam.h structs.h aln_structs.h
+ $(CC) $(THR_CC) $(CFLAGS) -c -DSW_SSE2 -DGGSEARCH initfa.c -o init_gnw_sse.o
+
+init_rss.o : initfa.c defs.h param.h rstruct.h upam.h structs.h aln_structs.h
+ $(CC) $(THR_CC) $(CFLAGS) -c -DPRSS initfa.c -o init_rss.o
+
+init_rfx.o : initfa.c defs.h param.h rstruct.h upam.h structs.h aln_structs.h
+ $(CC) $(THR_CC) $(CFLAGS) -c -DPRSS -DFASTX initfa.c -o init_rfx.o
+
+init_fa.o : initfa.c defs.h param.h rstruct.h upam.h structs.h aln_structs.h
+ $(CC) $(THR_CC) $(CFLAGS) -c -DFASTA initfa.c -o init_fa.o
+
+init_ff.o : initfa.c defs.h param.h rstruct.h upam.h structs.h aln_structs.h
+ $(CC) $(THR_CC) $(CFLAGS) -c -DFASTF initfa.c -o init_ff.o
+
+init_tf.o : initfa.c defs.h param.h rstruct.h upam.h structs.h aln_structs.h
+ $(CC) $(THR_CC) $(CFLAGS) -c -DFASTF -DTFAST initfa.c -o init_tf.o
+
+init_fs.o : initfa.c defs.h param.h rstruct.h upam.h structs.h aln_structs.h
+ $(CC) $(THR_CC) $(CFLAGS) -c -DFASTS initfa.c -o init_fs.o
+
+init_fm.o : initfa.c defs.h param.h rstruct.h upam.h structs.h aln_structs.h
+ $(CC) $(THR_CC) $(CFLAGS) -c -DFASTM initfa.c -o init_fm.o
+
+init_tfs.o : initfa.c defs.h param.h rstruct.h upam.h structs.h aln_structs.h
+ $(CC) $(THR_CC) $(CFLAGS) -c -DFASTS -DTFAST initfa.c -o init_tfs.o
+
+init_tfm.o : initfa.c defs.h param.h rstruct.h upam.h structs.h aln_structs.h
+ $(CC) $(THR_CC) $(CFLAGS) -c -DFASTM -DTFAST initfa.c -o init_tfm.o
+
+init_tfa.o : initfa.c defs.h param.h rstruct.h upam.h structs.h aln_structs.h
+ $(CC) $(THR_CC) $(CFLAGS) -c -DFASTA -DTFAST initfa.c -o init_tfa.o
+
+init_fx.o : initfa.c defs.h param.h rstruct.h upam.h structs.h aln_structs.h
+ $(CC) $(THR_CC) $(CFLAGS) -c -DFASTX initfa.c -o init_fx.o
+
+init_tfx.o : initfa.c defs.h param.h rstruct.h upam.h structs.h aln_structs.h
+ $(CC) $(THR_CC) $(CFLAGS) -c -DFASTX -DTFAST initfa.c -o init_tfx.o
+
+init_fy.o : initfa.c defs.h param.h rstruct.h upam.h structs.h aln_structs.h
+ $(CC) $(THR_CC) $(CFLAGS) -c -DFASTY initfa.c -o init_fy.o
+
+init_tfy.o : initfa.c defs.h param.h rstruct.h upam.h structs.h aln_structs.h
+ $(CC) $(THR_CC) $(CFLAGS) -c -DFASTY -DTFAST initfa.c -o init_tfy.o
+
+#================ miscellaneous: htime, compacc2e.c variants (_t/_s/_p differ only in -D flags), compacc, apam, pssm_asn_subs
+
+htime.o : htime.c
+ $(CC) $(THR_CC) $(CFLAGS) -c htime.c
+
+compacc2_t.o : compacc2e.c upam.h uascii.h param.h rstruct.h structs.h $(MWH) defs.h aln_structs.h drop_func.h dyn_string.h
+ $(CC) $(THR_CC) $(CFLAGS) -DCOMP_THR -DCOMP_MLIB -c compacc2e.c -o compacc2_t.o
+
+compacc2_s.o : compacc2e.c upam.h uascii.h param.h rstruct.h structs.h $(MWH) defs.h aln_structs.h drop_func.h dyn_string.h
+ $(CC) $(THR_CC) $(CFLAGS) -DCOMP_MLIB -c compacc2e.c -o compacc2_s.o
+
+compacc2_p.o : compacc2e.c upam.h uascii.h param.h rstruct.h structs.h $(MWH) defs.h aln_structs.h drop_func.h dyn_string.h
+ $(CC) $(THR_CC) $(CFLAGS) -DMPI_SRCB -c compacc2e.c -o compacc2_p.o
+
+compacc.o : compacc.c upam.h uascii.h param.h rstruct.h structs.h $(MWH) defs.h aln_structs.h drop_func.h
+ $(CC) $(THR_CC) $(CFLAGS) -DCOMP_THR -c compacc.c -o compacc.o
+
+apam.o : apam.c defs.h param.h uascii.h upam.h
+ $(CC) $(THR_CC) $(CFLAGS) -c apam.c
+
+pssm_asn_subs.o : pssm_asn_subs.c defs.h
+ $(CC) $(THR_CC) $(CFLAGS) -c pssm_asn_subs.c
+
+#================ display list of best hits / alignments ($(SHOWBESTC), $(SHOWALIGN)*, $(LSHOWALIGN), $(MWH), $(MWHP) are not set in this section)
+
+showbest.o : $(SHOWBESTC) $(MWH) defs.h param.h rstruct.h structs.h aln_structs.h drop_func.h
+ $(CC) $(THR_CC) $(CFLAGS) -c $(SHOWBESTC) -o showbest.o
+
+build_ares.o : build_ares.c $(MWH) defs.h param.h rstruct.h structs.h aln_structs.h drop_func.h dyn_string.h
+ $(CC) $(THR_CC) $(CFLAGS) -c build_ares.c -o build_ares.o
+
+$(SHOWALIGN_T).o : $(SHOWALIGN).c $(MWHP) defs.h structs.h param.h rstruct.h aln_structs.h drop_func.h dyn_string.h
+ $(CC) $(THR_CC) $(CFLAGS) -DCOMP_THR -c $(SHOWALIGN).c -o $(SHOWALIGN_T).o
+
+$(SHOWALIGN_P).o : $(SHOWALIGN).c $(MWHP) defs.h structs.h param.h rstruct.h aln_structs.h drop_func.h dyn_string.h
+ $(CC) $(THR_CC) $(CFLAGS) -DMPI_SRC -c $(SHOWALIGN).c -o $(SHOWALIGN_P).o
+
+$(SHOWALIGN_S).o : $(SHOWALIGN).c $(MWHP) defs.h structs.h param.h rstruct.h aln_structs.h drop_func.h dyn_string.h
+ $(CC) $(THR_CC) $(CFLAGS) -c $(SHOWALIGN).c -o $(SHOWALIGN_S).o
+
+$(LSHOWALIGN).o : $(SHOWALIGN).c $(MWHP) defs.h structs.h param.h rstruct.h aln_structs.h drop_func.h dyn_string.h
+ $(CC) $(THR_CC) $(CFLAGS) -DLALIGN -c $(SHOWALIGN).c -o $(LSHOWALIGN).o
+
+re_getlib.o : re_getlib.c mw.h mm_file.h
+ $(CC) $(THR_CC) $(CFLAGS) -c re_getlib.c
+
+lib_sel.o : lib_sel.c defs.h structs.h rstruct.h
+ $(CC) $(THR_CC) $(CFLAGS) -c lib_sel.c
+
+c_dispn.o : c_dispn.c defs.h structs.h param.h rstruct.h aln_structs.h
+ $(CC) $(THR_CC) $(CFLAGS) -c c_dispn.c
+
+#================ statistical functions (scaleswn.c and scaleswt.c are each compiled twice, tatstats.c three times, with different -D flags)
+
+karlin.o : karlin.c param.h rstruct.h
+ $(CC) $(THR_CC) $(CFLAGS) -c karlin.c
+
+scale_se.o : scaleswn.c defs.h param.h rstruct.h structs.h $(MWH) alt_parms.h
+ $(CC) $(THR_CC) $(CFLAGS) -DLOCAL_SCORE -c scaleswn.c -o scale_se.o
+
+scale_sn.o : scaleswn.c defs.h param.h rstruct.h structs.h $(MWH) alt_parms.h
+ $(CC) $(THR_CC) -DNORMAL_DIST $(CFLAGS) -c scaleswn.c -o scale_sn.o
+
+scaleswtf.o : scaleswt.c defs.h param.h rstruct.h structs.h $(MWH) alt_parms.h
+ $(CC) $(THR_CC) $(CFLAGS) -DFASTF -c scaleswt.c -o scaleswtf.o
+
+scaleswts.o : scaleswt.c defs.h param.h rstruct.h structs.h $(MWH) alt_parms.h
+ $(CC) $(THR_CC) $(CFLAGS) -c scaleswt.c -o scaleswts.o
+
+tatstats_fs.o : tatstats.c tatstats.h
+ $(CC) $(THR_CC) $(CFLAGS) -c -DFASTS tatstats.c -o tatstats_fs.o
+
+tatstats_ff.o : tatstats.c tatstats.h
+ $(CC) $(THR_CC) $(CFLAGS) -c -DFASTF tatstats.c -o tatstats_ff.o
+
+tatstats_fm.o : tatstats.c tatstats.h
+ $(CC) $(THR_CC) $(CFLAGS) -c -DFASTM tatstats.c -o tatstats_fm.o
+
+last_tat.o : last_tat.c defs.h mm_file.h structs.h param.h rstruct.h
+ $(CC) $(THR_CC) $(CFLAGS) -c last_tat.c
+
+last_thresh.o : last_thresh.c defs.h mm_file.h structs.h param.h rstruct.h
+ $(CC) $(THR_CC) $(CFLAGS) -c last_thresh.c
+
+#================ drop functions - actual scores/alignments
+
+drop_nfa.o : dropnfa.c dropnfa.h param.h rstruct.h defs.h drop_func.h dyn_string.h
+ $(CC) $(THR_CC) $(CFLAGS) -c dropnfa.c -o drop_nfa.o
+
+dropsbd.o : dropnfa.c dropnfa.h param.h rstruct.h defs.h drop_func.h dyn_string.h
+ $(CC) $(THR_CC) $(CFLAGS) -c dropsbd.c -o dropsbd.o
+
+# drop_ff, _fs, _fm must define FASTF, FASTS, and FASTM to ensure
+# that tatstats.h is built appropriately
+
+drop_ff2.o : dropff2.c param.h rstruct.h defs.h tatstats.h drop_func.h dyn_string.h
+ $(CC) $(THR_CC) $(CFLAGS) -c -DFASTF dropff2.c -o drop_ff2.o
+
+drop_tff.o : dropff2.c param.h rstruct.h defs.h tatstats.h drop_func.h dyn_string.h
+ $(CC) $(THR_CC) $(CFLAGS) -c -DFASTF -DTFAST dropff2.c -o drop_tff.o
+
+drop_fs2.o : dropfs2.c param.h rstruct.h defs.h tatstats.h drop_func.h dyn_string.h
+ $(CC) $(THR_CC) $(CFLAGS) -DFASTS -c dropfs2.c -o drop_fs2.o
+
+drop_tfs.o : dropfs2.c param.h rstruct.h defs.h tatstats.h drop_func.h dyn_string.h
+ $(CC) $(THR_CC) $(CFLAGS) -c -DTFAST -DFASTS dropfs2.c -o drop_tfs.o
+
+drop_fm.o : dropfs2.c param.h rstruct.h defs.h tatstats.h drop_func.h dyn_string.h
+ $(CC) $(THR_CC) $(CFLAGS) -c -DFASTM dropfs2.c -o drop_fm.o
+
+drop_tfm.o : dropfs2.c param.h rstruct.h defs.h tatstats.h drop_func.h dyn_string.h
+ $(CC) $(THR_CC) $(CFLAGS) -c -DTFAST -DFASTM dropfs2.c -o drop_tfm.o
+
+drop_tfa.o : dropnfa.c dropnfa.h upam.h param.h rstruct.h defs.h drop_func.h dyn_string.h
+ $(CC) $(THR_CC) $(CFLAGS) -c -DTFASTA dropnfa.c -o drop_tfa.o
+
+drop_fx.o : dropfx2.c upam.h param.h rstruct.h defs.h drop_func.h dyn_string.h
+ $(CC) $(THR_CC) $(CFLAGS) -c dropfx2.c -o $@
+
+drop_tfx.o : dropfx2.c upam.h param.h rstruct.h defs.h drop_func.h dyn_string.h
+ $(CC) $(THR_CC) $(CFLAGS) -c -DTFAST dropfx2.c -o $@
+
+drop_fz.o : dropfz3.c upam.h param.h rstruct.h defs.h aamap.h drop_func.h dyn_string.h
+ $(CC) $(THR_CC) $(CFLAGS) -c dropfz3.c -o $@
+
+drop_tfz.o : dropfz3.c upam.h param.h rstruct.h defs.h aamap.h drop_func.h dyn_string.h
+ $(CC) $(THR_CC) $(CFLAGS) -c -DTFAST dropfz3.c -o $@
+
+dropnsw.o : dropnsw.c upam.h param.h rstruct.h structs.h drop_func.h dyn_string.h
+ $(CC) $(THR_CC) $(CFLAGS) -c dropnsw.c -o $@
+
+#dropgsw.o : dropgsw.c dropgsw.h defs.h param.h rstruct.h drop_func.h a_mark.h dyn_string.h
+# $(CC) $(THR_CC) $(CFLAGS) -c dropgsw.c -o dropgsw.o
+
+dropgsw2.o : dropgsw2.c dropgsw2.h defs.h param.h rstruct.h drop_func.h a_mark.h dyn_string.h
+ $(CC) $(THR_CC) $(CFLAGS) -c dropgsw2.c -o $@
+
+dropgsw2_sse.o : dropgsw2.c dropgsw2.h defs.h param.h rstruct.h drop_func.h a_mark.h dyn_string.h
+ $(CC) $(THR_CC) $(CFLAGS) -DSW_SSE2 -c dropgsw2.c -o $@
+
+dropgsw2_alt.o : dropgsw2.c dropgsw2.h defs.h param.h rstruct.h drop_func.h a_mark.h dyn_string.h
+ $(CC) $(THR_CC) $(CFLAGS) -DSW_ALTIVEC -c dropgsw2.c -o $@
+
+droplal2.o : dropgsw2.c dropgsw2.h defs.h param.h rstruct.h drop_func.h a_mark.h dyn_string.h
+ $(CC) $(THR_CC) -DLALIGN $(CFLAGS) -c dropgsw2.c -o $@
+
+droplal2_sse.o : dropgsw2.c dropgsw2.h defs.h param.h rstruct.h drop_func.h a_mark.h dyn_string.h
+ $(CC) $(THR_CC) -DLALIGN $(CFLAGS) -DSW_SSE2 -c dropgsw2.c -o $@
+
+droplal2_alt.o : dropgsw2.c dropgsw2.h defs.h param.h rstruct.h drop_func.h a_mark.h dyn_string.h
+ $(CC) $(THR_CC) -DLALIGN $(CFLAGS) -DSW_ALTIVEC -c dropgsw2.c -o $@
+
+lsim4.o : lsim4.c lsim4.h param.h rstruct.h defs.h
+ $(CC) $(THR_CC) $(CFLAGS) -c lsim4.c -o $@
+
+smith_waterman_altivec.o : smith_waterman_altivec.c smith_waterman_altivec.h dropgsw2.h defs.h param.h rstruct.h
+ $(CC) $(THR_CC) $(CFLAGS) -DSW_ALTIVEC -c smith_waterman_altivec.c -o $@
+
+smith_waterman_sse2.o : smith_waterman_sse2.c smith_waterman_sse2.h dropgsw2.h defs.h param.h rstruct.h
+ $(CC) $(THR_CC) $(CFLAGS) -DSW_SSE2 -c smith_waterman_sse2.c -o $@
+
+global_sse2.o : global_sse2.c global_sse2.h dropgsw2.h defs.h param.h rstruct.h
+ $(CC) $(THR_CC) $(CFLAGS) -DSW_SSE2 -c global_sse2.c -o $@
+
+glocal_sse2.o : glocal_sse2.c glocal_sse2.h dropgsw2.h defs.h param.h rstruct.h
+ $(CC) $(THR_CC) $(CFLAGS) -DSW_SSE2 -c glocal_sse2.c -o $@
+
+droplnw.o : dropnnw2.c upam.h param.h rstruct.h structs.h drop_func.h dyn_string.h
+ $(CC) $(THR_CC) $(CFLAGS) -c dropnnw2.c -o $@
+
+droplnw_sse.o : dropnnw2.c upam.h param.h rstruct.h structs.h drop_func.h dyn_string.h
+ $(CC) $(THR_CC) -DSW_SSE2 $(CFLAGS) -c dropnnw2.c -o $@
+
+dropgnw.o : dropnnw2.c upam.h param.h rstruct.h structs.h drop_func.h dyn_string.h
+ $(CC) $(THR_CC) -DGLOBAL_GLOBAL $(CFLAGS) -c dropnnw2.c -o $@
+
+dropgnw_sse.o : dropnnw2.c upam.h param.h rstruct.h structs.h drop_func.h dyn_string.h
+ $(CC) $(THR_CC) -DGLOBAL_GLOBAL -DSW_SSE2 $(CFLAGS) -c dropnnw2.c -o $@
+
+lwm_align.o : wm_align.c defs.h param.h rstruct.h a_mark.h
+ $(CC) $(THR_CC) $(CFLAGS) -c wm_align.c -o $@
+
+gwm_align.o : wm_align.c defs.h param.h rstruct.h a_mark.h
+ $(CC) $(THR_CC) $(CFLAGS) -DGGSEARCH -c wm_align.c -o $@
+
+calcons_fa.o : cal_cons2.c defs.h param.h rstruct.h a_mark.h
+ $(CC) $(THR_CC) $(CFLAGS) -DFASTA -c cal_cons2.c -o $@
+
+calcons_tfa.o : cal_cons2.c defs.h param.h rstruct.h a_mark.h
+ $(CC) $(THR_CC) $(CFLAGS) -DTFASTA -c cal_cons2.c -o $@
+
+calcons_sw.o : cal_cons2.c defs.h param.h rstruct.h a_mark.h
+ $(CC) $(THR_CC) $(CFLAGS) -DSSEARCH -c cal_cons2.c -o $@
+
+calcons_la.o : cal_cons2.c defs.h param.h rstruct.h a_mark.h
+ $(CC) $(THR_CC) -DLALIGN -DLCAL_CONS $(CFLAGS) -c cal_cons2.c -o $@
+
+calcons_ff.o : cal_consf.c defs.h param.h rstruct.h a_mark.h
+ $(CC) $(THR_CC) $(CFLAGS) -DFASTF -c cal_consf.c -o $@
+
+calcons_fs.o : cal_consf.c defs.h param.h rstruct.h a_mark.h
+ $(CC) $(THR_CC) $(CFLAGS) -DFASTS -c cal_consf.c -o $@
+
+calcons_fm.o : cal_consf.c defs.h param.h rstruct.h a_mark.h
+ $(CC) $(THR_CC) $(CFLAGS) -DFASTM -c cal_consf.c -o $@
+
+calcons_tff.o : cal_consf.c defs.h param.h rstruct.h a_mark.h
+ $(CC) $(THR_CC) $(CFLAGS) -DTFAST -DFASTF -c cal_consf.c -o $@
+
+calcons_tfs.o : cal_consf.c defs.h param.h rstruct.h a_mark.h
+ $(CC) $(THR_CC) $(CFLAGS) -DTFAST -DFASTS -c cal_consf.c -o $@
+
+calcons_tfm.o : cal_consf.c defs.h param.h rstruct.h a_mark.h
+ $(CC) $(THR_CC) $(CFLAGS) -DTFAST -DFASTM -c cal_consf.c -o $@
+
+#================ reading query, libraries
+
+getseq.o : getseq.c defs.h uascii.h structs.h rstruct.h upam.h mm_file.h
+ $(CC) $(THR_CC) $(CFLAGS) -c getseq.c -o $@
+
+llgetaa.o : llgetaa.c upam.h uascii.h mm_file.h
+ $(CC) $(THR_CC) $(CFLAGS) -c -DNOLIB llgetaa.c -o $@
+
+lgetlib.o : $(NGETLIB).c altlib.h upam.h uascii.h mm_file.h
+ $(CC) $(THR_CC) $(CFLAGS) -c $(NGETLIB).c -o $@
+
+lgetaa_m.o : mmgetaa.c altlib.h ncbl2_head.h upam.h uascii.h mm_file.h
+ $(CC) $(THR_CC) $(CFLAGS) -c mmgetaa.c -o $@
+
+ncbl_lib.o : ncbl_lib.c ncbl_head.h
+ $(CC) $(THR_CC) $(CFLAGS) -c ncbl_lib.c -o $@
+
+ncbl2_mlib.o : ncbl2_mlib.c ncbl2_head.h mm_file.h
+ $(CC) $(THR_CC) $(CFLAGS) -c ncbl2_mlib.c -o $@
+
+mysql_lib.o : mysql_lib.c mm_file.h
+ $(CC) $(THR_CC) $(CFLAGS) -c mysql_lib.c -o $@
+
+pgsql_lib.o : pgsql_lib.c mm_file.h
+ $(CC) $(THR_CC) $(CFLAGS) -c pgsql_lib.c -o $@
+
+#================ threading functions
+
+pthr_subs2.o : pthr_subs2.c thr_bufs2.h pthr_subs.h
+ $(CC) $(THR_CC) $(CFLAGS) -c pthr_subs2.c -o $@
+
+uthr_subs.o : uthr_subs.c thr_bufs2.h uthr_subs.h
+ $(CC) $(THR_CC) $(CFLAGS) -c uthr_subs.c -o $@
+
+#================ MPI worker function
+
+mpi_subs2.o : pcomp_subs2.c pcomp_bufs.h thr_buf_structs.h
+ $(CC) -DMPI_SRC $(CFLAGS) -c pcomp_subs2.c -o mpi_subs2.o
+
+#================ translation
+
+faatran.o : faatran.c upam.h uascii.h
+ $(CC) $(THR_CC) $(CFLAGS) -c faatran.c -o $@
+
+url_subs.o : url_subs.c structs.h param.h rstruct.h
+ $(CC) $(THR_CC) $(CFLAGS) -c url_subs.c -o $@
+
+#================ lav plotting functions
+
+lav2plt.o : lav2plt.c lav_defs.h
+ $(CC) $(CFLAGS) -c lav2plt.c -o $@
+
+lavplt_ps.o : lavplt_ps.c lav_defs.h
+ $(CC) $(CFLAGS) -c lavplt_ps.c -o $@
+
+lavplt_svg.o : lavplt_svg.c lav_defs.h
+ $(CC) $(CFLAGS) -c lavplt_svg.c -o $@
diff --git a/make/Makefile.freebsd b/make/Makefile.freebsd
new file mode 100644
index 0000000..e837696
--- /dev/null
+++ b/make/Makefile.freebsd
@@ -0,0 +1,72 @@
+#
+# Makefile for building fasta3 on FreeBSD
+#
+# Fernan Aguero - <fernan at iib.unsam.edu.ar>
+
+# we take care of doing variable assignment using the '?=' and '+='
+# operators to preserve the value of variables if they are already
+# defined. In FreeBSD this happens when fasta3 is built from the port or
+# when the user has set these variables -- most notably CC and/or CFLAGS
+# -- in /etc/make.conf
+
+# Compiler executable, and optional flags
+CC?= gcc
+CFLAGS?= -g -O2
+
+# your FASTA host
+FASTA_HOST?= "your_fasta_host"
+
+# common CFLAGS. These are the set of CFLAGS that are always used
+COMMON_CFLAGS= -DSHOWSIM -DUNIX -DTIMES -DHZ=100 -DMAX_WORKERS=2 \
+ -DTHR_EXIT=pthread_exit -DPROGRESS -DUSE_MMAP -D_REENTRANT \
+ -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DUSE_FSEEKO \
+ -DHAS_INTTYPES -DSAMP_STATS
+
+# standard options, these will be added to the common CFLAGS if
+# selected below
+STANDARD_CFLAGS= -DSFCHAR="':'" -DFASTA_HOST='${FASTA_HOST}' \
+ -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -DPGM_DOC
+
+# options for superfamily validations, these will be added to the common
+# CFLAGS if selected below
+SUPERFAMILY_CFLAGS= -DSFCHAR="'|'" -DSUPERFAMNUM -DBIG_LIB64
+
+# here we define CFLAGS to be the sum of common flags plus a subset of
+# optional flags that define our intended use.
+# The standard flags are selected by default, although the user
+# can override this if s/he wants
+CFLAGS+= ${COMMON_CFLAGS} ${STANDARD_CFLAGS}
+
+BIN = ../bin
+XDIR = /usr/local/bin
+
+LIB_M+= -lm
+
+HFLAGS+= -o
+NFLAGS+= -o
+
+# FreeBSD users BEWARE! Different threading models ahead!
+
+# The threading model has changed along the way from FreeBSD-4 to
+# FreeBSD-6. If you're building fasta3 on your own, you will need to
+# adjust this accordingly. The default works in FreeBSD-6x (currently
+# the recommended major version for use in production). Or better yet,
+# use the biology/fasta3 port from the ports collection, which will use
+# the correct threading library for your OSVERSION
+
+THR_SUBS?= pthr_subs2
+THR_LIBS?= -lpthread
+THR_CC?=
+
+# set up files for SSE2/Altivec acceleration
+#
+include ../make/Makefile.sse_alt
+
+# no acceleration
+#
+DROPGSW_O = $(DROPGSW_NA_O)
+DROPLAL_O = $(DROPLAL_NA_O)
+DROPGNW_O = $(DROPGNW_NA_O)
+DROPLNW_O = $(DROPLNW_NA_O)
+
+include ../make/Makefile36m.common
diff --git a/make/Makefile.hpux_it b/make/Makefile.hpux_it
new file mode 100644
index 0000000..b05e07e
--- /dev/null
+++ b/make/Makefile.hpux_it
@@ -0,0 +1,56 @@
+#
+# makefile for fasta3, fasta3_t
+#
+# flags for HP-UX #
+
+CC= cc -g -O2 +Onolimit -Wl,+pi,1M -Wl,+pd,1M -Wl,+mergeseg
+#CC = gcc -g -DDEBUG
+
+#CC=gcc -Wall -pedantic -ansi -g -O
+#CC= /usr/local/parasoft/bin.linux2/insure -g -DDEBUG
+
+# EBI uses the following with pgcc, -O3 does not work:
+# CC= pgcc -O2 -pipe -mcpu=pentiumpro -march=pentiumpro -fomit-frame-pointer
+
+# this file is for HP-UX (see header above), not x86 Linux
+
+# use options below for superfamily validations
+#CFLAGS= -DSHOWSIM -DUNIX -DTIMES -DHZ=100 -DSFCHAR="'|'" -DMAX_WORKERS=2 -DTHR_EXIT=pthread_exit -DPROGRESS -DSUPERFAMNUM -DUSE_MMAP -DBIG_LIB64 -D_LARGE_FILE_SOURCE -DUSE_FSEEKO -D_FILE_OFFSET_BITS=64 -DHAS_INTTYPES -DSAMP_STATS
+
+# standard options
+CFLAGS= -DSHOWSIM -DUNIX -DTIMES -DHZ=100 -DSFCHAR="':'" -DMAX_WORKERS=2 -DTHR_EXIT=pthread_exit -DPROGRESS -DUSE_MMAP -DHAS_INTTYPES -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -DUSE_FSEEKO -DSAMP_STATS -DPGM_DOC
+# -I/usr/local/include/mysql -DMYSQL_DB
+#
+#(for mySQL databases) (also requires change to Makefile35.common)
+
+LIB_M = -lm
+#LIB_M = -L/usr/local/lib/mysql -lmysqlclient -lm
+# for mySQL databases
+
+HFLAGS= -o
+NFLAGS= -o
+
+# for HP-UX (POSIX threads)
+THR_SUBS = pthr_subs2
+THR_LIBS = -lpthread
+THR_CC =
+
+BIN = ../bin
+XDIR = /seqprg/bin
+
+# set up files for SSE2/Altivec acceleration
+#
+include ../make/Makefile.sse_alt
+
+# no acceleration
+#
+DROPGSW_O = $(DROPGSW_NA_O)
+DROPLAL_O = $(DROPLAL_NA_O)
+DROPGNW_O = $(DROPGNW_NA_O)
+DROPLNW_O = $(DROPLNW_NA_O)
+
+# renamed (fasta36) programs
+include ../make/Makefile36m.common
+# conventional (fasta3) names
+# include ../make/Makefile.common
+
diff --git a/make/Makefile.ibm b/make/Makefile.ibm
new file mode 100644
index 0000000..160b10b
--- /dev/null
+++ b/make/Makefile.ibm
@@ -0,0 +1,37 @@
+#
+# makefile for fasta3, fasta3_t. Use makefile.pvm for pvcompxx.
+#
+
+CC= xlc_r -O3 -qarch=auto -qtune=auto -qcache=auto
+
+# for IBM with current pthreads
+CFLAGS= -DUNIX -DTIMES -DSFCHAR="':'" -DMAX_WORKERS=4 -DTHR_EXIT=pthread_exit -DPROGRESS -DIS_BIG_ENDIAN -DUSE_MMAP -DIBM_AIX -D_LARGE_FILES -DHAS_INTTYPES -UMAXSEG -DSAMP_STATS -DPGM_DOC
+
+# consider -D_LARGE_FILE_API -D_LARGE_FILES for files > 2 GB
+
+LIB_M = -lm
+
+HFLAGS= -o
+NFLAGS= -o
+
+THR_SUBS = pthr_subs2
+THR_LIBS = -lpthreads
+THR_CC =
+
+BIN = ../bin
+XDIR = /seqprg/slib/bin
+
+# set up files for SSE2/Altivec acceleration
+#
+include ../make/Makefile.sse_alt
+
+# no acceleration
+#
+DROPGSW_O = $(DROPGSW_NA_O)
+DROPLAL_O = $(DROPLAL_NA_O)
+DROPGNW_O = $(DROPGNW_NA_O)
+DROPLNW_O = $(DROPLNW_NA_O)
+
+# renamed (fasta36) programs
+include ../make/Makefile36m.common
+
diff --git a/make/Makefile.linux b/make/Makefile.linux
new file mode 120000
index 0000000..b1a4d6b
--- /dev/null
+++ b/make/Makefile.linux
@@ -0,0 +1 @@
+Makefile.linux64_sse2
\ No newline at end of file
diff --git a/make/Makefile.linux32 b/make/Makefile.linux32
new file mode 100644
index 0000000..82f96a9
--- /dev/null
+++ b/make/Makefile.linux32
@@ -0,0 +1,63 @@
+# $ Id: $
+#
+# makefile for fasta3, fasta3_t. Use makefile.pvm for pvcompxx.
+#
+# This file is designed for 32-bit Linux systems using an X86
+# architecture without SSE2 extensions.
+#
+# To use on a 64-bit linux system, add -D_LARGEFILE64_SOURCE and -DBIG_LIB64
+# (or use Makefile.linux64)
+#
+
+SHELL=/bin/bash
+
+CC= gcc -g -O
+#CC = gcc -g -DDEBUG
+# alternative CC settings for debugging builds (the last uncommented CC wins)
+#CC=gcc -Wall -pedantic -ansi -g -O
+#CC= /usr/local/parasoft/bin/insure -g -DDEBUG
+
+# EBI uses the following with pgcc, -O3 does not work:
+# CC= pgcc -O2 -pipe -mcpu=pentiumpro -march=pentiumpro -fomit-frame-pointer
+
+# this file works for x86 LINUX
+
+# standard options
+CFLAGS= -DSHOW_HELP -DSHOWSIM -DUNIX -DTIMES -DHZ=100 -DMAX_WORKERS=4 -DTHR_EXIT=pthread_exit -DPROGRESS -DUSE_MMAP -D_REENTRANT -DHAS_INTTYPES -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DUSE_FSEEKO -DSAMP_STATS -DPGM_DOC
+# -DSFCHAR="'|'" -DSUPERFAMNUM
+
+# -I/usr/local/include/mysql -DMYSQL_DB
+#
+#(for mySQL databases) (also requires change to Makefile35.common)
+
+LIB_M = -lm
+#LIB_M = -L/usr/local/lib/mysql -lmysqlclient -lm
+# for mySQL databases
+
+HFLAGS= -o
+NFLAGS= -o
+
+# for Linux
+THR_SUBS = pthr_subs2
+THR_LIBS = -lpthread
+THR_CC =
+
+BIN = ../bin
+XDIR = /seqprg/bin
+
+# set up files for SSE2/Altivec acceleration
+#
+include ../make/Makefile.sse_alt
+
+# no acceleration
+#
+DROPGSW_O = $(DROPGSW_NA_O)
+DROPLAL_O = $(DROPLAL_NA_O)
+DROPGNW_O = $(DROPGNW_NA_O)
+DROPLNW_O = $(DROPLNW_NA_O)
+
+# renamed (fasta36) programs
+include ../make/Makefile36m.common
+# conventional (fasta3) names
+# include ../make/Makefile.common
+
diff --git a/make/Makefile.linux32_sse2 b/make/Makefile.linux32_sse2
new file mode 100644
index 0000000..05d562c
--- /dev/null
+++ b/make/Makefile.linux32_sse2
@@ -0,0 +1,68 @@
+#
+# $Id: Makefile.linux32_sse2 479 2011-01-12 13:13:03Z wrp $
+#
+# makefile for fasta3, fasta3_t. Use makefile.pvm for pvcompxx.
+#
+# This file is designed for 32-bit Linux systems using an X86
+# architecture with SSE2 extensions. SSE2 is used for ssearch36(_t)
+#
+# To use on a 64-bit linux system, add -D_LARGEFILE64_SOURCE and -DBIG_LIB64
+# (or use Makefile.linux64_sse2)
+#
+
+SHELL=/bin/bash
+
+CC= gcc -g -O -msse2 -ffast-math
+#CC = gcc -g -DDEBUG -msse2
+
+#CC= /usr/local/parasoft/bin/insure -g -DDEBUG
+
+#CC=gcc -Wall -pedantic -ansi -g -O
+
+# EBI uses the following with pgcc, -O3 does not work:
+# CC= pgcc -O2 -pipe -mcpu=pentiumpro -march=pentiumpro -fomit-frame-pointer
+
+# this file works for x86 LINUX
+
+# standard options
+
+CFLAGS= -DSHOWSIM -DUNIX -DTIMES -DHZ=100 -DMAX_WORKERS=8 -DTHR_EXIT=pthread_exit -DPROGRESS -D_REENTRANT -DHAS_INTTYPES -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DUSE_FSEEKO -DSAMP_STATS -DPGM_DOC -DUSE_MMAP
+
+# -DSUPERFAMNUM -DSFCHAR="'|'"
+
+# -I/usr/local/include/mysql -DMYSQL_DB
+#
+#(for mySQL databases) (also requires change to Makefile35.common)
+
+LIB_M = -lm
+#LIB_M = -L/usr/local/lib/mysql -lmysqlclient -lm
+# for mySQL databases
+
+HFLAGS= -o
+NFLAGS= -o
+
+# for Linux
+THR_SUBS = pthr_subs2
+THR_LIBS = -lpthread
+THR_CC =
+
+BIN = ../bin
+XDIR = /seqprg/bin
+#XDIR = ~/bin/LINUX
+
+# set up files for SSE2/Altivec acceleration
+#
+include ../make/Makefile.sse_alt
+
+# SSE2 acceleration
+#
+DROPGSW_O = $(DROPGSW_SSE_O)
+DROPLAL_O = $(DROPLAL_SSE_O)
+DROPGNW_O = $(DROPGNW_SSE_O)
+DROPLNW_O = $(DROPLNW_SSE_O)
+
+# renamed (fasta36) programs
+include ../make/Makefile36m.common
+# conventional (fasta3) names
+# include ../make/Makefile.common
+
diff --git a/make/Makefile.linux64 b/make/Makefile.linux64
new file mode 120000
index 0000000..b1a4d6b
--- /dev/null
+++ b/make/Makefile.linux64
@@ -0,0 +1 @@
+Makefile.linux64_sse2
\ No newline at end of file
diff --git a/make/Makefile.linux64_sse2 b/make/Makefile.linux64_sse2
new file mode 100644
index 0000000..addc5ab
--- /dev/null
+++ b/make/Makefile.linux64_sse2
@@ -0,0 +1,65 @@
+# $ Id: $
+#
+# makefile for fasta3, fasta3_t Use Makefile.mpi for fasta36_mpi
+#
+# This file is designed for 64-bit Linux systems using an X86
+# architecture with SSE2 extensions. -D_LARGEFILE64_SOURCE and
+# -DBIG_LIB64 require a 64-bit linux system.
+# SSE2 extensions are used for ssearch36(_t)
+#
+# Use Makefile.linux32_sse2 for 32-bit linux x86
+#
+
+SHELL=/bin/bash
+
+CC = gcc -g -O -msse2
+#CC= gcc -pg -g -O -msse2 -ffast-math
+#CC = gcc -g -DDEBUG -msse2
+#CC=gcc -Wall -pedantic -ansi -g -msse2 -DDEBUG
+
+# EBI uses the following with pgcc, -O3 does not work:
+# CC= pgcc -O2 -pipe -mcpu=pentiumpro -march=pentiumpro -fomit-frame-pointer
+
+# this file works for x86 LINUX
+
+# standard options
+
+CFLAGS= -DSHOW_HELP -DSHOWSIM -DUNIX -DTIMES -DHZ=100 -DMAX_WORKERS=8 -DTHR_EXIT=pthread_exit -DM10_CONS -D_REENTRANT -DHAS_INTTYPES -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DUSE_FSEEKO -DSAMP_STATS -DPGM_DOC -DUSE_MMAP -D_LARGEFILE64_SOURCE -DBIG_LIB64
+# -I/usr/include/mysql -DMYSQL_DB
+# -DSUPERFAMNUM -DSFCHAR="'|'"
+
+#
+#(for mySQL databases) (also requires change to Makefile36m.common or use of Makefile36m.common_mysql)
+# run 'mysql_config' to find locations of mySQL files
+
+LIB_M = -lm
+# for mySQL databases
+# LIB_M = -L/usr/lib64/mysql -lmysqlclient -lm
+
+HFLAGS= -o
+NFLAGS= -o
+
+# for Linux
+THR_SUBS = pthr_subs2
+THR_LIBS = -lpthread
+THR_CC =
+
+BIN = ../bin
+XDIR = /seqprg/bin
+#XDIR = ~/bin/LINUX
+
+# set up files for SSE2/Altivec acceleration
+#
+include ../make/Makefile.sse_alt
+
+# SSE2 acceleration
+#
+DROPGSW_O = $(DROPGSW_SSE_O)
+DROPLAL_O = $(DROPLAL_SSE_O)
+DROPGNW_O = $(DROPGNW_SSE_O)
+DROPLNW_O = $(DROPLNW_SSE_O)
+
+# renamed (fasta36) programs
+include ../make/Makefile36m.common
+# conventional (fasta3) names
+# include ../make/Makefile.common
diff --git a/make/Makefile.linux_icc b/make/Makefile.linux_icc
new file mode 100644
index 0000000..438c264
--- /dev/null
+++ b/make/Makefile.linux_icc
@@ -0,0 +1,58 @@
+# $Id: Makefile.linux_icc 499 2011-01-28 10:20:04Z wrp $
+#
+# makefile for fasta3, fasta3_t using the Intel icc compiler
+#
+# This file is designed for 64-bit Linux systems.
+# -D_LARGEFILE64_SOURCE and -DBIG_LIB64 require a 64-bit linux system.
+
+SHELL=/bin/bash
+
+CC= icc -g -O3
+#CC = icc -g -DDEBUG
+
+#CC=gcc -Wall -pedantic -ansi -g -O
+#CC= /usr/local/parasoft/bin/insure -g -DDEBUG
+
+# EBI uses the following with pgcc, -O3 does not work:
+# CC= pgcc -O2 -pipe -mcpu=pentiumpro -march=pentiumpro -fomit-frame-pointer
+
+# this file works for x86 LINUX
+
+# standard options
+CFLAGS= -DSHOWSIM -DUNIX -DTIMES -DHZ=100 -DMAX_WORKERS=8 -DTHR_EXIT=pthread_exit -DFASTA_HOST='"your_fasta_host_here"' -DUSE_MMAP -D_REENTRANT -DHAS_INTTYPES -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -DUSE_FSEEKO -DPGM_DOC -DBIG_LIB64 -DSAMP_STATS
+
+# -I/usr/local/include/mysql -DMYSQL_DB
+#
+#(for mySQL databases) (also requires change to Makefile36.common)
+
+LIB_M = -lm
+#LIB_M = -L/usr/local/lib/mysql -lmysqlclient -lm
+# for mySQL databases
+
+HFLAGS= -o
+NFLAGS= -o
+
+# for Linux
+THR_SUBS = pthr_subs2
+THR_LIBS = -lpthread
+THR_CC =
+
+BIN = ../bin
+XDIR = /seqprg/bin
+
+# set up files for SSE2/Altivec acceleration
+#
+include ../make/Makefile.sse_alt
+
+# no acceleration
+#
+DROPGSW_O = $(DROPGSW_NA_O)
+DROPLAL_O = $(DROPLAL_NA_O)
+DROPGNW_O = $(DROPGNW_NA_O)
+DROPLNW_O = $(DROPLNW_NA_O)
+
+# renamed (fasta36) programs
+include ../make/Makefile36m.common
+# conventional (fasta3) names
+# include ../make/Makefile.common
+
diff --git a/make/Makefile.linux_icc_sse2 b/make/Makefile.linux_icc_sse2
new file mode 100644
index 0000000..72724ec
--- /dev/null
+++ b/make/Makefile.linux_icc_sse2
@@ -0,0 +1,55 @@
+ # $Id: Makefile.linux_icc_sse2 1162 2013-05-27 16:48:11Z wrp $
+#
+# makefile for fasta3, fasta3_t using the Intel icc compiler
+#
+# This file is designed for 64-bit Linux systems.
+# -D_LARGEFILE64_SOURCE and -DBIG_LIB64 require a 64-bit linux system.
+#
+# uses SSE2 extensions for ssearch36(_t)
+
+SHELL=/bin/bash
+
+CC= icc -O3 -g
+#CC = icc -g -DDEBUG
+
+#CC=gcc -Wall -pedantic -ansi -g -O
+#CC= /usr/local/parasoft/bin/insure -g -DDEBUG
+
+# this file works for x86 LINUX
+
+# standard options
+CFLAGS= -DSHOW_HELP -DSHOWSIM -DUNIX -DTIMES -DHZ=100 -DMAX_WORKERS=8 -DTHR_EXIT=pthread_exit -DUSE_MMAP -D_REENTRANT -DHAS_INTTYPES -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -DUSE_FSEEKO -DPGM_DOC -DBIG_LIB64 -DSAMP_STATS
+# -I/usr/include/mysql -DMYSQL_DB
+#(for mySQL databases) (also requires change to Makefile36.common)
+
+LIB_M = -lm
+#LIB_M = -L/usr/local/lib/mysql -lmysqlclient -lm
+# for mySQL databases
+
+HFLAGS= -o
+NFLAGS= -o
+
+# for Linux
+THR_SUBS = pthr_subs2
+THR_LIBS = -lpthread
+THR_CC =
+
+BIN = ../bin
+XDIR = /seqprg/bin
+
+# set up files for SSE2/Altivec acceleration
+#
+include ../make/Makefile.sse_alt
+
+# SSE2 acceleration
+#
+DROPGSW_O = $(DROPGSW_SSE_O)
+DROPLAL_O = $(DROPLAL_SSE_O)
+DROPGNW_O = $(DROPGNW_SSE_O)
+DROPLNW_O = $(DROPLNW_SSE_O)
+
+# renamed (fasta36) programs
+include ../make/Makefile36m.common
+# conventional (fasta3) names
+# include ../make/Makefile.common
+
diff --git a/make/Makefile.linux_mysql b/make/Makefile.linux_mysql
new file mode 100644
index 0000000..d7c7ea4
--- /dev/null
+++ b/make/Makefile.linux_mysql
@@ -0,0 +1,57 @@
+# $ Id: $
+# makefile for fasta3, fasta3_t. Use makefile.pvm for pvcompxx.
+#
+# Includes files for reading mysql databases
+#
+# This file is designed for 64-bit Linux systems
+# -D_LARGEFILE64_SOURCE and -DBIG_LIB64 require a 64-bit linux system.
+# This makefile does not use SSE2 extensions.
+#
+
+SHELL=/bin/bash
+
+CC= gcc -g -O2
+#CC= gcc -g -DDEBUG
+
+# this file works for x86 LINUX
+
+# standard options
+CFLAGS= -DSHOWSIM -DUNIX -DTIMES -DHZ=100 -DSFCHAR="':'" -DMAX_WORKERS=2 -DTHR_EXIT=pthread_exit -DFASTA_HOST='"your_fasta_host_here"' -DUSE_MMAP -D_REENTRANT -I/usr/include/mysql -DMYSQL_DB -D_FILE_OFFSET_BITS=64 -DUSE_FSEEKO -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE -DHAS_INTTYPES -DSAMP_STATS -DPGM_DOC -DM10_CONS -DBIG_LIB64
+
+# use options below for superfamily validations
+#CFLAGS= -DSHOWSIM -DUNIX -DTIMES -DHZ=100 -DSFCHAR="'|'" -DMAX_WORKERS=2 -DTHR_EXIT=pthread_exit -DPROGRESS -DSUPERFAMNUM -DUSE_MMAP -D_REENTRANT
+
+# -I/usr/local/include/mysql -DMYSQL_DB
+#
+#(for mySQL databases) (also requires change to Makefile35.common)
+
+#LIB_M = -lm
+#LIB_M = -L/usr/lib/mysql -lmysqlclient -lm
+# for mySQL databases
+
+HFLAGS= -o
+NFLAGS= -o
+
+# for Linux
+THR_SUBS = pthr_subs2
+THR_LIBS = -lpthread
+THR_CC =
+
+BIN = ../bin
+XDIR = /seqprg/bin
+
+# set up files for SSE2/Altivec acceleration
+#
+include ../make/Makefile.sse_alt
+
+# no acceleration
+#
+DROPGSW_O = $(DROPGSW_NA_O)
+DROPLAL_O = $(DROPLAL_NA_O)
+DROPGNW_O = $(DROPGNW_NA_O)
+DROPLNW_O = $(DROPLNW_NA_O)
+
+# renamed (fasta36) programs
+include ../make/Makefile36m.common_mysql
+# conventional (fasta3) names
+# include ../make/Makefile.common
diff --git a/make/Makefile.linux_pgsql b/make/Makefile.linux_pgsql
new file mode 100644
index 0000000..da088d9
--- /dev/null
+++ b/make/Makefile.linux_pgsql
@@ -0,0 +1,58 @@
+# $ Id: $
+#
+# makefile for fasta3, fasta3_t. Use makefile.pvm for pvcompxx.
+#
+# Includes files for reading postgres (pgsql) databases
+#
+# This file is designed for 64-bit Linux systems.
+# -D_LARGEFILE64_SOURCE and -DBIG_LIB64 require a 64-bit linux system.
+#
+
+SHELL=/bin/bash
+
+CC= gcc -g -O
+#CC= gcc -g -DDEBUG
+#CC=/opt/parasoft/bin.linux2/insure -g -DDEBUG
+
+# this file works for x86 LINUX
+
+# standard options
+CFLAGS= -DSHOWSIM -DUNIX -DTIMES -DHZ=100 -DSFCHAR="':'" -DMAX_WORKERS=2 -DTHR_EXIT=pthread_exit -DFASTA_HOST='"your_fasta_host_here"' -DUSE_MMAP -D_REENTRANT -I/usr/local/pgsql/include -DPGSQL_DB -D_FILE_OFFSET_BITS=64 -DUSE_FSEEKO -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE -DHAS_INTTYPES -DSAMP_STATS -DPGM_DOC -DM10_CONS -DBIG_LIB64
+
+# use options below for superfamily validations
+#CFLAGS= -DSHOWSIM -DUNIX -DTIMES -DHZ=100 -DSFCHAR="'|'" -DMAX_WORKERS=2 -DTHR_EXIT=pthread_exit -DPROGRESS -DSUPERFAMNUM -DUSE_MMAP -D_REENTRANT
+
+# -I/usr/local/include/mysql -DMYSQL_DB
+#
+#(for mySQL databases) (also requires change to Makefile35.common)
+
+#LIB_M = -lm
+#LIB_M = -L/usr/lib/mysql -lmysqlclient -lm
+# for mySQL databases
+
+HFLAGS= -o
+NFLAGS= -o
+
+# for Linux
+THR_SUBS = pthr_subs2
+THR_LIBS = -lpthread
+THR_CC =
+
+BIN = ../bin
+XDIR = /seqprg/bin
+
+# set up files for SSE2/Altivec acceleration
+#
+include ../make/Makefile.sse_alt
+
+# no acceleration
+#
+DROPGSW_O = $(DROPGSW_NA_O)
+DROPLAL_O = $(DROPLAL_NA_O)
+DROPGNW_O = $(DROPGNW_NA_O)
+DROPLNW_O = $(DROPLNW_NA_O)
+
+# renamed (fasta36) programs
+include ../make/Makefile36m.common_pgsql
+# conventional (fasta3) names
+# include ../make/Makefile.common
diff --git a/make/Makefile.linux_sql b/make/Makefile.linux_sql
new file mode 100644
index 0000000..57166bf
--- /dev/null
+++ b/make/Makefile.linux_sql
@@ -0,0 +1,58 @@
+# $ Id: $
+#
+# makefile for fasta3, fasta3_t. Use makefile.pvm for pvcompxx.
+#
+# Includes files for reading MySQL and Postgres (pgsql) databases
+#
+# This file is designed for 64-bit Linux systems.
+# -D_LARGEFILE64_SOURCE and -DBIG_LIB64 require a 64-bit linux system.
+#
+
+SHELL=/bin/bash
+
+CC= gcc -g -O
+#CC= gcc -g -DDEBUG
+#CC=/opt/parasoft/bin.linux2/insure -g -DDEBUG
+
+# this file works for x86 LINUX
+
+# standard options
+CFLAGS= -DSHOWSIM -DUNIX -DTIMES -DHZ=100 -DSFCHAR="':'" -DMAX_WORKERS=2 -DTHR_EXIT=pthread_exit -DFASTA_HOST='"your_fasta_host_here"' -DUSE_MMAP -D_REENTRANT -I/usr/local/pgsql/include -I/usr/include/mysql -DPGSQL_DB -DMYSQL_DB -D_FILE_OFFSET_BITS=64 -DUSE_FSEEKO -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE -DHAS_INTTYPES -DSAMP_STATS -DPGM_DOC -DM10_CONS -DBIG_LIB64
+
+# use options below for superfamily validations
+#CFLAGS= -DSHOWSIM -DLINUX6 -DUNIX -DTIMES -DHZ=100 -DSFCHAR="'|'" -DMAX_WORKERS=2 -DTHR_EXIT=pthread_exit -DPROGRESS -DSUPERFAMNUM -DUSE_MMAP -D_REENTRANT
+
+# -I/usr/local/include/mysql -DMYSQL_DB
+#
+#(for mySQL databases) (also requires change to Makefile35.common)
+
+#LIB_M = -lm
+#LIB_M = -L/usr/lib/mysql -lmysqlclient -lm
+# for mySQL databases
+
+HFLAGS= -o
+NFLAGS= -o
+
+# for Linux
+THR_SUBS = pthr_subs2
+THR_LIBS = -lpthread
+THR_CC =
+
+BIN = ../bin
+XDIR = /seqprg/bin
+
+# set up files for SSE2/Altivec acceleration
+#
+include ../make/Makefile.sse_alt
+
+# no acceleration
+#
+DROPGSW_O = $(DROPGSW_NA_O)
+DROPLAL_O = $(DROPLAL_NA_O)
+DROPGNW_O = $(DROPGNW_NA_O)
+DROPLNW_O = $(DROPLNW_NA_O)
+
+# renamed (fasta36) programs
+include ../make/Makefile36m.common_sql
+# conventional (fasta3) names
+# include ../make/Makefile.common
diff --git a/make/Makefile.linux_sse2 b/make/Makefile.linux_sse2
new file mode 120000
index 0000000..b1a4d6b
--- /dev/null
+++ b/make/Makefile.linux_sse2
@@ -0,0 +1 @@
+Makefile.linux64_sse2
\ No newline at end of file
diff --git a/make/Makefile.mp_com2 b/make/Makefile.mp_com2
new file mode 100644
index 0000000..d4a63e9
--- /dev/null
+++ b/make/Makefile.mp_com2
@@ -0,0 +1,116 @@
+
+# combinations of files for "composite" drop* functions
+#
+DROPLNW_O = droplnw.o wm_align.o calcons_sw.o
+DROPGNW_O = dropgnw.o wm_align.o calcons_sw.o
+DROPNSW_O = dropnsw.o wm_align.o calcons_sw.o
+DROPNFA_O = drop_nfa.o wm_align.o calcons_fa.o
+DROPBD_O = dropsbd.o wm_align.o calcons_fa.o
+DROPTFA_O = drop_tfa.o
+DROPFF_O = drop_ff2.o calcons_ff.o
+DROPFS_O = drop_fs2.o calcons_fs.o
+DROPFM_O = drop_fm.o calcons_fm.o
+DROPTFF_O = drop_tff.o calcons_tff.o
+DROPTFS_O = drop_tfs.o calcons_tfs.o
+DROPTFM_O = drop_tfm.o calcons_tfm.o
+
+COMPACC_TO = compacc2_t.o # used with comp_lib5e.c/comp_lib7e.c/comp_lib8.c
+COMPACC_SO = compacc2_s.o
+COMPACC_PO = compacc2_p.o
+
+SHOWBESTC = mshowbest.c
+SHOWBESTO = showbest.o build_ares.o
+
+SHOWALIGN = mshowalign2
+SHOWALIGN_P = mshowalign2_p
+SHOWALIGN_S = mshowalign2_s
+SHOWALIGN_T = mshowalign2_t
+LSHOWALIGN = lshowalign
+
+MWH = mw.h
+MWHP = mw.h
+
+MP_PROGS = ssearch36_mpi fasta36_mpi fasts36_mpi fastx36_mpi tfastx36_mpi fasty36_mpi tfasty36_mpi tfasts36_mpi fastm36_mpi fastf36_mpi tfastf36_mpi glsearch36_mpi ggsearch36_mpi
+
+PROGS = $(MP_PROGS)
+
+all: $(PROGS)
+# clean-up: remove objects and programs; $(BIN) is emptied only when BIN is non-empty
+clean-up:
+ rm -f *.o $(PROGS); test -z "$(BIN)" || rm -rf $(BIN)/*
+# install: copy the built programs from $(BIN) into $(XDIR); stops if cd fails
+install: $(PROGS)
+ cd $(BIN) && cp $(PROGS) $(XDIR)
+
+
+ssearch36_mpi : $(COMP_THRO) ${WORK_THR_O} $(THR_SUBS).o ${COMPACC_PO} $(SHOWBESTO) re_getlib.o $(SHOWALIGN_P).o htime.o apam.o mpi_doinit.o scale_se.o karlin.o $(DROPGSW_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o
+ $(CC) $(HFLAGS) $(BIN)/$@ $(COMP_THRO) ${WORK_THR_O} $(THR_SUBS).o ${COMPACC_PO} $(SHOWBESTO) re_getlib.o $(SHOWALIGN_P).o htime.o apam.o mpi_doinit.o $(DROPGSW_O) scale_se.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o $(LIB_M) $(THR_LIBS)
+
+ssearch36s_mpi : $(COMP_THRO) ${WORK_THR_O} $(THR_SUBS).o ${COMPACC_PO} $(SHOWBESTO) re_getlib.o $(SHOWALIGN_P).o htime.o apam.o mpi_doinit.o scale_se.o karlin.o $(DROPGSW_NA_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o
+ $(CC) $(HFLAGS) $(BIN)/$@ $(COMP_THRO) ${WORK_THR_O} $(THR_SUBS).o ${COMPACC_PO} $(SHOWBESTO) re_getlib.o $(SHOWALIGN_P).o htime.o apam.o mpi_doinit.o $(DROPGSW_NA_O) scale_se.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o $(LIB_M) $(THR_LIBS)
+
+glsearch36_mpi : $(COMP_THRO) ${WORK_THR_O} $(THR_SUBS).o ${COMPACC_PO} $(SHOWBESTO) re_getlib.o $(SHOWALIGN_P).o htime.o apam.o mpi_doinit.o init_lnw.o scale_sn.o karlin.o $(DROPLNW_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o
+ $(CC) $(HFLAGS) $(BIN)/$@ $(COMP_THRO) ${WORK_THR_O} $(THR_SUBS).o ${COMPACC_PO} $(SHOWBESTO) re_getlib.o $(SHOWALIGN_P).o htime.o apam.o mpi_doinit.o init_lnw.o $(DROPLNW_O) scale_sn.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o $(LIB_M) $(THR_LIBS)
+
+glsearch36s_mpi : $(COMP_THRO) ${WORK_THR_O} $(THR_SUBS).o ${COMPACC_PO} showsum.o re_getlib.o $(SHOWALIGN_P).o htime.o apam.o mpi_doinit.o init_lnw.o scale_sn.o karlin.o $(DROPLNW_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o
+ $(CC) $(HFLAGS) $(BIN)/$@ $(COMP_THRO) ${WORK_THR_O} $(THR_SUBS).o ${COMPACC_PO} showsum.o re_getlib.o $(SHOWALIGN_P).o htime.o apam.o mpi_doinit.o init_lnw.o $(DROPLNW_O) scale_sn.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o $(LIB_M) $(THR_LIBS)
+
+ggsearch36_mpi : $(COMP_THRO) ${WORK_THR_O} $(THR_SUBS).o ${COMPACC_PO} $(SHOWBESTO) re_getlib.o $(SHOWALIGN_P).o htime.o apam.o mpi_doinit.o init_gnw.o scale_sn.o karlin.o $(DROPGNW_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o
+ $(CC) $(HFLAGS) $(BIN)/$@ $(COMP_THRO) ${WORK_THR_O} $(THR_SUBS).o ${COMPACC_PO} $(SHOWBESTO) re_getlib.o $(SHOWALIGN_P).o htime.o apam.o mpi_doinit.o init_gnw.o $(DROPGNW_O) scale_sn.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o $(LIB_M) $(THR_LIBS)
+
+fasta36_mpi : $(COMP_THRO) $(WORK_THR_O) $(THR_SUBS).o ${COMPACC_PO} $(SHOWBESTO) re_getlib.o $(SHOWALIGN_P).o htime.o apam.o mpi_doinit.o init_fa.o scale_se.o karlin.o $(DROPNFA_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o
+ $(CC) $(HFLAGS) $(BIN)/$@ $(COMP_THRO) $(WORK_THR_O) $(THR_SUBS).o ${COMPACC_PO} $(SHOWBESTO) re_getlib.o $(SHOWALIGN_P).o htime.o apam.o mpi_doinit.o init_fa.o $(DROPNFA_O) scale_se.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o $(LIB_M) $(THR_LIBS)
+
+fastf36_mpi : $(COMP_THRO) $(WORK_THR_O) $(THR_SUBS).o ${COMPACC_PO} $(SHOWBESTO) re_getlib.o $(SHOWALIGN_P).o htime.o apam.o mpi_doinit.o init_ff.o scaleswtf.o last_tat.o tatstats_ff.o karlin.o $(DROPFF_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o mrandom.o url_subs.o
+ $(CC) $(HFLAGS) $(BIN)/$@ $(COMP_THRO) $(WORK_THR_O) $(THR_SUBS).o ${COMPACC_PO} $(SHOWBESTO) re_getlib.o $(SHOWALIGN_P).o htime.o apam.o mpi_doinit.o init_ff.o $(DROPFF_O) scaleswtf.o last_tat.o tatstats_ff.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o mrandom.o url_subs.o $(LIB_M) $(THR_LIBS)
+
+fastf36s_mpi : $(COMP_THRO) $(WORK_THR_O) $(THR_SUBS).o ${COMPACC_PO} showsum.o $(SHOWALIGN_P).o htime.o apam.o mpi_doinit.o init_ff.o scaleswtf.o karlin.o $(DROPFF_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o mrandom.o url_subs.o
+ $(CC) $(HFLAGS) $(BIN)/$@ $(COMP_THRO) $(WORK_THR_O) $(THR_SUBS).o ${COMPACC_PO} showsum.o $(SHOWALIGN_P).o htime.o apam.o mpi_doinit.o init_ff.o $(DROPFF_O) scaleswtf.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o mrandom.o url_subs.o $(LIB_M) $(THR_LIBS)
+
+fasts36_mpi : $(COMP_THRO) $(WORK_THR_O) $(THR_SUBS).o ${COMPACC_PO} $(SHOWBESTO) re_getlib.o $(SHOWALIGN_P).o htime.o apam.o mpi_doinit.o init_fs.o scaleswts.o last_tat.o tatstats_fs.o karlin.o $(DROPFS_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o mrandom.o url_subs.o
+ $(CC) $(HFLAGS) $(BIN)/$@ $(COMP_THRO) $(WORK_THR_O) $(THR_SUBS).o ${COMPACC_PO} $(SHOWBESTO) re_getlib.o $(SHOWALIGN_P).o htime.o apam.o mpi_doinit.o init_fs.o $(DROPFS_O) scaleswts.o last_tat.o tatstats_fs.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o mrandom.o url_subs.o $(LIB_M) $(THR_LIBS)
+
+fastm36_mpi : $(COMP_THRO) $(WORK_THR_O) $(THR_SUBS).o ${COMPACC_PO} $(SHOWBESTO) re_getlib.o $(SHOWALIGN_P).o htime.o apam.o mpi_doinit.o init_fs.o scaleswts.o last_tat.o tatstats_fm.o karlin.o $(DROPFM_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o mrandom.o url_subs.o
+ $(CC) $(HFLAGS) $(BIN)/fastm36_mpi $(COMP_THRO) $(WORK_THR_O) $(THR_SUBS).o ${COMPACC_PO} $(SHOWBESTO) re_getlib.o $(SHOWALIGN_P).o htime.o apam.o mpi_doinit.o init_fs.o $(DROPFM_O) scaleswts.o last_tat.o tatstats_fm.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o mrandom.o url_subs.o $(LIB_M) $(THR_LIBS)
+
+fastx36_mpi : $(COMP_THRO) $(WORK_THRX_O) $(THR_SUBS).o ${COMPACC_PO} $(SHOWBESTO) re_getlib.o $(SHOWALIGN_P).o htime.o apam.o mpi_doinit.o init_fx.o faatran.o scale_se.o karlin.o drop_fx.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o
+ $(CC) $(HFLAGS) $(BIN)/fastx36_mpi $(COMP_THRO) $(WORK_THRX_O) $(THR_SUBS).o ${COMPACC_PO} $(SHOWBESTO) re_getlib.o $(SHOWALIGN_P).o htime.o apam.o mpi_doinit.o init_fx.o drop_fx.o faatran.o scale_se.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o $(LIB_M) $(THR_LIBS)
+
+fasty36_mpi : $(COMP_THRO) $(WORK_THRX_O) $(THR_SUBS).o ${COMPACC_PO} $(SHOWBESTO) re_getlib.o $(SHOWALIGN_P).o htime.o apam.o mpi_doinit.o init_fy.o faatran.o scale_se.o karlin.o drop_fz.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o
+ $(CC) $(HFLAGS) $(BIN)/fasty36_mpi $(COMP_THRO) $(WORK_THRX_O) $(THR_SUBS).o ${COMPACC_PO} $(SHOWBESTO) re_getlib.o $(SHOWALIGN_P).o htime.o apam.o mpi_doinit.o init_fy.o drop_fz.o faatran.o scale_se.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o $(LIB_M) $(THR_LIBS)
+
+tfastf36_mpi : $(COMP_THRO) $(WORK_THRX_O) $(THR_SUBS).o ${COMPACC_PO} $(SHOWBESTO) re_getlib.o $(SHOWALIGN_P).o htime.o apam.o mpi_doinit.o init_tf.o scaleswtf.o last_tat.o tatstats_ff.o karlin.o $(DROPTFF_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o mrandom.o url_subs.o
+ $(CC) $(HFLAGS) $(BIN)/tfastf36_mpi $(COMP_THRO) $(WORK_THRX_O) $(THR_SUBS).o ${COMPACC_PO} $(SHOWBESTO) re_getlib.o $(SHOWALIGN_P).o htime.o apam.o mpi_doinit.o init_tf.o $(DROPTFF_O) scaleswtf.o last_tat.o tatstats_ff.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o mrandom.o url_subs.o $(LIB_M) $(THR_LIBS)
+
+tfasts36_mpi : $(COMP_THRO) $(WORK_THRX_O) $(THR_SUBS).o ${COMPACC_PO} $(SHOWBESTO) re_getlib.o $(SHOWALIGN_P).o htime.o apam.o mpi_doinit.o init_tfs.o scaleswts.o last_tat.o tatstats_fs.o karlin.o $(DROPTFS_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o mrandom.o url_subs.o
+ $(CC) $(HFLAGS) $(BIN)/tfasts36_mpi $(COMP_THRO) $(WORK_THRX_O) $(THR_SUBS).o ${COMPACC_PO} $(SHOWBESTO) re_getlib.o $(SHOWALIGN_P).o htime.o apam.o mpi_doinit.o init_tfs.o $(DROPTFS_O) scaleswts.o last_tat.o tatstats_fs.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o mrandom.o url_subs.o $(LIB_M) $(THR_LIBS)
+
+tfastx36_mpi : $(COMP_THRO) $(WORK_THRX_O) $(THR_SUBS).o ${COMPACC_PO} $(SHOWBESTO) re_getlib.o $(SHOWALIGN_P).o htime.o apam.o mpi_doinit.o init_tfx.o scale_se.o karlin.o drop_tfx.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o url_subs.o mrandom.o
+ $(CC) $(HFLAGS) $(BIN)/tfastx36_mpi $(COMP_THRO) $(WORK_THRX_O) $(THR_SUBS).o ${COMPACC_PO} $(SHOWBESTO) re_getlib.o $(SHOWALIGN_P).o htime.o apam.o mpi_doinit.o init_tfx.o drop_tfx.o scale_se.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o url_subs.o mrandom.o $(LIB_M) $(THR_LIBS)
+
+tfasty36_mpi : $(COMP_THRO) $(WORK_THRX_O) $(THR_SUBS).o ${COMPACC_PO} $(SHOWBESTO) re_getlib.o $(SHOWALIGN_P).o htime.o apam.o mpi_doinit.o init_tfy.o scale_se.o karlin.o drop_tfz.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o url_subs.o mrandom.o
+ $(CC) $(HFLAGS) $(BIN)/tfasty36_mpi $(COMP_THRO) $(WORK_THRX_O) $(THR_SUBS).o ${COMPACC_PO} $(SHOWBESTO) re_getlib.o $(SHOWALIGN_P).o htime.o apam.o mpi_doinit.o init_tfy.o drop_tfz.o scale_se.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o url_subs.o mrandom.o $(LIB_M) $(THR_LIBS)
+
+comp_mpi4.o : comp_lib4.c mw.h structs.h defs.h param.h pcomp_bufs.h thr_buf_structs.h
+ $(CC) $(CFLAGS) -DMPI_SRC -DCOMP_MLIB -c comp_lib4.c -o comp_mpi4.o
+
+comp_mpi5.o : comp_lib5.c mw.h structs.h defs.h param.h pcomp_bufs.h thr_buf_structs.h
+ $(CC) $(CFLAGS) -DMPI_SRC -DCOMP_MLIB -c comp_lib5.c -o comp_mpi5.o
+
+comp_mpi6.o : comp_lib6.c mw.h structs.h defs.h param.h pcomp_bufs.h thr_buf_structs.h
+ $(CC) $(CFLAGS) -DMPI_SRC -DCOMP_MLIB -c comp_lib6.c -o comp_mpi6.o
+
+comp_mpi7.o : comp_lib7.c mw.h structs.h defs.h param.h pcomp_bufs.h thr_buf_structs.h
+ $(CC) $(CFLAGS) -DMPI_SRC -DCOMP_MLIB -c comp_lib7.c -o comp_mpi7.o
+
+comp_mpi9.o : comp_lib9.c mw.h structs.h defs.h param.h pcomp_bufs.h thr_buf_structs.h
+ $(CC) $(CFLAGS) -DMPI_SRC -DCOMP_MLIB -c comp_lib9.c -o comp_mpi9.o
+
+work_mpi2.o : work_thr2.c mw.h structs.h defs.h param.h pcomp_bufs.h thr_buf_structs.h
+ $(CC) -DMPI_SRC $(CFLAGS) -c work_thr2.c -o work_mpi2.o
+
+work_mpi2x.o : work_thr2.c mw.h structs.h defs.h param.h pcomp_bufs.h thr_buf_structs.h
+ $(CC) -DMPI_SRC -DTFAST $(CFLAGS) -c work_thr2.c -o work_mpi2x.o
+
+mpi_doinit.o : doinit.c defs.h param.h rstruct.h upam.h structs.h uascii.h aln_structs.h
+ $(CC) -DMPI_SRC $(CFLAGS) -c doinit.c -o mpi_doinit.o
diff --git a/make/Makefile.mpi_icc_sse2 b/make/Makefile.mpi_icc_sse2
new file mode 100644
index 0000000..9b7bb56
--- /dev/null
+++ b/make/Makefile.mpi_icc_sse2
@@ -0,0 +1,55 @@
+# $Id: Makefile.mpi_icc_sse2 849 2011-10-21 20:09:55Z wrp $
+#
+# makefile for fasta3, fasta3_t using the Intel icc compiler
+#
+# This file is designed for 64-bit Linux systems.
+# -D_LARGEFILE64_SOURCE and -DBIG_LIB64 require a 64-bit Linux system.
+#
+# uses SSE2 extensions for ssearch36_mpi
+
+SHELL=/bin/bash
+
+CC= mpicc
+#CC= mpicc-dbg -g -DDEBUG
+
+#CC=gcc -Wall -pedantic -ansi -g -O
+#CC= /usr/local/parasoft/bin/insure -g -DDEBUG
+
+# this file works for x86 LINUX
+
+# standard options
+CFLAGS= -DPCOMPLIB=MPI -DSAMP_STATS_MORE -DSHOWSIM -DUNIX -DTIMES -DHZ=100 -DMAX_WORKERS=8 -DTHR_EXIT=pthread_exit -DFASTA_HOST='"your_fasta_host_here"' -DUSE_MMAP -D_REENTRANT -DHAS_INTTYPES -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -DUSE_FSEEKO -DPGM_DOC -DBIG_LIB64 -DSAMP_STATS
+# -DSAMP_STATS_FAST -DSUPERFAMNUM -DSFCHAR="'|'"
+
+# -I/usr/local/include/mysql -DMYSQL_DB
+#
+#(for mySQL databases) (also requires change to Makefile35.common)
+
+LIB_M = -lm -lz
+#LIB_M = -L/usr/local/lib/mysql -lmysqlclient -lm
+# for mySQL databases
+
+HFLAGS= -o
+NFLAGS= -o
+
+# for Linux
+THR_SUBS = mpi_subs2
+THR_LIBS =
+THR_CC =
+
+BIN = ../bin
+XDIR = /seqprg/bin
+
+# set up files for SSE2/Altivec acceleration
+#
+include ../make/Makefile.sse_alt
+
+# select the SSE2-accelerated drop* object sets
+#
+DROPGSW_O = $(DROPGSW_SSE_O)
+DROPLAL_O = $(DROPLAL_SSE_O)
+DROPGNW_O = $(DROPGNW_SSE_O)
+DROPLNW_O = $(DROPLNW_SSE_O)
+
+include ../make/Makefile36mpi.common
+
diff --git a/make/Makefile.nm_fcom b/make/Makefile.nm_fcom
new file mode 100755
index 0000000..646fc77
--- /dev/null
+++ b/make/Makefile.nm_fcom
@@ -0,0 +1,304 @@
+
+#================ common .obj files
+
+doinit.obj : doinit.c defs.h param.h upam.h structs.h uascii.h
+ $(CC) $(CFLAGS) -c doinit.c
+
+init_sw.obj : initfa.c defs.h param.h upam.h structs.h
+ $(CC) $(CFLAGS) -c -DSSEARCH initfa.c /Foinit_sw.obj
+
+init_lal.obj : initfa.c defs.h param.h upam.h structs.h
+ $(CC) $(CFLAGS) -c -DLALIGN initfa.c /Foinit_lal.obj
+
+init_gnw.obj : initfa.c defs.h param.h upam.h structs.h
+ $(CC) $(CFLAGS) -c -DGGSEARCH initfa.c /Foinit_gnw.obj
+
+init_lnw.obj : initfa.c defs.h param.h upam.h structs.h
+ $(CC) $(CFLAGS) -c -DGLSEARCH initfa.c /Foinit_lnw.obj
+
+init_ssw.obj : initfa.c defs.h param.h upam.h structs.h
+ $(CC) $(CFLAGS) -c -DOSEARCH initfa.c /Foinit_ssw.obj
+
+init_rss.obj : initfa.c defs.h param.h upam.h structs.h
+ $(CC) $(CFLAGS) -c -DPRSS initfa.c /Foinit_rss.obj
+
+init_rfx.obj : initfa.c defs.h param.h upam.h structs.h
+ $(CC) $(CFLAGS) -c -DPRSS -DFASTX initfa.c /Foinit_rfx.obj
+
+init_fa.obj : initfa.c defs.h param.h upam.h structs.h
+ $(CC) $(CFLAGS) -c -DFASTA initfa.c /Foinit_fa.obj
+
+init_ff.obj : initfa.c defs.h param.h upam.h structs.h
+ $(CC) $(CFLAGS) -c -DFASTF initfa.c /Foinit_ff.obj
+
+init_tf.obj : initfa.c defs.h param.h upam.h structs.h
+ $(CC) $(CFLAGS) -c -DFASTF -DTFAST initfa.c /Foinit_tf.obj
+
+init_fs.obj : initfa.c defs.h param.h upam.h structs.h
+ $(CC) $(CFLAGS) -c -DFASTS initfa.c /Foinit_fs.obj
+
+init_fm.obj : initfa.c defs.h param.h upam.h structs.h
+ $(CC) $(CFLAGS) -c -DFASTM initfa.c /Foinit_fm.obj
+
+init_tfs.obj : initfa.c defs.h param.h upam.h structs.h
+ $(CC) $(CFLAGS) -c -DFASTS -DTFAST initfa.c /Foinit_tfs.obj
+
+init_tfm.obj : initfa.c defs.h param.h upam.h structs.h
+ $(CC) $(CFLAGS) -c -DFASTM -DTFAST initfa.c /Foinit_tfm.obj
+
+init_tfa.obj : initfa.c defs.h param.h upam.h structs.h
+ $(CC) $(CFLAGS) -c -DFASTA -DTFAST initfa.c /Foinit_tfa.obj
+
+init_fx.obj : initfa.c defs.h param.h upam.h structs.h
+ $(CC) $(CFLAGS) -c -DFASTX initfa.c /Foinit_fx.obj
+
+init_tfx.obj : initfa.c defs.h param.h upam.h structs.h
+ $(CC) $(CFLAGS) -c -DFASTX -DTFAST initfa.c /Foinit_tfx.obj
+
+init_fy.obj : initfa.c defs.h param.h upam.h structs.h
+ $(CC) $(CFLAGS) -c -DFASTY initfa.c /Foinit_fy.obj
+
+init_tfy.obj : initfa.c defs.h param.h upam.h structs.h
+ $(CC) $(CFLAGS) -c -DFASTY -DTFAST initfa.c /Foinit_tfy.obj
+
+#================ miscellaneous
+
+htime.obj : htime.c
+ $(CC) $(CFLAGS) -c htime.c
+
+compacc.obj : compacc.c upam.h uascii.h param.h structs.h $(MWH) defs.h
+ $(CC) $(CFLAGS) -c compacc.c
+
+compacc2_t.obj : compacc2e.c upam.h uascii.h param.h rstruct.h structs.h $(MWH) defs.h aln_structs.h drop_func.h
+ $(CC) $(THR_CC) $(CFLAGS) -DCOMP_THR -DCOMP_MLIB -c compacc2e.c /Focompacc2_t.obj
+
+compacc2_s.obj : compacc2e.c upam.h uascii.h param.h rstruct.h structs.h $(MWH) defs.h aln_structs.h drop_func.h
+ $(CC) $(THR_CC) $(CFLAGS) -c compacc2e.c /Focompacc2_s.obj
+
+pssm_asn_subs.obj : pssm_asn_subs.c defs.h
+ $(CC) $(CFLAGS) -c pssm_asn_subs.c
+
+#================ display list of best hits / alignments
+
+showbest.obj : $(SHOWBESTC) $(MWH) defs.h param.h structs.h aln_structs.h drop_func.h
+ $(CC) $(CFLAGS) -c $(SHOWBESTC) /Foshowbest.obj
+
+showrss.obj : showrss.c $(MWH) defs.h param.h structs.h aln_structs.h drop_func.h
+ $(CC) $(CFLAGS) -c showrss.c
+
+showun.obj : mshowbest.c $(MWH) defs.h aln_structs.h drop_func.h
+ $(CC) $(CFLAGS) -c -DSHOWUN mshowbest.c /Foshowun.obj
+
+showrel.obj : $(SHOWBESTC) $(MWH) defs.h aln_structs.h drop_func.h
+ $(CC) $(CFLAGS) -c -DSHOWREL $(SHOWBESTC) /Foshowrel.obj
+
+showsum.obj : showsum.c $(MWH) defs.h drop_func.h
+ $(CC) $(CFLAGS) -c showsum.c
+
+$(LSHOWALIGN).obj : $(SHOWALIGN).c $(MWHP) defs.h structs.h param.h aln_structs.h drop_func.h
+ $(CC) $(THR_CC) $(CFLAGS) -DLALIGN -c $(SHOWALIGN).c /Fo$(LSHOWALIGN).obj
+
+$(SHOWALIGN).obj : $(SHOWALIGN).c $(MWHP) defs.h structs.h param.h aln_structs.h drop_func.h
+ $(CC) $(CFLAGS) -c $(SHOWALIGN).c /Fo$(SHOWALIGN).obj
+
+$(SHOWALIGN)_u.obj : $(SHOWALIGN).c $(MWHP) defs.h structs.h param.h aln_structs.h drop_func.h
+ $(CC) $(CFLAGS) -DSHOWUN -c /Fo$(SHOWALIGN)_u.obj $(SHOWALIGN).c
+re_getlib.obj : re_getlib.c mw.h mm_file.h
+ $(CC) $(CFLAGS) -c re_getlib.c
+
+lib_sel.obj : lib_sel.c defs.h structs.h
+ $(CC) $(CFLAGS) -c lib_sel.c
+
+c_dispn.obj : c_dispn.c defs.h structs.h param.h
+ $(CC) $(CFLAGS) -c c_dispn.c
+
+build_ares.obj : build_ares.c $(MWH) defs.h param.h rstruct.h structs.h aln_structs.h drop_func.h
+ $(CC) $(THR_CC) $(CFLAGS) -c build_ares.c /Fobuild_ares.obj
+
+#================ statistical functions
+
+karlin.obj : karlin.c param.h
+ $(CC) $(CFLAGS) -c karlin.c
+
+scale_se.obj : scaleswn.c defs.h param.h structs.h $(MWH) alt_parms.h
+ $(CC) $(CFLAGS) -DLOCAL_SCORE -c scaleswn.c /Foscale_se.obj
+
+scale_sn.obj : scaleswn.c defs.h param.h structs.h $(MWH) alt_parms.h
+ $(CC) -DNORMAL_DIST $(CFLAGS) -c scaleswn.c /Foscale_sn.obj
+
+scaleswtf.obj : scaleswt.c defs.h param.h structs.h $(MWH) alt_parms.h
+ $(CC) $(CFLAGS) -DFASTF -c scaleswt.c /Foscaleswtf.obj
+
+scaleswts.obj : scaleswt.c defs.h param.h structs.h $(MWH) alt_parms.h
+ $(CC) $(CFLAGS) -c scaleswt.c /Foscaleswts.obj
+
+tatstats_fs.obj : tatstats.c tatstats.h
+ $(CC) $(CFLAGS) -c -DFASTS tatstats.c /Fotatstats_fs.obj
+
+tatstats_ff.obj : tatstats.c tatstats.h
+ $(CC) $(CFLAGS) -c -DFASTF tatstats.c /Fotatstats_ff.obj
+
+tatstats_fm.obj : tatstats.c tatstats.h
+ $(CC) $(CFLAGS) -c -DFASTM tatstats.c /Fotatstats_fm.obj
+
+last_tat.obj : last_tat.c defs.h mm_file.h structs.h param.h
+ $(CC) $(CFLAGS) -c last_tat.c
+
+last_thresh.obj : last_thresh.c defs.h mm_file.h structs.h param.h
+ $(CC) $(CFLAGS) -c last_thresh.c
+
+#================ drop functions - actual scores/alignments
+
+drop_nfa.obj : dropnfa.c dropnfa.h param.h defs.h drop_func.h
+ $(CC) $(CFLAGS) -c dropnfa.c /Fodrop_nfa.obj
+
+# drop_ff, _fs, _fm must define FASTF, FASTS, and FASTM to ensure
+# that tatstats.h is built appropriately
+
+drop_ff.obj : dropff2.c param.h defs.h tatstats.h drop_func.h
+ $(CC) $(CFLAGS) -DFASTF -c dropff2.c /Fodrop_ff.obj
+
+drop_tff.obj : dropff2.c param.h defs.h tatstats.h drop_func.h
+ $(CC) $(CFLAGS) -DFASTF -DTFAST -c dropff2.c /Fodrop_tff.obj
+
+drop_ff2.obj : dropff2.c param.h defs.h tatstats.h drop_func.h
+ $(CC) $(CFLAGS) -c -DFASTF dropff2.c /Fodrop_ff2.obj
+
+drop_tff2.obj : dropff2.c param.h defs.h tatstats.h drop_func.h
+ $(CC) $(CFLAGS) -c -DFASTF -DTFAST dropff2.c /Fodrop_tff2.obj
+
+drop_fs2.obj : dropfs2.c param.h defs.h tatstats.h drop_func.h
+ $(CC) $(CFLAGS) -DFASTS -c dropfs2.c /Fodrop_fs2.obj
+
+drop_tfs.obj : dropfs2.c param.h defs.h drop_func.h
+ $(CC) $(CFLAGS) -c -DTFAST -DFASTS dropfs2.c /Fodrop_tfs.obj
+
+drop_fm.obj : dropfs2.c param.h defs.h drop_func.h
+ $(CC) $(CFLAGS) -c -DFASTM dropfs2.c /Fodrop_fm.obj
+
+drop_tfm.obj : dropfs2.c param.h defs.h drop_func.h
+ $(CC) $(CFLAGS) -c -DTFAST -DFASTM dropfs2.c /Fodrop_tfm.obj
+
+drop_tfa.obj : dropnfa.c dropnfa.h upam.h param.h defs.h
+ $(CC) $(CFLAGS) -c -DTFASTA dropnfa.c /Fodrop_tfa.obj
+
+drop_fx.obj : dropfx2.c upam.h param.h defs.h drop_func.h
+ $(CC) $(CFLAGS) -c dropfx2.c /Fodrop_fx.obj
+
+drop_tfx.obj : dropfx2.c upam.h param.h defs.h drop_func.h
+ $(CC) $(CFLAGS) -c -DTFAST dropfx2.c /Fodrop_tfx.obj
+
+drop_fz.obj : dropfz3.c upam.h param.h defs.h aamap.h drop_func.h
+ $(CC) $(CFLAGS) -c dropfz3.c /Fodrop_fz.obj
+
+drop_tfz.obj : dropfz3.c upam.h param.h defs.h aamap.h drop_func.h
+ $(CC) $(CFLAGS) -c -DTFAST dropfz3.c /Fodrop_tfz.obj
+
+dropnsw.obj : dropnsw.c upam.h param.h structs.h drop_func.h
+ $(CC) $(CFLAGS) -c dropnsw.c
+
+dropgsw2.obj : dropgsw2.c dropgsw2.h upam.h param.h structs.h drop_func.h
+ $(CC) $(CFLAGS) -c dropgsw2.c
+
+dropgnw2.obj : dropnnw2.c upam.h param.h structs.h drop_func.h
+ $(CC) $(CFLAGS) -c dropnnw2.c /Fodropgnw2.obj
+
+droplnw2.obj : dropnnw2.c upam.h param.h structs.h drop_func.h
+ $(CC) $(CFLAGS) -c dropnnw2.c /Fodroplnw2.obj
+
+droplal.obj : dropgsw2.c dropgsw2.h upam.h param.h drop_func.h a_mark.h
+ $(CC) $(CFLAGS) -DLALIGN -c dropgsw2.c /Fodroplal.obj
+
+droplal_sse2.obj : dropgsw2.c dropgsw2.h upam.h param.h drop_func.h a_mark.h
+ $(CC) $(CFLAGS) -DLALIGN -DSW_SSE2 -c dropgsw2.c /Fodroplal_sse2.obj
+
+dropgsw2_sse2.obj : dropgsw2.c dropgsw2.h upam.h param.h structs.h drop_func.h
+ $(CC) $(CFLAGS) -DSW_SSE2 -c dropgsw2.c /Fodropgsw2_sse2.obj
+
+smith_waterman_altivec.obj : smith_waterman_altivec.c smith_waterman_altivec.h dropgsw2.h defs.h param.h
+ $(CC) $(CFLAGS) -c smith_waterman_altivec.c
+
+smith_waterman_sse2.obj : smith_waterman_sse2.c smith_waterman_sse2.h dropgsw2.h defs.h param.h
+ $(CC) $(CFLAGS) -DSW_SSE2 -c smith_waterman_sse2.c
+
+dropnw.obj : dropnw.c upam.h param.h structs.h drop_func.h
+ $(CC) $(CFLAGS) -c dropnw.c
+
+wm_align.obj : wm_align.c defs.h param.h a_mark.h
+ $(CC) $(THR_CC) $(CFLAGS) -c wm_align.c
+
+calcons_fa.obj : cal_cons2.c defs.h param.h a_mark.h
+ $(CC) $(THR_CC) $(CFLAGS) -DFASTA -c cal_cons2.c /Focalcons_fa.obj
+
+calcons_sw.obj : cal_cons2.c defs.h param.h a_mark.h
+ $(CC) $(THR_CC) $(CFLAGS) -DSSEARCH -c cal_cons2.c /Focalcons_sw.obj
+
+calcons_la.obj : cal_cons2.c defs.h param.h a_mark.h
+ $(CC) $(THR_CC) -DLALIGN -DLCAL_CONS $(CFLAGS) -c cal_cons2.c /Focalcons_la.obj
+
+calcons_ff.obj : cal_consf.c defs.h param.h a_mark.h
+ $(CC) $(THR_CC) $(CFLAGS) -DFASTF -c cal_consf.c /Focalcons_ff.obj
+
+calcons_fs.obj : cal_consf.c defs.h param.h a_mark.h
+ $(CC) $(THR_CC) $(CFLAGS) -DFASTS -c cal_consf.c /Focalcons_fs.obj
+
+calcons_fm.obj : cal_consf.c defs.h param.h a_mark.h
+ $(CC) $(THR_CC) $(CFLAGS) -DFASTM -c cal_consf.c /Focalcons_fm.obj
+
+calcons_tff.obj : cal_consf.c defs.h param.h a_mark.h
+ $(CC) $(THR_CC) $(CFLAGS) -DTFAST -DFASTF -c cal_consf.c /Focalcons_tff.obj
+
+calcons_tfs.obj : cal_consf.c defs.h param.h a_mark.h
+ $(CC) $(THR_CC) $(CFLAGS) -DTFAST -DFASTS -c cal_consf.c /Focalcons_tfs.obj
+
+calcons_tfm.obj : cal_consf.c defs.h param.h a_mark.h
+ $(CC) $(THR_CC) $(CFLAGS) -DTFAST -DFASTM -c cal_consf.c /Focalcons_tfm.obj
+
+#================ reading query, libraries
+
+getseq.obj : getseq.c defs.h uascii.h structs.h upam.h
+ $(CC) $(CFLAGS) -c getseq.c
+
+llgetaa.obj : llgetaa.c upam.h uascii.h
+ $(CC) $(CFLAGS) -c -DNOLIB llgetaa.c
+
+lgetlib.obj : $(NGETLIB).c altlib.h upam.h uascii.h mm_file.h
+ $(CC) $(CFLAGS) -c $(NGETLIB).c /Folgetlib.obj
+
+lgetaa_m.obj : mmgetaa.c altlib.h ncbl2_head.h upam.h uascii.h mm_file.h
+ $(CC) $(CFLAGS) -c mmgetaa.c /Folgetaa_m.obj
+
+ncbl_lib.obj : ncbl_lib.c ncbl_head.h
+ $(CC) $(CFLAGS) -c ncbl_lib.c
+
+ncbl2_mlib.obj : ncbl2_mlib.c ncbl2_head.h mm_file.h
+ $(CC) $(CFLAGS) -c ncbl2_mlib.c
+
+mysql_lib.obj : mysql_lib.c mm_file.h
+ $(CC) $(CFLAGS) -c mysql_lib.c
+
+pgsql_lib.obj : pgsql_lib.c mm_file.h
+ $(CC) $(CFLAGS) -c pgsql_lib.c
+
+#================ threading functions
+
+pthr_subs2.obj : pthr_subs2.c thr_bufs2.h pthr_subs.h
+ $(CC) $(CFLAGS) -c pthr_subs2.c
+
+uthr_subs.obj : uthr_subs.c thr_bufs2.h uthr_subs.h
+ $(CC) $(CFLAGS) -c uthr_subs.c
+
+#================ translation
+
+faatran.obj : faatran.c upam.h uascii.h
+ $(CC) $(CFLAGS) -c faatran.c
+
+url_subs.obj : url_subs.c structs.h param.h
+ $(CC) $(CFLAGS) -c url_subs.c
+
+$(NRAND).obj : $(NRAND).c
+ $(CC) $(CFLAGS) -c $(NRAND).c
+#================ windows getopt()
+
+getopt.obj : getopt.c
+ $(CC) $(CFLAGS) -c getopt.c
diff --git a/make/Makefile.nm_pcom b/make/Makefile.nm_pcom
new file mode 100755
index 0000000..b1530ec
--- /dev/null
+++ b/make/Makefile.nm_pcom
@@ -0,0 +1,217 @@
+
+# combinations of files for "composite" drop* functions
+#
+DROPLNW_O = droplnw2.obj wm_align.obj calcons_sw.obj
+DROPGNW_O = dropgnw2.obj wm_align.obj calcons_sw.obj
+DROPNFA_O = drop_nfa.obj wm_align.obj calcons_fa.obj
+DROPTFA_O = drop_tfa.obj
+DROPFF_O = drop_ff2.obj calcons_ff.obj
+DROPFS_O = drop_fs2.obj calcons_fs.obj
+DROPFM_O = drop_fm.obj calcons_fm.obj
+DROPTFF_O = drop_tff.obj calcons_tff.obj
+DROPTFS_O = drop_tfs.obj calcons_tfs.obj
+DROPTFM_O = drop_tfm.obj calcons_tfm.obj
+
+COMPACC_TO = compacc2_t.obj # used with comp_lib9.c
+COMPACC_SO = compacc2_s.obj
+
+SHOWBESTC = mshowbest.c
+SHOWBESTO = showbest.obj build_ares.obj
+SHOWALIGN = mshowalign2
+LSHOWALIGN = lshowalign2
+MWH = mw.h
+MWHP = mw.h
+
+TPROGS = ssearch36_t.exe fasta36_t.exe fasts36_t.exe fastx36_t.exe tfastx36_t.exe fasty36_t.exe tfasty36_t.exe tfasts36_t.exe fastm36_t.exe fastf36_t.exe tfastf36_t.exe ggsearch36_t.exe glsearch36_t.exe
+
+SPROGS = fasta36.exe ssearch36.exe fasts36.exe fastx36.exe tfastx36.exe fasty36.exe tfasty36.exe tfasts36.exe fastm36.exe tfastm36.exe fastf36.exe tfastf36.exe lalign36.exe ggsearch36.exe glsearch36.exe
+
+MAPROGS = map_db.exe
+
+XTPROGS = fastx36_t.exe tfastx36_t.exe fasty36_t.exe tfasty36_t.exe
+XPROGS = fastx36.exe tfastx36.exe fasty36.exe tfasty36.exe # same programs as XTPROGS, without the _t suffix
+
+PROGS = $(SPROGS) $(TPROGS)
+
+all : $(PROGS)
+
+tall: $(TPROGS)
+
+sall: $(SPROGS)
+
+xall: $(XTPROGS) $(XPROGS) $(ZTPROGS) $(ZPROGS)
+
+clean-up:
+ del *.obj $(PROGS)
+
+install: $(PROGS)
+ copy $(PROGS) $(XDIR)
+
+sinstall: $(SPROGS)
+ copy $(SPROGS) $(XDIR)
+
+tinstall: $(TPROGS)
+	copy $(TPROGS) $(XDIR)
+
+fasta36.exe : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_fa.obj scale_se.obj karlin.obj $(DROPNFA_O) $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj $(NRAND).obj url_subs.obj getopt.obj
+ $(CL) /Fe..\bin\fasta36.exe $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_fa.obj $(DROPNFA_O) scale_se.obj karlin.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj url_subs.obj $(NRAND).obj getopt.obj
+
+fastx36.exe : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_fx.obj scale_se.obj karlin.obj drop_fx.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj faatran.obj url_subs.obj $(NRAND).obj getopt.obj
+ $(CL) /Fe..\bin\fastx36.exe $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_fx.obj drop_fx.obj scale_se.obj karlin.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj faatran.obj url_subs.obj $(NRAND).obj getopt.obj
+
+fasty36.exe : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_fy.obj scale_se.obj karlin.obj drop_fz.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj faatran.obj url_subs.obj $(NRAND).obj getopt.obj
+ $(CL) /Fe..\bin\fasty36.exe $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_fy.obj drop_fz.obj scale_se.obj karlin.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj faatran.obj url_subs.obj $(NRAND).obj getopt.obj
+
+fastf36.exe : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_ff.obj scaleswts.obj last_tat.obj tatstats_ff.obj karlin.obj $(DROPFF_O) $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj $(NRAND).obj url_subs.obj getopt.obj
+ $(CL) /Fe..\bin\fastf36.exe $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_ff.obj $(DROPFF_O) scaleswts.obj last_tat.obj tatstats_ff.obj karlin.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj $(NRAND).obj url_subs.obj getopt.obj
+
+fastf36u : $(COMP_LIBO) $(COMPACC_SO) showun.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_ff.obj scaleswtf.obj karlin.obj $(DROPFF_O) $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj $(NRAND).obj url_subs.obj getopt.obj
+ $(CL) /Fe..\bin\fastf36u.exe $(COMP_LIBO) $(COMPACC_SO) showun.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_ff.obj $(DROPFF_O) scaleswtf.obj karlin.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj $(NRAND).obj url_subs.obj getopt.obj
+
+fastf36s : $(COMP_LIBO) $(COMPACC_SO) showsum.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_ff.obj scaleswtf.obj karlin.obj $(DROPFF_O) $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj $(NRAND).obj url_subs.obj getopt.obj
+ $(CL) /Fe..\bin\fastf36s.exe $(COMP_LIBO) $(COMPACC_SO) showsum.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_ff.obj $(DROPFF_O) scaleswtf.obj karlin.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj $(NRAND).obj url_subs.obj getopt.obj
+
+fasts36.exe : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_fs.obj scaleswts.obj last_tat.obj tatstats_fs.obj karlin.obj $(DROPFS_O) $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj $(NRAND).obj url_subs.obj getopt.obj
+ $(CL) /Fe..\bin\fasts36.exe $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_fs.obj $(DROPFS_O) scaleswts.obj last_tat.obj tatstats_fs.obj karlin.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj $(NRAND).obj url_subs.obj getopt.obj
+
+fastm36.exe : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_fm.obj scaleswts.obj last_tat.obj tatstats_fm.obj karlin.obj $(DROPFM_O) $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj $(NRAND).obj url_subs.obj getopt.obj
+ $(CL) /Fe..\bin\fastm36.exe $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_fm.obj $(DROPFM_O) scaleswts.obj last_tat.obj tatstats_fm.obj karlin.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj $(NRAND).obj url_subs.obj getopt.obj
+
+tfastx36.exe : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_tfx.obj scale_se.obj karlin.obj drop_tfx.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj faatran.obj url_subs.obj $(NRAND).obj getopt.obj
+ $(CL) /Fe..\bin\tfastx36.exe $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_tfx.obj drop_tfx.obj scale_se.obj karlin.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj faatran.obj url_subs.obj $(NRAND).obj getopt.obj
+
+tfasty36.exe : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_tfy.obj scale_se.obj karlin.obj drop_tfz.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj faatran.obj url_subs.obj $(NRAND).obj getopt.obj
+ $(CL) /Fe..\bin\tfasty36.exe $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_tfy.obj drop_tfz.obj scale_se.obj karlin.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj faatran.obj url_subs.obj $(NRAND).obj getopt.obj
+
+tfastf36.exe : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_tf.obj scaleswtf.obj last_tat.obj tatstats_ff.obj karlin.obj $(DROPTFF_O) $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj faatran.obj $(NRAND).obj url_subs.obj getopt.obj
+ $(CL) /Fe..\bin\tfastf36.exe $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_tf.obj $(DROPTFF_O) scaleswtf.obj last_tat.obj tatstats_ff.obj karlin.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj faatran.obj $(NRAND).obj url_subs.obj getopt.obj
+
+tfastf36s : $(COMP_LIBO) $(COMPACC_SO) showsum.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_tf.obj scaleswtf.obj karlin.obj $(DROPTFF_O) $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj faatran.obj $(NRAND).obj url_subs.obj getopt.obj
+ $(CL) /Fe..\bin\tfastf36s.exe $(COMP_LIBO) $(COMPACC_SO) showsum.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_tf.obj $(DROPTFF_O) scaleswtf.obj karlin.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj faatran.obj $(NRAND).obj url_subs.obj getopt.obj
+
+tfasts36.exe : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_tfs.obj scaleswts.obj tatstats_fs.obj last_tat.obj karlin.obj $(DROPTFS_O) $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj faatran.obj $(NRAND).obj url_subs.obj getopt.obj
+ $(CL) /Fe..\bin\tfasts36.exe $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_tfs.obj $(DROPTFS_O) scaleswts.obj tatstats_fs.obj last_tat.obj karlin.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj faatran.obj $(NRAND).obj url_subs.obj getopt.obj
+
+tfastm36.exe : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_tfm.obj scaleswts.obj tatstats_fm.obj last_tat.obj karlin.obj $(DROPTFM_O) $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj faatran.obj $(NRAND).obj url_subs.obj getopt.obj
+ $(CL) /Fe..\bin\tfastm36.exe $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_tfm.obj $(DROPTFM_O) scaleswts.obj tatstats_fm.obj last_tat.obj karlin.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj faatran.obj $(NRAND).obj url_subs.obj getopt.obj
+
+ssearch36nosse2.exe : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_sw.obj scale_se.obj karlin.obj $(DROPGSW_O) $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj url_subs.obj $(NRAND).obj pssm_asn_subs.obj getopt.obj
+ $(CL) /Fe..\bin\ssearch36nosse2.exe $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_sw.obj $(DROPGSW_O) scale_se.obj karlin.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj url_subs.obj $(NRAND).obj pssm_asn_subs.obj getopt.obj
+
+ggsearch36.exe : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_gnw.obj scale_sn.obj karlin.obj $(DROPGNW_O) $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj url_subs.obj $(NRAND).obj pssm_asn_subs.obj getopt.obj
+ $(CL) /Fe..\bin\ggsearch36.exe $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_gnw.obj $(DROPGNW_O) scale_sn.obj karlin.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj url_subs.obj $(NRAND).obj pssm_asn_subs.obj getopt.obj
+
+glsearch36.exe : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_lnw.obj scale_sn.obj karlin.obj $(DROPLNW_O) $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj url_subs.obj $(NRAND).obj pssm_asn_subs.obj getopt.obj
+ $(CL) /Fe..\bin\glsearch36.exe $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_lnw.obj $(DROPLNW_O) scale_sn.obj karlin.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj url_subs.obj $(NRAND).obj pssm_asn_subs.obj getopt.obj
+
+lalign36.exe : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.obj $(LSHOWALIGN).obj htime.obj apam.obj doinit.obj init_lal.obj scale_se.obj karlin.obj last_thresh.obj $(DROPLAL_O) $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj url_subs.obj $(NRAND).obj pssm_asn_subs.obj getopt.obj
+ $(CL) /Fe..\bin\lalign36.exe $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.obj $(LSHOWALIGN).obj htime.obj apam.obj doinit.obj init_lal.obj last_thresh.obj $(DROPLAL_O) scale_se.obj karlin.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj url_subs.obj $(NRAND).obj pssm_asn_subs.obj getopt.obj
+
+ssearch36.exe : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_sw.obj scale_se.obj karlin.obj $(DROPGSW_SSE2_O) $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj url_subs.obj $(NRAND).obj pssm_asn_subs.obj getopt.obj
+ $(CL) /Fe..\bin\ssearch36.exe $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_sw.obj $(DROPGSW_SSE2_O) scale_se.obj karlin.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj url_subs.obj $(NRAND).obj pssm_asn_subs.obj getopt.obj
+
+osearch36.exe : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_ssw.obj scale_se.obj karlin.obj dropnsw.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj url_subs.obj $(NRAND).obj getopt.obj
+ $(CL) /Fe..\bin\osearch36.exe $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_ssw.obj dropnsw.obj scale_se.obj karlin.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj url_subs.obj $(NRAND).obj getopt.obj
+
+usearch36.exe : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_sw.obj scale_se.obj karlin.obj dropnsw.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj url_subs.obj $(NRAND).obj getopt.obj
+ $(CL) /Fe..\bin\usearch36.exe $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_sw.obj dropnsw.obj scale_se.obj karlin.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj url_subs.obj $(NRAND).obj getopt.obj
+
+ssearch36nosse2_t.exe : $(COMP_THRO) work_thr2.obj $(THR_SUBS).obj $(COMPACC_TO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_sw.obj scale_se.obj karlin.obj $(DROPGSW_O) $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj url_subs.obj $(NRAND).obj pssm_asn_subs.obj getopt.obj
+ $(CL) /Fe..\bin\ssearch36nosse2_t.exe $(COMP_THRO) work_thr2.obj $(THR_SUBS).obj $(COMPACC_TO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_sw.obj $(DROPGSW_O) scale_se.obj karlin.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj url_subs.obj $(NRAND).obj pssm_asn_subs.obj getopt.obj $(THR_LIBS)
+
+ssearch36_t.exe : $(COMP_THRO) work_thr2.obj $(THR_SUBS).obj $(COMPACC_TO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_sw.obj scale_se.obj karlin.obj $(DROPGSW_SSE2_O) $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj url_subs.obj $(NRAND).obj pssm_asn_subs.obj getopt.obj
+ $(CL) /Fe..\bin\ssearch36_t.exe $(COMP_THRO) work_thr2.obj $(THR_SUBS).obj $(COMPACC_TO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_sw.obj $(DROPGSW_SSE2_O) scale_se.obj karlin.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj url_subs.obj $(NRAND).obj pssm_asn_subs.obj getopt.obj $(THR_LIBS)
+
+osearch36_t.exe : $(COMP_THRO) work_thr2.obj $(THR_SUBS).obj $(COMPACC_TO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_sw.obj scale_se.obj karlin.obj dropnsw.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj url_subs.obj $(NRAND).obj getopt.obj
+ $(CL) /Fe..\bin\osearch36_t.exe $(COMP_THRO) work_thr2.obj $(THR_SUBS).obj $(COMPACC_TO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_sw.obj dropnsw.obj scale_se.obj karlin.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj url_subs.obj $(NRAND).obj getopt.obj $(THR_LIBS)
+
+usearch36_t.exe : $(COMP_THRO) work_thr2.obj $(THR_SUBS).obj $(COMPACC_TO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_sw.obj scale_se.obj karlin.obj dropnsw.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj url_subs.obj $(NRAND).obj getopt.obj
+ $(CL) /Fe..\bin\usearch36_t.exe $(COMP_THRO) work_thr2.obj $(THR_SUBS).obj $(COMPACC_TO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_sw.obj dropnsw.obj scale_se.obj karlin.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj url_subs.obj $(NRAND).obj getopt.obj $(THR_LIBS)
+
+ggsearch36_t.exe : $(COMP_THRO) work_thr2.obj $(THR_SUBS).obj $(COMPACC_TO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_gnw.obj scale_sn.obj karlin.obj $(DROPGNW_O) $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj url_subs.obj $(NRAND).obj pssm_asn_subs.obj getopt.obj
+ $(CL) /Fe..\bin\ggsearch36_t.exe $(COMP_THRO) work_thr2.obj $(THR_SUBS).obj $(COMPACC_TO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_gnw.obj $(DROPGNW_O) scale_sn.obj karlin.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj url_subs.obj $(NRAND).obj pssm_asn_subs.obj getopt.obj $(THR_LIBS)
+
+glsearch36_t.exe : $(COMP_THRO) work_thr2.obj $(THR_SUBS).obj $(COMPACC_TO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_lnw.obj scale_sn.obj karlin.obj $(DROPLNW_O) $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj url_subs.obj $(NRAND).obj pssm_asn_subs.obj getopt.obj
+	$(CL) /Fe..\bin\glsearch36_t.exe $(COMP_THRO) work_thr2.obj $(THR_SUBS).obj $(COMPACC_TO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_lnw.obj $(DROPLNW_O) scale_sn.obj karlin.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj url_subs.obj $(NRAND).obj pssm_asn_subs.obj getopt.obj $(THR_LIBS)
+
+fasta36_t.exe : $(COMP_THRO) work_thr2.obj $(THR_SUBS).obj $(COMPACC_TO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_fa.obj scale_se.obj karlin.obj $(DROPNFA_O) $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj url_subs.obj $(NRAND).obj getopt.obj
+ $(CL) /Fe..\bin\fasta36_t.exe $(COMP_THRO) work_thr2.obj $(THR_SUBS).obj $(COMPACC_TO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_fa.obj $(DROPNFA_O) scale_se.obj karlin.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj url_subs.obj $(NRAND).obj getopt.obj $(THR_LIBS)
+
+fasta36s_t.exe : $(COMP_THRO) work_thr2.obj $(THR_SUBS).obj $(COMPACC_TO) showsum.obj re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_fa.obj scale_se.obj karlin.obj $(DROPNFA_O) $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj url_subs.obj $(NRAND).obj getopt.obj
+ $(CL) /Fe..\bin\fasta36s_t.exe $(COMP_THRO) work_thr2.obj $(THR_SUBS).obj $(COMPACC_TO) showsum.obj re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_fa.obj $(DROPNFA_O) scale_se.obj karlin.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj url_subs.obj $(NRAND).obj getopt.obj $(THR_LIBS)
+
+fasta36u_t.exe : $(COMP_THRO) work_thr2.obj $(THR_SUBS).obj $(COMPACC_TO) showun.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_fa.obj scale_se.obj karlin.obj $(DROPNFA_O) $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj $(NRAND).obj url_subs.obj getopt.obj
+ $(CL) /Fe..\bin\fasta36u_t.exe $(COMP_THRO) work_thr2.obj $(THR_SUBS).obj $(COMPACC_TO) showun.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_fa.obj $(DROPNFA_O) scale_se.obj karlin.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj $(NRAND).obj url_subs.obj getopt.obj $(THR_LIBS)
+
+fasta36r_t.exe : $(COMP_THRO) work_thr2.obj $(THR_SUBS).obj $(COMPACC_TO) showrel.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_fa.obj scale_se.obj karlin.obj $(DROPNFA_O) $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj $(NRAND).obj url_subs.obj getopt.obj
+ $(CL) /Fe..\bin\fasta36r_t.exe $(COMP_THRO) work_thr2.obj $(THR_SUBS).obj $(COMPACC_TO) showrel.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_fa.obj $(DROPNFA_O) scale_se.obj karlin.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj $(NRAND).obj url_subs.obj getopt.obj $(THR_LIBS)
+
+fastf36_t.exe : $(COMP_THRO) work_thr2.obj $(THR_SUBS).obj $(COMPACC_TO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_ff.obj scaleswtf.obj last_tat.obj tatstats_ff.obj karlin.obj $(DROPFF_O) $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj $(NRAND).obj url_subs.obj getopt.obj
+ $(CL) /Fe..\bin\fastf36_t.exe $(COMP_THRO) work_thr2.obj $(THR_SUBS).obj $(COMPACC_TO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_ff.obj $(DROPFF_O) scaleswtf.obj last_tat.obj tatstats_ff.obj karlin.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj $(NRAND).obj url_subs.obj getopt.obj $(THR_LIBS)
+
+fastf36s_t.exe : $(COMP_THRO) work_thr2.obj $(THR_SUBS).obj $(COMPACC_TO) showsum.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_ff.obj scaleswtf.obj karlin.obj $(DROPFF_O) $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj $(NRAND).obj url_subs.obj getopt.obj
+ $(CL) /Fe..\bin\fastf36s_t.exe $(COMP_THRO) work_thr2.obj $(THR_SUBS).obj $(COMPACC_TO) showsum.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_ff.obj $(DROPFF_O) scaleswtf.obj karlin.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj $(NRAND).obj url_subs.obj getopt.obj $(THR_LIBS)
+
+fasts36_t.exe : $(COMP_THRO) work_thr2.obj $(THR_SUBS).obj $(COMPACC_TO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_fs.obj scaleswts.obj last_tat.obj tatstats_fs.obj karlin.obj $(DROPFS_O) $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj $(NRAND).obj url_subs.obj getopt.obj
+ $(CL) /Fe..\bin\fasts36_t.exe $(COMP_THRO) work_thr2.obj $(THR_SUBS).obj $(COMPACC_TO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_fs.obj $(DROPFS_O) scaleswts.obj last_tat.obj tatstats_fs.obj karlin.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj $(NRAND).obj url_subs.obj getopt.obj $(THR_LIBS)
+
+fastm36_t.exe : $(COMP_THRO) work_thr2.obj $(THR_SUBS).obj $(COMPACC_TO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_fm.obj scaleswts.obj last_tat.obj tatstats_fm.obj karlin.obj $(DROPFM_O) $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj $(NRAND).obj url_subs.obj getopt.obj
+	$(CL) /Fe..\bin\fastm36_t.exe $(COMP_THRO) work_thr2.obj $(THR_SUBS).obj $(COMPACC_TO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_fm.obj $(DROPFM_O) scaleswts.obj last_tat.obj tatstats_fm.obj karlin.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj $(NRAND).obj url_subs.obj getopt.obj $(THR_LIBS)
+
+fastx36_t.exe : $(COMP_THRO) work_thr2.obj $(THR_SUBS).obj $(COMPACC_TO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj c_dispn.obj htime.obj apam.obj doinit.obj init_fx.obj faatran.obj scale_se.obj karlin.obj drop_fx.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj url_subs.obj $(NRAND).obj getopt.obj
+ $(CL) /Fe..\bin\fastx36_t.exe $(COMP_THRO) work_thr2.obj $(THR_SUBS).obj $(COMPACC_TO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_fx.obj drop_fx.obj faatran.obj scale_se.obj karlin.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj url_subs.obj $(NRAND).obj getopt.obj $(THR_LIBS)
+
+fasty36_t.exe : $(COMP_THRO) work_thr2.obj $(THR_SUBS).obj $(COMPACC_TO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj c_dispn.obj htime.obj apam.obj doinit.obj init_fy.obj faatran.obj scale_se.obj karlin.obj drop_fz.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj url_subs.obj $(NRAND).obj getopt.obj
+ $(CL) /Fe..\bin\fasty36_t.exe $(COMP_THRO) work_thr2.obj $(THR_SUBS).obj $(COMPACC_TO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_fy.obj drop_fz.obj faatran.obj scale_se.obj karlin.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj url_subs.obj $(NRAND).obj getopt.obj $(THR_LIBS)
+
+tfasta36.exe : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_tfa.obj scale_se.obj karlin.obj $(DROPTFA_O) $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj faatran.obj url_subs.obj $(NRAND).obj getopt.obj
+	$(CL) /Fe..\bin\tfasta36.exe $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_tfa.obj $(DROPTFA_O) scale_se.obj karlin.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj faatran.obj url_subs.obj $(NRAND).obj getopt.obj
+
+tfasta36_t.exe : $(COMP_THRO) work_thr2.obj $(THR_SUBS).obj $(COMPACC_TO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj c_dispn.obj htime.obj apam.obj doinit.obj init_tfa.obj scale_se.obj karlin.obj $(DROPTFA_O) $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj faatran.obj url_subs.obj $(NRAND).obj getopt.obj
+ $(CL) /Fe..\bin\tfasta36_t.exe $(COMP_THRO) work_thr2.obj $(THR_SUBS).obj $(COMPACC_TO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_tfa.obj $(DROPTFA_O) scale_se.obj karlin.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj faatran.obj url_subs.obj $(NRAND).obj getopt.obj $(THR_LIBS)
+
+tfastf36_t.exe : $(COMP_THRO) work_thr2.obj $(THR_SUBS).obj $(COMPACC_TO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj c_dispn.obj htime.obj apam.obj doinit.obj init_tf.obj scaleswtf.obj last_tat.obj tatstats_ff.obj karlin.obj $(DROPTFF_O) $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj faatran.obj $(NRAND).obj url_subs.obj getopt.obj
+ $(CL) /Fe..\bin\tfastf36_t.exe $(COMP_THRO) work_thr2.obj $(THR_SUBS).obj $(COMPACC_TO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_tf.obj $(DROPTFF_O) scaleswtf.obj last_tat.obj tatstats_ff.obj karlin.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj faatran.obj $(NRAND).obj url_subs.obj getopt.obj $(THR_LIBS)
+
+tfasts36_t.exe : $(COMP_THRO) work_thr2.obj $(THR_SUBS).obj $(COMPACC_TO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj c_dispn.obj htime.obj apam.obj doinit.obj init_tfs.obj scaleswts.obj last_tat.obj tatstats_fs.obj karlin.obj $(DROPTFS_O) $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj faatran.obj $(NRAND).obj url_subs.obj getopt.obj
+ $(CL) /Fe..\bin\tfasts36_t.exe $(COMP_THRO) work_thr2.obj $(THR_SUBS).obj $(COMPACC_TO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_tfs.obj $(DROPTFS_O) scaleswts.obj last_tat.obj tatstats_fs.obj karlin.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj faatran.obj $(NRAND).obj url_subs.obj getopt.obj $(THR_LIBS)
+
+tfastx36_t.exe : $(COMP_THRO) work_thr2.obj $(THR_SUBS).obj $(COMPACC_TO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_tfx.obj scale_se.obj karlin.obj drop_tfx.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj faatran.obj url_subs.obj $(NRAND).obj getopt.obj
+ $(CL) /Fe..\bin\tfastx36_t.exe $(COMP_THRO) work_thr2.obj $(THR_SUBS).obj $(COMPACC_TO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_tfx.obj drop_tfx.obj scale_se.obj karlin.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj faatran.obj url_subs.obj $(NRAND).obj getopt.obj $(THR_LIBS)
+
+tfasty36_t.exe : $(COMP_THRO) work_thr2.obj $(THR_SUBS).obj $(COMPACC_TO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_tfy.obj scale_se.obj karlin.obj drop_tfz.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj faatran.obj url_subs.obj $(NRAND).obj getopt.obj
+ $(CL) /Fe..\bin\tfasty36_t.exe $(COMP_THRO) work_thr2.obj $(THR_SUBS).obj $(COMPACC_TO) $(SHOWBESTO) re_getlib.obj $(SHOWALIGN).obj htime.obj apam.obj doinit.obj init_tfy.obj drop_tfz.obj scale_se.obj karlin.obj $(LGETLIB) c_dispn.obj $(NCBL_LIB) lib_sel.obj faatran.obj url_subs.obj $(NRAND).obj getopt.obj $(THR_LIBS)
+
+comp_lib9.obj : comp_lib9.c mw.h structs.h defs.h param.h
+ $(CC) $(CFLAGS) -c comp_lib9.c
+
+comp_mlib9.obj : comp_lib9.c mw.h structs.h defs.h param.h
+ $(CC) $(CFLAGS) -DCOMP_MLIB -c comp_lib9.c /Focomp_mlib9.obj
+
+comp_mthr9.obj : comp_lib9.c mw.h structs.h defs.h param.h thr_bufs2.h
+ $(CC) $(CFLAGS) -DCOMP_THR -DCOMP_MLIB -c comp_lib9.c /Focomp_mthr9.obj
+
+comp_lib4.obj : comp_lib4.c mw.h structs.h defs.h param.h
+ $(CC) $(CFLAGS) -c comp_lib4.c
+
+comp_mlib4.obj : comp_lib4.c mw.h structs.h defs.h param.h
+ $(CC) $(CFLAGS) -DCOMP_MLIB -c comp_lib4.c /Focomp_mlib4.obj
+
+comp_thr2.obj : comp_lib4.c mw.h structs.h defs.h param.h thr_bufs2.h
+ $(CC) $(CFLAGS) -DCOMP_THR -c comp_lib4.c /Focomp_thr2.obj
+
+comp_mthr4.obj : comp_lib4.c mw.h structs.h defs.h param.h thr_bufs2.h
+ $(CC) $(CFLAGS) -DCOMP_THR -DCOMP_MLIB -c comp_lib4.c /Focomp_mthr4.obj
+
+work_thr2.obj : work_thr2.c mw.h structs.h defs.h param.h thr_bufs2.h
+ $(CC) $(CFLAGS) -c work_thr2.c
+
+print_pssm.exe : print_pssm.c getseq.c karlin.c apam.c
+ $(CC) /Fe..\bin\print_pssm.exe $(CFLAGS) print_pssm.c getseq.c karlin.c apam.c getopt.obj
+
+map_db.exe : map_db.c uascii.h ncbl2_head.h
+ $(CC) /Fe..\bin\map_db.exe map_db.c
+
+list_db.exe : list_db.c
+ $(CC) /Fe..\bin\list_db.exe list_db.c
+
diff --git a/make/Makefile.nmk_icl b/make/Makefile.nmk_icl
new file mode 100755
index 0000000..35a706c
--- /dev/null
+++ b/make/Makefile.nmk_icl
@@ -0,0 +1,35 @@
+#
+# makefile for fasta3, fasta3_t. Use makefile.pvm for pvcompxx.
+#
+# options for Intel C compiler (v9.1)
+#
+# must be compiled/linked with /MT (or /MTd for debugging) to ensure
+# multi-threaded statically linked executables. /MD uses dynamic
+# linking to DLLs, which may not be available on the user's machine
+
+CC= icl /O2 /MT /W1
+#CC= icl /Zi /MTd /W1
+CL= icl /O2 /MT
+#CL= icl /Zi /MTd
+
+# standard options
+CFLAGS= -DSHOW_HELP -DSHOWSIM -DWIN32 -DHZ=100 -DSAMP_STATS -DPGM_DOC -DTHR_EXIT=pthread_exit -D_CRT_SECURE_NO_WARNINGS=1 -DMAX_WORKERS=2
+
+BIN = ../bin
+XDIR = /seqprg/bin
+
+THR_SUBS = pthr_subs2
+THR_LIBS= pthreadVC2.lib
+
+DROPGSW_SSE2_O = dropgsw2_sse2.obj smith_waterman_sse2.obj wm_align.obj calcons_sw.obj
+DROPGSW_O = dropgsw2.obj wm_align.obj calcons_sw.obj
+DROPLAL_SSE2_O = droplal_sse2.obj smith_waterman_sse2.obj lsim4.obj calcons_la.obj
+DROPLAL_O = droplal.obj lsim4.obj calcons_la.obj
+
+#
+
+# renamed (fasta36) programs
+include ../make/Makefile36.nmk_com
+# conventional (fasta3) names
+# include ../make/Makefile.common
+
diff --git a/make/Makefile.os_x b/make/Makefile.os_x
new file mode 100644
index 0000000..f68f4e1
--- /dev/null
+++ b/make/Makefile.os_x
@@ -0,0 +1,69 @@
+# makefile for fasta3, fasta3_t. Use makefile.pvm for pvcompxx.
+# this file works for Mac OS X (PPC)
+#
+# this file supports mmap()'ed databases in BLAST2 format; use -DUSE_MMAP
+# to enable mmap()'ed access to BLAST2 format databases.
+
+# the -DDEBUG option provides additional debugging information, particularly
+# with -D on the command line.
+
+# use -DBIG_LIB64 to generate 64-bit offsets in map_db .xin files
+
+SHELL=/bin/bash
+
+# in my hands, gcc-4.0 is about 40% slower than gcc-3.3 on the Altivec code
+CC= gcc -g -O3 -arch ppc -falign-loops=32 -O3 -maltivec -mpim-altivec -force_cpusubtype_ALL
+# -pg -finstrument-functions -lSaturn
+
+#CC= gcc-3.3 -g -falign-loops=32 -O3 -mcpu=7450 -faltivec
+#CC= gcc-3.3 -g -DDEBUG -mcpu=7450 -faltivec
+#CC= cc -g -Wall -pedantic -faltivec
+#
+# standard line for normal searching
+CFLAGS= -DSHOW_HELP -DSHOWSIM -DM10_CONS -DUNIX -DTIMES -DHZ=100 -DMAX_WORKERS=2 -DTHR_EXIT=pthread_exit -DFASTA_HOST='"fasta.bioch.virginia.edu/fasta_www2"' -DUSE_MMAP -DUSE_FSEEKO -DHAS_INTTYPES -DSAMP_STATS -DPGM_DOC -DBIG_LIB64 -DMAX_MEMK=2*1024*1024
+
+# -DSUPERFAMNUM -DSFCHAR="'|'"
+
+# add for MySQL support
+# -I/usr/local/mysql/include -DMYSQL_DB
+
+HFLAGS= -o
+NFLAGS= -o
+
+THR_SUBS = pthr_subs2
+THR_LIBS =
+THR_CC =
+
+# for IBM with current pthreads
+#CC= xlc_r -v -g
+#THR_SUBS = ibm_pthr_subs2
+#THR_LIBS = -lpthreads
+#THR_CC =
+
+
+BIN = ../bin
+# directory for universal binary process
+UDIR = $(BIN)/ppc
+
+#XDIR = ${HOME}/bin
+#XDIR = /home/slib/bin/MACOSX/
+#XDIR = /Users/seqprg/bin
+XDIR = /seqprg/bin
+#XDIR = ./ppc
+
+# set up files for SSE2/Altivec acceleration
+#
+include ../make/Makefile.sse_alt
+
+# Altivec acceleration
+#
+DROPGSW_O = $(DROPGSW_ALT_O)
+DROPLAL_O = $(DROPLAL_ALT_O)
+DROPGNW_O = $(DROPGNW_NA_O)
+DROPLNW_O = $(DROPLNW_NA_O)
+
+# provide mysql function
+#include ../make/Makefile36m.common_sql
+
+# no mysql
+include ../make/Makefile36m.common
diff --git a/make/Makefile.os_x86 b/make/Makefile.os_x86
new file mode 100644
index 0000000..afe2e1b
--- /dev/null
+++ b/make/Makefile.os_x86
@@ -0,0 +1,61 @@
+# makefile for fasta3, fasta3_t. Use makefile.pvm for pvcompxx.
+#
+# $Name: $ - $Id: Makefile.os_x86 750 2011-05-19 17:07:40Z wrp $
+#
+# 12-Dec-2007 - modified to allow compilation of both accelerated and
+# non-accelerated Smith-Waterman
+
+# the -DDEBUG option provides additional debugging information, particularly
+# with -D on the command line.
+
+# use -DBIG_LIB64 to generate 64-bit offsets in map_db .xin files
+
+SHELL=/bin/bash
+
+CC= gcc -g -O3 -arch i386 -msse2
+#CC= gcc -g -DDEBUG -arch i386 -msse2
+
+#CC= cc -g -Wall -pedantic
+#
+# standard line for normal searching
+CFLAGS= -DSHOW_HELP -DSHOWSIM -DM10_CONS -DUNIX -DTIMES -DHZ=100 -DMAX_WORKERS=4 -DTHR_EXIT=pthread_exit -DFASTA_HOST='"your.fasta.host"' -DIS_LITTLE_ENDIAN -DUSE_MMAP -DUSE_FSEEKO -DHAS_INTTYPES -DSAMP_STATS -DPGM_DOC -DBIG_LIB64 -DMAX_MEMK=2*1024*1024
+# -I/usr/local/mysql/include -DMYSQL_DB # add for MySQL support
+
+#CFLAGS= -DSHOWSIM -DM10_CONS -DUNIX -DTIMES -DHZ=60 -DMAX_WORKERS=4 -DTHR_EXIT=pthread_exit -DPROGRESS -DFASTA_HOST='"your.fasta.host"' -DIS_LITTLE_ENDIAN -DUSE_MMAP -DUSE_FSEEKO -DHAS_INTTYPES -DSAMP_STATS -DPGM_DOC -DSUPERFAMNUM -DSFCHAR="'|'"
+
+LDFLAGS= -arch i386
+
+HFLAGS= -o
+NFLAGS= -o
+
+#for Linux, MacOS, DEC Unix V4.0
+THR_SUBS = pthr_subs2
+THR_LIBS =
+THR_CC =
+
+BIN = ../bin
+#XDIR = ${HOME}/bin
+#XDIR = /home/slib/bin/MACOSX/
+#XDIR = /Users/seqprg/bin
+XDIR = /seqprg/bin
+#XDIR = ./i386
+
+# directory for universal binary process
+UDIR = $(BIN)/i386
+
+# set up files for SSE2/Altivec acceleration
+#
+include ../make/Makefile.sse_alt
+
+# SSE2 acceleration
+#
+DROPGSW_O = $(DROPGSW_SSE_O)
+DROPLAL_O = $(DROPLAL_SSE_O)
+DROPGNW_O = $(DROPGNW_NA_O)
+DROPLNW_O = $(DROPLNW_NA_O)
+
+# provide mysql function
+#include ../make/Makefile36m.common_mysql
+
+# no mysql
+include ../make/Makefile36m.common
diff --git a/make/Makefile.os_x86_64 b/make/Makefile.os_x86_64
new file mode 100644
index 0000000..8d3f7d3
--- /dev/null
+++ b/make/Makefile.os_x86_64
@@ -0,0 +1,61 @@
+# makefile for fasta3, fasta3_t. Use makefile.pvm for pvcompxx.
+#
+# $Name: $ - $Id: Makefile.os_x86_64 1228 2013-09-26 19:46:29Z wrp $
+#
+# 12-Dec-2007 - modified to allow compilation of both accelerated and
+# non-accelerated Smith-Waterman
+
+# the -DDEBUG option provides additional debugging information, particularly
+# with -D on the command line.
+
+# use -DBIG_LIB64 to generate 64-bit offsets in map_db .xin files
+
+SHELL=/bin/bash
+
+CC= cc -O -g -arch x86_64 -msse2
+#CC= cc -g -DDEBUG -fsanitize=address -arch x86_64 -msse2
+
+#CC= cc -g -Wall -pedantic
+#
+# standard line for normal searching
+CFLAGS= -DSHOW_HELP -DSHOWSIM -DUNIX -DTIMES -DHZ=100 -DMAX_WORKERS=4 -DTHR_EXIT=pthread_exit -DM10_CONS -DFASTA_HOST='"your.fasta.host.here"' -DIS_LITTLE_ENDIAN -DUSE_MMAP -DUSE_FSEEKO -DHAS_INTTYPES -DSAMP_STATS -DPGM_DOC -DBIG_LIB64 -DLIB_MEM=12
+
+#-DSHOW_ALIGN_SCORE
+# -I/usr/include/mysql -DMYSQL_DB # add for MySQL support
+
+LDFLAGS= -arch x86_64
+
+HFLAGS= -o
+NFLAGS= -o
+
+#for Linux, MacOS, DEC Unix V4.0
+THR_SUBS = pthr_subs2
+THR_LIBS =
+THR_CC =
+
+BIN = ../bin
+#XDIR = ${HOME}/bin
+#XDIR = /home/slib/bin/MACOSX/
+#XDIR = /Users/seqprg/bin
+XDIR = /seqprg/bin
+#XDIR = ./x86_64
+
+# directory for universal binary process
+UDIR = $(BIN)/x86_64
+
+# set up files for SSE2/Altivec acceleration
+#
+include ../make/Makefile.sse_alt
+
+# SSE2 acceleration
+#
+DROPGSW_O = $(DROPGSW_SSE_O)
+DROPLAL_O = $(DROPLAL_SSE_O)
+DROPGNW_O = $(DROPGNW_SSE_O)
+DROPLNW_O = $(DROPLNW_SSE_O)
+
+# provide mysql function
+#include ../make/Makefile36m.common_mysql
+
+# no mysql
+include ../make/Makefile36m.common
diff --git a/make/Makefile.os_x86_clang b/make/Makefile.os_x86_clang
new file mode 100644
index 0000000..83397a8
--- /dev/null
+++ b/make/Makefile.os_x86_clang
@@ -0,0 +1,59 @@
+# makefile for fasta3, fasta3_t. Use makefile.pvm for pvcompxx.
+#
+# $Name: $ - $Id: Makefile.os_x86_clang 581 2011-02-28 03:45:26Z wrp $
+#
+# 12-Dec-2007 - modified to allow compilation of both accelerated and
+# non-accelerated Smith-Waterman
+
+# the -DDEBUG option provides additional debugging information, particularly
+# with -D on the command line.
+
+# use -DBIG_LIB64 to generate 64-bit offsets in map_db .xin files
+
+SHELL=/bin/bash
+
+CC= clang -g -O -arch x86_64 -msse2
+#CC= clang -g -DDEBUG -arch x86_64 -msse2
+
+#CC= cc -g -Wall -pedantic
+#
+# standard line for normal searching
+CFLAGS= -DSHOWSIM -DM10_CONS -DUNIX -DTIMES -DHZ=100 -DMAX_WORKERS=4 -DTHR_EXIT=pthread_exit -DFASTA_HOST='"xs00.achs.virginia.edu/fasta_www/cgi"' -DIS_LITTLE_ENDIAN -DUSE_MMAP -DUSE_FSEEKO -DHAS_INTTYPES -DSAMP_STATS -DPGM_DOC -DBIG_LIB64
+# -I/usr/local/mysql/include -DMYSQL_DB # add for MySQL support
+
+#CFLAGS= -DSHOWSIM -DM10_CONS -DUNIX -DTIMES -DHZ=60 -DMAX_WORKERS=4 -DTHR_EXIT=pthread_exit -DPROGRESS -DFASTA_HOST='"xs00.achs.virginia.edu/fasta_www/cgi"' -DIS_LITTLE_ENDIAN -DUSE_MMAP -DUSE_FSEEKO -DHAS_INTTYPES -DSAMP_STATS -DPGM_DOC -DSUPERFAMNUM -DSFCHAR="'|'"
+
+LDFLAGS= -arch x86_64
+
+HFLAGS= -o
+NFLAGS= -o
+
+#for Linux, MacOS, DEC Unix V4.0
+THR_SUBS = pthr_subs2
+THR_LIBS =
+THR_CC =
+
+BIN = ../bin
+#XDIR = ${HOME}/bin
+#XDIR = /home/slib/bin/MACOSX/
+#XDIR = /Users/seqprg/bin
+XDIR = /seqprg/bin
+#XDIR = ./x86_64
+
+# directory for universal binary process
+UDIR = $(BIN)/x86_64
+
+# set up files for SSE2/Altivec acceleration
+#
+include ../make/Makefile.sse_alt
+
+# SSE2 acceleration; all four DROP*_O sets are needed by Makefile.pcom (same selection as Makefile.os_x86_64)
+DROPGSW_O = $(DROPGSW_SSE_O)
+DROPLAL_O = $(DROPLAL_SSE_O)
+DROPGNW_O = $(DROPGNW_SSE_O)
+DROPLNW_O = $(DROPLNW_SSE_O)
+# provide mysql function
+#include ../make/Makefile36m.common_mysql
+
+# no mysql
+include ../make/Makefile36m.common
diff --git a/make/Makefile.os_x86_icc b/make/Makefile.os_x86_icc
new file mode 100644
index 0000000..290093f
--- /dev/null
+++ b/make/Makefile.os_x86_icc
@@ -0,0 +1,61 @@
+# makefile for fasta3, fasta3_t. Use makefile.pvm for pvcompxx.
+#
+# $Name: $ - $Id: Makefile.os_x86_icc 488 2011-01-21 17:38:53Z wrp $
+#
+# 12-Dec-2007 - modified to allow compilation of both accelerated and
+# non-accelerated Smith-Waterman
+
+# the -DDEBUG option provides additional debugging information, particularly
+# with -D on the command line.
+
+# use -DBIG_LIB64 to generate 64-bit offsets in map_db .xin files
+
+SHELL=/bin/bash
+
+CC= icc -g -O -m64 # intel icc compiler
+#CC= icc -g -DDEBUG -m64
+
+#CC= cc -g -Wall -pedantic
+#
+# standard line for normal searching
+CFLAGS= -DSHOWSIM -DM10_CONS -DUNIX -DTIMES -DHZ=100 -DMAX_WORKERS=4 -DTHR_EXIT=pthread_exit -DFASTA_HOST='"xs00.achs.virginia.edu/fasta_www/cgi"' -DIS_LITTLE_ENDIAN -DUSE_MMAP -DUSE_FSEEKO -DHAS_INTTYPES -DSAMP_STATS -DPGM_DOC -DBIG_LIB64
+# -I/usr/local/mysql/include -DMYSQL_DB # add for MySQL support
+
+#CFLAGS= -DSHOWSIM -DM10_CONS -DUNIX -DTIMES -DHZ=60 -DMAX_WORKERS=4 -DTHR_EXIT=pthread_exit -DPROGRESS -DFASTA_HOST='"xs00.achs.virginia.edu/fasta_www/cgi"' -DIS_LITTLE_ENDIAN -DUSE_MMAP -DUSE_FSEEKO -DHAS_INTTYPES -DSAMP_STATS -DPGM_DOC -DSUPERFAMNUM -DSFCHAR="'|'"
+
+LDFLAGS= -arch x86_64
+
+HFLAGS= -o
+NFLAGS= -o
+
+#for Linux, MacOS, DEC Unix V4.0
+THR_SUBS = pthr_subs2
+THR_LIBS =
+THR_CC =
+
+BIN = ../bin
+#XDIR = ${HOME}/bin
+#XDIR = /home/slib/bin/MACOSX/
+#XDIR = /Users/seqprg/bin
+XDIR = /seqprg/bin
+#XDIR = ./x86_64
+
+# directory for universal binary process
+UDIR = $(BIN)/x86_64
+
+# set up files for SSE2/Altivec acceleration
+#
+include ../make/Makefile.sse_alt
+
+# SSE2 acceleration
+#
+DROPGSW_O = $(DROPGSW_SSE_O)
+DROPLAL_O = $(DROPLAL_SSE_O)
+DROPGNW_O = $(DROPGNW_SSE_O)
+DROPLNW_O = $(DROPLNW_SSE_O)
+
+# provide mysql function
+#include ../make/Makefile36m.common_mysql
+
+# no mysql
+include ../make/Makefile36m.common
diff --git a/make/Makefile.pLinux b/make/Makefile.pLinux
new file mode 100644
index 0000000..bed1ec0
--- /dev/null
+++ b/make/Makefile.pLinux
@@ -0,0 +1,83 @@
+# $Name: $ - $Id: Makefile.pLinux 488 2011-01-21 17:38:53Z wrp $
+#
+# makefile for fasta3, fasta3_t. Use makefile.pvm for pvcompxx.
+# this file works for IBM Power Linux (pLinux) with the xlc_r compiler
+#
+# this file supports mmap()'ed databases in BLAST2 format use -DUSE_MMAP
+# for mmap()ed BLAST2 format.
+
+# the -DDEBUG option provides additional debugging information, particularly
+# with -D on the command line.
+
+# use -DBIG_LIB64 to generate and use 64-bit offsets in map_db .xin
+# files
+
+# for Tru64 4.0F, no "<inttypes.h>" 4.0G has inttypes.h
+
+CC= xlc_r
+
+#CC= cc -g3 -O -std1
+#CC= insure -g -DDEBUG
+#CC= cc -g -DDEBUG -std1
+
+#CC= gcc -g -Wall
+#
+# standard line for normal searching
+CFLAGS= -O3 -qtune=auto -qarch=auto -DUNIX -DTIMES -DBIGMEM -DMAX_WORKERS=4 -DSFCHAR="':'" -DTHR_EXIT=pthread_exit -DUSE_MMAP -DIS_BIG_ENDIAN -DSAMP_STATS -DPGM_DOC -D_LARGE_FILES -DHAS_INTTYPES -D__pLinux__ -DBIG_LIB64
+#
+#(-DMYSQL_DB for mySQL databases) (also requires including Makefile36m.common_sql instead of Makefile36m.common)
+
+# special options for SUPERFAMILIES
+#CFLAGS= -DM10_CONS -DUNIX -DTIMES -DHZ=60 -DBIGMEM -DSFCHAR="'|'" -DMAX_WORKERS=4 -DTHR_EXIT=pthread_exit -DPROGRESS -DSUPERFAMNUM -DIS_LITTLE_ENDIAN -DUSE_MMAP -DMAXBEST=200000
+
+LIB_M = -lm
+#LIB_M = -L/usr/local/lib/mysql -lmysqlclient -lm
+# for mySQL databases
+
+HFLAGS= -o
+NFLAGS= -o
+
+#for DEC Unix V4.0
+#THR_SUBS = pthr_subs2
+#THR_LIBS = -lpthreads
+#THR_CC =
+
+#for Sun
+#THR_SUBS = uthr_subs
+#THR_LIBS = -lthread
+#THR_CC =
+#
+# for SGI with current pthreads
+#THR_SUBS = pthr_subs
+#THR_LIBS = -lpthreads
+#THR_CC =
+#
+# for IBM with current pthreads
+#CC= xlc_r -v -g
+#THR_SUBS = ibm_pthr_subs
+#THR_LIBS = -lpthreads
+#THR_CC =
+
+
+# for IBM Linux with current pthreads
+THR_SUBS = pthr_subs2
+THR_LIBS = -lpthread
+
+BIN = ../bin
+XDIR = /seqprg/slib/bin
+
+# set up files for SSE2/Altivec acceleration
+#
+include ../make/Makefile.sse_alt
+
+# no acceleration
+#
+DROPGSW_O = $(DROPGSW_NA_O)
+DROPLAL_O = $(DROPLAL_NA_O)
+DROPGNW_O = $(DROPGNW_NA_O)
+DROPLNW_O = $(DROPLNW_NA_O)
+
+# renamed (fasta34) programs
+#include ../make/Makefile36m.common_sql
+include ../make/Makefile36m.common
+
diff --git a/make/Makefile.pLinux_sql b/make/Makefile.pLinux_sql
new file mode 100644
index 0000000..04ea521
--- /dev/null
+++ b/make/Makefile.pLinux_sql
@@ -0,0 +1,81 @@
+# $Name: $ - $Id: Makefile.pLinux_sql 488 2011-01-21 17:38:53Z wrp $
+#
+# makefile for fasta3, fasta3_t. Use makefile.pvm for pvcompxx.
+# this file works for IBM Power Linux (pLinux) with the xlc_r compiler
+#
+# this file supports mmap()'ed databases in BLAST2 format use -DUSE_MMAP
+# for mmap()ed BLAST2 format.
+
+# the -DDEBUG option provides additional debugging information, particularly
+# with -D on the command line.
+
+# use -DBIG_LIB64 to generate and use 64-bit offsets in map_db .xin
+# files
+
+# for Tru64 4.0F, no "<inttypes.h>" 4.0G has inttypes.h
+
+CC= xlc_r
+
+#CC= cc -g3 -O -std1
+#CC= insure -g -DDEBUG
+#CC= cc -g -DDEBUG -std1
+
+#CC= gcc -g -Wall
+#
+
+CFLAGS= -O3 -qtune=auto -qarch=auto -DUNIX -DTIMES -DBIGMEM -DMAX_WORKERS=4 -DSFCHAR="':'" -DTHR_EXIT=pthread_exit -DUSE_MMAP -DIS_BIG_ENDIAN -DSAMP_STATS -DPGM_DOC -D_LARGE_FILES -DHAS_INTTYPES -D__pLinux__ -DFASTA_HOST='"fasta.bioch.virginia.edu/fasta/cgi"' -I/usr/include/mysql -DMYSQL_DB
+#
+#(-DMYSQL_DB for mySQL databases) (also requires Makefile36m.common_sql, included at the end of this file)
+
+# special options for SUPERFAMILIES
+#CFLAGS= -DM10_CONS -DUNIX -DTIMES -DHZ=60 -DBIGMEM -DSFCHAR="'|'" -DMAX_WORKERS=4 -DTHR_EXIT=pthread_exit -DPROGRESS -DSUPERFAMNUM -DIS_LITTLE_ENDIAN -DUSE_MMAP -DMAXBEST=200000
+
+#LIB_M = -lm
+LIB_M = -L/usr/local/lib/mysql -lmysqlclient -lm
+# for mySQL databases
+
+HFLAGS= -o
+NFLAGS= -o
+
+#for DEC Unix V4.0
+#THR_SUBS = pthr_subs2
+#THR_LIBS = -threads
+#THR_CC =
+
+#for Sun
+#THR_SUBS = uthr_subs
+#THR_LIBS = -lthread
+#THR_CC =
+#
+# for SGI with current pthreads
+#THR_SUBS = pthr_subs
+#THR_LIBS = -lpthreads
+#THR_CC =
+#
+# for IBM with current pthreads
+#CC= xlc_r -v -g
+#THR_SUBS = ibm_pthr_subs
+#THR_LIBS = -lpthreads
+#THR_CC =
+
+# for IBM Linux with current pthreads
+THR_SUBS = pthr_subs2
+THR_LIBS = -lpthread
+
+BIN = ../bin
+XDIR = /seqprg/slib/bin
+
+# set up files for SSE2/Altivec acceleration
+#
+include ../make/Makefile.sse_alt
+
+# no acceleration
+#
+DROPGSW_O = $(DROPGSW_NA_O)
+DROPLAL_O = $(DROPLAL_NA_O)
+DROPGNW_O = $(DROPGNW_NA_O)
+DROPLNW_O = $(DROPLNW_NA_O)
+
+# renamed (fasta34) programs
+include ../make/Makefile36m.common_sql
+
diff --git a/make/Makefile.pcom b/make/Makefile.pcom
new file mode 100644
index 0000000..ee00810
--- /dev/null
+++ b/make/Makefile.pcom
@@ -0,0 +1,229 @@
+
+# combinations of files for "composite" drop* functions
+#
+DROPNSW_O = dropnsw.o lwm_align.o calcons_sw.o
+DROPNFA_O = drop_nfa.o lwm_align.o calcons_fa.o
+DROPGGL_O = drop_nfa.o gwm_align.o calcons_fa.o
+DROPBD_O = dropsbd.o wm_align.o calcons_fa.o
+DROPTFA_O = drop_tfa.o wm_align.o calcons_tfa.o
+DROPFF_O = drop_ff2.o calcons_ff.o
+DROPFS_O = drop_fs2.o calcons_fs.o
+DROPFM_O = drop_fm.o calcons_fm.o
+DROPTFF_O = drop_tff.o calcons_tff.o
+DROPTFS_O = drop_tfs.o calcons_tfs.o
+DROPTFM_O = drop_tfm.o calcons_tfm.o
+
+#COMPACC_TO = compacc.o # used with comp_lib5.c/comp_lib7.c
+#COMPACC_SO = compacc.o
+COMPACC_TO = compacc2_t.o # used with comp_lib5e.c/comp_lib7e.c/comp_lib8.c
+COMPACC_SO = compacc2_s.o
+
+SHOWBESTC = mshowbest.c
+SHOWBESTO = showbest.o build_ares.o
+SHOWALIGN = mshowalign2
+SHOWALIGN_T = mshowalign2_t
+SHOWALIGN_S = mshowalign2_s
+LSHOWALIGN = lshowalign
+MWH = mw.h
+MWHP = mw.h
+
+TPROGS = ssearch36_t fasta36_t fasts36_t fastx36_t tfastx36_t fasty36_t tfasty36_t tfasts36_t fastm36_t fastf36_t tfastf36_t glsearch36_t ggsearch36_t
+
+SPROGS = fasta36 ssearch36 lalign36 fasts36 fastx36 tfastx36 fasty36 tfasty36 tfasts36 fastm36 tfastm36 fastf36 tfastf36 glsearch36 ggsearch36
+
+APROGS = map_db
+
+XTPROGS = fastx36_t tfastx36_t fasty36_t tfasty36_t
+XPROGS = fastx36 tfastx36 fasty36 tfasty36
+
+PROGS = $(SPROGS) $(TPROGS) $(APROGS)
+
+all: $(PROGS)
+
+tall: $(TPROGS)
+
+sall: $(SPROGS)
+
+xall: $(XTPROGS) $(XPROGS) $(ZTPROGS) $(ZPROGS)
+
+clean-up:
+ rm -f *.o $(PROGS) $(APROGS); rm -rf $(BIN)/*
+# install targets: copy built programs out of $(BIN); "cd && cp" makes a failed cd or cp fail the target and needs no bash builtins
+install: $(PROGS)
+	cd $(BIN) && cp $(PROGS) $(XDIR)
+
+uinstall: $(PROGS)
+	cd $(BIN) && cp $(PROGS) $(UDIR)
+
+sinstall: $(SPROGS)
+	cd $(BIN) && cp $(SPROGS) $(XDIR)
+
+tinstall: $(TPROGS)
+	cd $(BIN) && cp $(TPROGS) $(XDIR)
+
+fasta36 : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_fa.o scale_se.o karlin.o $(DROPNFA_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o mrandom.o url_subs.o
+ $(CC) $(HFLAGS) $(BIN)/fasta36 $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_fa.o $(DROPNFA_O) scale_se.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o $(LIB_M)
+
+fastx36 : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_fx.o scale_se.o karlin.o drop_fx.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o url_subs.o mrandom.o
+ $(CC) $(HFLAGS) $(BIN)/fastx36 $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_fx.o drop_fx.o scale_se.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o url_subs.o mrandom.o $(LIB_M)
+
+fasty36 : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_fy.o scale_se.o karlin.o drop_fz.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o url_subs.o mrandom.o
+ $(CC) $(HFLAGS) $(BIN)/fasty36 $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_fy.o drop_fz.o scale_se.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o url_subs.o mrandom.o $(LIB_M)
+
+fastf36 : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_ff.o scaleswts.o last_tat.o tatstats_ff.o karlin.o $(DROPFF_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o mrandom.o url_subs.o
+ $(CC) $(HFLAGS) $(BIN)/fastf36 $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_ff.o $(DROPFF_O) scaleswts.o last_tat.o tatstats_ff.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o mrandom.o url_subs.o $(LIB_M)
+
+fasts36 : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_fs.o scaleswts.o last_tat.o tatstats_fs.o karlin.o $(DROPFS_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o mrandom.o url_subs.o
+ $(CC) $(HFLAGS) $(BIN)/fasts36 $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_fs.o $(DROPFS_O) scaleswts.o last_tat.o tatstats_fs.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o mrandom.o url_subs.o $(LIB_M)
+
+fastm36 : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_fm.o scaleswts.o last_tat.o tatstats_fm.o karlin.o $(DROPFM_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o mrandom.o url_subs.o
+ $(CC) $(HFLAGS) $(BIN)/fastm36 $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_fm.o $(DROPFM_O) scaleswts.o last_tat.o tatstats_fm.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o mrandom.o url_subs.o $(LIB_M)
+
+tfastx36 : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_tfx.o scale_se.o karlin.o drop_tfx.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o url_subs.o mrandom.o
+ $(CC) $(HFLAGS) $(BIN)/tfastx36 $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_tfx.o drop_tfx.o scale_se.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o url_subs.o mrandom.o $(LIB_M)
+
+tfasty36 : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_tfy.o scale_se.o karlin.o drop_tfz.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o url_subs.o mrandom.o
+ $(CC) $(HFLAGS) $(BIN)/tfasty36 $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_tfy.o drop_tfz.o scale_se.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o url_subs.o mrandom.o $(LIB_M)
+
+tfastf36 : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_tf.o scaleswtf.o last_tat.o tatstats_ff.o karlin.o $(DROPTFF_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o mrandom.o url_subs.o
+ $(CC) $(HFLAGS) $(BIN)/tfastf36 $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_tf.o $(DROPTFF_O) scaleswtf.o last_tat.o tatstats_ff.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o mrandom.o url_subs.o $(LIB_M)
+
+tfastf36s : $(COMP_LIBO) $(COMPACC_SO) showsum.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_tf.o scaleswtf.o karlin.o $(DROPTFF_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o mrandom.o url_subs.o
+ $(CC) $(HFLAGS) $(BIN)/tfastf36s $(COMP_LIBO) $(COMPACC_SO) showsum.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_tf.o $(DROPTFF_O) scaleswtf.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o mrandom.o url_subs.o $(LIB_M)
+
+tfasts36 : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_tfs.o scaleswts.o tatstats_fs.o last_tat.o karlin.o $(DROPTFS_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o mrandom.o url_subs.o
+ $(CC) $(HFLAGS) $(BIN)/tfasts36 $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_tfs.o $(DROPTFS_O) scaleswts.o tatstats_fs.o last_tat.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o mrandom.o url_subs.o $(LIB_M)
+
+tfastm36 : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_tfm.o scaleswts.o tatstats_fm.o last_tat.o karlin.o $(DROPTFM_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o mrandom.o url_subs.o
+ $(CC) $(HFLAGS) $(BIN)/tfastm36 $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_tfm.o $(DROPTFM_O) scaleswts.o tatstats_fm.o last_tat.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o mrandom.o url_subs.o $(LIB_M)
+
+ssearch36 : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o scale_se.o karlin.o $(DROPGSW_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o
+ $(CC) $(HFLAGS) $(BIN)/ssearch36 $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o $(DROPGSW_O) scale_se.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o $(LIB_M)
+
+# do not use accelerated Smith-Waterman
+ssearch36s : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o scale_se.o karlin.o $(DROPGSW_NA_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o
+ $(CC) $(HFLAGS) $(BIN)/ssearch36s $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o $(DROPGSW_NA_O) scale_se.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o $(LIB_M)
+
+lalign36 : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(LSHOWALIGN).o htime.o apam.o doinit.o init_lal.o scale_se.o karlin.o last_thresh.o $(DROPLAL_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o
+ $(CC) $(HFLAGS) $(BIN)/lalign36 $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(LSHOWALIGN).o htime.o apam.o doinit.o init_lal.o $(DROPLAL_O) scale_se.o karlin.o last_thresh.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o $(LIB_M)
+
+osearch36 : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_ssw.o scale_se.o karlin.o $(DROPNSW_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o
+ $(CC) $(HFLAGS) $(BIN)/osearch36 $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_ssw.o $(DROPNSW_O) scale_se.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o $(LIB_M)
+
+glsearch36 : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o scale_sn.o karlin.o $(DROPLNW_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o
+ $(CC) $(HFLAGS) $(BIN)/glsearch36 $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o $(DROPLNW_O) scale_sn.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o $(LIB_M)
+
+ggsearch36 : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o scale_sn.o karlin.o $(DROPGNW_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o
+ $(CC) $(HFLAGS) $(BIN)/ggsearch36 $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o $(DROPGNW_O) scale_sn.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o $(LIB_M)
+
+prss36 : ssearch36
+ ln -sf ssearch36 prss36
+
+ssearch36_t : $(COMP_THRO) ${WORK_THRO} $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o scale_se.o karlin.o $(DROPGSW_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o
+ $(CC) $(HFLAGS) $(BIN)/ssearch36_t $(COMP_THRO) ${WORK_THRO} $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o $(DROPGSW_O) scale_se.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o $(LIB_M) $(THR_LIBS)
+
+ssearch36s_t : $(COMP_THRO) ${WORK_THRO} $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o scale_se.o karlin.o $(DROPGSW_NA_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o
+ $(CC) $(HFLAGS) $(BIN)/ssearch36s_t $(COMP_THRO) ${WORK_THRO} $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o $(DROPGSW_NA_O) scale_se.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o $(LIB_M) $(THR_LIBS)
+
+glsearch36_t : $(COMP_THRO) ${WORK_THRO} $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o scale_sn.o karlin.o $(DROPLNW_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o
+ $(CC) $(HFLAGS) $(BIN)/glsearch36_t $(COMP_THRO) ${WORK_THRO} $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o $(DROPLNW_O) scale_sn.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o $(LIB_M) $(THR_LIBS)
+
+glsearch36s_t : $(COMP_THRO) ${WORK_THRO} $(THR_SUBS).o $(COMPACC_TO) showsum.o re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o scale_sn.o karlin.o $(DROPLNW_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o
+ $(CC) $(HFLAGS) $(BIN)/glsearch36s_t $(COMP_THRO) ${WORK_THRO} $(THR_SUBS).o $(COMPACC_TO) showsum.o re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o $(DROPLNW_O) scale_sn.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o $(LIB_M) $(THR_LIBS)
+
+ggsearch36_t : $(COMP_THRO) ${WORK_THRO} $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o scale_sn.o karlin.o $(DROPGNW_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o
+ $(CC) $(HFLAGS) $(BIN)/ggsearch36_t $(COMP_THRO) ${WORK_THRO} $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o $(DROPGNW_O) scale_sn.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o $(LIB_M) $(THR_LIBS)
+
+ggsearch36s_t : $(COMP_THRO) ${WORK_THRO} $(THR_SUBS).o $(COMPACC_TO) showsum.o re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o scale_sn.o karlin.o $(DROPGNW_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o
+ $(CC) $(HFLAGS) $(BIN)/ggsearch36s_t $(COMP_THRO) ${WORK_THRO} $(THR_SUBS).o $(COMPACC_TO) showsum.o re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o $(DROPGNW_O) scale_sn.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o $(LIB_M) $(THR_LIBS)
+
+fasta36_t : $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_fa.o scale_se.o karlin.o $(DROPNFA_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o
+ $(CC) $(HFLAGS) $(BIN)/fasta36_t $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_fa.o $(DROPNFA_O) scale_se.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o $(LIB_M) $(THR_LIBS)
+
+fasta36sum_t : $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) showsum.o re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_fa.o scale_se.o karlin.o $(DROPNFA_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o
+ $(CC) $(HFLAGS) $(BIN)/fasta36sum_t $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) showsum.o re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_fa.o $(DROPNFA_O) scale_se.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o $(LIB_M) $(THR_LIBS)
+
+fasta36u_t : $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) showun.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_fa.o scale_se.o karlin.o $(DROPNFA_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o mrandom.o url_subs.o
+ $(CC) $(HFLAGS) $(BIN)/fasta36u_t $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) showun.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_fa.o $(DROPNFA_O) scale_se.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o mrandom.o url_subs.o $(LIB_M) $(THR_LIBS)
+
+fasta36r_t : $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) showrel.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_fa.o scale_se.o karlin.o $(DROPNFA_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o mrandom.o url_subs.o
+ $(CC) $(HFLAGS) $(BIN)/fasta36r_t $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) showrel.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_fa.o $(DROPNFA_O) scale_se.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o mrandom.o url_subs.o $(LIB_M) $(THR_LIBS)
+
+fastf36_t : $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_ff.o scaleswtf.o last_tat.o tatstats_ff.o karlin.o $(DROPFF_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o mrandom.o url_subs.o
+ $(CC) $(HFLAGS) $(BIN)/fastf36_t $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_ff.o $(DROPFF_O) scaleswtf.o last_tat.o tatstats_ff.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o mrandom.o url_subs.o $(LIB_M) $(THR_LIBS)
+
+fastf36s_t : $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) showsum.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_ff.o scaleswtf.o karlin.o $(DROPFF_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o mrandom.o url_subs.o
+ $(CC) $(HFLAGS) $(BIN)/fastf36s_t $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) showsum.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_ff.o $(DROPFF_O) scaleswtf.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o mrandom.o url_subs.o $(LIB_M) $(THR_LIBS)
+
+fasts36_t : $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_fs.o scaleswts.o last_tat.o tatstats_fs.o karlin.o $(DROPFS_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o mrandom.o url_subs.o
+ $(CC) $(HFLAGS) $(BIN)/fasts36_t $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_fs.o $(DROPFS_O) scaleswts.o last_tat.o tatstats_fs.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o mrandom.o url_subs.o $(LIB_M) $(THR_LIBS)
+
+fastm36_t : $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_fm.o scaleswts.o last_tat.o tatstats_fm.o karlin.o $(DROPFM_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o mrandom.o url_subs.o
+ $(CC) $(HFLAGS) $(BIN)/fastm36_t $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_fm.o $(DROPFM_O) scaleswts.o last_tat.o tatstats_fm.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o mrandom.o url_subs.o $(LIB_M) $(THR_LIBS)
+
+fastx36_t : $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o c_dispn.o htime.o apam.o doinit.o init_fx.o faatran.o scale_se.o karlin.o drop_fx.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o
+ $(CC) $(HFLAGS) $(BIN)/fastx36_t $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_fx.o drop_fx.o faatran.o scale_se.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o $(LIB_M) $(THR_LIBS)
+
+fasty36_t : $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o c_dispn.o htime.o apam.o doinit.o init_fy.o faatran.o scale_se.o karlin.o drop_fz.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o
+ $(CC) $(HFLAGS) $(BIN)/fasty36_t $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_fy.o drop_fz.o faatran.o scale_se.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o $(LIB_M) $(THR_LIBS)
+# serial tfasta36: link the serial $(COMPACC_SO)/$(SHOWALIGN_S) objects, like every other non-threaded target
+tfasta36 : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_tfa.o scale_se.o karlin.o $(DROPTFA_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o url_subs.o mrandom.o
+	$(CC) $(HFLAGS) $(BIN)/tfasta36 $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_tfa.o $(DROPTFA_O) scale_se.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o url_subs.o mrandom.o $(LIB_M)
+
+tfasta36_t : $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o c_dispn.o htime.o apam.o doinit.o init_tfa.o scale_se.o karlin.o $(DROPTFA_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o url_subs.o mrandom.o
+ $(CC) $(HFLAGS) $(BIN)/tfasta36_t $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_tfa.o $(DROPTFA_O) scale_se.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o url_subs.o mrandom.o $(LIB_M) $(THR_LIBS)
+
+tfastf36_t : $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o c_dispn.o htime.o apam.o doinit.o init_tf.o scaleswtf.o last_tat.o tatstats_ff.o karlin.o $(DROPTFF_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o mrandom.o url_subs.o
+ $(CC) $(HFLAGS) $(BIN)/tfastf36_t $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_tf.o $(DROPTFF_O) scaleswtf.o last_tat.o tatstats_ff.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o mrandom.o url_subs.o $(LIB_M) $(THR_LIBS)
+
+tfasts36_t : $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o c_dispn.o htime.o apam.o doinit.o init_tfs.o scaleswts.o last_tat.o tatstats_fs.o karlin.o $(DROPTFS_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o mrandom.o url_subs.o
+ $(CC) $(HFLAGS) $(BIN)/tfasts36_t $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_tfs.o $(DROPTFS_O) scaleswts.o last_tat.o tatstats_fs.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o mrandom.o url_subs.o $(LIB_M) $(THR_LIBS)
+
+tfastx36_t : $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_tfx.o scale_se.o karlin.o drop_tfx.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o url_subs.o mrandom.o
+ $(CC) $(HFLAGS) $(BIN)/tfastx36_t $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_tfx.o drop_tfx.o scale_se.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o url_subs.o mrandom.o $(LIB_M) $(THR_LIBS)
+
+tfasty36_t : $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_tfy.o scale_se.o karlin.o drop_tfz.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o url_subs.o mrandom.o
+ $(CC) $(HFLAGS) $(BIN)/tfasty36_t $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_tfy.o drop_tfz.o scale_se.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o url_subs.o mrandom.o $(LIB_M) $(THR_LIBS)
+
+comp_mlib5e.o : comp_lib5e.c mw.h structs.h defs.h param.h
+ $(CC) $(THR_CC) $(CFLAGS) -DCOMP_MLIB -c comp_lib5e.c -o comp_mlib5e.o
+
+comp_mthr5e.o : comp_lib5e.c mw.h structs.h defs.h param.h thr_bufs2.h thr_buf_structs.h
+ $(CC) $(THR_CC) $(CFLAGS) -DCOMP_THR -DCOMP_MLIB -c comp_lib5e.c -o comp_mthr5e.o
+
+comp_mlib7e.o : comp_lib7e.c mw.h structs.h defs.h param.h
+ $(CC) $(THR_CC) $(CFLAGS) -DCOMP_MLIB -c comp_lib7e.c -o comp_mlib7e.o
+
+comp_mthr7e.o : comp_lib7e.c mw.h structs.h defs.h param.h thr_bufs2.h thr_buf_structs.h
+ $(CC) $(THR_CC) $(CFLAGS) -DCOMP_THR -DCOMP_MLIB -c comp_lib7e.c -o comp_mthr7e.o
+
+comp_mlib8.o : comp_lib8.c mw.h structs.h defs.h param.h
+ $(CC) $(THR_CC) $(CFLAGS) -DCOMP_MLIB -c comp_lib8.c -o comp_mlib8.o
+
+comp_mthr8.o : comp_lib8.c mw.h structs.h defs.h param.h thr_bufs2.h thr_buf_structs.h
+ $(CC) $(THR_CC) $(CFLAGS) -DCOMP_THR -DCOMP_MLIB -c comp_lib8.c -o comp_mthr8.o
+
+comp_mlib9.o : comp_lib9.c mw.h structs.h defs.h param.h
+ $(CC) $(THR_CC) $(CFLAGS) -DCOMP_MLIB -c comp_lib9.c -o comp_mlib9.o
+
+comp_mthr9.o : comp_lib9.c mw.h structs.h defs.h param.h thr_bufs2.h thr_buf_structs.h
+ $(CC) $(THR_CC) $(CFLAGS) -DCOMP_THR -DCOMP_MLIB -c comp_lib9.c -o comp_mthr9.o
+
+work_thr2.o : work_thr2.c mw.h structs.h defs.h param.h thr_bufs2.h thr_buf_structs.h
+ $(CC) $(THR_CC) $(CFLAGS) -c work_thr2.c
+
+print_pssm : print_pssm.c getseq.c karlin.c apam.c pssm_asn_subs.c # prerequisites mirror the sources named on the compile line
+	$(CC) -o print_pssm $(CFLAGS) print_pssm.c getseq.c karlin.c apam.c pssm_asn_subs.c $(LIB_M)
+
+map_db : map_db.c uascii.h ncbl2_head.h
+ $(CC) $(CFLAGS) -o $(BIN)/map_db map_db.c
+
+list_db : list_db.c
+ $(CC) $(CFLAGS) -o $(BIN)/list_db list_db.c
+
+
+lav2ps : lav2plt.o lavplt_ps.o
+ $(CC) -DUNIX -o $(BIN)/lav2ps lav2plt.o lavplt_ps.o -lm
+
+lav2svg : lav2plt.o lavplt_svg.o
+ $(CC) -DUNIX -o $(BIN)/lav2svg lav2plt.o lavplt_svg.o -lm
diff --git a/make/Makefile.pcom_s b/make/Makefile.pcom_s
new file mode 100644
index 0000000..0cad72b
--- /dev/null
+++ b/make/Makefile.pcom_s
@@ -0,0 +1,162 @@
+
+# combinations of files for "composite" drop* functions
+#
+DROPNSW_O = dropnsw.o wm_align.o calcons_sw.o
+DROPNFA_O = drop_nfa.o wm_align.o calcons_fa.o
+DROPBD_O = dropsbd.o wm_align.o calcons_fa.o
+DROPTFA_O = drop_tfa.o wm_align.o calcons_tfa.o
+DROPFF_O = drop_ff2.o calcons_ff.o
+DROPFS_O = drop_fs2.o calcons_fs.o
+DROPFM_O = drop_fm.o calcons_fm.o
+DROPTFF_O = drop_tff.o calcons_tff.o
+DROPTFS_O = drop_tfs.o calcons_tfs.o
+DROPTFM_O = drop_tfm.o calcons_tfm.o
+
+#COMPACC_TO = compacc.o # used with comp_lib5.c/comp_lib7.c
+#COMPACC_SO = compacc.o
+COMPACC_TO = compacc2_t.o # used with comp_lib5e.c/comp_lib7e.c/comp_lib8.c
+COMPACC_SO = compacc2_s.o
+
+SHOWBESTC = mshowbest.c
+SHOWBESTO = showbest.o build_ares.o
+SHOWALIGN = mshowalign2
+SHOWALIGN_T = mshowalign2_t
+SHOWALIGN_S = mshowalign2_s
+LSHOWALIGN = lshowalign
+MWH = mw.h
+MWHP = mw.h
+
+SPROGS = fasta36 ssearch36 lalign36 fasts36 fastx36 tfastx36 fasty36 tfasty36 tfasts36 fastm36 tfastm36 fastf36 tfastf36 glsearch36 ggsearch36
+
+APROGS = map_db
+
+XPROGS = fastx36 tfastx36 fasty36 tfasty36
+
+PROGS = $(SPROGS) $(APROGS)
+
+all: $(PROGS)
+
+tall: $(TPROGS)
+
+sall: $(SPROGS)
+
+xall: $(XPROGS)
+
+clean-up: # removes objects here and everything under BIN; the BIN wipe is skipped when BIN is empty so it can never expand to "rm -rf /*"
+	rm -f *.o $(PROGS); test -z "$(BIN)" || rm -rf $(BIN)/*
+
+install: $(PROGS) # install rules copy from BIN; plain "cd ... &&" is used because pushd/popd are not available in a POSIX /bin/sh
+	cd $(BIN) && cp $(PROGS) $(XDIR)
+
+uinstall: $(PROGS) # same as install, but into UDIR
+	cd $(BIN) && cp $(PROGS) $(UDIR)
+
+sinstall: $(SPROGS) # serial programs only
+	cd $(BIN) && cp $(SPROGS) $(XDIR)
+
+tinstall: $(TPROGS) # NOTE(review): TPROGS is not assigned in this file - presumably supplied by the including makefile; confirm
+	cd $(BIN) && cp $(TPROGS) $(XDIR)
+
+fasta36 : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_fa.o scale_se.o karlin.o $(DROPNFA_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o mrandom.o url_subs.o
+ $(CC) $(HFLAGS) $(BIN)/fasta36 $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_fa.o $(DROPNFA_O) scale_se.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o $(LIB_M)
+
+fastx36 : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_fx.o scale_se.o karlin.o drop_fx.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o url_subs.o mrandom.o
+ $(CC) $(HFLAGS) $(BIN)/fastx36 $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_fx.o drop_fx.o scale_se.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o url_subs.o mrandom.o $(LIB_M)
+
+fasty36 : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_fy.o scale_se.o karlin.o drop_fz.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o url_subs.o mrandom.o
+ $(CC) $(HFLAGS) $(BIN)/fasty36 $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_fy.o drop_fz.o scale_se.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o url_subs.o mrandom.o $(LIB_M)
+
+fastf36 : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_ff.o scaleswts.o last_tat.o tatstats_ff.o karlin.o $(DROPFF_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o mrandom.o url_subs.o
+ $(CC) $(HFLAGS) $(BIN)/fastf36 $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_ff.o $(DROPFF_O) scaleswts.o last_tat.o tatstats_ff.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o mrandom.o url_subs.o $(LIB_M)
+
+fasts36 : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_fs.o scaleswts.o last_tat.o tatstats_fs.o karlin.o $(DROPFS_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o mrandom.o url_subs.o
+ $(CC) $(HFLAGS) $(BIN)/fasts36 $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_fs.o $(DROPFS_O) scaleswts.o last_tat.o tatstats_fs.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o mrandom.o url_subs.o $(LIB_M)
+
+fastm36 : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_fm.o scaleswts.o last_tat.o tatstats_fm.o karlin.o $(DROPFM_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o mrandom.o url_subs.o
+ $(CC) $(HFLAGS) $(BIN)/fastm36 $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_fm.o $(DROPFM_O) scaleswts.o last_tat.o tatstats_fm.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o mrandom.o url_subs.o $(LIB_M)
+
+tfastx36 : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_tfx.o scale_se.o karlin.o drop_tfx.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o url_subs.o mrandom.o
+ $(CC) $(HFLAGS) $(BIN)/tfastx36 $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_tfx.o drop_tfx.o scale_se.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o url_subs.o mrandom.o $(LIB_M)
+
+tfasty36 : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_tfy.o scale_se.o karlin.o drop_tfz.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o url_subs.o mrandom.o
+ $(CC) $(HFLAGS) $(BIN)/tfasty36 $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_tfy.o drop_tfz.o scale_se.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o url_subs.o mrandom.o $(LIB_M)
+
+tfasta36 : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_tfa.o scale_se.o karlin.o $(DROPTFA_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o url_subs.o mrandom.o # links the _S showalign object like every other COMP_LIBO rule in this file
+	$(CC) $(HFLAGS) $(BIN)/tfasta36 $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_tfa.o $(DROPTFA_O) scale_se.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o url_subs.o mrandom.o $(LIB_M)
+
+tfastf36 : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_tf.o scaleswtf.o last_tat.o tatstats_ff.o karlin.o $(DROPTFF_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o mrandom.o url_subs.o
+ $(CC) $(HFLAGS) $(BIN)/tfastf36 $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_tf.o $(DROPTFF_O) scaleswtf.o last_tat.o tatstats_ff.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o mrandom.o url_subs.o $(LIB_M)
+
+tfastf36s : $(COMP_LIBO) $(COMPACC_SO) showsum.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_tf.o scaleswtf.o karlin.o $(DROPTFF_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o mrandom.o url_subs.o
+ $(CC) $(HFLAGS) $(BIN)/tfastf36s $(COMP_LIBO) $(COMPACC_SO) showsum.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_tf.o $(DROPTFF_O) scaleswtf.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o mrandom.o url_subs.o $(LIB_M)
+
+tfasts36 : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_tfs.o scaleswts.o tatstats_fs.o last_tat.o karlin.o $(DROPTFS_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o mrandom.o url_subs.o
+ $(CC) $(HFLAGS) $(BIN)/tfasts36 $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_tfs.o $(DROPTFS_O) scaleswts.o tatstats_fs.o last_tat.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o mrandom.o url_subs.o $(LIB_M)
+
+tfastm36 : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_tfm.o scaleswts.o tatstats_fm.o last_tat.o karlin.o $(DROPTFM_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o mrandom.o url_subs.o
+ $(CC) $(HFLAGS) $(BIN)/tfastm36 $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_tfm.o $(DROPTFM_O) scaleswts.o tatstats_fm.o last_tat.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o mrandom.o url_subs.o $(LIB_M)
+
+ssearch36 : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o scale_se.o karlin.o $(DROPGSW_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o
+ $(CC) $(HFLAGS) $(BIN)/ssearch36 $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o $(DROPGSW_O) scale_se.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o $(LIB_M)
+
+# do not use accelerated Smith-Waterman
+ssearch36s : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o scale_se.o karlin.o $(DROPGSW_NA_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o
+ $(CC) $(HFLAGS) $(BIN)/ssearch36s $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o $(DROPGSW_NA_O) scale_se.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o $(LIB_M)
+
+lalign36 : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(LSHOWALIGN).o htime.o apam.o doinit.o init_lal.o scale_se.o karlin.o last_thresh.o $(DROPLAL_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o
+ $(CC) $(HFLAGS) $(BIN)/lalign36 $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(LSHOWALIGN).o htime.o apam.o doinit.o init_lal.o $(DROPLAL_O) scale_se.o karlin.o last_thresh.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o $(LIB_M)
+
+osearch36 : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_ssw.o scale_se.o karlin.o $(DROPNSW_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o
+ $(CC) $(HFLAGS) $(BIN)/osearch36 $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o init_ssw.o $(DROPNSW_O) scale_se.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o $(LIB_M)
+
+glsearch36 : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o scale_sn.o karlin.o $(DROPLNW_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o
+ $(CC) $(HFLAGS) $(BIN)/glsearch36 $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o $(DROPLNW_O) scale_sn.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o $(LIB_M)
+
+ggsearch36 : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o scale_sn.o karlin.o $(DROPGNW_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o
+ $(CC) $(HFLAGS) $(BIN)/ggsearch36 $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_S).o htime.o apam.o doinit.o $(DROPGNW_O) scale_sn.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o $(LIB_M)
+
+prss36 : ssearch36 # prss36 is a symlink to ssearch36; it is created next to the binary in BIN so the relative link resolves
+	ln -sf ssearch36 $(BIN)/prss36
+
+comp_mlib5e.o : comp_lib5e.c mw.h structs.h defs.h param.h
+ $(CC) $(THR_CC) $(CFLAGS) -DCOMP_MLIB -c comp_lib5e.c -o comp_mlib5e.o
+
+comp_mthr5e.o : comp_lib5e.c mw.h structs.h defs.h param.h thr_bufs2.h thr_buf_structs.h
+ $(CC) $(THR_CC) $(CFLAGS) -DCOMP_THR -DCOMP_MLIB -c comp_lib5e.c -o comp_mthr5e.o
+
+comp_mlib7e.o : comp_lib7e.c mw.h structs.h defs.h param.h
+ $(CC) $(THR_CC) $(CFLAGS) -DCOMP_MLIB -c comp_lib7e.c -o comp_mlib7e.o
+
+comp_mthr7e.o : comp_lib7e.c mw.h structs.h defs.h param.h thr_bufs2.h thr_buf_structs.h
+ $(CC) $(THR_CC) $(CFLAGS) -DCOMP_THR -DCOMP_MLIB -c comp_lib7e.c -o comp_mthr7e.o
+
+comp_mlib8.o : comp_lib8.c mw.h structs.h defs.h param.h
+ $(CC) $(THR_CC) $(CFLAGS) -DCOMP_MLIB -c comp_lib8.c -o comp_mlib8.o
+
+comp_mthr8.o : comp_lib8.c mw.h structs.h defs.h param.h thr_bufs2.h thr_buf_structs.h
+ $(CC) $(THR_CC) $(CFLAGS) -DCOMP_THR -DCOMP_MLIB -c comp_lib8.c -o comp_mthr8.o
+
+comp_mlib9.o : comp_lib9.c mw.h structs.h defs.h param.h
+ $(CC) $(THR_CC) $(CFLAGS) -DCOMP_MLIB -c comp_lib9.c -o comp_mlib9.o
+
+comp_mthr9.o : comp_lib9.c mw.h structs.h defs.h param.h thr_bufs2.h thr_buf_structs.h
+ $(CC) $(THR_CC) $(CFLAGS) -DCOMP_THR -DCOMP_MLIB -c comp_lib9.c -o comp_mthr9.o
+
+work_thr2.o : work_thr2.c mw.h structs.h defs.h param.h thr_bufs2.h thr_buf_structs.h
+ $(CC) $(THR_CC) $(CFLAGS) -c work_thr2.c
+
+print_pssm : print_pssm.c getseq.c karlin.c apam.c pssm_asn_subs.c
+ $(CC) -o print_pssm $(CFLAGS) print_pssm.c getseq.c karlin.c apam.c pssm_asn_subs.c $(LIB_M)
+
+map_db : map_db.c uascii.h ncbl2_head.h
+ $(CC) $(CFLAGS) -o $(BIN)/map_db map_db.c
+
+list_db : list_db.c
+ $(CC) $(CFLAGS) -o $(BIN)/list_db list_db.c
+
+
+lav2ps : lav2plt.o lavplt_ps.o
+ $(CC) -DUNIX -o $(BIN)/lav2ps lav2plt.o lavplt_ps.o -lm
+
+lav2svg : lav2plt.o lavplt_svg.o
+ $(CC) -DUNIX -o $(BIN)/lav2svg lav2plt.o lavplt_svg.o -lm
diff --git a/make/Makefile.pcom_t b/make/Makefile.pcom_t
new file mode 100644
index 0000000..ead70a1
--- /dev/null
+++ b/make/Makefile.pcom_t
@@ -0,0 +1,184 @@
+
+# combinations of files for "composite" drop* functions
+#
+DROPNSW_O = dropnsw.o wm_align.o calcons_sw.o
+DROPNFA_O = drop_nfa.o wm_align.o calcons_fa.o
+DROPBD_O = dropsbd.o wm_align.o calcons_fa.o
+DROPTFA_O = drop_tfa.o wm_align.o calcons_tfa.o
+DROPFF_O = drop_ff2.o calcons_ff.o
+DROPFS_O = drop_fs2.o calcons_fs.o
+DROPFM_O = drop_fm.o calcons_fm.o
+DROPTFF_O = drop_tff.o calcons_tff.o
+DROPTFS_O = drop_tfs.o calcons_tfs.o
+DROPTFM_O = drop_tfm.o calcons_tfm.o
+COMPACC_SO = compacc.o
+COMPACC_TO = compacc.o
+
+#COMPACC_TO = compacc.o # used with comp_lib5.c/comp_lib7.c
+#COMPACC_SO = compacc.o
+COMPACC_TO = compacc2_t.o # used with comp_lib5e.c/comp_lib7e.c/comp_lib8.c
+COMPACC_SO = compacc2_s.o
+
+SHOWBESTC = mshowbest.c
+SHOWBESTO = showbest.o build_ares.o
+SHOWALIGN = mshowalign2
+SHOWALIGN_T = mshowalign2_t
+SHOWALIGN_S = mshowalign2_s
+LSHOWALIGN = lshowalign
+MWH = mw.h
+MWHP = mw.h
+
+TPROGS = fasta36 ssearch36 lalign36 fasts36 fastx36 tfastx36 fasty36 tfasty36 tfasts36 fastm36 tfastm36 fastf36 tfastf36 glsearch36 ggsearch36
+
+APROGS = map_db
+
+PROGS = $(SPROGS) $(TPROGS) $(APROGS)
+
+all: $(PROGS)
+
+tall: $(TPROGS)
+
+clean-up: # removes objects here and everything under BIN; the BIN wipe is skipped when BIN is empty so it can never expand to "rm -rf /*"
+	rm -f *.o $(PROGS); test -z "$(BIN)" || rm -rf $(BIN)/*
+
+install: $(PROGS) # install rules copy from BIN; plain "cd ... &&" is used because pushd/popd are not available in a POSIX /bin/sh
+	cd $(BIN) && cp $(PROGS) $(XDIR)
+
+uinstall: $(PROGS) # same as install, but into UDIR
+	cd $(BIN) && cp $(PROGS) $(UDIR)
+
+sinstall: $(SPROGS) # NOTE(review): SPROGS is not assigned in this file - presumably supplied by the including makefile; confirm
+	cd $(BIN) && cp $(SPROGS) $(XDIR)
+
+tinstall: $(TPROGS) # threaded programs only
+	cd $(BIN) && cp $(TPROGS) $(XDIR)
+
+lalign36 : $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(LSHOWALIGN).o htime.o apam.o doinit.o init_lal.o scale_se.o karlin.o last_thresh.o $(DROPLAL_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o
+ $(CC) $(HFLAGS) $(BIN)/lalign36 $(COMP_LIBO) $(COMPACC_SO) $(SHOWBESTO) re_getlib.o $(LSHOWALIGN).o htime.o apam.o doinit.o init_lal.o $(DROPLAL_O) scale_se.o karlin.o last_thresh.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o $(LIB_M)
+
+ssearch36 : $(COMP_THRO) ${WORK_THRO} $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o scale_se.o karlin.o $(DROPGSW_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o
+ $(CC) $(HFLAGS) $(BIN)/ssearch36 $(COMP_THRO) ${WORK_THRO} $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o $(DROPGSW_O) scale_se.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o $(LIB_M) $(THR_LIBS)
+
+ssearch36s : $(COMP_THRO) ${WORK_THRO} $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o scale_se.o karlin.o $(DROPGSW_NA_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o
+ $(CC) $(HFLAGS) $(BIN)/ssearch36s $(COMP_THRO) ${WORK_THRO} $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o $(DROPGSW_NA_O) scale_se.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o $(LIB_M) $(THR_LIBS)
+
+glsearch36 : $(COMP_THRO) ${WORK_THRO} $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o scale_sn.o karlin.o $(DROPLNW_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o
+ $(CC) $(HFLAGS) $(BIN)/glsearch36 $(COMP_THRO) ${WORK_THRO} $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o $(DROPLNW_O) scale_sn.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o $(LIB_M) $(THR_LIBS)
+
+glsearch36s : $(COMP_THRO) ${WORK_THRO} $(THR_SUBS).o $(COMPACC_TO) showsum.o re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o scale_sn.o karlin.o $(DROPLNW_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o
+ $(CC) $(HFLAGS) $(BIN)/glsearch36s $(COMP_THRO) ${WORK_THRO} $(THR_SUBS).o $(COMPACC_TO) showsum.o re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o $(DROPLNW_O) scale_sn.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o $(LIB_M) $(THR_LIBS)
+
+ggsearch36 : $(COMP_THRO) ${WORK_THRO} $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o scale_sn.o karlin.o $(DROPGNW_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o
+ $(CC) $(HFLAGS) $(BIN)/ggsearch36 $(COMP_THRO) ${WORK_THRO} $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o $(DROPGNW_O) scale_sn.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o $(LIB_M) $(THR_LIBS)
+
+ggsearch36s : $(COMP_THRO) ${WORK_THRO} $(THR_SUBS).o $(COMPACC_TO) showsum.o re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o scale_sn.o karlin.o $(DROPGNW_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o
+ $(CC) $(HFLAGS) $(BIN)/ggsearch36s $(COMP_THRO) ${WORK_THRO} $(THR_SUBS).o $(COMPACC_TO) showsum.o re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o $(DROPGNW_O) scale_sn.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o pssm_asn_subs.o $(LIB_M) $(THR_LIBS)
+
+fasta36 : $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_fa.o scale_se.o karlin.o $(DROPNFA_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o
+ $(CC) $(HFLAGS) $(BIN)/fasta36 $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_fa.o $(DROPNFA_O) scale_se.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o $(LIB_M) $(THR_LIBS)
+
+fasta36sum : $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) showsum.o re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_fa.o scale_se.o karlin.o $(DROPNFA_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o
+ $(CC) $(HFLAGS) $(BIN)/fasta36sum $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) showsum.o re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_fa.o $(DROPNFA_O) scale_se.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o $(LIB_M) $(THR_LIBS)
+
+fasta36u : $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) showun.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_fa.o scale_se.o karlin.o $(DROPNFA_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o mrandom.o url_subs.o
+ $(CC) $(HFLAGS) $(BIN)/fasta36u $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) showun.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_fa.o $(DROPNFA_O) scale_se.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o mrandom.o url_subs.o $(LIB_M) $(THR_LIBS)
+
+fasta36r : $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) showrel.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_fa.o scale_se.o karlin.o $(DROPNFA_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o mrandom.o url_subs.o
+ $(CC) $(HFLAGS) $(BIN)/fasta36r $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) showrel.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_fa.o $(DROPNFA_O) scale_se.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o mrandom.o url_subs.o $(LIB_M) $(THR_LIBS)
+
+fastf36 : $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_ff.o scaleswtf.o last_tat.o tatstats_ff.o karlin.o $(DROPFF_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o mrandom.o url_subs.o
+ $(CC) $(HFLAGS) $(BIN)/fastf36 $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_ff.o $(DROPFF_O) scaleswtf.o last_tat.o tatstats_ff.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o mrandom.o url_subs.o $(LIB_M) $(THR_LIBS)
+
+fastf36s : $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) showsum.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_ff.o scaleswtf.o karlin.o $(DROPFF_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o mrandom.o url_subs.o
+ $(CC) $(HFLAGS) $(BIN)/fastf36s $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) showsum.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_ff.o $(DROPFF_O) scaleswtf.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o mrandom.o url_subs.o $(LIB_M) $(THR_LIBS)
+
+fasts36 : $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_fs.o scaleswts.o last_tat.o tatstats_fs.o karlin.o $(DROPFS_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o mrandom.o url_subs.o
+ $(CC) $(HFLAGS) $(BIN)/fasts36 $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_fs.o $(DROPFS_O) scaleswts.o last_tat.o tatstats_fs.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o mrandom.o url_subs.o $(LIB_M) $(THR_LIBS)
+
+fastm36 : $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_fm.o scaleswts.o last_tat.o tatstats_fm.o karlin.o $(DROPFM_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o mrandom.o url_subs.o
+ $(CC) $(HFLAGS) $(BIN)/fastm36 $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_fm.o $(DROPFM_O) scaleswts.o last_tat.o tatstats_fm.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o mrandom.o url_subs.o $(LIB_M) $(THR_LIBS)
+
+fastx36 : $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o c_dispn.o htime.o apam.o doinit.o init_fx.o faatran.o scale_se.o karlin.o drop_fx.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o
+ $(CC) $(HFLAGS) $(BIN)/fastx36 $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_fx.o drop_fx.o faatran.o scale_se.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o $(LIB_M) $(THR_LIBS)
+
+fasty36 : $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o c_dispn.o htime.o apam.o doinit.o init_fy.o faatran.o scale_se.o karlin.o drop_fz.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o
+ $(CC) $(HFLAGS) $(BIN)/fasty36 $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_fy.o drop_fz.o faatran.o scale_se.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o url_subs.o mrandom.o $(LIB_M) $(THR_LIBS)
+
+tfasta36 : $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o c_dispn.o htime.o apam.o doinit.o init_tfa.o scale_se.o karlin.o $(DROPTFA_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o url_subs.o mrandom.o
+ $(CC) $(HFLAGS) $(BIN)/tfasta36 $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_tfa.o $(DROPTFA_O) scale_se.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o url_subs.o mrandom.o $(LIB_M) $(THR_LIBS)
+
+tfastf36 : $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o c_dispn.o htime.o apam.o doinit.o init_tf.o scaleswtf.o last_tat.o tatstats_ff.o karlin.o $(DROPTFF_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o mrandom.o url_subs.o
+ $(CC) $(HFLAGS) $(BIN)/tfastf36 $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_tf.o $(DROPTFF_O) scaleswtf.o last_tat.o tatstats_ff.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o mrandom.o url_subs.o $(LIB_M) $(THR_LIBS)
+
+tfasts36 : $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o c_dispn.o htime.o apam.o doinit.o init_tfs.o scaleswts.o last_tat.o tatstats_fs.o karlin.o $(DROPTFS_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o mrandom.o url_subs.o
+ $(CC) $(HFLAGS) $(BIN)/tfasts36 $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_tfs.o $(DROPTFS_O) scaleswts.o last_tat.o tatstats_fs.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o mrandom.o url_subs.o $(LIB_M) $(THR_LIBS)
+
+tfastm36 : $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_tfm.o scaleswts.o tatstats_fm.o last_tat.o karlin.o $(DROPTFM_O) $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o mrandom.o url_subs.o # links the _T showalign object like every other COMP_THRO rule in this file
+	$(CC) $(HFLAGS) $(BIN)/tfastm36 $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_tfm.o $(DROPTFM_O) scaleswts.o tatstats_fm.o last_tat.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o mrandom.o url_subs.o $(LIB_M) $(THR_LIBS)
+
+tfastx36 : $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_tfx.o scale_se.o karlin.o drop_tfx.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o url_subs.o mrandom.o
+ $(CC) $(HFLAGS) $(BIN)/tfastx36 $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_tfx.o drop_tfx.o scale_se.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o url_subs.o mrandom.o $(LIB_M) $(THR_LIBS)
+
+tfasty36 : $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_tfy.o scale_se.o karlin.o drop_tfz.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o url_subs.o mrandom.o
+ $(CC) $(HFLAGS) $(BIN)/tfasty36 $(COMP_THRO) $(WORK_THRO) $(THR_SUBS).o $(COMPACC_TO) $(SHOWBESTO) re_getlib.o $(SHOWALIGN_T).o htime.o apam.o doinit.o init_tfy.o drop_tfz.o scale_se.o karlin.o $(LGETLIB) c_dispn.o $(NCBL_LIB) lib_sel.o faatran.o url_subs.o mrandom.o $(LIB_M) $(THR_LIBS)
+
+comp_mlib4.o : comp_lib4.c mw.h structs.h defs.h param.h
+ $(CC) $(THR_CC) $(CFLAGS) -DCOMP_MLIB -c comp_lib4.c -o comp_mlib4.o
+
+comp_mthr4.o : comp_lib4.c mw.h structs.h defs.h param.h thr_bufs2.h thr_buf_structs.h
+ $(CC) $(THR_CC) $(CFLAGS) -DCOMP_THR -DCOMP_MLIB -c comp_lib4.c -o comp_mthr4.o
+
+comp_mlib5.o : comp_lib5.c mw.h structs.h defs.h param.h
+ $(CC) $(THR_CC) $(CFLAGS) -DCOMP_MLIB -c comp_lib5.c -o comp_mlib5.o
+
+comp_mthr5.o : comp_lib5.c mw.h structs.h defs.h param.h thr_bufs2.h thr_buf_structs.h
+ $(CC) $(THR_CC) $(CFLAGS) -DCOMP_THR -DCOMP_MLIB -c comp_lib5.c -o comp_mthr5.o
+
+comp_mlib5e.o : comp_lib5e.c mw.h structs.h defs.h param.h
+ $(CC) $(THR_CC) $(CFLAGS) -DCOMP_MLIB -c comp_lib5e.c -o comp_mlib5e.o
+
+comp_mthr5e.o : comp_lib5e.c mw.h structs.h defs.h param.h thr_bufs2.h thr_buf_structs.h
+ $(CC) $(THR_CC) $(CFLAGS) -DCOMP_THR -DCOMP_MLIB -c comp_lib5e.c -o comp_mthr5e.o
+
+comp_mlib7.o : comp_lib7.c mw.h structs.h defs.h param.h
+ $(CC) $(THR_CC) $(CFLAGS) -DCOMP_MLIB -c comp_lib7.c -o comp_mlib7.o
+
+comp_mthr7.o : comp_lib7.c mw.h structs.h defs.h param.h thr_bufs2.h thr_buf_structs.h
+ $(CC) $(THR_CC) $(CFLAGS) -DCOMP_THR -DCOMP_MLIB -c comp_lib7.c -o comp_mthr7.o
+
+comp_mlib7e.o : comp_lib7e.c mw.h structs.h defs.h param.h
+ $(CC) $(THR_CC) $(CFLAGS) -DCOMP_MLIB -c comp_lib7e.c -o comp_mlib7e.o
+
+comp_mthr7e.o : comp_lib7e.c mw.h structs.h defs.h param.h thr_bufs2.h thr_buf_structs.h
+ $(CC) $(THR_CC) $(CFLAGS) -DCOMP_THR -DCOMP_MLIB -c comp_lib7e.c -o comp_mthr7e.o
+
+comp_mlib8.o : comp_lib8.c mw.h structs.h defs.h param.h
+ $(CC) $(THR_CC) $(CFLAGS) -DCOMP_MLIB -c comp_lib8.c -o comp_mlib8.o
+
+comp_mthr8.o : comp_lib8.c mw.h structs.h defs.h param.h thr_bufs2.h thr_buf_structs.h
+ $(CC) $(THR_CC) $(CFLAGS) -DCOMP_THR -DCOMP_MLIB -c comp_lib8.c -o comp_mthr8.o
+
+comp_mlib9.o : comp_lib9.c mw.h structs.h defs.h param.h
+ $(CC) $(THR_CC) $(CFLAGS) -DCOMP_MLIB -c comp_lib9.c -o comp_mlib9.o
+
+comp_mthr9.o : comp_lib9.c mw.h structs.h defs.h param.h thr_bufs2.h thr_buf_structs.h
+ $(CC) $(THR_CC) $(CFLAGS) -DCOMP_THR -DCOMP_MLIB -c comp_lib9.c -o comp_mthr9.o
+
+work_thr2.o : work_thr2.c mw.h structs.h defs.h param.h thr_bufs2.h thr_buf_structs.h
+ $(CC) $(THR_CC) $(CFLAGS) -c work_thr2.c
+
+print_pssm : print_pssm.c getseq.c karlin.c apam.c pssm_asn_subs.c
+ $(CC) -o print_pssm $(CFLAGS) print_pssm.c getseq.c karlin.c apam.c pssm_asn_subs.c $(LIB_M)
+
+map_db : map_db.c uascii.h ncbl2_head.h
+ $(CC) $(CFLAGS) -o $(BIN)/map_db map_db.c
+
+list_db : list_db.c
+ $(CC) $(CFLAGS) -o $(BIN)/list_db list_db.c
+
+
+lav2ps : lav2plt.o lavplt_ps.o
+ $(CC) -DUNIX -o $(BIN)/lav2ps lav2plt.o lavplt_ps.o -lm
+
+lav2svg : lav2plt.o lavplt_svg.o
+ $(CC) -DUNIX -o $(BIN)/lav2svg lav2plt.o lavplt_svg.o -lm
diff --git a/make/Makefile.sgi b/make/Makefile.sgi
new file mode 100644
index 0000000..b0ce690
--- /dev/null
+++ b/make/Makefile.sgi
@@ -0,0 +1,58 @@
+#
+# makefile for fasta3, fasta3_t. Use makefile.pvm for pvcompxx.
+#
+# for more information on FASTA on SGI's, see:
+#
+# http://www.sgi.com/chembio/resources/fasta/index.html
+#
+# use -DBIG_LIB64 to generate 64-bit offsets in map_db .xin files. This
+# only works on SGI's with the -64 option.
+
+CC= cc -w -64 -mips4 -O2 -TENV:X=3 -DSGI_BUG -Wl,-multigot -DIRIX
+#CC= cc -64 -mips4 -g -DSGI_BUG -DDEBUG -DIRIX
+
+HFLAGS= -64 -mips4 -o
+NFLAGS= -64 -mips4 -o
+
+#CC= cc -g
+#HFLAGS= -o
+#NFLAGS= -o
+
+LIB_M= -lm
+# For R2000/R3000 MIPS Processors, use -mips1
+#
+#CC= cc -mips1 -O2
+#HFLAGS= -mips1 -o
+#NFLAGS= -mips1 -o
+#
+# For R4000 MIPS Processors, use -mips2:
+#
+#CC = cc -mips2 -O2
+#HFLAGS= -mips2 -o
+#NFLAGS= -mips2 -o
+#
+
+CFLAGS= -DSHOWSIM -DUNIX -DTIMES -DHZ=100 -DBIGMEM -DSFCHAR="':'" -DMAX_WORKERS=4 -DTHR_EXIT=pthread_exit -DFASTA_HOST='"crick.med.virginia.edu/fasta/cgi"' -DIS_BIG_ENDIAN -DUSE_MMAP -DBIG_LIB64 -DHAS_INTTYPES -DSAMP_STATS -DPGM_DOC -DBIG_LIB64
+
+THR_SUBS = pthr_subs2
+THR_LIBS = -lpthread
+THR_CC =
+
+BIN = ../bin
+XDIR = /seqprg/slib/bin
+
+# set up files for SSE2/Altivec acceleration
+#
+include ../make/Makefile.sse_alt
+
+# no acceleration
+#
+DROPGSW_O = $(DROPGSW_NA_O)
+DROPLAL_O = $(DROPLAL_NA_O)
+DROPGNW_O = $(DROPGNW_NA_O)
+DROPLNW_O = $(DROPLNW_NA_O)
+
+# renamed (fasta36) programs
+include ../make/Makefile36m.common
+# conventional (fasta3) names
+# include ../make/Makefile.common
diff --git a/make/Makefile.sse_alt b/make/Makefile.sse_alt
new file mode 100644
index 0000000..5bc1043
--- /dev/null
+++ b/make/Makefile.sse_alt
@@ -0,0 +1,23 @@
+#
+# $Name: $ - $ Id: $
+#
+# sets of files for no (DROPGSW_NA_O), SSE2, or Altivec Smith-Waterman acceleration
+#
+DROPGSW_NA_O = init_sw.o dropgsw2.o lwm_align.o calcons_sw.o
+DROPGSW_SSE_O = init_sw_sse.o dropgsw2_sse.o smith_waterman_sse2.o lwm_align.o calcons_sw.o
+DROPGSW_ALT_O = init_sw_alt.o dropgsw2_alt.o smith_waterman_altivec.o lwm_align.o calcons_sw.o
+
+P_DROPNSW_NA_O = dropnsw.o lwm_align.o calcons_sw.o
+P_DROPGSW_NA_O = dropgsw2.o lwm_align.o calcons_sw.o
+P_DROPGSW_SSE_O = dropgsw2_sse.o smith_waterman_sse2.o lwm_align.o calcons_sw.o
+P_DROPGSW_ALT_O = dropgsw2_alt.o smith_waterman_altivec.o lwm_align.o calcons_sw.o
+
+DROPLAL_NA_O = droplal2.o lsim4.o calcons_la.o
+DROPLAL_SSE_O = droplal2_sse.o smith_waterman_sse2.o lsim4.o calcons_la.o
+DROPLAL_ALT_O = droplal2_alt.o smith_waterman_altivec.o lsim4.o calcons_la.o
+
+DROPGNW_NA_O = init_gnw.o dropgnw.o gwm_align.o calcons_sw.o
+DROPGNW_SSE_O = init_gnw_sse.o dropgnw_sse.o global_sse2.o gwm_align.o calcons_sw.o
+
+DROPLNW_NA_O = init_lnw.o droplnw.o gwm_align.o calcons_sw.o
+DROPLNW_SSE_O = init_lnw_sse.o droplnw_sse.o glocal_sse2.o gwm_align.o calcons_sw.o
diff --git a/make/Makefile.sun b/make/Makefile.sun
new file mode 100644
index 0000000..5492aa0
--- /dev/null
+++ b/make/Makefile.sun
@@ -0,0 +1,52 @@
+#
+# makefile for fasta3, fasta3_t. Use makefile.pvm for pvcompxx.
+
+SHELL=/usr/bin/bash
+
+#CC= cc -g -xarch=v8plusa
+
+# switches for 64-bit addressing
+CC= cc -fast -xO4 -xarch=v9
+#CC= cc -g -xarch=v9
+
+# for SUNMP, use -DTHR_EXIT=thr_exit
+# HZ=100 for Solaris x86
+# -DIS_LITTLE_ENDIAN for Solaris x86
+
+CFLAGS= -DSHOWSIM -DUNIX -DTIMES -DHZ=100 -DBIGMEM -DSFCHAR="':'" -DMAX_WORKERS=2 -DTHR_EXIT=thr_exit -DFASTA_setscope -DUSE_MMAP -DBIG_LIB64 -DHAS_INTTYPES -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -DUSE_FSEEKO -DM10_CONS -DSAMP_STATS -DPGM_DOC
+HFLAGS= -o
+NFLAGS= -o
+
+# use -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64
+# for files > 2 GB
+
+#for Sun pthreads (preferred, pthreads used on all other platforms)
+THR_SUBS = pthr_subs2
+THR_LIBS = -lpthread
+THR_CC =
+
+#for Sun threads (no longer necessary as Sun supports pthreads)
+#THR_SUBS = uthr_subs2
+#THR_LIBS = -lthread
+#THR_CC =
+
+LIB_M= -lmopt
+
+BIN = ../bin
+XDIR = /seqprg/bin
+
+# set up files for SSE2/Altivec acceleration
+#
+include ../make/Makefile.sse_alt
+
+# no acceleration
+#
+DROPGSW_O = $(DROPGSW_NA_O)
+DROPLAL_O = $(DROPLAL_NA_O)
+DROPGNW_O = $(DROPGNW_NA_O)
+DROPLNW_O = $(DROPLNW_NA_O)
+
+# renamed (fasta36) programs
+include ../make/Makefile36m.common
+# conventional (fasta3) names
+# include ../make/Makefile.common
diff --git a/make/Makefile.sun_x86 b/make/Makefile.sun_x86
new file mode 100644
index 0000000..2a8c8ec
--- /dev/null
+++ b/make/Makefile.sun_x86
@@ -0,0 +1,51 @@
+#
+# makefile for fasta3, fasta3_t. Use makefile.pvm for pvcompxx.
+
+SHELL=/usr/bin/bash # used for make install
+
+# switches for 64-bit addressing - AMD64
+CC= cc -g -fast -m64
+
+# debugging options
+#CC= cc -g -DDEBUG -xarch=amd64
+
+# for SUNMP, use -DTHR_EXIT=thr_exit
+# HZ=100 for Solaris x86
+# Solaris X86 is little endian - be certain IS_BIG_ENDIAN is not defined
+
+CFLAGS= -DSHOWSIM -DUNIX -DTIMES -DHZ=100 -DBIGMEM -DSFCHAR="':'" -DMAX_WORKERS=2 -DTHR_EXIT=thr_exit -DFASTA_setscope -DUSE_MMAP -DBIG_LIB64 -DHAS_INTTYPES -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -DUSE_FSEEKO -DM10_CONS -DSAMP_STATS -DPGM_DOC
+HFLAGS= -o
+NFLAGS= -o
+
+# use -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64
+# for files > 2 GB
+
+#for Sun pthreads (preferred, pthreads used on all other platforms)
+THR_SUBS = pthr_subs2
+THR_LIBS = -lpthread
+THR_CC =
+
+#for Sun threads (no longer necessary as Sun supports pthreads)
+#THR_SUBS = uthr_subs2
+#THR_LIBS = -lthread
+#THR_CC =
+
+LIB_M= -lmopt
+
+BIN = ../bin
+XDIR = /seqprg/bin
+
+# set up files for SSE2/Altivec acceleration
+#
+include ../make/Makefile.sse_alt
+
+#
+DROPGSW_O = $(DROPGSW_SSE_O)
+DROPLAL_O = $(DROPLAL_SSE_O)
+DROPGNW_O = $(DROPGNW_SSE_O)
+DROPLNW_O = $(DROPLNW_SSE_O)
+
+# renamed (fasta36) programs
+include ../make/Makefile36m.common
+# conventional (fasta3) names
+# include ../make/Makefile.common
diff --git a/make/Makefile35.common b/make/Makefile35.common
new file mode 100644
index 0000000..b7b04bf
--- /dev/null
+++ b/make/Makefile35.common
@@ -0,0 +1,45 @@
+#
+# $Name: $ - $Id: Makefile35.common 344 2010-06-29 18:20:22Z wrp $
+#
+# commands common to all architectures
+# if your architecture does not support "include", append at the end.
+#
+
+# use for multiple query sequences
+COMP_LIBO=comp_mlib2.o
+COMP_THRO=comp_mthr2.o
+WORK_THRO=work_thr2.o
+GETSEQO =
+
+# standard nxgetaa, no memory mapping for 0 - 6
+LGETLIB=getseq.o lgetlib.o
+NGETLIB=nmgetlib
+
+# memory mapping for 0FASTA, 5PIRVMS, 6GCGBIN
+LGETLIB=getseq.o lgetlib.o lgetaa_m.o
+NGETLIB=nmgetlib
+
+NRAND=nrandom
+
+# use ncbl_lib.c for BLAST1.4 support instead of ncbl2_mlib.c
+#NCBL_LIB=ncbl_lib.o
+
+# this option should support both formats (BLAST1.4 not currently supported):
+#NCBL_LIB=ncbl_lib.o ncbl2_mlib.o
+
+# normally use ncbl2_mlib.c
+#NCBL_LIB=ncbl2_mlib.o
+#LIB_M= -lm
+
+# this option supports NCBI BLAST2 and mySQL
+# it requires "-I/usr/local/include/mysql -DMYSQL_DB" in CFLAGS
+# and "-L/usr/local/lib/mysql -lmysqlclient -lz" in LIB_M
+# some systems may also require a LD_LIBRARY_PATH change
+#LIB_M= -L/usr/local/lib/mysql -lmysqlclient -lz -lm
+LIB_M= -lm
+#NCBL_LIB=ncbl2_mlib.o mysql_lib.o
+NCBL_LIB=ncbl2_mlib.o
+
+include ../make/Makefile.pcom
+
+include ../make/Makefile.fcom
diff --git a/make/Makefile35.common_sql b/make/Makefile35.common_sql
new file mode 100644
index 0000000..3eda77c
--- /dev/null
+++ b/make/Makefile35.common_sql
@@ -0,0 +1,50 @@
+#
+# $Name: $ - $Id: Makefile35.common_sql 344 2010-06-29 18:20:22Z wrp $
+#
+# commands common to all architectures
+# if your architecture does not support "include", append at the end.
+#
+
+# use for "normal" fasta34(_t) programs - only one query
+COMP_LIBO=comp_lib2.o
+COMP_THRO=comp_thr2.o
+WORK_THRO=work_thr2.o
+GETSEQO = getseq.o
+# use for multiple query sequences, requires "-n" for DNA fasta, does not
+# work with prss34 (yet)
+#COMP_LIB=comp_mlib.o
+#COMP_THRO=comp_mthr.o
+#
+# standard nxgetaa, no memory mapping for 0 - 6
+LGETLIB=getseq.o lgetlib.o lgetaa_m.o
+NGETLIB=nmgetlib
+
+# memory mapping for 0FASTA, 5PIRVMS, 6GCGBIN
+# LGETLIB=getseq.o lgetlib.o lgetaa_m.o
+# NGETLIB=nmgetlib
+
+NRAND=nrandom
+
+# use ncbl_lib.c for BLAST1.4 support instead of ncbl2_mlib.c
+#NCBL_LIB=ncbl_lib.o
+
+# this option should support both formats (BLAST1.4 not currently supported):
+#NCBL_LIB=ncbl_lib.o ncbl2_mlib.o
+
+# normally use ncbl2_mlib.c
+#NCBL_LIB=ncbl2_mlib.o
+#LIB_M= -lm
+
+# this option supports NCBI BLAST2 and mySQL
+# it requires "-I/usr/include/mysql -DMYSQL_DB" in CFLAGS
+# and "-L/usr/lib/mysql -lmysqlclient -lz" in LIB_M
+# some systems may also require a LD_LIBRARY_PATH change
+LIB_M= -L/usr/lib/mysql -lmysqlclient -lz -lm
+#LIB_M= -lm
+NCBL_LIB=ncbl2_mlib.o mysql_lib.o
+#NCBL_LIB=ncbl2_mlib.o
+
+include ../make/Makefile.pcom
+
+include ../make/Makefile.fcom
+
diff --git a/make/Makefile35.nmk_com b/make/Makefile35.nmk_com
new file mode 100755
index 0000000..11792cf
--- /dev/null
+++ b/make/Makefile35.nmk_com
@@ -0,0 +1,30 @@
+#
+# $Name: $ - $Id: Makefile35.nmk_com 344 2010-06-29 18:20:22Z wrp $
+#
+# commands common to all architectures
+# if your architecture does not support "include", append at the end.
+#
+
+# use for multiple query sequences
+COMP_LIBO=comp_mlib2.obj
+COMP_THRO=comp_mthr2.obj
+WORK_THRO=work_thr2.obj
+GETSEQO =
+
+# standard nxgetaa, no memory mapping for 0 - 6
+LGETLIB=getseq.obj lgetlib.obj
+NGETLIB=nmgetlib
+
+# memory mapping for 0FASTA, 5PIRVMS, 6GCGBIN
+# no memory mapping for Win32
+#LGETLIB= lgetlib.obj lgetaa_m.obj
+
+NRAND=nrand
+
+# normally use ncbl2_mlib.c
+NCBL_LIB=ncbl2_mlib.obj
+#LIB_M= -lm
+
+include ../make/Makefile.nm_pcom
+
+include ../make/Makefile.nm_fcom
diff --git a/make/Makefile35m.common_mysql b/make/Makefile35m.common_mysql
new file mode 100644
index 0000000..03f4741
--- /dev/null
+++ b/make/Makefile35m.common_mysql
@@ -0,0 +1,49 @@
+#
+# $Name: $ - $Id: Makefile35m.common_mysql 344 2010-06-29 18:20:22Z wrp $
+#
+# commands common to all architectures
+# if your architecture does not support "include", append at the end.
+#
+
+# use for multiple query sequences; does not
+# work with prss34 (yet)
+COMP_LIBO=comp_mlib2.o
+COMP_THRO=comp_mthr2.o
+WORK_THRO=work_thr2.o
+GETSEQO =
+
+# standard nxgetaa, no memory mapping for 0 - 6
+#LGETLIB=getseq.o lgetlib.o
+#NGETLIB=nmgetlib
+
+# memory mapping for 0FASTA, 5PIRVMS, 6GCGBIN
+LGETLIB= $(GETSEQO) lgetlib.o lgetaa_m.o
+NGETLIB=nmgetlib
+
+NRAND=nrandom
+
+# use ncbl_lib.c for BLAST1.4 support instead of ncbl2_mlib.c
+#NCBL_LIB=ncbl_lib.o
+
+# this option should support both formats (BLAST1.4 not currently supported):
+#NCBL_LIB=ncbl_lib.o ncbl2_mlib.o
+
+# normally use ncbl2_mlib.c
+#NCBL_LIB=ncbl2_mlib.o
+#LIB_M= -lm
+
+# this option supports NCBI BLAST2 and mySQL
+# it requires "-I/usr/local/include/mysql -DMYSQL_DB" in CFLAGS
+# and "-L/usr/local/lib/mysql -lmysqlclient -lz" in LIB_M
+# some systems may also require a LD_LIBRARY_PATH change
+LIB_M= -L/usr/lib64/mysql -lmysqlclient -lz -lm
+#LIB_M= -L/usr/lib/pgsql/ -lpq -lm -lcrypto -lssl
+# LIB_M= -lm
+NCBL_LIB=ncbl2_mlib.o mysql_lib.o
+#NCBL_LIB=ncbl2_mlib.o pgsql_lib.o
+# NCBL_LIB=ncbl2_mlib.o
+
+include ../make/Makefile.pcom
+
+include ../make/Makefile.fcom
+
diff --git a/make/Makefile35m.common_pgsql b/make/Makefile35m.common_pgsql
new file mode 100644
index 0000000..babe0e5
--- /dev/null
+++ b/make/Makefile35m.common_pgsql
@@ -0,0 +1,49 @@
+#
+# $Name: $ - $Id: Makefile35m.common_pgsql 344 2010-06-29 18:20:22Z wrp $
+#
+# commands common to all architectures
+# if your architecture does not support "include", append at the end.
+#
+
+# use for multiple query sequences; does not
+# work with prss34 (yet)
+COMP_LIBO=comp_mlib2.o
+COMP_THRO=comp_mthr2.o
+WORK_THRO=work_thr2.o
+GETSEQO =
+
+# standard nxgetaa, no memory mapping for 0 - 6
+#LGETLIB=getseq.o lgetlib.o
+#NGETLIB=nmgetlib
+
+# memory mapping for 0FASTA, 5PIRVMS, 6GCGBIN
+LGETLIB= $(GETSEQO) lgetlib.o lgetaa_m.o
+NGETLIB=nmgetlib
+
+NRAND=nrandom
+
+# use ncbl_lib.c for BLAST1.4 support instead of ncbl2_mlib.c
+#NCBL_LIB=ncbl_lib.o
+
+# this option should support both formats (BLAST1.4 not currently supported):
+#NCBL_LIB=ncbl_lib.o ncbl2_mlib.o
+
+# normally use ncbl2_mlib.c
+#NCBL_LIB=ncbl2_mlib.o
+#LIB_M= -lm
+
+# this option supports NCBI BLAST2 and mySQL
+# it requires "-I/usr/local/include/mysql -DMYSQL_DB" in CFLAGS
+# and "-L/usr/local/lib/mysql -lmysqlclient -lz" in LIB_M
+# some systems may also require a LD_LIBRARY_PATH change
+# LIB_M= -L/usr/local/lib/mysql -lmysqlclient -lz -lm
+LIB_M= -L/usr/local/pgsql/lib -lpq -lm -lcrypto -lssl
+# LIB_M= -lm
+#NCBL_LIB=ncbl2_mlib.o mysql_lib.o
+NCBL_LIB=ncbl2_mlib.o pgsql_lib.o
+# NCBL_LIB=ncbl2_mlib.o
+
+include ../make/Makefile.pcom
+
+include ../make/Makefile.fcom
+
diff --git a/make/Makefile35m.common_sql b/make/Makefile35m.common_sql
new file mode 100644
index 0000000..d2cb6db
--- /dev/null
+++ b/make/Makefile35m.common_sql
@@ -0,0 +1,48 @@
+#
+# $Name: $ - $Id: Makefile35m.common_sql 344 2010-06-29 18:20:22Z wrp $
+#
+# commands common to all architectures
+# if your architecture does not support "include", append at the end.
+#
+
+# use for multiple query sequences; does not
+# work with prss34 (yet)
+COMP_LIBO=comp_mlib2.o
+COMP_THRO=comp_mthr2.o
+WORK_THRO=work_thr2.o
+GETSEQO =
+
+# standard nxgetaa, no memory mapping for 0 - 6
+#LGETLIB=getseq.o lgetlib.o
+#NGETLIB=nmgetlib
+
+# memory mapping for 0FASTA, 5PIRVMS, 6GCGBIN
+LGETLIB= $(GETSEQO) lgetlib.o lgetaa_m.o
+NGETLIB=nmgetlib
+
+NRAND=nrandom
+
+# use ncbl_lib.c for BLAST1.4 support instead of ncbl2_mlib.c
+#NCBL_LIB=ncbl_lib.o
+
+# this option should support both formats (BLAST1.4 not currently supported):
+#NCBL_LIB=ncbl_lib.o ncbl2_mlib.o
+
+# normally use ncbl2_mlib.c
+#NCBL_LIB=ncbl2_mlib.o
+#LIB_M= -lm
+
+# this option supports NCBI BLAST2 and mySQL
+# it requires "-I/usr/local/include/mysql -DMYSQL_DB" in CFLAGS
+# and "-L/usr/lib/mysql -lmysqlclient -lz" in LIB_M
+# some systems may also require a LD_LIBRARY_PATH change
+# LIB_M= -L/usr/lib/mysql -lmysqlclient -lz -lm
+LIB_M= -L/usr/lib/mysql -lmysqlclient -lz -L/usr/local/pgsql/lib -lpq -lm -lcrypto -lssl
+# LIB_M= -lm
+NCBL_LIB=ncbl2_mlib.o mysql_lib.o pgsql_lib.o
+# NCBL_LIB=ncbl2_mlib.o
+
+include ../make/Makefile.pcom
+
+include ../make/Makefile.fcom
+
diff --git a/make/Makefile36.nmk_com b/make/Makefile36.nmk_com
new file mode 100644
index 0000000..8303d8f
--- /dev/null
+++ b/make/Makefile36.nmk_com
@@ -0,0 +1,30 @@
+#
+# $Name: $ - $Id: Makefile36.nmk_com 1203 2013-07-20 12:55:48Z wrp $
+#
+# commands common to all architectures
+# if your architecture does not support "include", append at the end.
+#
+
+# use for multiple query sequences
+COMP_LIBO=comp_mlib9.obj
+COMP_THRO=comp_mthr9.obj
+WORK_THRO=work_thr2.obj
+GETSEQO =
+
+# standard nxgetaa, no memory mapping for 0 - 6
+LGETLIB=getseq.obj lgetlib.obj
+NGETLIB=nmgetlib
+
+# memory mapping for 0FASTA, 5PIRVMS, 6GCGBIN
+# no memory mapping for Win32
+#LGETLIB= lgetlib.obj lgetaa_m.obj
+
+NRAND=mrandom
+
+# normally use ncbl2_mlib.c
+NCBL_LIB=ncbl2_mlib.obj
+#LIB_M= -lm
+
+include ../make/Makefile.nm_pcom
+
+include ../make/Makefile.nm_fcom
diff --git a/make/Makefile36m.common b/make/Makefile36m.common
new file mode 100644
index 0000000..3d86322
--- /dev/null
+++ b/make/Makefile36m.common
@@ -0,0 +1,51 @@
+#
+# $Name: $ - $Id: Makefile36m.common 1250 2014-01-24 21:33:39Z wrp $
+#
+# commands common to all architectures
+# if your architecture does not support "include", append at the end.
+#
+
+COMP_LIBO=comp_mlib9.o # reads database into memory for multi-query without delay
+COMP_THRO=comp_mthr9.o # threaded version
+
+WORK_THRO=work_thr2.o
+GETSEQO =
+
+# standard nxgetaa, no memory mapping for 0 - 6
+#LGETLIB=getseq.o lgetlib.o
+#NGETLIB=nmgetlib
+
+# memory mapping for 0FASTA, 5PIRVMS, 6GCGBIN
+LGETLIB= $(GETSEQO) lgetlib.o lgetaa_m.o
+NGETLIB=nmgetlib
+
+# use ncbl_lib.c for BLAST1.4 support instead of ncbl2_mlib.c
+#NCBL_LIB=ncbl_lib.o
+
+# this option should support both formats (BLAST1.4 not currently supported):
+#NCBL_LIB=ncbl_lib.o ncbl2_mlib.o
+
+# normally use ncbl2_mlib.c
+#NCBL_LIB=ncbl2_mlib.o
+#LIB_M= -lm
+
+# this option supports NCBI BLAST2 and mySQL
+# it requires "-I/usr/include/mysql -DMYSQL_DB" in CFLAGS
+# and "-L/usr/lib64/mysql -lmysqlclient -lz" in LIB_M
+# some systems may also require a LD_LIBRARY_PATH change
+
+LIB_M= -lm -lz
+#LIB_M= -L/usr/lib64/mysql -lmysqlclient -lz -lm
+NCBL_LIB=ncbl2_mlib.o
+#NCBL_LIB=ncbl2_mlib.o mysql_lib.o
+
+# threaded as _t, serial
+# include ../make/Makefile.pcom
+
+# threaded without _t
+include ../make/Makefile.pcom_t
+
+# serial only
+# include ../make/Makefile.pcom_s
+
+include ../make/Makefile.fcom
diff --git a/make/Makefile36mpi.common b/make/Makefile36mpi.common
new file mode 100644
index 0000000..32fe30d
--- /dev/null
+++ b/make/Makefile36mpi.common
@@ -0,0 +1,43 @@
+#
+# $Name: $ - $Id: Makefile36mpi.common 849 2011-10-21 20:09:55Z wrp $
+#
+# commands common to all architectures
+# if your architecture does not support "include", append at the end.
+#
+
+# use for multiple query sequences
+COMP_THRO=comp_mpi9.o
+WORK_THR_O=work_mpi2.o
+WORK_THRX_O=work_mpi2x.o
+GETSEQO =
+
+# standard nxgetaa, no memory mapping for 0 - 6
+#LGETLIB=getseq.o lgetlib.o
+#NGETLIB=nmgetlib
+
+# memory mapping for 0FASTA, 5PIRVMS, 6GCGBIN
+LGETLIB= $(GETSEQO) lgetlib.o lgetaa_m.o
+NGETLIB=nmgetlib
+
+# use ncbl_lib.c for BLAST1.4 support instead of ncbl2_mlib.c
+#NCBL_LIB=ncbl_lib.o
+
+# this option should support both formats (BLAST1.4 not currently supported):
+#NCBL_LIB=ncbl_lib.o ncbl2_mlib.o
+
+# normally use ncbl2_mlib.c
+#NCBL_LIB=ncbl2_mlib.o
+#LIB_M= -lm -lz
+
+# this option supports NCBI BLAST2 and mySQL
+# it requires "-I/usr/local/include/mysql -DMYSQL_DB" in CFLAGS
+# and "-L/usr/local/lib/mysql -lmysqlclient -lz" in LIB_M
+# some systems may also require a LD_LIBRARY_PATH change
+#LIB_M= -L/usr/local/lib/mysql -lmysqlclient -lz -lm
+#NCBL_LIB=ncbl2_mlib.o mysql_lib.o
+NCBL_LIB=ncbl2_mlib.o
+LIB_M = -lm -lz
+
+include ../make/Makefile.mp_com2
+
+include ../make/Makefile.fcom
diff --git a/make/Makefile36t.common b/make/Makefile36t.common
new file mode 100644
index 0000000..e6d4952
--- /dev/null
+++ b/make/Makefile36t.common
@@ -0,0 +1,43 @@
+#
+# $Name: $ - $Id: Makefile36t.common 344 2010-06-29 18:20:22Z wrp $
+#
+# commands common to all architectures
+# if your architecture does not support "include", append at the end.
+#
+
+# use for multiple query sequences
+COMP_LIBO=comp_mlib4.o
+COMP_THRO=comp_mthr4.o
+WORK_THRO=work_thr2.o
+GETSEQO =
+
+# standard nxgetaa, no memory mapping for 0 - 6
+#LGETLIB=getseq.o lgetlib.o
+#NGETLIB=nmgetlib
+
+# memory mapping for 0FASTA, 5PIRVMS, 6GCGBIN
+LGETLIB= $(GETSEQO) lgetlib.o lgetaa_m.o
+NGETLIB=nmgetlib
+
+# use ncbl_lib.c for BLAST1.4 support instead of ncbl2_mlib.c
+#NCBL_LIB=ncbl_lib.o
+
+# this option should support both formats (BLAST1.4 not currently supported):
+#NCBL_LIB=ncbl_lib.o ncbl2_mlib.o
+
+# normally use ncbl2_mlib.c
+#NCBL_LIB=ncbl2_mlib.o
+#LIB_M= -lm
+
+# this option supports NCBI BLAST2 and mySQL
+# it requires "-I/usr/local/include/mysql -DMYSQL_DB" in CFLAGS
+# and "-L/usr/local/lib/mysql -lmysqlclient -lz" in LIB_M
+# some systems may also require a LD_LIBRARY_PATH change
+#LIB_M= -L/usr/local/lib/mysql -lmysqlclient -lz -lm
+#NCBL_LIB=ncbl2_mlib.o mysql_lib.o
+NCBL_LIB=ncbl2_mlib.o
+LIB_M= -lm
+
+include ../make/Makefile.pcom
+
+include ../make/Makefile.fcom
diff --git a/make/README b/make/README
new file mode 100644
index 0000000..b492066
--- /dev/null
+++ b/make/README
@@ -0,0 +1,44 @@
+
+22-Jan-2014
+
+fasta36/Make
+
+================
+Makefiles for different Unix/Linux/MacOS configurations
+
+****************
+These make files are designed to be run from the ../src directory, e.g.
+
+cd ~/fasta36/src
+make -f ../make/Makefile.linux64_sse2 all
+****************
+
+While several different architectures are specified here, the files
+that are used the most are:
+
+Makefile.linux64_sse2 -- standard Linux(64-bit) Makefile
+
+ -- now equivalent to Makefile.linux64 and Makefile.linux. For
+ non-sse2 compiles, use Makefile.linux64_nosse2
+
+Makefile.linux_icc_sse2 -- Linux (64-bit) with Intel icc compiler
+
+Makefile.os_x86_64 -- standard MacOS (64-bit) Makefile (also sse2)
+
+Makefile.nmk_icl -- Windows 32-bit with Intel icl compiler
+
+================
+
+Most of the other Makefiles have not been tested for months or years.
+
+================================================================
+
+The major Makefiles above include other makefiles, including
+Makefile36m.common, Makefile.pcom_t, Makefile.fcom to compile and link
+the appropriate programs.
+
+The Windows Makefile.nmk_icl uses Makefile.nm_fcom and Makefile.nm_pcom
+
+The windows environment requires Microsoft nmake.
+
+================================================================
diff --git a/make/make_osx_univ.sh b/make/make_osx_univ.sh
new file mode 100755
index 0000000..a9d0420
--- /dev/null
+++ b/make/make_osx_univ.sh
@@ -0,0 +1,31 @@
+#!/bin/csh
+#
+# make_osx_univ.sh -- build "universal" (i386 + x86_64) MacOS binaries.
+#
+# Run from the ../src directory: the Makefiles are referenced as
+# ../make/... and the object files removed between builds are the ones
+# in the current directory.
+#
+# For each architecture the script runs the "all" and "uinstall" targets
+# of the matching Makefile (uinstall presumably installs into
+# ../bin/<arch> -- confirm in the Makefiles), then uses lipo to merge
+# every file found in ../bin/i386 with its ../bin/x86_64 counterpart
+# into ../bin.
+
+set bin = ../bin
+if (! -d $bin ) mkdir $bin
+#if (! -d $bin/ppc) mkdir $bin/ppc
+if (! -d $bin/i386) mkdir $bin/i386
+if (! -d $bin/x86_64) mkdir $bin/x86_64
+
+# csh treats a glob that matches nothing as an error ("No match"), so a
+# bare "rm *.o" fails on a clean source tree.  With nonomatch set the
+# unmatched pattern is passed through literally and "rm -f" ignores it.
+set nonomatch
+
+# cd ../src
+# rm -f *.o
+# make -f ../make/Makefile.os_x all
+# make -f ../make/Makefile.os_x uinstall
+
+# objects from one architecture must not be reused for the next
+rm -f *.o
+make -f ../make/Makefile.os_x86 all
+make -f ../make/Makefile.os_x86 uinstall
+
+rm -f *.o
+make -f ../make/Makefile.os_x86_64 all
+make -f ../make/Makefile.os_x86_64 uinstall
+rm -f *.o
+
+# restore the default so an empty i386/ directory is still reported
+unset nonomatch
+
+cd ../bin
+foreach n ( i386/* )
+  # $n:t is the file name without the leading directory
+  set f=$n:t
+  #lipo -create ppc/$f i386/$f x86_64/$f -output $f
+  lipo -create i386/$f x86_64/$f -output $f
+  echo "Universal $f built"
+end
+#rm -rf ppc/ i386/ x86_64/
+#rm -rf i386/ x86_64/
+echo "Done!"
diff --git a/misc/README b/misc/README
new file mode 100644
index 0000000..5abd2ca
--- /dev/null
+++ b/misc/README
@@ -0,0 +1,14 @@
+
+22-Jan-2014
+
+fasta36/misc
+
+Perl scripts for simple tasks
+
+parse_m9.pl -- parse -m 9 output to produce tab-delimited files with the fields:
+ query_acc, query_len, lib_acc, lib_len, score, bits, evalue, f_id, f_sim, alen, and start/stop coordinates
+
+res2R.pl -- convert fasta36 -R raw.results files into something 'R' can digest
+
+shuffle_embed.pl -- take a sequence and embed it into a shuffled version of itself
+
diff --git a/misc/parse_m9.pl b/misc/parse_m9.pl
new file mode 100755
index 0000000..2ddffbe
--- /dev/null
+++ b/misc/parse_m9.pl
@@ -0,0 +1,139 @@
+#!/usr/bin/perl -w
+#
+# parse_m9.pl -- a simple script to parse fasta/fastx/ssearch -m 9 output and produce a simple set of results:
+# >query_id<tab>len
+# hit_acc<tab>len<tab>score<tab>bits<tab>expect<tab>f_id<tab>f_sim<tab>...
+#
+use strict;
+use Getopt::Long;
+use vars qw($e_cutoff $p_cutoff $head);
+
+die "usage -- parse_m9.pl [--head] [--expect e_cut] [--percid p_cut] m9_out.file\n" unless @ARGV;
+
+$e_cutoff = 10.0;
+$p_cutoff = 0.0;
+$head = 0;
+
+GetOptions("expect=s" => \$e_cutoff,
+ "percid=s" => \$p_cutoff,
+ "head" => \$head,
+ );
+
+my @hit_fields = ();
+my @m9_fields = ();
+my $first_hit = 1;
+
+# Main loop: for every result file named on the command line, print one
+# ">query<tab>len" line per query followed by one tab-delimited line per
+# hit taken from the "The best scores are:" list.
+my $res_handle;
+for my $s_res_file ( @ARGV ) {
+
+  # report unreadable files instead of skipping them silently
+  unless (open($res_handle, $s_res_file)) {
+    warn "cannot open $s_res_file: $!\n";
+    next;
+  }
+
+  while (my ($q_num, $query_descr, $query_len, $best_yes) = skip_to_results($res_handle)) {
+    # skip_to_results() returns a single empty string at end of results
+    last unless $query_descr;
+
+    unless ($best_yes) {
+      <$res_handle>; # skip >>><<<
+# uncomment for queries with no hits
+# print ">$query_descr\t$query_len\n";
+      next;
+    }
+
+    # with --head the query line is printed after the "#" header line below
+    print ">$query_descr\t$query_len\n" unless $head;
+
+    while (my $line = <$res_handle>) { # for each result
+      last if $line =~ m/>>><<</;
+      next if $line =~ m/^\+\-/; # skip over HSPs
+      chomp ($line);
+
+      # text before the first tab: description, length, [frame], scores;
+      # text after it: the -m 9 alignment fields
+      my ($left, $right) = split(/\t/,$line);
+      my @fields = split(/\s+/,$left);
+      my @afields = split(/\s+/,$right);
+
+      my $evalue = $fields[-1];
+      my $percid = $afields[0]*100.0;
+      # NOTE(review): because the two tests are joined with &&, --expect
+      # has no effect unless --percid is also given (the default
+      # $p_cutoff of 0.0 is never exceeded) -- confirm this is intended.
+      last if ($evalue > $e_cutoff && $percid < $p_cutoff);
+
+      my $frame = "";
+      my $l_len = $fields[-4];
+      # a "[f]" or "[r]" field in front of the scores shifts the library
+      # length one position to the left
+      if ($fields[-4] =~ m/\[[fr]\]/) {
+        $l_len = $fields[-5];
+        $frame = $fields[-4];
+        if ($head) {unshift @hit_fields, "[fr]";}
+      }
+
+      # emit the column header once, in front of the first hit
+      if ($head && $first_hit) {
+        unshift @hit_fields, qw(acc llen);
+        print "#" . join("\t",(@hit_fields, @m9_fields)) . "\n";
+        print ">$query_descr\t$query_len\n";
+        $first_hit = 0;
+        $head = 0;
+      }
+
+      # library length is printed as "( 218)"; drop the parentheses
+      $l_len =~ s/\(//;
+      $l_len =~ s/\)//;
+      my ($l_db,$l_acc) = parse_descr($fields[0]);
+
+      my @out_fields = ($l_acc, $l_len);
+      if ($l_db) { unshift @out_fields, $l_db;}
+      if ($frame) { push @out_fields, $frame;}
+      print join("\t",(@out_fields, @fields[-3,-2,-1], @afields)) . "\n";
+    }
+  }
+
+  close($res_handle);
+}
+
+# skip_to_results($res_handle)
+#
+# Advance $res_handle to the next query and then to its list of best
+# scores.  Returns ($q_num, $query_desc, $query_len, $best_yes), where
+# $best_yes is 1 when a "The best scores are:" line was found.  Returns
+# a single empty string when the ">>>///" end marker or end-of-file is
+# reached before another query line.
+#
+# When $head is set, the column names on the "best scores" line are
+# stored in the file-level @hit_fields and @m9_fields arrays.
+sub skip_to_results {
+  my ($res_handle) = @_;
+  my ($q_num, $query_desc, $query_len, $best_yes);
+  my $found_query = 0;
+
+  # phase 1: locate the "  N>>>query_id ... len aa" line
+  while (my $line = <$res_handle>) {
+    if ($line =~ m/^\s*(\d+)>>>(\S+)\s/) {
+      ($q_num,$query_desc) = ($1,$2);
+      ($query_len) = ($line =~ m/\s(\d+)\s\w+$/);
+      $found_query = 1;
+      last;
+    }
+    return "" if ($line =~ m/>>>\/\/\//);
+  }
+
+  unless ($found_query) {
+    warn "EOF - no query\n";
+    return "";
+  }
+
+  # phase 2: locate the start of the hit list, or the "no hits" marker
+  while (my $line = <$res_handle>) {
+    $best_yes = 0;
+    last if ($line =~ m/^!! No sequences/);
+    next unless ($line =~ m/^The best scores are:/);
+
+    my ($left, $right) = split(/\t/,$line);
+    if ($head) {
+      my @afields = split(/\s+/,$left);
+      @m9_fields = split(/\s+/,$right);
+      @hit_fields = @afields[-3,-2,-1];
+    }
+    $best_yes = 1;
+    last;
+  }
+
+  return ($q_num, $query_desc, $query_len, $best_yes);
+}
+
+# parse_descr($descr) -- split a library sequence identifier into
+# (database, accession).
+#
+# Recognized forms:
+#   acc                    -> ("", acc)
+#   gi|12345|db|acc[|id]   -> (db, acc)
+#   12345|db|acc           -> (db, acc)   (libtype=10)
+#   db|acc[|id]            -> (db, acc)   (e.g. sp|P09488|GSTM1_HUMAN)
+#
+# A trailing ".N" version suffix is removed from the accession.  Parts
+# that are missing are returned as empty strings, never undef.
+sub parse_descr {
+  my ($descr) = @_;
+
+  my ($db, $acc);
+  my @parts = split(/\|/, $descr);
+
+  if ($descr !~ m/\|/) {
+    ($db, $acc) = ("", $descr);
+  }
+  elsif ($descr =~ m/^gi\|/) {
+    # has std gi|12345|ref|acc
+    ($db, $acc) = @parts[2,3];
+  }
+  elsif ($descr =~ m/^\d+\|\w+/) {
+    # has std 12345|ref|acc from libtype=10
+    ($db, $acc) = @parts[1,2];
+  }
+  else {
+    # plain db|acc|id.  The patterns above are anchored so that digits
+    # inside an accession (sp|P09488|...) are not mistaken for a leading
+    # gi number.
+    ($db, $acc) = @parts[0,1];
+  }
+
+  $db  = "" unless defined($db);
+  $acc = "" unless defined($acc);
+
+  # remove version number
+  $acc =~ s/\.\d+$//;
+
+  return ($db, $acc);
+}
diff --git a/misc/res2R.pl b/misc/res2R.pl
new file mode 100755
index 0000000..a8d1f75
--- /dev/null
+++ b/misc/res2R.pl
@@ -0,0 +1,22 @@
+#!/usr/bin/perl -w
+
+# convert FASTA .res file to fields for 'R'
+
+# discard the first input line
+scalar(<>);
+
+print "len\tscore\n";
+
+# whitespace-separated fields of each remaining line are:
+# [0] ACC; [1] 0; [2] len; [3] frame; [4] comp; [5] H; [6-8] score[0-2];
+# [9] rst.escore [10] segnum; [11] seglen; [12]lseek
+while (defined(my $rec = <>)) {
+  # stop at the first line containing "/**"
+  last if ($rec =~ m/\/\*\*/);
+
+  my @cols = split(/\s+/, $rec);
+
+  # only the length and the first score are passed on
+  print "$cols[2]\t$cols[6]\n";
+}
diff --git a/misc/shuffle_embed.pl b/misc/shuffle_embed.pl
new file mode 100755
index 0000000..ee034d6
--- /dev/null
+++ b/misc/shuffle_embed.pl
@@ -0,0 +1,148 @@
+#!/usr/bin/perl -w
+
+use strict;
+use Getopt::Long;
+use Pod::Usage;
+
+my ($window, $insert, $shelp, $help, $n_shuff) = (20, 1, 0, 0,1);
+
+GetOptions("window=i" => \$window,
+ "insert=i" => \$insert,
+ "n=i" => \$n_shuff,
+ "h|?" => \$shelp,
+ "help" => \$help,
+
+ );
+
+pod2usage(1) if $shelp;
+pod2usage(exitstatus => 0, verbose => 2) if $help;
+
+# Read FASTA records from the input; every time a new ">" header (or
+# end of input) is reached, hand the record collected so far to
+# process_seq().
+my $header = "";
+my $seq = "";
+
+while (defined(my $line = <>)) {
+  chomp($line);
+
+  # sequence line: append to the current record
+  unless ($line =~ /^>/) {
+    $seq .= $line;
+    next;
+  }
+
+  # header line: flush the previous record, then start a new one
+  process_seq($header, $seq, $window, $insert, $n_shuff) if ($seq);
+  $header = $line;
+  $seq = "";
+}
+
+# flush the last record
+process_seq($header, $seq, $window, $insert, $n_shuff) if ($seq);
+
+exit(0);
+
+# process_seq($header, $seq, $window, $insert, $n_shuff)
+#
+# Print $n_shuff FASTA records derived from one input record.  For each
+# record the cleaned sequence is shuffled with win_shuffle($seq, $window)
+# and split in the middle; when $insert is true the unshuffled sequence
+# is embedded between the two halves and its 1-based coordinates are
+# written to the header as "e:start-end".  With $insert false only the
+# shuffled sequence is printed and the accession gets a "_shuff" suffix.
+# When $n_shuff > 1 the shuffle number is added to the accession.
+#
+# Sequence lines are wrapped at 60 residues.
+sub process_seq {
+  my ($header, $seq, $window, $insert, $n_shuff) = @_;
+
+  # remove non amino-acids
+  $seq =~ s/[^A-Za-z]//g;
+  my $seq_len = length($seq);
+
+  # the left half gets the extra residue when the length is odd
+  my $half = int(($seq_len+1)/2);
+
+  # a record without a ">acc descr" header yields empty name parts
+  my ($acc, $descr) = ($header =~ m/^>(\S+)\s*(.*)$/);
+  $acc = "" unless defined($acc);
+  $descr = "" unless defined($descr);
+
+  for (my $shuff_cnt = 0; $shuff_cnt < $n_shuff; $shuff_cnt++) {
+
+    my $shuff_seq = win_shuffle($seq, $window);
+    my $left_sseq = substr($shuff_seq, 0, $half);
+    my $right_sseq = substr($shuff_seq, $half);
+
+    my $embed_seq = $left_sseq;
+    if ($insert) {
+      $embed_seq .= $seq;
+    }
+    $embed_seq .= $right_sseq;
+
+    if ($insert) {
+      if ($n_shuff > 1) {
+        printf ">%s_%d e:%d-%d %s\n",$acc,$shuff_cnt,length($left_sseq)+1,length($left_sseq) + $seq_len,$descr;
+      }
+      else {
+        printf ">%s e:%d-%d %s\n",$acc,length($left_sseq)+1,length($left_sseq) + $seq_len,$descr;
+      }
+    } else {
+      if ($n_shuff > 1) {
+        printf ">%s_shuff_%d %s\n",$acc,$shuff_cnt, $descr;
+      }
+      else {
+        printf ">%s_shuff %s\n",$acc, $descr;
+      }
+    }
+
+    # wrap at 60 residues; the lookahead keeps a newline from being added
+    # after the final block, which would otherwise leave an empty line in
+    # the output whenever the length is an exact multiple of 60
+    $embed_seq =~ s/(.{60})(?=.)/$1\n/g;
+
+    print "$embed_seq\n";
+  }
+}
+
+my $random_seed;
+
+# win_shuffle($seq, $win)
+#
+# Return a shuffled copy of $seq: the sequence is cut into consecutive
+# pieces of at most $win characters, the characters inside each piece
+# are shuffled, and then the order of the pieces is shuffled.
+sub win_shuffle {
+  my ($seq, $win) = @_;
+
+  # break sequence into $win len pieces
+  my @seq_arr = ($seq =~ m/(.{1,$win})/g);
+
+  # shuffle the subsets
+  for (my $j = 0; $j < @seq_arr; $j++) {
+    my @subs_arr = split(//,$seq_arr[$j]);
+    fy_shuffle(\@subs_arr);
+    $seq_arr[$j] = join("", @subs_arr);
+  }
+
+  # now shuffle the window order
+  fy_shuffle(\@seq_arr);
+  # and put it back together
+  return join("", @seq_arr);
+}
+
+# fy_shuffle array_ref
+#
+# Shuffle the referenced array in place with the Fisher-Yates algorithm.
+# Dies unless the argument is an array reference; an empty array is
+# left untouched.
+sub fy_shuffle {
+  my $arr = shift;
+
+  die "fy_shuffle (array_ref)" unless (ref($arr) eq 'ARRAY');
+
+  return unless @$arr;
+  my $i = scalar(@$arr)-1;
+
+  while ($i > 0) {
+    # pick from 0..$i inclusive: an element must be allowed to stay where
+    # it is, otherwise only cyclic permutations can be produced
+    my $is = int(rand($i+1));
+    ($arr->[$i],$arr->[$is]) = ($arr->[$is],$arr->[$i]);
+    $i--;
+  }
+}
+
+__END__
+
+=pod
+
+=head1 NAME
+
+shuffle_embed.pl
+
+=head1 SYNOPSIS
+
+shuffle_embed.pl --n=1 --insert=1 --window=20 file.seq > file.shuff_emb
+
+=head1 OPTIONS
+
+ -h short help
+ --help include description
+ --insert=0 shuffle only, do not insert unshuffled
+ --n=1 number of shuffles
+ --window size of shuffle window
+
+=head1 DESCRIPTION
+
+shuffle_embed.pl takes a fasta formatted protein or DNA sequence file,
+reads the sequence, shuffles it, splits the shuffled sequence in the
+middle, and embeds the unshuffled sequence between the two halves of
+the shuffled sequence.
+
+With --insert 0, the sequences produced are random, no unshuffled
+sequence is embedded.
+
+=cut
diff --git a/psisearch2/README.md b/psisearch2/README.md
new file mode 100644
index 0000000..5bbf1e6
--- /dev/null
+++ b/psisearch2/README.md
@@ -0,0 +1,92 @@
+
+## PSISEARCH2 - iterative PSSM-based similarity searching using PSIBLAST or SSEARCH36
+
+#### September, 2016
+
+`psisearch2_msa.pl` and `psisearch2_msa.py` (both scripts have
+identical arguments and functionality) perform iterative searches
+using PsiBLAST or ssearch36, but with additional options that
+dramatically improve search selectivity. In tests with challenging
+queries, `psisearch2_msa.pl/py` searches often reduce the number of
+false-positives more than ten fold, and sometimes 100-fold or more.
+
+For a simple test of `psisearch2`, try (from the `psisearch2/` directory):
+
+```
+ ./psisearch2_msa.pl --query ../seq/mgstm1.aa --db ../seq/prot_test.lseg
+```
+
+This command should produce the output:
+```
+#./psisearch2_msa.pl --query ../seq/mgstm1.aa --db ../seq/prot_test.lseg
+./psisearch2_msa.pl ssearch ../seq/mgstm1.aa ../seq/prot_test.lseg converged (2 iterations)
+```
+as well as four files:
+```
+mgstm1.aa.it1
+mgstm1.aa.it1.bnd_out
+mgstm1.aa.it2
+mgstm1.aa.it2.bnd_out
+```
+
+Real iterative searches must be run against comprehensive sequence
+databases, like SwissProt, RefSeq proteins, or Uniprot, e.g.:
+
+```
+ ./psisearch2_msa.pl --query ../seq/mgstm1.aa --db /slib2/swissprot.lseg
+```
+
+## More selective searches
+
+By default, `psisearch2_msa.pl` simply runs a search program
+(`ssearch36` by default, use `--pgm psiblast` to run `psiblast`),
+scans the output to produce a multiple sequence alignment, which is
+then used to build a `PSSM` for the next iteration. Running
+`psisearch2_msa.pl` for five iterations should produce results very
+similar to running `psiblast` for five iterations.
+
+`psisearch2_msa.pl` can perform much more selective searches using the
+`--query_seed` option, which is equivalent to the `--int_seed=query`
+and `--end_seed=query` options. The `--query_seed` option causes the
+`m89_btop_msa2.pl` program to insert query residues into the gapped
+positions of subject sequences in the sequence library used to produce
+the `PSSM`. This `PSSM` is slightly less sensitive than the normal
+model, but it is much less likely to produce alignment-overextension,
+so it is much less likely for alignments to extend into neighboring,
+non-homologous regions and contaminate the `PSSM` model.
+
+In addition to `--query_seed`, two other options: `--align`, and
+`--domain` can also be used to reduce alignment overextension.
+`--align` causes `psisearch2_msa.pl` to include only the portion of a
+subject sequence that aligned the first time it shared significant
+similarity with the query PSSM; additional residues from the sequence
+that are aligned in later iterations are not included. This option can
+be used with both `--pgm ssearch` and `--pgm psiblast`.
+
+The `--domain` option uses a more sophisticated strategy for including
+additional residues in a PSSM. They are included only if the
+similarity score across the domain has a probability less than 0.001
+(q-value 30). This option is only available for `--pgm ssearch`, and
+requires that a second option, `--annot_db`, be specified. Typically
+`--annot_db=pfam`.
+
+## Customizing psisearch2
+
+`psisearch2_msa.pl` is a script that uses other programs for
+similarity searching and constructing the PSSM (and for annotating
+domains in alignments if the `--domain` option is used). The location
+of these programs is defined in the `psisearch2_msa.pl` and
+`psisearch2_msa.py` scripts using the `$pgm_bin` and `$pgm_data`
+variables (perl; `pgm_bin` and `pgm_data` for python). You will
+probably need to modify those variables for your installation. In
+particular, the NCBI `datatool` program is required for producing the
+asn binary files required by `ssearch36`.
+
+## `psisearch2_msa.pl/py` output
+
+In this current version, the `psisearch2_msa.pl` and
+`psisearch2_msa.py` programs *ONLY* produce tab-delimited BTOP output.
+The programs do not produce the traditional `psiblast` alignment,
+which includes a list of hits with E()-values, and a set of
+alignments. More alignment output flexibility will be available soon.
+
diff --git a/psisearch2/m89_btop_msa2.pl b/psisearch2/m89_btop_msa2.pl
new file mode 100755
index 0000000..0e3d679
--- /dev/null
+++ b/psisearch2/m89_btop_msa2.pl
@@ -0,0 +1,927 @@
+#!/usr/bin/perl -w
+
+################################################################
+# copyright (c) 2014,2015 by William R. Pearson and The Rector &
+# Visitors of the University of Virginia
+################################################################
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under this License is distributed on an "AS
+# IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+################################################################
+
+################################################################
+# m89_btop_msa2.pl --query query.file blast_tab_btop_file
+################################################################
+# m89_btop_msa2.pl takes a query sequence and either a BLAST -outfmt
+# 7/fasta -m 8CB output file (default) or fasta -m 8B (--m_format m9)
+# output file with a BTOP field, and constructs a query-driven
+# multiple sequence alignment of the subject sequences that can be
+# used as input to psiblast with the "-in_msa msa.file" option.
+#
+# (because BLAST BTOP encoding provides the mismatched residues, the
+# library sequences are not required to produce the MSA -- they are
+# available in the BTOP string)
+#
+# The BTOP alignment encoding file is generated from "blastp/n" or
+# "blast_formatter" using the command: blast_formatter -archive
+# blast_output.asn -outfmt '7 qseqid sseqid pident length mismatch
+# gapopen qstart qend sstart send evalue bitscore score btop' >
+# blast_output.tab_annot
+#
+################################################################
+
+use strict;
+use Pod::Usage;
+use Getopt::Long;
+
+# read lines of the form:
+# gi|121694|sp|P20432.1|GSTT1_DROME gi|121694|sp|P20432|GSTT1_DROME 100.00 209 0 0 1 209 1 209 6e-156 433 1113 209
+# gi|121694|sp|P20432.1|GSTT1_DROME gi|1170090|sp|P04907|GSTF3_MAIZE 26.77 198 123 7 4 185 6 197 2e-08 51.2 121 FL1YG ... 1NKRA1YW1
+# gi|121694|sp|P20432.1|GSTT1_DROME gi|81174731|sp|P0ACA5|SSPA_ECO57 39.66 58 32 2 43 100 49 103 8e-06 43.9 102 EDFLLI ... V-I-NEQS3FM
+# gi|121694|sp|P20432.1|GSTT1_DROME gi|121695|sp|P12653|GSTF1_MAIZE 27.62 181 107 7 32 203 34 199 9e-05 40.8 94 LI1LF ... N-1AS1CLLM1
+
+
+# Command-line option storage and defaults:
+#   $m_format      input format selector, default "m8CB"
+#   $evalue        inclusion threshold compared against the per-hit E-value, default 0.001
+#   $qvalue        domain Q-value threshold used with --domain, default 30.0
+#   $domain_bound  flag: derive subject boundaries from domain annotations
+my ($shelp, $help, $m_format, $evalue, $qvalue, $domain_bound) = (0, 0, "m8CB", 0.001, 30.0,0);
+my ($query_file, $bound_file_in, $bound_file_only, $bound_file_out, $masked_lib_out,$mask_type_end, $mask_type_int) = ("","","","","","","");
+# NOTE(review): $query_lib_r is not referenced anywhere else in the visible script
+my $query_lib_r = 0;
+# $eval2_fmt is set by skip_to_m9results()/skip_to_m8results() when the input
+# carries a second E-value column; $eval2 is the user's --eval2 choice
+my ($eval2_fmt, $eval2) = (0,"");
+
+# Several options have aliases that write to the same variable.
+GetOptions(
+ "query=s" => \$query_file,
+ "query_file=s" => \$query_file,
+ "eval2=s" => \$eval2, # change the evalue used for inclusion
+ "evalue=f" => \$evalue,
+ "expect=f" => \$evalue,
+ "qvalue=f" => \$qvalue,
+ "format=s" => \$m_format,
+ "m_format=s" => \$m_format,
+ "mformat=s" => \$m_format,
+ "bound_file_in=s" => \$bound_file_in,
+ "bound_file_only=s" => \$bound_file_only,
+ "bound_file_out=s" => \$bound_file_out,
+ "bound_in=s" => \$bound_file_in,
+ "bound_only=s" => \$bound_file_only,
+ "bound_out=s" => \$bound_file_out,
+ "masked_library_out=s" => \$masked_lib_out,
+ "masked_lib_out=s" => \$masked_lib_out,
+ "mask_lib_out=s" => \$masked_lib_out,
+ "mask_out=s" => \$masked_lib_out,
+ "end_mask_type=s" => \$mask_type_end,
+ "end_mask=s" => \$mask_type_end,
+ "domain_bound" => \$domain_bound,
+ "domain" => \$domain_bound,
+ "int_mask_type=s" => \$mask_type_int,
+ "int_mask=s" => \$mask_type_int,
+ "h|?" => \$shelp,
+ "help" => \$help,
+ );
+
+# -h/-? prints the short usage; --help prints the full POD
+pod2usage(1) if $shelp;
+pod2usage(exitstatus => 0, verbose => 2) if $help;
+# print usage when there is nothing to read: STDIN is neither a plain file
+# nor a pipe, and no input file names remain in @ARGV
+unless (-f STDIN || -p STDIN || @ARGV) {
+ pod2usage(1);
+}
+
+################
+# initialization
+# the random residue pool is only built here when --end_mask_type starts with "rand"
+my @random_res = ();
+if ($mask_type_end =~ m/^rand/i) {
+ @random_res = init_random_res();
+}
+
+# column names used as hash keys when splitting a result line
+# (@m8_field_names is extended below once the "# Fields:" header has been read)
+my @m9_field_names = qw(percid perc_sim raw_score a_len q_start q_end qc_start qc_end s_start s_end sc_start sc_end gap_q gap_l fs);
+my @m8_field_names = qw(q_seqid s_seqid percid a_len mismatch gopen q_start q_end s_start s_end evalue bits);
+
+# NOTE(review): @hit_list is not used in the visible part of the script
+my @hit_list = ();
+
+my %seq_bound = (); # boundaries for each accession
+my %acc_names = (); # generate uniq s_seq_id names
+my %multi_align = (); # MSA rows: name => array ref of one character per query position
+my @multi_names = (); # MSA row names in output order (query first)
+
+################
+# get query sequence, and insert into MSA
+#
+my ($query_acc, $query_seq_r, $query_len);
+if ($query_file) {
+ ($query_acc, $query_seq_r) = parse_query_lib($query_file);
+ $query_len = scalar(@$query_seq_r)-1; # -1 for ' ' 1: offset
+}
+
+if (! $query_file || !$query_len) {
+ die "query sequence required";
+}
+
+# the query is the first MSA row; a BTOP consisting only of the query length
+# means "all positions match", so the row is the query sequence itself
+push @multi_names, $query_acc;
+$acc_names{$query_acc} = 1;
+$multi_align{$query_acc} = btop2alignment($query_seq_r, $query_len, {BTOP=>$query_len, q_start=>1, q_end=>$query_len}, 0);
+my $max_sseqid_len = length($query_acc);
+
+################
+# get sequence boundaries if available
+#
+my $seq_bound_hr = 0;
+my @seq_bound_accs = ();
+
+# default: boundaries accumulate in the (initially empty) %seq_bound
+$seq_bound_hr = \%seq_bound;
+
+# --bound_file_in takes precedence over --bound_file_only when both are given
+if ($bound_file_in) {
+ $seq_bound_hr = parse_bound_file($bound_file_in);
+}
+elsif ($bound_file_only) {
+ $seq_bound_hr = parse_bound_file($bound_file_only);
+}
+
+################
+# skip down to "The best scores are:"
+#
+my ($q_num, $query_descr, $q_len, $lib_cnt, $lib_len, $best_yes, $last_fields_r);
+
+if ($m_format =~ m/^m9/i) {
+ # NOTE(review): skip_to_m9results() returns more values than are unpacked
+ # here; only $q_num and $query_descr line up with what it returns
+ ($q_num, $query_descr, $q_len, $lib_cnt, $lib_len, $best_yes) = skip_to_m9results();
+ warn "Cannot find the best scores are:" unless $query_descr;
+}
+elsif ($m_format =~ m/^m8/) {
+ ($query_descr, $best_yes, $last_fields_r) = skip_to_m8results();
+ warn "Cannot find the best scores are: in $ARGV" unless $query_descr;
+
+ # append the trailing columns announced by the "# Fields:" header
+ # (eval2 / score / BTOP), then the annotation column
+ if (scalar(@{$last_fields_r})) {
+ push @m8_field_names, @{$last_fields_r};
+ }
+ push @m8_field_names, "annot";
+}
+else {
+ die "cannot recognize format: $m_format";
+}
+
+# choose the function that extracts the E-value compared against $evalue;
+# the alternatives are only used when the input has a second E-value column
+my $eval_fptr = \&eval_func;
+if ($eval2_fmt && $eval2) {
+ if($eval2 eq 'eval2') {
+ $eval_fptr = \&eval2_func;
+ }
+ elsif ($eval2 eq 'ave') {
+ $eval_fptr = \&eval_ave;
+ }
+}
+
+# derive the query accession from the query description: either
+# gi|NNN|db|acc|id, the literal 'unnamed', or db|acc|id
+my ($tmp, $gi, $q_db, $q_acc, $q_id);
+
+if ($query_descr =~ /^gi\|\d+\|/) {
+ ($tmp, $gi, $q_db, $q_acc, $q_id) = split(/\|/,$query_descr);
+}
+elsif ($query_descr eq 'unnamed') {
+ $q_acc = 'unnamed'
+}
+else {
+ ($q_db,$q_acc, $q_id) = split(/\|/,$query_descr);
+}
+
+# descriptions without '|' separators leave $q_acc unset; use the whole description
+unless ($q_acc) {
+ $q_acc = $query_descr;
+}
+
+$acc_names{$q_acc} = 1; # this is necessary for the new acc-only NCBI SwissProt libraries
+
+# strip a trailing ".N" version suffix (done after the versioned name was registered above)
+$q_acc =~ s/\.\d+$//;
+
+# Main loop: read one hit per line, filter it, and add one MSA row per
+# accepted subject sequence.  Rows are stored in %multi_align, row order in
+# @multi_names; subject boundaries are recorded in $seq_bound_hr / @seq_bound_accs.
+while (my $line = <>) {
+
+ chomp $line;
+ next unless ($line);
+ my %hit_data =();
+ my ($s_seqid, $subj_acc, $s_seqid_u);
+ my $annot_f='NULL';
+
+ if ($m_format =~ m/^m9/i) {
+ # a line containing ">>>" ends the list of hits
+ last if $line =~ m/>>>/;
+ next if $line =~ m/^\+\-/; # skip over HSPs
+ # tab-separated: description+scores, alignment coordinates, BTOP string, annotation
+ my ($left, $right, $align_f) = ("","",'NULL');
+ ($left, $right, $align_f, $annot_f) = split(/\t/,$line);
+
+ $align_f= 'NULL' unless $align_f;
+ $annot_f= 'NULL' unless $annot_f;
+
+ # the subject identifier is either "db:id acc" or "db|acc|id"
+ my @fields = split(/\s+/,$left);
+ my ($ldb, $l_id, $l_acc) = ("","","");
+ if ($fields[0] =~ m/:/) {
+ ($ldb, $l_id) = split(/:/,$fields[0]);
+ ($l_acc) = $fields[1];
+ } else {
+ ($ldb, $l_acc,$l_id) = split(/\|/,$fields[0]);
+ }
+
+ @hit_data{@m9_field_names} = split(/\s+/,$right);
+ # bit score and E-value(s) are the last whitespace-separated fields of $left
+ if ($eval2_fmt) {
+ @hit_data{qw(bits evalue eval2)} = @fields[-3, -2,-1];
+ }
+ else {
+ @hit_data{qw(bits evalue)} = @fields[-2,-1];
+ }
+
+ #
+ # currently preselbdr files have $ldb|$l_acc, not full s_seqid, so construct it
+ #
+ ($s_seqid, $subj_acc) = (join('|',($ldb, $l_acc, $l_id)), "$ldb|$l_acc");
+ @hit_data{qw(s_seqid subj_acc)} = ($s_seqid, $subj_acc);
+ @hit_data{qw(query_id query_acc)} = ($query_descr, $q_acc);
+ $hit_data{BTOP} = $align_f;
+ $hit_data{annot} = $annot_f;
+ }
+ else {
+ # tabular (m8) input: a '#' comment line ends the list of hits
+ last if $line =~ m/^#/;
+ @hit_data{@m8_field_names} = split(/\t/,$line);
+ # reduce gi|NNN|db|acc|id to db|acc for the boundary lookups
+ $subj_acc = $hit_data{'s_seqid'};
+ $subj_acc =~ s/^gi\|\d+\|(\w+\|\w+)\|?\w+/$1/;
+ }
+
+ # a better solution would be to rename the q_seqid, or at least to
+ # check for identity
+ # next if ($hit_data{q_seqid} eq $hit_data{s_seqid});
+ # skip a full-length hit whose BTOP is just the query length (all positions
+ # identical), i.e. the query found in the library
+ next if ($hit_data{a_len} == $query_len && $hit_data{BTOP} =~ m/^$query_len$/);
+
+ # only the first line seen for each subject identifier is used
+ $s_seqid_u = $hit_data{'s_seqid'};
+ if ($acc_names{$s_seqid_u}) {
+ next; # skip additional HSPs
+# $acc_names{$s_seqid_u}++;
+# $s_seqid_u .= "_". $acc_names{$subj_acc};
+ }
+ else {
+ $acc_names{$hit_data{'s_seqid'}} = 1;
+ }
+
+ # must be after duplicate seqid check because blast HSP's have bad E-values after good.
+ next if ($eval_fptr->(\%hit_data) > $evalue);
+
+ $hit_data{s_seqid_u} = $s_seqid_u;
+
+ # track the longest row name for the output column width
+ if (length($s_seqid_u) > $max_sseqid_len) {
+ $max_sseqid_len = length($s_seqid_u);
+ }
+
+ my $have_dom = 0;
+ if ($domain_bound && $hit_data{annot}) {
+ my $hit_doms_ar = parse_hit_domains($hit_data{annot});
+ # scan from left to right to make domain boundaries based on $qvalue
+ # the following seems reversed, but it is putting upper (and lower) limits on boundaries
+ my ($left_bound, $right_bound) = @hit_data{qw(s_end s_start)};
+ foreach my $dom_r ( @$hit_doms_ar ) {
+ next unless $dom_r->{target} eq 'subj';
+ # next if $dom_r->{virtual}; # should be controlled by annotation process
+ # only subject domains whose Q-value exceeds the --qvalue threshold count
+ next unless $dom_r->{qval} > $qvalue;
+
+ if ($dom_r->{s_start} < $left_bound) {
+ $left_bound = $dom_r->{s_start};
+ $have_dom = 1;
+ }
+
+ if ($dom_r->{s_end} > $right_bound) {
+ $right_bound = $dom_r->{s_end};
+ $have_dom = 1;
+ }
+ }
+
+ # record (or overwrite) the subject boundaries with the domain-derived ones
+ if ($have_dom) {
+ if (exists($seq_bound_hr->{$subj_acc})) {
+ @{$seq_bound_hr->{$subj_acc}}{qw(start end)} = ($left_bound, $right_bound);
+ }
+ else {
+ $seq_bound_hr->{$subj_acc} = {start=>$left_bound, end=>$right_bound};
+ push @seq_bound_accs, $subj_acc;
+ }
+ }
+ }
+
+ # must have separate @hit_list that can be sorted, for searches with multiple alignment results
+
+ # three cases follow:
+ #  (1) --bound_file_only or domain boundaries: only subjects with a boundary entry are used
+ #  (2) --bound_file_in: known subjects are clipped, new subjects are added with their alignment boundaries
+ #  (3) no boundaries: every accepted hit is added unclipped
+ if ($bound_file_only || $have_dom) {
+ if (exists($seq_bound_hr->{$subj_acc})) {
+ my ($status, $alignment) = bound_btop2alignment($query_seq_r, $query_len, \%hit_data, @{$seq_bound_hr->{$subj_acc}}{qw(start end)});
+ if ($status) { # aligment is within boundary
+ push @multi_names, $hit_data{s_seqid_u};
+ $multi_align{$hit_data{s_seqid_u}} = $alignment;
+ }
+ # do not delete entry, because it needs to be preserved
+ }
+ }
+ elsif ($bound_file_in) {
+ if (exists($seq_bound_hr->{$subj_acc})) {
+ my ($status, $alignment) = bound_btop2alignment($query_seq_r, $query_len, \%hit_data, @{$seq_bound_hr->{$subj_acc}}{qw(start end)});
+ if ($status) {
+ push @multi_names, $hit_data{s_seqid_u};
+ $multi_align{$hit_data{s_seqid_u}} = $alignment;
+# push @multi_align, $alignment;
+ }
+ }
+ else {
+ push @multi_names, $hit_data{s_seqid_u};
+# push @multi_align, btop2alignment($query_seq_r, $query_len, \%hit_data, );
+ $multi_align{$hit_data{s_seqid_u}} = btop2alignment($query_seq_r, $query_len, \%hit_data);
+ @{$seq_bound_hr->{$subj_acc}}{qw(start end)} = @hit_data{qw(s_start s_end)};
+ push @seq_bound_accs, $subj_acc;
+ }
+ }
+ else { # no sequence boundaries
+ push @multi_names, $hit_data{s_seqid_u};
+ $multi_align{$hit_data{s_seqid_u}} = btop2alignment($query_seq_r, $query_len, \%hit_data);
+# push @multi_align, btop2alignment($query_seq_r, $query_len, \%hit_data);
+ # remember the alignment boundaries so they can be written to --bound_file_out
+ if (!$have_dom && ($bound_file_out)) {
+ @{$seq_bound_hr->{$subj_acc}}{qw(start end)} = @hit_data{qw(s_start s_end)};
+ push @seq_bound_accs, $subj_acc;
+ }
+ }
+}
+
+# final MSA output
+#
+# Prints a title line, then the alignment in interleaved blocks of up to 60
+# query positions.  Every block lists each named row, left-justified in a
+# column two characters wider than the longest row name, and is followed by
+# two empty lines.
+$max_sseqid_len += 2;
+
+printf "BTOP%s multiple sequence alignment\n\n\n",$m_format;
+
+my $row_fmt = "%-".$max_sseqid_len."s %s\n";
+my $i_pos = 0;
+while ($i_pos < $query_len) {
+    # last (0-based) column of this block, clipped to the end of the query
+    my $i_end = $i_pos + 59;
+    $i_end = $query_len-1 if ($i_end >= $query_len);
+
+    foreach my $acc (grep { $_ } @multi_names) {
+        my $row_seg = join("", @{$multi_align{$acc}}[$i_pos .. $i_end]);
+        printf($row_fmt, $acc, $row_seg);
+    }
+    print "\n\n";
+    $i_pos += 60;
+}
+
+################
+# when --bound_file_out is given, write one tab-delimited
+# accession<tab>start<tab>end line per recorded accession, in the order the
+# accessions were recorded
+if ($bound_file_out) {
+    open(my $bound_fd, ">", $bound_file_out) || die "cannot open $bound_file_out";
+    foreach my $bound_acc ( @seq_bound_accs ) {
+        my ($b_start, $b_end) = @{$seq_bound_hr->{$bound_acc}}{qw(start end)};
+        print $bound_fd join("\t", $bound_acc, $b_start, $b_end), "\n";
+    }
+    close($bound_fd);
+}
+
+################
+# when --masked_lib_out is given, write the MSA rows as a FASTA library
+#
+# Gap characters ('-') in each row are rewritten according to
+# --end_mask_type (leading/trailing gaps) and --int_mask_type (internal gaps):
+#  (1) neither given: simply delete the '-'s
+#  (2) end "x": leading/trailing '-' become 'X'
+#  (3) end "query": leading/trailing '-' become the query residue at that
+#      column; with int "query" every '-' does, with int "rand" the
+#      internal '-'s become random residues
+#  (4) end "rand": leading/trailing '-' become random residues; with
+#      int "rand" every '-' does
+#  (5) int "X" (upper case): any '-' still present becomes 'X'
+#  (6) otherwise any '-' still present is deleted
+# The sequence is written 60 residues per line.
+if ($masked_lib_out) {
+    open(my $masked_fd, ">", $masked_lib_out) || die "cannot open $masked_lib_out";
+
+    # The random residue pool is only built during initialization when
+    # --end_mask_type is "rand".  Build it here as well, so that
+    # "--int_mask_type rand" with "--end_mask_type query" does not pick
+    # residues from an empty array.
+    if (!@random_res && ($mask_type_end =~ m/^rand/i || $mask_type_int =~ m/^rand/i)) {
+        @random_res = init_random_res();
+    }
+    my $n_rand_res = scalar(@random_res);
+
+    for my $s_acc ( @multi_names ) {
+        print $masked_fd ">$s_acc\n";
+
+        # work on a copy so the MSA itself is not modified
+        my @masked_seq = @{$multi_align{$s_acc}};
+        my $n_res = scalar(@masked_seq);
+
+        if ($mask_type_end =~ m/x/i) {
+            # leading gaps, then trailing gaps
+            for (my $i=0; $i < $n_res; $i++) {
+                last if ($masked_seq[$i] ne '-');
+                $masked_seq[$i] = 'X';
+            }
+            for (my $i=$n_res-1; $i >= 0; $i--) {
+                last if ($masked_seq[$i] ne '-');
+                $masked_seq[$i] = 'X';
+            }
+        }
+        elsif ($mask_type_end =~ m/^q/i) {
+            if ($mask_type_int =~ m/^q/i) {
+                # query residues everywhere there is a gap
+                for (my $i=0; $i < $n_res; $i++) {
+                    if ($masked_seq[$i] eq '-') {
+                        $masked_seq[$i] = $multi_align{$query_acc}[$i];
+                    }
+                }
+            }
+            else {
+                # $li/$ri end up at the first/last aligned column
+                my $li = 0;
+                for ( ; $li < $n_res; $li++) {
+                    last if ($masked_seq[$li] ne '-');
+                    $masked_seq[$li] = $multi_align{$query_acc}[$li];
+                }
+                my $ri = $n_res-1;
+                for ( ; $ri >= 0; $ri--) {
+                    last if ($masked_seq[$ri] ne '-');
+                    $masked_seq[$ri] = $multi_align{$query_acc}[$ri];
+                }
+                if ($mask_type_int =~ m/^rand/i) {
+                    for (my $i=$li; $i <= $ri; $i++) {
+                        if ($masked_seq[$i] eq '-') {
+                            $masked_seq[$i] = $random_res[int(rand($n_rand_res))];
+                        }
+                    }
+                }
+            }
+        }
+        elsif ($mask_type_end =~ m/^rand/i) {
+            if ($mask_type_int =~ m/^rand/i) {
+                # random residues everywhere there is a gap
+                for (my $i=0; $i < $n_res; $i++) {
+                    if ($masked_seq[$i] eq '-') {
+                        $masked_seq[$i] = $random_res[int(rand($n_rand_res))];
+                    }
+                }
+            }
+            else {
+                for (my $i=0; $i < $n_res; $i++) {
+                    last if ($masked_seq[$i] ne '-');
+                    $masked_seq[$i] = $random_res[int(rand($n_rand_res))];
+                }
+                for (my $i=$n_res-1; $i >= 0; $i--) {
+                    last if ($masked_seq[$i] ne '-');
+                    $masked_seq[$i] = $random_res[int(rand($n_rand_res))];
+                }
+            }
+        }
+
+        # whatever gaps remain are either replaced by 'X' or removed
+        my $masked_seq = join("",@masked_seq);
+        if ($mask_type_int =~ m/X/) {
+            $masked_seq =~ s/\-/X/g;
+        }
+        else {
+            $masked_seq =~ s/\-//g;
+        }
+
+        # wrap at 60 residues per line
+        $masked_seq =~ s/(.{60})/$1\n/g;
+        print $masked_fd "$masked_seq\n";
+    }
+    close($masked_fd);
+}
+
+# decode_btop -- tokenize a blast BTOP string
+#
+# input: a BTOP string of the form: "1VA160TS7KG10RK27"
+# returns an array ref of tokens: (1, "VA", 160, "TS", 7, "KG", 10, "RK", 27)
+# Runs of digits are kept whole; everything between them is cut into
+# two-character pairs (an unpaired trailing character is kept as its own token).
+#
+sub decode_btop {
+    my ($btop_str) = @_;
+
+    # splitting on a captured digit run keeps the digit runs in the result
+    my @pieces = split(/(\d+)/, $btop_str);
+
+    # a string that starts with digits yields an empty leading piece
+    shift @pieces unless $pieces[0];
+
+    my @out_tokens = map {
+        ($_ =~ m/^\d+$/) ? $_ : (grep { $_ } split(/(..)/, $_))
+    } @pieces;
+
+    return \@out_tokens;
+}
+
+# parse_hit_domains -- turn a '|'-separated domain annotation string into a
+# list of per-domain hashes
+#
+# input: $annot_str (format shown below)
+# returns: 0 when $annot_str is empty/false; otherwise an array ref of hash
+# refs with keys dom, virtual, bits, percid, qval, target, q_start, q_end,
+# s_start, s_end and score.  Entries whose domain name contains NODOM are
+# left out.
+sub parse_hit_domains {
+ my ($annot_str) = @_;
+
+## annot_str looks like: "|RX:6-65:6-65:s=311;b=125.4;I=1.000;Q=339.6;C=C.HTH~1
+# |XR:6-65:6-65:s=311;b=125.4;I=1.000;Q=339.6;C=C.HTH~1
+# |RX:66-297:66-297:s=1200;b=483.7;I=1.000;Q=1409.6;C=NODOM~0
+# |XR:66-297:66-297:s=1200;b=483.7;I=1.000;Q=1409.6;C=NODOM~0
+
+ return 0 unless ($annot_str);
+
+ my @hit_annots = ();
+
+ my @annots = split(/\|/,$annot_str);
+ shift @annots; # remove first blank
+
+ for my $annot ( @annots ) {
+ my %dom_info = ();
+
+ # parse an entry:
+ # |RX:6-65:6-65:s=311;b=125.4;I=1.000;Q=339.6;C=C.HTH~1
+ # ';' separates: [0] type:coords:coords:s=score, [1] b=, [2] I=, [3] Q=, [4] C=
+ my @d_fields = split(";",$annot);
+
+ ($dom_info{dom}) = ($d_fields[4] =~ m/C=(.+?)~?\d*v?$/); # also remove virtual domain symbols
+ next if ($dom_info{dom} =~ m/NODOM/);
+
+ ################
+ # parse @d_fields
+ # a trailing 'v' on the C= field marks the domain as virtual
+ if ($d_fields[4] =~ m/v$/) {
+ $dom_info{virtual} = 1;
+ }
+ else {
+ $dom_info{virtual} = 0;
+ }
+
+ # missing numeric fields are reported and stored as the two-character string '\N'
+ ($dom_info{bits}) = ($d_fields[1] =~ m/b=(\-?\d+\.?\d*)/);
+ unless (defined($dom_info{bits})) {
+ warn "missing score info - annot: $annot\n annot_str: $annot_str";
+ $dom_info{bits} = '\N';
+ }
+ ($dom_info{percid}) = ($d_fields[2] =~ m/I=(\-?[\d\.]+)/);
+ unless (defined($dom_info{percid})) {
+ warn "missing percid info - annot: $annot\n annot_str: $annot_str";
+ $dom_info{percid} = '\N';
+ }
+
+ # NOTE(review): qval is left undefined (no warning, no default) when Q= is absent
+ ($dom_info{qval}) = ($d_fields[3] =~ m/Q=([\d\.]+)/);
+
+ ################
+ # parse @c_fields
+ # ':' separates: [0] type (RX/XR), [1] query range, [2] subject range, [3] s=score
+ my @c_fields = split(":",$d_fields[0]);
+
+ # a type containing "RX" is labelled 'query'; anything else 'subj'
+ if ($c_fields[0] =~ m/RX/) {$dom_info{target} = 'query';}
+ else {$dom_info{target} = 'subj';}
+
+ @dom_info{qw(q_start q_end)} = ($c_fields[1] =~ m/(\d+)\-(\d+)/);
+ @dom_info{qw(s_start s_end)} = ($c_fields[2] =~ m/(\d+)\-(\d+)/);
+ ($dom_info{score}) = ($c_fields[3] =~ m/s=(\-?\d+)/);
+ unless (defined($dom_info{score})) {
+ warn "missing score info - annot: $annot\n annot_str: $annot_str";
+ $dom_info{score} = '\N';
+ }
+
+ push @hit_annots, \%dom_info;
+ }
+
+ return \@hit_annots;
+}
+
+
+################
+# btop2alignment -- build one MSA row from a hit's BTOP string
+#
+# arguments:
+#   $query_seq_r   array ref, query residues, 1-based (element 0 is a placeholder)
+#   $query_len     number of query residues
+#   $hit_data_hr   hash ref with at least BTOP, q_start and q_end
+#   $seq_bound_hr  accepted for compatibility with existing callers; not used
+# returns: array ref with one character per query position: '-' outside the
+#   aligned region and where the subject has a gap, the query residue at
+#   identities, the subject residue at mismatches.  Subject residues aligned
+#   to a gap in the query are dropped, so the row always follows the query
+#   coordinates.
+sub btop2alignment {
+    my ($query_seq_r, $query_len, $hit_data_hr, $seq_bound_hr) = @_;
+
+    # $query_seq_r is 1: based
+    my @alignment = ();
+
+    # the left unaligned region gets "-"
+    for (my $i=1; $i < $hit_data_hr->{q_start}; $i++) {
+        push @alignment, "-";
+    }
+
+    my $btop_align_r = decode_btop($hit_data_hr->{BTOP});
+
+    my ($seq0, $seq1) = ("","");
+    my $qix = $hit_data_hr->{q_start};
+    for my $btop (@{$btop_align_r}) {
+        if ($btop =~ m/^\d+$/) {   # run of identities, copy the query residues
+            for (my $i=0; $i < $btop; $i++) {
+                push @alignment, $query_seq_r->[$qix++];
+            }
+        }
+        else {   # could be: TS/-S/T-   (query residue first, subject second)
+            ($seq0, $seq1) = split(//,$btop);
+            if ($seq0 ne '-') {   # query position present: mismatch or subject gap
+                push @alignment, $seq1;
+                $qix++;
+            }
+        }
+    }
+    # all done with alignment, double check that $qix = $hit_data_hr->{q_end}
+    unless ($qix == $hit_data_hr->{q_end}+1) {
+        # parenthesized: '.' and '+' have equal precedence, so without the
+        # parentheses the whole message would be treated as a number
+        warn "$qix != ".($hit_data_hr->{q_end}+1);
+    }
+
+    # the right unaligned region gets "-"
+    for (my $i = $hit_data_hr->{q_end}+1; $i <= $query_len; $i++) {
+        push @alignment, "-";
+    }
+
+    return \@alignment;
+}
+
+################
+# bound_btop2alignment -- like btop2alignment(), but only subject residues
+# whose subject coordinate lies between $sb_start and $sb_end (inclusive) are
+# placed in the row; aligned positions outside that range become '-'
+#
+# arguments:
+#   $query_seq_r   array ref, query residues, 1-based
+#   $query_len     number of query residues
+#   $hit_data_hr   hash ref with at least BTOP, q_start, q_end and s_start
+#   $sb_start, $sb_end   subject boundaries
+# returns: ($status, $alignment_r); $status is 0 if there are no aligned
+#   residues between the boundaries, 1 otherwise
+
+sub bound_btop2alignment {
+    my ($query_seq_r, $query_len, $hit_data_hr, $sb_start, $sb_end) = @_;
+
+    # $query_seq_r is 1: based
+    my @alignment = ();
+
+    my $have_aligned_res = 0;
+
+    # the left unaligned region gets "-"
+    for (my $i=1; $i < $hit_data_hr->{q_start}; $i++) {
+        push @alignment, "-";
+    }
+
+    my $btop_align_r = decode_btop($hit_data_hr->{BTOP});
+
+    my ($seq0, $seq1) = ("","");
+    # $qix/$six track the current query/subject coordinate
+    my ($qix, $six) = @{$hit_data_hr}{qw(q_start s_start)};
+
+    for my $btop (@{$btop_align_r}) {
+        if ($btop =~ m/^\d+$/) {   # run of identities
+            for (my $i=0; $i < $btop; $i++) {
+                if ($six >= $sb_start && $six <= $sb_end) {
+                    push @alignment, $query_seq_r->[$qix];
+                    $have_aligned_res=1;
+                }
+                else {
+                    push @alignment, '-';
+                }
+                $qix++; $six++;
+            }
+        }
+        else {   # could be: TS/-S/T-   (query residue first, subject second)
+            ($seq0, $seq1) = split(//,$btop);
+            if ($seq1 eq '-') {   # gap in subject
+                push @alignment, '-';
+                $qix++;
+            }
+            elsif ($seq0 ne '-') {   # mismatch
+                if ($six >= $sb_start && $six <= $sb_end) {
+                    $have_aligned_res=1;
+                    push @alignment, $seq1;
+                }
+                else {
+                    push @alignment, '-';
+                }
+                $qix++;
+                $six++;
+            }
+            else {   # gap in query, consume $six
+                $six++;
+            }
+        }
+    }
+    # all done with alignment, double check that $qix = $hit_data_hr->{q_end}
+    unless ($qix == $hit_data_hr->{q_end}+1) {
+        # parenthesized: '.' and '+' have equal precedence, so without the
+        # parentheses the whole message would be treated as a number
+        warn $qix." != ".($hit_data_hr->{q_end}+1);
+    }
+
+    # the right unaligned region gets "-"
+    for (my $i = $hit_data_hr->{q_end}+1; $i <= $query_len; $i++) {
+        push @alignment, "-";
+    }
+
+    return ($have_aligned_res, \@alignment);
+}
+
+################
+# parse_query_lib -- read the query sequence from a FASTA file
+#
+# argument: $query_file, name of a FASTA file
+# returns: ($header, \@seq)
+#   $header  the text of the '>' line up to the first white space, without '>'
+#   \@seq    the residues, upper-cased, 1-based (element 0 is an empty string)
+# dies if the file cannot be opened.
+# NOTE(review): the file is expected to hold a single entry; with several
+# entries all sequence lines are concatenated and the last header is kept.
+sub parse_query_lib {
+    my ($query_file) = @_;
+
+    open(my $qfd, "<", $query_file) || die "cannot open query file $query_file: $!";
+
+    my ($header, $sequence) = ("","");
+    while (my $entry = <$qfd>) {   # one line at a time
+        chomp $entry;
+        if ($entry =~ m/^>/) {
+            $header = $entry;
+        }
+        else {
+            $sequence .= $entry
+        }
+    }
+    close($qfd);
+
+    $sequence =~ s/[^A-Za-z\*]//g;   # remove everything but letters and '*'
+    $sequence = uc($sequence);
+
+    $header =~ s/^>//;
+    $header =~ s/\s.*$//;
+
+    my @seq = split(//,$sequence);
+    unshift @seq,"";   # @seq is now 1-based
+
+    return ($header, \@seq);
+}
+
+################
+# parse_bound_file -- read subject boundaries from a tab-delimited file
+#
+# argument: $bound_file, lines of accession<tab>start<tab>end; lines
+#   beginning with '#' are ignored
+# side effects: fills the file-level %seq_bound (accession => {start, end})
+#   and appends each new accession to @seq_bound_accs; a repeated accession
+#   keeps its first boundaries and produces a warning
+# returns: a reference to %seq_bound.  When the file cannot be opened a
+#   warning is issued and the reference is still returned, so that callers,
+#   which dereference the result, keep working.
+sub parse_bound_file {
+    my ($bound_file) = @_;
+
+    my $bfd;
+    unless (open($bfd, "<", $bound_file)) {
+        warn "cannot open boundary file $bound_file: $!";
+        return \%seq_bound;
+    }
+
+    while (my $line = <$bfd>) {
+        next if ($line =~ m/^#/);
+        chomp $line;
+        my @data = split(/\t/,$line);
+        # skip blank or incomplete lines instead of recording undefined boundaries
+        next unless (scalar(@data) >= 3);
+
+        if (!defined($seq_bound{$data[0]})) {
+            $seq_bound{$data[0]} = {start=>$data[1], end=>$data[2]};
+            push @seq_bound_accs, $data[0];
+        }
+        else {
+            warn "multiple boundaries for $data[0]";
+        }
+    }
+    close($bfd);
+
+    return \%seq_bound;
+}
+
+################
+# skip_to_m9results -- advance the input (<>) to the list of hits of a
+# FASTA -m 9 output file
+#
+# Reads up to the "N>>>query ... - LEN aa" line, takes the library size from
+# the second line after it, and then reads on until "The best scores are:"
+# (or "!! No sequences").
+#
+# side effect: sets the file-level $eval2_fmt to 1 when the
+#   "The best scores are:" line has an E2() column
+# returns: ($q_num, $query_desc, $q_len, $l_num, $l_len, $best_yes), in the
+#   order the caller unpacks them; $best_yes is 1 only if
+#   "The best scores are:" was found.  Returns (0, "") if no query line is
+#   found before the end of the input or a ">>>///" line.
+sub skip_to_m9results {
+
+    my ($q_num, $query_desc, $q_len, $l_num, $l_len);
+    my $best_yes = 0;
+    my $have_query = 0;
+
+    while (my $line = <>) {
+        if ($line =~ m/^\s*(\d+)>>>(\S+)\s.+ \- (\d+) aa$/) {
+            ($q_num, $query_desc, $q_len) = ($1, $2, $3);
+            $line = <>;   # skip Library:
+            $line = <>;   # 153571012 residues in 291716 sequences
+            if (defined($line)) {
+                ($l_len, $l_num) = ($line =~ m/^\s+(\d+)\s+residues in\s+(\d+)/);
+            }
+            $have_query = 1;
+            last;
+        }
+        elsif ($line =~ m/>>>\/\/\//) {
+            last;
+        }
+    }
+
+    return (0,"") unless ($have_query);
+
+    while (my $line = <>) {
+        if ($line =~ m/^The best scores are:/) {
+            $best_yes = 1;
+            # literal "E2()" column header
+            $eval2_fmt = 1 if ($line =~ m/E2\(\)/);
+            last;
+        }
+        last if ($line =~ m/^!! No sequences/);
+    }
+
+    return ($q_num, $query_desc, $q_len, $l_num, $l_len, $best_yes);
+}
+
+################
+# skip_to_m8results -- advance the input (<>) to the list of hits of a
+# tabular (BLAST -outfmt 7 / FASTA -m 8C) output file
+#
+# Reads up to the "# Query:" line, then the next two lines (expected to be
+# "# Database:" and "# Fields:") and the "# NNN hits found" line.
+#
+# side effect: sets the file-level $eval2_fmt (1 if the Fields line lists
+#   eval2, otherwise 0)
+# returns: ($query_desc, $best_yes, \@last_fields)
+#   $query_desc    first word after "# Query:", or 'unnamed'
+#   $best_yes      1 if the "hits found" line was seen
+#   \@last_fields  names of the extra trailing columns (eval2, score, BTOP)
+#   Returns (0, "", \@last_fields) with an empty list if no "# Query:" line
+#   is found, so the caller can always dereference the third value.
+sub skip_to_m8results {
+
+    my ($query_desc, $best_yes);
+
+    $best_yes = 0;
+    $eval2_fmt = 0;
+
+    my @last_fields = ();
+
+    while (my $line = <>) {
+        if ($line =~ m/^# Query:/) {   # Query:
+            ($query_desc) = ($line =~ m/^# Query:\s+(\S+)/);
+            $query_desc = 'unnamed' unless ($query_desc);
+
+            $line = <>;   # Database:
+            $line = <>;   # Fields:
+            $line = "" unless defined($line);   # input ended early
+
+            unless ($line =~ m/# Fields:/) {
+                warn "!!! warning !!!: # Fields not found: $line";
+            }
+
+            if ($line =~ m/,\seval2/) {   # only with FASTA
+                push @last_fields, "eval2";
+                $eval2_fmt = 1;
+            }
+
+            if ($line =~ m/,\sscore,\s+BTOP/i) {   # only with BLAST
+                push @last_fields, qw(score BTOP);
+            }
+            elsif ($line =~ m/BTOP,\s+score/i) {
+                push @last_fields, qw(BTOP score);
+            }
+            elsif ($line =~ m/,\s+BTOP/) {
+                push @last_fields, qw(BTOP);
+            }
+
+            $line = <>;   # NNN hits found
+            if (defined($line) && $line =~ m/^#\s+\d+\s+hits found/) {
+                $best_yes = 1;
+            }
+            return ($query_desc, $best_yes, \@last_fields);
+        }
+        elsif ($line =~ m/>>>\/\/\//) {
+            last;
+        }
+    }
+
+    # no "# Query:" line found
+    return (0, "", \@last_fields);
+}
+
+################
+# eval_func -- default E-value accessor: returns the {evalue} field of the
+# hit hash passed by reference
+sub eval_func {
+    my $hit_hr = shift;
+    return $hit_hr->{evalue};
+}
+
+# eval2_func -- returns the {eval2} field (second E-value) of the hit hash
+# passed by reference
+sub eval2_func {
+    my $hit_hr = shift;
+    return $hit_hr->{eval2};
+}
+# eval_ave -- returns the geometric mean of the {evalue} and {eval2} fields
+# of the hit hash passed by reference
+sub eval_ave {
+    my $hit_hr = shift;
+    my ($e_val, $e_val2) = @{$hit_hr}{qw(evalue eval2)};
+    return sqrt($e_val*$e_val2);
+}
+
+
+################
+# init_random_res builds the pool of amino acid residues from which random
+# residues are drawn.  Each of the 20 residues appears
+# int(1000 * count/total + 0.5) times, so the pool has roughly 1000 elements
+# in proportion to the Robinson/Robinson counts below.
+# returns: the pool as a list of single-character strings
+
+sub init_random_res {
+    # residue => count, kept as ordered pairs so the pool order is fixed
+    my @rr_table = (
+        [A => 35155], [R => 23105], [N => 20212], [D => 24161], [C => 8669],
+        [Q => 19208], [E => 28354], [G => 33229], [H => 9906],  [I => 23161],
+        [L => 40625], [K => 25872], [M => 10101], [F => 17367], [P => 23435],
+        [S => 32070], [T => 26311], [W => 5990],  [Y => 14488], [V => 29012],
+    );
+    my $rr_total = 450431;
+
+    my @rr_pool = ();
+    foreach my $rr_entry (@rr_table) {
+        my ($res, $count) = @{$rr_entry};
+        push @rr_pool, ($res) x int(1000.0 *$count/$rr_total + 0.5);
+    }
+    return @rr_pool;
+}
+
+__END__
+
+=pod
+
+=head1 NAME
+
+ m89_btop_msa2.pl
+
+=head1 SYNOPSIS
+
+ m89_btop_msa2.pl --query_file query.fasta fasta_m8CB_output.file
+ m89_btop_msa2.pl --query_file query.fasta blast_outfmt7_BTOP.output.file
+ m89_btop_msa2.pl --query_file query.fasta [--m_format m9] fasta_m9C_output.file
+
+=head1 OPTIONS
+
+ -h short help
+ --help include description
+
+ --query_file -- query sequence file
+ --query -- same as --query_file
+ (only one sequence per file)
+
+ --eval2 : "": use E()-value, "eval2": use E2()/eval2, "ave": use geom. mean
+
+ --bound_file_in -- tab delimited accession<tab>start<tab>end that
+ specifies MSA boundaries WITHIN alignment.
+ Additional hits use alignment (or domain)
+ boundaries.
+
+ --bound_file_only -- tab delimited accession<tab>start<tab>end that
+ specifies MSA boundaries WITHIN alignment.
+ Only sequences in --bound_file_only will be in the MSA.
+
+ --bound_file_out -- "--bound_file_in" file for the next iteration of psisearch2
+
+ --domain_bound parse domain annotations (-V) from m9B file
+ --domain
+
+ --masked_lib_out -- FASTA format library of MSA sequences
+
+ --int_mask_type = "query", "rand", "X", "none"
+ --end_mask_type = "query", "rand", "X", "none"
+specify the residues to be inserted into output library
+
+=head1 DESCRIPTION
+
+C<m89_btop_msa2.pl> takes a fasta36/ssearch36 -m 9B output file, which
+includes a BTOP encoded alignment string, and produces the multiple
+sequence alignment (MSA) implied by the query sequence, alignment
+boundaries, and pairwise alignments. The alignment does not allow
+gaps in the query sequence, only in the subject sequences.
+
+The C<--query_file> must be specified, and the query sequence is
+provided as the first sequence in the MSA.
+
+If a C<--bound_file_only> file is provided, then the ends of the
+alignments are reduced to the coordinates specified in that file, and
+only sequences listed in the file are included in the MSA. With
+C<--bound_file_in>, sequences that are not listed in the file are also
+included, using their alignment boundaries.
+
+Output: A clustal-like interleaved multiple sequence alignment that
+can be used as input (using the C<-in_msa> option) to C<psiblast>.
+
+If an C<--masked_lib_out> filename.fasta is specified, a version of the MSA
+in FASTA format is written to filename.fasta. This file can be
+converted to BLAST format (C<makeblastdb -in filename.fasta>) and
+the converted blast library can be used to rebuild a PSSM with
+C<psiblast -num_iterations 2 -db filename.fasta -in_msa filename.msa
+-out_pssm filename.asn_txt>.
+
+The sequences in the C<--masked_lib_out> fasta file can be modified
+where gaps are present in the MSA as specified by the C<--int_mask_type> and C<--end_mask_type>
+options. If no --int_mask_type/--end_mask_type is specified
+("none"), then the subject sequence in the output library matches the
+aligned part of the subject sequence (gap characters are deleted). If
+the --end/int_mask_type is "query", "rand", or "X", then either the
+aligned query residue, a random residue, or an "X" is substituted at
+each gap, producing a library of subject sequences that may differ
+from the original subject sequences. These different options can be
+used to force C<psiblast> to build a PSSM that more accurately
+reflects the original C<ssearch36> alignment.
+
+=head1 AUTHOR
+
+William R. Pearson, wrp at virginia.edu
+
+=cut
diff --git a/psisearch2/psisearch2_msa.pl b/psisearch2/psisearch2_msa.pl
new file mode 100755
index 0000000..2bf2d5b
--- /dev/null
+++ b/psisearch2/psisearch2_msa.pl
@@ -0,0 +1,453 @@
+#!/usr/bin/perl -w
+
+################################################################
+# copyright (c) 2016 by William R. Pearson and The Rector &
+# Visitors of the University of Virginia
+################################################################
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under this License is distributed on an "AS
+# IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+################################################################
+
+use strict;
+use Getopt::Long;
+use Pod::Usage;
+
+################
+# implementation of simple shell script to do iterative searches
+#
+# logic:
+# (1) do initial search
+# (2) use results of initial search to produce MSA/PSSM for next search
+# (3) do PSSM search
+# (4) use results of PSSM search to produce MSA/PSSM for iterative step 3
+#
+################
+#
+# command:
+# psisearch2_msa.pl --query query_file --db database --num_iter N --evalue 0.002 --int_mask none/query/random --end_mask none/query/random --dir results/ --domain --align --out_suffix none --pgm ssearch/psiblast
+# (--no_msa is commented out in GetOptions below and is not currently accepted)
+#
+################
+
+use vars qw( $query_file $db_file $num_iter $evalue $int_mask $end_mask $query_mask $no_msa $tmp_dir $dom_flag $align_flag $suffix $srch_pgm $file_out $help $shelp $error_log $rm_flag $annot_type $quiet);
+use vars qw( $prev_msa $next_msa $prev_hitdb $next_hitdb $prev_pssm $next_pssm $prev_bound_in $next_bound_out $tmp_file_list $save_all $delete_bnd $delete_tmp);
+
+
+################
+# locations of required programs:
+# (1) m89_btop_msa2.pl
+# (2) ssearch
+# (3) NCBI blast+ programs: psiblast/makeblastdb
+# (4) NCBI datatool (required only for ssearch36 PSSMs)
+
+# site-specific installation directories; every program path below is built from them
+my $pgm_bin = "/seqprg/bin";
+my $pgm_data = "/seqprg/data";
+my $ssearch_bin = "$pgm_bin/ssearch36";
+my $psiblast_bin = "$pgm_bin/psiblast";
+my $makeblastdb_bin = "$pgm_bin/makeblastdb";
+# datatool is always run with its module file, so the -m option is part of the command string
+my $datatool_bin = "$pgm_bin/datatool -m $pgm_data/NCBI_all.asn";
+# given without a directory, so it is resolved by the shell that log_system() starts
+my $align2msa_lib = "m89_btop_msa2.pl";
+
+# dispatch table: value of --pgm => sub that builds the search command line
+my %srch_subs = ('ssearch' => \&get_ssearch_cmd,
+ 'psiblast' => \&get_psiblast_cmd,
+ );
+
+# value of --annot_db => quoted annotation-script argument that get_ssearch_cmd() appends after -V
+my %annot_cmds = ('rpd3' => qq("\!../scripts/ann_pfam28.pl --pfacc --db RPD3 --vdoms --split_over"),
+ 'rpd3nv' => qq("\!../scripts/ann_pfam28.pl --pfacc --db RPD3 --split_over"),
+ 'rpd3nvn' => qq("\!../scripts/ann_pfam28.pl --pfacc --db RPD3 --split_over --neg"),
+ 'pfam' => qq("\!../scripts/ann_pfam30.pl --pfacc --vdoms --split_over"));
+
+# Option defaults.  Each variable is assigned on its own line so that a
+# name and its value cannot drift apart: the earlier parallel-list form
+# had 12 names but 13 values, which left $annot_type = 0 and $quiet = "".
+$num_iter   = 5;          # maximum number of search iterations
+$evalue     = 0.002;      # E()-value threshold for inclusion in the PSSM
+$dom_flag   = 0;
+$align_flag = 0;
+$int_mask   = 'none';
+$end_mask   = 'none';
+$query_mask = 0;
+$srch_pgm   = 'ssearch';  # key into %srch_subs
+$tmp_dir    = '';
+$error_log  = 0;
+$annot_type = "";         # key into %annot_cmds; empty means no -V annotation
+$quiet      = 0;
+($save_all, $tmp_file_list, $delete_bnd, $delete_tmp) = (0, "", 0, 0);
+
+
+# capture the command line now: GetOptions() removes parsed options from @ARGV
+my $pgm_command = "#".join(" ",($0, @ARGV));
+
+GetOptions(
+    'query|sequence=s' => \$query_file,
+    'db|database=s' => \$db_file,
+    'suffix|out_suffix=s' => \$suffix,
+    'dir=s' => \$tmp_dir,
+    'evalue=f' => \$evalue,
+    'annot_db=s' => \$annot_type,
+    'out_name=s' => \$file_out,
+    'iter=i' => \$num_iter,
+    # 'in_msa=s' => \$prev_msa,
+    # 'out_msa=s' => \$next_msa,
+    # 'in_hitdb=s' => \$prev_hitdb,
+    # 'out_hitdb=s' => \$next_hitdb,
+    'in_pssm=s' => \$prev_pssm,
+    # 'out_pssm=s' => \$next_pssm,
+    'in_bounds=s' => \$prev_bound_in,
+    'out_bounds=s' => \$next_bound_out,
+    'num_iter|max_iter=i' => \$num_iter,
+    # 'no_msa' => \$no_msa,
+    'dom|domain' => \$dom_flag,
+    'align' => \$align_flag,
+    'query_seed|query_mask' => \$query_mask,
+    'int_mask|int-mask|int_seed|int-seed=s' => \$int_mask,
+    'end_mask|end-mask|end_seed|end-seed=s' => \$end_mask,
+    'pgm=s' => \$srch_pgm,
+    'quiet' => \$quiet,
+    'q' => \$quiet,
+    'silent' => \$quiet,
+    'h|?' => \$shelp,
+    "help" => \$help,
+    "errors" => \$error_log,
+    "save_list:s" => \$tmp_file_list, # files to save (not delete)
+    "save_tmp|save_all" => \$save_all,
+    "del_bnd|delete_bnd" => \$delete_bnd,
+    "del_tmp|del_all|delete_tmp|delete_all" => \$delete_tmp,
+    );
+
+# echo the command line to STDERR when --errors was given; the test has to
+# follow GetOptions(), because $error_log is always 0 before the options
+# have been parsed
+print STDERR "$pgm_command\n" if ($error_log);
+
+pod2usage(1) if $shelp;
+pod2usage(exitstatus => 0, verbose => 2) if $help;
+
+pod2usage(1) unless $query_file && -r $query_file; # need a query
+pod2usage(1) unless $db_file ; # need a database
+
+# the search command is built through the %srch_subs dispatch table, so an
+# unknown --pgm would otherwise only fail later, when an undefined value is
+# called as a subroutine
+unless (exists $srch_subs{$srch_pgm}) {
+    pod2usage(-message => "unknown --pgm '$srch_pgm' (expected one of: ".join(", ", sort keys %srch_subs).")",
+              -exitval => 1);
+}
+
+# likewise --annot_db must name an entry in %annot_cmds, otherwise
+# get_ssearch_cmd() would emit a "-V" option with no argument
+if ($annot_type && !exists $annot_cmds{$annot_type}) {
+    pod2usage(-message => "unknown --annot_db '$annot_type' (expected one of: ".join(", ", sort keys %annot_cmds).")",
+              -exitval => 1);
+}
+
+# extensions of the per-iteration temporary files that are deleted by default
+my @del_file_ext = qw(msa psibl_out hit_db asntxt asnbin);
+
+# drop the last entry (asnbin): build_msa_pssm() only writes the .asnbin
+# file when the search program is not psiblast
+if ($srch_pgm =~ m/psiblast/) {
+    pop(@del_file_ext);
+}
+
+# --query_seed/--query_mask is shorthand for "query" masking wherever no
+# explicit mask type was requested
+if ($query_mask) {
+    $int_mask = 'query' if ($int_mask eq 'none');
+    $end_mask = 'query' if ($end_mask eq 'none');
+}
+
+# --delete_all takes precedence over --save_all
+if ($delete_tmp) {
+    $delete_bnd = 1;
+}
+elsif ($save_all) {
+    @del_file_ext = ();
+    $tmp_file_list = "";
+    $delete_bnd = 0;
+}
+
+################
+# which tmp files should be saved/deleted?
+#
+my %save_file_ext = ();
+if ($tmp_file_list) {
+    # possible values are: msa, asnbin, asntxt, psibl_out, hit_db (see @del_file_ext)
+    for my $ext (split(/,\s*/,$tmp_file_list)) {
+        $save_file_ext{$ext} = 1;
+    }
+
+    # keep only the extensions that were not asked to be saved
+    @del_file_ext = grep { !$save_file_ext{$_} } @del_file_ext;
+}
+
+print "$pgm_command\n" unless ($quiet);
+
+my $this_iter = "it1";
+
+# default output name: the trailing run of word and '.' characters of the
+# query file name
+my ($query_pref) = ($query_file =~ m/([\w\.]+)$/);
+
+$file_out = $query_pref unless $file_out;
+
+# result file for the first iteration: [dir/]out_name.it1[.suffix]
+my $this_file_pref = "$file_out.$this_iter";
+$this_file_pref = "$this_file_pref.$suffix" if ($suffix);
+my $this_file_out = $this_file_pref;
+$this_file_out = "$tmp_dir/$this_file_out" if ($tmp_dir);
+
+my $prev_file_out = $this_file_out;
+
+# remove the temporary files (one per extension in @del_file_ext) derived
+# from the result file $file_pref; does nothing when no extension is
+# scheduled for deletion
+my $rm_tmp_files = sub {
+    my ($file_pref) = @_;
+    return unless (@del_file_ext);
+    log_system("rm ".join(" ", map { "$file_pref.$_" } @del_file_ext));
+};
+
+####
+# parse output to build PSSM
+# generate output filenames
+
+# do the first search
+my $search = $srch_subs{$srch_pgm}($query_file, $db_file, $prev_pssm);
+log_system("$search > $this_file_out 2> $this_file_out.err");
+
+my ($this_pssm, $this_bound_out) = build_msa_pssm($query_file, $this_file_out, $prev_bound_in);
+
+# now have necessary files for next iteration
+
+for (my $it=2; $it <= $num_iter; $it++) {
+
+    $prev_pssm = $this_pssm;
+    $prev_bound_in = $this_bound_out;
+    ####
+    # build filename for this iteration
+    $this_file_pref = $this_file_out = "$file_out.it$it";
+    $this_file_out = "$this_file_pref.$suffix" if ($suffix);
+    $this_file_out = "$tmp_dir/$this_file_out" if ($tmp_dir);
+
+    $search = $srch_subs{$srch_pgm}($query_file, $db_file, $prev_pssm);
+    log_system("$search > $this_file_out 2> $this_file_out.err");
+
+    # here, we are done with previous .msa, .asntxt, .asnbin, etc files. Delete them if desired
+    $rm_tmp_files->($prev_file_out);
+    $prev_file_out = $this_file_out;
+
+    ($this_pssm, $this_bound_out) = build_msa_pssm($query_file, $this_file_out, $prev_bound_in);
+
+    if (has_converged($prev_bound_in, $this_bound_out)) {
+        print STDERR "$0 $srch_pgm $query_file $db_file converged ($it iterations)\n" unless ($quiet);
+
+        # $prev_file_out already names this iteration's result file
+        $rm_tmp_files->($prev_file_out);
+
+        # boundary files are governed by --delete_bnd alone, exactly as on
+        # the non-converged exit path at the end of the script
+        log_system("rm $prev_bound_in $this_bound_out") if ($delete_bnd);
+        exit(0);
+    }
+    log_system("rm $prev_bound_in") if ($delete_bnd);
+}
+
+# iteration limit reached without convergence: clean up the last iteration
+$rm_tmp_files->($prev_file_out);
+
+log_system("rm $this_bound_out") if ($delete_bnd);
+
+unless ($quiet) {
+    print STDERR "$0 $srch_pgm $query_file $db_file finished ($num_iter iterations)\n";
+}
+
+################
+# log_system()
+# Runs the command string through system().  When the file-level
+# $error_log flag is set, the command is first echoed to STDERR.
+# Returns the value returned by system().
+#
+sub log_system {
+    my $cmd = shift;
+
+    if ($error_log) {
+        print STDERR "$cmd\n";
+    }
+    return system($cmd);
+}
+
+################
+# sub get_ssearch_cmd()
+# Returns the ssearch36 command line for one search as a single string.
+# The -V annotation option is added when $annot_type is set, and the
+# -P PSSM option when a $pssm_file is passed; the query file and the
+# database are always the final two arguments.
+#
+sub get_ssearch_cmd {
+    my ($query_file, $db_file, $pssm_file) = @_;
+
+    my @cmd_parts = (qq($ssearch_bin -S -m 8CB -d 0 -E "1.0 0" -s BP62));
+
+    push @cmd_parts, qq(-V $annot_cmds{$annot_type}) if ($annot_type);
+    push @cmd_parts, qq(-P "$pssm_file 2") if ($pssm_file);
+    push @cmd_parts, $query_file, $db_file;
+
+    return join(" ", @cmd_parts);
+}
+
+################
+# sub get_psiblast_cmd()
+# Returns the psiblast command line for one search as a single string.
+# The search input is the PSSM ($pssm_file, via -in_pssm) when one is
+# passed, and the query file (via -query) otherwise.  The tabular output
+# format requests the extra "score" and "btop" columns.
+#
+sub get_psiblast_cmd {
+    my ($query_file, $db_file, $pssm_file) = @_;
+
+    my $out_fmt = '7 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore score btop';
+
+    # (a "-comp_based_stats 0" option for the PSSM case is disabled upstream)
+    my $input_opt = $pssm_file ? "-in_pssm $pssm_file" : "-query $query_file";
+
+    return "$psiblast_bin -num_threads 4 -max_target_seqs 5000 -outfmt '$out_fmt' -inclusion_ethresh $evalue -num_iterations 1 -db $db_file $input_opt";
+}
+
+################
+# sub build_msa_pssm()
+#
+# Arguments: query file, search output file ($this_file_out), and the
+# boundary file of the previous iteration (may be undefined).
+#
+# Steps, each run through log_system():
+#   (1) $align2msa_lib turns the search output into an MSA, a masked hit
+#       library and a boundary file (always written, so that convergence
+#       can be tested)
+#   (2) makeblastdb formats the hit library
+#   (3) psiblast builds a PSSM (.asntxt) from the MSA and hit library
+#   (4) the blast database files and (unless $error_log) the error logs
+#       are removed
+#   (5) unless the search program is psiblast, datatool converts the
+#       .asntxt PSSM to .asnbin
+#
+# Returns (pssm_file, bound_file_out); pssm_file is the .asntxt file for
+# psiblast and the .asnbin file otherwise.
+#
+sub build_msa_pssm {
+    my ($query_file, $this_file_out, $prev_bound_in) = @_;
+
+    # every derived file uses the search output name as its prefix
+    my $this_msa         = "$this_file_out.msa";
+    my $this_hit_db      = "$this_file_out.hit_db";
+    my $this_pssm_asntxt = "$this_file_out.asntxt";
+    my $this_pssm_asnbin = "$this_file_out.asnbin";
+    my $this_psibl_out   = "$this_file_out.psibl_out";
+    my $this_bound_out   = "$this_file_out.bnd_out";
+    my $blastdb_err      = "$this_file_out.mkbldb_err";
+
+    # (1) alignment -> MSA
+    my @aln2msa_args = ($align2msa_lib,
+                        "--query $query_file",
+                        "--evalue $evalue",
+                        "--masked_lib_out=$this_hit_db");
+    push @aln2msa_args, "--int_mask_type $int_mask" if ($int_mask);
+    push @aln2msa_args, "--end_mask_type $end_mask" if ($end_mask);
+    push @aln2msa_args, "--domain" if ($dom_flag);
+    push @aln2msa_args, "--bound_file_in $prev_bound_in" if ($align_flag && $prev_bound_in);
+    # always produce this file to check for convergence
+    push @aln2msa_args, "--bound_file_out $this_bound_out";
+
+    log_system(join(" ", @aln2msa_args, $this_file_out)." > $this_msa");
+
+    # (2) format the hit library
+    log_system("$makeblastdb_bin -in $this_hit_db -dbtype prot -parse_seqids > $blastdb_err");
+
+    # (3) MSA + hit library -> PSSM
+    log_system("$psiblast_bin -max_target_seqs 5000 -outfmt 7 -inclusion_ethresh 100.0 -in_msa $this_msa -db $this_hit_db -out_pssm $this_pssm_asntxt -num_iterations 1 -save_pssm_after_last_round > $this_psibl_out 2> $this_psibl_out.err");
+
+    # (4) clean up the blast database files and makeblastdb log
+    log_system("rm $this_hit_db.p* $blastdb_err");
+
+    # remove uninformative error logs
+    log_system("rm $this_psibl_out.err $this_file_out.err") unless $error_log;
+
+    # psiblast reads the text PSSM directly
+    return ($this_pssm_asntxt, $this_bound_out) if ($srch_pgm eq 'psiblast');
+
+    # (5) convert the text PSSM to binary for the other search program
+    log_system("$datatool_bin -v $this_pssm_asntxt -e $this_pssm_asnbin");
+    return ($this_pssm_asnbin, $this_bound_out);
+}
+
+################
+# sub has_converged()
+# Reads two boundary files and compares the first tab-separated field
+# (the accession) of every line.  Returns 1 when both files hold the
+# same accessions regardless of order, 0 otherwise.  Dies if either
+# file cannot be opened.
+#
+sub has_converged {
+    my ($file1, $file2) = @_;
+
+    # one list of accessions per file, in file order
+    my @name_lists = ();
+    for my $bound_file ($file1, $file2) {
+        open (my $fd, '<', $bound_file) || die "cannot read $bound_file";
+        my @names = ();
+        while (my $line = <$fd>) {
+            chomp($line);
+            my @fields = split(/\t/,$line);
+            push @names, $fields[0];
+        }
+        close $fd;
+        push @name_lists, \@names;
+    }
+    my ($names_a, $names_b) = @name_lists;
+
+    # different numbers of accessions can never match
+    return 0 unless (scalar(@$names_a) == scalar(@$names_b));
+
+    my @sorted_a = sort @$names_a;
+    my @sorted_b = sort @$names_b;
+
+    for my $i (0 .. $#sorted_a) {
+        return 0 unless ($sorted_a[$i] eq $sorted_b[$i]);
+    }
+    return 1;
+}
+
+__END__
+
+=pod
+
+=head1 NAME
+
+psisearch2_msa.pl
+
+=head1 SYNOPSIS
+
+ psisearch2_msa.pl --query q_file --db db_file --pgm ssearch|psiblast --num_iter 5
+
+=head1 OPTIONS
+
+ -h short help
+ --help include description
+ --query query file (also --sequence)
+ --db database file (--database)
+ --pgm program used for searching, ssearch or psiblast
+ --num_iter maximum number of iterations (--max_iter)
+ --dir working directory and location of output
+ --evalue threshold for inclusion in PSSM
+ --out_name/--suffix result file is "out_name.it#.suffix"
+ --in_msa/--out_msa [not implemented] MSA used to build PSSM, requires --in_hitdb
+ --in_hitdb/--out_hitdb [not implemented] used to build PSSM
+ --in_pssm/--out_pssm [--out_pssm not implemented]
+ --in_bounds/--out_bounds used to control alignment boundaries for PSSM
+ --int_mask/--end_mask none|query|random - values embedded in library sequences based on gaps in MSA
+ --delete_all remove all tmp files (also --delete_tmp, --del_all, --del_tmp)
+ --delete_bnd remove boundary file (included with --delete_all, but not deleted by default)
+ --save_all save temporary files (.asnbin, .asntxt, .msa, .psibl_out, .hit_db)
+ --save_list: comma delimited list of file extensions (above) to save
+
+=head1 DESCRIPTION
+
+C<psisearch2_msa.pl> automates successive iterations of C<psiblast> or
+C<ssearch36> using different strategies to reduce PSSM contamination
+from alignment over-extension. C<psisearch2_msa.pl> uses the
+C<m89_btop_msa2.pl> program to read BTOP formatted output from
+C<psiblast> or C<ssearch36> and produce both a multiple sequence
+alignment (MSA) and a fasta formatted custom database of the sequences in the
+MSA. C<psiblast> then produces a PSSM from the MSA and custom database.
+
+Different strategies to reduce PSSM contamination from alignment
+overextension can be specified using the C<--int_mask>, C<--end_mask>,
+C<--align>, and C<--domain> options. If C<--int_mask> and
+C<--end_mask> are not specified (or set to "none"), then the PSSM is
+generated by aligning the MSA with the sequence residues that were
+aligned in the previous search. If C<--int_mask> and C<--end_mask> are
+set to "query", then any gaps in the MSA are filled with the
+corresponding aligned residue from the query sequence. In our
+experience, this dramatically reduces alignment over-extension and
+false-positives.
+
+=head1 AUTHORS
+
+William R. Pearson (wrp at virginia.edu) and Weizhong Li (wli at ebi.ac.uk)
+
+=cut
diff --git a/psisearch2/psisearch2_msa.py b/psisearch2/psisearch2_msa.py
new file mode 100755
index 0000000..1d2b71f
--- /dev/null
+++ b/psisearch2/psisearch2_msa.py
@@ -0,0 +1,368 @@
+#!/usr/bin/python
+
+################################################################
+# copyright (c) 2016 by William R. Pearson and The Rector &
+# Visitors of the University of Virginia
+################################################################
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under this License is distributed on an "AS
+# IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+################################################################
+
+import sys
+import os
+import argparse
+import subprocess
+import re
+
+# python re-write of psisearch2_msa.pl
+#
+# logic:
+# (1) do initial search
+# (2) use results of initial search to produce MSA/PSSM for next search
+# (3) do PSSM search
+# (4) use results of PSSM search to produce MSA/PSSM for iterative step 3
+#
+################
+#
+# command:
+# psisearch2_msa.py --query query_file --db database --iter N --evalue 0.002 --int_mask none/query/random --end_mask none/query/random --dir results/ --domain --align --suffix suffix --pgm ssearch/psiblast
+# (--no_msa is not defined by the argument parser below)
+#
+################
+
+################
+# locations of required programs:
+# (1) m89_btop_msa2.pl
+# (2) ssearch
+# (3) NCBI blast+ programs: psiblast/makeblastdb
+# (4) NCBI datatool (required only for ssearch36 PSSMs)
+
+pgm_bin = "/seqprg/bin"
+pgm_data = "/seqprg/data"
+ssearch_bin = pgm_bin+"/ssearch36"
+psiblast_bin = pgm_bin+"/psiblast"
+makeblastdb_bin = pgm_bin+"/makeblastdb"
+datatool_bin = "%s/datatool -m %s/NCBI_all.asn" % (pgm_bin,pgm_data)
+align2msa_lib = "m89_btop_msa2.pl"
+
+annot_cmds = {'rpd3': '"!../scripts/ann_pfam28.pl --pfacc --db RPD3 --vdoms --split_over"',
+ 'rpd3nv':'"!../scripts/ann_pfam28.pl --pfacc --db RPD3 --split_over"',
+ 'pfam':'"!../scripts/ann_pfam30.pl --pfacc --vdoms --split_over"'}
+
+num_iter = 5
+evalue = 0.002
+dom_flag = 0
+align_flag = 0
+int_mask = 'none'
+end_mask = 'none'
+srch_pgm = 'ssearch'
+tmp_dir = ''
+error_log = 0
+rm_flag = 0
+annot_type = ''
+quiet = 0
+
+################
+# log_system()
+# run system on string, logging first if error_log
+#
+def log_system (cmd, error_log):
+
+ if (error_log) :
+ sys.stderr.write(cmd+"\n")
+
+ subprocess.call(cmd, shell=True)
+# print cmd
+
+################
+# sub get_ssearch_cmd()
+# builds an ssearch command line with query, db, and pssm
+#
+def get_ssearch_cmd(query_file, db_file, pssm_file) :
+
+ search_cmd = '%s -S -m 8CB -d 0 -E "1.0 0" -s BP62' % (ssearch_bin)
+
+ if (annot_type) :
+ search_cmd += " -V %s" % (annot_cmds[annot_type])
+
+ if (pssm_file) :
+ search_cmd += ' -P "%s 2"' % (pssm_file)
+
+ search_cmd += " %s %s" % (query_file, db_file)
+
+ return search_cmd
+
+
+################
+# sub get_psiblast_cmd()
+# builds an ssearch command line with query, db, and pssm
+#
+def get_psiblast_cmd(query_file, db_file, pssm_file) :
+
+ search_cmd = "%s -num_threads 4 -max_target_seqs 5000 -outfmt '7 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore score btop' -inclusion_ethresh %f -num_iterations 1 -db %s" % (psiblast_bin, evalue, db_file)
+
+ if (pssm_file) :
+ search_cmd += " -in_pssm %s" % (pssm_file)
+ else :
+ search_cmd += " -query %s" % (query_file)
+
+ return search_cmd
+
+
+################
+# sub build_msa_pssm()
+#
+# given query, search output file (this_file_out), prev_boundary_file
+# uses m89_btop_msa2.pl to generate PSSM in .asntxt or .asnbin format, also bound_file_out if align_flag
+# (later - optionally deletes intermediate files)
+#
+# always produce a bound_file_out file to test for convergence
+#
+def build_msa_pssm(query_file, this_file_out,prev_bound_in, error_log) :
+
+ (this_msa, this_hit_db, this_pssm_asntxt, this_pssm_asnbin, this_psibl_out, this_bound_out) = (this_file_out+".msa",this_file_out+".hit_db",this_file_out+".asntxt",this_file_out+".asnbin",this_file_out+".psibl_out",this_file_out+".bnd_out")
+
+ blastdb_err = this_file_out+".mkbldb_err"
+ aln2msa_cmd = "%s --query %s --evalue %f --masked_lib_out=%s" % (align2msa_lib, query_file, evalue, this_hit_db)
+
+ if (int_mask) :
+ aln2msa_cmd += " --int_mask_type %s" % (int_mask)
+
+ if (end_mask) :
+ aln2msa_cmd += " --end_mask_type %s" % (end_mask)
+
+ if (dom_flag) :
+ aln2msa_cmd += " --domain"
+
+ if (align_flag and prev_bound_in) :
+ aln2msa_cmd += " --bound_file_in %s" %(prev_bound_in)
+
+ # always produce this file to check for convergence
+ aln2msa_cmd += " --bound_file_out %s" % (this_bound_out)
+
+ log_system("%s %s > %s"%(aln2msa_cmd, this_file_out, this_msa), error_log)
+
+ makeblastdb_cmd = "%s -in %s -dbtype prot -parse_seqids > %s" % (makeblastdb_bin, this_hit_db, blastdb_err)
+
+ log_system(makeblastdb_cmd, error_log)
+
+ buildpssm_cmd = "%s -max_target_seqs 5000 -outfmt 7 -inclusion_ethresh 100.0 -in_msa %s -db %s -out_pssm %s -num_iterations 1 -save_pssm_after_last_round" % (psiblast_bin, this_msa, this_hit_db, this_pssm_asntxt)
+
+ log_system("%s > %s 2> %s.err" % (buildpssm_cmd, this_psibl_out, this_psibl_out), error_log)
+
+ log_system("rm %s.p* %s" % (this_hit_db,blastdb_err), error_log)
+
+ # remove uninformative error logs
+ if (not error_log) :
+ log_system("rm "+this_psibl_out+".err "+this_file_out+".err",error_log)
+
+ if (srch_pgm != 'psiblast') :
+ asn2asn_cmd = "%s -v %s -e %s" % (datatool_bin, this_pssm_asntxt, this_pssm_asnbin)
+ log_system(asn2asn_cmd, error_log)
+ return (this_pssm_asnbin, this_bound_out)
+ else :
+ return (this_pssm_asntxt, this_bound_out)
+
+################
+# sub has_converged()
+# reads two boundary files and compares accessions
+#
+def has_converged(file1, file2) :
+
+ f1_names = []
+ f2_names = []
+
+ with open(file1) as fd:
+ for line in fd:
+ line = line.rstrip('\n')
+ fields = line.split('\t')
+ f1_names.append(fields[0])
+
+ with open(file2) as fd:
+ for line in fd:
+ line = line.rstrip('\n')
+ fields = line.split('\t')
+ f2_names.append(fields[0])
+
+ # check for same length
+ if (len(f1_names) != len(f2_names)) :
+ return 0
+
+ f1_names.sort()
+ f2_names.sort()
+
+ for i,v in enumerate(f1_names) :
+ if (f2_names[i] != v) :
+ return 0
+
+ return 1
+
+# main()
+
+srch_subs = {'ssearch' : get_ssearch_cmd,
+ 'psiblast': get_psiblast_cmd}
+
+pgm_command = "# "+" ".join(sys.argv);
+if (error_log) :
+ sys.stderr.write('pgm_command\n')
+
+arg_parse = argparse.ArgumentParser(description='Iterative search with SSEARCH/PSIBLAST')
+arg_parse.add_argument('--query', dest='query_file', action='store',help='query sequence file')
+arg_parse.add_argument('--sequence', dest='query_file', action='store',help='query sequence file')
+arg_parse.add_argument('--db', dest='db_file', action='store',help='sequence database name')
+arg_parse.add_argument('--database', dest='db_file', action='store',help='sequence database name')
+arg_parse.add_argument('--dir', dest='tmp_dir', action='store',help='directory for result and tmp_file output')
+arg_parse.add_argument('--evalue', dest='evalue', default=0.002, type=float, action='store',help='E()-value threshold for inclusion in PSSM')
+arg_parse.add_argument('--annot_db', dest='annot_type', action='store',help='source of domain annotations')
+arg_parse.add_argument('--suffix', dest='suffix', action='store',help='suffix for result output')
+arg_parse.add_argument('--out_name', dest='file_out', action='store',help='result file name')
+arg_parse.add_argument('--iter', dest='num_iter', default=5, type=int, action='store',help='number of iterations')
+arg_parse.add_argument('--in_pssm', dest='prev_pssm', action='store',help='initial PSSM')
+arg_parse.add_argument('--in_bounds', dest='prev_bound_in', type=str, action='store',help='initial boundaries')
+arg_parse.add_argument('--domain', dest='dom_flag', action='store_true',help='use domain annotations')
+arg_parse.add_argument('--align', dest='align_flag', action='store_true',help='use alignment boundaries')
+arg_parse.add_argument('--pgm', dest='srch_pgm', action='store',default='ssearch',help='search program: ssearch/psiblast')
+arg_parse.add_argument('--query_seed', dest='query_mask', action='store_true',help='use query seeding')
+arg_parse.add_argument('--int_seed', dest='int_mask', action='store',default='none',help='sequence masking: none/query/random')
+arg_parse.add_argument('--end_seed', dest='end_mask', action='store',default='none',help='sequence masking: none/query/random')
+arg_parse.add_argument('--int_mask', dest='int_mask', action='store',default='none',help='sequence masking: none/query/random')
+arg_parse.add_argument('--end_mask', dest='end_mask', action='store',default='none',help='sequence masking: none/query/random')
+arg_parse.add_argument('--save_list', dest='tmp_file_list', action='store',help='temporary extensions saved')
+arg_parse.add_argument('--save_all', dest='save_all', action='store_true',help='save all temporary files')
+arg_parse.add_argument('--delete_all', dest='delete_tmp', action='store_true',help='delete all temporary files')
+arg_parse.add_argument('--delete_bnd', dest='delete_bnd', action='store_true',help='delete boundary temporary file')
+arg_parse.add_argument('--quiet', dest='quiet', action='store_true',help='fewer messages')
+arg_parse.add_argument('-Q', dest='quiet', action='store_true',help='fewer messages')
+
+args = arg_parse.parse_args()
+if (args.quiet) :
+ quiet = args.quiet
+
+if (args.srch_pgm) :
+ srch_pgm = args.srch_pgm
+
+if (not quiet) :
+ print pgm_command
+
+del_file_ext = ["msa","psibl_out","hit_db","asntxt","asnbin"]
+
+if (re.match('psiblast',srch_pgm)) :
+ del_file_ext.pop()
+
+
+if (args.query_mask) :
+ if (args.int_mask == 'none') :
+ args.int_mask = 'query'
+ if (args.end_mask == 'none') :
+ args.end_mask = 'query'
+
+delete_bnd = args.delete_bnd
+if (args.delete_tmp) :
+ delete_bnd = 1
+elif (args.save_all) :
+ delete_file_ext = ()
+ args.tmp_file_list = ''
+ delete_bnd = 0
+
+save_file_ext = {}
+if (args.tmp_file_list) :
+ new_del_file_ext = []
+ for ext in re.split(",\s*",args.tmp_file_list) :
+ save_file_ext[ext] = 1
+ for ext in del_file_ext :
+ if (not (ext in save_file_ext)):
+ new_del_file_ext.append(ext)
+ del_file_ext = new_del_file_ext[:]
+
+this_iter = "it1"
+
+query_pref = query_file = args.query_file
+m = re.search(r'([\w\.]+)$',str(args.query_file))
+query_pref = m.groups()[0]
+
+if (not args.file_out) :
+ file_out = query_pref
+else :
+ file_out = args.file_out
+
+this_file_out = this_file_pref = file_out+"."+this_iter
+if (args.suffix) :
+ this_file_out = this_file_pref+"."+args.suffix
+if (args.tmp_dir) :
+ this_file_out = args.tmp_dir+"/"+this_file_out
+
+####
+# parse output to build PSSM
+# generate output filenames
+
+prev_file_out = this_file_out
+
+# do the first search
+search_str = srch_subs[srch_pgm](args.query_file, args.db_file, args.prev_pssm)
+log_system(search_str+" > "+this_file_out+" 2> "+this_file_out+".err", error_log)
+
+prev_file_out = this_file_out
+
+(this_pssm, this_bound_out) = build_msa_pssm(args.query_file, this_file_out, args.prev_bound_in, error_log)
+# now have necessary files for next iteration
+
+it=2
+while (it <= args.num_iter) :
+
+ prev_pssm = this_pssm
+ prev_bound_in = this_bound_out
+ ####
+ # build filename for this iteration
+ this_file_out = this_file_pref = "%s.it%d" % (file_out,it)
+ if (args.suffix) :
+ this_file_out = this_file_pref+"."+args.suffix
+ if (args.tmp_dir) :
+ this_file_out = args.tmp_dir+"/"+this_file_out
+
+ search_str = srch_subs[srch_pgm](args.query_file, args.db_file, prev_pssm)
+ log_system("%s > %s 2> %s" % (search_str,this_file_out,this_file_out+".err"), error_log)
+
+ if (len(del_file_ext)):
+ del_file_list = [ prev_file_out+'.'+ext for ext in del_file_ext]
+ log_system('rm '+' '.join(del_file_list),error_log)
+
+ prev_file_out = this_file_out
+
+ (this_pssm, this_bound_out) = build_msa_pssm(query_file, this_file_out, prev_bound_in, error_log)
+
+ if (has_converged(prev_bound_in, this_bound_out)) :
+ if (not quiet) :
+ sys.stderr.write(" %s %s %s %s converged (%d iterations)\n" % (sys.argv[0], srch_pgm, query_file, args.db_file, it))
+
+ if (len(del_file_ext)):
+ del_file_list = [ prev_file_out+'.'+ext for ext in del_file_ext]
+ log_system('rm '+' '.join(del_file_list),error_log)
+
+ if (delete_bnd) :
+ log_system("rm "+prev_bound_in,error_log)
+
+ exit(0)
+
+ if (delete_bnd) :
+ log_system("rm "+prev_bound_in,error_log)
+
+ it += 1
+
+if (len(del_file_ext)):
+ del_file_list = [ prev_file_out+'.'+ext for ext in del_file_ext]
+ log_system('rm '+' '.join(del_file_list),error_log)
+
+if (delete_bnd):
+ log_system("rm "+this_bound_out,error_log)
+
+if (not quiet) :
+ sys.stderr.write(" %s %s %s %s finished (%d iterations)\n" % (sys.argv[0], srch_pgm, query_file, args.db_file, it-1))
+
diff --git a/scripts/README b/scripts/README
new file mode 100644
index 0000000..5a4e5cf
--- /dev/null
+++ b/scripts/README
@@ -0,0 +1,108 @@
+
+22-Jan-2014
+13-Apr-2016 updated
+
+fasta36/scripts
+
+Perl scripts for annotating sequences and expanding libraries
+
+-- Sequence alignment scoring/annotation
+
+Two program scripts -- annot_blast_btop2.pl and blastp_cmd.sh -- have
+been added to support sub-alignment scoring of BLASTP alignments.
+
+annot_blast_btop2.pl takes three inputs: (1) a query sequence file; (2)
+a domain annotation script (see below), and (3) a BLAST tabular format
+output with two additional fields, "score" and "btop":
+
+annot_blast_btop2.pl --query query.file --ann_script ann_pfam_www.pl blast_tab_btop_file
+
+The blast_tab_btop_file can be produced using the blastp_cmd.sh shell
+script, which uses ASN.1 output and blast_formatter to produce both a
+standard alignment file and the modified blast tabular btop file.
+
+-- Implied Multiple sequence alignment --
+
+As part of a strategy to improve PSSM-based similarity searching, two
+scripts use a BTOP encoded alignment string from either -m 8CB or
+-m 9B output files to produce a Clustal-like multiple sequence
+alignment (MSA) that can be used as input to psiblast to produce an
+ASN.1 text file (which can be converted with datatool to ASN.1 binary,
+which can be read by ssearch36 -P "file.asn1 2"). We used the BTOP
+encoding, rather than the more common CIGAR string (-m 9C), or the
+older alignment encoding (-m 9c), because the BTOP encoding only
+requires the query sequence to reproduce both the query and subject
+aligned residues. Thus:
+
+m8_btop_msa.pl --query gstt1_drome.aa gstt1_sp.bl_btop > gstt1_sp.ss_msa
+
+where "gstt1_sp.bl_btop" is "-m 8CB" output, produces:
+ ====
+ SSEARCHm8 multiple sequence alignment
+
+
+ sp|P20432|GSTT1_DROME MVDFYYLPGSSPCRSVIMTAKAVGVELNKKLLNLQAGEHLKPEFLKINPQHTIPTLVDNG
+ sp|P04907|GSTF3_MAIZE ---LYGMPLSPNVVRVATVLNEKGLDFEIVPVDLTTGAHKQPDFLALNPFGQIPALVDGD
+ sp|P12653|GSTF1_MAIZE -------------------------------INFATAEHKSPEHLVRNPFGQVPALQDGD
+ sp|P0ACA5|SSPA_ECO57 ------------------------------------------DLIDLNPNQSVPTLVDRE
+ sp|P00502|GSTA1_RAT VLHYFNARGRMECIRWLLA--AAGVEFDEKFI--QSPEDL--EKLKKDGNDQVPMVEIDG
+
+
+ ...
+ ====
+
+which can be used with psiblast -in_msa gstt1_sp.ss_msa.
+m9B_btop_msa.pl does the same for "-m 9B" output.
+
+-- Domain annotation --
+
+(Nov. 2015) These domain annotation scripts allow overlapping domains,
+and must be used with versions of the FASTA programs that support the
+current "start - stop domain_description" format (in contrast to the
+older format which put domain starts and stops on separate lines with
+'[' and ']'). Until this release, the "overlapping" domain scripts had
+'_e' in their name, e.g. ann_pfam28_e.pl. The "_e" scripts have been
+renamed, losing the '_e', and the old non-'_e' scripts have been
+removed from the distribution.
+
+All of the "ann_*.pl" scripts are used to annotate query or library
+sequences using the -V option. See ../test/test2V.pl for examples.
+
+
+ann_feats2ipr.pl -- generate Uniprot sites, Interpro domains, from a mySQL database
+ann_feats2l.pl -- generate Uniprot sites, domains from a mySQL database
+
+ann_feats_up_www2.pl -- generate Uniprot sites, domains from an EBI
+ web server that converts Uniprot DAS to gff3.
+
+ann_feats_up_www.pl -- generate Uniprot sites, domains from a Uniprot
+ gff web server (less information than ann_feats_up_www2.pl)
+
+ann_ipr_www.pl -- Interpro domains from Interpro WWW site.
+
+ann_pdb_cath.pl -- generate CATH domains using PDB accessions from a mySQL database
+ann_pdb_vast.pl -- use VAST domains, but domain names are not informative
+
+ann_pfam27.pl -- generate Pfam domains using local Pfam mySQL database (Pfam27 with auto_pfamA, auto_pfamseq)
+ann_pfam28.pl -- generate Pfam domains using local Pfam mySQL database (Pfam28, no auto_pfamA, auto_pfamseq)
+ann_pfam_www.pl -- use Pfam Website, and XML::Twig, to get Pfam domain info.
+
+ann_exons_ens.pl -- generate exon boundaries on SwissProt proteins from Ensembl.
+ann_exons_up_www.pl -- generate exon boundaries on SwissProt proteins using the EBI/Proteins/API/coordinate service
+ann_exons_ncbi.pl -- generate exon boundaries on NCBI refseq proteins.
+
+-- Library expansion
+
+expand_uniref50.pl -- allows search of uniref50 to be expanded
+expand_links.pl -- script to take hits from a smaller library and expand to complete library
+links2sql.pl -- create links for expand_links.pl
+
+exp_up_ensg.pl -- expand uniprot sequences to include Ensembl splice variants
+
+-- Plot local alignments (.lav files)
+
+lav2plt.pl -- used to produce postscript or svg plots of "lalign36 -m 11" lav output files
+color_defs.pl -- used by lav2plt.pl to produce domain colors
+lavplt_ps.pl -- used by lav2plt.pl --dev ps
+lavplt_svg.pl -- used by lav2plt.pl --dev svg
+
diff --git a/scripts/README.scripts b/scripts/README.scripts
new file mode 100644
index 0000000..cb479d8
--- /dev/null
+++ b/scripts/README.scripts
@@ -0,0 +1,84 @@
+
+ $Id: README.scripts 1258 2014-05-02 13:27:07Z wrp $
+ $Revision: 1258 $
+
+May 13, 2013
+
+This directory contains a variety of scripts that work with the two
+scripting options for the FASTA programs:
+
+-e expand_script.pl
+
+and
+
+-V \!annotate_script.pl
+
+All these scripts use a mysql database that provides a mapping between
+the sequence identifiers provided by the sequence library and the
+additional sequences (-e) or annotations (-V) that are generated.
+Thus, the scripts will NOT work as written because they reference
+mysql databases that are only available inside the University of
+Virginia.
+
+Scripts for sequence library expansion:
+
+expand_uniref50.pl -- produces new sequences from the
+ Uniref50 mapping of Uniref50 to Uniprot.
+
+expand_links.pl -- produce new sequences from a custom-built database
+ of protein links
+links2sql.pl -- build the file of protein accessions to linked accessions
+
+exp_up_ensg.pl -- (human sequences only) use the ENSEMBL to Uniprot
+ mapping to extract alternative splice isoforms.
+
+The expansion scripts expect a file name for a file that contains:
+
+sp|P09488|GSTM1<tab>1e-100
+...
+
+The file is then opened, read, and the accessions extracted and used
+to find the linked sequences.
+
+================
+
+Scripts for library annotation:
+
+The annotation scripts are very similar to the expansion scripts, but
+have the option of either (1) taking the name of a file of sequence
+identifiers to be annotated, e.g.
+
+ annot_script.pl annot_file
+
+or (2) taking an argument that is a single sequence identifier:
+
+ annot_script.pl 'sp|P09488|GSTM1_HUMAN' ('|' must be escaped for many shells)
+
+Several annotation scripts are available:
+
+ ann_feats2l.pl - get features and domains from local Uniprot database
+ ann_feats2ipr.pl -- get features from Uniprot and domains from a local Uniprot/Interpro database
+
+ ann_pfam.pl -- get domains (only) from local copy of Pfam SQL database
+ ann_pfam_www.pl -- get domains (only) from Pfam web services
+
+ ann_feats_up_www.pl -- get features/domains from Uniprot gff3 server (http://www.uniprot.org/uniprot/P09488.gff)
+
+The Uniprot gff service does not provide information on the actual sequence changes associated with mutants and variants:
+P09488 UniProtKB Natural variant 210 210 . . . ID=VAR_014497;Dbxref=dbSNP:rs449856
+P09488 UniProtKB Mutagenesis 7 7 . . . Note=Reduces catalytic activity 100-fold.
+However, the Uniprot XML service does provide this information, so a second resource:
+
+ ann_feats_up_www2.pl -- get features/domains from EMBL/EBI XSLT conversion of Uniprot XML:
+
+ http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/uniprotkb/P09488/gff2
+
+Provides substitution information, as well as links to references:
+
+P09488 UniProtKB natural_variant_site 210 210 . . . Note "S -> T" ; Note "UniProtKB FT ID: VAR_014497" ; Note "dbSNP:rs449856" ; Link "http://www.ensembl.org/Homo_sapiens/Variation/Explore?v=rs449856" ; Link "http://www.ncbi.nlm.nih.gov/SNP/snp_ref.cgi?type=rs&rs=449856"
+P09488 UniProtKB mutated_variant_site 7 7 . . . Note "Y -> F" ; Note "Reduces catalytic activity 100- fold" ; Link "http://www.ncbi.nlm.nih.gov/pubmed/16548513"
+
+All the annotation scripts offer -h and --help options.
+
+================
+
diff --git a/scripts/acc_examples b/scripts/acc_examples
new file mode 100644
index 0000000..b790db8
--- /dev/null
+++ b/scripts/acc_examples
@@ -0,0 +1,5 @@
+P09488
+sp|P09488
+up|P09488|GSTM1_HUMAN
+SP:GSTM1_HUMAN P09488
+SP:GSTM1_HUMAN
diff --git a/scripts/ann_exons_ens.pl b/scripts/ann_exons_ens.pl
new file mode 100755
index 0000000..a8666c1
--- /dev/null
+++ b/scripts/ann_exons_ens.pl
@@ -0,0 +1,287 @@
+#!/usr/bin/perl -w
+
+################################################################
+# copyright (c) 2014,2015 by William R. Pearson and The Rector &
+# Visitors of the University of Virginia */
+################################################################
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under this License is distributed on an "AS
+# IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+################################################################
+
+# ann_exons_ens.pl gets an annotation file from fasta36 -V with a line of the form:
+
+# gi|23065544|ref|NP_000552.2|
+#
+# and returns the exons present in the protein from NCBI gff3 tables (human and mouse only)
+#
+# it must:
+# (1) read in the line
+# (2) parse it to get the acc
+# (3) return the tab delimited exon boundaries
+
+
+use strict;
+
+use DBI;
+use Getopt::Long;
+use Pod::Usage;
+
+# Connection settings are package variables so that the defaults assigned
+# below can be overridden from the command line by GetOptions().
+use vars qw($host $db $port $user $pass);
+
+# captured at start-up; not referenced again in this script
+my $hostname = `/bin/hostname`;
+
+# defaults: host, database, port (0 leaves the port out of the DSN), user, password
+($host, $db, $port, $user, $pass) = ("wrpxdb.its.virginia.edu", "uniprot", 0, "web_user", "fasta_www");
+
+# flags, all initially off; GetOptions() below binds only $lav, $shelp and $help
+my ($auto_reg,$rpd2_fams, $neg_doms, $lav, $no_doms, $pf_acc, $shelp, $help) = (0, 0, 0, 0,0, 0,0,0);
+my ($min_nodom) = (10);
+
+# separator written between a label and its %domains number on output;
+# the second assignment ('~') is the value in effect
+my $color_sep_str = " :";
+$color_sep_str = '~';
+
+GetOptions(
+ "host=s" => \$host,
+ "db=s" => \$db,
+ "user=s" => \$user,
+ "password=s" => \$pass,
+ "port=i" => \$port,
+ "lav" => \$lav,
+ "min_nodom=i" => \$min_nodom,
+ "h|?" => \$shelp,
+ "help" => \$help,
+ );
+
+# -h/-? : usage message; --help : full POD; usage is also shown when there
+# is neither an argument nor piped/redirected standard input
+pod2usage(1) if $shelp;
+pod2usage(exitstatus => 0, verbose => 2) if $help;
+pod2usage(1) unless (@ARGV || -p STDIN || -f STDIN);
+
+# DBI data source name; host and port are appended only when they are set
+my $connect = "dbi:mysql(AutoCommit=>1,RaiseError=>1):database=$db";
+$connect .= ";host=$host" if $host;
+$connect .= ";port=$port" if $port;
+
+my $dbh = DBI->connect($connect,
+ $user,
+ $pass
+ ) or die $DBI::errstr;
+
+# %annot_types is not used further in this script; %domains and $domain_cnt
+# are the label => number table maintained by domain_name()
+my %annot_types = ();
+my %domains = (NODOM=>0);
+my $domain_cnt = 0;
+
+# routine that turns an executed statement handle into a feature list
+my $get_annot_sub = \&get_ensembl_exons;
+
+# exon rows selected by acc: ens_exons joined to ens2up on ensp, ordered by start
+my $get_exons_acc = $dbh->prepare(<<EOSQL);
+
+SELECT ex_num, ex_p_start as seq_start, ex_p_end as seq_end
+FROM ens_exons
+JOIN ens2up USING(ensp)
+WHERE acc=?
+ORDER BY seq_start
+
+EOSQL
+
+# the same exon rows selected by id, with annot2 joined on acc
+my $get_exons_id = $dbh->prepare(<<EOSQL);
+
+SELECT ex_num, ex_p_start as seq_start, ex_p_end as seq_end
+FROM ens_exons
+JOIN ens2up USING(ensp)
+JOIN annot2 using(acc)
+WHERE id=?
+ORDER BY seq_start
+
+EOSQL
+
+# statement used by default; show_annots() switches to $get_exons_id when needed
+my $get_annots_sql = $get_exons_acc;
+
+# Scratch variables shared with show_annots(), which fills them in while
+# parsing each identifier line.
+my ($tmp, $gi, $sdb, $acc, $id, $use_acc);
+
+# First argument: a single sequence identifier, or the name of a file of
+# identifiers.  Optional second argument: the sequence length.
+my ($query, $seq_len) = @ARGV;
+$seq_len = 0 unless defined($seq_len);
+
+$query =~ s/^>// if ($query);
+
+my @annots = ();
+
+# The argument is taken as an identifier when it
+#   - contains '|' or ':' (database|accession or SP:ID forms),
+#   - is a complete Uniprot accession, optionally with a ".N" version, or
+#   - starts like a RefSeq accession (NP_/XP_/NM_/XM_ followed by digits).
+# The Uniprot alternatives are grouped so that the '^' anchor and the
+# end-of-token test apply to both of them, and the RefSeq test uses
+# character classes ([XN][MP]) rather than the literal text "XNMP".
+# Anything else (including no argument at all) is read as a file through <>.
+unless ($query && ( $query =~ m/[\|:]/
+    || $query =~ m/^(?:[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})(?:\.\d+)?(?!\S)/
+    || $query =~ m/^[XN][MP]_\d+/)) {
+
+    while (my $a_line = <>) {
+        $a_line =~ s/^>//;
+        chomp $a_line;
+        push @annots, show_annots($a_line, $get_annot_sub);
+    }
+}
+else {
+    push @annots, show_annots("$query\t$seq_len", $get_annot_sub);
+}
+
+# One ">identifier" line per sequence, followed by its tab-delimited
+# features.  Outside --lav mode a label that has an entry in %domains gets
+# the separator and its %domains number appended.
+for my $seq_annot (@annots) {
+    print ">",$seq_annot->{seq_info},"\n";
+    for my $annot (@{$seq_annot->{list}}) {
+        if (!$lav && defined($domains{$annot->[-1]})) {
+            $annot->[-1] .= $color_sep_str.$domains{$annot->[-1]};
+        }
+        print join("\t",@$annot),"\n";
+    }
+}
+
+exit(0);
+
+# show_annots($query_len, $get_annot_sub)
+#
+#  $query_len     -- "identifier<tab>length"; the length part may be absent
+#  $get_annot_sub -- code ref, called as $get_annot_sub->($sth, $seq_len)
+#                    with the executed statement handle
+#
+# Parses the identifier into the file-level $sdb/$acc/$id variables, runs
+# the exon query by accession -- or by id for an "SP:ID" line that carries
+# no accession -- and returns a hash ref
+#   { seq_info => identifier, list => value returned by $get_annot_sub }.
+sub show_annots {
+    my ($query_len, $get_annot_sub) = @_;
+
+    my ($annot_line, $seq_len) = split(/\t/,$query_len);
+    # an empty input line yields no fields at all
+    $annot_line = '' unless defined($annot_line);
+
+    my %annot_data = (seq_info=>$annot_line);
+
+    $use_acc = 1;
+
+    if ($annot_line =~ m/^gi\|/) {
+        ($tmp, $gi, $sdb, $acc, $id) = split(/\|/,$annot_line);
+    }
+    elsif ($annot_line =~ m/^(sp|tr|up)\|/) {
+        ($sdb, $acc, $id) = split(/\|/,$annot_line);
+    }
+    elsif ($annot_line =~ m/^(SP|TR):(\w+) (\w+)/) {
+        ($sdb, $id, $acc) = (lc($1), $2, $3);
+    }
+    elsif ($annot_line =~ m/^(SP|TR):(\w+)/) {
+        # id only: the lookup has to go through the id query
+        ($sdb, $id, $acc) = (lc($1), $2, "");
+        $use_acc=0;
+    }
+    elsif ($annot_line =~ m/\|/) {
+        # the third field is the sequence id; it must not be stored in $db,
+        # which holds the database name
+        ($sdb, $acc, $id) = split(/\|/,$annot_line);
+    }
+    else {
+        ($acc) = ($annot_line =~ m/^(\S+)/);
+    }
+
+    # a short or empty line can leave $acc undefined
+    $acc = '' unless defined($acc);
+    # remove the version number
+    $acc =~ s/\.\d+$//;
+
+    if ($use_acc) {
+        $get_annots_sql = $get_exons_acc;
+        $get_annots_sql->execute($acc);
+    }
+    else {
+        $get_annots_sql = $get_exons_id;
+        $get_annots_sql->execute($id);
+    }
+
+    $annot_data{list} = $get_annot_sub->($get_annots_sql, $seq_len);
+
+    return \%annot_data;
+}
+
+# get_ensembl_exons($get_annots, $seq_length)
+#
+# Reads every row from the executed statement handle $get_annots and
+# returns a reference to a list with one entry per row, in fetch order:
+#   --lav mode : [seq_start, seq_end, "exon_<ex_num>"]
+#   otherwise  : [seq_start, '-', seq_end, "exon_<ex_num>"]
+# $seq_length is accepted but not used.
+sub get_ensembl_exons {
+    my ($get_annots, $seq_length) = @_;
+
+    my @feats = ();
+
+    while ( my $exon = $get_annots->fetchrow_hashref()) {
+        my $label = "exon_".$exon->{ex_num};
+        my @coords = $lav
+            ? ($exon->{seq_start}, $exon->{seq_end})
+            : ($exon->{seq_start}, '-', $exon->{seq_end});
+        push @feats, [@coords, $label];
+    }
+
+    return \@feats;
+}
+
+# domain_name($value)
+#
+# Registers a label in %domains: a label seen for the first time is given
+# the next number from $domain_cnt.  The label is returned unchanged.
+#
+
+sub domain_name {
+
+    my ($value) = @_;
+
+    $domains{$value} = ++$domain_cnt unless (defined($domains{$value}));
+
+    return $value;
+}
+
+__END__
+
+=pod
+
+=head1 NAME
+
+ann_exons_ens.pl
+
+=head1 SYNOPSIS
+
+ ann_exons_ens.pl 'sp|P09488|GSTM1_HUMAN' | accession.file
+
+=head1 OPTIONS
+
+ -h short help
+ --help include description
+ --neg-doms, -- report domains between annotated domains as NODOM
+ (also --neg, --neg_doms)
+ --min_nodom=10 -- minimum length between domains for NODOM
+
+ --host, --user, --password, --port --db -- info for mysql database
+
+=head1 DESCRIPTION
+
+C<ann_pfam.pl> extracts domain information from a mysql
+database. Currently, the program works with database sequence
+descriptions in one of two formats:
+
+ >pf26|649|O94823|AT10B_HUMAN -- RPD2_seqs
+
+(pf26 databases have auto_pfamseq in the second field) and
+
+ >gi|1705556|sp|P54670.1|CAF1_DICDI
+
+C<ann_pfam.pl> uses the C<pfamA_reg_full_significant>, C<pfamseq>,
+and C<pfamA> tables of the C<pfam> database to extract domain
+information on a protein. For proteins that have multiple domains
+associated with the same overlapping region (domains overlap by more
+than 1/3 of the domain length), C<auto_pfam.pl> selects the domain
+annotation with the best C<domain_evalue_score>. When domains overlap
+by less than 1/3 of the domain length, they are shortened to remove
+the overlap.
+
+C<ann_pfam.pl> is designed to be used by the B<FASTA> programs with
+the C<-V \!ann_pfam.pl> or C<-V "\!ann_pfam.pl --neg"> option.
+
+=head1 AUTHOR
+
+William R. Pearson, wrp at virginia.edu
+
+=cut
diff --git a/scripts/ann_exons_ncbi.pl b/scripts/ann_exons_ncbi.pl
new file mode 100755
index 0000000..26d1acc
--- /dev/null
+++ b/scripts/ann_exons_ncbi.pl
@@ -0,0 +1,243 @@
+#!/usr/bin/perl -w
+
+# ann_exons_ncbi.pl gets an annotation file from fasta36 -V with a line of the form:
+
+# gi|23065544|ref|NP_000552.2|
+#
+# and returns the exons present in the protein from NCBI gff3 tables (human and mouse only)
+#
+# it must:
+# (1) read in the line
+# (2) parse it to get the acc
+# (3) return the tab delimited exon boundaries
+
+
+use strict;
+
+use DBI;
+use Getopt::Long;
+use Pod::Usage;
+
+# Connection settings are package variables so that the defaults assigned
+# below can be overridden from the command line by GetOptions().
+use vars qw($host $db $port $user $pass);
+
+# captured at start-up; not referenced again in this script
+my $hostname = `/bin/hostname`;
+
+# defaults: host, database, port (0 leaves the port out of the DSN), user, password
+($host, $db, $port, $user, $pass) = ("wrpxdb.its.virginia.edu", "seqdb_demo2", 0, "web_user", "fasta_www");
+
+# option flags, all initially off
+my ($auto_reg,$rpd2_fams, $neg_doms, $lav, $no_doms, $pf_acc, $shelp, $help) = (0, 0, 0, 0,0, 0,0,0);
+my ($min_nodom) = (10);
+
+# separator written between a label and its %domains number on output;
+# the second assignment ('~') is the value in effect
+my $color_sep_str = " :";
+$color_sep_str = '~';
+
+# --neg, --neg_doms and --neg-doms are three spellings of the same flag
+GetOptions(
+ "host=s" => \$host,
+ "db=s" => \$db,
+ "user=s" => \$user,
+ "password=s" => \$pass,
+ "port=i" => \$port,
+ "lav" => \$lav,
+ "neg" => \$neg_doms,
+ "neg_doms" => \$neg_doms,
+ "neg-doms" => \$neg_doms,
+ "min_nodom=i" => \$min_nodom,
+ "pfacc" => \$pf_acc,
+ "RPD2" => \$rpd2_fams,
+ "auto_reg" => \$auto_reg,
+ "h|?" => \$shelp,
+ "help" => \$help,
+ );
+
+# -h/-? : usage message; --help : full POD; usage is also shown when there
+# is neither an argument nor piped/redirected standard input
+pod2usage(1) if $shelp;
+pod2usage(exitstatus => 0, verbose => 2) if $help;
+pod2usage(1) unless (@ARGV || -p STDIN || -f STDIN);
+
+# DBI data source name; host and port are appended only when they are set
+my $connect = "dbi:mysql(AutoCommit=>1,RaiseError=>1):database=$db";
+$connect .= ";host=$host" if $host;
+$connect .= ";port=$port" if $port;
+
+my $dbh = DBI->connect($connect,
+ $user,
+ $pass
+ ) or die $DBI::errstr;
+
+# %annot_types is not used further in this script; %domains and $domain_cnt
+# are the label => number table maintained by domain_name()
+my %annot_types = ();
+my %domains = (NODOM=>0);
+my $domain_cnt = 0;
+
+# routine that turns an executed statement handle into a feature list
+my $get_annot_sub = \&get_refseq_exons;
+
+# exon rows from ref_exons selected by acc, ordered by start
+my $get_exons_acc = $dbh->prepare(<<EOSQL);
+
+SELECT ex_num, ex_p_start as seq_start, ex_p_end as seq_end
+FROM ref_exons
+WHERE acc=?
+ORDER BY seq_start
+
+EOSQL
+
+# statement handle executed by show_annots()
+my $get_annots_sql = $get_exons_acc;
+
+# scratch variables filled in by show_annots()
+my ($tmp, $gi, $sdb, $acc, $id, $use_acc);
+
+# argument 1: one identifier or a file of identifiers; argument 2: optional length
+my ($query, $seq_len) = @ARGV;
+if (!defined($seq_len)) {
+    $seq_len = 0;
+}
+
+if ($query) {
+    $query =~ s/^>//;
+}
+
+my @annots = ();
+
+# a query containing '|' or ':' or starting with NP_/XP_ is a single
+# identifier; everything else is read line by line through <>
+my $single_id = ($query && ($query =~ m/[\|:]/ || $query =~ m/^[XN]P_/));
+
+if ($single_id) {
+    push @annots, show_annots("$query $seq_len", $get_annot_sub);
+}
+else {
+    while (defined(my $a_line = <>)) {
+        $a_line =~ s/^>//;
+        chomp $a_line;
+        push @annots, show_annots($a_line, $get_annot_sub);
+    }
+}
+
+# ">identifier" header, then one tab-delimited line per feature; outside
+# --lav mode a label found in %domains gets the separator and its number
+foreach my $entry (@annots) {
+    print ">",$entry->{seq_info},"\n";
+    foreach my $feat (@{$entry->{list}}) {
+        my $label = $feat->[-1];
+        if (!$lav && defined($domains{$label})) {
+            $feat->[-1] = $label.$color_sep_str.$domains{$label};
+        }
+        print join("\t",@$feat),"\n";
+    }
+}
+
+exit(0);
+
+# show_annots($query_len, $get_annot_sub)
+#
+#  $query_len     -- identifier, optionally followed by whitespace and a length
+#  $get_annot_sub -- code ref, called as $get_annot_sub->($sth, $seq_len)
+#                    with the executed statement handle
+#
+# Extracts the accession from the identifier, executes the exon query for
+# it and returns { seq_info => identifier, list => result of $get_annot_sub }.
+# Accepted identifier forms: "gi|N|db|ACC...", "db|ACC..." (including
+# "ref|ACC") and a bare accession such as the NP_/XP_ queries let through
+# by the caller.
+sub show_annots {
+    my ($query_len, $get_annot_sub) = @_;
+
+    my ($annot_line, $seq_len) = split(/\s+/,$query_len);
+    # an empty input line yields no fields at all
+    $annot_line = '' unless defined($annot_line);
+
+    my %annot_data = (seq_info=>$annot_line);
+
+    $use_acc = 1;
+    $get_annots_sql = $get_exons_acc;
+
+    if ($annot_line =~ m/^gi\|/) {
+        ($tmp, $gi, $sdb, $acc) = split(/\|/,$annot_line);
+    }
+    elsif ($annot_line =~ m/\|/) {
+        # "ref|ACC|..." and any other "db|ACC|..." form
+        ($sdb, $acc) = split(/\|/,$annot_line);
+    }
+    else {
+        # bare accession: without this branch $acc would keep the value
+        # parsed from the previous line (or be undefined on the first one)
+        ($sdb, $acc) = ('', $annot_line);
+    }
+
+    $acc = '' unless defined($acc);
+    # remove the version number
+    $acc =~ s/\.\d+$//;
+    $get_annots_sql->execute($acc);
+
+    $annot_data{list} = $get_annot_sub->($get_annots_sql, $seq_len);
+
+    return \%annot_data;
+}
+
+# get_refseq_exons($get_annots, $seq_length)
+#
+# Reads every row from the executed statement handle $get_annots and
+# returns a reference to a list with one entry per row, in fetch order:
+#   --lav mode : [seq_start, seq_end, "exon_<ex_num>"]
+#   otherwise  : [seq_start, '-', seq_end, "exon_<ex_num>"]
+# $seq_length is accepted but not used.
+sub get_refseq_exons {
+    my ($get_annots, $seq_length) = @_;
+
+    my @feats = ();
+
+    while ( my $exon = $get_annots->fetchrow_hashref()) {
+        my $label = "exon_".$exon->{ex_num};
+        my @coords = $lav
+            ? ($exon->{seq_start}, $exon->{seq_end})
+            : ($exon->{seq_start}, '-', $exon->{seq_end});
+        push @feats, [@coords, $label];
+    }
+
+    return \@feats;
+}
+
+# domain_name($value)
+#
+# Registers a label in %domains: a label seen for the first time is given
+# the next number from $domain_cnt.  The label is returned unchanged.
+#
+
+sub domain_name {
+
+    my ($value) = @_;
+
+    $domains{$value} = ++$domain_cnt unless (defined($domains{$value}));
+
+    return $value;
+}
+
+__END__
+
+=pod
+
+=head1 NAME
+
+ann_exons_ncbi.pl
+
+=head1 SYNOPSIS
+
+ ann_exons_ncbi.pl 'gi|23065544|ref|NP_000552.2|' | accession.file
+
+=head1 OPTIONS
+
+ -h short help
+ --help include description
+ --neg-doms, -- report domains between annotated domains as NODOM
+ (also --neg, --neg_doms)
+ --min_nodom=10 -- minimum length between domains for NODOM
+
+ --host, --user, --password, --port --db -- info for mysql database
+
+=head1 DESCRIPTION
+
+C<ann_pfam.pl> extracts domain information from a mysql
+database. Currently, the program works with database sequence
+descriptions in one of two formats:
+
+ >pf26|649|O94823|AT10B_HUMAN -- RPD2_seqs
+
+(pf26 databases have auto_pfamseq in the second field) and
+
+ >gi|1705556|sp|P54670.1|CAF1_DICDI
+
+C<ann_pfam.pl> uses the C<pfamA_reg_full_significant>, C<pfamseq>,
+and C<pfamA> tables of the C<pfam> database to extract domain
+information on a protein. For proteins that have multiple domains
+associated with the same overlapping region (domains overlap by more
+than 1/3 of the domain length), C<auto_pfam.pl> selects the domain
+annotation with the best C<domain_evalue_score>. When domains overlap
+by less than 1/3 of the domain length, they are shortened to remove
+the overlap.
+
+C<ann_pfam.pl> is designed to be used by the B<FASTA> programs with
+the C<-V \!ann_pfam.pl> or C<-V "\!ann_pfam.pl --neg"> option.
+
+=head1 AUTHOR
+
+William R. Pearson, wrp at virginia.edu
+
+=cut
diff --git a/scripts/ann_exons_up_www.pl b/scripts/ann_exons_up_www.pl
new file mode 100755
index 0000000..4db872d
--- /dev/null
+++ b/scripts/ann_exons_up_www.pl
@@ -0,0 +1,239 @@
+#!/usr/bin/perl -w
+
+################################################################
+# copyright (c) 2014,2015 by William R. Pearson and The Rector &
+# Visitors of the University of Virginia */
+################################################################
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under this License is distributed on an "AS
+# IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+################################################################
+
+# ann_exons_up_www.pl gets an annotation file from fasta36 -V with a line of the form:
+
+# gi|23065544|ref|NP_000552.2|
+#
+# and returns the exons present in the protein from NCBI gff3 tables (human and mouse only)
+#
+# it must:
+# (1) read in the line
+# (2) parse it to get the acc
+# (3) get exon information from EBI/Uniprot
+# (4) return the tab delimited exon boundaries
+
+# 22-May-2017 -- use get("http://"), not get_https("https://"), because EBI does not have LWP::Protocol:https
+
+use strict;
+
+use Getopt::Long;
+use LWP::Simple;
+use LWP::UserAgent;
+# use LWP::Protocol::https;
+use Pod::Usage;
+use JSON qw(decode_json);
+
+# declared for symmetry with the database-backed scripts; none of these
+# variables is assigned or read in the code below
+use vars qw($host $db $port $user $pass);
+
+my ($lav, $shelp, $help) = (0, 0,0);
+
+# separator written between an exon label and its number;
+# the second assignment ('~') is the value in effect
+my $color_sep_str = " :";
+$color_sep_str = '~';
+
+GetOptions(
+ "lav" => \$lav,
+ "h|?" => \$shelp,
+ "help" => \$help,
+ );
+
+# -h/-? : usage message; --help : full POD; usage is also shown when there
+# is neither an argument nor piped/redirected standard input
+pod2usage(1) if $shelp;
+pod2usage(exitstatus => 0, verbose => 2) if $help;
+pod2usage(1) unless (@ARGV || -p STDIN || -f STDIN);
+
+# label => number table maintained by domain_name()
+my %domains = (NODOM=>0);
+my @domain_list = (0);
+my $domain_cnt = 0;
+
+# scratch variables filled in by show_annots()
+my ($tmp, $gi, $sdb, $acc, $id, $use_acc);
+
+# NOTE(review): no sub named get_up_www_exons is visible in this file, and
+# show_annots() does not call the reference it is handed -- confirm
+my $get_annot_sub = \&get_up_www_exons;
+
+# $ua is used by get_https(); the URL prefix and suffix are wrapped around
+# the accession in show_annots()
+my $ua = LWP::UserAgent->new(ssl_opts=>{verify_hostname => 0});
+my $uniprot_url = 'http://www.ebi.ac.uk/proteins/api/coordinates/';
+my $uniprot_suff = ".json";
+
+# First argument: a single sequence identifier, or the name of a file of
+# identifiers.  Optional second argument: the sequence length.
+my ($query, $seq_len) = @ARGV;
+$seq_len = 0 unless defined($seq_len);
+
+$query =~ s/^>// if ($query);
+
+my @annots = ();
+my %annot_set = (); # re-use annotations if they are available (not yet implemented)
+
+# The argument is taken as an identifier when it contains '|' or ':', or is
+# a complete Uniprot accession (optionally with a ".N" version).  The two
+# accession alternatives are grouped so that the '^' anchor and the
+# end-of-token test apply to both; ungrouped, a bare accession of the
+# second form was never recognised and was opened as a file instead.
+# Anything else (including no argument at all) is read as a file through <>.
+unless ($query && ($query =~ m/[\|:]/ ||
+    $query =~ m/^(?:[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})(?:\.\d+)?(?!\S)/)) {
+
+    while (my $a_line = <>) {
+        $a_line =~ s/^>//;
+        chomp $a_line;
+        push @annots, show_annots($a_line, $get_annot_sub);
+    }
+}
+else {
+    push @annots, show_annots("$query\t$seq_len", $get_annot_sub);
+}
+
+# one ">identifier" line per sequence, followed by its tab-delimited exons
+for my $seq_annot (@annots) {
+    print ">",$seq_annot->{seq_info},"\n";
+    for my $annot (@{$seq_annot->{list}}) {
+        print join("\t",@$annot),"\n";
+    }
+}
+
+exit(0);
+
+# show_annots($query_len, $get_annot_sub)
+#
+#  $query_len     -- "identifier<tab>length"; the length part may be absent
+#  $get_annot_sub -- accepted for a uniform calling convention; not called
+#
+# Extracts the accession from the identifier, fetches
+# $uniprot_url . accession . $uniprot_suff and returns
+#   { seq_info => identifier, list => exon features }.
+# list is always an array ref; it is empty when there is no accession, the
+# request fails, the reply reports an error, or the reply cannot be parsed.
+sub show_annots {
+    my ($query_len, $get_annot_sub) = @_;
+
+    my ($annot_line, $seq_len) = split(/\t/,$query_len);
+    # an empty input line yields no fields at all
+    $annot_line = '' unless defined($annot_line);
+
+    my %annot_data = (seq_info=>$annot_line, list=>[]);
+
+    $use_acc = 1;
+
+    if ($annot_line =~ m/^gi\|/) {
+        ($tmp, $gi, $sdb, $acc, $id) = split(/\|/,$annot_line);
+    }
+    elsif ($annot_line =~ m/^(sp|tr|up)\|/) {
+        ($sdb, $acc, $id) = split(/\|/,$annot_line);
+    }
+    elsif ($annot_line =~ m/^(SP|TR):(\w+) (\w+)/) {
+        ($sdb, $id, $acc) = (lc($1), $2, $3);
+    }
+    elsif ($annot_line =~ m/^(SP|TR):(\w+)/) {
+        ($sdb, $id, $acc) = (lc($1), $2, "");
+        warn "*** $0 - accession required: $annot_line";
+    }
+    elsif ($annot_line =~ m/\|/) {
+        ($sdb, $acc) = split(/\|/,$annot_line);
+    }
+    else {
+        ($acc) = ($annot_line =~ m/^(\S+)/);
+    }
+
+    $acc = '' unless defined($acc);
+    # remove the version number
+    $acc =~ s/\.\d+$//;
+
+    # without an accession there is nothing to ask the server for
+    return \%annot_data if ($acc eq '');
+
+    my $exon_json = get($uniprot_url.$acc.$uniprot_suff);
+
+    unless (!$exon_json || $exon_json =~ m/errorMessage/ || $exon_json =~ m/Can not find/) {
+        # decode_json dies on malformed input; one bad reply should not end
+        # the annotation of every remaining sequence
+        my $exon_list = eval { parse_json_up_exons($exon_json) };
+        if ($exon_list) {
+            $annot_data{list} = $exon_list;
+        }
+        else {
+            warn "*** $0 - cannot parse exon data for $acc: $@";
+        }
+    }
+
+    return \%annot_data;
+}
+
+# parse_json_up_exons($exon_json)
+#
+# Decodes the JSON text and walks the exon list of the first gnCoordinate
+# entry.  Exons whose protein end position is not below the begin position
+# are kept and numbered 1, 2, ... in order of appearance.  Returns a
+# reference to a list of
+#   --lav mode : [begin, end, label]
+#   otherwise  : [begin, '-', end, label]
+# where label is "exon_<n>" . $color_sep_str . "<n>".
+sub parse_json_up_exons {
+    my ($exon_json) = @_;
+
+    my $coord_data = decode_json($exon_json);
+
+    my @ex_feats = ();
+    my $n_kept = 0;
+
+    for my $ex ( @{$coord_data->{'gnCoordinate'}[0]{'genomicLocation'}{'exon'}} ) {
+        my $p_begin = $ex->{'proteinLocation'}{'begin'}{'position'};
+        my $p_end = $ex->{'proteinLocation'}{'end'}{'position'};
+        next unless ($p_end >= $p_begin);
+
+        $n_kept++;
+        my $label = "exon_".$n_kept.$color_sep_str.$n_kept;
+
+        if ($lav) {
+            push @ex_feats, [$p_begin, $p_end, $label];
+        }
+        else {
+            push @ex_feats, [$p_begin, '-', $p_end, $label];
+        }
+    }
+
+    return \@ex_feats;
+}
+
+# get_https($url)
+#
+# Fetches $url with the file-level LWP::UserAgent object $ua and returns
+# the decoded response body, or '' when the request was not successful.
+sub get_https {
+    my ($url) = @_;
+
+    my $response = $ua->get($url);
+
+    return '' unless ($response->is_success);
+
+    my $body = $response->decoded_content;
+    return $body;
+}
+
+# domain_name($value)
+#
+# Registers a label in %domains: a label seen for the first time is given
+# the next number from $domain_cnt.  The label is returned unchanged.
+sub domain_name {
+
+    my ($value) = @_;
+
+    $domains{$value} = ++$domain_cnt unless (defined($domains{$value}));
+
+    return $value;
+}
+
+__END__
+
+=pod
+
+=head1 NAME
+
+ann_exons_up_www.pl
+
+=head1 SYNOPSIS
+
+ ann_exons_up_www.pl 'sp|P09488|GSTM1_HUMAN' | accession.file
+
+=head1 OPTIONS
+
+ -h short help
+ --help include description
+ --lav produce lav2plt.pl annotation format
+
+=head1 DESCRIPTION
+
+C<ann_exons_up_www.pl> extracts exon coordinates for proteins using
+the EBI Proteins REST API described here:
+C<https://www.ebi.ac.uk/proteins/api/doc/#coordinatesApi>. Exon
+intron boundaries, in protein coordinates, are available for Uniprot
+proteins with Ensembl entries.
+
+C<ann_exons_up_www.pl> is designed to be used by the B<FASTA> programs with
+the C<-V \!ann_exons_up_www.pl> or C<-V "q\!ann_exons_up_www.plg"> option.
+
+=head1 AUTHOR
+
+William R. Pearson, wrp at virginia.edu
+
+=cut
diff --git a/scripts/ann_feats2ipr.pl b/scripts/ann_feats2ipr.pl
new file mode 100755
index 0000000..0a0ae84
--- /dev/null
+++ b/scripts/ann_feats2ipr.pl
@@ -0,0 +1,526 @@
+#!/usr/bin/perl -w
+
+################################################################
+# copyright (c) 2014 by William R. Pearson and The Rector &
+# Visitors of the University of Virginia */
+################################################################
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under this License is distributed on an "AS
+# IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+################################################################
+
+# ann_feats2ipr.pl gets an annotation file from fasta36 -V with a line of the form:
+
+# gi|62822551|sp|P00502|GSTA1_RAT Glutathione S-transfer\n (at least from pir1.lseg)
+#
+# it must:
+# (1) read in the line
+# (2) parse it to get the up_acc
+# (3) return the tab delimited features
+#
+
+# this version takes "features:"
+# ACT_SITE, MOD_RES, SITE, METAL, VARIANT, MUTAGEN
+# from Uniprot and combines them with domain annotations from my merge of the Interpro database.
+#
+
+# ann_feats2ipr.pl is largely identical to ann_feats2l.pl, except that
+# it uses Interpro for domain/repeat information.
+
+use strict;
+
+use DBI;
+use Getopt::Long;
+use Pod::Usage;
+
+# Connection settings are package variables so that the defaults assigned
+# below can be overridden from the command line by GetOptions().
+# $dom_db is declared again with my further down; that lexical is the
+# variable bound to --dom_db.
+use vars qw($host $db $dom_db $a_table $port $user $pass);
+
+# label => number table; NODOM is added later when --neg is given
+my %domains = ();
+my $domain_cnt = 0;
+
+my $hostname = `/bin/hostname`;
+
+# connection defaults depend on the host name: hosts whose name contains
+# "ebi" use the second set, all others the first
+unless ($hostname =~ m/ebi/) {
+ ($host, $db, $a_table, $port, $user, $pass) = ("wrpxdb.its.virginia.edu", "uniprot", "annot2", 0, "web_user", "fasta_www");
+# $host = 'localhost';
+} else {
+ ($host, $db, $a_table, $port, $user, $pass) = ("mysql-pearson-prod", "up_db", "annot", 4124, "web_user", "fasta_www");
+}
+
+# option flags and values, all initially 0
+my ($lav, $neg_doms, $no_doms, $no_feats, $no_label, $use_ipr, $acc_comment, $shelp, $help, $no_mod, $dom_db, $db_ref_acc) =
+ (0,0,0,0,0,0,0,0,0,0,0,0);
+
+# separator written between a domain label and its %domains number;
+# the second assignment ('~') is the value in effect
+my $color_sep_str = " :";
+$color_sep_str = '~';
+
+# several options are accepted under more than one spelling
+# (underscore, hyphen, or run together)
+GetOptions(
+ "host=s" => \$host,
+ "db=s" => \$db,
+ "user=s" => \$user,
+ "password=s" => \$pass,
+ "port=i" => \$port,
+ "lav" => \$lav,
+ "no_mod" => \$no_mod,
+ "no-mod" => \$no_mod,
+ "no-doms" => \$no_doms,
+ "nodoms" => \$no_doms,
+ "dom_db=s" => \$dom_db,
+ "dom_acc" => \$db_ref_acc,
+ "dom-acc" => \$db_ref_acc,
+ "neg" => \$neg_doms,
+ "neg_doms" => \$neg_doms,
+ "neg-doms" => \$neg_doms,
+ "negdoms" => \$neg_doms,
+ "no_feats" => \$no_feats,
+ "no-feats" => \$no_feats,
+ "nofeats" => \$no_feats,
+ "no_label" => \$no_label,
+ "no-label" => \$no_label,
+ "nolabel" => \$no_label,
+ "ipr" => \$use_ipr,
+ "acc_comment" => \$acc_comment,
+ "h|?" => \$shelp,
+ "help" => \$help,
+ );
+
+# -h/-? : usage message; --help : full POD; usage is also shown when there
+# is neither an argument nor piped/redirected standard input
+pod2usage(1) if $shelp;
+pod2usage(exitstatus => 0, verbose => 2) if $help;
+pod2usage(1) unless (-p STDIN || -f STDIN || @ARGV);
+
+# DBI data source name; host and port are appended only when they are set
+my $connect = "dbi:mysql(AutoCommit=>1,RaiseError=>1):database=$db";
+$connect .= ";host=$host" if $host;
+$connect .= ";port=$port" if $port;
+
+my $dbh = DBI->connect($connect,
+ $user,
+ $pass
+ ) or die $DBI::errstr;
+
+# Uniprot feature keys handled by this script and the one-character symbol
+# reported for each (parallel arrays, same order).
+my @feat_keys = qw(ACT_SITE MOD_RES BINDING SITE METAL VARIANT MUTAGEN);
+my @feat_vals = ( '=','*','#','^','!','V','V');
+# legend text, parallel to the leading entries of @feat_vals
+my @feat_text = ( "active site", "phosphorylation", "binding site", "site", "metal binding");
+
+my @dom_vals = ( [ '[', ']'],[ '[', ']']);
+
+# feature key => symbol lookup, filled with a hash slice.  The slice must
+# start with the '@' sigil; the archived copy of this line had it replaced
+# by the text " at ", which does not compile.
+my %annot_types = ();
+@annot_types{@feat_keys} = @feat_vals;
+
+# --lav selects the lav annotation routine and turns site features off
+my $get_annot_sub = \&get_fasta_annots;
+if ($lav) {
+    $no_feats = 1;
+    $get_annot_sub = \&get_lav_annots;
+}
+
+# --neg: reserve number 0 for the NODOM label
+if ($neg_doms) {
+    $domains{'NODOM'}=0;
+}
+
+# --no_mod: drop modified-residue, mutagenesis and variant features from
+# both the legend tables and the lookup
+if ($no_mod) {
+    @feat_keys = qw(ACT_SITE BINDING SITE METAL);
+    @feat_text = ( "active site", "binding site", "site", "metal binding");
+    @feat_vals = ( '=','#','^','!');
+    delete($annot_types{'MOD_RES'});
+    delete($annot_types{'MUTAGEN'});
+    delete($annot_types{'VARIANT'});
+}
+
+# Prepared statements.  The *_id variants select by sequence id, *_acc by
+# accession and *_refacc by ref_acc; the *_domdb_* variants read prot2ipr
+# restricted to one dom_db value instead of prot2ipr_s.
+#
+# $dom_db comes from the --dom_db command line option and ends up inside
+# two SQL strings, so it is escaped and quoted by the database driver
+# rather than being wrapped in literal quote characters.
+my $dom_db_sql = $dbh->quote($dom_db);
+
+my $get_ft2_sites_id = $dbh->prepare(qq(select acc, pos, end, label, value, len from features2 join $a_table using(acc) where id=? and label in ('ACT_SITE','MOD_RES','BINDING','SITE','METAL','VARIANT','MUTAGEN') order by pos));
+
+my $get_ft2_sites_acc = $dbh->prepare(qq(select acc, pos, end, label, value, len from features2 join $a_table using(acc) where acc=? and label in ('ACT_SITE','MOD_RES','BINDING','SITE','METAL','VARIANT','MUTAGEN') order by pos));
+
+my $get_ft2_sites_refacc= $dbh->prepare(qq(select ref_acc, pos, end, label, value, len from features2 join $a_table using(acc) where ref_acc=? and label in ('ACT_SITE','MOD_RES','BINDING','SITE','METAL','VARIANT','MUTAGEN') order by pos));
+
+my $get_ipr_doms_id = $dbh->prepare(qq(select acc, start, stop, ipr_acc, db_ref, s_descr, len from prot2ipr_s join $a_table using(acc) join ipr_annot using(ipr_acc) where id=? order by start));
+
+my $get_ipr_domdb_id = $dbh->prepare(qq(select acc, start, stop, ipr_acc, db_ref, s_descr, len from prot2ipr join $a_table using(acc) join ipr_annot using(ipr_acc) where dom_db=$dom_db_sql AND id=? order by start));
+
+my $get_ipr_doms_acc = $dbh->prepare(qq(select acc, start, stop, ipr_acc, db_ref, s_descr, len from prot2ipr_s join $a_table using(acc) join ipr_annot using(ipr_acc) where acc=? order by start));
+
+my $get_ipr_doms_refacc = $dbh->prepare(qq(select ref_acc, start, stop, ipr_acc, db_ref, s_descr, len from prot2ipr_s join $a_table using(acc) join ipr_annot using(ipr_acc) where ref_acc=? order by start));
+
+my $get_ipr_domdb_acc = $dbh->prepare(qq(select acc, start, stop, ipr_acc, db_ref, s_descr, len from prot2ipr join $a_table using(acc) join ipr_annot using(ipr_acc) where dom_db=$dom_db_sql AND acc=? order by start));
+
+# statements used until show_annots() selects others
+my $get_sites_sql = $get_ft2_sites_id;
+my $get_doms_sql = $get_ipr_doms_id;
+
+# Scratch variables shared with show_annots(), which fills them in while
+# parsing each identifier line.
+my ($tmp, $gi, $sdb, $acc, $id, $use_acc);
+
+# legend: one "=<symbol>:<description>" line per entry of @feat_text,
+# unless features or labels were switched off
+unless ($no_feats || $no_label) {
+    for my $i ( 0 .. $#feat_text ) {
+        print "=",$feat_vals[$i],":",$feat_text[$i],"\n";
+    }
+}
+
+# First argument: a single sequence identifier, or the name of a file of
+# identifiers.  Optional second argument: the sequence length.
+my ($query, $seq_len) = @ARGV;
+
+$seq_len = 0 unless defined($seq_len);
+
+$query =~ s/^>// if $query;
+
+my @annots = ();
+
+# The argument is taken as an identifier when it contains '|' or ':',
+# starts with NP_/XP_, or is a complete Uniprot accession (optionally with
+# a ".N" version).  The two accession alternatives are grouped so that the
+# '^' anchor and the end-of-token test apply to both; ungrouped, a bare
+# accession of the second form was never recognised and was opened as a
+# file instead.  Anything else is read as a file through <>.
+unless ($query && ($query =~ m/[\|:]/ ||
+    $query =~ m/^[NX]P_/ ||
+    $query =~ m/^(?:[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})(?:\.\d+)?(?!\S)/)) {
+
+    while (my $a_line = <>) {
+        $a_line =~ s/^>//;
+        chomp $a_line;
+        my $annots_ref = show_annots($a_line, $get_annot_sub);
+        # show_annots() returns 0 when it could not find an accession
+        push @annots, $annots_ref if ($annots_ref);
+    }
+} else {
+    my $annots_ref = show_annots("$query\t$seq_len", $get_annot_sub);
+    push @annots, $annots_ref if ($annots_ref);
+}
+
+# One ">identifier" line per sequence, then its annotations.  Element 4 of
+# an annotation is looked up in %domains; when found (and not in --lav
+# mode) the separator and number are appended to the next-to-last element.
+# --lav prints the first three columns, otherwise the first four.
+for my $seq_annot (@annots) {
+    print ">",$seq_annot->{seq_info},"\n";
+    for my $annot (@{$seq_annot->{list}}) {
+        if (!$lav && defined($domains{$annot->[4]})) {
+            $annot->[-2] .= $color_sep_str.$domains{$annot->[4]};
+        }
+        if ($lav) {
+            print join("\t",@$annot[0 .. 2]),"\n";
+        }
+        else {
+            print join("\t",@$annot[0 .. 3]),"\n";
+        }
+    }
+}
+
+exit(0);
+
+# show_annots($query_length, $get_annot_sub)
+#
+#  $query_length  -- "identifier<tab>length"
+#  $get_annot_sub -- code ref, called as
+#                    $get_annot_sub->($seq_length, $sites_sth, $domains_sth)
+#
+# Parses the identifier into the file-level $sdb/$acc/$id variables, then
+# executes the site query (unless $no_feats) and the domain query (unless
+# $no_doms) that match the identifier: by id for "SP:ID" lines without an
+# accession, by ref_acc when the database tag is 'ref', otherwise by acc.
+# Returns { seq_info => identifier, list => result of $get_annot_sub }, or
+# 0 (after a message on STDERR) when an accession is needed but missing.
+sub show_annots {
+    my ($query_length, $get_annot_sub) = @_;
+
+    my ($annot_line, $seq_length) = split(/\t/,$query_length);
+
+    my %annot_data = (seq_info=>$annot_line);
+
+    # The database tags are matched with (?:a|b) groups.  A bracketed
+    # [sp|ref] is a character class that matches a single character, so it
+    # can never match the tags "sp" or "ref".
+    if ($annot_line =~ m/^gi\|/ && $annot_line =~ m/\|(?:sp|ref)\|/) {
+        $use_acc = 1;
+        ($tmp, $gi, $sdb, $acc, $id) = split(/\|/,$annot_line);
+        # prefer the accession that follows an |sp| tag elsewhere in the line
+        if ($sdb !~ m/sp/ && $annot_line =~ m/\|sp\|(\w+)/) {
+            ($acc) = ($annot_line =~ m/\|sp\|(\w+)/);
+        }
+    }
+    elsif ($annot_line =~ m/^(?:sp|tr|ref|up)\|/ ) {
+        $use_acc = 1;
+        ($sdb, $acc) = split(/\|/,$annot_line);
+    }
+    elsif ($annot_line =~ m/^gi\|/) {
+        $use_acc = 1;
+        ($tmp, $gi, $sdb, $acc, $id) = split(/\|/,$annot_line);
+    }
+    elsif ($annot_line =~ m/^(SP|TR):(\w+) (\w+)/) {
+        $use_acc = 1;
+        ($sdb, $id, $acc) = ($1,$2,$3);
+        $sdb = lc($sdb)
+    }
+    elsif ($annot_line =~ m/^(SP|TR):(\w+)/) {
+        # id only: the queries have to go through the id
+        $use_acc = 0;
+        ($sdb, $id, $acc) = ($1,$2,"");
+        $sdb = lc($sdb)
+    }
+    elsif ($annot_line =~ m/\|/) { # new NCBI swissprot format
+        $use_acc = 1;
+        ($sdb, $acc, $id) = split(/\|/,$annot_line);
+    } else {
+        $use_acc = 1;
+        $sdb = 'sp';
+        ($acc) = split(/\s+/,$annot_line);
+    }
+
+    unless ($use_acc) {
+        unless ($no_feats) {
+            $get_sites_sql = $get_ft2_sites_id;
+            $get_sites_sql->execute($id);
+        }
+        unless ($no_doms) {
+            if ($dom_db) {
+                $get_doms_sql = $get_ipr_domdb_id;
+            }
+            else {
+                $get_doms_sql = $get_ipr_doms_id;
+            }
+
+            $get_doms_sql->execute($id);
+        }
+    } else {
+        unless ($acc) {
+            print STDERR "ann_feats2ipr.pl no acc: $annot_line\n";
+            return 0;
+        }
+        # remove the version number
+        $acc =~ s/\.\d+$//;
+        if ($sdb eq 'ref') {
+            unless ($no_feats) {
+                $get_sites_sql = $get_ft2_sites_refacc;
+                $get_sites_sql->execute($acc);
+            }
+            unless ($no_doms) {
+                $get_doms_sql = $get_ipr_doms_refacc;
+                $get_doms_sql->execute($acc);
+            }
+        }
+        else {
+            unless ($no_feats) {
+                $get_sites_sql = $get_ft2_sites_acc;
+                $get_sites_sql->execute($acc);
+            }
+            unless ($no_doms) {
+                if ($dom_db) {
+                    $get_doms_sql = $get_ipr_domdb_acc;
+                }
+                else {
+                    $get_doms_sql = $get_ipr_doms_acc;
+                }
+                $get_doms_sql->execute($acc);
+            }
+        }
+    }
+
+    $annot_data{list} = $get_annot_sub->($seq_length, $get_sites_sql, $get_doms_sql);
+
+    return \%annot_data;
+}
+# get_fasta_annots($seq_len, $get_sites_sql, $get_doms_sql): build the annotation rows from the two executed statement handles; returns an array ref sorted by position
+sub get_fasta_annots {
+ my ($seq_len, $get_sites_sql, $get_doms_sql) = @_;
+
+ my ($acc, $pos, $end, $label, $value, $comment, $len);
+ # the caller's length is discarded here; the len column of the fetched rows is used instead
+ $seq_len = 0;
+
+ my @feats2 = (); # features with start/stop, for checking overlap, adding negative
+ my @sites = (); # sites with one position
+
+ # get sites
+ unless ($no_feats) {
+ while (($acc, $pos, $end, $label, $value, $len) = $get_sites_sql->fetchrow_array()) {
+ $seq_len = $len if ($len > $seq_len);
+ next unless $annot_types{$label}; # skip labels that have no symbol in %annot_types
+ if ($label =~ m/VARIANT/) {
+ my ($aa_res, $comment) = split(/\(/,$value); # residue change, then the text after the first "("
+ if ($comment) {
+ $comment =~ s/\)//;
+ # remove the /FTId=VAR_014497 information
+ $comment =~ s/\s+\/FTId=.*$//;
+ } else {
+ $comment = "";
+ }
+ next if ($comment =~ /MISSING/);
+ my ($vfrom, $vto) = ($aa_res =~ m/(\w)\s*->\s*(\w)/); # one word character on each side of "->"
+ if ($vto) {
+ $comment = '' unless $comment;
+ $value = $vto;
+ push @sites, [$pos, $annot_types{$label}, $value, $comment, ""]; # row: position, symbol, new residue, comment, ""
+ }
+ } elsif ($label =~ m/MUTAGEN/) {
+ my ($aa_res, $comment) = split(/: /,$value); # NOTE(review): $comment is undef when the value has no ": "
+ next if ($comment =~ /MISSING/);
+ my ($vfrom, $vto) = split(/\->/,$aa_res);
+ if ($vto) {
+ my @vto_list = split(/,/,$vto); # comma-separated replacements get one row each
+ $value = $vto;
+ for my $val ( @vto_list) {
+ push @sites, [$pos, $annot_types{$label}, $val, "Mutagen: $comment", ""];
+ }
+ }
+ } else {
+ push @sites, [$pos, $annot_types{$label}, "-", "$label: $value", ""]; # any other site: "-" in the residue column
+ }
+ }
+ }
+ # domains: one [start, "-", end, label, ipr_acc] entry per fetched row
+ unless ($no_doms) {
+ my ($ipr_acc, $db_ref, $s_descr) = ("","","");
+ while (($acc, $pos, $end, $ipr_acc, $db_ref, $s_descr, $len) = $get_doms_sql->fetchrow_array()) {
+ $db_ref =~ s/G3DSA://;
+ $seq_len = $len unless ($seq_len > $len);
+ # label: tidied description, plus "{ipr_acc}" if $acc_comment; $db_ref_acc, else $use_ipr, replaces it
+ $value = domain_name($ipr_acc, $s_descr);
+ if ($acc_comment) {
+ $value .= "{$ipr_acc}";
+ }
+ if ($db_ref_acc) {
+ $value = $db_ref;
+ }
+ elsif ($use_ipr) {
+ $value = $ipr_acc;
+ }
+
+ push @feats2, [$pos, "-", $end, $value, $ipr_acc];
+ }
+ }
+
+ # ensure that domains do not overlap
+ for (my $i=1; $i < scalar(@feats2); $i++) {
+ my $diff = $feats2[$i-1]->[2] - $feats2[$i]->[0]; # previous end minus this start
+ if ($diff >= 0) {
+ $feats2[$i-1]->[2] = $feats2[$i]->[0]+ int($diff/2); # previous domain now ends midway through the overlap
+ $feats2[$i]->[0] = $feats2[$i-1]->[2] + 1; # this one starts right after it
+ }
+ }
+ # with $neg_doms, gaps of more than 10 residues before, between and after the domains become NODOM entries
+ my @n_feats2 = ();
+
+ if ($neg_doms) {
+ my $last_end = 0;
+ for my $feat ( @feats2 ) {
+ if ($feat->[0] - $last_end > 10) {
+ push @n_feats2, [$last_end+1, "-", $feat->[0]-1, "NODOM", "NODOM"];
+ }
+ $last_end = $feat->[2];
+ }
+ if ($seq_len - $last_end > 10) {
+ push @n_feats2, [$last_end+1, "-", $seq_len, "NODOM", "NODOM"];
+ }
+ }
+ # each domain becomes a '[' row at its start (carrying the label) and a ']' row at its end
+ my @feats = ();
+ for my $feat (@feats2, @n_feats2) {
+ push @feats, [$feat->[0], '[', '-', $feat->[-2], $feat->[-1] ];
+ push @feats, [$feat->[2], ']', '-', "", ""];
+ }
+ # numeric sort on position, site rows and domain rows together
+ @feats = sort { $a->[0] <=> $b->[0] } (@sites, @feats);
+
+ return \@feats;
+}
+# get_lav_annots($seq_len, $get_sites_sql, $get_doms_sql): domain rows as [start, end, name] for --lav output
+sub get_lav_annots {
+ my ($seq_len, $get_sites_sql, $get_doms_sql) = @_;
+ # only the already-executed domain handle is read; $seq_len and the
+ # sites handle are accepted so the call matches get_fasta_annots
+ my @lav_rows;
+
+ # column order: acc, pos, end, ipr_acc, db_ref, s_descr, len
+ while (my @dom_row = $get_doms_sql->fetchrow_array()) {
+ my ($dom_start, $dom_stop, $ipr, $descr) = @dom_row[1, 2, 3, 5];
+ # domain_name() returns the tidied description and records $ipr in %domains
+ push @lav_rows, [$dom_start, $dom_stop, domain_name($ipr, $descr)];
+ }
+ return \@lav_rows;
+}
+
+# domain_name($ipr_acc, $s_descr) shortens an InterPro short description
+# by deleting the first "-domain"/"_domain" and "-homology"/"_homology"
+# and returns it.  As a side effect each new $ipr_acc is numbered
+# 1, 2, 3, ... in %domains.  Thus:
+# "SH3_domain" becomes "SH3"
+#
+
+sub domain_name {
+ my ($ipr_acc, $s_descr) = @_;
+
+ # first time this accession is seen: give it the next serial number
+ $domains{$ipr_acc} = ++$domain_cnt unless defined($domains{$ipr_acc});
+
+ # delete the first "-domain"/"_domain", then the first "-homology"/"_homology"
+ $s_descr =~ s/[\-_]domain//;
+ $s_descr =~ s/[\-_]homology//;
+
+ # NOTE(review): rewrites the first 20 characters with themselves, i.e.
+ # changes nothing; possibly meant to truncate -- confirm intent
+ $s_descr =~ s/^(.{20})/$1/;
+ return $s_descr;
+}
+
+
+__END__
+
+=pod
+
+=head1 NAME
+
+ann_feats2.pl
+
+=head1 SYNOPSIS
+
+ ann_feats2.pl --no_doms --no_feats --lav 'sp|P09488|GSTM1_HUMAN' | accession.file
+
+=head1 OPTIONS
+
+ -h short help
+ --help include description
+
+ --acc_comment provide the InterPro accession in {IPR00123} brackets for links
+ --dom_db=G3DSA use a single domain database (e.g. PF, G3DSA, PS5) from InterPro
+ --dom_acc provide the domain accession, not the description, as the domain label
+ --neg, --neg_doms, --neg-doms label non-domain regions > 10 residues as "NODOM"
+ --ipr provide InterPro accession as label
+ --no-doms do not show domain boundaries (domains are always shown with --lav)
+ --no-feats do not show feature (variants, active sites, phospho-sites)
+ --no-label do not show feature key (=*:phosphorylation, etc)
+
+ --lav produce lav2plt.pl annotation format, only show domains/repeats
+
+ --host, --user, --password, --port --db -- info for mysql database
+
+=head1 DESCRIPTION
+
+C<ann_feats2ipr.pl> extracts feature, domain, and repeat information from
+two mysql databases (default names: uniprot/ipr2) built by parsing the
+uniprot_sprot.dat and uniprot_trembl.dat feature tables. Given a
+command line argument that contains a sequence accession (P09488) or
+identifier (GSTM1_HUMAN), the program looks up the features available
+for that sequence and returns them in a tab-delimited format:
+
+ >sp|P09488
+ 2 - 88 DOMAIN: GST N-terminal.
+ 7 V F Mutagen: Reduces catalytic activity 100- fold.
+ 23 * - MOD_RES: Phosphotyrosine (By similarity).
+ 33 * - MOD_RES: Phosphotyrosine (By similarity).
+ 34 * - MOD_RES: Phosphothreonine (By similarity).
+ 90 - 208 DOMAIN: GST C-terminal.
+ 108 V S Mutagen: Changes the properties of the enzyme toward some substrates.
+ 108 V Q Mutagen: Reduces catalytic activity by half.
+ 109 V I Mutagen: Reduces catalytic activity by half.
+ 116 # - BINDING: Substrate.
+ 116 V A Mutagen: Reduces catalytic activity 10-fold.
+ 116 V F Mutagen: Slight increase of catalytic activity.
+ 173 V N in allele GSTM1B; dbSNP:rs1065411.
+ 210 V T in dbSNP:rs449856.
+
+If features are provided, then a legend of feature symbols is provided
+as well (disabled with C<--no-label>):
+
+ =*:phosphorylation
+ ==:active site
+ =@:site
+ =^:binding
+ =!:metal binding
+
+If the C<--lav> option is specified, domain and repeat features are
+presented in a different format for the C<lav2plt.pl> program:
+
+ >sp|P09488|GSTM1_HUMAN
+ 2 88 GST N-terminal.
+ 90 208 GST C-terminal.
+
+C<ann_feats2.pl> is designed to be used by the B<FASTA> programs with
+the C<-V \!ann_feats2.pl> option. It can also be used with the lav2plt.pl
+program with the C<--xA "\!ann_feats2.pl --lav"> or C<--yA "\!ann_feats2.pl --lav"> options.
+
+=head1 AUTHOR
+
+William R. Pearson, wrp at virginia.edu
+
+=cut
diff --git a/scripts/ann_feats2ipr_e.pl b/scripts/ann_feats2ipr_e.pl
new file mode 100755
index 0000000..99edc4f
--- /dev/null
+++ b/scripts/ann_feats2ipr_e.pl
@@ -0,0 +1,544 @@
+#!/usr/bin/perl -w
+
+################################################################
+# copyright (c) 2014 by William R. Pearson and The Rector &
+# Visitors of the University of Virginia */
+################################################################
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under this License is distributed on an "AS
+# IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+################################################################
+
+# ann_feats2ipr.pl gets an annotation file from fasta36 -V with a line of the form:
+
+# gi|62822551|sp|P00502|GSTA1_RAT Glutathione S-transfer\n (at least from pir1.lseg)
+#
+# it must:
+# (1) read in the line
+# (2) parse it to get the up_acc
+# (3) return the tab delimited features
+#
+
+# this version takes "features:"
+# ACT_SITE, MOD_RES, SITE, METAL, VARIANT, MUTAGEN
+# from Uniprot and combines them with domain annotations from my merge of the Interpro database.
+#
+
+# ann_feats2ipr.pl is largely identical to ann_feats2l.pl, except that
+# it uses Interpro for domain/repeat information.
+
+use strict;
+
+use DBI;
+use Getopt::Long;
+use Pod::Usage;
+# package globals for the connection settings; --host/--db/--user/--password/--port overwrite them
+use vars qw($host $db $dom_db $a_table $port $user $pass);
+# %domains: domain key => serial number; $domain_cnt: last number issued (both updated in domain_name)
+my %domains = ();
+my $domain_cnt = 0;
+# the database defaults depend on whether the local host name contains "ebi"
+my $hostname = `/bin/hostname`;
+
+unless ($hostname =~ m/ebi/) {
+ ($host, $db, $a_table, $port, $user, $pass) = ("wrpxdb.its.virginia.edu", "uniprot", "annot2", 0, "web_user", "fasta_www");
+# $host = 'xdb';
+} else {
+ ($host, $db, $a_table, $port, $user, $pass) = ("mysql-pearson-prod", "up_db", "annot", 4124, "web_user", "fasta_www");
+}
+# option flags, all off by default; this lexical $dom_db hides the package variable declared above
+my ($lav, $neg_doms, $no_doms, $no_feats, $no_label, $use_ipr, $acc_comment, $shelp, $help, $no_mod, $dom_db, $db_ref_acc, $bound_comment) =
+ (0,0,0,0,0,0,0,0,0,0,0,0,0);
+# the initial separator " :" is replaced by '~' on the next line
+my ($show_color, $color_sep_str) = (1," :");
+$color_sep_str = '~';
+# command-line options; alternative spellings of one option write to the same variable
+GetOptions(
+ "host=s" => \$host,
+ "db=s" => \$db,
+ "user=s" => \$user,
+ "password=s" => \$pass,
+ "port=i" => \$port,
+ "lav" => \$lav,
+ "bound_comment" => \$bound_comment,
+ "color!" => \$show_color, # negatable: --nocolor / --no-color
+ "no_mod" => \$no_mod,
+ "no-mod" => \$no_mod,
+ "no-doms|no_doms" => \$no_doms, # no_doms alias added: the SYNOPSIS uses --no_doms
+ "nodoms" => \$no_doms,
+ "dom_db=s" => \$dom_db,
+ "dom_acc" => \$db_ref_acc,
+ "dom-acc" => \$db_ref_acc,
+ "neg" => \$neg_doms,
+ "neg_doms" => \$neg_doms,
+ "neg-doms" => \$neg_doms,
+ "negdoms" => \$neg_doms,
+ "no_feats" => \$no_feats,
+ "no-feats" => \$no_feats,
+ "nofeats" => \$no_feats,
+ "no_label" => \$no_label,
+ "no-label" => \$no_label,
+ "nolabel" => \$no_label,
+ "ipr" => \$use_ipr,
+ "acc_comment" => \$acc_comment,
+ "h|?" => \$shelp,
+ "help" => \$help,
+ );
+# -h/-?: usage; --help: verbose POD; usage is also shown when there is no argument and STDIN is neither a pipe nor a file
+pod2usage(1) if $shelp;
+pod2usage(exitstatus => 0, verbose => 2) if $help;
+pod2usage(1) unless (-p STDIN || -f STDIN || @ARGV);
+# DBI data source string; host and port are appended only when they are set (port defaults to 0 off-EBI)
+my $connect = "dbi:mysql(AutoCommit=>1,RaiseError=>1):database=$db";
+$connect .= ";host=$host" if $host;
+$connect .= ";port=$port" if $port;
+
+my $dbh = DBI->connect($connect,
+ $user,
+ $pass
+ ) or die $DBI::errstr;
+# parallel arrays: feature keys, their one-character symbols, and legend text for the first five symbols
+my @feat_keys = qw(ACT_SITE MOD_RES BINDING SITE METAL VARIANT MUTAGEN);
+my @feat_vals = ( '=','*','#','^','!','V','V');
+my @feat_text = ( "active site", "phosphorylation", "binding site", "site", "metal binding");
+# feature key => symbol, filled with a hash slice
+my %annot_types = ();
+@annot_types{@feat_keys} = @feat_vals;
+# annotation routine: get_fasta_annots by default; --lav selects get_lav_annots and turns site features off
+my $get_annot_sub = \&get_fasta_annots;
+if ($lav) {
+ $no_feats = 1;
+ $get_annot_sub = \&get_lav_annots;
+}
+# register NODOM in %domains with number 0, so NODOM rows are treated as known domains by the output loop
+if ($neg_doms) {
+ $domains{'NODOM'}=0;
+}
+# --no_mod: MOD_RES leaves the legend arrays; MOD_RES, MUTAGEN and VARIANT lose their symbols
+if ($no_mod) {
+ @feat_keys = qw(ACT_SITE BINDING SITE METAL);
+ @feat_text = ( "active site", "binding site", "site", "metal binding");
+ @feat_vals = ( '=','#','^','!');
+ delete($annot_types{'MOD_RES'});
+ delete($annot_types{'MUTAGEN'});
+ delete($annot_types{'VARIANT'});
+}
+# site-feature statements: lookup by id, by acc, and by ref_acc; each takes one placeholder
+my $get_ft2_sites_id = $dbh->prepare( <<EOSQL );
+SELECT acc, pos, end, label, value, len
+ FROM features2
+ JOIN $a_table USING(acc)
+WHERE id=?
+ AND label in ('ACT_SITE','MOD_RES','BINDING','SITE','METAL','VARIANT','MUTAGEN')
+ORDER BY pos
+EOSQL
+
+my $get_ft2_sites_acc = $dbh->prepare( <<EOSQL );
+SELECT acc, pos, end, label, value, len
+ FROM features2
+ JOIN $a_table USING(acc)
+WHERE acc=?
+ AND label IN ('ACT_SITE','MOD_RES','BINDING','SITE','METAL','VARIANT','MUTAGEN')
+ORDER BY pos
+EOSQL
+
+my $get_ft2_sites_refacc= $dbh->prepare( <<EOSQL );
+select ref_acc, pos, end, label, value, len from features2 join $a_table using(acc) where ref_acc=? and label in ('ACT_SITE','MOD_RES','BINDING','SITE','METAL','VARIANT','MUTAGEN') order by pos
+EOSQL
+# domain statements: prot2ipr_s for all domain databases, prot2ipr restricted to the --dom_db database
+my $get_ipr_doms_id = $dbh->prepare(qq(select acc, start, stop, ipr_acc, db_ref, s_descr, len from prot2ipr_s join $a_table using(acc) join ipr_annot using(ipr_acc) where id=? order by start));
+my $dom_db_sql = $dbh->quote($dom_db); # quoted by the driver instead of being pasted between '...' by hand
+my $get_ipr_domdb_id = $dbh->prepare(qq(select acc, start, stop, ipr_acc, db_ref, s_descr, len from prot2ipr join $a_table using(acc) join ipr_annot using(ipr_acc) where dom_db=$dom_db_sql AND id=? order by start));
+
+my $get_ipr_doms_acc = $dbh->prepare(qq(select acc, start, stop, ipr_acc, db_ref, s_descr, len from prot2ipr_s join $a_table using(acc) join ipr_annot using(ipr_acc) where acc=? order by start));
+
+my $get_ipr_doms_refacc = $dbh->prepare(qq(select ref_acc, start, stop, ipr_acc, db_ref, s_descr, len from prot2ipr_s join $a_table using(acc) join ipr_annot using(ipr_acc) where ref_acc=? order by start));
+
+my $get_ipr_domdb_acc = $dbh->prepare(qq(select acc, start, stop, ipr_acc, db_ref, s_descr, len from prot2ipr join $a_table using(acc) join ipr_annot using(ipr_acc) where dom_db=$dom_db_sql AND acc=? order by start));
+# current statement handles; show_annots re-points them for every header
+my $get_sites_sql = $get_ft2_sites_id;
+my $get_doms_sql = $get_ipr_doms_id;
+# header fields shared between calls of show_annots
+my ($tmp, $gi, $sdb, $acc, $id, $use_acc);
+# legend: one "=symbol:text" line per @feat_text entry, unless features or labels are turned off
+unless ($no_feats || $no_label) {
+ for my $i ( 0 .. $#feat_text ) {
+ print "=",$feat_vals[$i],":",$feat_text[$i],"\n";
+ }
+ # print "=*:phosphorylation\n";
+ # print "==".":active site\n";
+ # print "=@".":site\n";
+ # print "=^:binding\n";
+ # print "=!:metal binding\n";
+}
+
+# get the query
+my ($query, $seq_len) = @ARGV;
+
+$seq_len = 0 unless defined($seq_len);
+# drop a leading ">" from the header
+$query =~ s/^>// if ($query);
+# one {seq_info, list} hash ref per successfully looked-up sequence
+my @annots = ();
+
+#if it's a file I can open, read and parse it
+unless ($query && $query =~ m/[\|:]/ ) {
+ # no "|" or ":" in the first argument: read header lines from the files in @ARGV, or from STDIN
+ while (my $a_line = <>) {
+ $a_line =~ s/^>//;
+ chomp $a_line;
+ my $annots_ref = show_annots($a_line, $get_annot_sub);
+ push @annots, $annots_ref if ($annots_ref); # show_annots returns 0 when it finds no accession
+ }
+} else {
+ my $annots_ref = show_annots("$query\t$seq_len", $get_annot_sub);
+ push @annots, $annots_ref if ($annots_ref);
+}
+# Output section: a ">header" line per sequence, then its rows tab-separated
+for my $seq_annot (@annots) {
+ print ">",$seq_annot->{seq_info},"\n";
+ # --lav rows print columns 0..2, FASTA annotation rows columns 0..3
+ my $last_col = $lav ? 2 : 3;
+ for my $annot (@{$seq_annot->{list}}) {
+ # FASTA rows whose 5th field is a key of %domains get a suffix on
+ # their label: "start:end" with --bound_comment, else (unless
+ # --nocolor) the domain's %domains value
+ unless ($lav || !defined($domains{$annot->[4]})) {
+ if ($bound_comment) {
+ $annot->[-2] .= $color_sep_str.$annot->[0].":".$annot->[2];
+ }
+ elsif ($show_color) {
+ $annot->[-2] .= $color_sep_str.$domains{$annot->[4]};
+ }
+ }
+ print join("\t",@$annot[0 .. $last_col]),"\n";
+ }
+}
+
+exit(0);
+# show_annots("header<TAB>length", \&annot_sub): look up one sequence; returns a {seq_info, list} hash ref, or 0 if no accession is found
+sub show_annots {
+ my ($query_length, $get_annot_sub) = @_;
+ # the argument carries the annotation line and the sequence length, tab-separated
+ my ($annot_line, $seq_length) = split(/\t/,$query_length);
+
+ my %annot_data = (seq_info=>$annot_line);
+ # recognise the header layout; sets the file-level $sdb/$acc/$id and $use_acc
+ if ($annot_line =~ m/^gi\|/ && $annot_line =~ m/\|(?:sp|ref)\|/) { # alternation group: [sp|ref] matched one character
+ $use_acc = 1;
+ ($tmp, $gi, $sdb, $acc, $id) = split(/\|/,$annot_line);
+ if ($sdb !~ m/sp/ && $annot_line =~ m/\|sp\|(\w+)/) {
+ ($sdb, $acc) = ('sp', $1); # prefer a SwissProt accession and query it as one, not via ref_acc
+ }
+ }
+ elsif ($annot_line =~ m/^(?:sp|tr|ref)\|/ ) { # alternation group: [sp|tr|ref] matched one character
+ $use_acc = 1;
+ ($sdb, $acc) = split(/\|/,$annot_line);
+ }
+ elsif ($annot_line =~ m/^gi\|/) {
+ $use_acc = 1;
+ ($tmp, $gi, $sdb, $acc, $id) = split(/\|/,$annot_line);
+ }
+ elsif ($annot_line =~ m/^(SP|TR):(\w+) (\w+)/) {
+ ($sdb, $id, $acc) = ($1,$2,$3);
+ $use_acc = 1;
+ $sdb = lc($sdb)
+ }
+ elsif ($annot_line =~ m/^(SP|TR):(\w+)/) {
+ ($sdb, $id, $acc) = ($1,$2,"");
+ $use_acc = 0;
+ $sdb = lc($sdb)
+ }
+ elsif ($annot_line !~ m/\|/) { # new NCBI swissprot format
+ $use_acc =1;
+ $sdb = 'sp';
+ ($acc) = split(/\s+/,$annot_line);
+ } else {
+ $use_acc = 1;
+ ($sdb, $acc, $id) = split(/\|/,$annot_line);
+ }
+
+ # id-only headers (SP:/TR: without accession) are looked up by id, everything else by accession
+ unless ($use_acc) {
+ unless ($no_feats) {
+ $get_sites_sql = $get_ft2_sites_id;
+ $get_sites_sql->execute($id);
+ }
+ unless ($no_doms) {
+ if ($dom_db) {
+ $get_doms_sql = $get_ipr_domdb_id;
+ }
+ else {
+ $get_doms_sql = $get_ipr_doms_id;
+ }
+
+ $get_doms_sql->execute($id);
+ }
+ } else {
+ unless ($acc) {
+ print STDERR "ann_feats2ipr.pl no acc: $annot_line\n";
+ return 0;
+ }
+ $acc =~ s/\.\d+$//; # remove version number
+ if ($sdb eq 'ref') {
+ unless ($no_feats) {
+ $get_sites_sql = $get_ft2_sites_refacc;
+ $get_sites_sql->execute($acc);
+ }
+ unless ($no_doms) {
+ $get_doms_sql = $get_ipr_doms_refacc;
+ $get_doms_sql->execute($acc);
+ }
+ }
+ else {
+ unless ($no_feats) {
+ $get_sites_sql = $get_ft2_sites_acc;
+ $get_sites_sql->execute($acc);
+ }
+ unless ($no_doms) {
+ if ($dom_db) {
+ $get_doms_sql = $get_ipr_domdb_acc;
+ }
+ else {
+ $get_doms_sql = $get_ipr_doms_acc;
+ }
+ $get_doms_sql->execute($acc);
+ }
+ }
+ }
+
+ $annot_data{list} = $get_annot_sub->($seq_length, $get_sites_sql, $get_doms_sql);
+
+ return \%annot_data;
+}
+# get_fasta_annots($seq_len, $get_sites_sql, $get_doms_sql): build the annotation rows from the two executed statement handles; returns an array ref sorted by position
+sub get_fasta_annots {
+ my ($seq_len, $get_sites_sql, $get_doms_sql) = @_;
+
+ my ($acc, $pos, $end, $label, $value, $comment, $len);
+ # the caller's length is discarded here; the len column of the fetched rows is used instead
+ $seq_len = 0;
+
+ my @feats2 = (); # features with start/stop, for checking overlap, adding negative
+ my @sites = (); # sites with one position
+
+ # get sites
+ unless ($no_feats) {
+ while (($acc, $pos, $end, $label, $value, $len) = $get_sites_sql->fetchrow_array()) {
+ $seq_len = $len if ($len > $seq_len);
+ next unless $annot_types{$label}; # skip labels that have no symbol in %annot_types
+ if ($label =~ m/VARIANT/) {
+ my ($aa_res, $comment) = split(/\(/,$value); # residue change, then the text after the first "("
+ if ($comment) {
+ $comment =~ s/\)//;
+ # remove the /FTId=VAR_014497 information
+ $comment =~ s/\s+\/FTId=.*$//;
+ } else {
+ $comment = "";
+ }
+ next if ($comment =~ /MISSING/);
+ my ($vfrom, $vto) = ($aa_res =~ m/(\w)\s*->\s*(\w)/); # one word character on each side of "->"
+ if ($vto) {
+ $comment = '' unless $comment;
+ $value = $vto;
+ push @sites, [$pos, $annot_types{$label}, $value, $comment, ""]; # row: position, symbol, new residue, comment, ""
+ }
+ } elsif ($label =~ m/MUTAGEN/) {
+ my ($aa_res, $comment) = split(/: /,$value); # NOTE(review): $comment is undef when the value has no ": "
+ next if ($comment =~ /MISSING/);
+ my ($vfrom, $vto) = split(/\->/,$aa_res);
+ if ($vto) {
+ my @vto_list = split(/,/,$vto); # comma-separated replacements get one row each
+ $value = $vto;
+ for my $val ( @vto_list) {
+ push @sites, [$pos, $annot_types{$label}, $val, "Mutagen: $comment", ""];
+ }
+ }
+ } else {
+ push @sites, [$pos, $annot_types{$label}, "-", "$label: $value", ""]; # any other site: "-" in the residue column
+ }
+ }
+ }
+ # domains: one [start, "-", end, label, ipr_acc] entry per fetched row
+ unless ($no_doms) {
+ my ($ipr_acc, $db_ref, $s_descr) = ("","","");
+ while (($acc, $pos, $end, $ipr_acc, $db_ref, $s_descr, $len) = $get_doms_sql->fetchrow_array()) {
+ $db_ref =~ s/G3DSA://;
+ $seq_len = $len unless ($seq_len > $len);
+ # label: tidied description, plus "{ipr_acc}" if $acc_comment; $db_ref_acc, else $use_ipr, replaces it
+ $value = domain_name($ipr_acc, $s_descr);
+ if ($acc_comment) {
+ $value .= "{$ipr_acc}";
+ }
+ if ($db_ref_acc) {
+ $value = $db_ref;
+ }
+ elsif ($use_ipr) {
+ $value = $ipr_acc;
+ }
+
+ push @feats2, [$pos, "-", $end, $value, $ipr_acc];
+ }
+ }
+
+ # ensure that domains do not overlap
+ for (my $i=1; $i < scalar(@feats2); $i++) {
+ my $diff = $feats2[$i-1]->[2] - $feats2[$i]->[0]; # previous end minus this start
+ if ($diff >= 0) {
+ $feats2[$i-1]->[2] = $feats2[$i]->[0]+ int($diff/2); # previous domain now ends midway through the overlap
+ $feats2[$i]->[0] = $feats2[$i-1]->[2] + 1; # this one starts right after it
+ }
+ }
+ # with $neg_doms, gaps of more than 10 residues before, between and after the domains become NODOM entries
+ my @n_feats2 = ();
+
+ if ($neg_doms) {
+ my $last_end = 0;
+ for my $feat ( @feats2 ) {
+ if ($feat->[0] - $last_end > 10) {
+ push @n_feats2, [$last_end+1, "-", $feat->[0]-1, "NODOM", "NODOM"];
+ }
+ $last_end = $feat->[2];
+ }
+ if ($seq_len - $last_end > 10) {
+ push @n_feats2, [$last_end+1, "-", $seq_len, "NODOM", "NODOM"];
+ }
+ }
+ # each domain (and NODOM region) becomes one [start, '-', end, label, key] row
+ my @feats = ();
+ for my $feat (@feats2, @n_feats2) {
+ push @feats, [$feat->[0], '-', $feat->[2], $feat->[-2], $feat->[-1] ];
+ }
+ # numeric sort on position, site rows and domain rows together
+ @feats = sort { $a->[0] <=> $b->[0] } (@sites, @feats);
+
+ return \@feats;
+}
+# get_lav_annots($seq_len, $get_sites_sql, $get_doms_sql): domain rows as [start, stop, name] for --lav output
+sub get_lav_annots {
+ my ($seq_len, $get_sites_sql, $get_doms_sql) = @_;
+ # only the already-executed domain handle is read; $seq_len and the
+ # sites handle are accepted so the call matches get_fasta_annots
+ my @lav_rows;
+
+ # column order: acc, start, stop, ipr_acc, db_ref, s_descr, len
+ while (my @dom_row = $get_doms_sql->fetchrow_array()) {
+ my ($dom_start, $dom_stop, $ipr, $descr) = @dom_row[1, 2, 3, 5];
+ # domain_name() returns the tidied description and records $ipr in %domains
+ push @lav_rows, [$dom_start, $dom_stop, domain_name($ipr, $descr)];
+ }
+ return \@lav_rows;
+}
+
+# domain_name($ipr_acc, $s_descr) shortens an InterPro short description
+# by deleting the first "-domain"/"_domain" and "-homology"/"_homology"
+# and returns it.  As a side effect each new $ipr_acc is numbered
+# 1, 2, 3, ... in %domains.  Thus:
+# "SH3_domain" becomes "SH3"
+#
+
+sub domain_name {
+ my ($ipr_acc, $s_descr) = @_;
+
+ # first time this accession is seen: give it the next serial number
+ $domains{$ipr_acc} = ++$domain_cnt unless defined($domains{$ipr_acc});
+
+ # delete the first "-domain"/"_domain", then the first "-homology"/"_homology"
+ $s_descr =~ s/[\-_]domain//;
+ $s_descr =~ s/[\-_]homology//;
+
+ # NOTE(review): rewrites the first 20 characters with themselves, i.e.
+ # changes nothing; possibly meant to truncate -- confirm intent
+ $s_descr =~ s/^(.{20})/$1/;
+ return $s_descr;
+}
+
+
+__END__
+
+=pod
+
+=head1 NAME
+
+ann_feats2.pl
+
+=head1 SYNOPSIS
+
+ ann_feats2.pl --no_doms --no_feats --lav 'sp|P09488|GSTM1_HUMAN' | accession.file
+
+=head1 OPTIONS
+
+ -h short help
+ --help include description
+
+ --acc_comment provide the InterPro accession in {IPR00123} brackets for links
+ --dom_db=G3DSA use a single domain database (e.g. PF, G3DSA, PS5) from InterPro
+ --dom_acc provide the domain accession, not the description, as the domain label
+ --neg, --neg_doms, --neg-doms label non-domain regions > 10 residues as "NODOM"
+ --ipr provide InterPro accession as label
+ --no-doms do not show domain boundaries (domains are always shown with --lav)
+ --no-feats do not show feature (variants, active sites, phospho-sites)
+ --no-label do not show feature key (=*:phosphorylation, etc)
+
+ --lav produce lav2plt.pl annotation format, only show domains/repeats
+
+ --host, --user, --password, --port --db -- info for mysql database
+
+=head1 DESCRIPTION
+
+C<ann_feats2ipr.pl> extracts feature, domain, and repeat information from
+two mysql databases (default names: uniprot/ipr2) built by parsing the
+uniprot_sprot.dat and uniprot_trembl.dat feature tables. Given a
+command line argument that contains a sequence accession (P09488) or
+identifier (GSTM1_HUMAN), the program looks up the features available
+for that sequence and returns them in a tab-delimited format:
+
+ >sp|P09488
+ 2 - 88 DOMAIN: GST N-terminal.
+ 7 V F Mutagen: Reduces catalytic activity 100- fold.
+ 23 * - MOD_RES: Phosphotyrosine (By similarity).
+ 33 * - MOD_RES: Phosphotyrosine (By similarity).
+ 34 * - MOD_RES: Phosphothreonine (By similarity).
+ 90 - 208 DOMAIN: GST C-terminal.
+ 108 V S Mutagen: Changes the properties of the enzyme toward some substrates.
+ 108 V Q Mutagen: Reduces catalytic activity by half.
+ 109 V I Mutagen: Reduces catalytic activity by half.
+ 116 # - BINDING: Substrate.
+ 116 V A Mutagen: Reduces catalytic activity 10-fold.
+ 116 V F Mutagen: Slight increase of catalytic activity.
+ 173 V N in allele GSTM1B; dbSNP:rs1065411.
+ 210 V T in dbSNP:rs449856.
+
+If features are provided, then a legend of feature symbols is provided
+as well (disabled with C<--no-label>):
+
+ ==:active site
+ =*:phosphorylation
+ =#:binding site
+ =^:site
+ =!:metal binding
+
+If the C<--lav> option is specified, domain and repeat features are
+presented in a different format for the C<lav2plt.pl> program:
+
+ >sp|P09488|GSTM1_HUMAN
+ 2 88 GST N-terminal.
+ 90 208 GST C-terminal.
+
+C<ann_feats2.pl> is designed to be used by the B<FASTA> programs with
+the C<-V \!ann_feats2.pl> option. It can also be used with the lav2plt.pl
+program with the C<--xA "\!ann_feats2.pl --lav"> or C<--yA "\!ann_feats2.pl --lav"> options.
+
+=head1 AUTHOR
+
+William R. Pearson, wrp at virginia.edu
+
+=cut
diff --git a/scripts/ann_feats_up_sql.pl b/scripts/ann_feats_up_sql.pl
new file mode 100755
index 0000000..c11d3e2
--- /dev/null
+++ b/scripts/ann_feats_up_sql.pl
@@ -0,0 +1,463 @@
+#!/usr/bin/perl -w
+
+################################################################
+# copyright (c) 2014,2015 by William R. Pearson and The Rector &
+# Visitors of the University of Virginia
+################################################################
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under this License is distributed on an "AS
+# IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+################################################################
+
+# ann_feats_up_sql.pl gets an annotation file from fasta36 -V with a line of the form:
+
+# gi|62822551|sp|P00502|GSTA1_RAT Glutathione S-transfer\n (at least from pir1.lseg)
+#
+# it must:
+# (1) read in the line
+# (2) parse it to get the up_acc
+# (3) return the tab delimited features
+#
+
+# this version can read feature2 uniprot features (acc/pos/end/label/value), but returns sorted start/end domains
+# modified 18-Jan-2016 to produce annotation symbols consistent with ann_feats_up_www2.pl
+
+use strict;
+
+use DBI;
+use Getopt::Long;
+use Pod::Usage;
+# package globals for the connection settings; --host/--db/--user/--password/--port overwrite them
+use vars qw($host $db $a_table $port $user $pass);
+# %domains: domain name => serial number; $domain_cnt: last number issued (both updated in domain_name)
+my %domains = ();
+my $domain_cnt = 0;
+# the database defaults depend on whether the local host name contains "ebi"
+my $hostname = `/bin/hostname`;
+
+unless ($hostname =~ m/ebi/) {
+ ($host, $db, $a_table, $port, $user, $pass) = ("wrpxdb.its.virginia.edu", "uniprot", "annot2", 0, "web_user", "fasta_www");
+# $host = 'xdb';
+}
+else {
+ ($host, $db, $a_table, $port, $user, $pass) = ("mysql-pearson-prod", "up_db", "annot", 4124, "web_user", "fasta_www");
+}
+# option flags, all off by default (the list holds one more 0 than there are variables; the extra is ignored)
+my ($sstr, $lav, $neg_doms, $no_vars, $no_doms, $no_feats, $shelp, $help, $pfam26) = (0,0,0,0,0,0,0,0,0,0);
+my ($min_nodom) = (10); # a gap must be longer than this to be reported as NODOM
+
+my ($show_color) = (1);
+my $color_sep_str = " :"; # replaced by '~' on the next line
+$color_sep_str = '~';
+# command-line options; alternative spellings of one option write to the same variable
+GetOptions(
+ "host=s" => \$host,
+ "db=s" => \$db,
+ "user=s" => \$user,
+ "password=s" => \$pass,
+ "port=i" => \$port,
+ "lav" => \$lav,
+ "no_doms" => \$no_doms,
+ "no-doms" => \$no_doms,
+ "nodoms" => \$no_doms,
+ "no_var" => \$no_vars,
+ "no-var" => \$no_vars,
+ "novar" => \$no_vars,
+ "neg" => \$neg_doms,
+ "neg_doms" => \$neg_doms,
+ "neg-doms" => \$neg_doms,
+ "negdoms" => \$neg_doms,
+ "min_nodom=i" => \$min_nodom,
+ "min-nodom=i" => \$min_nodom,
+ "no_feats" => \$no_feats,
+ "no-feats" => \$no_feats,
+ "nofeats" => \$no_feats,
+ "color!" => \$show_color, # negatable: --nocolor / --no-color
+ "sstr" => \$sstr,
+ "h|?" => \$shelp,
+ "help" => \$help,
+ );
+# -h/-?: usage; --help: verbose POD; usage is also shown when there is no argument and STDIN is neither a pipe nor a file
+pod2usage(1) if $shelp;
+pod2usage(exitstatus => 0, verbose => 2) if $help;
+pod2usage(1) unless (@ARGV || -p STDIN || -f STDIN);
+# DBI data source string; host and port are appended only when they are set (port defaults to 0 off-EBI)
+my $connect = "dbi:mysql(AutoCommit=>1,RaiseError=>1):database=$db";
+$connect .= ";host=$host" if $host;
+$connect .= ";port=$port" if $port;
+
+my $dbh = DBI->connect($connect,
+ $user,
+ $pass
+ ) or die $DBI::errstr;
+
+# parallel arrays: site feature keys, their one-character symbols, and their legend names
+my @feat_keys = qw(ACT_SITE MOD_RES BINDING SITE METAL);
+my @feat_vals = ( '=','*','#','^','!');
+my @feat_names = ('Active site', 'Modified', 'Substrate binding', 'Site', 'Metal binding');
+unless ($no_vars) {
+ push @feat_keys, qw(VARIANT MUTAGEN);
+ push @feat_vals, ('V','V');
+ push @feat_names, ('',''); # empty names, so no legend line is printed for these two
+}
+# feature key => legend name, filled with a hash slice
+my %feat_label = ();
+@feat_label{@feat_keys} = @feat_names;
+# domain/repeat and secondary-structure keys; each maps to a ['[', ']'] pair
+my @dom_keys = qw( DOMAIN REPEAT );
+my @dom_vals = ( [ '[', ']'],[ '[', ']']);
+
+my @ssr_keys = qw( SSTR );
+my @ssr_vals = ( [ '[', ']']);
+# label => symbol table handed to the annotation routine; labels missing from it are ignored there
+my %annot_types = ();
+# annotation routine: get_fasta_annots by default; --lav selects get_lav_annots and turns site features off
+my $get_annot_sub = \&get_fasta_annots;
+if ($lav) {
+ $no_feats = 1;
+ $get_annot_sub = \&get_lav_annots;
+}
+# --sstr reports only SSTR rows; otherwise site features and domains unless each is switched off
+if ($sstr) {@annot_types{@ssr_keys} = @ssr_vals;}
+else {
+ @annot_types{@feat_keys} = @feat_vals unless ($no_feats);
+ @annot_types{@dom_keys} = @dom_vals unless ($no_doms);
+}
+# register NODOM in %domains with number 0, so NODOM rows are treated as known domains by the output loop
+if ($neg_doms) {
+ $domains{'NODOM'}=0;
+}
+# feature statements: lookup by id, by acc, and by ref_acc; each takes one placeholder
+my $get_annots_id = $dbh->prepare(qq(select acc, pos, end, label, value, len from features2 join $a_table using(acc) where id=? order by pos));
+my $get_annots_acc = $dbh->prepare(qq(select acc, pos, end, label, value, len from features2 join $a_table using(acc) where acc=? order by pos));
+
+my $get_annots_refacc = $dbh->prepare(qq(select ref_acc, pos, end, label, value, len from features2 join $a_table using(acc) where ref_acc=? order by pos));
+# NOTE(review): $up_atable is not used anywhere in the part of the script visible here
+my $up_atable = "uniprot." . $a_table;
+# current statement handle; show_annots re-points it for every header
+my $get_annots_sql = $get_annots_id;
+# header fields shared between calls of show_annots (get_lav_annots also assigns $acc)
+my ($tmp, $gi, $sdb, $acc, $id, $use_acc);
+# legend: one "=symbol:name" line per feature key that has a non-empty name
+unless ($no_feats || $sstr) {
+ for my $i ( 0 .. $#feat_keys) {
+ next unless $feat_label{$feat_keys[$i]};
+ print "=",$feat_vals[$i],":",$feat_label{$feat_keys[$i]},"\n";
+ }
+}
+
+# unless ($no_feats || $sstr) {
+# print "=*:phosphorylation\n";
+# print "==".":active site\n";
+# print "=@".":site\n";
+# print "=^:binding\n";
+# print "=!:metal binding\n";
+# }
+
+# get the query
+my ($query, $seq_len) = @ARGV;
+$seq_len = 0 unless defined($seq_len);
+# drop a leading ">" from the header
+$query =~ s/^>// if ($query);
+# one {seq_info, list} hash ref per sequence
+my @annots = ();
+# NOTE(review): in the third pattern "^" binds only to the first alternative and "\s" only to the second -- confirm intent
+#if it's a file I can open, read and parse it
+unless ($query && ($query =~ m/[\|:]/ ||
+ $query =~ m/^[NX]P_/ ||
+ $query =~ m/^[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}\s/)) {
+ # the first argument does not look like a header or accession: read header lines from the files in @ARGV, or from STDIN
+ while (my $a_line = <>) {
+ $a_line =~ s/^>//;
+ chomp $a_line;
+ push @annots, show_annots($a_line, $get_annot_sub);
+ }
+}
+else {
+ push @annots, show_annots("$query\t$seq_len", $get_annot_sub);
+}
+# Output section: a ">header" line per sequence, then every row with all its columns tab-joined
+for my $seq_annot (@annots) {
+ print ">",$seq_annot->{seq_info},"\n";
+ my $tag_domains = (!$lav && $show_color); # suffixes only outside --lav mode and with color enabled
+ for my $annot (@{$seq_annot->{list}}) {
+ my $dom_num = $tag_domains ? $domains{$annot->[-1]} : undef;
+ $annot->[-1] .= $color_sep_str.$dom_num if defined($dom_num);
+ print join("\t",@$annot),"\n";
+ }
+}
+
+exit(0);
+# show_annots("header<TAB>length", \&annot_sub): run the feature query for one sequence; returns a {seq_info, list} hash ref
+sub show_annots {
+ my ($query_len, $get_annot_sub) = @_;
+
+ my ($annot_line, $seq_len) = split(/\t/,$query_len);
+
+ # Header layouts handled below (the fields go to the file-level
+ # $tmp/$gi/$sdb/$acc/$id variables):
+ #   gi|number|db|acc|id
+ #   SP:id acc   or   TR:id acc
+ #   SP:id       or   TR:id       (the only form looked up by id)
+ #   acc ...                      (no "|" at all; database set to sp)
+ #   db|acc|id
+ $use_acc = 1;
+ if ($annot_line =~ m/^gi\|/) {
+ ($tmp, $gi, $sdb, $acc, $id) = split(/\|/,$annot_line);
+ }
+ elsif ($annot_line =~ m/^(SP|TR):(\w+) (\w+)/) {
+ ($sdb, $id, $acc) = (lc($1),$2,$3);
+ }
+ elsif ($annot_line =~ m/^(SP|TR):(\w+)/) {
+ ($sdb, $id) = (lc($1),$2);
+ $use_acc = 0;
+ }
+ elsif ($annot_line !~ m/\|/) { # new NCBI swissprot format
+ $sdb = 'sp';
+ ($acc) = split(/\s+/,$annot_line);
+ }
+ else {
+ ($sdb, $acc, $id) = split(/\|/,$annot_line);
+ }
+
+ if ($use_acc) {
+ # database tags containing "ref" are matched on ref_acc, all others on acc
+ $get_annots_sql = ($sdb =~ m/ref/) ? $get_annots_refacc : $get_annots_acc;
+ $acc =~ s/\.\d+$//; # remove version number
+ $get_annots_sql->execute($acc);
+ }
+ else {
+ $get_annots_sql = $get_annots_id;
+ $get_annots_sql->execute($id);
+ }
+
+ # the selected annotation routine reads its rows from the executed handle
+ my %annot_data = (seq_info=>$annot_line);
+
+ $annot_data{list} = $get_annot_sub->(\%annot_types, $get_annots_sql, $seq_len);
+
+ return \%annot_data;
+}
+
+# get_fasta_annots(\%annot_types, $get_annots_sql, $seq_len): build the annotation rows from the executed statement handle; returns an array ref sorted by position
+sub get_fasta_annots {
+ my ($annot_types, $get_annots_sql, $seq_len) = @_;
+
+ my ($acc, $pos, $end, $label, $value, $comment, $len);
+ # the caller's length is discarded here; the len column of the fetched rows is used instead
+ $seq_len = 0;
+
+ my @feats2 = (); # features with start/stop, for checking overlap, adding negative
+ my @sites = (); # sites with one position
+ # one pass over all feature rows; labels absent from %$annot_types are ignored
+ while (($acc, $pos, $end, $label, $value, $len) = $get_annots_sql->fetchrow_array()) {
+ $seq_len = $len if ($len > $seq_len);
+ if ($annot_types->{$label}) {
+ if ($label =~ m/VARIANT/) {
+ my ($aa_res, $comment) = split(/\(/,$value); # residue change, then the text after the first "("
+ if ($comment) {
+ $comment =~ s/\)//;
+# remove the /FTId=VAR_014497 information
+ $comment =~ s/\s+\/FTId=.*$//;
+ }
+ else {$comment = "";}
+ next if ($comment =~ /MISSING/);
+ my ($vfrom, $vto) = ($aa_res =~ m/(\w)\s*->\s*(\w)/); # one word character on each side of "->"
+ if ($vto) {
+ $comment = '' unless $comment;
+ $value = $vto;
+ push @sites, [$pos, $annot_types->{$label}, $value, $comment]; # row: position, symbol, new residue, comment
+ }
+ } elsif ($label =~ m/MUTAGEN/) {
+ my ($aa_res, $comment) = split(/: /,$value); # NOTE(review): $comment is undef when the value has no ": "
+ next if ($comment =~ /MISSING/);
+ my ($vfrom, $vto) = split(/\->/,$aa_res);
+ next if (length($vfrom) > 1 || length($vto) > 1); # only single-residue changes are kept
+ if ($vto) {
+ my @vto_list = split(/,/,$vto);
+ $value = $vto;
+ for my $val ( @vto_list) {
+ push @sites, [$pos, $annot_types->{$label}, $val, "Mutagen: $comment"];
+ }
+ }
+ } elsif ($label =~ m/DOMAIN/ || $label =~ m/REPEAT/) {
+ $value = domain_name($label,$value); # canonical name; also numbers the domain in %domains
+ push @feats2, [$pos, "-", $end, $value];
+
+ } elsif ($label =~ m/SSTR/) {
+ next if $value =~ m/TURN/;
+ push @feats2, [$pos, "-", $end, $value];
+ }
+ else {
+# print join("\t",($pos, $annot_types->{$label})),"\n";
+# print join("\t",($pos, $annot_types->{$label}, "-", "$label: $value")),"\n";
+ push @sites, [$pos, $annot_types->{$label}, "-", "$label: $value"]; # any other site: "-" in the residue column
+ }
+ }
+ }
+
+ # ensure that domains do not overlap
+ for (my $i=1; $i < scalar(@feats2); $i++) {
+ my $diff = $feats2[$i-1]->[2] - $feats2[$i]->[0]; # previous end minus this start
+ if ($diff >= 0) {
+ $feats2[$i-1]->[2] = $feats2[$i]->[0]+ int($diff/2); # previous domain now ends midway through the overlap
+ $feats2[$i]->[0] = $feats2[$i-1]->[2] + 1; # this one starts right after it
+ }
+ }
+ # with $neg_doms, gaps longer than $min_nodom before, between and after the domains become NODOM entries
+ my @n_feats2 = ();
+
+ if ($neg_doms) {
+ my $last_end = 0;
+ for my $feat ( @feats2 ) {
+ if ($feat->[0] - $last_end > $min_nodom) {
+ push @n_feats2, [$last_end+1, "-", $feat->[0]-1, "NODOM"];
+ }
+ $last_end = $feat->[2];
+ }
+ if ($seq_len - $last_end > $min_nodom) {
+ push @n_feats2, [$last_end+1, "-", $seq_len, "NODOM"];
+ }
+ }
+ # each domain (and NODOM region) becomes one [start, '-', end, name] row
+ my @feats = ();
+ for my $feat (@feats2, @n_feats2) {
+ push @feats, [$feat->[0], '-', $feat->[2], $feat->[-1] ];
+# push @feats, [$feat->[2], ']', '-', ""];
+ }
+ # numeric sort on position, site rows and domain rows together
+ @feats = sort { $a->[0] <=> $b->[0] } (@sites, @feats);
+
+ return \@feats;
+}
+
+# get_lav_annots($annot_types, $get_annots_sql, $seq_len)
+#
+# Collects DOMAIN/REPEAT rows from the statement handle as
+# [start, end, name] triples for lav2plt.pl.  $annot_types and
+# $seq_len are accepted but not used.
+#
+sub get_lav_annots {
+  my ($annot_types, $get_annots_sql, $seq_len) = @_;
+
+  my @regions = ();
+  my ($start, $stop, $type, $descr);
+
+  # NOTE(review): $acc is not declared in this sub; presumably it is the
+  # file-scope variable -- confirm against the top of the script
+  while (($acc, $start, $stop, $type, $descr) = $get_annots_sql->fetchrow_array()) {
+    if ($type =~ m/^DOMAIN/ || $type =~ m/^REPEAT/) {
+      # drop a trailing "{evidence}" block before naming the domain
+      $descr =~ s/\s?\{.+\}\.?$//;
+      push @regions, [$start, $stop, domain_name($type,$descr)];
+    }
+  }
+
+  return \@regions;
+}
+
+# domain name takes a uniprot domain label, removes comments ( ;
+# truncated) and numbers and returns a canonical form. Thus:
+# Cortactin 6.
+# Cortactin 7; truncated.
+# becomes "Cortactin"
+#
+
+sub domain_name {
+
+  my ($label, $value) = @_;
+
+  # only DOMAIN/REPEAT descriptions are canonicalized and registered
+  if ($label =~ /DOMAIN|REPEAT/) {
+    $value =~ s/;.*$//;            # drop "; truncated" style qualifiers
+    $value =~ s/\s+\d+\.?$//;      # drop a trailing repeat number
+    $value =~ s/\.\s*$//;          # drop a trailing period
+    $value =~ s/\s+\d+\.\s+.*$//;  # drop "<number>. <comment>" tails
+    # replace every run of white space, not just the first, so that
+    # multi-word names contain no blanks
+    $value =~ s/\s+/_/g;
+    # give each distinct name the next number in %domains
+    if (!defined($domains{$value})) {
+      $domain_cnt++;
+      $domains{$value} = $domain_cnt;
+    }
+    return $value;
+  }
+  else {
+    return $value;
+  }
+}
+
+__END__
+
+=pod
+
+=head1 NAME
+
+ann_feats_up_sql.pl
+
+=head1 SYNOPSIS
+
+ ann_feats_up_sql.pl --no_doms --no_feats --lav 'sp|P09488|GSTM1_HUMAN' | accession.file
+
+=head1 OPTIONS
+
+ -h short help
+ --help include description
+ --no-doms do not show domain boundaries (domains are always shown with --lav)
+ --no-feats do not show features (variants, active sites, phospho-sites)
+ --no-var do not show variant sites (--no_var, --novar)
+ --lav produce lav2plt.pl annotation format, only show domains/repeats
+ --neg-doms, -- report domains between annotated domains as NODOM
+ (also --neg, --neg_doms)
+ --min_nodom=10 minimum non-domain length to produce NODOM
+ --host, --user, --password, --port --db -- info for mysql database
+
+=head1 DESCRIPTION
+
+C<ann_feats_up_sql.pl> extracts feature, domain, and repeat information from
+a mysql database (default name, uniprot) built by parsing the
+uniprot_sprot.dat and uniprot_trembl.dat feature tables. Given a
+command line argument that contains a sequence accession (P09488) or
+identifier (GSTM1_HUMAN), the program looks up the features available
+for that sequence and returns them in a tab-delimited format:
+
+ >sp|P09488|GSTM1_HUMAN
+ 2 - 88 GST_N-terminal~1
+ 7 V F Mutagen: Reduces catalytic activity 100- fold. {ECO:0000269|PubMed:16548513}.
+ 34 * - MOD_RES: Phosphothreonine. {ECO:0000250|UniProtKB:P10649}.
+ 90 - 208 GST_C-terminal~2
+ 108 V S Mutagen: Changes the properties of the enzyme toward some substrates. {ECO:0000269|PubMed:16548513, ECO:0000269|PubMed:9930979}.
+ 108 V Q Mutagen: Reduces catalytic activity by half. {ECO:0000269|PubMed:16548513, ECO:0000269|PubMed:9930979}.
+ 109 V I Mutagen: Reduces catalytic activity by half. {ECO:0000269|PubMed:16548513}.
+ 116 # - BINDING: Substrate.
+ 116 V A Mutagen: Reduces catalytic activity 10-fold. {ECO:0000269|PubMed:16548513}.
+ 116 V F Mutagen: Slight increase of catalytic activity. {ECO:0000269|PubMed:16548513}.
+ 173 V N in allele GSTM1B; dbSNP:rs1065411. {ECO:0000269|Ref.3, ECO:0000269|Ref.5}.
+ 210 * - MOD_RES: Phosphoserine. {ECO:0000250|UniProtKB:P04905}.
+ 210 V T in dbSNP:rs449856.
+
+If features are provided, then a legend of feature symbols is provided
+as well:
+
+ ==:Active site
+ =*:Modified
+ =#:Substrate binding
+ =^:Site
+ =!:Metal binding
+
+If the C<--lav> option is specified, domain and repeat features are
+presented in a different format for the C<lav2plt.pl> program:
+
+ >sp|P09488|GSTM1_HUMAN
+ 2 88 GST N-terminal.
+ 90 208 GST C-terminal.
+
+C<ann_feats_up_sql.pl> is designed to be used by the B<FASTA> programs
+with the C<-V \!ann_feats_up_sql.pl> option, or by the
+C<annot_blast_btop.pl> script. It can also be used with the
+lav2plt.pl program with the C<--xA "\!ann_feats_up_sql.pl --lav"> or
+C<--yA "\!ann_feats_up_sql.pl --lav"> options.
+
+=head1 AUTHOR
+
+William R. Pearson, wrp at virginia.edu
+
+=cut
diff --git a/scripts/ann_feats_up_www2.pl b/scripts/ann_feats_up_www2.pl
new file mode 100755
index 0000000..becce41
--- /dev/null
+++ b/scripts/ann_feats_up_www2.pl
@@ -0,0 +1,455 @@
+#!/usr/bin/perl -w
+
+################################################################
+# copyright (c) 2014,2015 by William R. Pearson and The Rector &
+# Visitors of the University of Virginia */
+################################################################
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under this License is distributed on an "AS
+# IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+################################################################
+
+## modified 29-Sept-2016 to use EBI/proteins JSON URL:
+## http://www.ebi.ac.uk/proteins/api/features/p12345
+
+# ann_feats_up_www2.pl gets an annotation file from fasta36 -V with a line of the form:
+
+# SP:GSTM1_HUMAN P09488 218
+#
+# it must:
+# (1) read in the line
+# (2) parse it to get the up_acc
+# (3) return the tab delimited features
+#
+
+# this version can read feature2 uniprot features (acc/pos/end/label/value), but returns sorted start/end domains
+
+use strict;
+
+use Getopt::Long;
+use Pod::Usage;
+use LWP::Simple;
+use JSON qw(decode_json);
+
+## use IO::String;
+
+# base URL of the EBI Proteins API feature service; "/<acc>.json" is appended
+my $up_base = 'http://www.ebi.ac.uk/proteins/api/features';
+
+# canonical domain name -> number, assigned in order of first appearance
+my %domains = ();
+my $domain_cnt = 0;
+
+# command line flags (all off by default) and the optional --data file
+my ($sstr, $lav, $neg_doms, $no_doms, $no_feats, $no_vars, $no_over, $data_file, $shelp, $help) = (0,0,0,0,0,0,0,0,0,0);
+# gaps must be longer than this to be reported as NODOM
+my ($min_nodom) = (10);
+
+# append the domain number to domain names unless --nocolor is given
+my ($show_color) = (1);
+# separator between a domain name and its number
+my $color_sep_str = '~';
+
+# Command line options.  Several options are accepted under more than
+# one spelling (underscore, hyphen, or run together); all spellings of
+# an option set the same variable.
+GetOptions(
+ # lav2plt.pl output format (start, end, name); also turns off site features
+ "lav" => \$lav,
+ # remove contained regions and split overlapping ones
+ "no-over" => \$no_over,
+ # do not request DOMAIN/REPEAT regions
+ "no_doms" => \$no_doms,
+ "no-doms" => \$no_doms,
+ "nodoms" => \$no_doms,
+ # do not add MUTAGEN/VARIANT to the site feature list
+ "no_vars" => \$no_vars,
+ "no-vars" => \$no_vars,
+ "novars" => \$no_vars,
+ # report gaps between regions as NODOM
+ "neg" => \$neg_doms,
+ "neg_doms" => \$neg_doms,
+ "neg-doms" => \$neg_doms,
+ "negdoms" => \$neg_doms,
+ # gaps must be longer than this to become NODOM
+ "min_nodom=i" => \$min_nodom,
+ # do not request site features
+ "no_feats" => \$no_feats,
+ "no-feats" => \$no_feats,
+ "nofeats" => \$no_feats,
+ # read the annotation data from this file instead of the web service
+ "data:s" => \$data_file,
+ # negatable (--nocolor): append the domain number to domain names
+ "color!" => \$show_color,
+ # request STRAND/HELIX regions instead of features and domains
+ "sstr" => \$sstr,
+ "h|?" => \$shelp,
+ "help" => \$help,
+ );
+
+pod2usage(1) if $shelp;
+pod2usage(exitstatus => 0, verbose => 2) if $help;
+# without an argument, a --data file, or piped/redirected STDIN there is nothing to annotate
+pod2usage(1) unless @ARGV || $data_file || -p STDIN || -f STDIN;
+
+# Site feature types, the one-character symbols printed for them, and
+# their legend text.  The three lists are parallel.
+my @feat_keys = qw( ACT_SITE MOD_RES BINDING METAL SITE );
+my @feat_vals = ( '=','*','#','^','@');
+my @feat_names = ('Active site', 'Modified', 'Binding', 'Metal binding', 'Site');
+
+# variants and mutagenesis sites share the 'V' symbol and have no legend text
+unless ($no_vars) {
+  push @feat_keys, qw(MUTAGEN VARIANT);
+  push @feat_vals, ('V','V',);
+  push @feat_names, ("","",);
+}
+
+# feature type -> text placed in front of a site description
+# (hash-slice assignment)
+my %feats_text = ();
+@feats_text{@feat_keys} = @feat_names;
+
+# feature type -> legend label; types with an empty label are left out
+# of the legend
+my %feats_label;
+@feats_label{@feat_keys} = @feat_names;
+
+# region feature types
+my @dom_keys = qw( DOMAIN REPEAT );
+my @dom_vals = ( [ '[', ']'],[ '[', ']']);
+
+# secondary structure feature types (used with --sstr)
+my @ssr_keys = qw(STRAND HELIX);
+my @ssr_vals = ( [ '[', ']'], [ '[', ']']);
+
+# feature type -> output symbol for every type that should be reported
+my %annot_types = ();
+my $get_annot_sub = \&json_annots;
+
+# --lav output never contains site features
+$no_feats = 1 if ($lav);
+
+if (!$sstr) {
+  unless ($no_feats) {
+    @annot_types{@feat_keys} = @feat_vals;
+  }
+  unless ($no_doms) {
+    @annot_types{@dom_keys} = @dom_vals;
+  }
+} else {
+  # --sstr: secondary structure only
+  @annot_types{@ssr_keys} = @ssr_vals;
+}
+
+$domains{'NODOM'}=0 if ($neg_doms);
+
+my ($tmp, $gi, $sdb, $acc, $id, $use_acc);
+
+# legend: one "=<symbol>:<label>" line for each site type that has a label
+if (!($no_feats || $sstr)) {
+  for my $key_ix ( 0 .. $#feat_keys) {
+    my $legend = $feats_label{$feat_keys[$key_ix]};
+    next unless $legend;
+    print "=",$feat_vals[$key_ix],":",$legend,"\n";
+  }
+}
+
+# the query identifier and optional sequence length from the command line
+my ($query, $seq_len) = @ARGV;
+$seq_len = 0 unless defined($seq_len);
+
+$query =~ s/^>// if ($query);
+
+my @annots = ();
+
+#if it's a file I can open, read and parse it
+
+# Three sources of annotation requests:
+#  (1) --data FILE: FILE holds a saved JSON reply
+#  (2) the first argument looks like a sequence identifier: annotate it
+#  (3) otherwise the arguments are files (or STDIN) of identifier lines
+unless ($data_file) {
+  # an identifier contains '|' or ':', or starts with a UniProt
+  # accession or a RefSeq NP_/XP_/NM_/XM_ accession
+  my $query_is_id = ($query
+    && ($query =~ m/[\|:]/
+        || $query =~ m/^(?:[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})/
+        || $query =~ m/^[XN][MP]_\d+/));
+
+  if ($query_is_id) {
+    push @annots, lwp_annots("$query\t$seq_len", $get_annot_sub);
+  } else {
+    while (my $a_line = <>) {
+      $a_line =~ s/^>//;
+      chomp $a_line;
+      push @annots, lwp_annots($a_line, $get_annot_sub);
+    }
+  }
+} else { # just read the data from a file, give to $get_annot_sub().
+  my %annot_data = (seq_info => ">$data_file DATA");
+
+  open(my $data_fh, '<', $data_file) || die "Cannot read $data_file";
+
+  my $lwp_data = "";
+  while (<$data_fh>) {
+    $lwp_data .= $_;
+  }
+  close($data_fh);
+
+  # json_annots() works on the decoded JSON structure, not on the text
+  my $annot_json = eval { decode_json($lwp_data) };
+  die "Cannot parse JSON in $data_file: $@" unless ($annot_json);
+
+  $annot_data{list} = $get_annot_sub->(\%annot_types, $annot_json,0);
+
+  push @annots, \%annot_data;
+}
+
+
+# Print a ">header" line for each sequence followed by its tab-delimited
+# annotation records.  Unless --lav or --nocolor was given, the number
+# registered in %domains is appended to the last field of each record.
+my $add_color = (!$lav && $show_color);
+
+foreach my $entry (@annots) {
+  print ">",$entry->{seq_info},"\n";
+  foreach my $row (@{$entry->{list}}) {
+    my $name = $row->[-1];
+    if ($add_color && defined($domains{$name})) {
+      $row->[-1] = $name . $color_sep_str.$domains{$name};
+    }
+    print join("\t",@$row),"\n";
+  }
+}
+
+exit(0);
+
+# lwp_annots($query_len, $get_annot_sub)
+#
+# $query_len is "<identifier line>\t<sequence length>".  The accession is
+# extracted from the identifier line, the features for it are fetched
+# from "$up_base/<acc>.json", and the decoded reply is passed to
+# $get_annot_sub.  Returns a hash ref with the keys
+#   seq_info -- the identifier line
+#   list     -- array ref of annotation records (empty when nothing
+#               could be fetched or parsed)
+#
+sub lwp_annots {
+  my ($query_len, $get_annot_sub) = @_;
+
+  my ($annot_line, $seq_len) = split(/\t/,$query_len);
+  $annot_line = '' unless defined($annot_line);
+
+  my %annot_data = (seq_info=>$annot_line);
+
+  # the identifier parts belong to this call only; a line without an
+  # accession must not inherit the accession of the previous line
+  my ($tmp, $gi, $sdb, $acc, $id);
+
+  if ($annot_line =~ m/^gi\|/) {
+    ($tmp, $gi, $sdb, $acc, $id) = split(/\|/,$annot_line);
+  } elsif ($annot_line =~ m/^(SP|TR):(\w+)\s(\w+)/) {
+    ($sdb, $id, $acc) = (lc($1), $2, $3);
+  } elsif ($annot_line =~ m/^(SP|TR):(\w+)/) {
+    ($sdb, $id, $acc) = (lc($1), $2, "");
+    warn("*** $0 accession required: $annot_line\n");
+  } elsif ($annot_line =~ m/^(UR\d{3}:UniRef\d{2})_(\w+)/) {
+    $sdb = lc($1);
+    $id = $2;
+#   $acc = $2;
+  } elsif ($annot_line =~ m/\|/) {
+    ($sdb, $acc, $id) = split(/\|/,$annot_line);
+  }
+  else {
+    ($acc) = ($annot_line =~ m/^(\S+)/);
+  }
+
+  # remove a ".N" version suffix
+  $acc =~ s/\.\d+// if ($acc);
+
+  $annot_data{list} = [];
+  my $lwp_features = "";
+
+  # only strings that look like a UniProt accession are looked up
+  if ($acc && ($acc =~ m/^[A-Z][0-9][A-Z0-9]{3}[0-9]/)) {
+    $lwp_features = get("$up_base/$acc.json");
+  }
+
+  if ($lwp_features && ($lwp_features !~ /ERROR/)) {
+    # a reply that is not valid JSON leaves this sequence without
+    # annotations instead of ending the whole run
+    my $annot_json = eval { decode_json($lwp_features) };
+    if ($annot_json) {
+      $annot_data{list} = $get_annot_sub->(\%annot_types, $annot_json, $seq_len);
+    } else {
+      warn("*** $0 cannot parse JSON for $acc: $@\n");
+    }
+  }
+
+  return \%annot_data;
+}
+
+####
+# parses www.ebi.ac.uk/uniprot/api json
+#
+# json_annots($annot_types, $json_ref, $seq_len)
+#
+# $json_ref is the decoded JSON record (hash ref) with the keys
+# "sequence" and "features"; each feature is a hash with the keys type,
+# begin, end, description and (for variants) alternativeSequence.
+# Features whose type has a true entry in %$annot_types become
+#   regions -- [start, "-", end, name]   ([start, end, name] with --lav)
+#   sites   -- [pos, symbol, residue or "-", text]
+# The $seq_len argument is replaced by the length of the "sequence"
+# string.  Returns a reference to the records sorted by start.
+#
+sub json_annots {
+  my ($annot_types, $json_ref, $seq_len) = @_;
+
+  my @feats2 = (); # features with start/stop, for checking overlap, adding negative
+  my @sites = (); # sites with one position
+
+  # a record without a sequence has length 0
+  my $seq_str = $json_ref->{sequence};
+  $seq_len = defined($seq_str) ? length($seq_str) : 0;
+
+  for my $feat ( @{$json_ref->{features} || []} ) {
+    my $label = $feat->{type};
+    next unless (defined($label) && $annot_types->{$label});
+
+    my ($pos, $end, $value) = @{$feat}{qw(begin end description)};
+    # a feature without coordinates cannot be placed on the sequence
+    next unless (defined($pos) && defined($end));
+
+    # remove the '<' / '>' markers from the coordinates
+    $pos =~ s/[<>]//g;
+    $end =~ s/[<>]//g;
+
+    if ($label =~ m/DOMAIN/ || $label =~ m/REPEAT/) {
+      $value = domain_name($label,$value);
+      push @feats2, [$pos, "-", $end, $value];
+    } elsif ($label =~ m/HELIX/) {
+      push @feats2, [$pos, "-", $end, $label];
+    } elsif ($label =~ m/STRAND/) {
+      push @feats2, [$pos, "-", $end, $label];
+    } elsif ($label =~ m/VARIANT/ || $label =~ m/MUTAGEN/) {
+      push @sites, [$pos, $annot_types->{$label}, $feat->{alternativeSequence}, $value];
+    }
+    else {
+      # other feature types are reported only when they cover one residue
+      next unless ($pos == $end);
+      if ($feats_text{$label}) {
+        my $info = $feats_text{$label};
+        if ($value) {
+          $info .= ": $value";
+        }
+        push @sites, [$pos, $annot_types->{$label}, "-", $info];
+      } else {
+        push @sites, [$pos, $annot_types->{$label}, "-", $value];
+      }
+    }
+  }
+
+  @feats2 = sort { $a->[0] <=> $b->[0] } @feats2;
+
+  if ($no_over) {
+    # check for containment: a region that lies inside the most recent
+    # region that was kept is marked and then removed
+    my $have_contained = 0;
+    my $last_container = 0;
+    for (my $i=1; $i < scalar(@feats2); $i++) {
+      if ($feats2[$i]->[0] >= $feats2[$last_container]->[0] && $feats2[$i]->[2] <= $feats2[$last_container]->[2]) {
+        $feats2[$i]->[1] = 'Delete';
+        $have_contained = 1;
+      } else {
+        $last_container=$i;
+      }
+    }
+
+    if ($have_contained) {
+      @feats2 = grep { $_->[1] !~ /Delete/ } @feats2;
+    }
+
+    # ensure that domains do not overlap: adjacent regions that overlap
+    # are split at the middle of the overlap
+    for (my $i=1; $i < scalar(@feats2); $i++) {
+      my $diff = $feats2[$i-1]->[2] - $feats2[$i]->[0];
+      if ($diff >= 0) {
+        $feats2[$i-1]->[2] = $feats2[$i]->[0]+ int($diff/2);
+        $feats2[$i]->[0] = $feats2[$i-1]->[2] + 1;
+      }
+    }
+  }
+
+  my @n_feats2 = ();
+
+  # with --neg-doms, gaps longer than $min_nodom between regions (and
+  # after the last region) are reported as NODOM
+  if ($neg_doms) {
+    my $last_end = 0;
+    for my $feat ( @feats2 ) {
+      if ($feat->[0] - $last_end > $min_nodom) {
+        push @n_feats2, [$last_end+1, "-", $feat->[0]-1, "NODOM"];
+      }
+      $last_end = $feat->[2];
+    }
+    if ($seq_len - $last_end > $min_nodom) {
+      push @n_feats2, [$last_end+1, "-", $seq_len, "NODOM"];
+    }
+  }
+
+  my @feats = ();
+  for my $feat (@feats2, @n_feats2) {
+    if (!$lav) {
+      push @feats, [$feat->[0], '-', $feat->[2], $feat->[-1] ];
+    }
+    else {
+      push @feats, [$feat->[0], $feat->[2], $feat->[-1]];
+    }
+  }
+
+  @feats = sort { $a->[0] <=> $b->[0] } (@sites, @feats);
+
+  return \@feats;
+}
+
+# domain name takes a uniprot domain label, removes comments ( ;
+# truncated) and numbers and returns a canonical form. Thus:
+# Cortactin 6.
+# Cortactin 7; truncated.
+# becomes "Cortactin"
+#
+
+sub domain_name {
+
+  my ($label, $value) = @_;
+
+  $value = 'UnDef' unless $value;
+
+  my $is_domain = ($label =~ /Domain|Repeat/i);
+
+  if ($is_domain) {
+    $value =~ s/;.*$//;      # drop "; truncated" style qualifiers
+    $value =~ s/\.\s*$//;    # drop a trailing period
+    # drop a trailing repeat number; this must run before the blanks are
+    # replaced, otherwise "Cortactin 6" has no white space left to match
+    $value =~ s/\s+\d+$//;
+  }
+
+  # names are printed without blanks
+  $value =~ s/ /_/g;
+
+  # give each distinct domain/repeat name the next number in %domains
+  if ($is_domain && !defined($domains{$value})) {
+    $domain_cnt++;
+    $domains{$value} = $domain_cnt;
+  }
+
+  return $value;
+}
+
+
+
+__END__
+
+=pod
+
+=head1 NAME
+
+ann_feats_up_www2.pl
+
+=head1 SYNOPSIS
+
+ ann_feats_up_www2.pl --no_doms --no_feats --lav 'sp|P09488|GSTM1_HUMAN' | accession.file
+
+=head1 OPTIONS
+
+ -h short help
+ --help include description
+ --no-doms do not show domain boundaries (domains are always shown with --lav)
+ --no-feats do not show feature (variants, active sites, phospho-sites)
+ --lav produce lav2plt.pl annotation format, only show domains/repeats
+
+ --neg-doms, -- report domains between annotated domains as NODOM
+ (also --neg, --neg_doms)
+ --min_nodom=10 -- minimum length between domains for NODOM
+
+ --host, --user, --password, --port --db -- info for mysql database
+
+=head1 DESCRIPTION
+
+C<ann_feats_up_www2.pl> extracts feature, domain, and repeat
+information from the EBI Proteins API
+(http://www.ebi.ac.uk/proteins/api/features), which returns the
+features of a UniProt entry in JSON format, with most of
+the information provided in UniProt feature tables.
+
+C<ann_feats_up_www2.pl> is an alternative to C<ann_pfam.pl>
+that does not require a local MySQL copy of Pfam.
+
+Given a command line argument that contains a sequence accession
+(P09488), the program looks up the features available for that
+sequence and returns them in a tab-delimited format:
+
+>sp|P09488|GSTM1_HUMAN
+2 [ - GST N-terminal :1
+7 V F Mutagen: Reduces catalytic activity 100- fold.
+23 * - MOD_RES: Phosphotyrosine (By similarity).
+33 * - MOD_RES: Phosphotyrosine (By similarity).
+34 * - MOD_RES: Phosphothreonine (By similarity).
+88 ] -
+90 [ - GST C-terminal :2
+108 V Q Mutagen: Reduces catalytic activity by half.
+108 V S Mutagen: Changes the properties of the enzyme toward some substrates.
+109 V I Mutagen: Reduces catalytic activity by half.
+116 # - BINDING: Substrate.
+116 V A Mutagen: Reduces catalytic activity 10-fold.
+116 V F Mutagen: Slight increase of catalytic activity.
+173 V N in allele GSTM1B; dbSNP:rs1065411.
+208 ] -
+210 V T in dbSNP:rs449856.
+
+If features are provided, then a legend of feature symbols is provided
+as well:
+
+ =*:phosphorylation
+ ==:active site
+ =@:site
+ =^:binding
+ =!:metal binding
+
+If the C<--lav> option is specified, domain and repeat features are
+presented in a different format for the C<lav2plt.pl> program:
+
+ >sp|P09488|GSTM1_HUMAN
+ 2 88 GST N-terminal.
+ 90 208 GST C-terminal.
+
+C<ann_feats_up_www2.pl> is designed to be used by the B<FASTA> programs with
+the C<-V \!ann_feats_up_www2.pl> option. It can also be used with the lav2plt.pl
+program with the C<--xA "\!ann_feats_up_www2.pl --lav"> or C<--yA "\!ann_feats_up_www2.pl --lav"> options.
+
+=head1 AUTHOR
+
+William R. Pearson, wrp at virginia.edu
+
+=cut
diff --git a/scripts/ann_ipr_www.pl b/scripts/ann_ipr_www.pl
new file mode 100755
index 0000000..77f1a4e
--- /dev/null
+++ b/scripts/ann_ipr_www.pl
@@ -0,0 +1,467 @@
+#!/usr/bin/perl -w
+
+################################################################
+# copyright (c) 2014 by William R. Pearson and The Rector &
+# Visitors of the University of Virginia */
+################################################################
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under this License is distributed on an "AS
+# IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+################################################################
+
+# ann_ipr_www.pl gets an annotation file from fasta36 -V with a line of the form:
+
+
+# gi|62822551|sp|P00502|GSTA1_RAT Glutathione S-transfer\n (at least from pir1.lseg)
+
+# this version only annotates sequences known to InterPro
+# and only provides domain information
+
+# This script uses the dbfetch iprmc database, which REQUIRES a
+# Uniprot Acc (not ID). If an Acc is not provided, we must get an ACC
+# first from the ID.
+
+# SP:GSTM1_HUMAN P09488 218
+#
+# it must:
+# (1) read in the line
+# (2) parse it to get the up_acc
+# (3) return the tab delimited domains
+#
+
+use strict;
+
+use Getopt::Long;
+use Pod::Usage;
+use LWP::Simple;
+## use IO::String;
+
+# use dbfetch and IPRMC to get Interpro domain coordinates
+# http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=iprmc&id=gstm1_human&format=gff2&style=default&Retrieve=Retrieve
+
+# a request URL is built as $ipr_base . <accession or id> . $gff_post
+my $ipr_base = 'http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=iprmc&id=';
+my $gff_post = '&format=gff2&style=default&Retrieve=Retrieve';
+
+################################################################
+#
+##gff-version 2
+##Type Protein
+# InterPro Matches for UniProtKB entries
+##source-version InterProMatches 49.0
+##date 20-NOV-14
+
+##sequence-region P09488 1 218
+# P09488 InterProScan region 99 190 1.1999999998684077E-49 . . Signature GENE3D G3DSA:1.20.1050.10 "G3DSA:1.20.1050.10" T ; InterPro IPR010987 "Glutathione S-transferase, C-terminal-like"
+# P09488 InterProScan region 2 98 5.800000000494973E-51 . . Signature GENE3D G3DSA:3.40.30.10 "G3DSA:3.40.30.10" T ; InterPro IPR012336 "Thioredoxin-like fold"
+# P09488 InterProScan region 105 189 3.900000000000007E-16 . . Signature PFAM PF00043 "GST_C" T ; InterPro IPR004046 "Glutathione S-transferase, C-terminal"
+# P09488 InterProScan region 4 82 7.299999999999985E-21 . . Signature PFAM PF02798 "GST_N" T ; InterPro IPR004045 "Glutathione S-transferase, N-terminal"
+# P09488 InterProScan region 31 43 1.1000015067164208E-25 . . Signature PRINTS PR01267 "GSTRNSFRASEM" T ; InterPro IPR003081 "Glutathione S-transferase, Mu class"
+# P09488 InterProScan region 44 56 1.1000015067164208E-25 . . Signature PRINTS PR01267 "GSTRNSFRASEM" T ; InterPro IPR003081 "Glutathione S-transferase, Mu class"
+# P09488 InterProScan region 87 98 1.1000015067164208E-25 . . Signature PRINTS PR01267 "GSTRNSFRASEM" T ; InterPro IPR003081 "Glutathione S-transferase, Mu class"
+# P09488 InterProScan region 139 152 1.1000015067164208E-25 . . Signature PRINTS PR01267 "GSTRNSFRASEM" T ; InterPro IPR003081 "Glutathione S-transferase, Mu class"
+# P09488 InterProScan region 1 88 0.0 . . Signature PROFILE PS50404 "GST_NTER" T ; InterPro IPR004045 "Glutathione S-transferase, N-terminal"
+# P09488 InterProScan region 90 208 0.0 . . Signature PROFILE PS50405 "GST_CTER" T ; InterPro IPR010987 "Glutathione S-transferase, C-terminal-like"
+# P09488 InterProScan region 1 217 0.0 . . Signature PANTHER PTHR11571 "PTHR11571" T
+# P09488 InterProScan region 1 217 0.0 . . Signature PANTHER PTHR11571:SF117 "PTHR11571:SF117" T
+# P09488 InterProScan region 86 217 8.190000000746436E-47 . . Signature SSF SSF47616 "SSF47616" T ; InterPro IPR010987 "Glutathione S-transferase, C-terminal-like"
+# P09488 InterProScan region 3 85 3.339999999911062E-23 . . Signature SSF SSF52833 "SSF52833" T ; InterPro IPR012336 "Thioredoxin-like fold"
+###
+
+# InterPro accession -> number, assigned in order of first appearance
+my %domains = ();
+my $domain_cnt = 0;
+
+# command line flags; $no_feats is on by default, all others are off
+my ($sstr, $lav, $neg_doms, $no_doms, $no_feats, $no_over, $data_file, $shelp, $help) = (0,0,0,0,1,0,0,0,0);
+# '+'-separated list of signature databases
+my $dom_dbs = "PFAM+PROFILE+GENE3D";
+
+# gaps must be longer than this to be reported as NODOM
+my ($min_nodom) = (10);
+
+# separator between a domain name and its number
+my $color_sep_str = '~';
+
+# Command line options.  Several options are accepted under more than
+# one spelling (underscore, hyphen, or run together); all spellings of
+# an option set the same variable.
+GetOptions(
+ # lav2plt.pl output format; also turns off site features
+ "lav" => \$lav,
+ # remove contained domains and split overlapping ones
+ "no-over" => \$no_over,
+ # NOTE(review): $no_doms is set here but not read in the visible code
+ "no_doms" => \$no_doms,
+ "no-doms" => \$no_doms,
+ "nodoms" => \$no_doms,
+ # '+'-separated list of signature databases
+ "dom_dbs:s" => \$dom_dbs, # PF, PS,
+ "dbs:s" => \$dom_dbs,
+ # report gaps between domains as NODOM
+ "neg" => \$neg_doms,
+ "neg_doms" => \$neg_doms,
+ "neg-doms" => \$neg_doms,
+ "negdoms" => \$neg_doms,
+ # gaps must be longer than this to become NODOM
+ "min_nodom=i" => \$min_nodom,
+ # $no_feats already defaults to 1, so these only restate the default
+ "no_feats" => \$no_feats,
+ "no-feats" => \$no_feats,
+ "nofeats" => \$no_feats,
+ # read the GFF data from this file instead of the web service
+ "data:s" => \$data_file,
+ # suppresses the feature legend
+ "sstr" => \$sstr,
+ "h|?" => \$shelp,
+ "help" => \$help,
+ );
+
+pod2usage(1) if $shelp;
+pod2usage(exitstatus => 0, verbose => 2) if $help;
+# without an argument, a --data file, or piped/redirected STDIN there is nothing to annotate
+pod2usage(1) unless (-p STDIN || -f STDIN || @ARGV || $data_file);
+
+# Site feature types; the text, label and symbol lists below are in the
+# same order as @feat_keys.
+my @feat_keys = qw(catalytic_residue posttranslation_modification binding_motif metal_contact
+                   polypeptide_region mutated_variant_site natural_variant_site);
+
+# feature type -> descriptive text (hash-slice assignment)
+my %feats_text = ();
+@feats_text{@feat_keys} = ('Active site', '', 'Substrate binding', 'Metal binding', 'Site', '','');
+
+# feature type -> legend label; types with an empty label are left out
+# of the legend
+my %feats_label;
+@feats_label{@feat_keys} = ('Active site', 'Modified', 'Substrate binding', 'Metal binding', 'Site', '','');
+
+# one-character symbol for each feature type
+my @feat_vals = ( '=','*','#','^','@','V','V');
+
+# signature database name -> itself, for every database in $dom_dbs
+my %annot_types = ();
+my $get_annot_sub = \&iprmc_annots;
+
+# --lav output never contains site features
+$no_feats = 1 if ($lav);
+
+if ($dom_dbs) {
+  foreach my $db_name (split(/\+/,$dom_dbs)) {
+    $annot_types{$db_name} = $db_name;
+  }
+}
+
+$domains{'NODOM'}=0 if ($neg_doms);
+
+my ($tmp, $gi, $sdb, $acc, $id, $use_acc);
+
+# legend: one "=<symbol>:<label>" line for each site type that has a label
+if (!($no_feats || $sstr)) {
+  for my $key_ix ( 0 .. $#feat_keys) {
+    my $legend = $feats_label{$feat_keys[$key_ix]};
+    next unless $legend;
+    print "=",$feat_vals[$key_ix],":",$legend,"\n";
+  }
+}
+
+# the query identifier and optional sequence length from the command line
+my ($query, $seq_len) = @ARGV;
+$seq_len = 0 unless defined($seq_len);
+
+$query =~ s/^>// if ($query);
+
+my @annots = ();
+
+#if it's a file I can open, read and parse it
+
+# Three sources of annotation requests:
+#  (1) --data FILE: FILE holds a saved GFF reply
+#  (2) the first argument looks like a sequence identifier: annotate it
+#  (3) otherwise the arguments are files (or STDIN) of identifier lines
+unless ($data_file) {
+  # an identifier contains '|' or ':', or starts with a UniProt
+  # accession or a RefSeq NP_/XP_/NM_/XM_ accession
+  my $query_is_id = ($query
+    && ($query =~ m/[\|:]/
+        || $query =~ m/^(?:[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})/
+        || $query =~ m/^[XN][MP]_\d+/));
+
+  if ($query_is_id) {
+    push @annots, lwp_annots("$query\t$seq_len", $get_annot_sub);
+  } else {
+    while (my $a_line = <>) {
+      $a_line =~ s/^>//;
+      chomp $a_line;
+      push @annots, lwp_annots($a_line, $get_annot_sub);
+    }
+  }
+} else { # just read the data from a file, give to $get_annot_sub().
+  my %annot_data = (seq_info => ">$data_file DATA");
+
+  open(my $data_fh, '<', $data_file) || die "Cannot read $data_file";
+
+  my $lwp_data = "";
+  while (<$data_fh>) {
+    $lwp_data .= $_;
+  }
+  close($data_fh);
+
+  $annot_data{list} = $get_annot_sub->(\%annot_types, $lwp_data,0);
+
+  push @annots, \%annot_data;
+}
+
+
+# Print a ">header" line for each sequence followed by the first four
+# tab-delimited fields of each annotation record.  Unless --lav was
+# given, the number registered in %domains for the last field is
+# appended to the next-to-last field.
+foreach my $entry (@annots) {
+  print ">",$entry->{seq_info},"\n";
+  foreach my $row (@{$entry->{list}}) {
+    unless ($lav || !defined($domains{$row->[-1]})) {
+      $row->[-2] .= $color_sep_str.$domains{$row->[-1]};
+    }
+    my @shown = @{$row}[0..3];
+    print join("\t",@shown),"\n";
+  }
+}
+
+exit(0);
+
+# lwp_annots($query_len, $get_annot_sub)
+#
+# $query_len is "<identifier line>\t<sequence length>".  The accession
+# (or, failing that, the identifier) is extracted from the identifier
+# line, the GFF reply for it is fetched from
+# $ipr_base . <acc or id> . $gff_post, and the text is passed to
+# $get_annot_sub.  Returns a hash ref with the keys
+#   seq_info -- the identifier line
+#   list     -- array ref of annotation records (empty when nothing
+#               could be fetched)
+#
+sub lwp_annots {
+  my ($query_len, $get_annot_sub) = @_;
+
+  my ($annot_line, $seq_len) = split(/\t/,$query_len);
+  $annot_line = '' unless defined($annot_line);
+
+  my %annot_data = (seq_info=>$annot_line);
+
+  # the identifier parts belong to this call only; a line without an
+  # accession or id must not inherit those of the previous line
+  my ($tmp, $gi, $sdb, $acc, $id);
+
+  if ($annot_line =~ m/^gi\|/) {
+    ($tmp, $gi, $sdb, $acc, $id) = split(/\|/,$annot_line);
+  } elsif ($annot_line =~ m/^(SP|TR):(\w+) (\w+)/) {
+    ($sdb, $id, $acc) = (lc($1), $2, $3);
+  } elsif ($annot_line =~ m/^(SP|TR):(\w+)/) {
+    ($sdb, $id, $acc) = (lc($1), $2, "");
+  } elsif ($annot_line =~ m/^(UR\d{3}:UniRef\d{2})_(\w+)/) {
+    $sdb = lc($1);
+    $id = $2;
+#   $acc = $2;
+  } elsif ($annot_line =~ m/\|/) {
+    ($sdb, $acc, $id) = split(/\|/,$annot_line);
+  } else {
+    ($acc) = ($annot_line =~ m/^(\S+)/);
+  }
+
+  # remove a ".N" version suffix
+  $acc =~ s/\.\d+// if ($acc);
+
+  $annot_data{list} = [];
+  my $lwp_domains = "";
+
+  # prefer the accession; fall back to the identifier
+  if ($acc && ($acc =~ m/^[A-Z][0-9][A-Z0-9]{3}[0-9]/)) {
+    $lwp_domains = get($ipr_base . $acc . $gff_post);
+  } elsif ($id && ($id =~ m/^\w+$/)) {
+    $lwp_domains = get($ipr_base . $id . $gff_post);
+  }
+
+  if ($lwp_domains && ($lwp_domains !~ /ERROR/)) {
+    $annot_data{list} = $get_annot_sub->(\%annot_types, $lwp_domains, $seq_len);
+  }
+
+  return \%annot_data;
+}
+
+# parses www.uniprot.org gff feature table
+# iprmc_annots($annot_types, $annot_data, $seq_len)
+#
+# $annot_data is the text of a GFF2 reply.  The sequence length is
+# taken from the last field of the "##sequence-region" line (the
+# $seq_len argument is discarded); every tab-delimited data line for
+# which parse_ipr_comment() returns a name becomes a domain record
+#   [start, "-", end, name, interpro_acc]   ([start, end, ...] with --lav)
+# Returns a reference to the records sorted by start.
+#
+sub iprmc_annots {
+  my ($annot_types, $annot_data, $seq_len) = @_;
+
+  my ($acc, $pos, $end, $comment);
+
+  $seq_len = 0;
+
+  my @feats2 = (); # domains with start/stop, for checking overlap, adding negative
+  my @sites = (); # sites with one position
+
+  for my $gff_line (split(/\n/,$annot_data)) {
+    # blank lines separate the header from the data; they do not end it
+    next unless ($gff_line =~ m/\S/);
+
+    if ($gff_line =~ m/^##sequence-region/) {
+      # "##sequence-region <acc> <start> <end>"
+      my @fields = split(/\s+/, $gff_line);
+      $seq_len = $fields[-1] if ($fields[-1] =~ m/^\d+$/);
+      next;
+    }
+    next if ($gff_line =~ m/^#/);
+
+    my @gff_line_arr = split(/\t/,$gff_line);
+    # a data line needs at least the start and end columns
+    next if (scalar(@gff_line_arr) < 5);
+    ($acc, $pos, $end, $comment) = @gff_line_arr[(0,3,4,-1)];
+
+    # parse the comment to get signature (domain_db), domain_db_acc, interpro_acc, description
+    my ($domain_info, $dom_acc) = parse_ipr_comment($comment);
+
+    next unless $domain_info;
+
+    push @feats2, [$pos, "-", $end, $domain_info, $dom_acc];
+  }
+
+  @feats2 = sort { $a->[0] <=> $b->[0] } @feats2;
+
+  if ($no_over) {
+    # check for containment: a domain that lies inside the most recent
+    # domain that was kept is marked and then removed
+    my $have_contained = 0;
+    my $last_container = 0;
+    for (my $i=1; $i < scalar(@feats2); $i++) {
+      if ($feats2[$i]->[0] >= $feats2[$last_container]->[0] && $feats2[$i]->[2] <= $feats2[$last_container]->[2]) {
+        $feats2[$i]->[1] = 'Delete';
+        $have_contained = 1;
+      } else {
+        $last_container=$i;
+      }
+    }
+
+    if ($have_contained) {
+      @feats2 = grep { $_->[1] !~ /Delete/ } @feats2;
+    }
+
+    # ensure that domains do not overlap: adjacent domains that overlap
+    # are split at the middle of the overlap
+    for (my $i=1; $i < scalar(@feats2); $i++) {
+      my $diff = $feats2[$i-1]->[2] - $feats2[$i]->[0];
+      if ($diff >= 0) {
+        $feats2[$i-1]->[2] = $feats2[$i]->[0]+ int($diff/2);
+        $feats2[$i]->[0] = $feats2[$i-1]->[2] + 1;
+      }
+    }
+  }
+
+  my @n_feats2 = ();
+
+  # with --neg-doms, gaps longer than $min_nodom between domains (and
+  # after the last domain) are reported as NODOM
+  if ($neg_doms) {
+    my $last_end = 0;
+    for my $feat ( @feats2 ) {
+      if ($feat->[0] - $last_end > $min_nodom) {
+        push @n_feats2, [$last_end+1, "-", $feat->[0]-1, "NODOM", ""];
+      }
+      $last_end = $feat->[2];
+    }
+    if ($seq_len - $last_end > $min_nodom) {
+      push @n_feats2, [$last_end+1, "-", $seq_len, "NODOM", ""];
+    }
+  }
+
+  my @feats = ();
+
+  for my $feat (@feats2, @n_feats2) {
+    if (!$lav) {
+      push @feats, [$feat->[0], '-', $feat->[2], $feat->[-2], $feat->[-1] ];
+    }
+    else {
+      push @feats, [$feat->[0], $feat->[2], $feat->[-1]];
+    }
+  }
+
+  @feats = sort { $a->[0] <=> $b->[0] } (@sites, @feats);
+
+  # now that domains are sorted, give them names
+  for my $feat ( @feats ) {
+    $feat->[-2] = domain_name($feat->[-2],$feat->[-1]);
+  }
+
+  return \@feats;
+}
+
+# parse_ipr_comment($comment_str)
+#
+# $comment_str is the last GFF column, a " ; "-separated list of
+#   <db> <acc> "<description>"
+# entries, the first of which starts with "Signature".  Returns
+#   ("<signature acc>:<InterPro description>", "<InterPro acc>")
+# with white space in the description replaced by "_", or ("","") when
+# the string cannot be parsed, when any entry is from PRINTS or PROSITE,
+# or when there is no InterPro entry.
+#
+sub parse_ipr_comment {
+  my ($comment_str) = @_;
+
+  return ("","") unless (defined($comment_str) && length($comment_str));
+
+  my @comments = split(/\s+;\s+/,$comment_str);
+
+  # test for an empty list before touching $comments[0]; modifying a
+  # missing element would create it
+  return ("","") unless @comments;
+
+  $comments[0] =~ s/^Signature\s+//;
+
+  my @comment_info = ();
+
+  for my $comment (@comments) {
+    my %ipr_data = ();
+    @ipr_data{qw(db acc descr)} = ($comment =~ m/(\S+)\s+(\S+)\s+"([^"]+)"/);
+    unless (defined($ipr_data{db})) {
+      # without a readable signature entry there is no primary accession
+      return ("","") unless @comment_info;
+      next;
+    }
+    return ("","") if $ipr_data{db} =~ m/(PRINTS|PROSITE)/i;
+    $ipr_data{descr} =~ s/\s+/_/g;
+    push @comment_info, \%ipr_data;
+  }
+
+  # the first entry is the signature; its accession prefixes the name
+  my $primary_acc = $comment_info[0]->{acc};
+
+  for my $comment (@comment_info) {
+    if ($comment->{db} =~ m/InterPro/) {
+      return ("$primary_acc:".$comment->{descr},$comment->{acc});
+    }
+  }
+  return ("","");
+}
+
+# domain name takes a uniprot domain label, removes comments ( ;
+# truncated) and numbers and returns a canonical form. Thus:
+# Cortactin 6.
+# Cortactin 7; truncated.
+# becomes "Cortactin"
+#
+
+# domain_name($value, $ipr_acc) returns $value ('UnDef' when empty)
+# without a ";..." tail, trailing period, or trailing number, and
+# gives $ipr_acc the next number in %domains the first time it is seen.
+sub domain_name {
+
+  my ($value, $ipr_acc) = @_;
+
+  my $name = $value ? $value : 'UnDef';
+
+  for ($name) {
+    s/;.*$//;
+    s/\.\s*$//;
+    s/\s+\d+$//;
+  }
+
+  $domains{$ipr_acc} = ++$domain_cnt unless defined($domains{$ipr_acc});
+
+  return $name;
+}
+
+__END__
+
+=pod
+
+=head1 NAME
+
+ann_ipr_www.pl
+
+=head1 SYNOPSIS
+
+ ann_ipr_www.pl --no_doms --no_feats --lav 'sp|P09488|GSTM1_HUMAN' | accession.file
+
+=head1 OPTIONS
+
+ -h short help
+ --help include description
+ --no-doms do not show domain boundaries (domains are always shown with --lav)
+ --no-feats do not show feature (variants, active sites, phospho-sites)
+ --lav produce lav2plt.pl annotation format, only show domains/repeats
+
+ --neg-doms, -- report domains between annotated domains as NODOM
+ (also --neg, --neg_doms)
+ --min_nodom=10 -- minimum length between domains for NODOM
+
+ --host, --user, --password, --port --db -- info for mysql database
+
+=head1 DESCRIPTION
+
+C<ann_ipr_www.pl> extracts feature, domain, and repeat
+information from the Uniprot DAS server through an XSLT translation
+provided by http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/uniprotkb.
+This server provides GFF descriptions of Uniprot entries, with most of
+the information provided in UniProt feature tables.
+
+C<ann_ipr_www.pl> is an alternative to C<ann_pfam.pl>
+that does not require a local MySQL copy of Pfam.
+
+Given a command line argument that contains a sequence accession
+(P09488), the program looks up the domains available for that
+sequence and returns them in a tab-delimited format:
+
+>sp|P09488|GSTM1_HUMAN
+2 - 88 GST N-terminal :1
+90 - 208 GST C-terminal :2
+
+If the C<--lav> option is specified, domain and repeat domains are
+presented in a different format for the C<lav2plt.pl> program:
+
+ >sp|P09488|GSTM1_HUMAN
+ 2 88 GST N-terminal.
+ 90 208 GST C-terminal.
+
+C<ann_ipr_www.pl> is designed to be used by the B<FASTA> programs with
+the C<-V \!ann_ipr_www.pl> option. It can also be used with the lav2plt.pl
+program with the C<--xA "\!ann_ipr_www.pl --lav"> or C<--yA "\!ann_ipr_www.pl --lav"> options.
+
+=head1 AUTHOR
+
+William R. Pearson, wrp at virginia.edu
+
+=cut
diff --git a/scripts/ann_pdb_cath.pl b/scripts/ann_pdb_cath.pl
new file mode 100755
index 0000000..1eeeaf1
--- /dev/null
+++ b/scripts/ann_pdb_cath.pl
@@ -0,0 +1,345 @@
+#!/usr/bin/perl -w
+
+################################################################
+# copyright (c) 2014 by William R. Pearson and The Rector &
+# Visitors of the University of Virginia */
+################################################################
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under this License is distributed on an "AS
+# IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+################################################################
+
+# ann_pdb_cath.pl gets an annotation file from fasta36 -V with a line of the form:
+
+# gi|62822551|sp|P00502|GSTA1_RAT Glutathione S-transfer\n (at least from pir1.lseg)
+#
+# it must:
+# (1) read in the line
+# (2) parse it to get the up_acc
+# (3) return the tab delimited features
+#
+
+# this version is designed for various formats of the pdbaa/pdbaa_off NCBI files with the lines:
+# >gi|4388890|pdb|1GTUA|sp|P09488 or
+# >gi|4388890|pdb|1GTU|A
+# if I can find |sp|P09488, I will use that, otherwise I will use
+# |pdb|1GTU|A (concatenated) and a different part of the cath_dom
+# database
+#
+
+use strict;
+
+use DBI;
+use Getopt::Long;
+use Pod::Usage;
+
+use vars qw($host $db $port $user $pass);
+
+# Default connection parameters for the annotation database; each one can
+# be overridden with --host, --db, --port, --user and --password.
+# NOTE(review): the default account name and password are hard-coded here.
+($host, $db, $port, $user, $pass) = ("wrpxdb.its.virginia.edu", "uniprot", 0, "web_user", "fasta_www");
+
+# option flags (all off by default) and the minimum gap length for a NODOM
+my ($neg_doms, $lav, $shelp, $help, $class) = (0, 0, 0, 0, 0);
+my ($min_nodom) = (10);
+
+# separator written between a domain label and its number on output
+my $color_sep_str = '~';
+
+GetOptions(
+    "host=s" => \$host,
+    "db=s" => \$db,
+    "user=s" => \$user,
+    "password=s" => \$pass,
+    "port=i" => \$port,
+    "lav" => \$lav,
+    "neg" => \$neg_doms,
+    "neg_doms" => \$neg_doms,
+    "neg-doms" => \$neg_doms,
+    "min_nodom=i" => \$min_nodom,
+    "class" => \$class,
+    "h|?" => \$shelp,
+    "help" => \$help,
+    );
+
+# -h/-? prints the short usage, --help the full documentation.  Usage is
+# also printed when there are no arguments and STDIN is neither a pipe
+# nor a regular file.
+pod2usage(1) if $shelp;
+pod2usage(exitstatus => 0, verbose => 2) if $help;
+pod2usage(1) unless ( @ARGV || -p STDIN || -f STDIN);
+
+# Build the DBI data-source name; host and port are appended only when
+# they are true (the default port of 0 therefore adds nothing).
+my $connect = "dbi:mysql(AutoCommit=>1,RaiseError=>1):database=$db";
+$connect .= ";host=$host" if $host;
+$connect .= ";port=$port" if $port;
+
+my $dbh = DBI->connect($connect,
+ $user,
+ $pass
+ ) or die $DBI::errstr;
+
+# %domains maps a domain label to the number appended to it on output
+# (NODOM is pre-assigned 0); $domain_cnt is the last number handed out.
+my %domains = (NODOM=>0);
+my $domain_cnt = 0;
+
+# offsets for one pdb_acc, ordered by res_beg
+my $get_offsets_pdb = $dbh->prepare(<<EOSQL);
+SELECT res_beg, pdb_beg, pdb_end, sp_beg, sp_end
+FROM pdb_chain_up
+WHERE pdb_acc=?
+ORDER BY res_beg
+EOSQL
+
+# CATH domains for one pdb_acc, ordered by s_start; s_descr is returned as "info"
+my $get_cathdoms_pdb = $dbh->prepare(<<EOSQL);
+SELECT s_start, s_stop, p_start, p_stop, cath_class, s_descr as info
+FROM cath_doms
+JOIN cath_names using(cath_class)
+WHERE pdb_acc=?
+ORDER BY s_start
+EOSQL
+
+# file-scope scratch variables
+my ($tmp, $gi, $sdb, $acc, $id, $use_acc);
+
+# The first argument is the query (an accession string or a file name);
+# the optional second argument is the sequence length, which defaults to 1.
+my ($query, $seq_len) = @ARGV;
+$seq_len ||= 1;
+
+if ($query) { $query =~ s/^>//; }
+
+my @annots = ();
+
+if ($query && $query =~ m/[\|:]/) {
+    # the argument contains '|' or ':', so treat it as an accession
+    push @annots, show_annots($lav, "$query $seq_len");
+}
+else {
+    # otherwise read accession lines through <> (files in @ARGV, or STDIN)
+    while (my $a_line = <>) {
+        $a_line =~ s/^>//;
+        chomp $a_line;
+        push @annots, show_annots($lav, $a_line);
+    }
+}
+
+# One ">seq_info" line per sequence, then one tab-separated line per
+# feature.  Unless --lav is set, a label known to %domains gets the
+# separator and its domain number appended.
+foreach my $seq_annot (@annots) {
+    print ">", $seq_annot->{seq_info}, "\n";
+    foreach my $annot (@{$seq_annot->{list}}) {
+        if (!$lav) {
+            my $dom_num = $domains{$annot->[-1]};
+            $annot->[-1] .= $color_sep_str.$dom_num if defined($dom_num);
+        }
+        print join("\t", @$annot), "\n";
+    }
+}
+
+exit(0);
+
+# show_annots($lav, $annot_line)
+#
+# Parse one request line of the form "<seq-id> [<length>]", look up the
+# chain offsets and the CATH domains for it, and return a hash ref
+#   { seq_info => <seq-id>, list => <feature rows from get_cath_annots> }.
+#
+# Identifier forms recognised:
+#   gi|<gi>|pdb|<id+chain>|sp|<acc>   (sets $off_flag)
+#   gi|<gi>|pdb|<id>|<chain>          (id and chain are concatenated)
+#   sp|<acc>|...                      (second field is the lookup key)
+#   pdb|<id>|<chain>                  (id and chain are concatenated)
+#   anything else is used verbatim as the lookup key
+sub show_annots {
+    my ($lav, $annot_line) = @_;
+
+    my ($query, $seq_len) = split(/\s+/,$annot_line);
+
+    # A line read from a file may carry no length field.  Default to 1,
+    # as the command-line path does, so the numeric comparisons below
+    # never see undef.
+    $seq_len = 1 unless $seq_len;
+
+    my %annot_data = (seq_info => $query);
+
+    my ($tmp, $gi, $pdb, $pdb_acc, $pdb_id, $pdb_chain, $sdb, $up_acc, $off_flag);
+
+    $off_flag = 0;
+    if ($query =~ m/^gi\|/) {
+        if ($query =~ m/\|sp\|/) {
+            ($tmp, $gi, $pdb, $pdb_acc, $sdb, $up_acc) = split(/\|/,$query);
+            # strip a trailing ".<version>" when the sixth field is present
+            $up_acc =~ s/\.\d+$// if defined($up_acc);
+            $off_flag = 1;
+        }
+        elsif ($query =~ m/\|pdb\|/) {
+            ($tmp, $gi, $pdb, $pdb_id, $pdb_chain) = split(/\|/,$query);
+            $pdb_acc = $pdb_id . $pdb_chain;
+        }
+    }
+    elsif ($query =~ m/^sp\|/) {
+        ($pdb, $pdb_acc) = split(/\|/,$query);
+    }
+    elsif ($query =~ m/^pdb\|(\w{4})\|(\w)/) {
+        $pdb_acc = $1 . $2;
+    }
+    else {
+        $pdb_acc = $query;
+    }
+
+    # only the first row (lowest res_beg) is read
+    $get_offsets_pdb->execute($pdb_acc);
+    my ($res_beg, $pdb_beg, $pdb_end, $sp_beg, $sp_end) = $get_offsets_pdb->fetchrow_array();
+
+    $res_beg = 1 unless defined($res_beg);
+    $pdb_beg = 1 unless defined($pdb_beg);
+    $sp_beg = 1 unless defined($sp_beg);
+
+    # grow the sequence length to cover the largest end coordinate found
+    if (defined($sp_end) && $sp_end > $seq_len) {$seq_len = $sp_end;}
+    if (defined($pdb_end) && $pdb_end > $seq_len) {$seq_len = $pdb_end;}
+
+    $get_cathdoms_pdb->execute($pdb_acc);
+    $annot_data{list} = get_cath_annots($lav, $get_cathdoms_pdb, $pdb_beg, $seq_len, $off_flag);
+
+    return \%annot_data;
+}
+
+# get_cath_annots($lav, $get_annots, $sp_offset, $seq_length, $is_offset)
+#
+# Read the domain rows from the executed statement handle $get_annots and
+# return an array ref of feature rows:
+#   [start, end, info]        when $lav is true
+#   [start, '-', end, info]   otherwise
+# An empty array ref is returned when there are no rows.
+#
+# $sp_offset is subtracted from every coordinate.  When $is_offset is
+# true and a row has a true p_start, the p_start/p_stop columns are used;
+# otherwise s_start/s_stop.  $seq_length clips domain ends; if it is <= 1
+# it is taken from the first domain's end instead.
+sub get_cath_annots {
+    my ($lav, $get_annots, $sp_offset, $seq_length, $is_offset) = @_;
+
+    my @cath_domains = ();
+
+    # get the list of domains, sorted by start
+    while ( my $row_href = $get_annots->fetchrow_hashref()) {
+
+        if ($is_offset && $row_href->{p_start}) {
+            $row_href->{seq_start} = $row_href->{p_start} - $sp_offset;
+            $row_href->{seq_end} = $row_href->{p_stop} - $sp_offset;
+        }
+        else {
+            $row_href->{seq_start} = $row_href->{s_start} - $sp_offset;
+            $row_href->{seq_end} = $row_href->{s_stop} - $sp_offset;
+        }
+
+        if ($seq_length <= 1) {
+            $seq_length = $row_href->{seq_end};
+        }
+        else {
+            $row_href->{seq_end} = $seq_length if ($row_href->{seq_end} > $seq_length);
+        }
+
+        # labels are written as a single whitespace-free token
+        $row_href->{info} =~ s/\s+/_/g;
+
+        push @cath_domains, $row_href
+    }
+
+    return [] unless (scalar(@cath_domains));
+
+    # Consistency check: when a domain starts at or before the end of the
+    # previous one, split the overlap between them.  The previous end is
+    # pulled back by half the overlap (kept an integer) and the current
+    # domain starts on the next residue.
+    for (my $i=1; $i < scalar(@cath_domains); $i++) {
+        my ($prev, $cur) = @cath_domains[$i-1, $i];
+        if ($cur->{seq_start} <= $prev->{seq_end}) {
+            my $overlap = $prev->{seq_end} - $cur->{seq_start};
+            $prev->{seq_end} -= int($overlap/2);
+            $cur->{seq_start} = $prev->{seq_end}+1;
+        }
+    }
+
+    if ($neg_doms) {
+        # insert NODOM pseudo-domains into gaps longer than $min_nodom,
+        # plus one covering any tail after the last domain
+        my @ncath_domains;
+        my $prev_dom={seq_end=>0};
+        for my $cur_dom ( @cath_domains) {
+            if ($cur_dom->{seq_start} - $prev_dom->{seq_end} > $min_nodom) {
+                my %new_dom = (seq_start=>$prev_dom->{seq_end}+1, seq_end => $cur_dom->{seq_start}-1, info=>'NODOM');
+                push @ncath_domains, \%new_dom;
+            }
+            push @ncath_domains, $cur_dom;
+            $prev_dom = $cur_dom;
+        }
+        my %new_dom = (seq_start=>$prev_dom->{seq_end}+1, seq_end=>$seq_length, info=>'NODOM');
+        if ($new_dom{seq_end} > $new_dom{seq_start}) {push @ncath_domains, \%new_dom;}
+
+        @cath_domains = @ncath_domains;
+    }
+
+    # with --class the label is the cath_class value (when the row has
+    # one); every label is registered so it gets a number on output
+    for my $cath (@cath_domains) {
+        if ($class && $cath->{cath_class}) {$cath->{info} = $cath->{cath_class};}
+        $cath->{info} = domain_name($cath->{info});
+    }
+
+    my @feats = ();
+
+    if ($lav) {
+        for my $d_ref (@cath_domains) {
+            push @feats, [$d_ref->{seq_start}, $d_ref->{seq_end}, $d_ref->{info} ];
+        }
+    }
+    else {
+        for my $d_ref (@cath_domains) {
+            push @feats, [$d_ref->{seq_start}, '-', $d_ref->{seq_end}, $d_ref->{info} ];
+        }
+    }
+
+    return \@feats;
+
+}
+
+# domain_name($value) registers the domain label $value in %domains the
+# first time it is seen, giving it the next sequential number from
+# $domain_cnt, and returns the label unchanged.
+#
+
+sub domain_name {
+    my ($value) = @_;
+
+    $domains{$value} = ++$domain_cnt unless defined($domains{$value});
+
+    return $value;
+}
+
+__END__
+
+=pod
+
+=head1 NAME
+
+ann_pdb_cath.pl
+
+=head1 SYNOPSIS
+
+ ann_pdb_cath.pl --neg 'sp|P09488|GSTM1_HUMAN' | accession.file
+
+=head1 OPTIONS
+
+ -h short help
+ --help include description
+
+ --lav produce lav2plt.pl annotation format, only show domains/repeats
+ --neg-doms, -- report domains between annotated domains as NODOM
+ (also --neg, --neg_doms)
+ --min_nodom=10 -- minimum length between domains for NODOM
+
+ --host, --user, --password, --port --db -- info for mysql database
+
+=head1 DESCRIPTION
+
+C<ann_pdb_cath.pl> extracts CATH domain information from a mysql
+database (by default the C<uniprot> database). Currently, the program
+works with database sequence descriptions in one of these formats:
+
+ >gi|4388890|pdb|1GTUA|sp|P09488
+
+(a PDB chain followed by a UniProt accession) and
+
+ >gi|4388890|pdb|1GTU|A
+
+ >pdb|1GTU|A
+
+C<ann_pdb_cath.pl> uses the C<pdb_chain_up> table to find the
+coordinate offsets of a chain, and the C<cath_doms> and C<cath_names>
+tables to extract its domains. When consecutive domains overlap, their
+boundaries are adjusted so that they no longer overlap. With
+C<--neg>, regions between annotated domains are reported as NODOM.
+With C<--class>, the CATH class is used as the domain label.
+
+C<ann_pdb_cath.pl> is designed to be used by the B<FASTA> programs with
+the C<-V \!ann_pdb_cath.pl> or C<-V "\!ann_pdb_cath.pl --neg"> option.
+
+=head1 AUTHOR
+
+William R. Pearson, wrp at virginia.edu
+
+=cut
diff --git a/scripts/ann_pdb_vast.pl b/scripts/ann_pdb_vast.pl
new file mode 100755
index 0000000..130af5f
--- /dev/null
+++ b/scripts/ann_pdb_vast.pl
@@ -0,0 +1,320 @@
+#!/usr/bin/perl -w
+
+################################################################
+# copyright (c) 2014, 2015 by William R. Pearson and The Rector &
+# Visitors of the University of Virginia */
+################################################################
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under this License is distributed on an "AS
+# IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+################################################################
+
+# ann_pdb_vast.pl gets an annotation file from fasta36 -V with a line of the form:
+
+# gi|62822551|sp|P00502|GSTA1_RAT Glutathione S-transfer\n (at least from pir1.lseg)
+#
+# it must:
+# (1) read in the line
+# (2) parse it to get the up_acc
+# (3) return the tab delimited features
+#
+
+# this version is designed for various formats of the pdbaa/pdbaa_off NCBI files with the lines:
+# >gi|4388890|pdb|1GTUA|sp|P09488 or
+# >gi|4388890|pdb|1GTU|A
+# if I can find |sp|P09488, I will use that, otherwise I will use
+# |pdb|1GTU|A (concatenated) and a different part of the cath_dom
+# database
+#
+
+use strict;
+
+use LWP::Simple;
+use Getopt::Long;
+use HTML::TableExtract;
+use Pod::Usage;
+
+# package globals; --host and --db below store into $host and $db
+use vars qw($host $db $port $user $pass);
+
+# option flags (all off by default) and the minimum gap length for a NODOM
+my ($neg_doms, $lav, $shelp, $help, $class) = (0, 0, 0, 0, 0);
+my ($min_nodom) = (10);
+
+# separator written between a domain label and its number on output;
+# the second assignment is the one that takes effect
+my $color_sep_str = " :";
+$color_sep_str = '~';
+
+GetOptions(
+ "host=s" => \$host,
+ "db=s" => \$db,
+ "lav" => \$lav,
+ "neg" => \$neg_doms,
+ "neg_doms" => \$neg_doms,
+ "neg-doms" => \$neg_doms,
+ "min_nodom=i" => \$min_nodom,
+ "h|?" => \$shelp,
+ "help" => \$help,
+ );
+
+# -h/-? prints the short usage, --help the full documentation; usage is
+# also printed when there are no arguments and STDIN is neither a pipe
+# nor a regular file
+pod2usage(1) if $shelp;
+pod2usage(exitstatus => 0, verbose => 2) if $help;
+pod2usage(1) unless (@ARGV || -p STDIN || -f STDIN);
+
+################################################################
+# strategy for connecting to NCBI to get list of domains
+
+# NOTE(review): this lexical $db hides the package $db that --db sets
+# above, and the search URL below hard-codes db=structure.
+my $db = "structure";
+my $report = "FASTA";
+my $pdb_acc_chain = "";
+
+# base URLs; get_vast_info appends the search term to $esearch and the
+# id it finds to $vast_url
+my $utils = "http://www.ncbi.nlm.nih.gov/entrez/eutils";
+my $vast_url = "http://www.ncbi.nlm.nih.gov/Structure/vastplus/vastplus.cgi?cmd=v&uid=";
+
+my $esearch = "$utils/esearch.fcgi?db=structure&retmax=1&term=";
+
+################################################################
+
+# %domains maps a label to the number appended to it on output; only
+# NODOM is pre-assigned here
+my %domains = (NODOM=>0);
+my $domain_cnt = 0;
+
+# file-scope scratch variables
+my ($tmp, $gi, $sdb, $acc, $id, $use_acc);
+
+# The first argument is the query (an accession string or a file name);
+# the optional second argument is the sequence length, which defaults to 1.
+my ($query, $seq_len) = @ARGV;
+$seq_len ||= 1;
+
+if ($query) { $query =~ s/^>//; }
+
+my @annots = ();
+
+if ($query && $query =~ m/\|/) {
+    # the argument contains '|', so treat it as an accession
+    push @annots, show_annots($lav, "$query $seq_len");
+}
+else {
+    # otherwise read accession lines through <> (files in @ARGV, or STDIN)
+    while (my $a_line = <>) {
+        $a_line =~ s/^>//;
+        chomp $a_line;
+        push @annots, show_annots($lav, $a_line);
+    }
+}
+
+# One ">seq_info" line per sequence, then one tab-separated line per
+# feature.  Unless --lav is set, a label known to %domains gets the
+# separator and its domain number appended.
+foreach my $seq_annot (@annots) {
+    print ">", $seq_annot->{seq_info}, "\n";
+    foreach my $annot (@{$seq_annot->{list}}) {
+        if (!$lav) {
+            my $dom_num = $domains{$annot->[-1]};
+            $annot->[-1] .= $color_sep_str.$dom_num if defined($dom_num);
+        }
+        print join("\t", @$annot), "\n";
+    }
+}
+
+exit(0);
+
+# show_annots($lav, $annot_line)
+#
+# Parse one request line of the form "<seq-id> [<length>]", derive the
+# PDB id+chain key from it, and return a hash ref
+#   { seq_info => <seq-id>, list => <feature rows from get_vast_annots> }.
+#
+# Identifier forms recognised:
+#   gi|<gi>|pdb|<id+chain>|sp|<acc>
+#   gi|<gi>|pdb|<id>|<chain>          (id and chain are concatenated)
+#   sp|<acc>|...                      (second field is the lookup key)
+#   pdb|<id>|<chain>                  (id and chain are concatenated)
+#   anything else is used verbatim as the lookup key
+sub show_annots {
+    my ($lav, $annot_line) = @_;
+
+    my ($query, $seq_len) = split(/\s+/,$annot_line);
+
+    # A line read from a file may carry no length field.  Default to 1,
+    # as the command-line path does, so later numeric comparisons never
+    # see undef.
+    $seq_len = 1 unless $seq_len;
+
+    my %annot_data = (seq_info => $query);
+
+    my ($tmp, $gi, $pdb, $pdb_acc, $pdb_id, $pdb_chain, $sdb, $up_acc, $off_flag);
+
+    $off_flag = 0;
+    if ($query =~ m/^gi\|/) {
+        if ($query =~ m/\|sp\|/) {
+            ($tmp, $gi, $pdb, $pdb_acc, $sdb, $up_acc) = split(/\|/,$query);
+            # strip a trailing ".<version>" when the sixth field is present
+            $up_acc =~ s/\.\d+$// if defined($up_acc);
+            $off_flag = 1;
+        }
+        elsif ($query =~ m/\|pdb\|/) {
+            ($tmp, $gi, $pdb, $pdb_id, $pdb_chain) = split(/\|/,$query);
+            $pdb_acc = $pdb_id . $pdb_chain;
+        }
+    }
+    elsif ($query =~ m/^sp\|/) {
+        ($pdb, $pdb_acc) = split(/\|/,$query);
+    }
+    elsif ($query =~ m/^pdb\|(\w{4})\|(\w)/) {
+        $pdb_acc = $1 . $2;
+    }
+    else {
+        $pdb_acc = $query;
+    }
+
+    # get_vast_annots reads only its first three arguments
+    $annot_data{list} = get_vast_annots($lav, $pdb_acc, $seq_len, $off_flag);
+
+    return \%annot_data;
+}
+
+# get_vast_annots($lav, $pdb_acc, $seq_length)
+#
+# Fetch the domain parts for $pdb_acc with get_vast_info() and return an
+# array ref of feature rows:
+#   [start, end, info]        when $lav is true
+#   [start, '-', end, info]   otherwise
+# An empty array ref is returned when no domains are found.
+sub get_vast_annots {
+    my ($lav, $pdb_acc, $seq_length) = @_;
+
+    my $domain_href = get_vast_info($pdb_acc);
+
+    return [] unless ($domain_href && scalar(@$domain_href));
+
+    # Consistency check: when a part starts at or before the end of the
+    # previous one, split the overlap between them.  The previous end is
+    # pulled back by half the overlap (kept an integer) and the current
+    # part starts on the next residue.
+    for (my $i=1; $i < scalar(@$domain_href); $i++) {
+        my ($prev, $cur) = @{$domain_href}[$i-1, $i];
+        if ($cur->{seq_start} <= $prev->{seq_end}) {
+            my $overlap = $prev->{seq_end} - $cur->{seq_start};
+            $prev->{seq_end} -= int($overlap/2);
+            $cur->{seq_start} = $prev->{seq_end}+1;
+        }
+    }
+
+    if ($neg_doms) {
+        # insert NODOM pseudo-domains into gaps longer than $min_nodom,
+        # plus one covering any tail after the last domain
+        my @nvast_domains;
+        my $prev_dom={seq_end=>0};
+        for my $cur_dom ( @$domain_href) {
+            if ($cur_dom->{seq_start} - $prev_dom->{seq_end} > $min_nodom) {
+                my %new_dom = (seq_start=>$prev_dom->{seq_end}+1, seq_end => $cur_dom->{seq_start}-1, info=>'NODOM');
+                push @nvast_domains, \%new_dom;
+            }
+            push @nvast_domains, $cur_dom;
+            $prev_dom = $cur_dom;
+        }
+        my %new_dom = (seq_start=>$prev_dom->{seq_end}+1, seq_end=>$seq_length, info=>'NODOM');
+        if ($new_dom{seq_end} > $new_dom{seq_start}) {push @nvast_domains, \%new_dom;}
+
+        $domain_href = \@nvast_domains;
+    }
+
+    my @feats = ();
+
+    if ($lav) {
+        for my $d_ref (@$domain_href) {
+            push @feats, [$d_ref->{seq_start}, $d_ref->{seq_end}, $d_ref->{info} ];
+        }
+    }
+    else {
+        for my $d_ref (@$domain_href) {
+            push @feats, [$d_ref->{seq_start}, '-', $d_ref->{seq_end}, $d_ref->{info} ];
+        }
+    }
+
+    return \@feats;
+
+}
+
+################################################################
+# get_vast_info($pdb_acc_chain)
+#
+# $pdb_acc_chain is a four-character PDB id followed by a one-character
+# chain (an optional leading "pdb|" is removed).  The id is looked up
+# with $esearch, the first <Id> in the reply is appended to $vast_url,
+# and the first table of that page is scanned for rows belonging to the
+# requested chain.
+#
+# Returns an array ref of { info => "VASTdom<n>", seq_start, seq_end }
+# hashes sorted by seq_start; a domain given as several comma-separated
+# ranges yields several entries with the same info.  An empty array ref
+# is returned when the accession cannot be parsed, a request fails, or
+# the expected table is missing.
+#
+
+sub get_vast_info {
+    my $pdb_acc_chain = shift @_;
+
+    my @part_list = ();
+
+    return \@part_list unless defined($pdb_acc_chain);
+
+    $pdb_acc_chain =~ s/^pdb\|//;
+
+    my ($pdb_acc, $pdb_chain) = ($pdb_acc_chain =~ m/(\w{4})(\w)/);
+    return \@part_list unless (defined($pdb_acc) && defined($pdb_chain));
+
+    # LWP::Simple::get returns undef when the request fails
+    my $esearch_result = get($esearch . $pdb_acc . "[pdb+accession]");
+    return \@part_list unless defined($esearch_result);
+
+    my ($mmdb_id) = ($esearch_result =~ m|<Id>(\d+)</Id>|s);
+    return \@part_list unless defined($mmdb_id);
+
+    my $vast_dom_html = get($vast_url.$mmdb_id);
+    return \@part_list unless defined($vast_dom_html);
+
+    my $te = HTML::TableExtract->new(depth=>0, count=>0);
+
+    $te->parse($vast_dom_html);
+
+    my ($ts) = $te->tables();
+    return \@part_list unless $ts;
+
+    my @ts_rows = $ts->rows();
+
+    # the first row is the table header
+    shift(@ts_rows);
+
+    my $current_chain = "";
+
+    my @domain_list = ();
+
+    for my $row ( @ts_rows ) {
+        # a true first cell starts a new chain; rows with an empty first
+        # cell belong to the chain seen most recently
+        if ($row->[0]) {
+            my ($chain) = ($row->[0] =~ m/^\[(\w+)\]/);
+            $current_chain = defined($chain) ? $chain : "";
+        }
+        next unless ($current_chain eq $pdb_chain);
+        next if (defined($row->[1]) && $row->[1] =~ m/^Entire/);
+
+        # $range can actually be a list of ranges. Need to remove the spaces
+        my $range = defined($row->[2]) ? $row->[2] : "";
+        $range =~ s/\s+//g;
+
+        push @domain_list, $range;
+    }
+
+    # here we have the list of domains, indexed, but some domains will have multiple parts
+
+    my $dom_cnt=1;
+    for my $range (@domain_list) {
+        for my $part ( split(/,/,$range) ) {
+            my ($start, $end) = ($part =~ m/(\d+)\s*\-\s*(\d+)/);
+            # skip anything that is not a "start-end" pair
+            next unless (defined($start) && defined($end));
+            push @part_list, {info=>"VASTdom".$dom_cnt, seq_start=>$start, seq_end=>$end};
+        }
+        $dom_cnt++;
+    }
+
+    @part_list = sort { $a->{seq_start} <=> $b->{seq_start} } @part_list;
+
+    return \@part_list;
+}
+
+__END__
+
+=pod
+
+=head1 NAME
+
+ann_pdb_vast.pl
+
+=head1 SYNOPSIS
+
+ ann_pdb_vast.pl --neg 'sp|P09488|GSTM1_HUMAN' | accession.file
+
+=head1 OPTIONS
+
+ -h short help
+ --help include description
+
+ --lav produce lav2plt.pl annotation format, only show domains/repeats
+ --neg-doms, -- report domains between annotated domains as NODOM
+ (also --neg, --neg_doms)
+ --min_nodom=10 -- minimum length between domains for NODOM
+
+=head1 DESCRIPTION
+
+C<ann_pdb_vast.pl> extracts VAST domain information for a PDB chain
+from the NCBI web site (an Entrez esearch of the structure database
+followed by a VAST+ request). Currently, the program works with
+database sequence descriptions in several formats:
+
+ >pdb|3F6F|A -- database|accession
+
+ >gi|262118558|pdb|3F6F|A -- with GI number
+
+C<ann_pdb_vast.pl> is designed to be used by the B<FASTA> programs with
+the C<-V \!ann_pdb_vast.pl> or C<-V "\!ann_pdb_vast.pl --neg"> option.
+
+=head1 AUTHOR
+
+William R. Pearson, wrp at virginia.edu
+
+=cut
diff --git a/scripts/ann_pfam27.pl b/scripts/ann_pfam27.pl
new file mode 100755
index 0000000..505a711
--- /dev/null
+++ b/scripts/ann_pfam27.pl
@@ -0,0 +1,656 @@
+#!/usr/bin/perl -w
+
+################################################################
+# copyright (c) 2014 by William R. Pearson and The Rector &
+# Visitors of the University of Virginia */
+################################################################
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under this License is distributed on an "AS
+# IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+################################################################
+
+# ann_pfam27.pl gets an annotation file from fasta36 -V with a line of the form:
+
+# gi|62822551|sp|P00502|GSTA1_RAT Glutathione S-transfer\n (at least from pir1.lseg)
+#
+# it must:
+# (1) read in the line
+# (2) parse it to get the up_acc
+# (3) return the tab delimited features
+#
+
+# this version only annotates sequences known to Pfam:pfamseq:
+# >pf26|164|O57809|1A1D_PYRHO
+# and only provides domain information
+
+use strict;
+
+use DBI;
+use Getopt::Long;
+use Pod::Usage;
+
+# package globals holding the database connection parameters
+use vars qw($host $db $port $user $pass);
+
+my $hostname = `/bin/hostname`;
+
+# defaults; each can be overridden with --host, --db, --port, --user, --password
+# NOTE(review): the default account name and password are hard-coded here.
+($host, $db, $port, $user, $pass) = ("wrpxdb.its.virginia.edu", "pfam27", 0, "web_user", "fasta_www");
+#$host = 'xdb';
+
+# option flags, all off by default
+my ($auto_reg,$rpd2_fams, $vdoms, $neg_doms, $lav, $no_doms, $no_clans, $pf_acc, $no_over, $acc_comment, $shelp, $help) =
+ (0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0);
+# minimum lengths for NODOM regions and for virtual (--vdoms) domains
+my ($min_nodom, $min_vdom) = (10,10);
+
+# separator written between a domain label and its number on output;
+# the second assignment is the one that takes effect
+my $color_sep_str = " :";
+$color_sep_str = '~';
+
+
+GetOptions(
+ "host=s" => \$host,
+ "db=s" => \$db,
+ "user=s" => \$user,
+ "password=s" => \$pass,
+ "port=i" => \$port,
+ "lav" => \$lav,
+ "acc_comment" => \$acc_comment,
+ "no-over" => \$no_over,
+ "no_over" => \$no_over,
+ "no-clans" => \$no_clans,
+ "no_clans" => \$no_clans,
+ "neg" => \$neg_doms,
+ "neg_doms" => \$neg_doms,
+ "neg-doms" => \$neg_doms,
+ "min_nodom=i" => \$min_nodom,
+ "pfacc" => \$pf_acc,
+ "RPD2" => \$rpd2_fams,
+ "auto_reg" => \$auto_reg,
+ "vdoms" => \$vdoms,
+ "h|?" => \$shelp,
+ "help" => \$help,
+ );
+
+# -h/-? prints the short usage, --help the full documentation; usage is
+# also printed when there are no arguments and STDIN is neither a pipe
+# nor a regular file
+pod2usage(1) if $shelp;
+pod2usage(exitstatus => 0, verbose => 2) if $help;
+pod2usage(1) unless (@ARGV || -p STDIN || -f STDIN);
+
+# Build the DBI data-source name; host and port are appended only when
+# they are true (the default port of 0 therefore adds nothing).
+my $connect = "dbi:mysql(AutoCommit=>1,RaiseError=>1):database=$db";
+$connect .= ";host=$host" if $host;
+$connect .= ";port=$port" if $port;
+
+my $dbh = DBI->connect($connect,
+ $user,
+ $pass
+ ) or die $DBI::errstr;
+
+# %domains maps a domain label to its number (NODOM is 0); @domain_list
+# is indexed by that number; $domain_cnt is the last number handed out.
+my %annot_types = ();
+my %domains = (NODOM=>0);
+my %domain_clan = (NODOM => {clan_id => 'NODOM', clan_acc=>0, domain_cnt=>0});
+my @domain_list = (0);
+my $domain_cnt = 0;
+
+# subroutine that show_annots calls to turn query rows into feature rows
+my $get_annot_sub = \&get_pfam_annots;
+
+# domains for a pfamseq accession, ordered by seq_start
+my $get_pfam_acc = $dbh->prepare(<<EOSQL);
+
+SELECT seq_start, seq_end, model_start, model_end, model_length, auto_pfamA, pfamA_acc, pfamA_id, auto_pfamA_reg_full, domain_evalue_score as evalue, length
+FROM pfamseq
+JOIN pfamA_reg_full_significant using(auto_pfamseq)
+JOIN pfamA USING (auto_pfamA)
+WHERE in_full = 1
+AND pfamseq_acc=?
+ORDER BY seq_start
+
+EOSQL
+
+# domains for a 'ref' accession, mapped to a pfamseq accession through
+# seqdb_demo2.annot; this query does not select the model_* columns
+my $get_pfam_refacc = $dbh->prepare(<<EOSQL);
+
+SELECT seq_start, seq_end, auto_pfamA, pfamA_acc, pfamA_id, auto_pfamA_reg_full, domain_evalue_score as evalue, length
+FROM pfamseq
+JOIN pfamA_reg_full_significant using(auto_pfamseq)
+JOIN pfamA USING (auto_pfamA)
+JOIN seqdb_demo2.annot as sa1 on(sa1.acc=pfamseq_acc and sa1.db='sp')
+JOIN seqdb_demo2.annot as sa2 using(prot_id)
+WHERE in_full = 1
+AND sa2.acc=?
+AND sa2.db='ref'
+ORDER BY seq_start
+
+EOSQL
+
+# statement used by default; show_annots resets it for every line
+my $get_annots_sql = $get_pfam_acc;
+
+# domains for a pfamseq id, ordered by seq_start; this query does not
+# select the model_* columns
+my $get_pfam_id = $dbh->prepare(<<EOSQL);
+
+SELECT seq_start, seq_end, auto_pfamA, pfamA_acc, pfamA_id, auto_pfamA_reg_full, domain_evalue_score as evalue, length
+FROM pfamseq
+JOIN pfamA_reg_full_significant using(auto_pfamseq)
+JOIN pfamA USING (auto_pfamA)
+WHERE in_full=1
+AND pfamseq_id=?
+ORDER BY seq_start
+
+EOSQL
+
+# clan accession and id for one auto_pfamA
+my $get_pfam_clan = $dbh->prepare(<<EOSQL);
+
+SELECT clan_acc, clan_id
+FROM clans
+JOIN clan_membership using(auto_clan)
+WHERE auto_pfamA=?
+
+EOSQL
+
+# every (auto_pfamA, clan) pair with a non-NULL clan in ljm_db.RPD2_final_fams
+my $get_rpd2_clans = $dbh->prepare(<<EOSQL);
+
+SELECT auto_pfamA, clan
+FROM ljm_db.RPD2_final_fams
+WHERE clan is not NULL
+
+EOSQL
+
+# -- LEFT JOIN clan_membership USING (auto_pfamA)
+# -- LEFT JOIN clans using(auto_clan)
+
+# file-scope variables that show_annots fills in while parsing a line
+my ($tmp, $gi, $sdb, $acc, $id, $use_acc);
+
+# The first argument is the query (an accession string or a file name);
+# the optional second argument is the sequence length, which defaults to 0.
+my ($query, $seq_len) = @ARGV;
+$seq_len = 0 if (!defined($seq_len));
+
+if ($query) { $query =~ s/^>//; }
+
+my @annots = ();
+
+# with --RPD2, load the auto_pfamA => clan table once, up front
+my %rpd2_clan_fams = ();
+
+if ($rpd2_fams) {
+    $get_rpd2_clans->execute();
+    while (my ($auto_pfam, $auto_clan) = $get_rpd2_clans->fetchrow_array()) {
+        $rpd2_clan_fams{$auto_pfam} = $auto_clan;
+    }
+}
+
+if ($query && $query =~ m/[\|:]/) {
+    # the argument contains '|' or ':', so treat it as an accession
+    push @annots, show_annots("$query $seq_len", $get_annot_sub);
+}
+else {
+    # otherwise read accession lines through <> (files in @ARGV, or STDIN)
+    while (my $a_line = <>) {
+        $a_line =~ s/^>//;
+        chomp $a_line;
+        push @annots, show_annots($a_line, $get_annot_sub);
+    }
+}
+
+# Print a ">seq_info" line for each sequence, then one tab-separated line
+# per feature.  Unless --lav is set, a label known to %domains is
+# rewritten as
+#   <name>[{<@domain_list entry>}]<color_sep_str><number>
+# where <name> and <number> are what domain_num() returns and the braced
+# part is added only with --acc_comment.
+for my $seq_annot (@annots) {
+    print ">",$seq_annot->{seq_info},"\n";
+    for my $annot (@{$seq_annot->{list}}) {
+        if (!$lav && defined($domains{$annot->[-1]})) {
+            my ($a_name, $a_num) = domain_num($annot->[-1],$domains{$annot->[-1]});
+            my $label = $a_name;
+            # Build the comment before the separator and number are added,
+            # so that it is part of the final label.
+            # NOTE(review): assumes $a_num indexes @domain_list -- confirm
+            # against domain_num().
+            if ($acc_comment && defined($domain_list[$a_num])) {
+                $label .= "{$domain_list[$a_num]}";
+            }
+            $annot->[-1] = $label.$color_sep_str.$a_num;
+        }
+        print join("\t",@$annot),"\n";
+    }
+}
+
+exit(0);
+
+# show_annots($query_len, $get_annot_sub)
+#
+# $query_len is one request line, "<seq-id> [<length>]".  The id is parsed
+# to choose a prepared query and a lookup key, the query is executed, and
+# $get_annot_sub is called with the statement handle and the length.
+# Returns a hash ref { seq_info => <seq-id>, list => <result of $get_annot_sub> }.
+#
+# The file-level variables $use_acc, $get_annots_sql, $sdb, $gi, $acc,
+# $id and $tmp are assigned here.
+sub show_annots {
+ my ($query_len, $get_annot_sub) = @_;
+
+ my ($annot_line, $seq_len) = split(/\s+/,$query_len);
+
+ my $pfamA_acc;
+
+ my %annot_data = (seq_info=>$annot_line);
+
+ # default: look up by accession with the pfamseq_acc query
+ $use_acc = 1;
+ $get_annots_sql = $get_pfam_acc;
+
+ if ($annot_line =~ m/^pf26\|/) {
+ ($sdb, $gi, $acc, $id) = split(/\|/,$annot_line);
+ # pf26 ids switch the connection's default database
+ $dbh->do("use RPD2_pfam");
+ }
+ elsif ($annot_line =~ m/^gi\|/) {
+ ($tmp, $gi, $sdb, $acc, $id) = split(/\|/,$annot_line);
+ if ($sdb =~ m/ref/) {
+ $get_annots_sql = $get_pfam_refacc;
+ }
+ }
+ elsif ($annot_line =~ m/^sp\|/) {
+ ($sdb, $acc, $id) = split(/\|/,$annot_line);
+ }
+ elsif ($annot_line =~ m/^ref\|/) {
+ ($sdb, $acc) = split(/\|/,$annot_line);
+ $get_annots_sql = $get_pfam_refacc;
+ }
+ elsif ($annot_line =~ m/^tr\|/) {
+ ($sdb, $acc, $id) = split(/\|/,$annot_line);
+ }
+ elsif ($annot_line =~ m/^SP:/i) {
+ # "SP:<id>" is the only form looked up by id rather than accession
+ ($sdb, $id) = split(/:/,$annot_line);
+ $use_acc = 0;
+ }
+ else {
+ $use_acc = 1;
+ ($acc) = split(/\s+/,$annot_line);
+ }
+
+ # look up by id, or by accession with any ".<version>" suffix removed
+ unless ($use_acc) {
+ $get_annots_sql = $get_pfam_id;
+ $get_annots_sql->execute($id);
+ }
+ else {
+ $acc =~ s/\.\d+$//;
+ $get_annots_sql->execute($acc);
+ }
+
+ $annot_data{list} = $get_annot_sub->($get_annots_sql, $seq_len);
+
+ return \%annot_data;
+}
+
+# get_pfam_annots($get_annots, $seq_length)
+#
+# Read the domain rows from the executed statement handle $get_annots and
+# return an array ref of feature rows:
+#   [start, end, info]        when $lav is set
+#   [start, '-', end, info]   otherwise
+#
+# Steps, in order:
+#   1. choose the label (--auto_reg, --pfacc, or pfamA_id) and clip each
+#      domain to $seq_length
+#   2. --no_over: resolve overlapping domains
+#   3. --vdoms:   add virtual '@' domains for unaligned model ends
+#   4. --neg:     add NODOM regions between domains
+#   5. register every label with domain_name()
+sub get_pfam_annots {
+ my ($get_annots, $seq_length) = @_;
+
+ $seq_length = 0 unless $seq_length;
+
+ my @pf_domains = ();
+
+ # get the list of domains, sorted by start
+ while ( my $row_href = $get_annots->fetchrow_hashref()) {
+ if ($auto_reg) {
+ $row_href->{info} = $row_href->{auto_pfamA_reg_full};
+ }
+ elsif ($pf_acc) {
+ $row_href->{info} = $row_href->{pfamA_acc};
+ }
+ else {
+ $row_href->{info} = $row_href->{pfamA_id};
+ }
+
+ # when no length was supplied, take it from the row's length column
+ if ($row_href && $row_href->{length} > $seq_length && $seq_length == 0) { $seq_length = $row_href->{length};}
+
+ # drop domains that start at or beyond the end; clip those that run past it
+ next if ($row_href->{seq_start} >= $seq_length);
+ if ($row_href->{seq_end} > $seq_length) {
+ $row_href->{seq_end} = $seq_length;
+ }
+
+ push @pf_domains, $row_href
+ }
+
+ # check for domain overlap, and resolve check for domain overlap
+ # (possibly more than 2 domains), choosing the domain with the best
+ # evalue
+
+ if($no_over && scalar(@pf_domains) > 1) {
+
+ my @tmp_domains = @pf_domains;
+ my @save_domains = ();
+
+ my $prev_dom = shift @tmp_domains;
+
+ while (my $curr_dom = shift @tmp_domains) {
+
+ my @overlap_domains = ($prev_dom);
+
+ my $diff = $prev_dom->{seq_end} - $curr_dom->{seq_start};
+ # check for overlap > domain_length/3
+
+ my ($prev_len, $cur_len) = ($prev_dom->{seq_end}-$prev_dom->{seq_start}+1, $curr_dom->{seq_end}-$curr_dom->{seq_start}+1);
+ # true when either domain lies entirely inside the other
+ my $inclusion = ((($curr_dom->{seq_start} >= $prev_dom->{seq_start}) && ($curr_dom->{seq_end} <= $prev_dom->{seq_end})) ||
+ (($curr_dom->{seq_start} <= $prev_dom->{seq_start}) && ($curr_dom->{seq_end} >= $prev_dom->{seq_end})));
+
+ my $longer_len = ($prev_len > $cur_len) ? $prev_len : $cur_len;
+
+ # collect every following domain that is included in, or overlaps more
+ # than a third of the longer length with, $prev_dom
+ while ($inclusion || ($diff > 0 && $diff > $longer_len/3)) {
+ push @overlap_domains, $curr_dom;
+ $curr_dom = shift @tmp_domains;
+ last unless $curr_dom;
+ $diff = $prev_dom->{seq_end} - $curr_dom->{seq_start};
+ ($prev_len, $cur_len) = ($prev_dom->{seq_end}-$prev_dom->{seq_start}+1, $curr_dom->{seq_end}-$curr_dom->{seq_start}+1);
+ $longer_len = ($prev_len > $cur_len) ? $prev_len : $cur_len;
+ $inclusion = ((($curr_dom->{seq_start} >= $prev_dom->{seq_start}) && ($curr_dom->{seq_end} <= $prev_dom->{seq_end})) ||
+ (($curr_dom->{seq_start} <= $prev_dom->{seq_start}) && ($curr_dom->{seq_end} >= $prev_dom->{seq_end})));
+ }
+
+ # check for overlapping domains; >1 because $prev_dom is always there
+ if (scalar(@overlap_domains) > 1 ) {
+ # if $rpd2_fams, check for a chosen one
+ # (when none of the group is in %rpd2_clan_fams, $prev_dom is left as it was)
+ if ($rpd2_fams) {
+ for my $dom (@overlap_domains) {
+ if ($rpd2_clan_fams{$dom->{auto_pfamA}}) {
+ $prev_dom = $dom;
+ last;
+ }
+ }
+ }
+ else {
+ @overlap_domains = sort { $a->{evalue} <=> $b->{evalue} } @overlap_domains;
+ $prev_dom = $overlap_domains[0];
+ }
+ }
+
+ # $prev_dom should be the best of the overlaps, and we are no longer overlapping > dom_length/3
+ push @save_domains, $prev_dom;
+ $prev_dom = $curr_dom;
+ }
+ if ($prev_dom) {push @save_domains, $prev_dom;}
+
+ @pf_domains = @save_domains;
+
+ # now check for smaller overlaps
+ # (the earlier domain's end is pulled back by half the overlap and the
+ # later domain starts on the next residue)
+ for (my $i=1; $i < scalar(@pf_domains); $i++) {
+ if ($pf_domains[$i-1]->{seq_end} >= $pf_domains[$i]->{seq_start}) {
+ my $overlap = $pf_domains[$i-1]->{seq_end} - $pf_domains[$i]->{seq_start};
+ $pf_domains[$i-1]->{seq_end} -= int($overlap/2);
+ $pf_domains[$i]->{seq_start} = $pf_domains[$i-1]->{seq_end}+1;
+ }
+ }
+ }
+
+ # $vdoms -- virtual Pfam domains -- the equivalent of $neg_doms,
+ # but covering parts of a Pfam model that are not annotated. split
+ # domains have been joined, so simply check beginning and end of
+ # each domain (but must also check for bounded-ness)
+ # only add when 10% or more is missing and missing length > $min_nodom
+ # NOTE(review): the code below compares against $min_vdom only; the 10%
+ # test appears only in commented-out lines.
+
+ if ($vdoms && scalar(@pf_domains)) {
+ my @vpf_domains;
+
+ my $curr_dom = $pf_domains[0];
+ # sequence length as reported by the first row's length column
+ my $length = $curr_dom->{length};
+
+ my $prev_dom={seq_end=>0, pfamA_acc=>''};
+ my $prev_dom_end = 0;
+ my $next_dom_start = $length+1;
+
+ for (my $dom_ix=0; $dom_ix < scalar(@pf_domains); $dom_ix++ ) {
+ $curr_dom = $pf_domains[$dom_ix];
+
+ my $pfamA = $curr_dom->{pfamA_acc};
+
+ # first, look left, is there a domain there (if there is,
+ # it should be updated right
+
+ # my $min_vdom = $curr_dom->{model_length} / 10;
+
+ if ($prev_dom->{pfamA_acc}) { # look for previous domain
+ $prev_dom_end = $prev_dom->{seq_end};
+ }
+
+ # there is a domain to the left, how much room is available?
+ my $left_dom_len = min($curr_dom->{seq_start}-$prev_dom_end-1, $curr_dom->{model_start}-1);
+ if ( $left_dom_len > $min_vdom) {
+ # there is room for a virtual domain
+ # NOTE(review): model_start is set to $left_dom_len here -- confirm intent.
+ my %new_dom = (seq_start=> $curr_dom->{seq_start}-$left_dom_len,
+ seq_end => $curr_dom->{seq_start}-1,
+ info=>'@'.$curr_dom->{info},
+ model_length=>$curr_dom->{model_length},
+ model_end => $curr_dom->{model_start}-1,
+ model_start => $left_dom_len,
+ pfamA_acc=>$pfamA,
+ );
+ push @vpf_domains, \%new_dom;
+ }
+
+ # save the current domain
+ push @vpf_domains, $curr_dom;
+ $prev_dom = $curr_dom;
+
+ if ($dom_ix < $#pf_domains) { # there is a domain to the right
+ # first, give all the extra space to the first domain (no splitting)
+ $next_dom_start = $pf_domains[$dom_ix+1]->{seq_start};
+ }
+ else {
+ $next_dom_start = $length;
+ }
+
+ # is there room for a virtual domain right
+
+ my $right_dom_len = min($next_dom_start-$curr_dom->{seq_end}-1, # space available
+ $curr_dom->{model_length}-$curr_dom->{model_end} # space needed
+ );
+ if ( $right_dom_len > $min_vdom) {
+ # NOTE(review): the right-hand label is '@'.$pfamA (the accession) while
+ # the left-hand one is '@'.$curr_dom->{info} -- confirm this is intended.
+ my %new_dom = (seq_start=> $curr_dom->{seq_end}+1,
+ seq_end=> $curr_dom->{seq_end}+$right_dom_len,
+ info=>'@'.$pfamA,
+ model_length => $curr_dom->{model_length},
+ pfamA_acc=> $pfamA,
+ );
+ push @vpf_domains, \%new_dom;
+ $prev_dom = \%new_dom;
+ }
+ } # all done, check for last one
+
+ # $curr_dom=$pf_domains[-1];
+ # # my $min_vdom = $curr_dom->{model_length}/10;
+
+ # my $right_dom_len = min($length - $curr_dom->{seq_end}+1, # space available
+ # $curr_dom->{model_length}-$curr_dom->{model_end} # space needed
+ # );
+ # if ($right_dom_len > $min_vdom) {
+ # my %new_dom = (seq_start=> $curr_dom->{seq_end}+1,
+ # seq_end => $curr_dom->{seq_end}+$right_dom_len,
+ # info=>'@'.$curr_dom->{pfamA_acc},
+ # model_len=> $curr_dom->{model_len},
+ # pfamA_acc => $curr_dom->{pfamA_acc},
+ # model_start => $curr_dom->{model_end}+1,
+ # model_end => $curr_dom->{model_len},
+ # );
+
+ # push @vpf_domains, \%new_dom;
+ # }
+
+ # @vpf_domains has both old @pf_domains and new neg-domains
+ @pf_domains = @vpf_domains;
+ }
+
+ if ($neg_doms) {
+ # insert NODOM pseudo-domains into every gap longer than $min_nodom,
+ # including the one before the first domain and after the last
+ my @npf_domains;
+ my $prev_dom={seq_end=>0};
+ for my $curr_dom ( @pf_domains) {
+ if ($curr_dom->{seq_start} - $prev_dom->{seq_end} > $min_nodom) {
+ my %new_dom = (seq_start=>$prev_dom->{seq_end}+1, seq_end => $curr_dom->{seq_start}-1, info=>'NODOM');
+ push @npf_domains, \%new_dom;
+ }
+ push @npf_domains, $curr_dom;
+ $prev_dom = $curr_dom;
+ }
+ if ($seq_length - $prev_dom->{seq_end} > $min_nodom) {
+ my %new_dom = (seq_start=>$prev_dom->{seq_end}+1, seq_end=>$seq_length, info=>'NODOM');
+ if ($new_dom{seq_end} > $new_dom{seq_start}) {push @npf_domains, \%new_dom;}
+ }
+
+ # @npf_domains has both old @pf_domains and new neg-domains
+ @pf_domains = @npf_domains;
+ }
+
+ # now make sure we have useful names: colors
+ # NOTE(review): three arguments are passed, but domain_name unpacks only
+ # two, so auto_pfamA lands in its $pfamA_acc parameter -- confirm.
+
+ for my $pf (@pf_domains) {
+ $pf->{info} = domain_name($pf->{info}, $pf->{auto_pfamA}, $pf->{pfamA_acc});
+ }
+
+ my @feats = ();
+ for my $d_ref (@pf_domains) {
+ if ($lav) {
+ push @feats, [$d_ref->{seq_start}, $d_ref->{seq_end}, $d_ref->{info}];
+ }
+ else {
+ push @feats, [$d_ref->{seq_start}, '-', $d_ref->{seq_end}, $d_ref->{info} ];
+# push @feats, [$d_ref->{seq_end}, ']', '-', ""];
+ }
+
+ }
+
+ return \@feats;
+}
+
+sub min {
+ my ($first, $second) = @_;
+ # numeric minimum of two values; the first argument is returned on a tie
+ return $second < $first ? $second : $first;
+}
+
+sub max {
+ my ($first, $second) = @_;
+ # numeric maximum of two values; the first argument is returned on a tie
+ return $second > $first ? $second : $first;
+}
+
+# domain_name($value, $pfamA_acc) maps a domain label to the name that is
+# reported and makes sure that name has a number in %domains. Unless
+# $no_clans is set, a family for which the clan query returns a row is
+# renamed to "C.<clan_id>" (or to the clan_acc when $pf_acc is set), so the
+# families of one clan share a name and number. A leading '@' (virtual
+# domain) is removed for the lookup and restored on the returned name.
+
+sub domain_name {
+ # NOTE(review): the caller in this file passes three arguments (info, auto_pfamA, pfamA_acc), so $pfamA_acc receives the second one -- confirm which key the clan query expects
+ my ($value, $pfamA_acc) = @_;
+ my $is_virtual = 0;
+
+ if ($value =~ m/^@/) {
+ $is_virtual = 1;
+ $value =~ s/^@//;
+ }
+
+ # check for clan:
+ if ($no_clans) {
+ if (! defined($domains{$value})) {
+ $domain_clan{$value} = 0;
+ $domains{$value} = ++$domain_cnt;
+ push @domain_list, $pfamA_acc;
+ }
+ }
+ elsif (!defined($domain_clan{$value})) {
+ ## only do this for new domains, old domains have known mappings
+
+ ## ways to highlight the same domain:
+ # (1) for clans, substitute clan name for family name
+ # (2) for clans, use the same color for the same clan, but don't change the name
+ # (3) for clans, combine family name with clan name, but use colors based on clan
+
+ # check to see if it's a clan
+ $get_pfam_clan->execute($pfamA_acc);
+
+ my $pfam_clan_href=0;
+
+ if ($pfam_clan_href=$get_pfam_clan->fetchrow_hashref()) { # is a clan
+ my ($clan_id, $clan_acc) = @{$pfam_clan_href}{qw(clan_id clan_acc)};
+
+ # now check to see if we have seen this clan before (if so, do not increment $domain_cnt)
+ my $c_value = "C." . $clan_id;
+ if ($pf_acc) {$c_value = $clan_acc;}
+ # remember the clan under the family name so later calls skip the query
+ $domain_clan{$value} = {clan_id => $clan_id,
+ clan_acc => $clan_acc};
+
+ if ($domains{$c_value}) {
+ $domain_clan{$value}->{domain_cnt} = $domains{$c_value};
+ $value = $c_value;
+ }
+ else {
+ $domain_clan{$value}->{domain_cnt} = ++ $domain_cnt;
+ $value = $c_value;
+ $domains{$value} = $domain_cnt;
+ push @domain_list, $pfamA_acc;
+ }
+ }
+ else { # not a clan
+ $domain_clan{$value} = 0;
+ $domains{$value} = ++$domain_cnt;
+ push @domain_list, $pfamA_acc;
+ }
+ }
+ elsif ($domain_clan{$value} && $domain_clan{$value}->{clan_acc}) { # family seen before and in a clan: reuse the clan name
+ if ($pf_acc) {$value = $domain_clan{$value}->{clan_acc};}
+ else { $value = "C." . $domain_clan{$value}->{clan_id}; }
+ }
+ # a virtual domain shares the number of the real name it was derived from
+ if ($is_virtual) {
+ $domains{'@'.$value} = $domains{$value};
+ $value = '@'.$value;
+ }
+ return $value;
+}
+
+sub domain_num {
+ my ($label, $color_ix) = @_;
+ # a leading '@' (virtual domain) is shown as 'v'; other names are
+ # returned unchanged because the substitution does not match them
+# $color_ix = $color_ix."v";
+ $label =~ s/^@/v/;
+ return ($label, $color_ix);
+}
+
+__END__
+
+=pod
+
+=head1 NAME
+
+ann_pfam_e.pl
+
+=head1 SYNOPSIS
+
+ ann_pfam_e.pl --neg-doms 'sp|P09488|GSTM1_HUMAN' | accession.file
+
+=head1 OPTIONS
+
+ -h short help
+ --help include description
+ --no-over : generate non-overlapping domains (equivalent to ann_pfam.pl)
+ --no-clans : do not use clans with multiple families from same clan
+ --neg-doms : report domains between annotated domains as NODOM
+ (also --neg, --neg_doms)
+ --min_nodom=10 : minimum length between domains for NODOM
+
+ --host, --user, --password, --port --db : info for mysql database
+
+=head1 DESCRIPTION
+
+C<ann_pfam_e.pl> extracts domain information
+from the pfam mysql database.
+
+Currently, the program works with database
+sequence descriptions in several formats,
+for example:
+
+ >gi|1705556|sp|P54670.1|CAF1_DICDI
+ >sp|P09488|GSTM1_HUMAN
+ >sp:CALM_HUMAN
+
+C<ann_pfam_e.pl> uses the C<pfamA_reg_full_significant>, C<pfamseq>,
+and C<pfamA> tables of the C<pfam> database to extract domain
+information on a protein.
+
+If the "--no-over" option is set, overlapping domains are selected and
+edited to remove overlaps. For proteins with multiple overlapping
+domains (domains overlap by more than 1/3 of the domain length),
+C<ann_pfam_e.pl> selects the domain annotation with the best
+C<domain_evalue_score>. When domains overlap by less than 1/3 of the
+domain length, they are shortened to remove the overlap.
+
+C<ann_pfam_e.pl> is designed to be used by the B<FASTA> programs with
+the C<-V \!ann_pfam_e.pl> or C<-V "\!ann_pfam_e.pl --neg"> option.
+
+=head1 AUTHOR
+
+William R. Pearson, wrp at virginia.edu
+
+=cut
diff --git a/scripts/ann_pfam28.pl b/scripts/ann_pfam28.pl
new file mode 100755
index 0000000..6a4079b
--- /dev/null
+++ b/scripts/ann_pfam28.pl
@@ -0,0 +1,782 @@
+#!/usr/bin/perl -w
+
+################################################################
+# copyright (c) 2014,2015 by William R. Pearson and The Rector &
+# Visitors of the University of Virginia */
+################################################################
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under this License is distributed on an "AS
+# IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+################################################################
+
+# ann_pfam.pl gets an annotation file from fasta36 -V with a line of the form:
+
+# gi|62822551|sp|P00502|GSTA1_RAT Glutathione S-transfer\n (at least from pir1.lseg)
+#
+# it must:
+# (1) read in the line
+# (2) parse it to get the up_acc
+# (3) return the tab delimited features
+#
+
+# this version only annotates sequences known to Pfam:pfamseq:
+# and only provides domain information
+
+use strict;
+
+use DBI;
+use Getopt::Long;
+use Pod::Usage;
+
+use vars qw($host $db $port $user $pass);
+
+my $hostname = `/bin/hostname`;
+# default connection parameters; replaced by --host, --db, --port, --user, --password when given
+($host, $db, $port, $user, $pass) = ("wrpxdb.its.virginia.edu", "pfam28", 0, "web_user", "fasta_www");
+#$host = 'xdb';
+#$host = 'localhost';
+#$db = 'RPD2_pfam28u';
+# option flags, all off by default
+my ($auto_reg,$rpd2_fams, $neg_doms, $vdoms, $lav, $no_doms, $no_clans, $pf_acc, $acc_comment, $bound_comment, $shelp, $help) =
+ (0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0,);
+my ($no_over, $split_over, $over_fract) = (0, 0, 3.0);
+# separator written between a domain name and its number in the output
+my $color_sep_str = " :";
+$color_sep_str = '~';
+# length thresholds used when adding NODOM regions and virtual domains
+my ($min_nodom, $min_vdom) = (10,10);
+# --over_fract needs a value type ("=f"); without one it is a boolean switch that replaces the 3.0 default with 1
+GetOptions(
+ "host=s" => \$host,
+ "db=s" => \$db,
+ "user=s" => \$user,
+ "password=s" => \$pass,
+ "port=i" => \$port,
+ "lav" => \$lav,
+ "acc_comment" => \$acc_comment,
+ "bound_comment" => \$bound_comment,
+ "no-over" => \$no_over,
+ "no_over" => \$no_over,
+ "split-over" => \$split_over,
+ "split_over" => \$split_over,
+ "over_fract=f" => \$over_fract,
+ "over-fract=f" => \$over_fract,
+ "no-clans" => \$no_clans,
+ "no_clans" => \$no_clans,
+ "neg" => \$neg_doms,
+ "neg_doms" => \$neg_doms,
+ "neg-doms" => \$neg_doms,
+ "min_nodom=i" => \$min_nodom,
+ "vdoms" => \$vdoms,
+ "v_doms" => \$vdoms,
+ "pfacc" => \$pf_acc,
+ "RPD2" => \$rpd2_fams,
+ "auto_reg" => \$auto_reg,
+ "h|?" => \$shelp,
+ "help" => \$help,
+ );
+
+# usage message for -h, full documentation for --help, usage when no input is available
+if ($shelp) { pod2usage(1); }
+if ($help) { pod2usage(exitstatus => 0, verbose => 2); }
+if (!(@ARGV || -p STDIN || -f STDIN)) { pod2usage(1); }
+
+# assemble the DBI data source; host and port are appended only when set
+my $connect = "dbi:mysql(AutoCommit=>1,RaiseError=>1):database=$db";
+if ($host) { $connect .= ";host=$host"; }
+if ($port) { $connect .= ";port=$port"; }
+
+my $dbh = DBI->connect($connect, $user, $pass);
+die $DBI::errstr unless $dbh;
+
+my %annot_types = ();
+my %domains = (NODOM=>0); # reported domain name => domain number; NODOM is number 0
+my %domain_clan = (NODOM => {clan_id => 'NODOM', clan_acc=>0, domain_cnt=>0}); # family name => clan info hash, or 0 when no clan was found
+my @domain_list = (0); # domain number => accession pushed when that number was assigned
+my $domain_cnt = 0; # last domain number handed out by domain_name()
+# name of the region table, interpolated into the queries below
+my $pfamA_reg_full = 'pfamA_reg_full_significant';
+# annotation routine handed to show_annots()
+my $get_annot_sub = \&get_pfam_annots;
+
+my @pfam_fields = qw(seq_start seq_end model_start model_end model_length pfamA_acc pfamA_id auto_pfamA_reg_full domain_evalue_score as evalue length);
+# domains of a sequence selected by pfamseq_acc
+my $get_pfam_acc = $dbh->prepare(<<EOSQL);
+SELECT seq_start, seq_end, model_start, model_end, model_length, pfamA_acc, pfamA_id, auto_pfamA_reg_full, domain_evalue_score as evalue, length
+FROM pfamseq
+JOIN $pfamA_reg_full using(pfamseq_acc)
+JOIN pfamA USING (pfamA_acc)
+WHERE in_full = 1
+AND pfamseq_acc=?
+ORDER BY seq_start
+
+EOSQL
+# domains of a sequence selected by a db='ref' accession, mapped to its 'sp' accession through seqdb_demo2.annot
+my $get_pfam_refacc = $dbh->prepare(<<EOSQL);
+
+SELECT seq_start, seq_end, model_start, model_end, model_length, pfamA_acc, pfamA_id, auto_pfamA_reg_full, domain_evalue_score as evalue, length
+FROM pfamseq
+JOIN $pfamA_reg_full using(pfamseq_acc)
+JOIN pfamA USING (pfamA_acc)
+JOIN seqdb_demo2.annot as sa1 on(sa1.acc=pfamseq_acc and sa1.db='sp')
+JOIN seqdb_demo2.annot as sa2 using(prot_id)
+WHERE in_full = 1
+AND sa2.acc=?
+AND sa2.db='ref'
+ORDER BY seq_start
+
+EOSQL
+
+my $get_annots_sql = $get_pfam_acc; # query in use; show_annots() re-selects it for every line
+# domains of a sequence selected by pfamseq_id
+my $get_pfam_id = $dbh->prepare(<<EOSQL);
+
+SELECT seq_start, seq_end, model_start, model_end, model_length, pfamA_acc, pfamA_id, auto_pfamA_reg_full, domain_evalue_score as evalue, length
+FROM pfamseq
+JOIN $pfamA_reg_full using(pfamseq_acc)
+JOIN pfamA USING (pfamA_acc)
+WHERE in_full=1
+AND pfamseq_id=?
+ORDER BY seq_start
+
+EOSQL
+# clan (if any) of a Pfam family, selected by pfamA_acc
+my $get_pfam_clan = $dbh->prepare(<<EOSQL);
+
+SELECT clan_acc, clan_id
+FROM clan
+JOIN clan_membership using(clan_acc)
+WHERE pfamA_acc=?
+
+EOSQL
+# family/clan pairs from ljm_db.RPD2_final_fams; executed only when --RPD2 is given
+my $get_rpd2_clans = $dbh->prepare(<<EOSQL);
+
+SELECT auto_pfamA, clan
+FROM ljm_db.RPD2_final_fams
+WHERE clan is not NULL
+
+EOSQL
+
+# -- LEFT JOIN clan_membership USING (auto_pfamA)
+# -- LEFT JOIN clans using(auto_clan)
+
+my ($tmp, $gi, $sdb, $acc, $id, $use_acc);
+# the variables above are file-level and are filled in by show_annots() for each description line
+# get the query
+my ($query, $seq_len) = @ARGV;
+$seq_len = 0 unless defined($seq_len);
+
+$query =~ s/^>// if ($query);
+
+my @annots = ();
+# family => clan table, filled only with --RPD2
+my %rpd2_clan_fams = ();
+
+if ($rpd2_fams) {
+ $get_rpd2_clans->execute();
+ my ($auto_pfam, $auto_clan);
+ while (($auto_pfam, $auto_clan)=$get_rpd2_clans->fetchrow_array()) {
+ $rpd2_clan_fams{$auto_pfam} = $auto_clan;
+ }
+}
+
+#if it's a file I can open, read and parse it
+unless ($query && $query =~ m/[\|:]/) {
+ # the first argument has no '|' or ':', so it is not a sequence identifier: read description lines via <>
+ while (my $a_line = <>) {
+ $a_line =~ s/^>//;
+ chomp $a_line;
+ push @annots, show_annots($a_line, $get_annot_sub);
+ }
+}
+else {
+ push @annots, show_annots("$query $seq_len", $get_annot_sub);
+}
+# output: a ">description" line per sequence, then one tab-delimited line per feature
+for my $seq_annot (@annots) {
+ print ">",$seq_annot->{seq_info},"\n";
+ for my $annot (@{$seq_annot->{list}}) {
+ if (!$lav && defined($domains{$annot->[-1]})) { # decorate the name (last field) with its domain number
+ my ($a_name, $a_num) = domain_num($annot->[-1],$domains{$annot->[-1]});
+ $annot->[-1] = $a_name;
+ my $tmp_a_num = $a_num;
+ $tmp_a_num =~ s/v$//; # strip the virtual-domain suffix so the number can index @domain_list
+ if ($acc_comment) {
+ $annot->[-1] .= "{$domain_list[$tmp_a_num]}";
+ }
+ if ($bound_comment) {
+ $annot->[-1] .= $color_sep_str.$annot->[0].":".$annot->[2];
+ }
+ $annot->[-1] .= $color_sep_str.$a_num;
+ }
+ print join("\t",@$annot),"\n";
+ }
+}
+
+exit(0);
+
+sub show_annots {
+ my ($query_len, $get_annot_sub) = @_;
+ # parse one "description<TAB>length" line, run the matching Pfam query, and return {seq_info, list}
+ my ($annot_line, $seq_len) = split(/\t/,$query_len);
+
+ my $pfamA_acc;
+
+ my %annot_data = (seq_info=>$annot_line);
+ # start from the defaults: accession lookup, and no $acc/$id left over from the previous line
+ $use_acc = 1;
+ $get_annots_sql = $get_pfam_acc;
+ ($acc, $id) = (undef, undef);
+ if ($annot_line =~ m/^pf\d+\|/) {
+ ($sdb, $gi, $pfamA_acc, $acc, $id) = split(/\|/,$annot_line);
+# $dbh->do("use RPD2_pfam");
+ }
+ elsif ($annot_line =~ m/^gi\|/) {
+ ($tmp, $gi, $sdb, $acc, $id) = split(/\|/,$annot_line);
+ if ($sdb =~ m/ref/) {
+ $get_annots_sql = $get_pfam_refacc;
+ }
+ }
+ elsif ($annot_line =~ m/^(sp|tr)\|/) {
+ ($sdb, $acc, $id) = split(/\|/,$annot_line);
+ }
+ elsif ($annot_line =~ m/^ref\|/) {
+ ($sdb, $acc) = split(/\|/,$annot_line);
+ $get_annots_sql = $get_pfam_refacc;
+ }
+ elsif ($annot_line =~ m/^(SP|TR):/i) {
+ ($sdb, $id) = split(/:/,$annot_line);
+ $use_acc = 0;
+ }
+ elsif ($annot_line !~ m/\|/) { # new NCBI swissprot format
+ $use_acc =1;
+ $sdb = 'sp';
+ ($acc) = split(/\s+/,$annot_line);
+ }
+
+ # run the query: by sequence id for sp:/tr: names, otherwise by accession with the version suffix removed
+ unless ($use_acc) {
+ $get_annots_sql = $get_pfam_id;
+ $get_annots_sql->execute($id);
+ } else {
+ unless ($acc) {
+ warn "missing acc in $annot_line";
+ return; # empty list: the caller's push adds nothing ('next' is not valid here, there is no loop in this sub)
+ } else {
+ $acc =~ s/\.\d+$//;
+ $get_annots_sql->execute($acc);
+ }
+ }
+
+ $annot_data{list} = $get_annot_sub->($get_annots_sql, $seq_len);
+
+ return \%annot_data;
+}
+
+sub get_pfam_annots {
+ my ($get_annots, $seq_length) = @_;
+ # Build the feature list for one sequence from the rows of an already executed query.
+ $seq_length = 0 unless $seq_length;
+ # $get_annots: executed statement handle; $seq_length: sequence length, or 0/undef to take it from the length column
+ my @pf_domains = ();
+ # returns a reference to a list of [start, end, name] ($lav) or [start, '-', end, name] features
+ # get the list of domains, sorted by start
+
+ # $row_href has: seq_start, seq_end, model_start, model_end, model_length,
+ # pfamA_acc, pfamA_id, auto_pfamA_reg_full,
+ # domain_evalue_score as evalue, length
+
+ while ( my $row_href = $get_annots->fetchrow_hashref()) {
+ if ($auto_reg) {
+ $row_href->{info} = $row_href->{auto_pfamA_reg_full};
+ } elsif ($pf_acc) {
+ $row_href->{info} = $row_href->{pfamA_acc};
+ } else {
+ $row_href->{info} = $row_href->{pfamA_id};
+ }
+ # when no length was supplied, take it from the first row with a non-zero length
+ if ($row_href && $row_href->{length} > $seq_length && $seq_length == 0) {
+ $seq_length = $row_href->{length};
+ }
+ # skip domains that start at or past the end of the sequence, clip those that run past it
+ next if ($row_href->{seq_start} >= $seq_length);
+ if ($row_href->{seq_end} > $seq_length) {
+ $row_href->{seq_end} = $seq_length;
+ }
+
+ push @pf_domains, $row_href
+ }
+
+ # before checking for domain overlap, check for "split-domains"
+ # (self-unbound) by looking for runs of the same domain that are
+ # ordered by model_start
+
+ if (scalar(@pf_domains) > 1) {
+ my @j_domains; #joined domains
+ my @tmp_domains = @pf_domains;
+
+ my $prev_dom = shift(@tmp_domains);
+
+ for my $curr_dom (@tmp_domains) {
+ # to join domains:
+ # (1) the domains must be in order by model_start/end coordinates
+ # (3) joining the domains cannot make the total combination too long
+
+ # check for model and sequence consistency
+ if (($prev_dom->{pfamA_acc} eq $curr_dom->{pfamA_acc}) # same family
+ && $prev_dom->{model_start} < $curr_dom->{model_start} # model check
+ && $prev_dom->{model_end} < $curr_dom->{model_end}
+ # NOTE(review): joined with '||', the next test holds for every model_start when model_end > 0; '&&' may have been intended -- confirm
+ && ($curr_dom->{model_start} > $prev_dom->{model_end} * 0.80 # limit overlap
+ || $curr_dom->{model_start} < $prev_dom->{model_end} * 1.25)
+ && ((($curr_dom->{model_end} - $curr_dom->{model_start}+1)/$curr_dom->{model_length} +
+ ($prev_dom->{model_end} - $prev_dom->{model_start}+1)/$prev_dom->{model_length}) < 1.33)
+ ) { # join them by updating $prev_dom
+ $prev_dom->{seq_end} = $curr_dom->{seq_end};
+ $prev_dom->{model_end} = $curr_dom->{model_end};
+ $prev_dom->{auto_pfamA_reg_full} = $prev_dom->{auto_pfamA_reg_full} . ";". $curr_dom->{auto_pfamA_reg_full};
+ $prev_dom->{evalue} = ($prev_dom->{evalue} < $curr_dom->{evalue} ? $prev_dom->{evalue} : $curr_dom->{evalue});
+ } else {
+ push @j_domains, $prev_dom;
+ $prev_dom = $curr_dom;
+ }
+ }
+ push @j_domains, $prev_dom;
+ @pf_domains = @j_domains;
+
+
+ if ($no_over) { # for either $no_over or $split_over, check for overlapping domains and edit/split them
+
+ my @tmp_domains = @pf_domains; # allow shifts from copy of @pf_domains
+ my @save_domains = (); # where the new domains go
+
+ my $prev_dom = shift @tmp_domains;
+
+ while (my $curr_dom = shift @tmp_domains) {
+
+ my @overlap_domains = ($prev_dom);
+
+ my $diff = $prev_dom->{seq_end} - $curr_dom->{seq_start};
+
+ my ($prev_len, $cur_len) = ($prev_dom->{seq_end}-$prev_dom->{seq_start}+1,
+ $curr_dom->{seq_end}-$curr_dom->{seq_start}+1);
+
+ my $inclusion = ((($curr_dom->{seq_start} >= $prev_dom->{seq_start}) # start is right && end is left
+ && ($curr_dom->{seq_end} <= $prev_dom->{seq_end})) || # -- curr inside prev
+ (($curr_dom->{seq_start} <= $prev_dom->{seq_start}) # start is left && end is right
+ && ($curr_dom->{seq_end} >= $prev_dom->{seq_end}))); # -- prev is inside curr
+
+ my $longer_len = ($prev_len > $cur_len) ? $prev_len : $cur_len;
+
+ # check for overlap > domain_length/$over_fract
+ while ($inclusion || ($diff > 0 && $diff > $longer_len/$over_fract)) {
+ push @overlap_domains, $curr_dom;
+ $curr_dom = shift @tmp_domains;
+ last unless $curr_dom;
+ $diff = $prev_dom->{seq_end} - $curr_dom->{seq_start};
+ ($prev_len, $cur_len) = ($prev_dom->{seq_end}-$prev_dom->{seq_start}+1, $curr_dom->{seq_end}-$curr_dom->{seq_start}+1);
+ $longer_len = ($prev_len > $cur_len) ? $prev_len : $cur_len;
+ $inclusion = ((($curr_dom->{seq_start} >= $prev_dom->{seq_start}) && ($curr_dom->{seq_end} <= $prev_dom->{seq_end})) ||
+ (($curr_dom->{seq_start} <= $prev_dom->{seq_start}) && ($curr_dom->{seq_end} >= $prev_dom->{seq_end})));
+ }
+
+ # check for overlapping domains; >1 because $prev_dom is always there
+ if (scalar(@overlap_domains) > 1 ) {
+ # if $rpd2_fams, check for a chosen one
+
+ for my $dom ( @overlap_domains) {
+ $dom->{evalue} = 1.0 unless defined($dom->{evalue});
+ }
+ # keep the overlapping domain with the smallest evalue
+ @overlap_domains = sort { $a->{evalue} <=> $b->{evalue} } @overlap_domains;
+ $prev_dom = $overlap_domains[0];
+ }
+
+ # $prev_dom should be the best of the overlaps, and we are no longer overlapping > dom_length/3
+ push @save_domains, $prev_dom;
+ $prev_dom = $curr_dom;
+ }
+
+ if ($prev_dom) {
+ push @save_domains, $prev_dom;
+ }
+
+ @pf_domains = @save_domains;
+
+ # now check for smaller overlaps
+ for (my $i=1; $i < scalar(@pf_domains); $i++) {
+ if ($pf_domains[$i-1]->{seq_end} >= $pf_domains[$i]->{seq_start}) {
+ my $overlap = $pf_domains[$i-1]->{seq_end} - $pf_domains[$i]->{seq_start};
+ $pf_domains[$i-1]->{seq_end} -= int($overlap/2);
+ $pf_domains[$i]->{seq_start} = $pf_domains[$i-1]->{seq_end}+1;
+ }
+ }
+ }
+ elsif ($split_over) { # here, everything that overlaps by > $min_vdom should be split into a separate domain
+ my @save_domains = (); # where the new domains go
+
+ # check to see if one domain is included (or overlapping) more
+ # than xx% of the other. If so, pick the longer one
+
+ my ($prev_dom, $curr_dom) = ($pf_domains[0],0) ;
+ for (my $i=1; $i < scalar(@pf_domains); $i++) {
+ $curr_dom = $pf_domains[$i];
+
+ my ($prev_len, $cur_len) = ($prev_dom->{seq_end}-$prev_dom->{seq_start}+1, $curr_dom->{seq_end}-$curr_dom->{seq_start}+1);
+ my $longer_len = ($prev_len > $cur_len) ? $prev_len : $cur_len;
+
+ if (($curr_dom->{seq_start} >= $prev_dom->{seq_start}) && ($curr_dom->{seq_end} <= $prev_dom->{seq_end})
+ && $cur_len / $prev_len > 0.80) {
+ # $prev_dom stays the same, $curr_dom deleted
+ next;
+ }
+ elsif (($curr_dom->{seq_start} <= $prev_dom->{seq_start}) && ($curr_dom->{seq_end} >= $prev_dom->{seq_end})
+ && $prev_len / $cur_len > 0.80) {
+ $prev_dom = $curr_dom; # this should delete $prev_dom
+ next;
+ }
+
+ if ($prev_dom->{seq_end} >= $curr_dom->{seq_start} + $min_vdom) {
+ my ($l_seq_end, $r_seq_start) = ($curr_dom->{seq_start}-1, $prev_dom->{seq_end}+1);
+ # the overlapping stretch becomes its own region, named "<prev>/<curr>"
+ $prev_dom->{seq_end} = $l_seq_end;
+ push @save_domains, $prev_dom;
+ my $new_dom = {seq_start => $l_seq_end+1, seq_end=>$r_seq_start-1,
+ model_length => -1,
+ pfamA_acc=>$prev_dom->{pfamA_acc}."/".$curr_dom->{pfamA_acc},
+ pfamA_id=>$prev_dom->{pfamA_id}."/".$curr_dom->{pfamA_id},
+ };
+
+ if ($pf_acc) {
+ $new_dom->{info} = $new_dom->{pfamA_acc};
+ }
+ else {
+ $new_dom->{info} = $new_dom->{pfamA_id};
+ }
+
+ push @save_domains, $new_dom;
+ $curr_dom->{seq_start} = $r_seq_start;
+ $prev_dom = $curr_dom;
+ }
+ else {
+ push @save_domains, $prev_dom;
+ $prev_dom = $curr_dom;
+ }
+ }
+ push @save_domains, $prev_dom;
+ @pf_domains = @save_domains;
+ }
+ }
+
+ # $vdoms -- virtual Pfam domains -- the equivalent of $neg_doms,
+ # but covering parts of a Pfam model that are not annotated. split
+ # domains have been joined, so simply check beginning and end of
+ # each domain (but must also check for bounded-ness)
+ # only add when the missing length > $min_vdom (the 10% test below is commented out)
+
+ if ($vdoms && scalar(@pf_domains)) {
+ my @vpf_domains;
+
+ my $curr_dom = $pf_domains[0];
+ my $length = $curr_dom->{length};
+
+ my $prev_dom={seq_end=>0, pfamA_acc=>''};
+ my $prev_dom_end = 0;
+ my $next_dom_start = $length+1;
+
+ for (my $dom_ix=0; $dom_ix < scalar(@pf_domains); $dom_ix++ ) {
+ $curr_dom = $pf_domains[$dom_ix];
+
+ my $pfamA = $curr_dom->{pfamA_acc};
+
+ # first, look left, is there a domain there (if there is,
+ # it should be updated right
+
+ # my $min_vdom = $curr_dom->{model_length} / 10;
+
+ if ($curr_dom->{model_length} < $min_vdom) {
+ push @vpf_domains, $curr_dom;
+ next;
+ }
+ if ($prev_dom->{pfamA_acc}) { # look for previous domain
+ $prev_dom_end = $prev_dom->{seq_end};
+ }
+
+ # there is a domain to the left, how much room is available?
+ my $left_dom_len = min($curr_dom->{seq_start}-$prev_dom_end-1, $curr_dom->{model_start}-1);
+ if ( $left_dom_len > $min_vdom) {
+ # there is room for a virtual domain
+ my %new_dom = (seq_start=> $curr_dom->{seq_start}-$left_dom_len,
+ seq_end => $curr_dom->{seq_start}-1,
+ info=>'@'.$curr_dom->{info},
+ model_length=>$curr_dom->{model_length},
+ model_end => $curr_dom->{model_start}-1,
+ model_start => $left_dom_len,
+ pfamA_acc=>$pfamA,
+ );
+ push @vpf_domains, \%new_dom;
+ }
+
+ # save the current domain
+ push @vpf_domains, $curr_dom;
+ $prev_dom = $curr_dom;
+
+ if ($dom_ix < $#pf_domains) { # there is a domain to the right
+ # first, give all the extra space to the first domain (no splitting)
+ $next_dom_start = $pf_domains[$dom_ix+1]->{seq_start};
+ }
+ else {
+ $next_dom_start = $length;
+ }
+ # NOTE(review): the last domain is bounded by $length here, while $next_dom_start was initialized to $length+1 -- confirm which is intended
+ # is there room for a virtual domain right
+
+ my $right_dom_len = min($next_dom_start-$curr_dom->{seq_end}-1, # space available
+ $curr_dom->{model_length}-$curr_dom->{model_end} # space needed
+ );
+ if ( $right_dom_len > $min_vdom) {
+ my %new_dom = (seq_start=> $curr_dom->{seq_end}+1,
+ seq_end=> $curr_dom->{seq_end}+$right_dom_len,
+ info=>'@'.$curr_dom->{info},
+ model_length => $curr_dom->{model_length},
+ pfamA_acc=> $pfamA,
+ );
+ push @vpf_domains, \%new_dom;
+ $prev_dom = \%new_dom;
+ }
+ } # all done, check for last one
+
+ # $curr_dom=$pf_domains[-1];
+ # # my $min_vdom = $curr_dom->{model_length}/10;
+
+ # my $right_dom_len = min($length - $curr_dom->{seq_end}+1, # space available
+ # $curr_dom->{model_length}-$curr_dom->{model_end} # space needed
+ # );
+ # if ($right_dom_len > $min_vdom) {
+ # my %new_dom = (seq_start=> $curr_dom->{seq_end}+1,
+ # seq_end => $curr_dom->{seq_end}+$right_dom_len,
+ # info=>'@'.$curr_dom->{pfamA_acc},
+ # model_len=> $curr_dom->{model_len},
+ # pfamA_acc => $curr_dom->{pfamA_acc},
+ # model_start => $curr_dom->{model_end}+1,
+ # model_end => $curr_dom->{model_len},
+ # );
+
+ # push @vpf_domains, \%new_dom;
+ # }
+
+ # @vpf_domains has both old @pf_domains and new virtual domains
+ @pf_domains = @vpf_domains;
+ }
+
+ if ($neg_doms) {
+ my @npf_domains;
+ my $prev_dom={seq_end=>0};
+ for my $curr_dom ( @pf_domains) {
+ if ($curr_dom->{seq_start} - $prev_dom->{seq_end} > $min_nodom) {
+ my %new_dom = (seq_start=>$prev_dom->{seq_end}+1, seq_end => $curr_dom->{seq_start}-1, info=>'NODOM');
+ push @npf_domains, \%new_dom;
+ }
+ push @npf_domains, $curr_dom;
+ $prev_dom = $curr_dom;
+ }
+ if ($seq_length - $prev_dom->{seq_end} > $min_nodom) {
+ my %new_dom = (seq_start=>$prev_dom->{seq_end}+1, seq_end=>$seq_length, info=>'NODOM');
+ if ($new_dom{seq_end} > $new_dom{seq_start}) {
+ push @npf_domains, \%new_dom;
+ }
+ }
+
+ # @npf_domains has both old @pf_domains and new neg-domains
+ @pf_domains = @npf_domains;
+ }
+
+ # now make sure we have useful names: colors
+
+ for my $pf (@pf_domains) {
+ $pf->{info} = domain_name($pf->{info}, $pf->{pfamA_acc});
+ }
+
+ my @feats = ();
+ for my $d_ref (@pf_domains) {
+ if ($lav) {
+ push @feats, [$d_ref->{seq_start}, $d_ref->{seq_end}, $d_ref->{info}];
+ } else {
+ push @feats, [$d_ref->{seq_start}, '-', $d_ref->{seq_end}, $d_ref->{info} ];
+ # push @feats, [$d_ref->{seq_end}, ']', '-', ""];
+ }
+
+ }
+
+ return \@feats;
+}
+
+sub min {
+ my ($first, $second) = @_;
+ # numeric minimum of two values; the first argument is returned on a tie
+ return $second < $first ? $second : $first;
+}
+
+sub max {
+ my ($first, $second) = @_;
+ # numeric maximum of two values; the first argument is returned on a tie
+ return $second > $first ? $second : $first;
+}
+
+# domain_name($value, $pfamA_acc) maps a domain label to the name that is
+# reported and makes sure that name has a number in %domains. Unless
+# $no_clans is set, a family for which the clan query returns a row is
+# renamed to "C.<clan_id>" (or to the clan_acc when $pf_acc is set), so the
+# families of one clan share a name and number. A leading '@' (virtual
+# domain) is removed for the lookup and restored on the returned name.
+
+sub domain_name {
+ # $value: domain label (info field); $pfamA_acc: accession passed to the clan query and recorded in @domain_list
+ my ($value, $pfamA_acc) = @_;
+ my $is_virtual = 0;
+
+ if ($value =~ m/^@/) {
+ $is_virtual = 1;
+ $value =~ s/^@//;
+ }
+
+ # check for clan:
+ if ($no_clans) {
+ if (! defined($domains{$value})) {
+ $domain_clan{$value} = 0;
+ $domains{$value} = ++$domain_cnt;
+ push @domain_list, $pfamA_acc;
+ }
+ }
+ elsif (!defined($domain_clan{$value})) {
+ ## only do this for new domains, old domains have known mappings
+
+ ## ways to highlight the same domain:
+ # (1) for clans, substitute clan name for family name
+ # (2) for clans, use the same color for the same clan, but don't change the name
+ # (3) for clans, combine family name with clan name, but use colors based on clan
+
+ # check to see if it's a clan
+ $get_pfam_clan->execute($pfamA_acc);
+
+ my $pfam_clan_href=0;
+
+ if ($pfam_clan_href=$get_pfam_clan->fetchrow_hashref()) { # is a clan
+ my ($clan_id, $clan_acc) = @{$pfam_clan_href}{qw(clan_id clan_acc)};
+
+ # now check to see if we have seen this clan before (if so, do not increment $domain_cnt)
+ my $c_value = "C." . $clan_id;
+ if ($pf_acc) {$c_value = $clan_acc;}
+ # remember the clan under the family name so later calls skip the query
+ $domain_clan{$value} = {clan_id => $clan_id,
+ clan_acc => $clan_acc};
+
+ if ($domains{$c_value}) {
+ $domain_clan{$value}->{domain_cnt} = $domains{$c_value};
+ $value = $c_value;
+ }
+ else {
+ $domain_clan{$value}->{domain_cnt} = ++ $domain_cnt;
+ $value = $c_value;
+ $domains{$value} = $domain_cnt;
+ push @domain_list, $pfamA_acc;
+ }
+ }
+ else { # not a clan
+ $domain_clan{$value} = 0;
+ $domains{$value} = ++$domain_cnt;
+ push @domain_list, $pfamA_acc;
+ }
+ }
+ elsif ($domain_clan{$value} && $domain_clan{$value}->{clan_acc}) { # family seen before and in a clan: reuse the clan name
+ if ($pf_acc) {$value = $domain_clan{$value}->{clan_acc};}
+ else { $value = "C." . $domain_clan{$value}->{clan_id}; }
+ }
+ # a virtual domain shares the number of the real name it was derived from
+ if ($is_virtual) {
+ $domains{'@'.$value} = $domains{$value};
+ $value = '@'.$value;
+ }
+ return $value;
+}
+
+sub domain_num {
+ my ($label, $color_ix) = @_;
+ # a leading '@' marks a virtual domain: the name gets a 'v' prefix
+ # instead, and the number gets a 'v' suffix; other names are unchanged
+ my $was_virtual = ($label =~ s/^@/v/);
+ $color_ix = $color_ix."v" if ($was_virtual);
+ return ($label, $color_ix);
+}
+
+
+__END__
+
+=pod
+
+=head1 NAME
+
+ann_pfam28.pl
+
+=head1 SYNOPSIS
+
+ ann_pfam28.pl --neg-doms --vdoms 'sp|P09488|GSTM1_HUMAN' | accession.file
+
+=head1 OPTIONS
+
+ -h short help
+ --help include description
+ --no-over : generate non-overlapping domains (equivalent to ann_pfam.pl)
+ --split-over : overlaps of two domains generate a new hybrid domain
+ --no-clans : do not use clans with multiple families from same clan
+ --neg-doms : report domains between annotated domains as NODOM
+ (also --neg, --neg_doms)
+ --vdoms : produce "virtual domains" using model_start,
+ model_end for partial pfam domains
+ --min_nodom=10 : minimum length between domains for NODOM
+
+ --host, --user, --password, --port --db : info for mysql database
+
+=head1 DESCRIPTION
+
+C<ann_pfam28.pl> extracts domain information from the pfam mysql
+database. Currently, the program works with database
+sequence descriptions in several formats:
+
+ >gi|1705556|sp|P54670.1|CAF1_DICDI
+ >sp|P09488|GSTM1_HUMAN
+ >sp:CALM_HUMAN
+
+C<ann_pfam28.pl> uses the C<pfamA_reg_full_significant>, C<pfamseq>,
+and C<pfamA> tables of the C<pfam> database to extract domain
+information on a protein.
+
+If the C<--no-over> option is set, overlapping domains are selected and
+edited to remove overlaps. For proteins with multiple overlapping
+domains (domains overlap by more than 1/3 of the domain length),
+C<ann_pfam28.pl> selects the domain annotation with the best
+C<domain_evalue_score>. When domains overlap by less than 1/3 of the
+domain length, they are shortened to remove the overlap.
+
+If the C<--split-over> option is set, if two domains overlap, the
+overlapping region is split out of the domains and labeled as a new,
+virtual-like, domain. If one domain is internal to another and spans
+80% of the domain, the shorter domain is removed.
+
+C<ann_pfam28.pl> is designed to be used by the B<FASTA> programs with
+the C<-V \!ann_pfam28.pl> or C<-V "\!ann_pfam28.pl --neg"> option.
+
+=head1 AUTHOR
+
+William R. Pearson, wrp at virginia.edu
+
+=cut
diff --git a/scripts/ann_pfam30.pl b/scripts/ann_pfam30.pl
new file mode 100755
index 0000000..2f894f1
--- /dev/null
+++ b/scripts/ann_pfam30.pl
@@ -0,0 +1,859 @@
+#!/usr/bin/perl -w
+
+################################################################
+# copyright (c) 2014,2015 by William R. Pearson and The Rector &
+# Visitors of the University of Virginia */
+################################################################
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under this License is distributed on an "AS
+# IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+################################################################
+
+# ann_pfam.pl gets an annotation file from fasta36 -V with a line of the form:
+
+# gi|62822551|sp|P00502|GSTA1_RAT Glutathione S-transfer\n (at least from pir1.lseg)
+#
+# it must:
+# (1) read in the line
+# (2) parse it to get the up_acc
+# (3) return the tab delimited features
+#
+
+# this is the first version that works with the new Pfam strategy of
+# separating Uniprot reference sequences from the rest of uniprot. as
+# a result, it is possible that 2 SQL queries will be required, one to
+# pfamA_reg_full_significant and a second to uniprot_reg_full.
+
+# modified 15-Jan-2017 to reduce the number of calls when the same
+# accession is present multiple times. Accessions are saved in a hash
+# that ensures uniqueness. (Could also speed things up by creating temporary table.)
+#
+
+
+use strict;
+
+use DBI;
+use Getopt::Long;
+use Pod::Usage;
+
+# database connection settings; declared as package globals
+use vars qw($host $db $port $user $pass);
+
+# host name of the machine running the script; not referenced again in this script
+my $hostname = `/bin/hostname`;
+
+# default connection settings, overridden by --host, --db, --port, --user, --password
+($host, $db, $port, $user, $pass) = ("wrpxdb.its.virginia.edu", "pfam31", 0, "web_user", "fasta_www");
+#$host = 'xdb';
+#$host = 'localhost';
+#$db = 'RPD2_pfam28u';
+
+# option flags, all off by default
+my ($auto_reg,$rpd2_fams, $neg_doms, $vdoms, $lav, $no_doms, $no_clans, $pf_acc, $acc_comment, $bound_comment, $shelp, $help) =
+ (0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0,);
+# overlap handling: $over_fract is the divisor in the --no-over test
+# (overlap > longer domain length / $over_fract)
+my ($no_over, $split_over, $over_fract) = (0, 0, 3.0);
+
+# separator placed between a domain label and its color number (or its
+# boundaries with --bound_comment); the initial " :" is replaced by '~'
+my ($color_sep_str, $show_color) = (" :",1);
+$color_sep_str = '~';
+
+# minimum lengths for reporting a NODOM region / a virtual domain
+my ($min_nodom, $min_vdom) = (10,10);
+
+# Parse the command line.  Hyphen and underscore spellings of the same
+# option share one target variable.
+GetOptions(
+    "host=s" => \$host,
+    "db=s" => \$db,
+    "user=s" => \$user,
+    "password=s" => \$pass,
+    "port=i" => \$port,
+    "lav" => \$lav,
+    "acc_comment" => \$acc_comment,
+    "bound_comment" => \$bound_comment,
+    "color!" => \$show_color,
+    "no-over" => \$no_over,
+    "no_over" => \$no_over,
+    "split-over" => \$split_over,
+    "split_over" => \$split_over,
+    # $over_fract is a numeric divisor (default 3.0), so the option needs a
+    # value.  Declared without a type it was parsed as a bare flag:
+    # "--over_fract 2" set $over_fract to 1 and left "2" in @ARGV.
+    "over_fract=f" => \$over_fract,
+    "over-fract=f" => \$over_fract,
+    "no-clans" => \$no_clans,
+    "no_clans" => \$no_clans,
+    "neg" => \$neg_doms,
+    "neg_doms" => \$neg_doms,
+    "neg-doms" => \$neg_doms,
+    "min_nodom=i" => \$min_nodom,
+    "vdoms" => \$vdoms,
+    "v_doms" => \$vdoms,
+    "pfacc" => \$pf_acc,
+    "RPD2" => \$rpd2_fams,
+    "auto_reg" => \$auto_reg,
+    "h|?" => \$shelp,
+    "help" => \$help,
+    );
+
+# -h / -?  : usage message
+# --help   : full documentation (verbose => 2), exit status 0
+# otherwise a usage message unless there is an argument or STDIN is a pipe
+# or a plain file
+pod2usage(1) if $shelp;
+pod2usage(exitstatus => 0, verbose => 2) if $help;
+pod2usage(1) unless (@ARGV || -p STDIN || -f STDIN);
+
+# build the DBI data source name; host and port are appended only when they
+# are set (a $port of 0 leaves the port out)
+my $connect = "dbi:mysql(AutoCommit=>1,RaiseError=>1):database=$db";
+$connect .= ";host=$host" if $host;
+$connect .= ";port=$port" if $port;
+
+# connect, or stop with the DBI error message
+my $dbh = DBI->connect($connect,
+ $user,
+ $pass
+ ) or die $DBI::errstr;
+
+my %annot_types = ();
+# %domains: domain/clan label -> color number; NODOM is pre-assigned 0
+my %domains = (NODOM=>0);
+# %domain_clan: domain label -> clan info hash ref, or 0 when the family
+# has no clan (filled in by domain_name())
+my %domain_clan = (NODOM => {clan_id => 'NODOM', clan_acc=>0, domain_cnt=>0});
+# @domain_list: pfamA_acc for each color number; slot 0 is a placeholder
+my @domain_list = (0);
+# last color number handed out
+my $domain_cnt = 0;
+
+# table names; $pfamA_reg_full is interpolated into some of the SQL below
+my $pfamA_reg_full = 'pfamA_reg_full_significant';
+my $uniprot_reg_full = 'uniprot_reg_full';
+
+# routine that turns an executed statement handle into a feature list;
+# passed to show_annots()
+my $get_annot_sub = \&get_pfam_annots;
+
+# column name lists; not referenced elsewhere in this script
+my @pfam_fields = qw(seq_start seq_end model_start model_end model_length pfamA_acc pfamA_id auto_pfamA_reg_full domain_evalue_score as evalue length);
+my @upfam_fields = qw(seq_start seq_end model_start model_end model_length pfamA_acc pfamA_id auto_uniprot_reg_full domain_evalue_score as evalue length);
+
+# The four statements below return the same column set, ordered by
+# seq_start; the uniprot versions alias auto_uniprot_reg_full to
+# auto_pfamA_reg_full so the rows can be handled the same way.
+
+# domains for a pfamseq accession
+my $get_pfam_acc = $dbh->prepare(<<EOSQL);
+SELECT seq_start, seq_end, model_start, model_end, model_length, pfamA_acc, pfamA_id, auto_pfamA_reg_full, domain_evalue_score as evalue, length
+FROM pfamseq
+JOIN pfamA_reg_full_significant using(pfamseq_acc)
+JOIN pfamA USING (pfamA_acc)
+WHERE in_full = 1
+AND pfamseq_acc=?
+ORDER BY seq_start
+
+EOSQL
+
+# domains for a uniprot accession
+my $get_upfam_acc = $dbh->prepare(<<EOSQL);
+SELECT seq_start, seq_end, model_start, model_end, model_length, pfamA_acc, pfamA_id, auto_uniprot_reg_full as auto_pfamA_reg_full, domain_evalue_score as evalue, length
+FROM uniprot
+JOIN uniprot_reg_full using(uniprot_acc)
+JOIN pfamA USING (pfamA_acc)
+WHERE in_full = 1
+AND uniprot_acc=?
+ORDER BY seq_start
+
+EOSQL
+
+# domains for a refseq accession, mapped to pfamseq through uniprot.refseq2up
+my $get_pfam_refacc = $dbh->prepare(<<EOSQL);
+SELECT seq_start, seq_end, model_start, model_end, model_length, pfamA_acc, pfamA_id, auto_pfamA_reg_full, domain_evalue_score as evalue, length
+ FROM $pfamA_reg_full
+ JOIN pfamseq using(pfamseq_acc)
+ JOIN pfamA USING (pfamA_acc)
+ JOIN uniprot.refseq2up as rf2up on(rf2up.up_acc=pfamseq_acc)
+ WHERE in_full = 1
+ AND rf2up.refseq_acc=?
+ ORDER BY seq_start
+
+EOSQL
+
+# domains for a refseq accession, mapped to uniprot through uniprot.refseq2up
+my $get_upfam_refacc = $dbh->prepare(<<EOSQL);
+SELECT seq_start, seq_end, model_start, model_end, model_length, pfamA_acc, pfamA_id, auto_uniprot_reg_full as auto_pfamA_reg_full, domain_evalue_score as evalue, length
+FROM uniprot
+JOIN uniprot_reg_full using(uniprot_acc)
+JOIN pfamA USING (pfamA_acc)
+JOIN uniprot.refseq2up as rf2up on(rf2up.up_acc=uniprot_acc)
+WHERE in_full = 1
+AND refseq_acc=?
+ORDER BY seq_start
+
+EOSQL
+
+# statement handle used by show_annots(); starts out as the pfamseq
+# accession lookup
+my $get_annots_sql = $get_pfam_acc;
+
+# domains for a pfamseq identifier (pfamseq_id) rather than an accession
+my $get_pfam_id = $dbh->prepare(<<EOSQL);
+SELECT seq_start, seq_end, model_start, model_end, model_length, pfamA_acc, pfamA_id, auto_pfamA_reg_full, domain_evalue_score as evalue, length
+FROM pfamseq
+JOIN $pfamA_reg_full using(pfamseq_acc)
+JOIN pfamA USING (pfamA_acc)
+WHERE in_full=1
+AND pfamseq_id=?
+ORDER BY seq_start
+
+EOSQL
+
+# domains for a uniprot identifier (uniprot_id).  The join column is
+# uniprot_acc, as in the other uniprot/uniprot_reg_full statements in this
+# script; it previously named pfamseq_acc.
+my $get_upfam_id = $dbh->prepare(<<EOSQL);
+SELECT seq_start, seq_end, model_start, model_end, model_length, pfamA_acc, pfamA_id, auto_uniprot_reg_full as auto_pfamA_reg_full, domain_evalue_score as evalue, length
+FROM uniprot
+JOIN uniprot_reg_full using(uniprot_acc)
+JOIN pfamA USING (pfamA_acc)
+WHERE in_full=1
+AND uniprot_id=?
+ORDER BY seq_start
+
+EOSQL
+
+# clan accession and id for one Pfam family accession; used by domain_name()
+my $get_pfam_clan = $dbh->prepare(<<EOSQL);
+
+SELECT clan_acc, clan_id
+FROM clan
+JOIN clan_membership using(clan_acc)
+WHERE pfamA_acc=?
+
+EOSQL
+
+# family -> clan pairs from ljm_db.RPD2_final_fams; executed only with --RPD2
+my $get_rpd2_clans = $dbh->prepare(<<EOSQL);
+
+SELECT auto_pfamA, clan
+FROM ljm_db.RPD2_final_fams
+WHERE clan is not NULL
+
+EOSQL
+
+# -- LEFT JOIN clan_membership USING (auto_pfamA)
+# -- LEFT JOIN clans using(auto_clan)
+
+# file-level scratch variables filled in by show_annots() while it parses
+# an identifier line
+my ($tmp, $gi, $sdb, $acc, $id, $use_acc);
+
+# get the query
+# first argument: query identifier or file name; second (optional): sequence length
+my ($query, $seq_len) = @ARGV;
+$seq_len = 0 unless defined($seq_len);
+
+$query =~ s/^>// if ($query);
+
+# @annots: %annot_set keys in output order
+# %annot_set: key -> { seq_info => identifier line, list => feature rows }
+my @annots = ();
+my %annot_set = ();
+
+# family -> clan map loaded with --RPD2; not referenced elsewhere in this script
+my %rpd2_clan_fams = ();
+
+if ($rpd2_fams) {
+ $get_rpd2_clans->execute();
+ my ($auto_pfam, $auto_clan);
+ while (($auto_pfam, $auto_clan)=$get_rpd2_clans->fetchrow_array()) {
+ $rpd2_clan_fams{$auto_pfam} = $auto_clan;
+ }
+}
+
+#if it's a file I can open, read and parse it
+# The argument is treated as a single identifier when it contains '|' or
+# ':', starts with NP_/XP_, or matches the accession pattern below;
+# otherwise identifier lines are read with <> (files named in @ARGV, or STDIN).
+# NOTE(review): in the last pattern '^' applies only to the first
+# alternative and '\s' only to the second -- confirm that is intended.
+unless ($query && ($query =~ m/[\|:]/ ||
+ $query =~ m/^[NX]P_/ ||
+ $query =~ m/^[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}\s/)) {
+
+ while (my $a_line = <>) {
+ $a_line =~ s/^>//;
+ chomp $a_line;
+ push @annots, show_annots($a_line, $get_annot_sub);
+ }
+}
+else {
+ push @annots, show_annots("$query\t$seq_len", $get_annot_sub);
+}
+
+# output: a ">identifier" line followed by one tab-separated line per feature
+for my $seq_annot (@annots) {
+ # show_annots() returns an empty key for lines it could not use
+ next unless $seq_annot;
+ my $annot_r = $annot_set{$seq_annot};
+ print ">",$annot_r->{seq_info},"\n";
+ for my $annot (@{$annot_r->{list}}) {
+ # the last field is the domain label; without --lav, labels that have a
+ # color number in %domains get decorated in place
+ if (!$lav && defined($domains{$annot->[-1]})) {
+ my ($a_name, $a_num) = domain_num($annot->[-1],$domains{$annot->[-1]});
+ $annot->[-1] = $a_name;
+ # virtual domains carry a trailing 'v' on the number; strip it to index @domain_list
+ my $tmp_a_num = $a_num;
+ $tmp_a_num =~ s/v$//;
+ if ($acc_comment) {
+ $annot->[-1] .= "{$domain_list[$tmp_a_num]}";
+ }
+ # --bound_comment appends start:end, otherwise the color number is appended
+ if ($bound_comment) {
+ $annot->[-1] .= $color_sep_str.$annot->[0].":".$annot->[2];
+ }
+ elsif ($show_color) {
+ $annot->[-1] .= $color_sep_str.$a_num;
+ }
+ }
+ print join("\t",@$annot),"\n";
+ }
+}
+
+exit(0);
+
+# show_annots($query_len, $get_annot_sub)
+#
+# Looks up the annotations for one identifier line and stores them in
+# %annot_set.
+#   $query_len     - "identifier<TAB>length"; the length part may be absent
+#   $get_annot_sub - code ref, called as $get_annot_sub->($sth, $seq_len)
+#                    once the query has been executed
+# Returns the %annot_set key (accession or id) for the line, or '' when the
+# line has no usable accession or its id has already been processed.
+# Side effects: sets the file-level variables $use_acc, $get_annots_sql,
+# $sdb, $gi, $acc, $id and $tmp.
+sub show_annots {
+    my ($query_len, $get_annot_sub) = @_;
+
+    my ($annot_line, $seq_len) = split(/\t/,$query_len);
+
+    my $pfamA_acc;
+
+    $use_acc = 1;
+    $get_annots_sql = $get_pfam_acc;
+
+    # second statement, executed only when the first one returns no rows
+    my $get_annots_sql_u = $get_upfam_acc;
+
+    # pick the accession or id out of the identifier line
+    if ($annot_line =~ m/^pf\d+\|/) {
+        ($sdb, $gi, $pfamA_acc, $acc, $id) = split(/\|/,$annot_line);
+#       $dbh->do("use RPD2_pfam");
+    }
+    elsif ($annot_line =~ m/^gi\|/) {
+        ($tmp, $gi, $sdb, $acc, $id) = split(/\|/,$annot_line);
+        if ($sdb =~ m/ref/) {
+            $get_annots_sql = $get_pfam_refacc;
+            $get_annots_sql_u = $get_upfam_refacc;
+        }
+    }
+    elsif ($annot_line =~ m/^(sp|tr|up)\|/) {
+        ($sdb, $acc, $id) = split(/\|/,$annot_line);
+    }
+    elsif ($annot_line =~ m/^ref\|/) {
+        ($sdb, $acc) = split(/\|/,$annot_line);
+        $get_annots_sql = $get_pfam_refacc;
+        $get_annots_sql_u = $get_upfam_refacc;
+    }
+    elsif ($annot_line =~ m/^(SP|TR):/i) {
+        ($sdb, $id) = split(/:/,$annot_line);
+        $use_acc = 0;
+    }
+    elsif ($annot_line !~ m/\|/ && $annot_line !~ m/:/) {
+        $use_acc = 1;
+        ($acc) = split(/\s+/,$annot_line);
+    }
+    # deal with no-database SwissProt/NR
+    else {
+        ($acc)=($annot_line =~ /^(\S+)/);
+    }
+
+    # here we have an $acc or an $id: check to see if we have the data
+
+    my %annot_data = (seq_info=>$annot_line);
+    my $annot_key = '';
+
+    unless ($use_acc) {
+        # Duplicate id: nothing new to report.  Return '' (the output loop
+        # skips empty keys) rather than "next", which left this subroutine
+        # through the caller's loop and is a fatal error when there is no
+        # enclosing loop.
+        return '' if ($annot_set{$id});
+
+        $annot_set{$id} = \%annot_data;
+        $annot_key = $id;
+
+        $get_annots_sql = $get_pfam_id;
+        $get_annots_sql->execute($id);
+        unless ($get_annots_sql->rows()) {
+            # NOTE(review): this fallback is an accession-keyed statement
+            # executed with an id; $get_upfam_id looks like the intended
+            # statement -- confirm before changing.
+            $get_annots_sql = $get_annots_sql_u;
+            $get_annots_sql->execute($id);
+        }
+    }
+    else {
+        unless ($acc) {
+            warn "missing acc in $annot_line";
+            return "";
+        }
+
+        # drop a trailing ".<digits>" version suffix
+        $acc =~ s/\.\d+$//;
+        $annot_key = $acc;
+
+        # accession already looked up: hand back the existing key
+        return $annot_key if ($annot_set{$acc});
+
+        $annot_set{$acc} = \%annot_data;
+
+        $get_annots_sql->execute($acc);
+        unless ($get_annots_sql->rows()) {
+            $get_annots_sql = $get_annots_sql_u;
+            $get_annots_sql->execute($acc);
+        }
+    }
+
+    $annot_data{list} = $get_annot_sub->($get_annots_sql, $seq_len);
+
+    return $annot_key;
+}
+
+# get_pfam_annots($get_annots, $seq_length)
+#
+# Reads the domain rows from an already-executed statement handle and
+# returns a reference to a list of feature rows.
+#   $get_annots - executed statement handle, read with fetchrow_hashref()
+#   $seq_length - sequence length; when 0 or undefined it is taken from the
+#                 {length} column of the fetched rows
+# Returns: array ref of [seq_start, seq_end, info] rows with --lav,
+#          otherwise [seq_start, '-', seq_end, info] rows.
+# Steps, in order: choose the label and clip to $seq_length, join split
+# domains, handle overlaps (--no-over or --split-over), add virtual domains
+# (--vdoms), add NODOM regions (--neg), assign labels with domain_name().
+# Reads the option variables $auto_reg, $pf_acc, $no_over, $split_over,
+# $over_fract, $vdoms, $neg_doms, $min_vdom, $min_nodom and $lav.
+sub get_pfam_annots {
+ my ($get_annots, $seq_length) = @_;
+
+ $seq_length = 0 unless $seq_length;
+
+ my @pf_domains = ();
+
+ # get the list of domains, sorted by start
+
+ # $row_href has: seq_start, seq_end, model_start, model_end, model_length,
+ # pfamA_acc, pfamA_id, auto_pfamA_reg_full,
+ # domain_evalue_score as evalue, length
+
+ while ( my $row_href = $get_annots->fetchrow_hashref()) {
+ # {info} is the label that will be reported for the domain
+ if ($auto_reg) {
+ $row_href->{info} = $row_href->{auto_pfamA_reg_full};
+ } elsif ($pf_acc) {
+ $row_href->{info} = $row_href->{pfamA_acc};
+ } else {
+ $row_href->{info} = $row_href->{pfamA_id};
+ }
+
+ # no length supplied by the caller: use the one stored with the row
+ if ($row_href && $row_href->{length} > $seq_length && $seq_length == 0) {
+ $seq_length = $row_href->{length};
+ }
+
+ # skip domains that start at or beyond the sequence end; clip the rest
+ next if ($row_href->{seq_start} >= $seq_length);
+ if ($row_href->{seq_end} > $seq_length) {
+ $row_href->{seq_end} = $seq_length;
+ }
+
+ push @pf_domains, $row_href
+ }
+
+ # before checking for domain overlap, check for "split-domains"
+ # (self-unbound) by looking for runs of the same domain that are
+ # ordered by model_start
+
+ if (scalar(@pf_domains) > 1) {
+ my @j_domains; #joined domains
+ my @tmp_domains = @pf_domains;
+
+ my $prev_dom = shift(@tmp_domains);
+
+ for my $curr_dom (@tmp_domains) {
+ # to join domains:
+ # (1) the domains must be in order by model_start/end coordinates
+ # (3) joining the domains cannot make the total combination too long
+
+ # check for model and sequence consistency
+ # NOTE(review): the two "limit overlap" comparisons are combined with
+ # '||'; for positive coordinates one of them always holds, so this
+ # sub-condition does not restrict anything -- confirm whether '&&'
+ # was intended.
+ # The last condition requires the summed model coverage of the two
+ # pieces to be below 1.33 model lengths.
+ if (($prev_dom->{pfamA_acc} eq $curr_dom->{pfamA_acc}) # same family
+ && $prev_dom->{model_start} < $curr_dom->{model_start} # model check
+ && $prev_dom->{model_end} < $curr_dom->{model_end}
+
+ && ($curr_dom->{model_start} > $prev_dom->{model_end} * 0.80 # limit overlap
+ || $curr_dom->{model_start} < $prev_dom->{model_end} * 1.25)
+ && ((($curr_dom->{model_end} - $curr_dom->{model_start}+1)/$curr_dom->{model_length} +
+ ($prev_dom->{model_end} - $prev_dom->{model_start}+1)/$prev_dom->{model_length}) < 1.33)
+ ) { # join them by updating $prev_dom
+ $prev_dom->{seq_end} = $curr_dom->{seq_end};
+ $prev_dom->{model_end} = $curr_dom->{model_end};
+ $prev_dom->{auto_pfamA_reg_full} = $prev_dom->{auto_pfamA_reg_full} . ";". $curr_dom->{auto_pfamA_reg_full};
+ # the joined domain keeps the smaller of the two evalues
+ $prev_dom->{evalue} = ($prev_dom->{evalue} < $curr_dom->{evalue} ? $prev_dom->{evalue} : $curr_dom->{evalue});
+ } else {
+ push @j_domains, $prev_dom;
+ $prev_dom = $curr_dom;
+ }
+ }
+ push @j_domains, $prev_dom;
+ @pf_domains = @j_domains;
+
+
+ if ($no_over) { # for either $no_over or $split_over, check for overlapping domains and edit/split them
+
+ my @tmp_domains = @pf_domains; # allow shifts from copy of @pf_domains
+ my @save_domains = (); # where the new domains go
+
+ my $prev_dom = shift @tmp_domains;
+
+ while (my $curr_dom = shift @tmp_domains) {
+
+ # domains competing with $prev_dom; the one with the smallest evalue is kept
+ my @overlap_domains = ($prev_dom);
+
+ # positive $diff: $prev_dom runs past the start of $curr_dom
+ my $diff = $prev_dom->{seq_end} - $curr_dom->{seq_start};
+
+ my ($prev_len, $cur_len) = ($prev_dom->{seq_end}-$prev_dom->{seq_start}+1,
+ $curr_dom->{seq_end}-$curr_dom->{seq_start}+1);
+
+ my $inclusion = ((($curr_dom->{seq_start} >= $prev_dom->{seq_start}) # start is right && end is left
+ && ($curr_dom->{seq_end} <= $prev_dom->{seq_end})) || # -- curr inside prev
+ (($curr_dom->{seq_start} <= $prev_dom->{seq_start}) # start is left && end is right
+ && ($curr_dom->{seq_end} >= $prev_dom->{seq_end}))); # -- prev is inside curr
+
+ my $longer_len = ($prev_len > $cur_len) ? $prev_len : $cur_len;
+
+ # check for overlap > domain_length/$over_fract
+ while ($inclusion || ($diff > 0 && $diff > $longer_len/$over_fract)) {
+ push @overlap_domains, $curr_dom;
+ $curr_dom = shift @tmp_domains;
+ last unless $curr_dom;
+ $diff = $prev_dom->{seq_end} - $curr_dom->{seq_start};
+ ($prev_len, $cur_len) = ($prev_dom->{seq_end}-$prev_dom->{seq_start}+1, $curr_dom->{seq_end}-$curr_dom->{seq_start}+1);
+ $longer_len = ($prev_len > $cur_len) ? $prev_len : $cur_len;
+ $inclusion = ((($curr_dom->{seq_start} >= $prev_dom->{seq_start}) && ($curr_dom->{seq_end} <= $prev_dom->{seq_end})) ||
+ (($curr_dom->{seq_start} <= $prev_dom->{seq_start}) && ($curr_dom->{seq_end} >= $prev_dom->{seq_end})));
+ }
+
+ # check for overlapping domains; >1 because $prev_dom is always there
+ if (scalar(@overlap_domains) > 1 ) {
+ # if $rpd2_fams, check for a chosen one
+
+ # domains without an evalue are ranked as evalue 1.0
+ for my $dom ( @overlap_domains) {
+ $dom->{evalue} = 1.0 unless defined($dom->{evalue});
+ }
+
+ @overlap_domains = sort { $a->{evalue} <=> $b->{evalue} } @overlap_domains;
+ $prev_dom = $overlap_domains[0];
+ }
+
+ # $prev_dom should be the best of the overlaps, and we are no longer overlapping > dom_length/3
+ push @save_domains, $prev_dom;
+ $prev_dom = $curr_dom;
+ }
+
+ if ($prev_dom) {
+ push @save_domains, $prev_dom;
+ }
+
+ @pf_domains = @save_domains;
+
+ # now check for smaller overlaps
+ # the left domain is shortened by half the overlap and the right
+ # domain starts immediately after it
+ for (my $i=1; $i < scalar(@pf_domains); $i++) {
+ if ($pf_domains[$i-1]->{seq_end} >= $pf_domains[$i]->{seq_start}) {
+ my $overlap = $pf_domains[$i-1]->{seq_end} - $pf_domains[$i]->{seq_start};
+ $pf_domains[$i-1]->{seq_end} -= int($overlap/2);
+ $pf_domains[$i]->{seq_start} = $pf_domains[$i-1]->{seq_end}+1;
+ }
+ }
+ }
+ elsif ($split_over) { # here, everything that overlaps by > $min_vdom should be split into a separate domain
+ my @save_domains = (); # where the new domains go
+
+ # check to see if one domain is included (or overlapping) more
+ # than xx% of the other. If so, pick the longer one
+
+ my ($prev_dom, $curr_dom) = ($pf_domains[0],0) ;
+ for (my $i=1; $i < scalar(@pf_domains); $i++) {
+ $curr_dom = $pf_domains[$i];
+
+ my ($prev_len, $cur_len) = ($prev_dom->{seq_end}-$prev_dom->{seq_start}+1, $curr_dom->{seq_end}-$curr_dom->{seq_start}+1);
+ my $longer_len = ($prev_len > $cur_len) ? $prev_len : $cur_len;
+
+ if (($curr_dom->{seq_start} >= $prev_dom->{seq_start}) && ($curr_dom->{seq_end} <= $prev_dom->{seq_end})
+ && $cur_len / $prev_len > 0.80) {
+ # $prev_dom stays the same, $curr_dom deleted
+ next;
+ }
+ elsif (($curr_dom->{seq_start} <= $prev_dom->{seq_start}) && ($curr_dom->{seq_end} >= $prev_dom->{seq_end})
+ && $prev_len / $cur_len > 0.80) {
+ $prev_dom = $curr_dom; # this should delete $prev_dom
+ next;
+ }
+
+ # overlap of at least $min_vdom: cut the shared region out of both
+ # domains and report it as a new "prev/curr" domain
+ if ($prev_dom->{seq_end} >= $curr_dom->{seq_start} + $min_vdom) {
+ my ($l_seq_end, $r_seq_start) = ($curr_dom->{seq_start}-1, $prev_dom->{seq_end}+1);
+
+ $prev_dom->{seq_end} = $l_seq_end;
+ push @save_domains, $prev_dom;
+ my $new_dom = {seq_start => $l_seq_end+1, seq_end=>$r_seq_start-1,
+ model_length => -1,
+ pfamA_acc=>$prev_dom->{pfamA_acc}."/".$curr_dom->{pfamA_acc},
+ pfamA_id=>$prev_dom->{pfamA_id}."/".$curr_dom->{pfamA_id},
+ };
+
+ if ($pf_acc) {
+ $new_dom->{info} = $new_dom->{pfamA_acc};
+ }
+ else {
+ $new_dom->{info} = $new_dom->{pfamA_id};
+ }
+
+ push @save_domains, $new_dom;
+ $curr_dom->{seq_start} = $r_seq_start;
+ $prev_dom = $curr_dom;
+ }
+ else {
+ push @save_domains, $prev_dom;
+ $prev_dom = $curr_dom;
+ }
+ }
+ push @save_domains, $prev_dom;
+ @pf_domains = @save_domains;
+ }
+ }
+
+ # $vdoms -- virtual Pfam domains -- the equivalent of $neg_doms,
+ # but covering parts of a Pfam model that are not annotated. split
+ # domains have been joined, so simply check beginning and end of
+ # each domain (but must also check for bounded-ness)
+ # only add when 10% or more is missing and missing length > $min_nodom
+
+ if ($vdoms && scalar(@pf_domains)) {
+ my @vpf_domains;
+
+ my $curr_dom = $pf_domains[0];
+ my $length = $curr_dom->{length};
+
+ my $prev_dom={seq_end=>0, pfamA_acc=>''};
+ my $prev_dom_end = 0;
+ my $next_dom_start = $length+1;
+
+ for (my $dom_ix=0; $dom_ix < scalar(@pf_domains); $dom_ix++ ) {
+ $curr_dom = $pf_domains[$dom_ix];
+
+ my $pfamA = $curr_dom->{pfamA_acc};
+
+ # first, look left, is there a domain there (if there is,
+ # it should be updated right
+
+ # my $min_vdom = $curr_dom->{model_length} / 10;
+
+ # domains whose model_length is below $min_vdom are passed through
+ # unchanged (this includes the hybrid domains made by --split-over,
+ # which have model_length -1)
+ if ($curr_dom->{model_length} < $min_vdom) {
+ push @vpf_domains, $curr_dom;
+ next;
+ }
+ if ($prev_dom->{pfamA_acc}) { # look for previous domain
+ $prev_dom_end = $prev_dom->{seq_end};
+ }
+
+ # there is a domain to the left, how much room is available?
+ # the smaller of the free sequence to the left and the unmatched
+ # start of the model
+ my $left_dom_len = min($curr_dom->{seq_start}-$prev_dom_end-1, $curr_dom->{model_start}-1);
+ if ( $left_dom_len > $min_vdom) {
+ # there is room for a virtual domain
+ my %new_dom = (seq_start=> $curr_dom->{seq_start}-$left_dom_len,
+ seq_end => $curr_dom->{seq_start}-1,
+ info=>'@'.$curr_dom->{info},
+ model_length=>$curr_dom->{model_length},
+ model_end => $curr_dom->{model_start}-1,
+ model_start => $left_dom_len,
+ pfamA_acc=>$pfamA,
+ );
+ push @vpf_domains, \%new_dom;
+ }
+
+ # save the current domain
+ push @vpf_domains, $curr_dom;
+ $prev_dom = $curr_dom;
+
+ if ($dom_ix < $#pf_domains) { # there is a domain to the right
+ # first, give all the extra space to the first domain (no splitting)
+ $next_dom_start = $pf_domains[$dom_ix+1]->{seq_start};
+ }
+ else {
+ $next_dom_start = $length;
+ }
+
+ # is there room for a virtual domain right
+
+ my $right_dom_len = min($next_dom_start-$curr_dom->{seq_end}-1, # space available
+ $curr_dom->{model_length}-$curr_dom->{model_end} # space needed
+ );
+ if ( $right_dom_len > $min_vdom) {
+ my %new_dom = (seq_start=> $curr_dom->{seq_end}+1,
+ seq_end=> $curr_dom->{seq_end}+$right_dom_len,
+ info=>'@'.$curr_dom->{info},
+ model_length => $curr_dom->{model_length},
+ pfamA_acc=> $pfamA,
+ );
+ push @vpf_domains, \%new_dom;
+ $prev_dom = \%new_dom;
+ }
+ } # all done, check for last one
+
+ # $curr_dom=$pf_domains[-1];
+ # # my $min_vdom = $curr_dom->{model_length}/10;
+
+ # my $right_dom_len = min($length - $curr_dom->{seq_end}+1, # space available
+ # $curr_dom->{model_length}-$curr_dom->{model_end} # space needed
+ # );
+ # if ($right_dom_len > $min_vdom) {
+ # my %new_dom = (seq_start=> $curr_dom->{seq_end}+1,
+ # seq_end => $curr_dom->{seq_end}+$right_dom_len,
+ # info=>'@'.$curr_dom->{pfamA_acc},
+ # model_len=> $curr_dom->{model_len},
+ # pfamA_acc => $curr_dom->{pfamA_acc},
+ # model_start => $curr_dom->{model_end}+1,
+ # model_end => $curr_dom->{model_len},
+ # );
+
+ # push @vpf_domains, \%new_dom;
+ # }
+
+ # @vpf_domains has both old @pf_domains and new neg-domains
+ @pf_domains = @vpf_domains;
+ }
+
+ # --neg: add a NODOM feature for every gap longer than $min_nodom, both
+ # between domains and after the last one
+ if ($neg_doms) {
+ my @npf_domains;
+ my $prev_dom={seq_end=>0};
+ for my $curr_dom ( @pf_domains) {
+ if ($curr_dom->{seq_start} - $prev_dom->{seq_end} > $min_nodom) {
+ my %new_dom = (seq_start=>$prev_dom->{seq_end}+1, seq_end => $curr_dom->{seq_start}-1, info=>'NODOM');
+ push @npf_domains, \%new_dom;
+ }
+ push @npf_domains, $curr_dom;
+ $prev_dom = $curr_dom;
+ }
+ if ($seq_length - $prev_dom->{seq_end} > $min_nodom) {
+ my %new_dom = (seq_start=>$prev_dom->{seq_end}+1, seq_end=>$seq_length, info=>'NODOM');
+ if ($new_dom{seq_end} > $new_dom{seq_start}) {
+ push @npf_domains, \%new_dom;
+ }
+ }
+
+ # @npf_domains has both old @pf_domains and new neg-domains
+ @pf_domains = @npf_domains;
+ }
+
+ # now make sure we have useful names: colors
+
+ for my $pf (@pf_domains) {
+ $pf->{info} = domain_name($pf->{info}, $pf->{pfamA_acc});
+ }
+
+ # build the output rows; the non-lav form has a '-' between start and end
+ my @feats = ();
+ for my $d_ref (@pf_domains) {
+ if ($lav) {
+ push @feats, [$d_ref->{seq_start}, $d_ref->{seq_end}, $d_ref->{info}];
+ } else {
+ push @feats, [$d_ref->{seq_start}, '-', $d_ref->{seq_end}, $d_ref->{info} ];
+ # push @feats, [$d_ref->{seq_end}, ']', '-', ""];
+ }
+
+ }
+
+ return \@feats;
+}
+
+# min($arg1, $arg2): the numerically smaller of two values (the first on a tie)
+sub min {
+    my ($lhs, $rhs) = @_;
+
+    return $lhs if ($lhs <= $rhs);
+    return $rhs;
+}
+
+# max($arg1, $arg2): the numerically larger of two values (the first on a tie)
+sub max {
+    my ($lhs, $rhs) = @_;
+
+    return $lhs if ($lhs >= $rhs);
+    return $rhs;
+}
+
+# domain_name($value, $pfamA_acc)
+#
+# Registers a domain label and returns the label to report.
+#   $value     - domain label; a leading '@' marks a virtual domain and is
+#                removed for the lookup and put back on the result
+#   $pfamA_acc - Pfam family accession, used for the clan query and stored
+#                in @domain_list
+# With --no-clans every new label gets the next color number in %domains.
+# Otherwise a label seen for the first time is looked up with
+# $get_pfam_clan: a family that belongs to a clan is renamed to
+# "C.<clan_id>" (or to the clan accession with --pfacc) and shares the color
+# number of that clan; a family without a clan gets its own number.
+# Updates %domains, %domain_clan, @domain_list and $domain_cnt.
+
+sub domain_name {
+
+ my ($value, $pfamA_acc) = @_;
+ my $is_virtual = 0;
+
+ if ($value =~ m/^@/) {
+ $is_virtual = 1;
+ $value =~ s/^@//;
+ }
+
+ # check for clan:
+ if ($no_clans) {
+ if (! defined($domains{$value})) {
+ $domain_clan{$value} = 0;
+ $domains{$value} = ++$domain_cnt;
+ push @domain_list, $pfamA_acc;
+ }
+ }
+ elsif (!defined($domain_clan{$value})) {
+ ## only do this for new domains, old domains have known mappings
+
+ ## ways to highlight the same domain:
+ # (1) for clans, substitute clan name for family name
+ # (2) for clans, use the same color for the same clan, but don't change the name
+ # (3) for clans, combine family name with clan name, but use colors based on clan
+
+ # check to see if it's a clan
+ $get_pfam_clan->execute($pfamA_acc);
+
+ my $pfam_clan_href=0;
+
+ if ($pfam_clan_href=$get_pfam_clan->fetchrow_hashref()) { # is a clan
+ my ($clan_id, $clan_acc) = @{$pfam_clan_href}{qw(clan_id clan_acc)};
+
+ # now check to see if we have seen this clan before (if so, do not increment $domain_cnt)
+ my $c_value = "C." . $clan_id;
+ if ($pf_acc) {$c_value = $clan_acc;}
+
+ # remember the clan under the family label so later calls skip the query
+ $domain_clan{$value} = {clan_id => $clan_id,
+ clan_acc => $clan_acc};
+
+ if ($domains{$c_value}) {
+ $domain_clan{$value}->{domain_cnt} = $domains{$c_value};
+ $value = $c_value;
+ }
+ else {
+ $domain_clan{$value}->{domain_cnt} = ++ $domain_cnt;
+ $value = $c_value;
+ $domains{$value} = $domain_cnt;
+ push @domain_list, $pfamA_acc;
+ }
+ }
+ else { # not a clan
+ $domain_clan{$value} = 0;
+ $domains{$value} = ++$domain_cnt;
+ push @domain_list, $pfamA_acc;
+ }
+ }
+ # label seen before and known to belong to a clan: report the clan label
+ elsif ($domain_clan{$value} && $domain_clan{$value}->{clan_acc}) {
+ if ($pf_acc) {$value = $domain_clan{$value}->{clan_acc};}
+ else { $value = "C." . $domain_clan{$value}->{clan_id}; }
+ }
+
+ # virtual domains get the same color number as the real label
+ if ($is_virtual) {
+ $domains{'@'.$value} = $domains{$value};
+ $value = '@'.$value;
+ }
+ return $value;
+}
+
+# domain_num($value, $number): output form of a domain label and its color
+# number.  For a virtual domain (label starting with '@') the '@' becomes
+# 'v' and a 'v' is appended to the number; other labels pass through.
+sub domain_num {
+    my ($label, $color_ix) = @_;
+
+    # s/// is true only when a leading '@' was actually replaced
+    if ($label =~ s/^@/v/) {
+        $color_ix = $color_ix . "v";
+    }
+
+    return ($label, $color_ix);
+}
+
+
+__END__
+
+=pod
+
+=head1 NAME
+
+ann_pfam30.pl
+
+=head1 SYNOPSIS
+
+ ann_pfam30.pl --neg-doms --vdoms 'sp|P09488|GSTM1_HUMAN' | accession.file
+
+=head1 OPTIONS
+
+ -h short help
+ --help include description
+ --no-over : generate non-overlapping domains (equivalent to ann_pfam.pl)
+ --split-over : overlaps of two domains generate a new hybrid domain
+ --no-clans : do not use clans with multiple families from same clan
+ --neg-doms : report domains between annotated domains as NODOM
+ (also --neg, --neg_doms)
+ --vdoms : produce "virtual domains" using model_start,
+ model_end for partial pfam domains
+ --min_nodom=10 : minimum length between domains for NODOM
+
+ --host, --user, --password, --port --db : info for mysql database
+
+=head1 DESCRIPTION
+
+C<ann_pfam30.pl> extracts domain information from the pfam mysql
+database. Currently, the program works with database
+sequence descriptions in several formats:
+
+ >gi|1705556|sp|P54670.1|CAF1_DICDI
+ >sp|P09488|GSTM1_HUMAN
+ >sp:CALM_HUMAN
+
+C<ann_pfam30.pl> uses the C<pfamA_reg_full_significant>, C<pfamseq>,
+and C<pfamA> tables of the C<pfam> database to extract domain
+information on a protein.
+
+If the C<--no-over> option is set, overlapping domains are selected and
+edited to remove overlaps. For proteins with multiple overlapping
+domains (domains overlap by more than 1/3 of the domain length),
+C<ann_pfam30.pl> selects the domain annotation with the best
+C<domain_evalue_score>. When domains overlap by less than 1/3 of the
+domain length, they are shortened to remove the overlap.
+
+If the C<--split-over> option is set, if two domains overlap, the
+overlapping region is split out of the domains and labeled as a new,
+virtual-like, domain. If one domain is internal to another and spans
+80% of the domain, the shorter domain is removed.
+
+C<ann_pfam30.pl> is designed to be used by the B<FASTA> programs with
+the C<-V \!ann_pfam30.pl> or C<-V "\!ann_pfam30.pl --neg"> option.
+
+=head1 AUTHOR
+
+William R. Pearson, wrp at virginia.edu
+
+=cut
diff --git a/scripts/ann_pfam30_tmptbl.pl b/scripts/ann_pfam30_tmptbl.pl
new file mode 100755
index 0000000..c9e5b19
--- /dev/null
+++ b/scripts/ann_pfam30_tmptbl.pl
@@ -0,0 +1,875 @@
+#!/usr/bin/perl -w
+
+################################################################
+# copyright (c) 2014,2015 by William R. Pearson and The Rector &
+# Visitors of the University of Virginia */
+################################################################
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under this License is distributed on an "AS
+# IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+################################################################
+
+# ann_pfam.pl gets an annotation file from fasta36 -V with a line of the form:
+
+# gi|62822551|sp|P00502|GSTA1_RAT Glutathione S-transfer\n (at least from pir1.lseg)
+#
+# it must:
+# (1) read in the line
+# (2) parse it to get the up_acc
+# (3) return the tab delimited features
+#
+
+# this is the first version that works with the new Pfam strategy of
+# separating Uniprot reference sequences from the rest of uniprot. as
+# a result, it is possible that 2 SQL queries will be required, one to
+# pfamA_reg_full_significant and a second to uniprot_reg_full.
+
+# modified 15-Jan-2017 to reduce the number of calls when the same
+# accession is present multiple times. Accessions are saved in a hash
+# that ensures uniqueness.
+#
+# Uses tmp_annot.temporary table for more rapid joins. $user must have
+# create temporary tables/select permissions for tmp_annot
+#
+
+use strict;
+
+use DBI;
+use Getopt::Long;
+use Pod::Usage;
+use File::Temp qw/tempfile/;
+
+use vars qw($host $db $port $user $pass);
+
+# Default connection settings for the Pfam MySQL server; each value can be
+# overridden on the command line (--host, --db, --port, --user, --password).
+($host, $db, $port, $user, $pass) = ("wrpxdb.its.virginia.edu", "pfam30", 0, "web_user", "fasta_www");
+#$host = 'xdb';
+#$host = 'localhost';
+#$db = 'RPD2_pfam28u';
+
+# Boolean command-line switches, all off by default.
+my ($auto_reg,$rpd2_fams, $neg_doms, $vdoms, $lav, $no_doms, $no_clans, $pf_acc, $acc_comment, $bound_comment, $shelp, $help) =
+    (0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0,);
+
+# Overlap handling.  $over_fract is the divisor applied to the longer
+# domain's length when deciding whether two domains overlap "too much"
+# (default 3.0, i.e. more than 1/3 of the longer domain).
+my ($no_over, $split_over, $over_fract) = (0, 0, 3.0);
+
+# Separator written between a domain name and its color number/boundaries.
+my ($color_sep_str, $show_color) = (" :",1);
+$color_sep_str = '~';
+
+# Minimum lengths (residues) for NODOM regions and virtual domains.
+my ($min_nodom, $min_vdom) = (10,10);
+
+GetOptions(
+    "host=s" => \$host,
+    "db=s" => \$db,
+    "user=s" => \$user,
+    "password=s" => \$pass,
+    "port=i" => \$port,
+    "lav" => \$lav,
+    "acc_comment" => \$acc_comment,
+    "bound_comment" => \$bound_comment,
+    "color!" => \$show_color,
+    "no-over" => \$no_over,
+    "no_over" => \$no_over,
+    "split-over" => \$split_over,
+    "split_over" => \$split_over,
+    # over_fract needs a numeric argument; without "=f" Getopt::Long treats
+    # it as a flag and would overwrite the 3.0 default with 1.
+    "over_fract=f" => \$over_fract,
+    "over-fract=f" => \$over_fract,
+    "no-clans" => \$no_clans,
+    "no_clans" => \$no_clans,
+    "neg" => \$neg_doms,
+    "neg_doms" => \$neg_doms,
+    "neg-doms" => \$neg_doms,
+    "min_nodom=i" => \$min_nodom,
+    "vdoms" => \$vdoms,
+    "v_doms" => \$vdoms,
+    "pfacc" => \$pf_acc,
+    "RPD2" => \$rpd2_fams,
+    "auto_reg" => \$auto_reg,
+    "h|?" => \$shelp,
+    "help" => \$help,
+    );
+
+# -h: brief usage; --help: full POD; no arguments and no piped/redirected
+# input: brief usage as well.
+pod2usage(1) if $shelp;
+pod2usage(exitstatus => 0, verbose => 2) if $help;
+pod2usage(1) unless (@ARGV || -p STDIN || -f STDIN);
+
+# Build the DBI data source name; host and port are only added when set.
+my $connect = "dbi:mysql(AutoCommit=>1,RaiseError=>1):database=$db";
+$connect .= ";host=$host" if $host;
+$connect .= ";port=$port" if $port;
+
+my $dbh = DBI->connect($connect,
+ $user,
+ $pass
+ ) or die $DBI::errstr;
+
+# Shared bookkeeping used by domain_name()/domain_num():
+#   %domains      domain (or clan) name => color number
+#   %domain_clan  family name => clan info hash (or 0 when not in a clan)
+#   @domain_list  color number => pfamA_acc that first claimed the number
+my %annot_types = ();
+my %domains = (NODOM=>0);
+my %domain_clan = (NODOM => {clan_id => 'NODOM', clan_acc=>0, domain_cnt=>0});
+my @domain_list = (0);
+my $domain_cnt = 0;
+
+my $pfamA_reg_full = 'pfamA_reg_full_significant';
+my $uniprot_reg_full = 'uniprot_reg_full';
+
+my @pfam_fields = qw(seq_start seq_end model_start model_end model_length pfamA_acc pfamA_id auto_pfamA_reg_full evalue length);
+my @upfam_fields = qw(seq_start seq_end model_start model_end model_length pfamA_acc pfamA_id auto_uniprot_reg_full length);
+
+# All of the domain queries below join against the temporary table
+# tmp_annot.targets (created at the end of this section), so a single
+# execute() returns the domains for every loaded target, ordered by
+# target and start position.
+
+# pfamseq (reference proteome) domains, targets are UniProt accessions
+my $get_pfam_acc = $dbh->prepare(<<EOSQL);
+SELECT t_acc, seq_start, seq_end, model_start, model_end, model_length, pfamA_acc, pfamA_id, auto_pfamA_reg_full, domain_evalue_score as evalue, length
+FROM pfamseq
+JOIN pfamA_reg_full_significant using(pfamseq_acc)
+JOIN pfamA USING (pfamA_acc)
+JOIN tmp_annot.targets on(pfamseq_acc=t_acc)
+WHERE in_full = 1
+ORDER BY t_acc,seq_start
+
+EOSQL
+
+# uniprot domains, targets are UniProt accessions
+my $get_upfam_acc = $dbh->prepare(<<EOSQL);
+SELECT t_acc, seq_start, seq_end, model_start, model_end, model_length, pfamA_acc, pfamA_id, auto_uniprot_reg_full as auto_pfamA_reg_full, domain_evalue_score as evalue, length
+FROM uniprot
+JOIN uniprot_reg_full using(uniprot_acc)
+JOIN pfamA USING (pfamA_acc)
+JOIN tmp_annot.targets on(uniprot_acc=t_acc)
+WHERE in_full = 1
+ORDER BY t_acc, seq_start
+
+EOSQL
+
+# pfamseq domains, targets are RefSeq accessions mapped via uniprot.refseq2up
+my $get_pfam_refacc = $dbh->prepare(<<EOSQL);
+SELECT t_acc, seq_start, seq_end, model_start, model_end, model_length, pfamA_acc, pfamA_id, auto_pfamA_reg_full, domain_evalue_score as evalue, length
+ FROM $pfamA_reg_full
+ JOIN pfamseq using(pfamseq_acc)
+ JOIN pfamA USING (pfamA_acc)
+ JOIN uniprot.refseq2up as rf2up on(rf2up.up_acc=pfamseq_acc)
+ JOIN tmp_annot.targets on(rf2up.refseq_acc=t_acc)
+ WHERE in_full = 1
+ ORDER BY t_acc, seq_start
+
+EOSQL
+
+# uniprot domains, targets are RefSeq accessions mapped via uniprot.refseq2up
+my $get_upfam_refacc = $dbh->prepare(<<EOSQL);
+SELECT t_acc,seq_start, seq_end, model_start, model_end, model_length, pfamA_acc, pfamA_id, auto_uniprot_reg_full as auto_pfamA_reg_full, domain_evalue_score as evalue, length
+ FROM uniprot
+ JOIN uniprot_reg_full using(uniprot_acc)
+ JOIN pfamA USING (pfamA_acc)
+ JOIN uniprot.refseq2up as rf2up on(rf2up.up_acc=uniprot_acc)
+ JOIN tmp_annot.targets on(rf2up.refseq_acc=t_acc)
+WHERE in_full = 1
+ORDER BY t_acc, seq_start
+
+EOSQL
+
+# Statements currently selected for use; show_annots() may re-point them.
+my $get_annots_sql = $get_pfam_acc;
+my $get_annots_sql_u = $get_upfam_acc;
+
+# pfamseq domains, targets are sequence ids (pfamseq_id)
+my $get_pfam_id = $dbh->prepare(<<EOSQL);
+SELECT t_acc, seq_start, seq_end, model_start, model_end, model_length, pfamA_acc, pfamA_id, auto_pfamA_reg_full, domain_evalue_score as evalue, length
+ FROM pfamseq
+ JOIN $pfamA_reg_full using(pfamseq_acc)
+ JOIN pfamA USING (pfamA_acc)
+ JOIN tmp_annot.targets on(pfamseq_id=t_acc)
+WHERE in_full=1
+ORDER BY t_acc, seq_start
+
+EOSQL
+
+# uniprot domains, targets are sequence ids (uniprot_id).  The join column
+# is uniprot_acc, as in the other uniprot/uniprot_reg_full queries above.
+my $get_upfam_id = $dbh->prepare(<<EOSQL);
+SELECT t_acc, seq_start, seq_end, model_start, model_end, model_length, pfamA_acc, pfamA_id, auto_uniprot_reg_full as auto_pfamA_reg_full, domain_evalue_score as evalue, length
+ FROM uniprot
+ JOIN uniprot_reg_full using(uniprot_acc)
+ JOIN pfamA USING (pfamA_acc)
+ JOIN tmp_annot.targets on(uniprot_id=t_acc)
+WHERE in_full=1
+ORDER BY t_acc, seq_start
+
+EOSQL
+
+# clan lookup for a single Pfam family accession
+my $get_pfam_clan = $dbh->prepare(<<EOSQL);
+
+SELECT clan_acc, clan_id
+FROM clan
+JOIN clan_membership using(clan_acc)
+WHERE pfamA_acc=?
+
+EOSQL
+
+# RPD2 family => clan assignments (only read when --RPD2 is given)
+my $get_rpd2_clans = $dbh->prepare(<<EOSQL);
+
+SELECT auto_pfamA, clan
+FROM ljm_db.RPD2_final_fams
+WHERE clan is not NULL
+
+EOSQL
+
+# Per-connection table holding the accessions/ids to annotate.
+# NOTE(review): t_acc is char(10); keys longer than 10 characters would
+# not fit -- confirm this is adequate for all supported identifier types.
+$dbh->do(<<EOSQL);
+create temporary table tmp_annot.targets (t_acc char(10) primary key)
+EOSQL
+
+# -- LEFT JOIN clan_membership USING (auto_pfamA)
+# -- LEFT JOIN clans using(auto_clan)
+
+# Scratch variables shared with show_annots() (parsed description fields).
+my ($tmp, $gi, $sdb, $acc, $id, $use_acc);
+
+# Output columns: --lav prints "start end info"; the default format puts a
+# '-' column between start and end.
+my @lav_list = qw(seq_start seq_end info);
+my @no_lav_list = qw(seq_start dash seq_end info);
+my $out_list_r = \@no_lav_list;
+if ($lav) {
+    $show_color = 0;
+    $out_list_r = \@lav_list;
+}
+
+# get the query
+my ($query, $seq_len) = @ARGV;
+$seq_len = 0 unless defined($seq_len);
+
+$query =~ s/^>// if ($query);
+
+my @annots = ();      # keys into %annot_set, in input order
+my %annot_set = ();   # key => {seq_info, length, list => [domain rows]}
+
+my %rpd2_clan_fams = ();
+
+if ($rpd2_fams) {
+    $get_rpd2_clans->execute();
+    my ($auto_pfam, $auto_clan);
+    while (($auto_pfam, $auto_clan)=$get_rpd2_clans->fetchrow_array()) {
+        $rpd2_clan_fams{$auto_pfam} = $auto_clan;
+    }
+}
+
+#if it's a file I can open, read and parse it
+unless ($query && ($query =~ m/[\|:]/ ||
+                   $query =~ m/^[NX]P_/ ||
+                   $query =~ m/^[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}\s/)) {
+
+    while (my $a_line = <>) {
+        $a_line =~ s/^>//;
+        chomp $a_line;
+        push @annots, show_annots($a_line);
+    }
+}
+else {
+    push @annots, show_annots("$query\t$seq_len");
+}
+
+# @annots has a list of id's or annotations
+# write to temporary file, load data local infile, join to get results
+
+my ($fh, $temp_file) = tempfile(TEMPLATE=>'accannXXXXX');
+my @u_annots = keys %annot_set;
+print $fh join("\n", @u_annots);
+# a short write would silently drop targets, so fail loudly instead
+close($fh) or die "cannot write $temp_file: $!";
+
+$dbh->do("load data local infile '$temp_file' into table tmp_annot.targets");
+
+unlink($temp_file);
+
+# $get_annots_sql->execute();
+# while (my $annot_ar = $get_annots_sql->fetchrow_arrayref()) {
+# my %annot_data = ();
+# @annot_data{@pfam_fields} = @{$annot_ar}[1..10];
+# if (!defined($annot_set{$annot_ar->[0]}->{list})) {
+# $annot_set{$annot_ar->[0]}->{list} = [\%annot_data];
+# }
+# else {
+# push @{$annot_set{$annot_ar->[0]}->{list}}, \%annot_data;
+# }
+# }
+
+# One query returns the rows for every target; group them by t_acc.
+$get_annots_sql_u->execute();
+while (my $annot_hr = $get_annots_sql_u->fetchrow_hashref()) {
+    if (!defined($annot_set{$annot_hr->{t_acc}}->{list})) {
+        $annot_set{$annot_hr->{t_acc}}->{list} = [$annot_hr];
+    }
+    else {
+        push @{$annot_set{$annot_hr->{t_acc}}->{list}}, $annot_hr;
+    }
+}
+
+# Post-process (join/trim/split/add virtual + NODOM regions) per target.
+for my $u_acc (@u_annots) {
+    map_pfam_annots($annot_set{$u_acc});
+}
+
+# Emit one ">description" record per input sequence, followed by one
+# tab-delimited line per domain.
+for my $seq_annot (@annots) {
+    next unless $seq_annot;    # show_annots() returns "" for skipped lines
+    my $annot_r = $annot_set{$seq_annot};
+    print ">",$annot_r->{seq_info},"\n";
+    for my $annot (@{$annot_r->{list}}) {
+        $annot->{dash} = '-';
+        if (defined($domains{$annot->{info}})) {
+            my ($a_name, $a_num) = domain_num($annot->{info},$domains{$annot->{info}});
+            $annot->{info} = $a_name;
+            # virtual domains carry a trailing 'v' on their number; strip
+            # it before using the number as an index into @domain_list
+            my $tmp_a_num = $a_num;
+            $tmp_a_num =~ s/v$//;
+            if ($acc_comment) {
+                $annot->{info} .= "{$domain_list[$tmp_a_num]}";
+            }
+            if ($bound_comment) {
+                $annot->{info} .= $color_sep_str.$annot->{seq_start}.":".$annot->{seq_end};
+            }
+            elsif ($show_color) {
+                $annot->{info} .= $color_sep_str.$a_num;
+            }
+        }
+        print join("\t",@{$annot}{@{$out_list_r}}),"\n";
+    }
+}
+
+exit(0);
+
+# show_annots("description\tlength")
+#
+# Parses one sequence description line, extracts its accession (or, for
+# "sp:ID"-style lines, its id), and registers a new entry
+# {seq_info, length} in %annot_set under that key.
+#
+# Returns the key, or "" when the line has no usable accession or the key
+# was already registered (the caller skips empty keys).
+#
+# Side effects: updates the file-level $sdb/$gi/$acc/$id/$use_acc scratch
+# variables and re-points $get_annots_sql/$get_annots_sql_u.
+sub show_annots {
+    my ($query_len) = @_;
+
+    my ($annot_line, $seq_len) = split(/\t/,$query_len);
+
+    my $pfamA_acc;
+
+    $use_acc = 1;
+    $get_annots_sql = $get_pfam_acc;
+    $get_annots_sql_u = $get_upfam_acc;
+
+    if ($annot_line =~ m/^pf\d+\|/) {
+        ($sdb, $gi, $pfamA_acc, $acc, $id) = split(/\|/,$annot_line);
+#       $dbh->do("use RPD2_pfam");
+    }
+    elsif ($annot_line =~ m/^gi\|/) {
+        ($tmp, $gi, $sdb, $acc, $id) = split(/\|/,$annot_line);
+        if ($sdb =~ m/ref/) {
+            $get_annots_sql = $get_pfam_refacc;
+            $get_annots_sql_u = $get_upfam_refacc;
+        }
+    }
+    elsif ($annot_line =~ m/^(sp|tr)\|/) {
+        ($sdb, $acc, $id) = split(/\|/,$annot_line);
+    }
+    elsif ($annot_line =~ m/^ref\|/) {
+        ($sdb, $acc) = split(/\|/,$annot_line);
+        $get_annots_sql = $get_pfam_refacc;
+        $get_annots_sql_u = $get_upfam_refacc;
+    }
+    elsif ($annot_line =~ m/^(SP|TR):/i) {
+        ($sdb, $id) = split(/:/,$annot_line);
+        $use_acc = 0;
+    }
+    elsif ($annot_line !~ m/\|/ && $annot_line !~ m/:/) {
+        $use_acc = 1;
+        ($acc) = split(/\s+/,$annot_line);
+    }
+    # deal with no-database SwissProt/NR
+    else {
+        ($acc)=($annot_line =~ /^(\S+)/);
+    }
+
+    # here we have an $acc or an $id: check to see if we have the data
+
+    my %annot_data = (seq_info=>$annot_line, length=>$seq_len);
+    my $annot_key = '';
+    unless ($use_acc) {
+        # already registered: nothing new to look up.  (A bare "next" here
+        # would only work by unwinding into the caller's loop.)
+        return "" if ($annot_set{$id});
+        $annot_set{$id} = \%annot_data;
+        $annot_key = $id;
+
+        # NOTE(review): only $get_annots_sql is switched to the id-based
+        # query; $get_annots_sql_u stays accession-based -- confirm
+        # whether id lookups are expected to work through it.
+        $get_annots_sql = $get_pfam_id;
+    }
+    else {
+        unless ($acc) {
+            warn "missing acc in $annot_line";
+            return "";
+        }
+        else {
+            # drop the version suffix (P54670.1 -> P54670)
+            $acc =~ s/\.\d+$//;
+
+            return "" if ($annot_set{$acc});
+            $annot_set{$acc} = \%annot_data;
+            $annot_key = $acc;
+        }
+    }
+
+    return $annot_key;
+}
+
+# map_pfam_annots($annot_ref)
+#
+# Post-processes, in place, the Pfam domain rows stored in
+# $annot_ref->{list} (hashes with seq_start/seq_end/model_start/... keys):
+#   1. sets each row's {info} label (region id, family acc, or family id)
+#   2. joins consecutive rows of the same family that look like pieces
+#      of one split domain
+#   3. with --no-over, keeps the best-evalue domain among heavily
+#      overlapping ones and trims the remaining small overlaps;
+#      otherwise, with --split-over, turns overlaps into hybrid domains
+#   4. with --vdoms, adds '@'-prefixed virtual domains for unmatched
+#      parts of the Pfam model
+#   5. with --neg, adds NODOM regions between/after domains
+#   6. maps every {info} through domain_name()
+# Reads the file-level option variables ($auto_reg, $pf_acc, $no_over,
+# $split_over, $over_fract, $vdoms, $neg_doms, $min_vdom, $min_nodom).
+sub map_pfam_annots {
+ my ($annot_ref) = @_;
+
+ my $seq_length = $annot_ref->{length};
+ my $pf_domains_r = $annot_ref->{list};
+
+ # if the caller gave no sequence length (0), take it from the first row
+ my $row_href=$annot_ref->{list}[0];
+ if ($row_href->{length} && $row_href->{length} > $seq_length && $seq_length == 0) {
+ $annot_ref->{length} = $seq_length = $row_href->{length};
+ }
+
+ # fill in {info} field
+ for my $pf_dom (@$pf_domains_r) {
+ if ($auto_reg) {
+ $pf_dom->{info} = $pf_dom->{auto_pfamA_reg_full};
+ } elsif ($pf_acc) {
+ $pf_dom->{info} = $pf_dom->{pfamA_acc};
+ } else {
+ $pf_dom->{info} = $pf_dom->{pfamA_id};
+ }
+ }
+
+ # before checking for domain overlap, check for "split-domains"
+ # (self-unbound) by looking for runs of the same domain that are
+ # ordered by model_start
+
+ # NOTE: the --no-over / --split-over handling below is nested inside
+ # this block, so it only runs when there is more than one domain.
+ if (scalar(@{$pf_domains_r}) > 1) {
+ my @j_domains; #joined domains
+ my @tmp_domains = @{$pf_domains_r};
+
+ my $prev_dom = shift(@tmp_domains);
+
+ for my $curr_dom (@tmp_domains) {
+ # to join domains:
+ # (1) the domains must be in order by model_start/end coordinates
+ # (3) joining the domains cannot make the total combination too long
+
+ # check for model and sequence consistency
+ # NOTE(review): the "limit overlap" test is an OR of (> 0.80*x) and
+ # (< 1.25*x), which looks true for any positive model_end -- confirm
+ # whether && was intended.
+ if (($prev_dom->{pfamA_acc} eq $curr_dom->{pfamA_acc}) # same family
+ && $prev_dom->{model_start} < $curr_dom->{model_start} # model check
+ && $prev_dom->{model_end} < $curr_dom->{model_end}
+
+ && ($curr_dom->{model_start} > $prev_dom->{model_end} * 0.80 # limit overlap
+ || $curr_dom->{model_start} < $prev_dom->{model_end} * 1.25)
+ && ((($curr_dom->{model_end} - $curr_dom->{model_start}+1)/$curr_dom->{model_length} +
+ ($prev_dom->{model_end} - $prev_dom->{model_start}+1)/$prev_dom->{model_length}) < 1.33)
+ ) { # join them by updating $prev_dom
+ $prev_dom->{seq_end} = $curr_dom->{seq_end};
+ $prev_dom->{model_end} = $curr_dom->{model_end};
+ $prev_dom->{auto_pfamA_reg_full} = $prev_dom->{auto_pfamA_reg_full} . ";". $curr_dom->{auto_pfamA_reg_full};
+ # the joined domain keeps the smaller (better) evalue
+ $prev_dom->{evalue} = ($prev_dom->{evalue} < $curr_dom->{evalue} ? $prev_dom->{evalue} : $curr_dom->{evalue});
+ } else {
+ push @j_domains, $prev_dom;
+ $prev_dom = $curr_dom;
+ }
+ }
+ push @j_domains, $prev_dom;
+ @{$pf_domains_r} = @j_domains;
+
+
+ if ($no_over) { # for either $no_over or $split_over, check for overlapping domains and edit/split them
+
+ my @tmp_domains = @{$pf_domains_r}; # allow shifts from copy of @pf_domains
+ my @save_domains = (); # where the new domains go
+
+ my $prev_dom = shift @tmp_domains;
+
+ while (my $curr_dom = shift @tmp_domains) {
+
+ my @overlap_domains = ($prev_dom);
+
+ # residues by which $prev_dom runs past the start of $curr_dom
+ my $diff = $prev_dom->{seq_end} - $curr_dom->{seq_start};
+
+ my ($prev_len, $cur_len) = ($prev_dom->{seq_end}-$prev_dom->{seq_start}+1,
+ $curr_dom->{seq_end}-$curr_dom->{seq_start}+1);
+
+ my $inclusion = ((($curr_dom->{seq_start} >= $prev_dom->{seq_start}) # start is right && end is left
+ && ($curr_dom->{seq_end} <= $prev_dom->{seq_end})) || # -- curr inside prev
+ (($curr_dom->{seq_start} <= $prev_dom->{seq_start}) # start is left && end is right
+ && ($curr_dom->{seq_end} >= $prev_dom->{seq_end}))); # -- prev is inside curr
+
+ my $longer_len = ($prev_len > $cur_len) ? $prev_len : $cur_len;
+
+ # check for overlap > domain_length/$over_fract
+ # collect every following domain that is nested in, or heavily
+ # overlaps, $prev_dom
+ while ($inclusion || ($diff > 0 && $diff > $longer_len/$over_fract)) {
+ push @overlap_domains, $curr_dom;
+ $curr_dom = shift @tmp_domains;
+ last unless $curr_dom;
+ $diff = $prev_dom->{seq_end} - $curr_dom->{seq_start};
+ ($prev_len, $cur_len) = ($prev_dom->{seq_end}-$prev_dom->{seq_start}+1, $curr_dom->{seq_end}-$curr_dom->{seq_start}+1);
+ $longer_len = ($prev_len > $cur_len) ? $prev_len : $cur_len;
+ $inclusion = ((($curr_dom->{seq_start} >= $prev_dom->{seq_start}) && ($curr_dom->{seq_end} <= $prev_dom->{seq_end})) ||
+ (($curr_dom->{seq_start} <= $prev_dom->{seq_start}) && ($curr_dom->{seq_end} >= $prev_dom->{seq_end})));
+ }
+
+ # check for overlapping domains; >1 because $prev_dom is always there
+ if (scalar(@overlap_domains) > 1 ) {
+ # if $rpd2_fams, check for a chosen one
+
+ # rows without an evalue are treated as evalue 1.0 for the sort
+ for my $dom ( @overlap_domains) {
+ $dom->{evalue} = 1.0 unless defined($dom->{evalue});
+ }
+
+ @overlap_domains = sort { $a->{evalue} <=> $b->{evalue} } @overlap_domains;
+ $prev_dom = $overlap_domains[0];
+ }
+
+ # $prev_dom should be the best of the overlaps, and we are no longer overlapping > dom_length/3
+ push @save_domains, $prev_dom;
+ $prev_dom = $curr_dom;
+ }
+
+ # $prev_dom is undefined here when the inner loop consumed the last domain
+ if ($prev_dom) {
+ push @save_domains, $prev_dom;
+ }
+
+ @{$pf_domains_r} = @save_domains;
+
+ # now check for smaller overlaps
+ # trim both neighbours so that they meet near the middle of the overlap
+ for (my $i=1; $i < scalar(@{$pf_domains_r}); $i++) {
+ if ($pf_domains_r->[$i-1]->{seq_end} >= $pf_domains_r->[$i]->{seq_start}) {
+ my $overlap = $pf_domains_r->[$i-1]->{seq_end} - $pf_domains_r->[$i]->{seq_start};
+ $pf_domains_r->[$i-1]->{seq_end} -= int($overlap/2);
+ $pf_domains_r->[$i]->{seq_start} = $pf_domains_r->[$i-1]->{seq_end}+1;
+ }
+ }
+ }
+ elsif ($split_over) { # here, everything that overlaps by > $min_vdom should be split into a separate domain
+ my @save_domains = (); # where the new domains go
+
+ # check to see if one domain is included (or overlapping) more
+ # than xx% of the other. If so, pick the longer one
+
+ my ($prev_dom, $curr_dom) = ($pf_domains_r->[0],0) ;
+ for (my $i=1; $i < scalar(@{$pf_domains_r}); $i++) {
+ $curr_dom = $pf_domains_r->[$i];
+
+ my ($prev_len, $cur_len) = ($prev_dom->{seq_end}-$prev_dom->{seq_start}+1, $curr_dom->{seq_end}-$curr_dom->{seq_start}+1);
+ my $longer_len = ($prev_len > $cur_len) ? $prev_len : $cur_len;
+
+ if (($curr_dom->{seq_start} >= $prev_dom->{seq_start}) && ($curr_dom->{seq_end} <= $prev_dom->{seq_end})
+ && $cur_len / $prev_len > 0.80) {
+ # $prev_dom stays the same, $curr_dom deleted
+ next;
+ }
+ elsif (($curr_dom->{seq_start} <= $prev_dom->{seq_start}) && ($curr_dom->{seq_end} >= $prev_dom->{seq_end})
+ && $prev_len / $cur_len > 0.80) {
+ $prev_dom = $curr_dom; # this should delete $prev_dom
+ next;
+ }
+
+ if ($prev_dom->{seq_end} >= $curr_dom->{seq_start} + $min_vdom) {
+ # carve the shared region out of both domains and emit it as a
+ # hybrid "prev/curr" domain between them
+ my ($l_seq_end, $r_seq_start) = ($curr_dom->{seq_start}-1, $prev_dom->{seq_end}+1);
+
+ $prev_dom->{seq_end} = $l_seq_end;
+ push @save_domains, $prev_dom;
+ my $new_dom = {seq_start => $l_seq_end+1, seq_end=>$r_seq_start-1,
+ model_length => -1,
+ pfamA_acc=>$prev_dom->{pfamA_acc}."/".$curr_dom->{pfamA_acc},
+ pfamA_id=>$prev_dom->{pfamA_id}."/".$curr_dom->{pfamA_id},
+ };
+
+ if ($pf_acc) {
+ $new_dom->{info} = $new_dom->{pfamA_acc};
+ }
+ else {
+ $new_dom->{info} = $new_dom->{pfamA_id};
+ }
+
+ push @save_domains, $new_dom;
+ $curr_dom->{seq_start} = $r_seq_start;
+ $prev_dom = $curr_dom;
+ }
+ else {
+ push @save_domains, $prev_dom;
+ $prev_dom = $curr_dom;
+ }
+ }
+ push @save_domains, $prev_dom;
+ @{$pf_domains_r} = @save_domains;
+ }
+ }
+
+ # $vdoms -- virtual Pfam domains -- the equivalent of $neg_doms,
+ # but covering parts of a Pfam model that are not annotated. split
+ # domains have been joined, so simply check beginning and end of
+ # each domain (but must also check for bounded-ness)
+ # only add when 10% or more is missing and missing length > $min_nodom
+
+ if ($vdoms && scalar(@{$pf_domains_r})) {
+ my @vpf_domains;
+
+ my $curr_dom = $pf_domains_r->[0];
+ my $length = $curr_dom->{length};
+
+ my $prev_dom={seq_end=>0, pfamA_acc=>''};
+ my $prev_dom_end = 0;
+ my $next_dom_start = $length+1;
+
+ for (my $dom_ix=0; $dom_ix < scalar(@{$pf_domains_r}); $dom_ix++ ) {
+ $curr_dom = $pf_domains_r->[$dom_ix];
+
+ my $pfamA = $curr_dom->{pfamA_acc};
+
+ # first, look left, is there a domain there (if there is,
+ # it should be updated right
+
+ # my $min_vdom = $curr_dom->{model_length} / 10;
+
+ # models shorter than $min_vdom (including the hybrid domains, which
+ # have model_length -1) are copied through without virtual domains
+ if ($curr_dom->{model_length} < $min_vdom) {
+ push @vpf_domains, $curr_dom;
+ next;
+ }
+ if ($prev_dom->{pfamA_acc}) { # look for previous domain
+ $prev_dom_end = $prev_dom->{seq_end};
+ }
+
+ # there is a domain to the left, how much room is available?
+ my $left_dom_len = min($curr_dom->{seq_start}-$prev_dom_end-1, $curr_dom->{model_start}-1);
+ if ( $left_dom_len > $min_vdom) {
+ # there is room for a virtual domain
+ # NOTE(review): model_start is set to $left_dom_len here -- confirm
+ # that this is the intended model coordinate.
+ my %new_dom = (seq_start=> $curr_dom->{seq_start}-$left_dom_len,
+ seq_end => $curr_dom->{seq_start}-1,
+ info=>'@'.$curr_dom->{info},
+ model_length=>$curr_dom->{model_length},
+ model_end => $curr_dom->{model_start}-1,
+ model_start => $left_dom_len,
+ pfamA_acc=>$pfamA,
+ );
+ push @vpf_domains, \%new_dom;
+ }
+
+ # save the current domain
+ push @vpf_domains, $curr_dom;
+ $prev_dom = $curr_dom;
+
+ if ($dom_ix < scalar(@$pf_domains_r)-1) { # there is a domain to the right
+ # first, give all the extra space to the first domain (no splitting)
+ $next_dom_start = $pf_domains_r->[$dom_ix+1]->{seq_start};
+ }
+ else {
+ $next_dom_start = $length;
+ }
+
+ # is there room for a virtual domain right
+
+ my $right_dom_len = min($next_dom_start-$curr_dom->{seq_end}-1, # space available
+ $curr_dom->{model_length}-$curr_dom->{model_end} # space needed
+ );
+ if ( $right_dom_len > $min_vdom) {
+ my %new_dom = (seq_start=> $curr_dom->{seq_end}+1,
+ seq_end=> $curr_dom->{seq_end}+$right_dom_len,
+ info=>'@'.$curr_dom->{info},
+ model_length => $curr_dom->{model_length},
+ pfamA_acc=> $pfamA,
+ );
+ push @vpf_domains, \%new_dom;
+ $prev_dom = \%new_dom;
+ }
+ } # all done, check for last one
+
+ # $curr_dom=$pf_domains_r->[-1];
+ # # my $min_vdom = $curr_dom->{model_length}/10;
+
+ # my $right_dom_len = min($length - $curr_dom->{seq_end}+1, # space available
+ # $curr_dom->{model_length}-$curr_dom->{model_end} # space needed
+ # );
+ # if ($right_dom_len > $min_vdom) {
+ # my %new_dom = (seq_start=> $curr_dom->{seq_end}+1,
+ # seq_end => $curr_dom->{seq_end}+$right_dom_len,
+ # info=>'@'.$curr_dom->{pfamA_acc},
+ # model_len=> $curr_dom->{model_len},
+ # pfamA_acc => $curr_dom->{pfamA_acc},
+ # model_start => $curr_dom->{model_end}+1,
+ # model_end => $curr_dom->{model_len},
+ # );
+
+ # push @vpf_domains, \%new_dom;
+ # }
+
+ # @vpf_domains has both old @{$pf_domains_r} and new neg-domains
+ @{$pf_domains_r} = @vpf_domains;
+ }
+
+ if ($neg_doms) {
+ my @npf_domains;
+ my $prev_dom={seq_end=>0};
+ # insert a NODOM before any domain preceded by a gap > $min_nodom
+ for my $curr_dom ( @{$pf_domains_r}) {
+ if ($curr_dom->{seq_start} - $prev_dom->{seq_end} > $min_nodom) {
+ my %new_dom = (seq_start=>$prev_dom->{seq_end}+1, seq_end => $curr_dom->{seq_start}-1, info=>'NODOM');
+ push @npf_domains, \%new_dom;
+ }
+ push @npf_domains, $curr_dom;
+ $prev_dom = $curr_dom;
+ }
+ # trailing NODOM from the last domain to the end of the sequence
+ if ($seq_length - $prev_dom->{seq_end} > $min_nodom) {
+ my %new_dom = (seq_start=>$prev_dom->{seq_end}+1, seq_end=>$seq_length, info=>'NODOM');
+ if ($new_dom{seq_end} > $new_dom{seq_start}) {
+ push @npf_domains, \%new_dom;
+ }
+ }
+
+ # @npf_domains has both old @pf_domains and new neg-domains
+ @{$pf_domains_r} = @npf_domains;
+ }
+
+ # now make sure we have useful names: colors
+
+ for my $pf (@{$pf_domains_r}) {
+ $pf->{info} = domain_name($pf->{info}, $pf->{pfamA_acc});
+ }
+}
+
+sub min {
+    # smaller of two numbers; the first argument wins a tie
+    my ($lhs, $rhs) = @_;
+
+    return $lhs if $lhs <= $rhs;
+    return $rhs;
+}
+
+sub max {
+    # larger of two numbers; the first argument wins a tie
+    my ($lhs, $rhs) = @_;
+
+    return $lhs if $lhs >= $rhs;
+    return $rhs;
+}
+
+# domain_name($value, $pfamA_acc) maps a domain label to the name used
+# for output/coloring and registers it in the shared tables:
+#   %domains      name => color number (new names get ++$domain_cnt)
+#   %domain_clan  label => clan info hash, or 0 when not in a clan
+#   @domain_list  pfamA_acc pushed once per newly assigned number
+# A leading '@' (virtual domain) is stripped for the lookup and put back
+# on the returned name.  Unless --no-clans is set, a family that belongs
+# to a clan is renamed to "C.<clan_id>" (or the clan accession with
+# --pfacc) so that all families of one clan share a name and number.
+#
+
+sub domain_name {
+
+ my ($value, $pfamA_acc) = @_;
+ my $is_virtual = 0;
+
+ if ($value =~ m/^@/) {
+ $is_virtual = 1;
+ $value =~ s/^@//;
+ }
+
+ # check for clan:
+ if ($no_clans) {
+ # clans ignored: every new label simply gets the next number
+ if (! defined($domains{$value})) {
+ $domain_clan{$value} = 0;
+ $domains{$value} = ++$domain_cnt;
+ push @domain_list, $pfamA_acc;
+ }
+ }
+ elsif (!defined($domain_clan{$value})) {
+ ## only do this for new domains, old domains have known mappings
+
+ ## ways to highlight the same domain:
+ # (1) for clans, substitute clan name for family name
+ # (2) for clans, use the same color for the same clan, but don't change the name
+ # (3) for clans, combine family name with clan name, but use colors based on clan
+
+ # check to see if it's a clan
+ $get_pfam_clan->execute($pfamA_acc);
+
+ my $pfam_clan_href=0;
+
+ if ($pfam_clan_href=$get_pfam_clan->fetchrow_hashref()) { # is a clan
+ my ($clan_id, $clan_acc) = @{$pfam_clan_href}{qw(clan_id clan_acc)};
+
+ # now check to see if we have seen this clan before (if so, do not increment $domain_cnt)
+ my $c_value = "C." . $clan_id;
+ if ($pf_acc) {$c_value = $clan_acc;}
+
+ $domain_clan{$value} = {clan_id => $clan_id,
+ clan_acc => $clan_acc};
+
+ if ($domains{$c_value}) {
+ # clan already numbered: reuse its number
+ $domain_clan{$value}->{domain_cnt} = $domains{$c_value};
+ $value = $c_value;
+ }
+ else {
+ # first family seen from this clan: assign a new number
+ $domain_clan{$value}->{domain_cnt} = ++ $domain_cnt;
+ $value = $c_value;
+ $domains{$value} = $domain_cnt;
+ push @domain_list, $pfamA_acc;
+ }
+ }
+ else { # not a clan
+ $domain_clan{$value} = 0;
+ $domains{$value} = ++$domain_cnt;
+ push @domain_list, $pfamA_acc;
+ }
+ }
+ elsif ($domain_clan{$value} && $domain_clan{$value}->{clan_acc}) {
+ # label seen before and known to be in a clan: return the clan name
+ if ($pf_acc) {$value = $domain_clan{$value}->{clan_acc};}
+ else { $value = "C." . $domain_clan{$value}->{clan_id}; }
+ }
+
+ if ($is_virtual) {
+ # the virtual ('@') name shares the number of the real domain
+ $domains{'@'.$value} = $domains{$value};
+ $value = '@'.$value;
+ }
+ return $value;
+}
+
+sub domain_num {
+    # Returns (name, number) for output.  For a virtual domain (name
+    # starting with '@') the '@' becomes 'v' and the number gets a
+    # trailing "v"; any other pair is returned unchanged.
+    my ($label, $index) = @_;
+
+    return ($label, $index) unless $label =~ s/^@/v/;
+    return ($label, $index . "v");
+}
+
+
+__END__
+
+=pod
+
+=head1 NAME
+
+ann_pfam30_tmptbl.pl
+
+=head1 SYNOPSIS
+
+ ann_pfam30_tmptbl.pl --neg-doms --vdoms 'sp|P09488|GSTM1_HUMAN' | accession.file
+
+=head1 OPTIONS
+
+ -h short help
+ --help include description
+ --no-over : generate non-overlapping domains (equivalent to ann_pfam.pl)
+ --split-over : overlaps of two domains generate a new hybrid domain
+ --no-clans : do not use clans with multiple families from same clan
+ --neg-doms : report domains between annotated domains as NODOM
+ (also --neg, --neg_doms)
+ --vdoms : produce "virtual domains" using model_start,
+ model_end for partial pfam domains
+ --min_nodom=10 : minimum length between domains for NODOM
+
+ --host, --user, --password, --port --db : info for mysql database
+
+=head1 DESCRIPTION
+
+C<ann_pfam30_tmptbl.pl> extracts domain information from the pfam mysql
+database. Currently, the program works with database
+sequence descriptions in several formats:
+
+ >gi|1705556|sp|P54670.1|CAF1_DICDI
+ >sp|P09488|GSTM1_HUMAN
+ >sp:CALM_HUMAN
+
+C<ann_pfam30_tmptbl.pl> uses the C<pfamA_reg_full_significant>, C<pfamseq>,
+and C<pfamA> tables of the C<pfam> database to extract domain
+information on a protein.
+
+If the C<--no-over> option is set, overlapping domains are selected and
+edited to remove overlaps. For proteins with multiple overlapping
+domains (domains overlap by more than 1/3 of the domain length),
+C<ann_pfam30_tmptbl.pl> selects the domain annotation with the best
+C<domain_evalue_score>. When domains overlap by less than 1/3 of the
+domain length, they are shortened to remove the overlap.
+
+If the C<--split-over> option is set, if two domains overlap, the
+overlapping region is split out of the domains and labeled as a new,
+virtual-like, domain. If one domain is internal to another and spans
+80% of the domain, the shorter domain is removed.
+
+C<ann_pfam30_tmptbl.pl> is designed to be used by the B<FASTA> programs with
+the C<-V \!ann_pfam30_tmptbl.pl> or C<-V "\!ann_pfam30_tmptbl.pl --neg"> option.
+
+C<ann_pfam30_tmptbl.pl> requires an additional database, C<tmp_annot>,
+with C<create temporary tables>, C<insert>, and C<select> privileges
+for the default user.
+
+=head1 AUTHOR
+
+William R. Pearson, wrp at virginia.edu
+
+=cut
diff --git a/scripts/ann_pfam_www.pl b/scripts/ann_pfam_www.pl
new file mode 100755
index 0000000..897cff9
--- /dev/null
+++ b/scripts/ann_pfam_www.pl
@@ -0,0 +1,687 @@
+#!/usr/bin/perl -w
+
+################################################################
+# copyright (c) 2014, 2015 by William R. Pearson and The Rector &
+# Visitors of the University of Virginia */
+################################################################
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under this License is distributed on an "AS
+# IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+################################################################
+
+# ann_pfam_www_e.pl gets an annotation file from fasta36 -V with a line of the form:
+
+# gi|62822551|sp|P00502|GSTA1_RAT Glutathione S-transfer\n (at least from pir1.lseg)
+#
+# it must:
+# (1) read in the line
+# (2) parse it to get the up_acc
+# (3) return the tab delimited features
+#
+
+# This version uses the Pfam RESTful interface, rather than a local database
+# >pf26|164|O57809|1A1D_PYRHO
+# and only provides domain information
+
+# use strict;
+
+use Getopt::Long;
+use Pod::Usage;
+use LWP::Simple;
+use XML::Twig;
+# use Data::Dumper;
+
+# Boolean command-line switches; all of them start out disabled.
+my ($auto_reg, $rpd2_fams, $neg_doms, $vdoms, $lav, $no_clans, $pf_acc_flag) = (0) x 7;
+my ($shelp, $help, $no_over, $acc_comment, $bound_comment, $pfamB) = (0) x 6;
+
+# --color is a negatable switch (--nocolor) and is on by default.
+my $show_color = 1;
+
+# Thresholds compared against gap lengths when NODOM (--neg) and
+# virtual (--vdoms) domains are generated.
+my $min_nodom = 10;
+my $min_vdom = 10;
+
+# Text placed between a domain label and its color number or boundaries.
+my $color_sep_str = '~';
+
+# Alternative spellings of a switch are declared as aliases of one spec.
+GetOptions(
+    "lav"                   => \$lav,
+    "acc_comment"           => \$acc_comment,
+    "bound_comment"         => \$bound_comment,
+    "min_nodom=i"           => \$min_nodom,
+    "neg|neg_doms|neg-doms" => \$neg_doms,
+    "no-over|no_over"       => \$no_over,
+    "no-clans|no_clans"     => \$no_clans,
+    "color!"                => \$show_color,
+    "pfamB"                 => \$pfamB,
+    "vdoms|v_doms"          => \$vdoms,
+    "pfacc|pfam_acc|acc"    => \$pf_acc_flag,
+    "h|?"                   => \$shelp,
+    "help"                  => \$help,
+);
+
+pod2usage(1) if $shelp;
+pod2usage(exitstatus => 0, verbose => 2) if $help;
+# nothing to annotate: no argument and no redirected/piped input
+pod2usage(1) unless (@ARGV || -p STDIN || -f STDIN);
+
+my %annot_types = ();
+# domain label => color number; assigned in domain_name(), read by the output loop
+my %domains = (NODOM=>0);
+# domain label => clan record (or 0 when the family has no clan)
+my %domain_clan = (NODOM => {clan_id => 'NODOM', clan_acc=>0, domain_cnt=>0});
+# color number => Pfam accession; slot 0 is a placeholder
+my @domain_list = (0);
+my $domain_cnt = 0;
+
+my $loc="http://pfam.xfam.org/";
+my $url;
+
+# state filled in by the XML::Twig handlers (push_match, get_length, ...)
+my @pf_domains;
+my %pfamA_fams = ();
+my ($pf_seq_length, $pf_model_length)=(0,0);
+my ($clan_acc, $clan_id) = ("","");
+
+my $get_annot_sub = \&get_pfam_www;
+
+# fields parsed from the current sequence header by show_annots()
+my ($tmp, $gi, $sdb, $acc, $id, $use_acc);
+
+# get the query
+my ($query, $seq_len) = @ARGV;
+$seq_len = 0 unless defined($seq_len);
+
+$query =~ s/^>// if ($query);
+
+my @annots = ();
+
+# UniProt accession (6 or 10 characters).  The alternation is grouped so
+# that the '^' anchor and the terminator apply to both branches; without
+# the group only accessions starting with O, P or Q were recognised on
+# the command line and all others were treated as file names.
+my $up_acc_re = qr/(?:[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})/;
+
+# The first argument is a query when it looks like a database identifier
+# (contains '|' or ':') or like a bare accession, optionally followed by
+# a ".version"; otherwise the arguments (or STDIN) are read as files with
+# one sequence header per line.
+unless ($query && ($query =~ m/[\|:]/ ||
+                   $query =~ m/^$up_acc_re(?:\.\d+)?(?:\s|$)/)) {
+
+    while (my $a_line = <>) {
+        $a_line =~ s/^>//;
+        chomp $a_line;
+        push @annots, show_annots($a_line, $get_annot_sub);
+    }
+}
+else {
+    push @annots, show_annots("$query\t$seq_len", $get_annot_sub);
+}
+
+# Print one ">header" line per sequence, followed by its annotations as
+# tab-separated lines.  Unless --lav is set, labels known to %domains are
+# rewritten by domain_num() and decorated with optional suffixes.
+foreach my $entry (@annots) {
+    print ">", $entry->{seq_info}, "\n";
+
+    foreach my $feat (@{ $entry->{list} }) {
+        my $known = !$lav && defined($domains{$feat->[-1]});
+
+        if ($known) {
+            my ($label, $color_num) = domain_num($feat->[-1], $domains{$feat->[-1]});
+            $feat->[-1] = $label;
+
+            # index into @domain_list without the trailing "v" marker
+            (my $list_ix = $color_num) =~ s/v$//;
+
+            $feat->[-1] .= "{$domain_list[$list_ix]}" if ($acc_comment);
+
+            if ($bound_comment) {
+                $feat->[-1] .= $color_sep_str . $feat->[0] . ":" . $feat->[2];
+            }
+            elsif ($show_color) {
+                $feat->[-1] .= $color_sep_str . $color_num;
+            }
+        }
+
+        print join("\t", @{$feat}), "\n";
+    }
+}
+
+exit(0);
+
+# show_annots("header\tlength", $get_annot_sub)
+#
+# Parse one sequence header, work out which identifier to look up, and
+# return a hash reference { seq_info => header, list => features } where
+# the feature list is whatever $get_annot_sub->($identifier, $length)
+# returns.  $get_annot_sub defaults to get_pfam_www().
+#
+# Recognised headers:
+#   pf26|164|O57809|1A1D_PYRHO
+#   gi|1705556|sp|P54670.1|CAF1_DICDI
+#   sp|P09488|GSTM1_HUMAN          (also tr|, up|)
+#   SP:GSTM1_HUMAN P09488          (case-insensitive, also TR:)
+#   sp:CALM_HUMAN                  (no accession: the id is looked up)
+#   db|acc|id                      (any other '|'-separated header)
+#   P09488                         (bare accession)
+#
+# A header from which no identifier can be extracted produces a warning
+# and an empty feature list.
+sub show_annots {
+    my ($query_len, $get_annot_sub) = @_;
+    $get_annot_sub = \&get_pfam_www unless (ref($get_annot_sub) eq 'CODE');
+
+    my ($annot_line, $seq_len) = split(/\t/, $query_len);
+    $annot_line = '' unless defined($annot_line);
+
+    my %annot_data = (seq_info => $annot_line, list => []);
+
+    # these are file-level variables: clear them so that an unrecognised
+    # header cannot silently reuse the accession of the previous sequence
+    ($tmp, $gi, $sdb, $acc, $id) = ();
+    $use_acc = 1;
+
+    if ($annot_line =~ m/^pf26\|/) {
+        ($sdb, $gi, $acc, $id) = split(/\|/, $annot_line);
+    }
+    elsif ($annot_line =~ m/^gi\|/) {
+        ($tmp, $gi, $sdb, $acc, $id) = split(/\|/, $annot_line);
+    }
+    elsif ($annot_line =~ m/^(sp|tr|up)\|/) {
+        ($sdb, $acc, $id) = split(/\|/, $annot_line);
+    }
+    elsif ($annot_line =~ m/^(SP|TR):(\w+) (\w+)/i) {
+        ($sdb, $id, $acc) = (lc($1), $2, $3);
+    }
+    elsif ($annot_line =~ m/^(SP|TR):(\w+)/i) {
+        # only an identifier is available
+        ($sdb, $id, $acc) = (lc($1), $2, "");
+        $use_acc = 0;
+    }
+    elsif ($annot_line =~ m/\|/) {
+        # generic db|acc|id header
+        ($sdb, $acc, $id) = split(/\|/, $annot_line);
+    }
+    elsif ($annot_line !~ m/:/) {
+        ($acc) = split(/\s+/, $annot_line);
+    }
+
+    # remove version number from the accession
+    $acc =~ s/\.\d+$// if (defined($acc));
+
+    my $lookup = $use_acc ? $acc : $id;
+    unless (defined($lookup) && $lookup ne '') {
+        warn "*** $0 - cannot find an accession/id in: $annot_line\n";
+        return \%annot_data;
+    }
+
+    $annot_data{list} = $get_annot_sub->($lookup, $seq_len);
+
+    return \%annot_data;
+}
+
+# XML::Twig handler for <sequence>: store its "length" attribute in
+# $pf_seq_length.
+sub get_length {
+    my (undef, $seq_elt) = @_;
+    my $attrs = $seq_elt->{att};
+    $pf_seq_length = $attrs->{length};
+}
+
+# XML::Twig handler for <match>: append one record to @pf_domains holding
+# the attributes of the match merged with those of its <location> child
+# (location attributes win on a name clash).  No filtering on the match
+# type is done here.
+sub push_match {
+    my (undef, $match_elt) = @_;
+    my $match_attrs = $match_elt->{att};
+    my $loc_attrs = $match_elt->first_child('location')->{att};
+    my %domain = (%{$match_attrs}, %{$loc_attrs});
+    push @pf_domains, \%domain;
+}
+
+# XML::Twig handler for <hmm_details>: store its "model_length"
+# attribute in $pf_model_length.
+sub get_model_length {
+    my (undef, $hmm_elt) = @_;
+    my $attrs = $hmm_elt->{att};
+    $pf_model_length = $attrs->{model_length};
+}
+
+# XML::Twig handler for <clan_membership>: copy the "clan_acc" and
+# "clan_id" attributes into $clan_acc and $clan_id.
+sub get_clan {
+    my (undef, $clan_elt) = @_;
+    ($clan_acc, $clan_id) = @{ $clan_elt->{att} }{qw(clan_acc clan_id)};
+}
+
+# get_pfam_www($acc, $seq_length)
+#
+# Download the Pfam matches for protein $acc from the Pfam web service
+# ($loc) and return a reference to a list of features:
+#   [start, '-', end, label]   (default)
+#   [start, end, label]        (--lav)
+# When $seq_length is 0/undef the length from the downloaded <sequence>
+# element is used instead.
+#
+# Steps, in order: drop Pfam-B matches (unless --pfamB), look up model
+# length and clan for each family, clip domains to the sequence length,
+# resolve overlaps (--no-over), join split domains, add virtual domains
+# (--vdoms) and NODOM regions (--neg), and map labels with domain_name().
+#
+# If the protein record cannot be downloaded or parsed, a warning is
+# issued and an empty list is returned.
+sub get_pfam_www {
+    my ($acc, $seq_length) = @_;
+
+    $url = "protein/$acc?output=xml";
+
+    my $res = get($loc . $url);
+
+    # start from a clean slate so nothing leaks in from the previous protein
+    @pf_domains = ();
+    $pf_seq_length = 0;
+
+    # get() returns undef when the request fails
+    unless (defined($res)) {
+        warn "*** $0 - no Pfam data retrieved for $acc\n";
+        return [];
+    }
+
+    my $twig_dom = XML::Twig->new(twig_roots => {matches => 1, sequence => 1},
+                                  twig_handlers => {
+                                      'match' => \&push_match,
+                                      'sequence' => \&get_length,
+                                  },
+                                  pretty_print => 'indented');
+
+    # parse() dies on malformed XML (e.g. an HTML error page)
+    unless (eval { $twig_dom->parse($res); 1 }) {
+        warn "*** $0 - cannot parse Pfam data for $acc: $@";
+        @pf_domains = ();
+        return [];
+    }
+
+    if (!$seq_length || $seq_length == 0) {
+        $seq_length = $pf_seq_length;
+    }
+
+    @pf_domains = sort { $a->{start} <=> $b->{start} } @pf_domains;
+
+    unless ($pfamB) {
+        @pf_domains = grep { $_->{type} !~ m/Pfam-B/ } @pf_domains;
+    }
+
+    # for virtual domains, also need information about the families;
+    # each family is downloaded once and then served from %pfamA_fams
+    for my $curr_dom (@pf_domains) {
+
+        my $fam_acc = $curr_dom->{accession};
+
+        unless (exists($pfamA_fams{$fam_acc})) {
+            $url = "family/$fam_acc?output=xml";
+
+            # reset the handler targets so that a record without
+            # <hmm_details>/<clan_membership> does not inherit the
+            # values of the previous family
+            ($clan_acc, $clan_id) = ("","");
+            $pf_model_length = 0;
+
+            my $fam_res = get($loc . $url);
+
+            my $twig_fam = XML::Twig->new(twig_roots => {hmm_details => 1, clan_membership=> 1},
+                                          twig_handlers => {
+                                              'hmm_details' => \&get_model_length,
+                                              'clan_membership' => \&get_clan,
+                                          },
+                                          pretty_print => 'indented');
+
+            if (defined($fam_res) && eval { $twig_fam->parse($fam_res); 1 }) {
+                $pfamA_fams{$fam_acc} = { model_length => $pf_model_length, clan_acc=>$clan_acc, clan_id=>$clan_id};
+            }
+            else {
+                warn "*** $0 - no Pfam family data for $fam_acc\n";
+            }
+        }
+
+        # 0 means "unknown"; the join step below skips such domains
+        $curr_dom->{model_length} =
+            (exists($pfamA_fams{$fam_acc}) && $pfamA_fams{$fam_acc}->{model_length}) || 0;
+    }
+
+    # choose the label, drop domains that start beyond the sequence and
+    # clip those that run past its end
+
+    my @raw_pf_domains = @pf_domains;
+    @pf_domains = ();
+
+    for my $dom_ref (@raw_pf_domains) {
+        if ($pf_acc_flag) {
+            $dom_ref->{info} = $dom_ref->{accession};
+        }
+        else {
+            $dom_ref->{info} = $dom_ref->{id};
+        }
+        next if ($dom_ref->{start} >= $seq_length);
+        if ($dom_ref->{end} >= $seq_length) {
+            $dom_ref->{end} = $seq_length;
+        }
+        push @pf_domains, $dom_ref;
+    }
+
+    # check for domain overlap, and resolve it (possibly more than 2
+    # domains), choosing the domain with the best evalue
+
+    if($no_over && scalar(@pf_domains) > 1) {
+
+        my @tmp_domains = @pf_domains;
+        my @save_domains = ();
+
+        my $prev_dom = shift @tmp_domains;
+
+        while (my $curr_dom = shift @tmp_domains) {
+
+            my @overlap_domains = ($prev_dom);
+
+            my $diff = $prev_dom->{end} - $curr_dom->{start};
+            # check for overlap > domain_length/3
+
+            my ($prev_len, $cur_len) = ($prev_dom->{end}-$prev_dom->{start}+1, $curr_dom->{end}-$curr_dom->{start}+1);
+            my $inclusion = ((($curr_dom->{start} >= $prev_dom->{start}) && ($curr_dom->{end} <= $prev_dom->{end})) ||
+                             (($curr_dom->{start} <= $prev_dom->{start}) && ($curr_dom->{end} >= $prev_dom->{end})));
+
+            my $longer_len = ($prev_len > $cur_len) ? $prev_len : $cur_len;
+
+            # collect every following domain that is nested in / encloses
+            # $prev_dom or overlaps it by more than 1/3 of the longer one
+            while ($inclusion || ($diff > 0 && $diff > $longer_len/3)) {
+                push @overlap_domains, $curr_dom;
+                $curr_dom = shift @tmp_domains;
+                last unless $curr_dom;
+                $diff = $prev_dom->{end} - $curr_dom->{start};
+                ($prev_len, $cur_len) = ($prev_dom->{end}-$prev_dom->{start}+1, $curr_dom->{end}-$curr_dom->{start}+1);
+                $longer_len = ($prev_len > $cur_len) ? $prev_len : $cur_len;
+                $inclusion = ((($curr_dom->{start} >= $prev_dom->{start}) && ($curr_dom->{end} <= $prev_dom->{end})) ||
+                              (($curr_dom->{start} <= $prev_dom->{start}) && ($curr_dom->{end} >= $prev_dom->{end})));
+            }
+
+            # check for overlapping domains; >1 because $prev_dom is always there
+            if (scalar(@overlap_domains) > 1 ) {
+
+                for my $dom ( @overlap_domains) {
+                    $dom->{evalue} = 1.0 unless defined($dom->{evalue});
+                }
+
+                @overlap_domains = sort { $a->{evalue} <=> $b->{evalue} } @overlap_domains;
+                $prev_dom = $overlap_domains[0];
+            }
+
+            # $prev_dom should be the best of the overlaps, and we are no longer overlapping > dom_length/3
+            push @save_domains, $prev_dom;
+            $prev_dom = $curr_dom;
+        }
+        if ($prev_dom) {push @save_domains, $prev_dom;}
+
+        @pf_domains = @save_domains;
+
+        # now check for smaller overlaps: split the overlap between the neighbours
+        for (my $i=1; $i < scalar(@pf_domains); $i++) {
+            if ($pf_domains[$i-1]->{end} >= $pf_domains[$i]->{start}) {
+                my $overlap = $pf_domains[$i-1]->{end} - $pf_domains[$i]->{start};
+                $pf_domains[$i-1]->{end} -= int($overlap/2);
+                $pf_domains[$i]->{start} = $pf_domains[$i-1]->{end}+1;
+            }
+        }
+    }
+
+    # check for "split-domains" by looking for runs of the same domain
+    # that are ordered by model_start
+
+    if (scalar(@pf_domains) > 1) {
+        my @j_domains;          #joined domains
+        my @tmp_domains = @pf_domains;
+
+        my $prev_dom = shift(@tmp_domains);
+
+        for my $curr_dom (@tmp_domains) {
+            # to join domains:
+            # (1) the domains must be in order by model_start/end coordinates
+            # (3) joining the domains cannot make the total combination too long
+
+            # same family, and a known model length to divide by
+            if ($prev_dom->{accession} eq $curr_dom->{accession} && $curr_dom->{model_length}) {
+
+                my $prev_dom_len = $prev_dom->{hmm_end}-$prev_dom->{hmm_start}+1;
+                my $curr_dom_len = $curr_dom->{hmm_end}-$curr_dom->{hmm_start}+1;
+                my $prev_dom_fn = $prev_dom_len/$curr_dom->{model_length};
+                my $curr_dom_fn = $curr_dom_len/$curr_dom->{model_length};
+                my $missing_dom_fn = max(0,($curr_dom->{hmm_start} - $prev_dom->{hmm_end})/$curr_dom->{model_length});
+
+                # NOTE(review): the "limit overlap" pair below is joined
+                # with '||', which is true for any positive hmm_end; '&&'
+                # may have been intended -- confirm before changing.
+                if ( $prev_dom->{hmm_start} < $curr_dom->{hmm_start} # model check
+                     && $prev_dom->{hmm_end} < $curr_dom->{hmm_end}
+                     && ($curr_dom->{hmm_start} > $prev_dom->{hmm_end} * 0.80 # limit overlap
+                         || $curr_dom->{hmm_start} < $prev_dom->{hmm_end} * 1.25)
+                     && $prev_dom_fn + $curr_dom_fn < 1.33
+                     && $missing_dom_fn < min($prev_dom_fn,$curr_dom_fn)) { # join them by updating $prev_dom
+                    $prev_dom->{end} = $curr_dom->{end};
+                    $prev_dom->{hmm_end} = $curr_dom->{hmm_end};
+                    $prev_dom->{evalue} = ($prev_dom->{evalue} < $curr_dom->{evalue} ? $prev_dom->{evalue} : $curr_dom->{evalue});
+                    next;
+                }
+            }
+
+            # not joined: $prev_dom is complete
+            push @j_domains, $prev_dom;
+            $prev_dom = $curr_dom;
+        }
+        push @j_domains, $prev_dom;
+        @pf_domains = @j_domains;
+    }
+
+    # $vdoms -- virtual Pfam domains -- the equivalent of $neg_doms,
+    # but covering parts of a Pfam model that are not annotated.  split
+    # domains have been joined, so simply check beginning and end of
+    # each domain (but must also check for bounded-ness)
+    # only add when the missing length > $min_vdom
+
+    if ($vdoms && scalar(@pf_domains)) {
+        my @vpf_domains;
+
+        my $curr_dom = $pf_domains[0];
+        my $length = $curr_dom->{model_length};
+
+        my $prev_dom={end=>0, accession=>''};
+        my $prev_dom_end = 0;
+        my $next_dom_start = $length+1;
+
+        for (my $dom_ix=0; $dom_ix < scalar(@pf_domains); $dom_ix++ ) {
+            $curr_dom = $pf_domains[$dom_ix];
+
+            my $pfamA = $curr_dom->{accession};
+
+            # first, look left, is there a domain there (if there is,
+            # it should be updated right)
+
+            if ($prev_dom->{accession}) { # look for previous domain
+                $prev_dom_end = $prev_dom->{end};
+            }
+
+            # how much room is available to the left?
+            my $left_dom_len = min($curr_dom->{start}-$prev_dom_end-1, $curr_dom->{hmm_start}-1);
+            if ( $left_dom_len > $min_vdom) {
+                # there is room for a virtual domain
+                # NOTE(review): hmm_start is set to the length of the gap
+                # rather than a model coordinate, and the label uses the
+                # accession even when real domains are labelled by id --
+                # confirm both are intended.
+                my %new_dom = (start=> $curr_dom->{start}-$left_dom_len,
+                               end => $curr_dom->{start}-1,
+                               info=>'@'.$curr_dom->{accession},
+                               model_length=> $curr_dom->{model_length},
+                               hmm_end => $curr_dom->{hmm_start}-1,
+                               hmm_start => $left_dom_len,
+                               accession=>$pfamA,
+                    );
+                push @vpf_domains, \%new_dom;
+            }
+
+            # save the current domain
+            push @vpf_domains, $curr_dom;
+            $prev_dom = $curr_dom;
+
+            if ($dom_ix < $#pf_domains) { # there is a domain to the right
+                # first, give all the extra space to the first domain (no splitting)
+                $next_dom_start = $pf_domains[$dom_ix+1]->{start};
+            }
+            else {
+                # NOTE(review): $seq_length (not $seq_length+1) leaves the
+                # last residue out of the available space -- confirm.
+                $next_dom_start = $seq_length;
+            }
+
+            # is there room for a virtual domain right
+
+            my $right_dom_len = min($next_dom_start-$curr_dom->{end}-1, # space available
+                                    $curr_dom->{model_length}-$curr_dom->{hmm_end} # space needed
+                );
+            if ( $right_dom_len > $min_vdom) {
+                my %new_dom = (start=> $curr_dom->{end}+1,
+                               end=> $curr_dom->{end}+$right_dom_len,
+                               info=>'@'.$pfamA,
+                               model_length => $curr_dom->{model_length},
+                               accession=> $pfamA,
+                    );
+                push @vpf_domains, \%new_dom;
+                $prev_dom = \%new_dom;
+            }
+        }                       # all done, check for last one
+        # @vpf_domains has both old @pf_domains and new virtual-domains
+        @pf_domains = @vpf_domains;
+    }
+
+    # --neg: label gaps longer than $min_nodom between domains (and after
+    # the last one) as NODOM
+    if ($neg_doms) {
+        my @npf_domains;
+        my $prev_dom={end=>0};
+        for my $curr_dom ( @pf_domains) {
+            if ($curr_dom->{start} - $prev_dom->{end} > $min_nodom) {
+                my %new_dom = (start=>$prev_dom->{end}+1, end => $curr_dom->{start}-1, info=>'NODOM');
+                push @npf_domains, \%new_dom;
+            }
+            push @npf_domains, $curr_dom;
+            $prev_dom = $curr_dom;
+        }
+        if ($seq_length - $prev_dom->{end} > $min_nodom) {
+            my %new_dom = (start=>$prev_dom->{end}+1, end=>$seq_length, info=>'NODOM');
+            if ($new_dom{end} > $new_dom{start}) {push @npf_domains, \%new_dom;}
+        }
+
+        # @npf_domains has both old @pf_domains and new neg-domains
+        @pf_domains = @npf_domains;
+    }
+
+    # now make sure we have useful names: colors
+
+    for my $pf (@pf_domains) {
+        $pf->{info} = domain_name($pf->{info}, $acc, $pf->{accession});
+    }
+
+    my @feats = ();
+    for my $d_ref (@pf_domains) {
+        if ($lav) {
+            push @feats, [$d_ref->{start}, $d_ref->{end}, $d_ref->{info}];
+        }
+        else {
+            push @feats, [$d_ref->{start}, '-', $d_ref->{end}, $d_ref->{info} ];
+        }
+    }
+
+    return \@feats;
+}
+
+# domain_name($value, $seq_id, $pf_acc)
+#
+# Register the domain label $value (a Pfam id or accession, with a
+# leading '@' for a virtual domain) and return the label to print.
+#   $seq_id  - sequence identifier, only used in the warning message
+#   $pf_acc  - Pfam accession of the family, recorded in @domain_list and
+#              used as the key into %pfamA_fams
+#
+# Every new label gets the next color number in %domains.  Unless
+# --no-clans is set, a family that belongs to a clan is renamed to
+# "C.<clan_id>" (or "C.<clan_acc>" with --pfacc) so that all families of
+# one clan share a label and a color.  Clan membership is taken from
+# %pfamA_fams when available, otherwise it is downloaded from $loc.
+#
+# Returns "" (after a warning) when $value is undefined.
+
+sub domain_name {
+
+    my ($value, $seq_id, $pf_acc) = @_;
+
+    # this has to come before $value is used in a pattern match
+    unless (defined($value)) {
+        warn "missing domain name for $seq_id";
+        return "";
+    }
+
+    # strip the virtual-domain marker; it is put back at the end
+    my $is_virtual = ($value =~ s/^@//) ? 1 : 0;
+
+    if ($no_clans) {
+        if (! defined($domains{$value})) {
+            $domain_clan{$value} = 0;
+            $domains{$value} = ++$domain_cnt;
+            push @domain_list, $pf_acc;
+        }
+    }
+    elsif (!defined($domain_clan{$value})) {
+        ## only do this for new domains, old domains have known mappings
+
+        ## ways to highlight the same domain:
+        # (1) for clans, substitute clan name for family name
+        # (2) for clans, use the same color for the same clan, but don't change the name
+        # (3) for clans, combine family name with clan name, but use colors based on clan
+
+        # return the clan name, identifier if a clan member
+        if (!defined($pfamA_fams{$pf_acc})) {
+
+            my $url = "family/$value?output=xml";
+
+            my $res = get($loc . $url);
+
+            my $twig_clan = XML::Twig->new(twig_roots => {'clan_membership'=>1},
+                                           twig_handlers => {
+                                               'clan_membership' => \&get_clan,
+                                           },
+                                           pretty_print => 'indented');
+
+            # make certain to reinitialize
+            ($clan_acc, $clan_id) = ("","");
+            my $xml = $twig_clan->parse($res);
+        }
+        else {
+            ($clan_acc, $clan_id) = @{$pfamA_fams{$pf_acc}}{qw(clan_acc clan_id)};
+        }
+
+        if ($clan_acc) {
+            my $c_value = "C." . $clan_id;
+            if ($pf_acc_flag) {$c_value = "C." . $clan_acc;}
+
+            $domain_clan{$value} = {clan_id => $clan_id,
+                                    clan_acc => $clan_acc};
+
+            if ($domains{$c_value}) {
+                # clan already has a color: reuse it
+                $domain_clan{$value}->{domain_cnt} = $domains{$c_value};
+                $value = $c_value;
+            }
+            else {
+                $domain_clan{$value}->{domain_cnt} = ++ $domain_cnt;
+                $value = $c_value;
+                $domains{$value} = $domain_cnt;
+                push @domain_list, $pf_acc;
+            }
+        }
+        else {
+            # family without a clan
+            $domain_clan{$value} = 0;
+            $domains{$value} = ++$domain_cnt;
+            push @domain_list, $pf_acc;
+        }
+    }
+    elsif ($domain_clan{$value} && $domain_clan{$value}->{clan_acc}) {
+        # known family that belongs to a clan
+        if ($pf_acc_flag) {$value = "C." . $domain_clan{$value}->{clan_acc};}
+        else { $value = "C." . $domain_clan{$value}->{clan_id}; }
+    }
+
+    if ($is_virtual) {
+        # the virtual label shares the color number of the real one
+        $domains{'@'.$value} = $domains{$value};
+        $value = '@'.$value;
+    }
+    return $value;
+}
+
+# For a virtual domain (label starting with '@') return the label with
+# the '@' replaced by 'v' and the number with a "v" appended; any other
+# label/number pair is returned unchanged.
+sub domain_num {
+    my ($label, $color_num) = @_;
+
+    return ($label, $color_num) unless ($label =~ s/^@/v/);
+    return ($label, $color_num . "v");
+}
+
+# Smaller of two numbers; the first one is returned on a tie.
+sub min {
+    my ($lhs, $rhs) = @_;
+
+    return $lhs if ($lhs <= $rhs);
+    return $rhs;
+}
+
+# Larger of two numbers; the first one is returned on a tie.
+sub max {
+    my ($lhs, $rhs) = @_;
+
+    return $lhs if ($lhs >= $rhs);
+    return $rhs;
+}
+
+__END__
+
+=pod
+
+=head1 NAME
+
+ann_pfam_www.pl
+
+=head1 SYNOPSIS
+
+ ann_pfam_www_e.pl --neg-doms 'sp|P09488|GSTM1_HUMAN' | accession.file
+
+=head1 OPTIONS
+
+ -h short help
+ --help include description
+
+ --lav produce lav2plt.pl annotation format, only show domains/repeats
+ --neg-doms : report domains between annotated domains as NODOM
+ (also --neg, --neg_doms)
+ --no-over : generate non-overlapping domains (equivalent to ann_pfam_www.pl)
+ --no-clans : do not use clans with multiple families from same clan
+ --min_nodom=10 : minimum length between domains for NODOM
+
+=head1 DESCRIPTION
+
+C<ann_pfam_www_e.pl> extracts domain information from the Pfam www site
+(pfam.xfam.org). Currently, the program works with database
+sequence descriptions in several formats:
+
+ >gi|1705556|sp|P54670.1|CAF1_DICDI
+ >sp|P09488|GSTM1_HUMAN
+ >sp:CALM_HUMAN
+
+C<ann_pfam_www_e.pl> uses the Pfam RESTful WWW interface
+(C<pfam.xfam.org/help#tabview=10>) to download domain
+names/locations/score. C<ann_pfam_www_e.pl> is an alternative to
+C<ann_pfam_e.pl> that does not require a MySQL instance with a Pfam
+database installation.
+
+If the "--no-over" option is set, overlapping domains are selected and
+edited to remove overlaps. For proteins with multiple overlapping
+domains (domains overlap by more than 1/3 of the domain length),
+C<ann_pfam_www_e.pl> selects the domain annotation with the best
+C<domain_evalue_score>. When domains overlap by less than 1/3 of the
+domain length, they are shortened to remove the overlap.
+
+C<ann_pfam_www_e.pl> is designed to be used by the B<FASTA> programs
+with the C<-V \!ann_pfam_www_e.pl> or C<-V "\!ann_pfam_www_e.pl --neg">
+option.
+
+=head1 AUTHOR
+
+William R. Pearson, wrp at virginia.edu
+
+=cut
diff --git a/scripts/ann_script_list b/scripts/ann_script_list
new file mode 100644
index 0000000..fc99c5e
--- /dev/null
+++ b/scripts/ann_script_list
@@ -0,0 +1,9 @@
+ann_exons_ens.pl
+ann_exons_up_www.pl
+ann_feats2ipr.pl
+ann_feats_up_sql.pl
+ann_feats_up_www2.pl
+ann_ipr_www.pl
+ann_pfam30.pl
+ann_pfam_www.pl
+ann_upfeats_pfam_www_e.pl
diff --git a/scripts/ann_upfeats_pfam_www_e.pl b/scripts/ann_upfeats_pfam_www_e.pl
new file mode 100755
index 0000000..f82ad0a
--- /dev/null
+++ b/scripts/ann_upfeats_pfam_www_e.pl
@@ -0,0 +1,801 @@
+#!/usr/bin/perl -w
+
+################################################################
+# copyright (c) 2014 by William R. Pearson and The Rector &
+# Visitors of the University of Virginia */
+################################################################
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under this License is distributed on an "AS
+# IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+################################################################
+
+# ann_upfeats_pfam_www.pl gets an annotation file from fasta36 -V with a line of the form:
+
+# SP:GSTM1_HUMAN P09488 218
+#
+# it must:
+# (1) read in the line
+# (2) parse it to get the up_acc
+# (3) return the tab delimited features
+#
+
+# this version can read feature2 uniprot features (acc/pos/end/label/value), but returns sorted start/end domains
+
+use strict;
+
+use Getopt::Long;
+use Pod::Usage;
+use LWP::Simple;
+use XML::Twig;
+use JSON qw(decode_json);
+## use IO::String;
+
+# base URL of the feature service queried with "$up_base/<acc>.json"
+my $up_base = 'http://www.ebi.ac.uk/proteins/api/features';
+
+# domain label => color number
+my %domains = ();
+my $domain_cnt = 0;
+
+# boolean command-line switches, all off by default
+my ($lav, $neg_doms, $no_doms, $no_feats, $no_over, $shelp, $help, $no_vars) = (0,0,0,0,0,0,0,0);
+my ($auto_reg, $vdoms, $no_clans, $pf_acc_flag, $acc_comment, $bound_comment) = (0, 0, 0, 0, 0, 0);
+
+# text placed between a domain label and its color number or boundaries
+my $color_sep_str = '~';
+
+my ($min_nodom, $min_vdom) = (10,10);
+
+# Switches accept '-' and '_' spellings.  --no_over and --no-clans are
+# included so that $no_over has both spellings like the other switches
+# and $no_clans, which is declared above, can actually be set.
+GetOptions(
+    "lav" => \$lav,
+    "acc_comment" => \$acc_comment,
+    "bound_comment" => \$bound_comment,
+    "no-over|no_over" => \$no_over,
+    "no-clans|no_clans" => \$no_clans,
+    "no_doms|no-doms|nodoms" => \$no_doms,
+    "neg" => \$neg_doms,
+    "neg_doms|neg-doms|negdoms" => \$neg_doms,
+    "min_nodom=i" => \$min_nodom,
+    "no_feats|no-feats|nofeats" => \$no_feats,
+    "no-vars|no_vars" => \$no_vars,
+    "vdoms" => \$vdoms,
+    "v_doms" => \$vdoms,
+    "pfacc" => \$pf_acc_flag,
+    "pfam_acc" => \$pf_acc_flag,
+    "acc" => \$pf_acc_flag,
+    "h|?" => \$shelp,
+    "help" => \$help,
+    );
+
+pod2usage(1) if $shelp;
+pod2usage(exitstatus => 0, verbose => 2) if $help;
+# nothing to annotate: no argument and no redirected/piped input
+pod2usage(1) unless (@ARGV || -f STDIN || -p STDIN);
+
+# feature types that are reported, the symbol printed for each one, and
+# (parallel list) a display name
+my @feat_keys = qw( ACT_SITE MOD_RES BINDING METAL SITE );
+my @feat_vals = ( '=','*','#','^','@');
+my @feat_names = ('Active site', 'Modified', 'Binding', 'Metal binding', 'Site');
+
+# variants and mutagenesis sites share the symbol 'V' (unless --no-vars)
+unless ($no_vars) {
+    push @feat_keys, qw(MUTAGEN VARIANT);
+    push @feat_vals, ('V','V',);
+    push @feat_names, ("","",);
+}
+
+# text prefixed to the description of a site; '' means description only
+my %feats_text = ();
+@feats_text{@feat_keys} = ('Active site', '', 'Substrate binding', 'Metal binding', 'Site', '','');
+
+# text used in the "=symbol:label" legend; '' means no legend line
+my %feats_label;
+@feats_label{@feat_keys} = ('Active site', 'Modified', 'Substrate binding', 'Metal binding', 'Site', '','');
+
+# my @feat_vals = ( '=','*','#','^','@','V','V');
+
+
+my @dom_keys = qw( DOMAIN REPEAT );
+my @dom_vals = ( [ '[', ']'],[ '[', ']']);
+
+my @ssr_keys = qw(STRAND HELIX);
+my @ssr_vals = ( [ '[', ']']);
+
+# feature type => symbol, for the types that are to be reported
+my %annot_types = ();
+
+# from ann_pfam_www_e.pl
+my %domain_clan = (NODOM => {clan_id => 'NODOM', clan_acc=>0, domain_cnt=>0});
+my @domain_list = (0);
+
+my $loc="http://pfam.xfam.org/";
+my $pf_url;
+
+# state filled in by the XML::Twig handlers
+my @pf_domains;
+my %pfamA_fams = ();
+my ($pf_seq_length, $pf_model_length)=(0,0);
+my ($clan_acc, $clan_id) = ("","");
+
+my $get_domain_sub = \&get_pfam_annots;
+
+# --lav output carries no site features
+if ($lav) {
+    $no_feats = 1;
+}
+
+@annot_types{@feat_keys} = @feat_vals unless ($no_feats);
+
+if ($neg_doms) {
+    $domains{'NODOM'}=0;
+}
+
+# fields parsed from the current sequence header
+my ($tmp, $gi, $sdb, $acc, $id, $use_acc);
+
+# print the legend: one "=symbol:label" line per labelled feature type
+unless ($no_feats) {
+    for my $i ( 0 .. $#feat_keys) {
+        next unless $feats_label{$feat_keys[$i]};
+        print "=",$feat_vals[$i],":",$feats_label{$feat_keys[$i]},"\n";
+    }
+}
+
+# get the query
+my ($query, $seq_len) = @ARGV;
+$seq_len = 0 unless defined($seq_len);
+
+$query =~ s/^>// if ($query);
+
+my @annots = ();
+
+# The first argument is a query when it
+#  - contains '|' or ':' (database identifier),
+#  - is a bare UniProt accession, optionally with a ".version" (the
+#    alternation is grouped so that '^' and the terminator apply to both
+#    accession forms), or
+#  - starts like a RefSeq accession: NP_/XP_/NM_/XM_ followed by digits
+#    (character classes; the former groups (XN)(MP) only matched the
+#    literal text "XNMP_").
+# Otherwise the arguments (or STDIN) are read as files with one sequence
+# header per line.
+
+unless ($query && ($query =~ m/[\|:]/
+                   || $query =~ m/^(?:[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})(?:\.\d+)?(?:\s|$)/
+                   || $query =~ m/^[XN][MP]_\d+/)) {
+
+    while (my $a_line = <>) {
+        $a_line =~ s/^>//;
+        chomp $a_line;
+        push @annots, upfeats_pfam_www($a_line, \&up_json_annots, \&get_pfam_www);
+    }
+} else {
+    push @annots, upfeats_pfam_www("$query\t$seq_len", \&up_json_annots, \&get_pfam_www);
+}
+
+# Print one ">header" line per sequence, followed by its annotations as
+# tab-separated lines.  Unless --lav is set, labels known to %domains are
+# rewritten by domain_num() and get the color number appended.
+for my $seq_annot (@annots) {
+    print ">",$seq_annot->{seq_info},"\n";
+    for my $annot (@{$seq_annot->{list}}) {
+        if (!$lav && defined($domains{$annot->[-1]})) {
+            my ($a_name, $a_num) = domain_num($annot->[-1],$domains{$annot->[-1]});
+            $annot->[-1] = $a_name;
+            if ($acc_comment) {
+                # presumably domain_num() marks virtual domains with a
+                # trailing "v", as ann_pfam_www.pl does; drop it before
+                # using the number as an index into @domain_list
+                (my $list_ix = $a_num) =~ s/v$//;
+                $annot->[-1] .= "{$domain_list[$list_ix]}";
+            }
+            if ($bound_comment) {
+                $annot->[-1] .= $color_sep_str.$annot->[0].":".$annot->[2];
+            }
+            $annot->[-1] .= $color_sep_str.$a_num;
+        }
+        print join("\t",@$annot),"\n";
+    }
+}
+
+exit(0);
+
+# upfeats_pfam_www("header\tlength", $get_upfeats_sub, $get_pfam_sub)
+#
+# Collect the annotations of one sequence and return a hash reference
+# { seq_info => header, list => [features sorted by position] }.
+#   $get_upfeats_sub - called as ($annot_types_ref, $json_text, $length)
+#                      on the text downloaded from "$up_base/<acc>.json";
+#                      defaults to up_json_annots()
+#   $get_pfam_sub    - called as ($acc_or_id, $length); its return list
+#                      is added to the features
+#
+# Site features are only requested when the header yields something
+# shaped like a UniProt accession.  A header without accession (and
+# without a usable id) gets a warning and no domain lookup.
+sub upfeats_pfam_www {
+    my ($query_len, $get_upfeats_sub, $get_pfam_sub) = @_;
+    $get_upfeats_sub = \&up_json_annots unless (ref($get_upfeats_sub) eq 'CODE');
+
+    my ($annot_line, $seq_len) = split(/\t/,$query_len);
+    $annot_line = '' unless defined($annot_line);
+
+    my %annot_data = (seq_info=>$annot_line, list=>[]);
+
+    # these are file-level variables: clear them so that a header without
+    # accession cannot silently reuse the one of the previous sequence
+    ($tmp, $gi, $sdb, $acc, $id) = ();
+    $use_acc = 1;
+
+    if ($annot_line =~ m/^gi\|/) {
+        ($tmp, $gi, $sdb, $acc, $id) = split(/\|/,$annot_line);
+    } elsif ($annot_line =~ m/^(SP|TR):(\w+)\s(\w+)/) {
+        ($sdb, $id, $acc) = (lc($1), $2, $3);
+    } elsif ($annot_line =~ m/^(SP|TR):(\w+)/) {
+        ($sdb, $id, $acc) = (lc($1), $2, '');
+        warn("*** $0 - accession required: $annot_line");
+        $use_acc = 0;
+    } elsif ($annot_line =~ m/^(UR\d{3}:UniRef\d{2})_(\w+)/) {
+        # no accession is derived from a UniRef header
+        $sdb = lc($1);
+        $id = $2;
+    } elsif ($annot_line =~ m/\|/) {
+        ($sdb, $acc, $id) = split(/\|/,$annot_line);
+    }
+    else {
+        ($acc) = split(/\s+/,$annot_line);
+    }
+
+    # remove version number
+    $acc =~ s/\.\d+// if ($acc);
+
+    my @annots = ();
+
+    # site features, only for UniProt-style accessions
+    if ($acc && ($acc =~ m/^[A-Z][0-9][A-Z0-9]{3}[0-9]/)) {
+        my $lwp_features = get("$up_base/$acc.json");
+        if ($lwp_features && ($lwp_features !~ /ERROR/)) {
+            push @annots, $get_upfeats_sub->(\%annot_types, $lwp_features, $seq_len);
+        }
+    }
+
+    # domains
+    $acc =~ s/\.\d+$// if ($use_acc && defined($acc));
+    my $lookup = $use_acc ? $acc : $id;
+
+    if (defined($lookup) && $lookup ne '') {
+        push @annots, $get_pfam_sub->($lookup, $seq_len);
+    }
+    else {
+        warn "*** $0 - cannot find an accession/id in: $annot_line\n";
+    }
+
+    @annots = sort { $a->[0] <=> $b->[0] } @annots;
+    $annot_data{list} = \@annots;
+
+    return \%annot_data;
+}
+
+# up_json_annots($annot_types, $annot_data, $seq_len)
+#
+# Extract site annotations from a JSON feature document (the text that
+# was downloaded from "$up_base/<acc>.json").
+#   $annot_types - hash ref: feature type => symbol; other types are skipped
+#   $annot_data  - JSON text with a "features" list whose entries have
+#                  type, begin, end, description (and alternativeSequence)
+#   $seq_len     - accepted for interface compatibility, not used
+#
+# Returns a list of [position, symbol, residue-or-"-", text]:
+#   VARIANT/MUTAGEN  -> residue is the alternativeSequence, text the description
+#   other types      -> only single-position features (begin == end); the
+#                       text is "<%feats_text entry>: <description>" when
+#                       the type has a %feats_text entry, else the description
+#
+# Text that cannot be decoded produces a warning and an empty list.
+sub up_json_annots {
+    my ($annot_types, $annot_data, $seq_len) = @_;
+
+    # decode_json() dies on malformed input
+    my $json_ref = eval { decode_json($annot_data) };
+    unless (ref($json_ref) eq 'HASH') {
+        warn "*** $0 - cannot decode feature data\n";
+        return ();
+    }
+
+    my @sites = ();             # sites with one position
+
+    for my $feat ( @{$json_ref->{features} || []} ) {
+        next unless ($annot_types->{$feat->{type}});
+
+        my ($label, $pos, $end, $value) = @{$feat}{qw(type begin end description)};
+
+        if ($label =~ m/VARIANT/ || $label =~ m/MUTAGEN/) {
+            push @sites, [$pos, $annot_types->{$label}, $feat->{alternativeSequence}, $value];
+        }
+        else {
+            next unless ($pos == $end);
+            if ($feats_text{$label}) {
+                my $info = $feats_text{$label};
+                if ($value) {
+                    $info .= ": $value";
+                }
+                push @sites, [$pos, $annot_types->{$label}, "-", $info];
+            } else {
+                push @sites, [$pos, $annot_types->{$label}, "-", $value];
+            }
+        }
+    }
+
+    return @sites;
+}
+
+# XML::Twig handler for <sequence>: store its "length" attribute in
+# $pf_seq_length.
+sub get_length {
+    my (undef, $seq_elt) = @_;
+    my $attrs = $seq_elt->{att};
+    $pf_seq_length = $attrs->{length};
+}
+
+# XML::Twig handler for <match>: append one record to @pf_domains holding
+# the attributes of the match merged with those of its <location> child
+# (location attributes win on a name clash).  No filtering on the match
+# type is done here.
+sub push_match {
+    my (undef, $match_elt) = @_;
+    my $match_attrs = $match_elt->{att};
+    my $loc_attrs = $match_elt->first_child('location')->{att};
+    my %domain = (%{$match_attrs}, %{$loc_attrs});
+    push @pf_domains, \%domain;
+}
+
+# XML::Twig handler for <hmm_details>: store its "model_length"
+# attribute in $pf_model_length.
+sub get_model_length {
+    my (undef, $hmm_elt) = @_;
+    my $attrs = $hmm_elt->{att};
+    $pf_model_length = $attrs->{model_length};
+}
+
+# XML::Twig handler for <clan_membership>: copy the "clan_acc" and
+# "clan_id" attributes into $clan_acc and $clan_id.
+sub get_clan {
+    my (undef, $clan_elt) = @_;
+    ($clan_acc, $clan_id) = @{ $clan_elt->{att} }{qw(clan_acc clan_id)};
+}
+
+sub get_pfam_www {
+ my ($acc, $seq_length) = @_;
+
+# if ($acc =~ m/_/) {$url = "protein?id=$acc&output=xml"; }
+# else {$url = "protein/$acc?output=xml"; }
+
+ $pf_url = "protein/$acc?output=xml";
+
+ my $res = get($loc . $pf_url);
+
+ @pf_domains = ();
+
+ my $twig_dom = XML::Twig->new(twig_roots => {matches => 1, sequence => 1},
+# start_tag_handlers => {
+# 'sequence' => \&get_length,
+# },
+ twig_handlers => {
+ 'match' => \&push_match,
+ 'sequence' => \&get_length,
+ },
+ pretty_print => 'indented');
+ my $xml = $twig_dom->parse($res);
+
+ if (!$seq_length || $seq_length == 0) {
+ $seq_length = $pf_seq_length;
+ }
+
+ @pf_domains = sort { $a->{start} <=> $b->{start} } @pf_domains;
+
+ # to look for possible joining, need model_length
+ for my $curr_dom (@pf_domains) {
+
+ my $acc = $curr_dom->{accession};
+ $pf_url = "family/$acc?output=xml";
+
+ my $res = get($loc . $pf_url);
+
+ my $twig_fam = XML::Twig->new(twig_roots => {hmm_details => 1, clan_membership=> 1},
+ twig_handlers => {
+ 'hmm_details' => \&get_model_length,
+ 'clan_membership' => \&get_clan,
+ },
+ pretty_print => 'indented');
+
+ ($clan_acc, $clan_id) = ("","");
+ my $fam_xml = $twig_fam->parse($res);
+
+ $pfamA_fams{$acc} = { model_length => $pf_model_length, clan_acc=>$clan_acc, clan_id=>$clan_id};
+ $curr_dom->{model_length} = $pf_model_length;
+ }
+
+ # check for domain overlap, and resolve check for domain overlap
+ # (possibly more than 2 domains), choosing the domain with the best
+ # evalue
+
+ my @raw_pf_domains = @pf_domains;
+ @pf_domains = ();
+
+ for my $dom_ref (@raw_pf_domains) {
+ if ($pf_acc_flag) {
+ $dom_ref->{info} = $dom_ref->{accession};
+ }
+ else {
+ $dom_ref->{info} = $dom_ref->{id};
+ }
+ next if ($dom_ref->{start} >= $seq_length);
+ if ($dom_ref->{end} >= $seq_length) {
+ $dom_ref->{end} = $seq_length;
+ }
+ push @pf_domains, $dom_ref;
+ }
+
+ if($no_over && scalar(@pf_domains) > 1) {
+
+ my @tmp_domains = @pf_domains;
+ my @save_domains = ();
+
+ my $prev_dom = shift @tmp_domains;
+
+ while (my $curr_dom = shift @tmp_domains) {
+
+ my @overlap_domains = ($prev_dom);
+
+ my $diff = $prev_dom->{end} - $curr_dom->{start};
+ # check for overlap > domain_length/3
+
+ my ($prev_len, $cur_len) = ($prev_dom->{end}-$prev_dom->{start}+1, $curr_dom->{end}-$curr_dom->{start}+1);
+ my $inclusion = ((($curr_dom->{start} >= $prev_dom->{start}) && ($curr_dom->{end} <= $prev_dom->{end})) ||
+ (($curr_dom->{start} <= $prev_dom->{start}) && ($curr_dom->{end} >= $prev_dom->{end})));
+
+ my $longer_len = ($prev_len > $cur_len) ? $prev_len : $cur_len;
+
+ while ($inclusion || ($diff > 0 && $diff > $longer_len/3)) {
+ push @overlap_domains, $curr_dom;
+ $curr_dom = shift @tmp_domains;
+ last unless $curr_dom;
+ $diff = $prev_dom->{end} - $curr_dom->{start};
+ ($prev_len, $cur_len) = ($prev_dom->{end}-$prev_dom->{start}+1, $curr_dom->{end}-$curr_dom->{start}+1);
+ $longer_len = ($prev_len > $cur_len) ? $prev_len : $cur_len;
+ $inclusion = ((($curr_dom->{start} >= $prev_dom->{start}) && ($curr_dom->{end} <= $prev_dom->{end})) ||
+ (($curr_dom->{start} <= $prev_dom->{start}) && ($curr_dom->{end} >= $prev_dom->{end})));
+ }
+
+ # check for overlapping domains; >1 because $prev_dom is always there
+ if (scalar(@overlap_domains) > 1 ) {
+ # if $rpd2_fams, check for a chosen one
+
+ for my $dom ( @overlap_domains) {
+ $dom->{evalue} = 1.0 unless defined($dom->{evalue});
+ }
+
+ @overlap_domains = sort { $a->{evalue} <=> $b->{evalue} } @overlap_domains;
+ $prev_dom = $overlap_domains[0];
+ }
+
+ # $prev_dom should be the best of the overlaps, and we are no longer overlapping > dom_length/3
+ push @save_domains, $prev_dom;
+ $prev_dom = $curr_dom;
+ }
+ if ($prev_dom) {push @save_domains, $prev_dom;}
+
+ @pf_domains = @save_domains;
+
+ # now check for smaller overlaps
+ for (my $i=1; $i < scalar(@pf_domains); $i++) {
+ if ($pf_domains[$i-1]->{end} >= $pf_domains[$i]->{start}) {
+ my $overlap = $pf_domains[$i-1]->{end} - $pf_domains[$i]->{start};
+ $pf_domains[$i-1]->{end} -= int($overlap/2);
+ $pf_domains[$i]->{start} = $pf_domains[$i-1]->{end}+1;
+ }
+ }
+ }
+
+ # before checking for domain overlap, check for "split-domains"
+ # (self-unbound) by looking for runs of the same domain that are
+ # ordered by model_start
+
+ if (scalar(@pf_domains) > 1) {
+ my @j_domains; #joined domains
+ my @tmp_domains = @pf_domains;
+
+ my $prev_dom = shift(@tmp_domains);
+
+ for my $curr_dom (@tmp_domains) {
+ # to join domains:
+ # (1) the domains must be in order by model_start/end coordinates
+ # (3) joining the domains cannot make the total combination too long
+
+ # check for model and sequence consistency
+ if ($prev_dom->{accession} eq $curr_dom->{accession}) { # same family
+
+ my $prev_dom_len = $prev_dom->{hmm_end}-$prev_dom->{hmm_start}+1;
+ my $curr_dom_len = $curr_dom->{hmm_end}-$curr_dom->{hmm_start}+1;
+ my $prev_dom_fn = $prev_dom_len/$curr_dom->{model_length};
+ my $curr_dom_fn = $curr_dom_len/$curr_dom->{model_length};
+ my $missing_dom_fn = max(0,($curr_dom->{hmm_start} - $prev_dom->{hmm_end})/$curr_dom->{model_length});
+
+ if ( $prev_dom->{hmm_start} < $curr_dom->{hmm_start} # model check
+ && $prev_dom->{hmm_end} < $curr_dom->{hmm_end}
+ && ($curr_dom->{hmm_start} > $prev_dom->{hmm_end} * 0.80 # limit overlap
+ || $curr_dom->{hmm_start} < $prev_dom->{hmm_end} * 1.25)
+ && $prev_dom_fn + $curr_dom_fn < 1.33
+ && $missing_dom_fn < min($prev_dom_fn,$curr_dom_fn)) { # join them by updating $prev_dom
+ $prev_dom->{end} = $curr_dom->{end};
+ $prev_dom->{hmm_end} = $curr_dom->{hmm_end};
+ $prev_dom->{evalue} = ($prev_dom->{evalue} < $curr_dom->{evalue} ? $prev_dom->{evalue} : $curr_dom->{evalue});
+ next;
+ }
+
+ push @j_domains, $prev_dom;
+ $prev_dom = $curr_dom;
+ }
+ else {
+ push @j_domains, $prev_dom;
+ $prev_dom = $curr_dom;
+ }
+ }
+ push @j_domains, $prev_dom;
+ @pf_domains = @j_domains;
+ }
+
+ # $vdoms -- virtual Pfam domains -- the equivalent of $neg_doms,
+ # but covering parts of a Pfam model that are not annotated. split
+ # domains have been joined, so simply check beginning and end of
+ # each domain (but must also check for bounded-ness)
+ # only add when 10% or more is missing and missing length > $min_nodom
+
+ if ($vdoms) {
+ my @vpf_domains;
+
+ my $curr_dom = $pf_domains[0];
+
+ my $prev_dom={end=>0, accession=>''};
+ my $prev_dom_end = 0;
+ my $next_dom_start = $seq_length+1;
+
+ for (my $dom_ix=0; $dom_ix < scalar(@pf_domains); $dom_ix++ ) {
+ $curr_dom = $pf_domains[$dom_ix];
+
+ my $pfamA = $curr_dom->{accession};
+
+ # first, look left, is there a domain there (if there is,
+ # it should be updated right
+
+ # my $min_vdom = $curr_dom->{model_length} / 10;
+
+ if ($prev_dom->{accession}) { # look for previous domain
+ $prev_dom_end = $prev_dom->{end};
+ }
+
+ # there is a domain to the left, how much room is available?
+ my $left_dom_len = min($curr_dom->{start}-$prev_dom_end-1, $curr_dom->{hmm_start}-1);
+ if ( $left_dom_len > $min_vdom) {
+ # there is room for a virtual domain
+ my %new_dom = (start=> $curr_dom->{start}-$left_dom_len,
+ end => $curr_dom->{start}-1,
+ info=>'@'.$curr_dom->{accession},
+ model_length=> $curr_dom->{model_length},
+ hmm_end => $curr_dom->{hmm_start}-1,
+ hmm_start => $left_dom_len,
+ accession=>$pfamA,
+ );
+ push @vpf_domains, \%new_dom;
+ }
+
+ # save the current domain
+ push @vpf_domains, $curr_dom;
+ $prev_dom = $curr_dom;
+
+ if ($dom_ix < $#pf_domains) { # there is a domain to the right
+ # first, give all the extra space to the first domain (no splitting)
+ $next_dom_start = $pf_domains[$dom_ix+1]->{start};
+ }
+ else {
+ $next_dom_start = $seq_length;
+ }
+
+ # is there room for a virtual domain right
+
+ my $right_dom_len = min($next_dom_start-$curr_dom->{end}-1, # space available
+ $curr_dom->{model_length}-$curr_dom->{hmm_end} # space needed
+ );
+ if ( $right_dom_len > $min_vdom) {
+ my %new_dom = (start=> $curr_dom->{end}+1,
+ end=> $curr_dom->{end}+$right_dom_len,
+ info=>'@'.$pfamA,
+ model_length => $curr_dom->{model_length},
+ accession=> $pfamA,
+ );
+ push @vpf_domains, \%new_dom;
+ $prev_dom = \%new_dom;
+ }
+ } # all done, check for last one
+ # @vpf_domains has both old @pf_domains and new virtural-domains
+ @pf_domains = @vpf_domains;
+ }
+
+ if ($neg_doms) {
+ my @npf_domains;
+ my $prev_dom={end=>0};
+ for my $curr_dom ( @pf_domains) {
+ if ($curr_dom->{start} - $prev_dom->{end} > $min_nodom) {
+ my %new_dom = (start=>$prev_dom->{end}+1, end => $curr_dom->{start}-1, info=>'NODOM');
+ push @npf_domains, \%new_dom;
+ }
+ push @npf_domains, $curr_dom;
+ $prev_dom = $curr_dom;
+ }
+ if ($seq_length - $prev_dom->{end} > $min_nodom) {
+ my %new_dom = (start=>$prev_dom->{end}+1, end=>$seq_length, info=>'NODOM');
+ if ($new_dom{end} > $new_dom{start}) {push @npf_domains, \%new_dom;}
+ }
+
+ # @npf_domains has both old @pf_domains and new neg-domains
+ @pf_domains = @npf_domains;
+ }
+
+ # now make sure we have useful names: colors
+
+ for my $pf (@pf_domains) {
+ $pf->{info} = domain_name($pf->{info}, $acc, $pf->{accession});
+ }
+
+ my @feats = ();
+ for my $d_ref (@pf_domains) {
+ if ($lav) {
+ push @feats, [$d_ref->{start}, $d_ref->{end}, $d_ref->{info}];
+ }
+ else {
+ push @feats, [$d_ref->{start}, '-', $d_ref->{end}, $d_ref->{info} ];
+# push @feats, [$d_ref->{end}, ']', '-', ""];
+ }
+ }
+
+ return @feats;
+}
+
+# domain_name($value, $seq_id, $pf_acc)
+#
+# Maps a domain label to the name used for display and for color
+# assignment, registering new names in the file-level tables
+# %domains, %domain_clan and @domain_list (and bumping $domain_cnt).
+#
+#   $value  -- domain label; a leading '@' marks a virtual domain
+#   $seq_id -- sequence identifier, used only in the warning message
+#   $pf_acc -- accession pushed on @domain_list and used to look up
+#              %pfamA_fams
+#
+# Returns "" (with a warning) if $value is undefined.  Unless
+# $no_clans is set, a family that belongs to a clan is renamed
+# "C.<clan_id>" (or "C.<clan_acc>" when $pf_acc_flag is set), so that
+# different families in the same clan share one %domains number.
+# Virtual domains get their '@' prefix restored on the returned name.
+
+sub domain_name {
+
+  my ($value, $seq_id, $pf_acc) = @_;
+  my $is_virtual = 0;
+
+  # this check must come before any pattern match on $value
+  unless (defined($value)) {
+    warn "missing domain name for $seq_id";
+    return "";
+  }
+
+  # strip the virtual-domain marker; it is put back just before returning
+  if ($value =~ s/^@//) {
+    $is_virtual = 1;
+  }
+
+  if ($no_clans) {
+    if (! defined($domains{$value})) {
+      $domain_clan{$value} = 0;
+      $domains{$value} = ++$domain_cnt;
+      push @domain_list, $pf_acc;
+    }
+  }
+  elsif (!defined($domain_clan{$value})) {
+    ## only do this for new domains, old domains have known mappings
+
+    ## ways to highlight the same domain:
+    # (1) for clans, substitute clan name for family name
+    # (2) for clans, use the same color for the same clan, but don't change the name
+    # (3) for clans, combine family name with clan name, but use colors based on clan
+
+    # return the clan name, identifier if a clan member
+    if (!defined($pfamA_fams{$pf_acc})) {
+
+      my $pf_url = "family/$value?output=xml";
+
+      my $res = get($loc . $pf_url);
+
+      my $twig_clan = XML::Twig->new(twig_roots => {'clan_membership'=>1},
+                                     twig_handlers => {
+                                       'clan_membership' => \&get_clan,
+                                     },
+                                     pretty_print => 'indented');
+
+      # make certain to reinitialize
+      # NOTE(review): get_clan is defined elsewhere; presumably it sets
+      # $clan_acc/$clan_id while the document is parsed -- confirm
+      ($clan_acc, $clan_id) = ("","");
+      if (defined($res)) {
+        $twig_clan->parse($res);
+      }
+      else {
+        # the fetch failed: treat the family as having no clan
+        warn "no clan information retrieved for $value";
+      }
+    }
+    else {
+      ($clan_acc, $clan_id) = @{$pfamA_fams{$pf_acc}}{qw(clan_acc clan_id)};
+    }
+
+    if ($clan_acc) {
+      my $c_value = "C." . $clan_id;
+      if ($pf_acc_flag) {$c_value = "C." . $clan_acc;}
+
+      # remember the clan under the family name, then switch to the clan name
+      $domain_clan{$value} = {clan_id => $clan_id,
+                              clan_acc => $clan_acc};
+
+      if ($domains{$c_value}) {
+        # clan already has a number: reuse it
+        $domain_clan{$value}->{domain_cnt} = $domains{$c_value};
+        $value = $c_value;
+      }
+      else {
+        $domain_clan{$value}->{domain_cnt} = ++ $domain_cnt;
+        $value = $c_value;
+        $domains{$value} = $domain_cnt;
+        push @domain_list, $pf_acc;
+      }
+    }
+    else {
+      $domain_clan{$value} = 0;
+      $domains{$value} = ++$domain_cnt;
+      push @domain_list, $pf_acc;
+    }
+  }
+  elsif ($domain_clan{$value} && $domain_clan{$value}->{clan_acc}) {
+    # family seen before and known to be a clan member
+    if ($pf_acc_flag) {$value = "C." . $domain_clan{$value}->{clan_acc};}
+    else { $value = "C." . $domain_clan{$value}->{clan_id}; }
+  }
+
+  if ($is_virtual) {
+    # the virtual domain shares the number of the real domain
+    $domains{'@'.$value} = $domains{$value};
+    $value = '@'.$value;
+  }
+  return $value;
+}
+
+# domain_num($value, $number)
+# For a name that starts with '@', replace that '@' with 'v' and
+# append "v" to $number; other names pass through unchanged.
+# Returns the pair ($value, $number).
+sub domain_num {
+  my ($value, $number) = @_;
+
+  # s/// is true only when the leading '@' was present and replaced
+  $number .= "v" if ($value =~ s/^@/v/);
+
+  return ($value, $number);
+}
+
+# min($arg1, $arg2): the numerically smaller argument ($arg1 on a tie)
+sub min {
+  my ($arg1, $arg2) = @_;
+
+  if ($arg1 <= $arg2) {
+    return $arg1;
+  }
+  return $arg2;
+}
+
+# max($arg1, $arg2): the numerically larger argument ($arg1 on a tie)
+sub max {
+  my ($arg1, $arg2) = @_;
+
+  if ($arg1 >= $arg2) {
+    return $arg1;
+  }
+  return $arg2;
+}
+
+__END__
+
+=pod
+
+=head1 NAME
+
+ann_feats_up_www2.pl
+
+=head1 SYNOPSIS
+
+ ann_feats_up_www2.pl --no_doms --no_feats --lav 'sp|P09488|GSTM1_HUMAN' | accession.file
+
+=head1 OPTIONS
+
+ -h short help
+ --help include description
+ --no-doms do not show domain boundaries (domains are always shown with --lav)
+ --no-feats do not show feature (variants, active sites, phospho-sites)
+ --lav produce lav2plt.pl annotation format, only show domains/repeats
+
+ --neg-doms, -- report domains between annotated domains as NODOM
+ (also --neg, --neg_doms)
+ --min_nodom=10 -- minimum length between domains for NODOM
+
+ --host, --user, --password, --port --db -- info for mysql database
+
+=head1 DESCRIPTION
+
+C<ann_feats_up_www2.pl> extracts feature, domain, and repeat
+information from the Uniprot DAS server through an XSLT translation
+provided by http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/uniprotkb.
+This server provides GFF descriptions of Uniprot entries, with most of
+the information provided in UniProt feature tables.
+
+C<ann_feats_up_www2.pl> is an alternative to C<ann_pfam.pl> and
+C<ann_pfam.pl> that does not require a local MySQL copy of Pfam.
+
+Given a command line argument that contains a sequence accession
+(P09488), the program looks up the features available for that
+sequence and returns them in a tab-delimited format:
+
+>sp|P09488|GSTM1_HUMAN
+2 [ - GST N-terminal :1
+7 V F Mutagen: Reduces catalytic activity 100- fold.
+23 * - MOD_RES: Phosphotyrosine (By similarity).
+33 * - MOD_RES: Phosphotyrosine (By similarity).
+34 * - MOD_RES: Phosphothreonine (By similarity).
+88 ] -
+90 [ - GST C-terminal :2
+108 V Q Mutagen: Reduces catalytic activity by half.
+108 V S Mutagen: Changes the properties of the enzyme toward some substrates.
+109 V I Mutagen: Reduces catalytic activity by half.
+116 # - BINDING: Substrate.
+116 V A Mutagen: Reduces catalytic activity 10-fold.
+116 V F Mutagen: Slight increase of catalytic activity.
+173 V N in allele GSTM1B; dbSNP:rs1065411.
+208 ] -
+210 V T in dbSNP:rs449856.
+
+If features are provided, then a legend of feature symbols is provided
+as well:
+
+ =*:phosphorylation
+ ==:active site
+ =@:site
+ =^:binding
+ =!:metal binding
+
+If the C<--lav> option is specified, domain and repeat features are
+presented in a different format for the C<lav2plt.pl> program:
+
+ >sp|P09488|GSTM1_HUMAN
+ 2 88 GST N-terminal.
+ 90 208 GST C-terminal.
+
+C<ann_feats_up_www2.pl> is designed to be used by the B<FASTA> programs with
+the C<-V \!ann_feats_up_www2.pl> option. It can also be used with the lav2plt.pl
+program with the C<--xA "\!ann_feats_up_www2.pl --lav"> or C<--yA "\!ann_feats_up_www2.pl --lav"> options.
+
+=head1 AUTHOR
+
+William R. Pearson, wrp at virginia.edu
+
+=cut
diff --git a/scripts/annot_blast_btop2.pl b/scripts/annot_blast_btop2.pl
new file mode 100755
index 0000000..63acbf8
--- /dev/null
+++ b/scripts/annot_blast_btop2.pl
@@ -0,0 +1,1306 @@
+#!/usr/bin/perl -w
+
+################################################################
+# copyright (c) 2014,2015 by William R. Pearson and The Rector &
+# Visitors of the University of Virginia */
+################################################################
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under this License is distributed on an "AS
+# IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+################################################################
+
+################################################################
+# annot_blast_btop2.pl --query query.file --ann_script ann_pfam_www.pl blast_tab_btop_file
+################################################################
+# annot_blast_btop2.pl associates domain annotation information and
+# subalignment scores with a blast tabular (-outfmt 6 or -outfmt 7)
+# file that contains the raw score and the BTOP alignment encoding
+# This file can be generated from "blastp/n" or "blast_formatter"
+# using the command:
+# blast_formatter -archive blast_output.asn -outfmt '7 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore score btop' > blast_output.tab_annot
+#
+# If the BTOP field or query_file is not available, the script
+# produces domain content without sub-alignment scores.
+################################################################
+## 13-Jan-2017
+# modified to provide query/subject coordinates and identities if no
+# query sequence -- does not decrement for reverse-complement fastx/blastx DNA
+################################################################
+## 16-Nov-2015
+# modify to allow multi-query blast searches
+################################################################
+## 19-Dec-2015
+# add -q_annot_script to annotate query sequence
+#
+
+use strict;
+use IPC::Open2;
+use Pod::Usage;
+use Getopt::Long;
+# use Data::Dumper;
+
+# read lines of the form:
+# gi|121694|sp|P20432.1|GSTT1_DROME gi|121694|sp|P20432|GSTT1_DROME 100.00 209 0 0 1 209 1 209 6e-156 433 1113 209
+# gi|121694|sp|P20432.1|GSTT1_DROME gi|1170090|sp|P04907|GSTF3_MAIZE 26.77 198 123 7 4 185 6 197 2e-08 51.2 121 FL1YG ... 1NKRA1YW1
+# gi|121694|sp|P20432.1|GSTT1_DROME gi|81174731|sp|P0ACA5|SSPA_ECO57 39.66 58 32 2 43 100 49 103 8e-06 43.9 102 EDFLLI ... V-I-NEQS3FM
+# gi|121694|sp|P20432.1|GSTT1_DROME gi|121695|sp|P12653|GSTF1_MAIZE 27.62 181 107 7 32 203 34 199 9e-05 40.8 94 LI1LF ... N-1AS1CLLM1
+
+# and report the domain content ala -m 8CC
+
+# Option defaults: scoring matrix name, subject and query annotation
+# script commands, and the raw-score / short-help / long-help flags.
+my ($matrix, $ann_script, $q_ann_script, $show_raw, $shelp, $help) = ("BLOSUM62", "", "", 0, 0, 0);
+my ($query_lib_name) = (""); # if $query_lib_name, do not use $query_file_name
+my ($out_field_str) = ("");
+# becomes the hash ref returned by parse_query_lib() when --query is given
+my $query_lib_r = 0;
+
+# scoring tables and gap penalties; all are filled in by init_blosum62()
+my @blosum62 = ();
+my @blosum62_diag = ();
+my %aa_map = ();
+my ($g_open, $g_ext) = (-11, -1);
+init_blosum62();
+
+# --query, --query_file and --query_lib are synonyms (all set
+# $query_lib_name); likewise --script/--ann_script and
+# --q_script/--q_ann_script
+GetOptions(
+ "matrix:s" => \$matrix,
+ "ann_script:s" => \$ann_script,
+ "q_ann_script:s" => \$q_ann_script,
+ "query:s" => \$query_lib_name,
+ "query_file:s" => \$query_lib_name,
+ "query_lib:s" => \$query_lib_name,
+ "out_fields:s" => \$out_field_str,
+ "script:s" => \$ann_script,
+ "q_script:s" => \$q_ann_script,
+ "raw_score" => \$show_raw,
+ "h|?" => \$shelp,
+ "help" => \$help,
+ );
+
+pod2usage(1) if $shelp;
+pod2usage(exitstatus => 0, verbose => 2) if $help;
+# show usage unless input comes from a file/pipe on STDIN or from a file argument
+unless (-f STDIN || -p STDIN || @ARGV) {
+ pod2usage(1);
+}
+
+if ($query_lib_name) {
+ $query_lib_r = parse_query_lib($query_lib_name);
+}
+
+# column names, in order, for each tab-delimited input line
+my @tab_fields = qw(q_seqid s_seqid percid alen mismatch gopen q_start q_end s_start s_end evalue bits score BTOP);
+
+# the fields that are displayed are listed here. By default, every field
+# except the last one (BTOP) is displayed.
+# NOTE(review): an earlier comment said "score" was excluded as well, but
+# this slice keeps it -- confirm which is intended
+my @out_tab_fields = @tab_fields[0 .. $#tab_fields-1];
+if ($show_raw) {
+ push @out_tab_fields, "raw_score";
+
+}
+# --out_fields (whitespace-separated names) replaces the default list entirely
+if ($out_field_str) {
+ @out_tab_fields = split(/\s+/,$out_field_str);
+}
+
+# '#' comment lines waiting to be echoed before the next query's hits
+my @header_lines = ();
+
+# need outer loop to enable multiple queries
+#
+# Each pass of the outer loop handles one query's result set:
+#   (1) read hit lines up to the '#' comment line that follows the data
+#       (or end of input); '#' lines seen before any data are saved in
+#       @header_lines
+#   (2) run --q_ann_script on the query id and --ann_script on the
+#       subject ids, and attach the annotations with read_annots()
+#   (3) echo the saved header lines, then print each hit followed by
+#       its annotation column (if any)
+while (1) {
+
+  my $next_line = "";   # '#' line that ended this result set
+  my $have_data = 0;
+
+  my @hit_list = ();    # one hash per hit line, keyed by @tab_fields
+  my @q_hit_list = ();  # a single entry that receives query annotations
+
+  while (my $line = <>) {
+    if ($line =~ /^#/) {
+      if ($have_data) {
+        # a comment after data belongs to the next query's header
+        $next_line = $line;
+        $have_data = 0;
+        last;
+      } else {
+        push @header_lines, $line;
+      }
+      next;
+    }
+
+    $have_data = 1;
+    my %hit_data = ();
+    chomp $line;
+    next unless $line;
+    @hit_data{@tab_fields} = split(/\t/,$line);
+
+    push @hit_list, \%hit_data;
+  }
+
+  # annotate the current query sequence; skipped when there are no hits,
+  # because the query id is taken from the first hit
+  if (@hit_list && $q_ann_script && -x (split(/\s+/,$q_ann_script))[0]) {
+    # get the domains for the q_seqid using --q_ann_script
+    #
+    my ($Reader, $Writer);
+    my $pid = open2($Reader, $Writer, $q_ann_script);
+    my $hit = $hit_list[0];
+
+    print $Writer $hit->{q_seqid},"\n";
+    close($Writer);
+
+    # read_annots() identifies entries by {s_seqid}, so the query id is
+    # stored under that key
+    @q_hit_list = ({ s_seqid => $hit->{q_seqid} });
+
+    read_annots($Reader, \@q_hit_list, 0);
+
+    waitpid($pid, 0);
+  }
+
+  # annotate the subject (library) sequences; skipped when there are no hits
+  if (@hit_list && $ann_script && -x (split(/\s+/,$ann_script))[0]) {
+    # get the domains for each s_seqid using --ann_script
+    #
+    my ($Reader, $Writer);
+    my $pid = open2($Reader, $Writer, $ann_script);
+    for my $hit (@hit_list) {
+      print $Writer $hit->{s_seqid},"\n";
+    }
+    close($Writer);
+
+    read_annots($Reader, \@hit_list, 1);
+
+    waitpid($pid, 0);
+  }
+
+  for my $line (@header_lines) {
+    print $line;
+  }
+  # the comment line that ended this result set starts the next header
+  @header_lines = ($next_line);
+
+  # query annotations, if any were read
+
+  my $q_hit = $q_hit_list[0];
+
+  for my $hit (@hit_list) {
+    my @list_covered = ();
+
+    # If I have an encoded alignment {BTOP} and a query sequence $query_lib_r && $query_lib_r->{$hit->{q_seqid}}
+    # then I can calculate sub-alignment scores
+    if (defined($hit->{BTOP}) && $query_lib_r && $query_lib_r->{$hit->{q_seqid}}) {
+
+      $hit->{raw_score} = 0;  # initialize in case no domains and raw_score requested
+      # calculate sub-alignment scores in subject/library coordinates
+      if (defined($hit->{domains}) && scalar(@{$hit->{domains}})) {
+        ($hit->{raw_score}, $hit->{aligned_domains_r}) =
+          sub_alignment_score($query_lib_r->{$hit->{q_seqid}},
+                              $hit, \@blosum62, \@blosum62_diag, $hit->{domains}, 1);
+      }
+
+      if (defined($hit->{sites}) && scalar(@{$hit->{sites}})) {
+        $hit->{aligned_sites_r} = site_align($query_lib_r->{$hit->{q_seqid}},
+                                             $hit, \@blosum62, $hit->{sites}, 1);
+      }
+
+      # calculate sub-alignment scores in query coordinates
+      if (defined($q_hit->{domains}) && scalar(@{$q_hit->{domains}})) {
+        ($hit->{raw_score}, $hit->{q_aligned_domains_r}) =
+          sub_alignment_score($query_lib_r->{$hit->{q_seqid}},
+                              $hit, \@blosum62, \@blosum62_diag, $q_hit->{domains}, 0);
+      }
+
+      if (defined($q_hit->{sites}) && scalar(@{$q_hit->{sites}})) {
+        $hit->{q_aligned_sites_r} = site_align($query_lib_r->{$hit->{q_seqid}},
+                                               $hit, \@blosum62, $q_hit->{sites}, 0);
+      }
+    }
+    elsif (defined($hit->{BTOP})) {
+      # alignment encoding but no query sequence: positions only, no scores
+      if (defined($hit->{domains}) && scalar(@{$hit->{domains}})) {
+        $hit->{aligned_domains_r} =
+          sub_alignment_pos($hit, $hit->{domains}, 1);
+      }
+    }
+    else {  # no alignment info, can provide domain overlap, and subject coordinates
+      $hit->{raw_score} = 0;
+      for my $dom_r (@{$hit->{domains}}) {
+        next if $dom_r->{d_end} < $hit->{s_start};  # before start
+        last if $dom_r->{d_pos} > $hit->{s_end};    # after end
+
+        if ($dom_r->{d_pos} <= $hit->{s_end} && $dom_r->{d_end} >= $hit->{s_start}) {
+          push @list_covered, $dom_r->{descr};
+        }
+      }
+    }
+
+
+    ################
+    ## final output display
+
+    print join("\t",@{$hit}{@out_tab_fields});  # show fields from original blast tabular file
+
+    my $merged_annots_r = merge_annots($hit);   # merge the four possible annotation lists into one.
+
+    if (scalar(@$merged_annots_r)) {  # show subalignment scores if available
+      print "\t";
+
+      print format_annot_info($hit, $merged_annots_r);
+    }
+    elsif (@list_covered) {  # otherwise show domain content
+      print "\t",join(";",@list_covered);
+    }
+    print "\n";
+  }
+
+  # for my $line (@footer_lines) {
+  #   print $line;
+  # }
+  # @footer_lines = ();
+
+  last if eof(ARGV);
+}
+
+# echo whatever comment lines followed the last result set
+for my $line (@header_lines) {
+  print $line;
+}
+
+################
+# read_annots ($Reader, \@hit_list, $target)
+# input:  $Reader     -- filehandle on the annotation script output
+#         $hit_list_r -- list of hit entries, each with {s_seqid};
+#                        annotation entries must arrive in the same order
+#         $target     -- stored as {target} in every annotation record
+# output: modified $hit_entry->{domains}
+#         modified $hit_entry->{sites}
+#
+# Annotation format: a ">seqid" header line per sequence, followed by
+# tab-delimited lines.  Lines whose second field is '-' are domains
+# (d_pos, type, d_end, descr); all others are sites
+# (d_pos, type, d_val, descr) with d_end set to d_pos.  Lines
+# starting with '=' are skipped.
+#
+# Annotations are never attached beyond the end of @$hit_list_r, so
+# the hit list cannot grow as a side effect.
+
+sub read_annots {
+  my ($Reader, $hit_list_r, $target) = @_;
+
+  my $current_domain = "";
+  my $hit_ix = 0;
+  my $n_hits = scalar(@$hit_list_r);
+  my @hit_domains = ();
+  my @hit_sites = ();
+
+  while (my $line = <$Reader>) {
+    next if $line=~ m/^=/;
+    chomp $line;
+    next unless length($line);  # nothing to parse on a blank line
+
+    # check for header
+    if ($line =~ m/^>/) {
+      if ($current_domain) {  # previous domains/sites have already been found and parsed
+        # only look at entries that exist, to avoid autovivifying new ones
+        my $expected = ($hit_ix < $n_hits) ? $hit_list_r->[$hit_ix]{s_seqid} : undef;
+        if (defined($expected) && $expected eq $current_domain) {
+          $hit_list_r->[$hit_ix]{domains} = [ @hit_domains ];  # previous domains
+          $hit_list_r->[$hit_ix]{sites} = [ @hit_sites ];      # previous sites
+          $hit_ix++;
+        } else {
+          warn "phase error: $current_domain != " . (defined($expected) ? $expected : "");
+        }
+      }
+      @hit_domains = ();  # current domains
+      @hit_sites = ();    # current sites
+      $current_domain = $line;
+      $current_domain =~ s/^>//;
+    } else {  # check for data
+      my %annot_info = (target=>$target);
+      my @a_fields = split(/\t/,$line);
+      if (defined($a_fields[1]) && $a_fields[1] eq '-') {
+        @annot_info{qw(d_pos type d_end descr)} = @a_fields;
+        # a trailing " :N" domain number becomes "~N"
+        $annot_info{descr} =~ s/ :(\d+)$/~$1/ if defined($annot_info{descr});
+        push @hit_domains, \%annot_info;  # current
+      }
+      else {
+        @annot_info{qw(d_pos type d_val descr)} = @a_fields;
+        $annot_info{'d_end'} = $annot_info{'d_pos'};
+        push @hit_sites, \%annot_info;  # current
+      }
+    }
+  }
+  close($Reader);
+
+  # all done, save the last one (if there is a hit entry to hold it)
+  if ($hit_ix < $n_hits) {
+    $hit_list_r->[$hit_ix]{domains} = \@hit_domains;
+    $hit_list_r->[$hit_ix]{sites} = \@hit_sites;
+  }
+  elsif ($current_domain) {
+    warn "phase error: no hit entry for $current_domain";
+  }
+}
+
+# decode_btop($btop_str)
+# input: a blast BTOP string of the form: "1VA160TS7KG10RK27"
+# returns a list_ref of tokens: (1, "VA", 60, "TS", 7, "KG", 10, "RK", 27)
+# Runs of digits are kept whole; everything between them is cut into
+# two-character pieces (query residue, subject residue).
+#
+sub decode_btop {
+  my ($btop_str) = @_;
+
+  # the capturing split keeps the digit runs as list elements
+  my @pieces = split(/(\d+)/,$btop_str);
+
+  # a string that starts with digits yields an empty first element
+  shift @pieces unless $pieces[0];
+
+  my @decoded = ();
+
+  for my $piece (@pieces) {
+    unless ($piece =~ m/^\d+$/) {
+      # non-digit run: keep only the non-empty pieces
+      push @decoded, grep { $_ } split(/(..)/,$piece);
+      next;
+    }
+    push @decoded, $piece;
+  }
+
+  return \@decoded;
+}
+
+# parse_query_lib($query_file)
+# Reads a FASTA file with one or more sequences and returns a hash
+# ref mapping each entry's id (the header line up to the first
+# whitespace, without '>') to an array ref of upper-case residues.
+# The residue arrays are 1-based: element 0 is "".
+# If the file cannot be opened, warns and returns an empty hash ref.
+sub parse_query_lib {
+  my ($query_file) = @_;
+
+  my %query_seqs = ();
+
+  # three-argument open: the name is always treated as a plain file
+  my $qfd;
+  unless (open($qfd, '<', $query_file)) {
+    warn "cannot open query file $query_file: $!";
+    return \%query_seqs;
+  }
+
+  {  # local scope for $/
+    local $/ = "\n>";
+
+    while (my $entry = <$qfd>) {  # returns an entire fasta entry
+      chomp $entry;               # removes the trailing "\n>"
+      my ($header, $sequence) = ($entry =~ m/^>?           # ^> only in first entry
+                                              ( [^\n]* ) \n  # header line
+                                              ( .* )         # the sequence
+                                             /sx);           # . matches newline, commented
+      # skip records that are not "header, newline, sequence"
+      next unless defined($header);
+
+      $sequence =~ s/[^A-Za-z\*]//g;  # remove everything but letters and '*'
+      $sequence = uc($sequence);
+      $header =~ s/\s.*$//;
+      my @seq = split(//,$sequence);
+      unshift @seq,"";  # @seq is now 1-based
+      $query_seqs{$header} = \@seq;
+    }
+  }
+  close($qfd);
+
+  return \%query_seqs;
+}
+
+# parse_query_file($query_file)
+# Reads a single-sequence file, skipping lines that start with '>' or
+# ';', and returns an array ref of upper-case residues (0-based; only
+# letters and '*' are kept).
+# If the file cannot be opened, warns and returns an empty array ref.
+sub parse_query_file {
+  my ($query_file) = @_;
+
+  my $seq_data = "";
+
+  # three-argument open: the name is always treated as a plain file
+  my $qfd;
+  unless (open($qfd, '<', $query_file)) {
+    warn "cannot open query file $query_file: $!";
+    return [];
+  }
+
+  while (my $line = <$qfd>) {
+    next if $line =~ m/^>/;  # header line
+    next if $line =~ m/^;/;  # comment line
+    chomp $line;
+    $line =~ s/[^A-Za-z\*]//g;
+    $seq_data .= $line;
+  }
+  close($qfd);
+
+  $seq_data = uc($seq_data);
+
+  my @seq = split(//,$seq_data);
+
+  return \@seq;
+}
+
+# init_blosum62()
+# Fills the file-level scoring tables:
+#   @blosum62      -- 24x24 score matrix, rows/columns in @ncbi_blaa order
+#   @blosum62_diag -- the diagonal (identity) scores, $blosum62[$i][$i]
+#   %aa_map        -- residue letter => row/column index
+# and sets the gap penalties ($g_open, $g_ext) to (-11, -1).
+# Dies if the number of matrix rows differs from the alphabet size.
+sub init_blosum62 {
+
+ # residue order shared by the rows and the columns of the matrix
+ my @ncbi_blaa = qw( A R N D C Q E G H I L K M F P S T W Y V B Z X * );
+
+ $blosum62[ 0] = [ qw( 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 0 -4) ]; # A
+ $blosum62[ 1] = [ qw( -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0 -1 -4) ]; # R
+ $blosum62[ 2] = [ qw( -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 3 0 -1 -4) ]; # N
+ $blosum62[ 3] = [ qw( -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 1 -1 -4) ]; # D
+ $blosum62[ 4] = [ qw( 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4) ]; # C
+ $blosum62[ 5] = [ qw( -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 3 -1 -4) ]; # Q
+ $blosum62[ 6] = [ qw( -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4) ]; # E
+ $blosum62[ 7] = [ qw( 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -2 -1 -4) ]; # G
+ $blosum62[ 8] = [ qw( -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 0 -1 -4) ]; # H
+ $blosum62[ 9] = [ qw( -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 -3 -1 -4) ]; # I
+ $blosum62[10] = [ qw( -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 -3 -1 -4) ]; # L
+ $blosum62[11] = [ qw( -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 1 -1 -4) ]; # K
+ $blosum62[12] = [ qw( -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 -1 -1 -4) ]; # M
+ $blosum62[13] = [ qw( -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 -3 -1 -4) ]; # F
+ $blosum62[14] = [ qw( -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -1 -2 -4) ]; # P
+ $blosum62[15] = [ qw( 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 0 0 -4) ]; # S
+ $blosum62[16] = [ qw( 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 0 -4) ]; # T
+ $blosum62[17] = [ qw( -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 -3 -2 -4) ]; # W
+ $blosum62[18] = [ qw( -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -2 -1 -4) ]; # Y
+ $blosum62[19] = [ qw( 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 -2 -1 -4) ]; # V
+ $blosum62[20] = [ qw( -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1 -1 -4) ]; # B
+ $blosum62[21] = [ qw( -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4) ]; # Z
+ $blosum62[22] = [ qw( 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1 -1 -1 -1 -1 -4) ]; # X
+ $blosum62[23] = [ qw( -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1) ]; # *
+
+
+ # sanity check: one matrix row per residue in the alphabet
+ die "blosum62 length mismatch $#blosum62 != $#ncbi_blaa" if (scalar(@blosum62) != scalar(@ncbi_blaa));
+
+ # build the letter => index map and the identity-score vector
+ for (my $i=0; $i < scalar(@ncbi_blaa); $i++) {
+ $aa_map{$ncbi_blaa[$i]} = $i;
+ $blosum62_diag[$i] = $blosum62[$i][$i];
+ }
+
+ # gap-open and gap-extend penalties used with this matrix
+ ($g_open, $g_ext) = (-11, -1);
+}
+
+# alignment_score($query_r, $query_start, $btop_align_r, $matrix_2d)
+# given: (1) a query sequence; (2) an encoded alignment; (3) a scoring matrix
+# calculate a score
+#
+#   $query_r      -- array ref of query residues, indexed from 0
+#   $query_start  -- 1-based query position where the alignment starts
+#   $btop_align_r -- array ref of BTOP tokens: a number is a run of
+#                    identities, a two-letter token is a
+#                    (query, subject) pair, with '-' marking a gap
+#   $matrix_2d    -- hash of hashes: $matrix_2d->{res0}{res1} is a score
+#
+# Gaps are scored with the file-level $g_open/$g_ext: the first
+# position of a gap costs $g_open+$g_ext, each further position
+# $g_ext.  Any aligned pair (identity or mismatch) ends the current
+# gap, so the next gap is charged as a new gap.
+
+sub alignment_score {
+  my ($query_r, $query_start, $btop_align_r, $matrix_2d) = @_;
+
+  my ($gap0, $gap1) = (0,0);  # inside a gap in the query / in the subject
+
+  my $qix = $query_start-1;  # start from zero
+
+  my $score = 0;
+  my ($seq0, $seq1) = ("","");
+  for my $btop (@{$btop_align_r}) {
+    if ($btop =~ m/^\d+$/) {  # matching query sequence, add it up
+      for (my $i=0; $i < $btop; $i++) {
+        $score += $matrix_2d->{$query_r->[$qix]}{$query_r->[$qix]};
+        $qix++;
+        # an identity closes any open gap, as in sub_alignment_score()
+        $gap0 = $gap1 = 0;
+      }
+    }
+    else {
+      ($seq0, $seq1) = split(//,$btop);
+      if ($btop=~ m/\-/) {
+        if ($seq0 eq '-') {  # gap in the query: query position does not advance
+          if ($gap0) { $score += $g_ext;}
+          else { $score += $g_open+$g_ext;}
+          $gap0 = 1;
+        }
+        else {  # gap in the subject: a query residue is consumed
+          if ($gap1) { $score += $g_ext;}
+          else { $score += $g_open+$g_ext;}
+          $gap1 = 1;
+          $qix++;
+        }
+      }
+      else {  # mismatch
+        $score += $matrix_2d->{$seq0}{$seq1};
+        $gap1=$gap0 = 0;
+        $qix++;
+      }
+    }
+  }
+  return $score;
+}
+
+################################################################
+# sub_alignment_score()
+# input: $query_r : a query sequence;
+# $hit_r->{BTOP} : an encoded alignment;
+# $matrix_2d, $matrix_diag : a scoring matrix
+# $domain_r : domain boundaries in query (target=0) or subject (target=1)
+# $target : 0=query, 1=target
+#
+# calculate a score
+# updates $domain_r in place with new values:
+# domain_r->[]->{ident} (as fraction identical),
+# ->{score} --matrix raw similarity score
+# ->{qa_start,qa_end} domain boundaries in query
+# ->{sa_start, sa_end} domain boundaries in subject
+#
+# returns: ($score, \@aligned_domains) -- the raw score of the whole
+# alignment and the domains that the alignment entered; returns
+# (0, $domain_r) at once when the domain list is empty.
+#
+# NOTE(review): the per-domain values listed above are not set in this
+# sub; presumably push_annot_match(), pop_annot_match() and
+# last_annot_match() (defined elsewhere in the file) do that -- confirm
+#
+sub sub_alignment_score {
+ my ($query_r, $hit_r, $matrix_2d, $matrix_diag, $domain_r, $target) = @_;
+
+ return (0, $domain_r) unless ($domain_r && scalar(@$domain_r));
+
+ my $btop_enc_r = decode_btop($hit_r->{BTOP});
+
+ # set while inside a run of gap positions in the query ($gap0) / subject ($gap1)
+ my ($gap0, $gap1) = (0,0);
+
+ my @active_dom_list = ();
+ my @aligned_domains = ();
+
+ # start one past the end of the last domain in the list
+ my $left_active_end = $domain_r->[-1]->{d_end}+1; # as far right as possible
+ # note: $h_end is assigned here and below but not read in this sub
+ my ($q_start, $s_start, $h_start, $h_end) = @{$hit_r}{qw(q_start s_start s_start s_end)};
+ my ($qix, $six) = ($q_start, $s_start); # $qix now starts from 1, like $ssix;
+
+ # $$ds_ix is the current position in whichever sequence carries the domains
+ my $ds_ix = \$six; # use to track the subject position
+ # reverse coordinate names if $target==0
+ unless ($target) {
+ $ds_ix = \$qix; # track query position
+ $h_start = $hit_r->{q_start};
+ $h_end = $hit_r->{q_end};
+ }
+
+ my ($score, $m_score) = 0;
+ my ($seq0, $seq1) = ("","");
+
+ # find the first overlapping domain
+ my ($dom_ix, $dom_nx) = (0,scalar(@$domain_r));
+ my $dom_r = $domain_r->[0];
+
+ # skip over domains that do not overlap alignment
+ # capture first domain that alignment overlaps
+ for ($dom_ix=0; $dom_ix < $dom_nx; $dom_ix++) {
+ if ($domain_r->[$dom_ix]->{d_end} >= $h_start) { # if {d_end} < $_start, cannot overlap
+ $dom_r = $domain_r->[$dom_ix];
+ if ($dom_r->{d_pos} <= $h_start) { # {d_pos} is less, {d_end} is greater, overlap
+ push @aligned_domains, $dom_r;
+ $left_active_end = push_annot_match(\@active_dom_list, $dom_r, $q_start, $s_start, 0, 0);
+ }
+ else { last; }
+ }
+ }
+ # here $dom_r/$dom_ix refer to the next domain that has not started yet
+ # (or $dom_ix == $dom_nx if there is none)
+
+ # score and identity count accumulated since the last domain start/end event
+ my ($dom_score, $id_cnt) = (0,0);
+
+ # walk the alignment one token at a time; in every branch the order is:
+ # score the position, start a domain that begins here, accumulate
+ # into the active domains, end domains that finish here, then advance
+ for my $btop (@{$btop_enc_r}) {
+
+ if ($btop =~ m/^\d+$/) { # matching query sequence, add it up
+ for (my $i=0; $i < $btop; $i++) { # $i is used to count through BTOP, not to index anything.
+
+ # identities are scored from the query residue; 'X' is used when the
+ # residue is missing or not in %aa_map
+ my $seq0_map = $aa_map{'X'};
+ unless ($query_r->[$qix]) {
+ warn "qix: $qix out of range";
+ }
+ else {
+ $seq0_map = $aa_map{$query_r->[$qix]} if exists($aa_map{$query_r->[$qix]});
+# print "$qix:$six : ",$query_r->[$qix],"\n";
+ }
+
+ $m_score = $matrix_diag->[$seq0_map];
+ $score += $m_score;
+
+ # a domain starts at this position
+ if ($dom_ix < $dom_nx && $$ds_ix == $dom_r->{d_pos}) {
+ push @aligned_domains, $dom_r;
+ $left_active_end = push_annot_match(\@active_dom_list, $dom_r, $qix, $six, $id_cnt, $dom_score);
+ $dom_ix++;
+ $dom_r = $domain_r->[$dom_ix];
+ ($dom_score, $id_cnt) = (0,0);
+ }
+
+ if (@active_dom_list) {
+ $dom_score += $m_score;
+ $id_cnt++; # only identities are counted
+ if ($$ds_ix == $left_active_end) {
+ $left_active_end = pop_annot_match(\@active_dom_list, $qix, $six, $$ds_ix, $id_cnt, $dom_score);
+ $dom_score = $id_cnt = 0;
+ }
+ }
+
+ $qix++;
+ $six++;
+ $gap0 = $gap1 = 0; # an aligned pair ends any gap
+ }
+ }
+ else {
+ ($seq0, $seq1) = split(//,$btop);
+
+# print "$qix:$six : $btop\n";
+
+ if ($btop=~ m/\-/) {
+ if ($seq0 eq '-') { # gap in seq0
+ # first gap position costs open+extend, later ones extend only
+ if ($gap0) {
+ $m_score = $g_ext;
+ }
+ else {
+ $m_score = $g_open+$g_ext;
+ $gap0 = 1;
+ }
+
+ $score += $m_score;
+
+ # only the subject advances here, so domain events are checked
+ # only when the domains are in subject coordinates
+ if ($target) { # subject domains
+ if ($dom_ix < $dom_nx && $$ds_ix == $dom_r->{d_pos}) {
+ push @aligned_domains, $dom_r;
+ $left_active_end = push_annot_match(\@active_dom_list, $dom_r, $qix, $six, $id_cnt, $dom_score);
+ $dom_ix++;
+ $dom_r = $domain_r->[$dom_ix];
+ ($dom_score, $id_cnt) = (0,0);
+ }
+ if (@active_dom_list) {
+ $dom_score += $m_score;
+ if ($$ds_ix == $left_active_end) {
+ $left_active_end = pop_annot_match(\@active_dom_list, $qix, $six, $$ds_ix, $id_cnt, $dom_score);
+ $dom_score = $id_cnt = 0;
+ }
+ }
+ }
+ $six++;
+ }
+ else { # gap in seq1
+ if ($gap1) {
+ $m_score = $g_ext;
+ }
+ else {
+ $m_score = $g_open+$g_ext;
+ $gap1 = 1;
+ }
+ $score += $m_score;
+
+ # only the query advances here, so domain events are checked
+ # only when the domains are in query coordinates
+ unless ($target) { # query domains
+ if ($dom_ix < $dom_nx && $$ds_ix == $dom_r->{d_pos}) {
+ push @aligned_domains, $dom_r;
+ $left_active_end = push_annot_match(\@active_dom_list, $dom_r, $qix, $six, $id_cnt, $dom_score);
+ $dom_ix++;
+ $dom_r = $domain_r->[$dom_ix];
+ ($dom_score, $id_cnt) = (0,0);
+ }
+
+ if (@active_dom_list) {
+ $dom_score += $m_score;
+ if ($$ds_ix == $left_active_end) {
+ $left_active_end = pop_annot_match(\@active_dom_list, $qix, $six, $$ds_ix, $id_cnt, $dom_score);
+ $dom_score = $id_cnt = 0;
+ }
+ }
+ }
+ $qix++;
+ }
+ }
+ else { # mismatch
+ # residues not in %aa_map are scored as 'X'
+ my ($seq0_map, $seq1_map) = ($aa_map{$seq0},$aa_map{$seq1});
+ $seq0_map = $aa_map{'X'} unless defined($seq0_map);
+ $seq1_map = $aa_map{'X'} unless defined($seq1_map);
+
+ $m_score = $matrix_2d->[$seq0_map][$seq1_map];
+ $score += $m_score;
+ if ($dom_ix < $dom_nx && $$ds_ix == $dom_r->{d_pos}) {
+ push @aligned_domains, $dom_r;
+ $left_active_end = push_annot_match(\@active_dom_list, $dom_r, $qix, $six, $id_cnt, $dom_score);
+ $dom_ix++;
+ $dom_r = $domain_r->[$dom_ix];
+ ($dom_score, $id_cnt) = (0,0);
+ }
+
+ if (@active_dom_list) {
+ $dom_score += $m_score;
+ if ($$ds_ix == $left_active_end) {
+ $left_active_end = pop_annot_match(\@active_dom_list, $qix, $six, $$ds_ix, $id_cnt, $dom_score);
+ $dom_score = $id_cnt = 0;
+ }
+ }
+ $qix++;
+ $six++;
+ $gap0 = $gap1 = 0;
+ }
+ }
+# print join(":",($qix, $six, $score)),"\n";
+ }
+
+ # all done, finish any domain stuff
+ # (domains still active when the alignment ends are closed at its end coordinates)
+ if (@active_dom_list) {
+ last_annot_match(\@active_dom_list, $hit_r->{q_end}, $hit_r->{s_end}, $id_cnt, $dom_score);
+ }
+
+ return ($score, \@aligned_domains);
+}
+
+################################################################
+# sub_alignment_pos
+# input: $hit_r->{BTOP} : an encoded alignment;
+#        $hit_r->{q_start, s_start, q_end, s_end} : alignment boundaries
+#        $domain_r : domain boundaries in query (target=0) or subject (target=1)
+#        $target : 0=query, 1=subject
+#
+# Walks the BTOP alignment without a scoring matrix: only identity
+# counts and alignment coordinates are tracked, domain scores stay 0.
+#
+# updates the $domain_r entries in place (through push_annot_match,
+# pop_annot_match and last_annot_match):
+#   ->{ident}, ->{percid} (fraction identical),
+#   ->{qa_start, qa_end}, ->{sa_start, sa_end} domain boundaries in
+#   query and subject coordinates
+#
+# returns ($score, \@aligned_domains); $score is always 0 here.
+# With no domains, returns (0, $domain_r).
+#
+sub sub_alignment_pos {
+    my ($hit_r, $domain_r, $target) = @_;
+
+    return (0, $domain_r) unless ($domain_r && scalar(@$domain_r));
+
+    my $btop_enc_r = decode_btop($hit_r->{BTOP});
+
+    my @active_dom_list = ();
+    my @aligned_domains = ();
+
+    my $left_active_end = $domain_r->[-1]->{d_end}+1; # as far right as possible
+    my ($q_start, $s_start) = @{$hit_r}{qw(q_start s_start)};
+    my ($qix, $six) = ($q_start, $s_start); # both coordinates are 1-based
+
+    # $ds_ix references the coordinate of the annotated sequence
+    # (subject when $target is true, query otherwise); $h_start is the
+    # alignment start in that same sequence
+    my ($ds_ix, $h_start) = $target ? (\$six, $s_start) : (\$qix, $q_start);
+
+    # no matrix is available, so both values stay 0; they are kept so
+    # the annot_match helpers get the arguments they expect
+    my ($score, $dom_score) = (0, 0);
+    my $id_cnt = 0;
+
+    my ($dom_ix, $dom_nx) = (0, scalar(@$domain_r));
+    my $dom_r = $domain_r->[0];
+
+    # skip over domains that end before the alignment starts and
+    # activate every domain that spans the alignment start; stop at the
+    # first domain that begins inside the alignment
+    for ($dom_ix = 0; $dom_ix < $dom_nx; $dom_ix++) {
+        next if ($domain_r->[$dom_ix]->{d_end} < $h_start);
+        $dom_r = $domain_r->[$dom_ix];
+        last if ($dom_r->{d_pos} > $h_start);
+        push @aligned_domains, $dom_r;
+        $left_active_end = push_annot_match(\@active_dom_list, $dom_r, $q_start, $s_start, 0, 0);
+    }
+
+    # activate the next domain when it starts at the current position.
+    # The subject coordinate passed is always $six (not $$ds_ix), so
+    # {sa_start} is a subject position for query annotations as well.
+    my $open_domain = sub {
+        return unless ($dom_ix < $dom_nx && $$ds_ix == $dom_r->{d_pos});
+        push @aligned_domains, $dom_r;
+        $left_active_end = push_annot_match(\@active_dom_list, $dom_r, $qix, $six, $id_cnt, $dom_score);
+        $dom_ix++;
+        $dom_r = $domain_r->[$dom_ix];
+        ($dom_score, $id_cnt) = (0,0);
+    };
+
+    # finish every active domain that ends at the current position; this
+    # must not depend on $dom_ix, or the last domain(s) would never be
+    # closed at their own end
+    my $close_domains = sub {
+        return unless (@active_dom_list && $$ds_ix == $left_active_end);
+        $left_active_end = pop_annot_match(\@active_dom_list, $qix, $six, $$ds_ix, $id_cnt, $dom_score);
+        ($dom_score, $id_cnt) = (0,0);
+    };
+
+    for my $btop (@{$btop_enc_r}) {
+
+        if ($btop =~ m/^\d+$/) { # run of identical residues
+            for (my $i=0; $i < $btop; $i++) { # $i only counts, it indexes nothing
+                $open_domain->();
+                $id_cnt++ if (@active_dom_list);
+                $close_domains->();
+                $qix++;
+                $six++;
+            }
+            next;
+        }
+
+        my ($seq0, $seq1) = split(//,$btop);
+
+        if ($btop =~ m/\-/) {
+            if ($seq0 eq '-') { # gap in query, only the subject advances
+                if ($target) { # subject domains
+                    $open_domain->();
+                    $close_domains->();
+                }
+                $six++;
+            }
+            else { # gap in subject, only the query advances
+                unless ($target) { # query domains
+                    $open_domain->();
+                    $close_domains->();
+                }
+                $qix++;
+            }
+        }
+        else { # mismatch, both advance, no identity counted
+            $open_domain->();
+            $close_domains->();
+            $qix++;
+            $six++;
+        }
+    }
+
+    # all done, finish any domains that extend past the alignment end
+    if (@active_dom_list) {
+        last_annot_match(\@active_dom_list, $hit_r->{q_end}, $hit_r->{s_end}, $id_cnt, $dom_score);
+    }
+
+    return ($score, \@aligned_domains);
+}
+
+################
+# push_annot_match - adds domain $dom_r to the set of @$active_doms_r
+#
+# input: $active_doms_r : list of currently active domains
+#        $dom_r : domain to activate
+#        $q_pos, $s_pos : query/subject position where it starts
+#        $c_ident, $c_score : identities/score accumulated since the last
+#                             push/pop; added to every domain already active
+#
+# initializes ->{score}, ->{ident} to zero for the new domain and sets
+# its ->{qa_start, qa_pos, sa_start, sa_pos}
+# inserts (splice) the new domain so the list stays ordered
+# left-to-right by ->{d_end}
+# returns current left-most {d_end} boundary
+#
+sub push_annot_match {
+    my ($active_doms_r, $dom_r, $q_pos, $s_pos, $c_ident, $c_score) = @_;
+
+    $dom_r->{ident} = 0;
+    $dom_r->{score} = 0;
+    $dom_r->{qa_start} = $dom_r->{qa_pos} = $q_pos;
+    $dom_r->{sa_start} = $dom_r->{sa_pos} = $s_pos;
+
+    # update score, identity for the domains already in the list, and
+    # remember the FIRST entry that ends to the right of the new domain;
+    # the loop cannot stop there because every entry needs its update
+    my $nx = scalar(@$active_doms_r);
+    my $ins_ix = $nx; # default: append (also covers the empty list)
+    for (my $ix=0; $ix < $nx; $ix++) {
+        my $cur_r = $active_doms_r->[$ix];
+        $cur_r->{ident} += $c_ident;
+        $cur_r->{score} += $c_score;
+        if ($ins_ix == $nx && $dom_r->{d_end} < $cur_r->{d_end}) {
+            $ins_ix = $ix;
+        }
+    }
+
+    # now have location for insert
+    splice(@$active_doms_r, $ins_ix, 0, $dom_r);
+    return $active_doms_r->[0]->{d_end};
+}
+
+################
+# pop_annot_match - update domains in @$active_doms_r
+#
+# input: $active_doms_r : list of currently active domains
+#        $q_pos, $s_pos : current query/subject alignment position
+#        $d_pos : current position in the annotated sequence
+#        $c_ident, $c_score : identities/score accumulated since the last
+#                             push/pop; added to every active domain
+#
+# update: ->{ident}, ->{score}
+# add: ->{qa_end},->{sa_end} (and ->{qa_pos}, ->{sa_pos})
+# remove all domains that end at $d_pos and convert their {ident}
+# count to a fraction in ->{percid}
+# return left-most right boundary of the remaining domains, or -1 if
+# none remain
+
+sub pop_annot_match {
+    my ($active_doms_r, $q_pos, $s_pos, $d_pos, $c_ident, $c_score) = @_;
+
+    # select the finished domains by their own {d_end} rather than by
+    # list position, so the result does not depend on list order
+    my @still_active = ();
+    for my $cur_r (@$active_doms_r) {
+        $cur_r->{ident} += $c_ident;
+        $cur_r->{score} += $c_score;
+
+        if ($cur_r->{d_end} == $d_pos) {
+            # convert identity count to identity fraction
+            $cur_r->{percid} = $cur_r->{ident}/($cur_r->{d_end} - $cur_r->{d_pos}+1);
+            $cur_r->{qa_end} = $cur_r->{qa_pos} = $q_pos;
+            $cur_r->{sa_end} = $cur_r->{sa_pos} = $s_pos;
+        }
+        else {
+            push @still_active, $cur_r;
+        }
+    }
+
+    # replace the contents in place so the caller's list is updated
+    @$active_doms_r = @still_active;
+
+    return -1 unless (scalar(@$active_doms_r));
+
+    my $leftmost_end = $active_doms_r->[0]->{d_end};
+    for my $cur_r (@$active_doms_r) {
+        $leftmost_end = $cur_r->{d_end} if ($cur_r->{d_end} < $leftmost_end);
+    }
+    return $leftmost_end;
+}
+
+################
+# last_annot_match - finish every domain still active at the end of
+# the alignment
+#
+# input: $active_doms_r : list of currently active domains
+#        $q_pos, $s_pos : query/subject position of the alignment end
+#        $c_ident, $c_score : identities/score accumulated since the last
+#                             push/pop; added to every active domain
+#
+# update: ->{ident}, ->{score}, ->{percid}
+# add: ->{qa_end},->{sa_end} (and ->{qa_pos}, ->{sa_pos})
+# empties @$active_doms_r and returns the (now empty) list reference
+#
+sub last_annot_match {
+    my ($active_doms_r, $q_pos, $s_pos, $c_ident, $c_score) = @_;
+
+    for my $cur_r (@$active_doms_r) {
+        $cur_r->{ident} += $c_ident;
+        $cur_r->{score} += $c_score;
+        # convert identity count to identity fraction
+        $cur_r->{percid} = $cur_r->{ident}/($cur_r->{d_end} - $cur_r->{d_pos}+1);
+        $cur_r->{qa_end} = $cur_r->{qa_pos} = $q_pos;
+        $cur_r->{sa_end} = $cur_r->{sa_pos} = $s_pos;
+    }
+
+    # clear the caller's array; assigning a new reference to the local
+    # variable would leave the caller's list untouched
+    @$active_doms_r = ();
+    return $active_doms_r;
+}
+
+# site_align - report matches/mismatches on annotated sites
+#
+# given: (1) a query sequence; (2) an encoded alignment; (3) a scoring matrix
+#
+# input: $query_r : query residues, indexed with the query coordinate
+#                   (NOTE(review): looks 1-based like $qix -- confirm in caller)
+#        $hit_r->{BTOP, q_start, s_start} : alignment and its start
+#        $matrix_2d : scoring matrix handed to match_symb()
+#        $site_r : site annotations with ->{d_pos}, on the query
+#                  (target=0) or subject (target=1); the walk assumes
+#                  they are ordered by ->{d_pos}
+#        $target : 0=query, 1=subject
+#
+# updates $site_r->[]->{annot_ix, qa_pos, sa_pos}
+#                    ->{q_res, s_res, m_symb}
+# returns a reference to the list of sites found inside the alignment
+
+sub site_align {
+    my ($query_r, $hit_r, $matrix_2d, $site_r, $target) = @_;
+
+    return [] unless ($site_r && scalar(@$site_r));
+
+    my @aligned_sites = ();
+
+    my $btop_enc_r = decode_btop($hit_r->{BTOP});
+
+    my ($qix, $six) = @{$hit_r}{qw(q_start s_start)}; # $qix, $six 1-based
+
+    # $ds_ix references the coordinate of the annotated sequence,
+    # $a_start is the alignment start in that same sequence
+    my ($ds_ix, $a_start) = $target ? (\$six, $hit_r->{s_start}) : (\$qix, $hit_r->{q_start});
+
+    my ($site_ix, $site_nx) = (0,scalar(@$site_r));
+    my $s_r = $site_r->[0];
+
+    # skip over sites that precede the alignment
+    for ($site_ix=0; $site_ix < $site_nx; $site_ix++) {
+        if ($site_r->[$site_ix]->{d_pos} >= $a_start) { # first site inside alignment
+            $s_r = $site_r->[$site_ix];
+            last;
+        }
+    }
+
+    return [] unless $site_ix < $site_nx;
+
+    # record the current site at ($qix, $six) and move to the next site;
+    # only the alignment keys are written, the site's other fields
+    # (e.g. {d_end}) are left alone
+    my $save_site = sub {
+        my ($q_res, $s_res) = @_;
+        @{$s_r}{qw(annot_ix qa_pos sa_pos q_res s_res m_symb)} = ($site_ix, $qix, $six, $q_res, $s_res, match_symb($q_res, $s_res, $matrix_2d));
+        push @aligned_sites, $s_r;
+        $site_ix++;
+        $s_r = $site_r->[$site_ix]; # undef once past the last site
+    };
+
+    for my $btop (@{$btop_enc_r}) {
+        last if ($site_ix >= $site_nx);
+
+        if ($btop =~ m/^\d+$/) { # identical run, check for sites within it
+            my $bt_end = $$ds_ix + $btop - 1;
+
+            while ($site_ix < $site_nx && $s_r->{d_pos} <= $bt_end) {
+                # jump forward to the site; the offset is taken before
+                # either index moves because $ds_ix aliases one of them
+                my $jump = $s_r->{d_pos} - $$ds_ix;
+                $qix += $jump;
+                $six += $jump;
+                my $res = $query_r->[$qix]; # identical, so same residue on both
+                $save_site->($res, $res);
+            }
+
+            # no (more) sites in this run, consume what is left of it
+            my $rest = $bt_end - $$ds_ix + 1;
+            $qix += $rest;
+            $six += $rest;
+            next;
+        }
+
+        # sequence does not match -- must check each position
+        my ($seq0, $seq1) = split(//,$btop);
+
+        if ($btop =~ m/\-/) {
+            if ($seq0 eq '-') { # gap in query, only subject sites can be here
+                if ($target) {
+                    while ($site_ix < $site_nx && $s_r->{d_pos} == $six) {
+                        $save_site->($seq0, $seq1);
+                    }
+                }
+                $six++;
+            }
+            else { # gap in subject, only query sites can be here
+                unless ($target) {
+                    while ($site_ix < $site_nx && $s_r->{d_pos} == $qix) {
+                        $save_site->($seq0, $seq1);
+                    }
+                }
+                $qix++;
+            }
+        }
+        else { # mismatch
+            # bounds check first: $s_r is undef after the last site
+            while ($site_ix < $site_nx && $s_r->{d_pos} == $$ds_ix) {
+                $save_site->($seq0, $seq1);
+            }
+            $qix++;
+            $six++;
+        }
+    }
+
+    return (\@aligned_sites);
+}
+
+# match_symb - one-character summary of how two residues compare
+#   "=" : same residue (case-insensitive)
+#   "<" : negative matrix score, ">" : positive score, "z" : zero score
+# residues missing from %aa_map are scored as 'X'
+sub match_symb {
+    my ($seq0, $seq1, $matrix_2d) = @_;
+
+    return "=" if (uc($seq0) eq uc($seq1));
+
+    # matrix indices for the two residues, falling back to 'X'
+    my ($row_ix, $col_ix) = map { defined($aa_map{$_}) ? $aa_map{$_} : $aa_map{'X'} } ($seq0, $seq1);
+
+    my $pair_score = $matrix_2d->[$row_ix][$col_ix];
+
+    return "<" if ($pair_score < 0);
+    return ">" if ($pair_score > 0);
+    return "z";
+}
+
+
+
+# merge up to four lists of annotations into a single list, and return
+# a reference to the list
+# input: $hit references, possibly with {aligned_domains_r}, {aligned_sites_r}
+# {q_aligned_domains_r}, {q_aligned_sites_r}
+#
+# a query site and a subject site at the same {qa_pos} with the same
+# {type} are reported once, as the query entry with {target} = 2;
+# the result is ordered by {qa_pos}
+#
+sub merge_annots {
+    my ($hit_r) = @_;
+
+    my @merged_array = ();
+
+    # merge the sites arrays first, so that conserved annotated sites are juxtaposed
+
+    my ($qs_nx, $ss_nx) = (0,0);
+
+    $ss_nx = scalar(@{$hit_r->{aligned_sites_r}}) if (exists($hit_r->{aligned_sites_r}));
+    $qs_nx = scalar(@{$hit_r->{q_aligned_sites_r}}) if (exists($hit_r->{q_aligned_sites_r}));
+
+    if ($ss_nx && $qs_nx) { # have sites on both sequences
+        # find the positions that match between {q_aligned_sites_r} and {aligned_sites_r}
+
+        my @uniq_sites = ();
+
+        for my $qs_ref (@{$hit_r->{q_aligned_sites_r}}) {
+            $qs_ref->{merged} = 0;
+            for my $ss_ref ( @{$hit_r->{aligned_sites_r}} ) {
+                # both lists are walked in {qa_pos} order
+                next if ($ss_ref->{qa_pos} < $qs_ref->{qa_pos});
+                last if ($ss_ref->{qa_pos} > $qs_ref->{qa_pos});
+                # same {qa_pos} from here on
+                if ($qs_ref->{type} eq $ss_ref->{type}) {
+                    $qs_ref->{merged} = $ss_ref->{merged} = 1;
+                    $qs_ref->{target} = $ss_ref->{target} = 2;
+                    # save match
+                    push @uniq_sites, $qs_ref;
+                }
+            }
+        }
+
+        # save merged sites
+        push @merged_array, @uniq_sites;
+
+        # save unmerged subject sites ({merged} is undefined or 0)
+        push @merged_array, grep { !$_->{merged} } @{$hit_r->{aligned_sites_r}};
+
+        # save unmerged query sites -- from the QUERY list
+        push @merged_array, grep { !$_->{merged} } @{$hit_r->{q_aligned_sites_r}};
+    }
+    elsif ($ss_nx) {
+        push @merged_array, @{$hit_r->{aligned_sites_r}};
+    }
+    elsif ($qs_nx) {
+        push @merged_array, @{$hit_r->{q_aligned_sites_r}};
+    }
+
+    @merged_array = sort { $a->{qa_pos} <=> $b->{qa_pos} } @merged_array;
+
+    # domains are appended after the sites, then everything is ordered again
+    push @merged_array, @{$hit_r->{aligned_domains_r}} if (exists($hit_r->{aligned_domains_r}));
+    push @merged_array, @{$hit_r->{q_aligned_domains_r}} if (exists($hit_r->{q_aligned_domains_r}));
+
+    @merged_array = sort { $a->{qa_pos} <=> $b->{qa_pos} } @merged_array;
+
+    return \@merged_array;
+}
+
+# domain output formatter
+# builds the "|XR:..." field for one aligned domain: query and subject
+# boundaries, scaled score (s=), bits (b=), identity (I=), Q-value (Q=)
+# and the domain description, joined with ";"
+# $raw_score falls back to $hit_r->{score} (with a warning) when false
+sub format_dom_info {
+    my ($hit_r, $raw_score, $dom_r) = @_;
+
+    if (!$raw_score) {
+        warn "no raw_score at: ".$hit_r->{s_seqid}."\n";
+        $raw_score = $hit_r->{score};
+    }
+
+    # scale factor for the reported score, and the fraction of the raw
+    # alignment score that belongs to this domain
+    my $score_scale = $hit_r->{score}/$raw_score;
+    my $fsub_score = $dom_r->{score}/$raw_score;
+
+    # Q-value: -10*log10(evalue) scaled by the domain's score fraction,
+    # fixed at 3000 for an evalue of 0 and never negative
+    my $qval = ($hit_r->{evalue} == 0.0)
+        ? 3000.0
+        : -10.0*log($hit_r->{evalue})*$fsub_score/(log(10.0));
+    $qval = 0 if $qval < 0;
+
+    my $ns_score = int($dom_r->{score} * $score_scale+0.5);
+    my $s_bit = int($hit_r->{bits} * $fsub_score +0.5);
+
+    my @fields = (
+        sprintf("|XR:%d-%d:%d-%d:s=%d",
+                $dom_r->{qa_start},$dom_r->{qa_end},
+                $dom_r->{sa_start},$dom_r->{sa_end},$ns_score),
+        sprintf("b=%.1f",$s_bit),
+        sprintf("I=%.3f",$dom_r->{percid}),
+        sprintf("Q=%.1f",$qval),
+        $dom_r->{descr},
+    );
+
+    return join(";", @fields);
+}
+
+# merged annot output formatter
+# concatenates one "|..." field per annotation in @$annot_list_r:
+#   domains ({type} eq '-') : "|XR:" or "|RX:" with boundaries, scaled
+#                             score, bits, identity, Q-value, description
+#   sites (any other type)  : type/target marker, then query position,
+#                             residue, match symbol, subject position, residue
+# the raw score is $hit_r->{raw_score}, or $hit_r->{score} when that is false
+sub format_annot_info {
+    my ($hit_r, $annot_list_r) = @_;
+
+    my $raw_score = $hit_r->{raw_score} ? $hit_r->{raw_score} : $hit_r->{score};
+    my $score_scale = $hit_r->{score}/$raw_score;
+
+    my @pieces = ();
+
+    # two types of annotations, domains and sites.
+    for my $annot_r ( @$annot_list_r ) {
+
+        if ($annot_r->{type} ne '-') { # site annotation
+            my $ann_type = $annot_r->{type};
+
+            # marker position shows which sequence carries the site
+            my $site_str;
+            if ($annot_r->{target} == 1) {
+                $site_str = "|X".$ann_type;
+            }
+            elsif ($annot_r->{target} == 2) {
+                $site_str = "|$ann_type$ann_type";
+            }
+            else {
+                $site_str = "|".$ann_type . "X";
+            }
+
+            push @pieces, "$site_str:" . sprintf("%d%s%s%d%s",
+                $annot_r->{qa_pos}, $annot_r->{q_res}, $annot_r->{m_symb}, $annot_r->{sa_pos}, $annot_r->{s_res});
+            next;
+        }
+
+        # domain with scores
+        my $fsub_score = $annot_r->{score}/$raw_score;
+
+        my $qval = ($hit_r->{evalue} == 0.0)
+            ? 3000.0
+            : -10.0*log($hit_r->{evalue})*$fsub_score/(log(10.0));
+        $qval = 0 if $qval < 0;
+
+        my $ns_score = int($annot_r->{score} * $score_scale+0.5);
+        my $s_bit = int($hit_r->{bits} * $fsub_score +0.5);
+
+        push @pieces, join(";",(sprintf("|%s:%d-%d:%d-%d:s=%d",
+                                        $annot_r->{target} ? "XR" : "RX",
+                                        $annot_r->{qa_start},$annot_r->{qa_end},
+                                        $annot_r->{sa_start},$annot_r->{sa_end},$ns_score),
+                                sprintf("b=%.1f",$s_bit),
+                                sprintf("I=%.3f",$annot_r->{percid}),
+                                sprintf("Q=%.1f",$qval),$annot_r->{descr}));
+    }
+
+    return join("", @pieces);
+}
+
+__END__
+
+=pod
+
+=head1 NAME
+
+annot_blast_btop2.pl
+
+=head1 SYNOPSIS
+
+ annot_blast_btop2 --ann_script ann_pfam_www_e.pl [--query_file query.fasta] --out_fields "q_seqid s_seqid percid evalue" blast_tabular_file
+
+=head1 OPTIONS
+
+ -h short help
+ --help include description
+
+ --ann_script -- annotation script returning site/domain locations for subject sequences
+ -- same as --script
+
+ --q_ann_script -- annotation script for query sequences
+ -- same as --q_script
+
+ --query_file -- fasta query sequence
+ -- same as --query, --query_lib
+ (can contain multiple sequences for multi-sequence search)
+
+ --out_fields -- blast tabular fields shown before domain information
+
+ --raw_score -- add the raw_score used to normalize domain scores to
+ tabular output (raw_scores are only calculated for domains)
+
+=head1 DESCRIPTION
+
+C<annot_blast_btop2.pl> runs the script specified by
+C<--ann_script/--q_ann_script> to annotate the functional sites and domain
+content of the sequences specified by the subject/query seqid field of
+blast tabular format (-outfmt 6 or 7) or FASTA blast tabular format
+(-m 8). The C<--ann_script/--q_ann_script> script produces domain
+boundary coordinates, which are mapped to the alignment. For searches
+against SwissProt sequences, C<--ann_script ann_feats_up_www2.pl> will
+acquire features and domains from Uniprot. C<--ann_script
+ann_pfam_www.pl --neg> will get domain information from Pfam, and
+score non-domain (NODOM) regions.
+
+The tab file is read and parsed, and then the subject/query seqid is used to
+capture domain locations in the subject/query sequence. If the domains
+overlap the aligned region, the domain names are appended to the
+input.
+
+If a C<--query_file> is specified and two additional fields, C<score>
+and C<btop> are present, C<annot_blast_btop2.pl> calculates
+sub-alignment scores, including fraction identity, bit score, and
+Q-value (-log10(E-value)), partitioning the alignment score, identity,
+and bit score across the overlapping domains.
+
+The C<--out_fields> specifies the blast tabular fields that can be
+returned. By default, C<q_seqid s_seqid percid alen mismatch gopen
+q_start q_end s_start s_end evalue bits> (but not C<score> and
+C<BTOP>) are shown.
+
+Currently, this program is fully functional only for blastp (or
+blastn) searches. For translated searches (blastx) domain content,
+location and identity is provided, but not bit-scores or Q-values.
+
+=head1 AUTHOR
+
+William R. Pearson, wrp at virginia.edu
+
+=cut
diff --git a/scripts/blastp_cmd.sh b/scripts/blastp_cmd.sh
new file mode 100755
index 0000000..73f86e3
--- /dev/null
+++ b/scripts/blastp_cmd.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+cmd="";
+for i in "$@"
+do
+ case $i in
+ -o=*|--outname=*)
+ OUTNAME="${i#*=}"
+ shift # past argument=value
+ ;;
+ *)
+ cmd="$cmd $i"
+ ;;
+ esac
+done
+
+bl_asn=${OUTNAME}.asn
+bl0_out="$OUTNAME.html"
+blm_out="$OUTNAME.msa"
+blt_out="$OUTNAME.bl_tab"
+
+# echo "OUTFILE = ${OUTNAME}"
+
+#export BLAST_PATH="/ebi/extserv/bin/ncbi-blast+/bin"
+export BLAST_PATH="/seqprg/bin"
+
+$BLAST_PATH/blastp -outfmt 11 $cmd > $bl_asn
+$BLAST_PATH/blast_formatter -archive $bl_asn -outfmt 0 -html > $bl0_out
+$BLAST_PATH/blast_formatter -archive $bl_asn -outfmt '7 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore score btop' > $blt_out
+$BLAST_PATH/blast_formatter -archive $bl_asn -outfmt 2 > $blm_out
+
diff --git a/scripts/color_defs.pl b/scripts/color_defs.pl
new file mode 100755
index 0000000..0d09ed5
--- /dev/null
+++ b/scripts/color_defs.pl
@@ -0,0 +1,170 @@
+# color_defs.pl used by lav2plt.pl for domain coloring
+
+################################################################
+# copyright (c) 2014 by William R. Pearson and The Rector &
+# Visitors of the University of Virginia */
+################################################################
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under this License is distributed on an "AS
+# IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+################################################################
+
+%color_names = (
+aliceblue=>[240,248,255],
+antiquewhite=>[250,235,215],
+aqua=>[0,255,255],
+aquamarine=>[127,255,212],
+azure=>[240,255,255],
+beige=>[245,245,220],
+bisque=>[255,228,196],
+black=>[0,0,0],
+blanchedalmond=>[255,235,205],
+blue=>[0,0,255],
+blueviolet=>[138,43,226],
+brown=>[165,42,42],
+burlywood=>[222,184,135],
+cadetblue=>[95,158,160],
+chartreuse=>[127,255,0],
+chocolate=>[210,105,30],
+coral=>[255,127,80],
+cornflowerblue=>[100,149,237],
+cornsilk=>[255,248,220],
+crimson=>[220,20,60],
+cyan=>[0,255,255],
+darkblue=>[0,0,139],
+darkcyan=>[0,139,139],
+darkgoldenrod=>[184,134,11],
+darkgray=>[169,169,169],
+darkgreen=>[0,100,0],
+darkgrey=>[169,169,169],
+darkkhaki=>[189,183,107],
+darkmagenta=>[139,0,139],
+darkolivegreen=>[85,107,47],
+darkorange=>[255,140,0],
+darkorchid=>[153,50,204],
+darkred=>[139,0,0],
+darksalmon=>[233,150,122],
+darkseagreen=>[143,188,143],
+darkslateblue=>[72,61,139],
+darkslategray=>[47,79,79],
+darkslategrey=>[47,79,79],
+darkturquoise=>[0,206,209],
+darkviolet=>[148,0,211],
+deeppink=>[255,20,147],
+deepskyblue=>[0,191,255],
+dimgray=>[105,105,105],
+dimgrey=>[105,105,105],
+dodgerblue=>[30,144,255],
+firebrick=>[178,34,34],
+floralwhite=>[255,250,240],
+forestgreen=>[34,139,34],
+fuchsia=>[255,0,255],
+gainsboro=>[220,220,220],
+ghostwhite=>[248,248,255],
+gold=>[255,215,0],
+goldenrod=>[218,165,32],
+gray=>[128,128,128],
+green=>[0,128,0],
+greenyellow=>[173,255,47],
+grey=>[128,128,128],
+honeydew=>[240,255,240],
+hotpink=>[255,105,180],
+indianred=>[205,92,92],
+indigo=>[75,0,130],
+ivory=>[255,255,240],
+khaki=>[240,230,140],
+lavender=>[230,230,250],
+lavenderblush=>[255,240,245],
+lawngreen=>[124,252,0],
+lemonchiffon=>[255,250,205],
+lightblue=>[173,216,230],
+lightcoral=>[240,128,128],
+lightcyan=>[224,255,255],
+lightgoldenrodyellow=>[250,250,210],
+lightgray=>[211,211,211],
+lightgreen=>[144,238,144],
+lightgrey=>[211,211,211],
+lightpink=>[255,182,193],
+lightsalmon=>[255,160,122],
+lightseagreen=>[32,178,170],
+lightskyblue=>[135,206,250],
+lightslategray=>[119,136,153],
+lightslategrey=>[119,136,153],
+lightsteelblue=>[176,196,222],
+lightyellow=>[255,255,224],
+lime=>[0,255,0],
+limegreen=>[50,205,50],
+linen=>[250,240,230],
+magenta=>[255,0,255],
+maroon=>[128,0,0],
+mediumaquamarine=>[102,205,170],
+mediumblue=>[0,0,205],
+mediumorchid=>[186,85,211],
+mediumpurple=>[147,112,219],
+mediumseagreen=>[60,179,113],
+mediumslateblue=>[123,104,238],
+mediumspringgreen=>[0,250,154],
+mediumturquoise=>[72,209,204],
+mediumvioletred=>[199,21,133],
+midnightblue=>[25,25,112],
+mintcream=>[245,255,250],
+mistyrose=>[255,228,225],
+moccasin=>[255,228,181],
+navajowhite=>[255,222,173],
+navy=>[0,0,128],
+oldlace=>[253,245,230],
+olive=>[128,128,0],
+olivedrab=>[107,142,35],
+orange=>[255,165,0],
+orangered=>[255,69,0],
+orchid=>[218,112,214],
+palegoldenrod=>[238,232,170],
+palegreen=>[152,251,152],
+paleturquoise=>[175,238,238],
+palevioletred=>[219,112,147],
+papayawhip=>[255,239,213],
+peachpuff=>[255,218,185],
+peru=>[205,133,63],
+pink=>[255,192,203],
+plum=>[221,160,221],
+powderblue=>[176,224,230],
+purple=>[128,0,128],
+red=>[255,0,0],
+rosybrown=>[188,143,143],
+royalblue=>[65,105,225],
+saddlebrown=>[139,69,19],
+salmon=>[250,128,114],
+sandybrown=>[244,164,96],
+seagreen=>[46,139,87],
+seashell=>[255,245,238],
+sienna=>[160,82,45],
+silver=>[192,192,192],
+skyblue=>[135,206,235],
+slateblue=>[106,90,205],
+slategray=>[112,128,144],
+slategrey=>[112,128,144],
+snow=>[255,250,250],
+springgreen=>[0,255,127],
+steelblue=>[70,130,180],
+tan=>[210,180,140],
+teal=>[0,128,128],
+thistle=>[216,191,216],
+tomato=>[255,99,71],
+turquoise=>[64,224,208],
+violet=>[238,130,238],
+wheat=>[245,222,179],
+white=>[255,255,255],
+whitesmoke=>[245,245,245],
+yellow=>[255,255,0],
+yellowgreen=>[154,205,50],
+);
+
+1;
diff --git a/scripts/exp_up_ensg.pl b/scripts/exp_up_ensg.pl
new file mode 100755
index 0000000..f7356c5
--- /dev/null
+++ b/scripts/exp_up_ensg.pl
@@ -0,0 +1,145 @@
+#!/usr/bin/perl -w
+
+################################################################
+# copyright (c) 2010, 2014 by William R. Pearson and The Rector &
+# Visitors of the University of Virginia */
+################################################################
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under this License is distributed on an "AS
+# IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+################################################################
+
+## usage - exp_up_ensg.pl seed_acc.file > linked_fasta.file
+##
+## take a fasta36 -e expand.sh result file of the form:
+## sp|P09488|<tab>1.1e-50
+##
+## and extract the accession number, looking it up in an SQL
+## table $table. This script uses the database created by link2sql.pl.
+## Code is included for linking to UniRef as well as NCBI refseq
+## searches.
+
+## Once the linked accession numbers are found, the sequences are
+## extracted from the SQL database uniprot (see Mackey and Pearson
+## (2004) Current Protocols in Bioinformatics (L. Stein, ed) "Using
+## SQL databases for sequence similarity searching and analysis".
+## Alternatively, one could use blastdbcmd or fastacmd to extract the
+## sequences from an NCBI blast-formatted database.
+##
+
+use strict;
+use DBI;
+
+# database connection parameters; hosts whose name contains "ebi" use
+# a different server, database and port
+# NOTE(review): the account name and password are hard-coded here
+my ($host, $port, $db, $user, $pass);
+
+my $hostname = `/bin/hostname`;
+
+if ($hostname =~ m/ebi/) {
+    ($host, $db, $port, $user, $pass) = ("mysql-pearson", "up_db", 4124, "web_user", "fasta_www");
+}
+else {
+    ($host, $db, $port, $user, $pass) = ("xdb", "uniprot", 0, "web_user", "fasta_www");
+}
+
+my $connect = "dbi:mysql(AutoCommit=>1,RaiseError=>1):database=$db";
+$connect .= ";host=$host" if $host;
+$connect .= ";port=$port" if $port;
+
+my $dbh = DBI->connect($connect,
+                       $user,
+                       $pass
+                      ) or die $DBI::errstr;
+
+# SQL text, replaced below by the prepared statement handles.
+# Both up2ensg_* queries name their columns, because processLine()
+# reads each row positionally as (acc, ensg).
+my %sth = (
+    up2ensg_id => "SELECT acc, ensg FROM ensg JOIN annot2 USING(acc) WHERE id=?",
+    up2ensg_acc => "SELECT acc, ensg FROM ensg WHERE acc=?",
+    ensg2seq => "SELECT * FROM ensg JOIN annot2 USING(acc) JOIN protein USING(acc) WHERE ensg=?",
+);
+
+for my $query_name (keys(%sth)) {
+    $sth{$query_name} = $dbh->prepare($sth{$query_name});
+}
+
+# accessions of the input hits (filled by processLine), so they are
+# not printed again, and the Ensembl genes they are linked to
+my %acc_uniq = ();
+my %ensg_uniq = ();
+
+# input lines have the form: identifier<tab>e-value
+while (my $line = <>) {
+    chomp($line);
+    my ($hit) = split(/\t/,$line);
+    next unless (defined($hit) && length($hit)); # blank line, no identifier
+    processLine($hit);
+}
+
+# print, in FASTA format, every sequence linked to one of the genes
+# that was not itself one of the input hits
+for my $ensg_acc ( keys %ensg_uniq ) {
+
+    $sth{ensg2seq}->execute($ensg_acc);
+    while (my $row_href = $sth{ensg2seq}->fetchrow_hashref ) {
+        next if ($acc_uniq{$row_href->{acc}});
+        print ">",join('|', (lc($row_href->{db}),$row_href->{acc},$row_href->{id}))," ($ensg_uniq{$ensg_acc}->{id}|$ensg_acc) $row_href->{descr}\n";
+        print $row_href->{seq} . "\n";
+    }
+    $sth{ensg2seq}->finish();
+}
+
+$dbh->disconnect();
+
+# processLine - find the Ensembl gene(s) linked to one hit identifier
+#
+# input: $id, in one of the forms
+#   gi|12346|ref|NP_98765.1|    (accession is the 4th field, version removed)
+#   SP:GSTM1_MOUSE P10649       (any "db:name" form, looked up by name)
+#   sp|P09488|GSTM1_HUMAN       (sp| or tr|, accession is the 2nd field)
+#   anything else               (used as the accession itself)
+#
+# looks the accession up with $sth{up2ensg_acc}, or the name with
+# $sth{up2ensg_id}, reading each row as (acc, ensg), and records the
+# results in %acc_uniq and %ensg_uniq; an accession already in
+# %acc_uniq is not looked up again
+#
+sub processLine{
+    my ($id)=@_;
+    my ($link_acc, $link_id);
+
+    my $use_acc = 1;
+
+    if ($id =~ m/^gi\|/) {
+        # $id of the form: gi|12346|ref|NP_98765.1|<tab>1.1e-50
+        ($link_acc, $link_id) = (split(/\|/,$id))[3,4];
+        $link_acc =~ s/\.\d+$// if defined($link_acc);
+    }
+    elsif ($id =~ m/(\w+):(\w+)/) {
+        # db:name, which includes the SP:/TR: forms
+        $link_id = $2;
+        $link_acc = '';
+        $use_acc = 0;
+    }
+    elsif ($id =~ m/(?:sp|tr)\|([\w\-\.]+)/) {
+        (undef, $link_acc, $link_id) = split(/\|/,$id);
+    }
+    else {$link_acc = $id;}
+
+    my $get_sth;
+    if ($use_acc) {
+        return unless defined($link_acc); # malformed gi| line, nothing to look up
+        return if ($acc_uniq{$link_acc});
+        $get_sth = $sth{up2ensg_acc};
+        $get_sth->execute($link_acc);
+    }
+    else {
+        $get_sth = $sth{up2ensg_id};
+        $get_sth->execute($link_id);
+    }
+
+    # the first two columns of each row are taken as (acc, ensg)
+    while (my ($acc, $ensg) = $get_sth->fetchrow_array()) {
+        $acc_uniq{$acc} = $acc unless $acc_uniq{$acc};
+        $ensg_uniq{$ensg} = {id=>$acc} unless $ensg_uniq{$ensg};
+    }
+    $get_sth->finish();
+}
diff --git a/scripts/expand_links.pl b/scripts/expand_links.pl
new file mode 100755
index 0000000..9be7c0c
--- /dev/null
+++ b/scripts/expand_links.pl
@@ -0,0 +1,100 @@
+#!/usr/bin/perl -w
+
+################################################################
+# copyright (c) 2010, 2014 by William R. Pearson and The Rector &
+# Visitors of the University of Virginia */
+################################################################
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under this License is distributed on an "AS
+# IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+################################################################
+
+## usage - expand_links.pl seed_acc.file > linked_fasta.file
+##
+## take a fasta36 -e expand.sh result file of the form:
+## gi|12346|ref|NP_98765.1|<tab>1.1e-50
+##
+## and extract the accession number, looking it up in the SQL
+## table $table. This script uses the database created by link2sql.pl.
+## Code is included for linking to UniRef as well as NCBI refseq
+## searches.
+
+## Once the linked accession numbers are found, the sequences are
+## extracted from the SQL database seqdb_demo2 (see Mackey and Pearson
+## (2004) Current Protocols in Bioinformatics (L. Stein, ed) "Using
+## SQL databases for sequence similarity searching and analysis".
+## Alternatively, one could use blastdbcmd or fastacmd to extract the
+## sequences from an NCBI blast-formatted database.
+##
+
+use strict;
+use DBI;
+
+# Connection settings and the name of the seed->link table.
+my ($host, $db, $port, $user, $pass, $table) = ("xdb", "wrp_link", 0, "web_user", "fasta_www", "micr_samp_link50");
+
+# The password is held in $pass; under "use strict" any other variable
+# name here is a compile-time error.
+my $dbh = DBI->connect("dbi:mysql:host=$host:$db",
+                       $user, $pass,
+                       { RaiseError => 1, AutoCommit => 1}
+    ) or die $DBI::errstr;
+
+# SQL text keyed by purpose; each value is replaced by its prepared
+# statement handle in the loop below.
+my %sth = (
+    seed2link => "SELECT link_acc FROM $table WHERE seed_acc=?",
+    link2seq => "SELECT * FROM seqdb_demo2.annot JOIN seqdb_demo2.protein USING(prot_id) WHERE acc=? AND pref=1"
+    );
+
+for my $sth (keys(%sth)) {
+    $sth{$sth} = $dbh->prepare($sth{$sth});
+}
+
+# linked accession => seed accession that led to it (filled by processLine)
+my %acc_uniq = ();
+
+# input lines are "<hit id><tab><e-value>"; only the hit id is used
+while (my $line = <>) {
+    chomp($line);
+    my ($hit, $e_val) = split(/\t/,$line);
+    processLine($hit,$sth{seed2link});
+}
+
+# print every linked sequence as a FASTA record on stdout
+for my $acc ( keys %acc_uniq ) {
+
+    $sth{link2seq}->execute($acc);
+    while (my $row_href = $sth{link2seq}->fetchrow_hashref ) {
+        print ">". $row_href->{db} . "|". $row_href->{acc} . " (micr_samp|$acc_uniq{$acc}) " .
+            $row_href->{descr}. "\n";
+        print $row_href->{seq} . "\n";
+    }
+    $sth{link2seq}->finish();
+}
+
+$dbh->disconnect();
+
+# processLine($id, $sth) - derive the seed accession from hit id $id, run
+# the prepared statement $sth with it, and record each returned accession
+# that differs from the seed in %acc_uniq (first seed stored wins).
+sub processLine{
+    my ($id,$sth)=@_;
+
+    my $seed = $id;
+    if ($seed =~ m/^gi\|/) {
+        # gi|12346|ref|NP_98765.1| : take the 4th field, drop the version
+        $seed = (split(/\|/,$seed))[3];
+        $seed =~ s/\.\d+$//;
+    }
+    else {
+        # UniRefNN_ACC : strip the cluster prefix (no-op for other ids)
+        $seed =~ s/^UniRef\d+_//;
+    }
+
+    $sth->execute($seed);
+
+    while (my ($linked) = $sth->fetchrow_array()) {
+        next if ($linked eq $seed);
+        $acc_uniq{$linked} ||= $seed;
+    }
+    $sth->finish();
+}
diff --git a/scripts/expand_uniref50.pl b/scripts/expand_uniref50.pl
new file mode 100755
index 0000000..43d6900
--- /dev/null
+++ b/scripts/expand_uniref50.pl
@@ -0,0 +1,83 @@
+#!/usr/bin/perl -w
+
+################################################################
+# copyright (c) 2010, 2014 by William R. Pearson and The Rector &
+# Visitors of the University of Virginia */
+################################################################
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under this License is distributed on an "AS
+# IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+################################################################
+
+## usage - expand_uniref50.pl uref50acc.file > up_fasta.file
+#
+
+# (1) take a list of uniref50 accessions and uses uniref50link to get
+# the associated uniprot accessions
+# (2) take the uniprot accessions and produce a fasta library file
+# from them
+
+use strict;
+use DBI;
+
+# Database connection settings.
+my ($host, $db, $port, $user, $pass) = ("xdb", "uniprot", 0, "web_user", "fasta_www");
+
+# Assemble the DSN; host and port are appended only when they are set.
+my $connect = join(';',
+                   "dbi:mysql(AutoCommit=>1,RaiseError=>1):database=$db",
+                   ($host ? "host=$host" : ()),
+                   ($port ? "port=$port" : ()));
+
+my $dbh = DBI->connect($connect, $user, $pass) or die $DBI::errstr;
+
+# SQL text keyed by purpose; each value is swapped for its prepared handle.
+my %up_sth = (
+    ur50_to_upacc => "SELECT uniprot_acc FROM uniref50link WHERE uniref50_acc=?",
+    upacc_to_seq => "SELECT * FROM annot2 join protein USING(acc) WHERE acc=?",
+    );
+
+for my $name (keys(%up_sth)) {
+    my $sql = $up_sth{$name};
+    $up_sth{$name} = $dbh->prepare($sql);
+}
+
+# uniprot accession => uniref50 accession it was reached from
+my %acc_uniq = ();
+
+while (my $line = <>) {
+    next if ($line =~ m/^UniRef50_UPI/);	# _UPI accessions are not in sp-trembl
+    chomp($line);
+    # first tab-delimited field is the UniRef50 accession
+    my ($up_acc) = split(/\t/,$line);
+    processLine($up_acc,$up_sth{ur50_to_upacc});
+}
+
+# write one FASTA record per row found for each collected accession
+my $seq_sth = $up_sth{upacc_to_seq};
+for my $up_acc ( keys %acc_uniq ) {
+
+    $seq_sth->execute($up_acc);
+    while (my $row = $seq_sth->fetchrow_hashref ) {
+        print ">sp|", $row->{acc}, "|", $row->{id}, " (uref50|$acc_uniq{$up_acc}) ",
+            $row->{descr}, "\n";
+        print $row->{seq}, "\n";
+    }
+    $seq_sth->finish();
+}
+
+$dbh->disconnect();
+
+# processLine($id, $sth) - strip the "UniRef50_" tag from $id, run the
+# prepared statement $sth with the result, and remember in %acc_uniq which
+# stripped id each returned accession came from (first one stored wins).
+sub processLine{
+    my ($id,$sth)=@_;
+
+    $id =~ s/UniRef50_//;
+    $sth->execute($id);
+
+    while (my ($member) = $sth->fetchrow_array()) {
+        $acc_uniq{$member} ||= $id;
+    }
+    $sth->finish();
+}
diff --git a/scripts/lav2plt.pl b/scripts/lav2plt.pl
new file mode 100755
index 0000000..2f1b593
--- /dev/null
+++ b/scripts/lav2plt.pl
@@ -0,0 +1,349 @@
+#!/usr/bin/perl -w
+
+# lav2plt.pl - produce plot from lav output */
+
+# $Id: lav2plt.c 625 2011-03-23 17:21:38Z wrp $ */
+# $Revision: 625 $ */
+
+################################################################
+# copyright (c) 2012, 2014 by William R. Pearson and The Rector &
+# Visitors of the University of Virginia */
+################################################################
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under this License is distributed on an "AS
+# IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+################################################################
+
+use strict;
+use Getopt::Long;
+use Pod::Usage;
+
+# Package globals shared with the plotting backend that is pulled in with
+# require below.  File-scoped "my" variables are not visible inside a
+# required file, so everything the backend reads by name must be listed
+# here; $have_bits and $have_zdb are included for that reason.
+use vars qw($pminx $pmaxx $pminy $pmaxy $lvstr $max_x $max_y
+            $fxscal $fyscal $fxoff $fyoff
+            @linarr @elinval @blinval @ilinval
+            @line_colors @block_colors
+            $annot_color %annot_names %color_names
+            $have_bits $have_zdb);
+
+@line_colors=qw(black blue brown green lightgreen);
+@block_colors = qw( slategrey lightgreen lightblue pink cyan tan gold plum mediumplum );
+
+($have_bits, $have_zdb) = (0,0);
+my ($zdb_size,$lav_dev, $shelp, $help) = (0,'svg',0,0);
+my ($x_upd_script, $y_upd_script) = ("","");
+my ($x_annot_arr_r, $y_annot_arr_r) = (0,0);
+
+$annot_color = 1;
+%annot_names = ();
+
+GetOptions("B"=>\$have_bits,
+           "Z=i"=>\$zdb_size,
+           "xA=s"=>\$x_upd_script,
+           "yA=s"=> \$y_upd_script,
+           "dev=s" => \$lav_dev,
+           "h|?" => \$shelp,
+           "help" => \$help,
+    );
+
+pod2usage(1) if $shelp;
+pod2usage(exitstatus => 0, verbose => 2) if $help;
+
+#require "./lav_defs.pl";
+# $max_x, $max_y define the maximum plotting area
+# the actual bounding box/view area will be larger if annotation comments are available
+($max_x,$max_y)=(540,540);
+@elinval=(1e-4,1e-2,1.0,100.0);
+@blinval=(40.0,30.0,20.0,10.0);
+@ilinval=(200,100,50,25);
+
+require "./color_defs.pl";
+
+# select the output backend: PostScript for --dev ps, SVG otherwise
+if ($lav_dev =~ m/ps/) {require "./lavplt_ps.pl";}
+else {require "./lavplt_svg.pl";}
+
+my ($g_n0, $g_n1);
+
+my $pgm_desc;
+my ($s_name0, $s_name1);
+my ($s_desc0, $s_desc1, $ss_desc0, $ss_desc1) = ("","","","");
+my ($p0_beg, $p1_beg, $p0_end, $p1_end);
+my $open_plt = 0;
+
+# a non-zero -Z value switches on E()-value colouring
+if ($zdb_size) {
+    $have_zdb = 1;
+}
+else {
+    $zdb_size = 1;
+}
+
+# Read the lav file one stanza at a time; the first character of the
+# stanza header line selects the reader for its body.
+while (my $line = <>) {
+    chomp $line;
+    next unless ($line);
+    next if ($line =~ m/^#/);
+
+    if ($line =~ m/^d/) {$pgm_desc = get_str();}
+    elsif ($line =~ m/^h/) {
+        ($s_desc0, $s_desc1) = get_str2();
+        $s_desc0 =~ s/^gi\|\d+\|//;
+        $s_desc1 =~ s/^gi\|\d+\|//;
+        $s_desc0 = substr($s_desc0,0,50);
+        $s_desc1 = substr($s_desc1,0,50);
+        # keep the first word of each description; the capture must be
+        # taken explicitly, a scalar assignment of the match only yields
+        # the true/false result
+        $ss_desc0 = ($s_desc0 =~ m/^(\S+)/) ? $1 : "";
+        $ss_desc1 = ($s_desc1 =~ m/^(\S+)/) ? $1 : "";
+    }
+    elsif ($line =~ m/^s/) {
+        ($s_name0, $p0_beg, $p0_end,$s_name1, $p1_beg, $p1_end) = get_seq_info();
+        $g_n0 = $p0_end - $p0_beg + 1;
+        $g_n1 = $p1_end - $p1_beg + 1;
+    }
+    elsif ($line =~ m/^a/) {
+        _lav2plt_start_plot() unless ($open_plt);
+        do_alignment($p0_beg, $p1_beg);
+    }
+}
+
+# no alignment stanza was seen: still emit the (empty) plot frame
+_lav2plt_start_plot() unless ($open_plt);
+closeplt();
+exit(0);
+
+# Read the optional annotations, open the plot, and draw the identity
+# diagonal when both axes show the same region of the same sequence.
+# get_annot() takes the sequence description first and the script or
+# file name second.
+sub _lav2plt_start_plot {
+    if ($y_upd_script) {$y_annot_arr_r = get_annot($s_desc1, $y_upd_script);}
+    if ($x_upd_script) {$x_annot_arr_r = get_annot($s_desc0, $x_upd_script);}
+    openplt($g_n0, $g_n1, $p0_beg, $p1_beg, $s_desc0, $s_desc1, $x_annot_arr_r, $y_annot_arr_r,$have_zdb, $have_bits);
+    if (($g_n0 == $g_n1) && ($p0_beg == $p1_beg) && ($p0_end == $p1_end) && $ss_desc0 eq $ss_desc1) {
+        drawdiag($g_n0, $g_n1);
+    }
+    $open_plt = 1;
+}
+
+# get a quote enclosed string
+# d {
+# "../bin/lalign36 -m "F11 mchu.lav" ../seq/mchu.aa ../seq/mchu.aa"
+# }
+
+# void get_str(FILE *file, char *str, size_t len) {
+sub get_str {
+    # Collect the body lines of the current stanza (up to the line that
+    # contains '}'), skipping empty lines and '#' comments, join them and
+    # strip the enclosing quotes.
+    my @pieces;
+    while (my $rec = <>) {
+        chomp $rec;
+        next if (!$rec || $rec =~ m/^#/);
+        last if ($rec =~ m/}/);
+        push @pieces, $rec;
+    }
+
+    my $text = join("", @pieces);
+    $text =~ s/^\s+"//;
+    $text =~ s/"\s*$//;
+
+    return $text;
+}
+
+# get two quote enclosed strings
+# h {
+# "MCHU - Calmodulin - Human, rabbit, bovine, rat, a - 148 aa"
+# "MCHU - Calmodulin - Human, rabbit, bovine, rat, and ch"
+# }
+#
+#void get_str2(FILE *file, char *str0, size_t len0, char *str1, size_t len1)
+
+sub get_str2 {
+    # Read the stanza body, then rebuild two quoted strings from it; a
+    # string may span several lines and ends on the line that closes
+    # its quote.
+    my @rows = ();
+
+    while (my $rec = <>) {
+        chomp $rec;
+        next if (!$rec || $rec =~ m/^#/);
+        last if ($rec =~ m/}/);
+        push @rows, $rec;
+    }
+
+    # take lines from @rows until one ends with a quote, then strip the
+    # enclosing quotes
+    my $next_quoted = sub {
+        my $text = "";
+        do {
+            $text .= shift @rows;
+        } while (@rows && $text !~ m/"\s*$/);
+        $text =~ s/^\s+"//;
+        $text =~ s/"\s*$//;
+        return $text;
+    };
+
+    my $first = $next_quoted->();
+    my $second = $next_quoted->();
+
+    return ($first, $second);
+}
+
+#void get_seq_info(FILE *file,
+# char *str0, size_t len0, int *n0_begin, int *n0_end,
+# char *str1, size_t len1, int *n1_begin, int *n1_end)
+
+# get_seq_info() - read the body of an "s" stanza, e.g.
+#   "../seq/mchu.aa" 1 148
+#   "../seq/mchu.aa" 1 148
+# and return (name0, begin0, end0, name1, begin1, end1).
+# The enclosing quotes are removed from the names, and a quoted name may
+# contain blanks.  Blank lines and '#' comment lines are ignored.
+sub get_seq_info {
+
+    my @lines = ();
+
+    while (my $line = <>) {
+        chomp($line);
+        next if ($line =~ m/^#/);
+        last if ($line =~ m/}/);
+        next unless ($line =~ m/\S/);
+        push @lines, $line;
+    }
+
+    my @info = ();
+    for my $idx (0, 1) {
+        my $rec = defined($lines[$idx]) ? $lines[$idx] : "";
+        my ($name, $beg, $end);
+
+        if ($rec =~ m/^\s*"([^"]*)"\s+(\S+)\s+(\S+)/) {
+            ($name, $beg, $end) = ($1, $2, $3);
+        }
+        else {
+            # unquoted name: plain whitespace-separated fields
+            ($name, $beg, $end) = split(' ', $rec);
+            $name = "" unless defined($name);
+            $name =~ s/^"//;
+            $name =~ s/"$//;
+        }
+        push @info, $name, $beg, $end;
+    }
+
+    return @info;
+}
+
+# void do_alignment(FILE *file, int p0_beg, int p1_beg)
+sub do_alignment {
+    # Read one "a" stanza and draw it: the first "l" record opens a line
+    # (colour chosen by opnline from the score/bits of the "s" record),
+    # later "l" records extend it.  Coordinates are made relative to the
+    # file-level $p0_beg/$p1_beg; the arguments passed by the caller are
+    # not read.
+
+    my ($score, $bits, $percent);
+    my ($s0_beg, $s0_end, $s1_beg, $s1_end);
+    my $line_open = 0;
+
+    while (my $rec = <>) {
+        chomp $rec;
+        next if (!$rec || $rec =~ m/^#/);
+        last if ($rec =~ m/}/);
+
+        my @fields = split(/\s+/,$rec);
+        # drop the empty first field produced by leading white space
+        shift @fields unless ($fields[0]);
+
+        if ($fields[0] eq 's') {
+            ($score, $bits) = @fields[1,2];
+        }
+        elsif ($fields[0] eq 'b') {
+            ($s0_beg, $s1_beg) = @fields[1,2];
+        }
+        elsif ($fields[0] eq 'e') {
+            ($s0_end, $s1_end) = @fields[1,2];
+        }
+        elsif ($fields[0] eq 'l') {
+            ($s0_beg, $s1_beg, $s0_end, $s1_end,$percent) = @fields[1..5];
+            my @from = ($s0_beg - $p0_beg + 1, $s1_beg - $p1_beg + 1);
+            my @to = ($s0_end - $p0_beg + 1, $s1_end - $p1_beg + 1);
+
+            if ($line_open) {
+                sxy_draw(@from);
+            }
+            else {
+                opnline($score, $bits);
+                sxy_move(@from);
+                $line_open = 1;
+            }
+            sxy_draw(@to);
+        }
+    }
+    clsline();
+}
+
+# get annot does 2 things:
+# (1) read in the annotations
+# (2) make a hash of annotation colors, changing the color with each addition
+#
+
+# get_annot($acc, $script) - read domain annotations.
+#   $acc    - sequence description; its first word is handed to the script
+#   $script - annotation file name, or "!command" to run a command with
+#             the accession as its single argument
+# Returns a reference to an array of hashes with the keys beg, end, descr,
+# sdescr (first 12 characters of descr) and sname (first word of sdescr),
+# or 0 when the file or command cannot be opened.  Every new sname is
+# given the next colour index in %annot_names.
+sub get_annot {
+    my ($acc, $script) = @_;
+
+    my $FIN;
+
+    if ($script !~ /^!/) {
+        # three-argument open: the name is always treated as a plain file
+        if (!open($FIN, '<', $script)) {
+            warn "cannot open annotation file: $script\n";
+            return 0;
+        }
+    }
+    else {	# run the script on the accession
+        $script =~ s/!//;
+        $acc = (defined($acc) && $acc =~ m/^(\S+)/) ? $1 : "";
+        # the accession comes from the input file; escape embedded single
+        # quotes so it stays a single shell word
+        (my $quoted_acc = $acc) =~ s/'/'\\''/g;
+        if (!open($FIN, '-|', "$script '$quoted_acc'")) {
+            warn "cannot run annotation script: $script $acc\n";
+            return 0;
+        }
+    }
+
+    my @annots = ();
+
+    my $header = <$FIN>;	# first line is a header and is discarded
+    while (my $line = <$FIN>) {
+        last if ($line =~ m/^>/);
+        chomp $line;
+        # expected record: <begin> <end> <description>
+        next unless ($line =~ m/^\s*(\d+)\s+(\d+)\s+(\S.*)$/);
+        my %fields = (beg => $1, end => $2, descr => $3);
+        $fields{sdescr} = substr($fields{descr},0,12);
+        ($fields{sname}) = ($fields{sdescr} =~ m/^(\S+)/);
+        push @annots, \%fields;
+        unless ($annot_names{$fields{sname}}) {
+            $annot_names{$fields{sname}} = $annot_color++;
+        }
+    }
+    close($FIN);
+    return \@annots;
+}
+
+# ln(2).  The value is assigned in a BEGIN block because the main program
+# calls exit(0) before execution would reach this point in the file; a
+# plain run-time assignment here would never be executed and bit_to_E()
+# would see an undefined value.
+my $M_LN2;
+BEGIN { $M_LN2 = 0.69314718055994530942; }
+
+# produce e_val from bit score */
+
+#double bit_to_E (double bit)
+# bit_to_E($bit) - expectation for a bit score: n0*n1/2^bit, converted to
+# 1-exp(-p) when larger than 0.01, multiplied by the database size
+# $zdb_size.
+sub bit_to_E {
+    my $bit = shift @_;
+
+    my ($p_val);
+
+    $p_val = $g_n0 * $g_n1 / exp($bit * $M_LN2);
+    if ($p_val > 0.01) {$p_val = 1.0 - exp(-$p_val);}
+
+    return $zdb_size * $p_val;
+}
+
+=pod
+
+=head1 NAME
+
+lav2plt.pl
+
+=head1 SYNOPSIS
+
+ lav2plt.pl -h -help -B -Z=10000 --dev svg|ps --xA x_annot_script.pl --yA y_annot_script.pl output.lav
+
+=head1 OPTIONS
+
+ -h short help
+ --help include description
+ -B have bit scores
+ -Z=# simulated database size
+ --dev svg|ps graphical output format
+ --xA/--yA domain annotation script
+
+=head1 DESCRIPTION
+
+C<lav2plt.pl> reads a local alignment lav file, produced by 'lalign36
+-m 11' and produces an alignment plot (on stdout) in svg (--dev svg, default)
+or postscript (--dev ps) format.
+
+=head1 AUTHOR
+
+William R. Pearson, wrp at virginia.edu
+
+=cut
diff --git a/scripts/lavplt_ps.pl b/scripts/lavplt_ps.pl
new file mode 100755
index 0000000..f8ed5d4
--- /dev/null
+++ b/scripts/lavplt_ps.pl
@@ -0,0 +1,540 @@
+################################################################
+# copyright (c) 2012, 2014 by William R. Pearson and The Rector &
+# Visitors of the University of Virginia */
+################################################################
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under this License is distributed on an "AS
+# IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+################################################################
+
+# C original: #define SX(x) (int)((double)(x)*fxscal+fxoff+24)
+# SX($x) - convert a sequence x position to an integer page x coordinate
+# using the global scale $fxscal and offset $fxoff.
+sub SX {
+    my ($pos) = @_;
+    my $scaled = $pos*$fxscal + $fxoff + 24;
+    return int($scaled);
+}
+
+# C original: #define SY(y) (int)((double)(y)*fyscal+fyoff+48)
+# SY($y) - convert a sequence y position to an integer page y coordinate
+# using the global scale $fyscal and offset $fyoff; this version adds 84.
+sub SY {
+    my ($pos) = @_;
+    my $scaled = $pos*$fyscal + $fyoff + 84;
+    return int($scaled);
+}
+
+# alignment lines: black blue cyan green lt_green */
+# r/g/b components (0..1) per line type
+# NOTE(review): no sub visible in this part of the file reads these three
+# arrays; linetype() looks colours up by name instead - confirm before removing.
+my @rlincol=(0.0,0.0,0.0,0.45,0.0);
+my @glincol=(0.0,0.0,0.5,0.30,1.0);
+my @blincol=(0.0,0.8,0.5,0.15,0.0);
+
+# colour names, file-scoped to this backend: linetype() indexes
+# @line_colors and draw_block() indexes @block_colors, and both resolve
+# the name through %color_names
+my @line_colors=qw(black blue cyan green lightgreen);
+my @block_colors = qw( slategrey lightgreen lightblue pink cyan tan gold plum mediumplum );
+
+# domain blocks: grey blue cyan green lt_green
+# NOTE(review): also not read by any sub visible here - confirm
+my @rblk_col=(0.33, 0.0, 0.0, 0.45, 0.0);
+my @gblk_col=(0.33, 0.0, 0.5, 0.30, 1.0);
+my @bblk_col=(0.33, 0.8, 0.5, 0.15, 0.0);
+
+# void openplt(long n0, long n1, int sq0off, int sq1off, char *xtitle, char *ytitle)
+# openplt(...) - write the PostScript prolog, set the global plot scaling,
+# and draw the frame, both axes, the legend and any annotation grids.
+#   $n0, $n1             - lengths plotted along x and y
+#   $sq0off, $sq1off     - coordinate offsets of the two sequences
+#   $xtitle, $ytitle     - axis titles
+#   $x_annot_r/$y_annot_r - annotation array refs (false when absent)
+#   $have_zdb/$have_bits - select the legend type
+# Sets the globals $pmaxx, $pmaxy, $fxscal, $fyscal, $fxoff, $fyoff, and
+# shrinks $max_x/$max_y when y annotations are present.
+sub openplt {
+    my ($n0, $n1, $sq0off, $sq1off, $xtitle, $ytitle, $x_annot_r, $y_annot_r, $have_zdb, $have_bits) = @_;
+
+    # E() thresholds may be overridden by $lvstr or the LINEVAL environment variable
+    if ($lvstr) {
+        @elinval = split(/\s+/,$lvstr);
+    }
+    elsif ($ENV{LINEVAL}) {
+        @elinval = split(/\s+/,$ENV{LINEVAL});
+    }
+
+## 8.5 x 11 paper is 612 pt x 792 pt -- important to stay on one page, with comments
+## max_x, max_y are set to 540 pt -- 7.5 in, 9pt (0.75 in) margins, leaving little extra space
+## if comments are provided, max_x, max_y must be reduced (max_x for space, max_y to keep things square)
+##
+
+    my ($xbound, $ybound) = ($max_x + 24, $max_y + 72);
+    if ($x_annot_r) {$ybound += 64;}
+    if ($y_annot_r) {
+        $xbound += 100;
+        # reduce each dimension by a tenth of itself so the area stays square
+        $max_x -= $max_x/10;
+        $max_y -= $max_y/10;
+    }
+
+    print("%!PS-Adobe-2.0\n");
+    print("%%Creator: plalign\n");
+    # plain print: the date is part of the string and the comment needs
+    # its own line
+    print("%%CreationDate: 2012-01-01\n");
+    print("%%DocumentFonts: Courier\n");
+    print("%%Pages: 1\n");
+    print("%%BoundingBox: 18 18 $xbound $ybound\n");
+    print("%%EndComments\n");
+    print("%%EndProlog\n");
+    print("%%Page: 1 1\n");
+    print("/Courier findfont 14 scalefont setfont\n");
+    print("/vcprint { gsave 90 rotate dup stringwidth pop 2 div neg 0 rmoveto\n");
+    print("show newpath stroke grestore } def\n");
+    print("/vprint { gsave 90 rotate\n");
+    print("show newpath stroke grestore } def\n");
+    print("/hcprint { gsave dup stringwidth pop 2 div neg 0 rmoveto\n");
+    print("show newpath stroke grestore } def\n");
+    print("/hrprint { gsave dup stringwidth pop neg 0 rmoveto\n");
+    print("show newpath stroke grestore } def\n");
+    print("/hprint { gsave show newpath stroke grestore } def\n");
+    # % x y w h RT - % draw a rectangle size w h at x y
+    print("/RT { [ ] 0 setdash newpath 4 -2 roll moveto dup 0 exch rlineto exch 0 rlineto neg 0 exch rlineto closepath } def \n");
+
+    ($pmaxx, $pmaxy) = ($n0, $n1);
+
+# $max_x, $max_y define the maximum plotting area
+# the actual bounding box/view area will be larger if annotation comments are available
+
+    $fxscal = ($max_x-1)/$n1;
+    $fyscal = ($max_y-1)/$n0;
+
+    # use the smaller scale on both axes
+    if ($fxscal > $fyscal) {$fxscal = $fyscal;}
+    else {$fyscal = $fxscal;}
+
+    if ($fyscal * $n0 < $max_y/5.0) {
+        $fyscal = ($max_y-1)/($n0*5.0);
+    }
+
+    $fxscal *= 0.9; $fxoff = ($max_x-1)/11.0;
+    $fyscal *= 0.9; $fyoff = ($max_y-1)/11.0;
+
+    printf("%% openplt - frame - %ld %ld\n", $n0, $n1);
+
+    # draw the plot frame
+    linetype(0);
+    print("gsave\n");
+    print("currentlinewidth 1.5 mul setlinewidth\n");
+    newline();
+    move(SX(0),SY(0));
+    draw(SX(0),SY($n1+1));
+    draw(SX($n0+1),SY($n1+1));
+    draw(SX($n0+1),SY(0));
+    draw(SX(0),SY(0));
+    clsline($n0,$n1,100000);
+    print("grestore\n");
+
+    # NOTE(review): the x axis is given $sq1off and the y axis $sq0off,
+    # while xgrid/ygrid use $sq0off for x and $sq1off for y - confirm
+    # which pairing is intended.
+    my $n_div = 11;
+    xaxis($n0,$sq1off, $xtitle, $n_div);
+
+    $n_div = 21 unless $n0 == $n1;
+    yaxis($n1,$sq0off, $ytitle, $n_div);
+    legend($have_zdb, $have_bits);
+
+    print("%% openplt done\n");
+
+    if ($x_annot_r) {xgrid($x_annot_r, $n0, $sq0off, $n1, $sq1off);}
+    if ($y_annot_r) {ygrid($y_annot_r, $n0, $sq0off, $n1, $sq1off);}
+}
+
+#void drawdiag(long n0, long n1)
+# drawdiag($n0, $n1) - draw the diagonal from the plot origin to
+# ($n0+1, $n1+1) with a 1.5x line width, bracketed by PostScript comments.
+sub drawdiag {
+    my ($len_x, $len_y) = @_;
+
+    linetype(0);
+    printf("%% drawdiag %ld %ld\n",$len_x, $len_y);
+    print("gsave\n", "currentlinewidth 1.5 mul setlinewidth\n");
+    newline();
+    move(SX(0),SY(0));
+    draw(SX($len_x+1),SY($len_y+1));
+    clsline($len_x,$len_y,10000);
+    print("grestore\n", "%% drawdiag done\n");
+}
+
+# tick array - values */
+# candidate tick spacings in increasing order; xaxis() and yaxis() use the
+# first one that yields fewer than the requested number of divisions
+my @tarr = (10,20,50,100,200,500,1000,2000,5000,10000,20000,50000,100000,200000,500000,1000000);
+
+# void xaxis(long n, int offset, char *title)
+# xaxis($n, $offset, $title, $n_div) - draw the ticks, tick labels and the
+# centered title below the x axis.
+#   $n      - axis length in sequence positions
+#   $offset - coordinate offset of the axis (1 means no shift)
+#   $title  - axis title; parentheses are replaced by blanks
+#   $n_div  - the tick spacing is the first @tarr entry that gives fewer
+#             than $n_div divisions (the largest entry if none does)
+sub xaxis {
+    my ($n, $offset, $title, $n_div) = @_;
+
+    my $tick = 6;	# tick length
+
+    # search for the correct increment for the tick array */
+    my $js = $tarr[-1];
+    for my $step (@tarr) {
+        if ($n/$step < $n_div) { $js = $step; last; }
+    }
+
+    # number of whole tick intervals; the C original used integer
+    # division here, so the quotient is truncated explicitly
+    my $jm = int($n/$js);
+
+    # jo is the offset within one tick interval */
+    my $jo = ($offset-1) % $js;
+
+    # jl is the label base: ($offset-1) rounded down to a multiple of $js
+    my $jl = ($offset-1) - $jo;
+
+    newline();
+    for my $i (1 .. $jm) {
+        move(SX($i*$js - $jo),SY(0));
+        draw(SX($i*$js - $jo),SY(0)-$tick);
+    }
+    clsline($n,$n,10000);
+
+    my $numstr = sprintf("%ld",$js + $jl );
+
+    if (length($numstr) > 4) {
+        # long numbers: label only the first and the last tick
+        move(SX($js-$jo),SY(0)-$tick-16);
+        printf("(%s) hcprint\n",$numstr);
+
+        $numstr=sprintf("%ld",$jm*$js+$jl);
+        move(SX($jm*$js-$jo),SY(0)-$tick-16);
+        printf("(%s) hcprint\n",$numstr);
+    }
+    else {	# put in all the axis values */
+        for my $i (1 .. $jm) {
+            $numstr=sprintf("%ld",$i*$js+$jl);
+            move(SX($i*$js-$jo),SY(0)-$tick-16);
+            printf("(%s) hcprint\n",$numstr);
+        }
+    }
+
+    print("newpath\n");
+    move(SX($n/2),SY(0)-$tick-30);
+
+    # parentheses would unbalance the PostScript string
+    $title =~ s/\(/ /g;
+    $title =~ s/\)/ /g;
+    printf("(%s) hcprint\n",$title);
+}
+
+# void yaxis(long n, int offset, char *title)
+# yaxis($n, $offset, $title, $n_div) - draw the ticks, tick labels and the
+# rotated title left of the y axis.
+#   $n      - axis length in sequence positions
+#   $offset - coordinate offset of the axis (1 means no shift)
+#   $title  - axis title; parentheses are backslash-escaped
+#   $n_div  - the tick spacing is the first @tarr entry that gives fewer
+#             than $n_div divisions (the largest entry if none does)
+sub yaxis {
+    my ($n, $offset, $title, $n_div) = @_;
+
+    my $tick = 6;	# tick length
+
+    my $js = $tarr[-1];
+    for my $step (@tarr) {
+        if ($n/$step < $n_div) { $js = $step; last; }
+    }
+
+    # number of whole tick intervals; the C original used integer
+    # division here, so the quotient is truncated explicitly
+    my $jm = int($n/$js);
+
+    # $jo is the offset within one tick interval */
+    my $jo = ($offset-1) % $js;
+    # jl is the label base: ($offset-1) rounded down to a multiple of $js
+    my $jl = ($offset-1) - $jo;
+
+    newline();
+    for my $i (1 .. $jm) {
+        move(SX(0),SY($i*$js-$jo));
+        draw(SX(0)-$tick,SY($i*$js-$jo));
+    }
+    clsline($n,$n,10000);
+
+    my $numstr = sprintf("%ld",$js+$jl);
+
+    if (length($numstr) > 4) {
+        # long numbers: label only the first and the last tick
+        move(SX(0)-$tick-4,SY($js-$jo)-4);
+        printf("(%s) hrprint\n",$numstr);
+
+        $numstr = sprintf("%ld",$jm*$js+$jl);
+        move(SX(0)-$tick-4,SY($jm*$js-$jo)-4);
+        printf("(%s) hrprint\n",$numstr);
+    }
+    else {
+        for my $i (1 .. $jm) {
+            $numstr = sprintf("%ld",$i*$js+$jl);
+            move(SX(0)-$tick-4,SY($i*$js-$jo)-4);
+            printf("(%s) hrprint\n",$numstr);
+        }
+    }
+
+    move(SX(0)-$tick-42,SY($n/2));
+    # escape parentheses so the PostScript string stays balanced
+    $title =~ s/\(/\\(/g;
+    $title =~ s/\)/\\)/g;
+    printf("(%s) vcprint\n",$title);
+
+}
+
+# xgrid($annot_arr_r, $n0, $sq0_off, $n1, $sq1_off) - for every x-axis
+# annotation that lies inside the plotted range, draw dashed vertical
+# lines at its begin and end, a coloured block above the frame, and the
+# rotated short description.
+sub xgrid {
+    my ($annot_arr_r, $n0, $sq0_off, $n1, $sq1_off) = @_;
+
+    my $base = $sq0_off;
+    my $last_pos = $base + $n0 - 1;
+
+    my $show_block = 1;
+    my $text_offset = $show_block ? 24 : 8;
+
+    print("%% xgrid: $n0 $n1\n");
+    print("gsave\n");
+    print("/Courier findfont 11 scalefont setfont\n");
+    print("currentlinewidth 0.5 mul setlinewidth\n");
+    for my $annot ( @$annot_arr_r) {
+        next if ($annot->{beg} < $base);
+        next if ($annot->{end} > $last_pos);
+        last if ($annot->{beg} > $last_pos);
+
+        # begin line uses the [3 6] dash, end line the [6 3] dash
+        for my $edge ([$annot->{beg}, "[3 6]"], [$annot->{end}, "[6 3]"]) {
+            my ($pos, $dash) = @$edge;
+            newline();
+            print("0.33 0.33 0.33 setrgbcolor\n");
+            move(SX($pos-$base),SY(0));
+            print("$dash 0 setdash\n");
+            draw(SX($pos-$base),SY($n1));
+            clsline();
+        }
+
+        # label position: middle of the domain, above the frame
+        my $xpos = SX(($annot->{end} - $annot->{beg})/2 + $annot->{beg} - $base) + 4;
+        my $ypos = SY($n1) + $text_offset;
+
+        if ($show_block) {
+            draw_block(SX($annot->{beg} - $base), SY($n1) + 6,
+                       SX($annot->{end} - $base)-SX($annot->{beg} - $base),12,
+                       $annot_names{$annot->{sname}});
+        }
+        move($xpos, $ypos);
+        my $label = $annot->{sdescr};
+        $label =~ s/\(/\\(/g;
+        $label =~ s/\)/\\)/g;
+        print "($label) vprint\n";
+    }
+    print("grestore\n");
+}
+
+# ygrid($annot_arr_r, $n0, $sq0_off, $n1, $sq1_off) - for every y-axis
+# annotation that lies inside the plotted range, draw dashed horizontal
+# lines at its begin and end, a coloured block right of the frame, and
+# the short description.  Both lines are drawn in grey, as in xgrid.
+sub ygrid {
+    my ($annot_arr_r, $n0, $sq0_off, $n1, $sq1_off) = @_;
+
+    my $sq_off = $sq1_off;
+
+    my $show_block = 1;
+
+    my $text_offset = 8;
+    if ($show_block) {$text_offset = 24;}
+
+    print("gsave\n");
+    print("/Courier findfont 11 scalefont setfont\n");
+    print("currentlinewidth 0.5 mul setlinewidth\n");
+    for my $annot ( @$annot_arr_r) {
+        next unless $annot->{beg} >= $sq_off;
+        next if ($annot->{end} > $sq_off + $n1 - 1);
+        last if ($annot->{beg} > $sq_off + $n1 - 1);
+
+        # begin line
+        newline();
+        print("0.33 0.33 0.33 setrgbcolor\n");
+        move(SX(0), SY($annot->{beg}-$sq_off));
+        print("[3 6] 0 setdash\n");
+        draw(SX($n0), SY($annot->{beg}-$sq_off));
+        clsline();
+
+        # end line; newline() resets the colour to black, so the grey
+        # has to be selected again
+        newline();
+        print("0.33 0.33 0.33 setrgbcolor\n");
+        move(SX(0), SY($annot->{end}-$sq_off));
+        print("[6 3] 0 setdash\n");
+        draw(SX($n0), SY($annot->{end}-$sq_off));
+        clsline();
+
+        # label position: right of the frame, middle of the domain
+        my $xpos = SX($n0) + $text_offset;
+        my $ypos = SY(($annot->{end} - $annot->{beg})/2 + $annot->{beg} - $sq_off) - 3;
+
+        if ($show_block) {
+            draw_block(SX($n0)+6, SY($annot->{beg} - $sq_off), 12,
+                       SY($annot->{end} - $sq_off)-SY($annot->{beg} - $sq_off),
+                       $annot_names{$annot->{sname}});
+        }
+
+        move($xpos, $ypos);
+        my $str = $annot->{sdescr};
+        $str =~ s/\(/\\(/g;
+        $str =~ s/\)/\\)/g;
+        print "($str) hprint\n";
+    }
+    print("grestore\n");
+}
+
+# draw_block($x, $y, $w, $h, $color) - fill a rectangle one unit inside
+# ($x, $y, $w, $h) with the block colour selected by $color (taken modulo
+# the number of entries in @block_colors), then outline the full
+# rectangle in white.
+sub draw_block {
+    my ($x, $y, $w, $h, $color) = @_;
+
+    my $slot = $color % scalar(@block_colors);
+    my $rgb = $color_names{$block_colors[$slot]};
+
+    print "gsave\n";
+
+    # colour components are stored as 0..255
+    printf("%.3f %.3f %.3f setrgbcolor\n",
+           $rgb->[0]/255,$rgb->[1]/255,$rgb->[2]/255);
+
+    printf "%d %d %d %d RT\n",$x+1,$y+1,$w-2,$h-2;
+    print "fill\n", "stroke\n";
+
+    print "1.0 1.0 1.0 setrgbcolor\n";
+    print "$x $y $w $h RT\n";
+    print "stroke\n", "grestore\n";
+}
+
+# void legend()
+# legend($have_zdb, $have_bits) - draw the colour key: one sample line per
+# line type followed by its threshold label.  E() thresholds (@elinval)
+# are used with $have_zdb, bit thresholds (@blinval) with $have_bits,
+# raw-score thresholds (@ilinval) otherwise.
+sub legend {
+    my ($have_zdb, $have_bits) = @_;
+
+    my @xpos=(144,144,288,288,432);
+    my @ypos=(36,18,36,18,27);
+
+    # five classes for E()/bit scores, four for raw scores
+    my $n_types = ($have_zdb || $have_bits) ? 5 : 4;
+
+    move(60,27+18);
+    if ($have_zdb) {draw_str("E(): ");}
+    elsif ($have_bits) {draw_str("Bits: ");}
+
+    for my $k (0 .. $n_types-1) {
+        my ($lx, $ly) = ($xpos[$k], $ypos[$k]+18);
+
+        print("gsave currentlinewidth 1.5 mul setlinewidth\n");
+        newline();
+        linetype($k);
+        move($lx-36,$ly);
+        draw($lx+24,$ly);
+        clsline(1000,1000,10000);
+        print("grestore\n");
+        move($lx+36,$ly-4);
+
+        my $label;
+        if ($have_zdb) {
+            $label = ($k==4) ? sprintf(">%.1lg",$elinval[3])
+                             : sprintf("<%.1lg",$elinval[$k]);
+        }
+        elsif ($have_bits) {
+            $label = ($k==4) ? sprintf("<%.1lf",$blinval[3])
+                             : sprintf(">=%.1lf",$blinval[$k]);
+        }
+        else {
+            $label = ($k==3) ? sprintf("<%d",$ilinval[3])
+                             : sprintf(">%d",$ilinval[$k]);
+        }
+        printf("(%s) hprint\n",$label);
+    }
+}
+
+#void linetype(type)
+# linetype($type) - select the drawing colour for line type $type: the
+# name in @line_colors is resolved through %color_names and its 0..255
+# components are scaled to the 0..1 range of setrgbcolor (divisor 255, the
+# same scaling draw_block uses).  An unknown colour falls back to black.
+sub linetype {
+    my $type = shift;
+
+    my $rgb_name = $line_colors[$type];
+    my $rgb = (defined($rgb_name) && $color_names{$rgb_name}) ? $color_names{$rgb_name} : [0,0,0];
+
+    printf("%5.3f %5.3f %5.3f setrgbcolor\n",
+           $rgb->[0]/255, $rgb->[1]/255, $rgb->[2]/255);
+}
+
+#void closeplt()
+# closeplt() - write the PostScript trailer and emit the page.
+sub closeplt {
+    print("%%Trailer\n",
+          "showpage\n",
+          "%%EOF\n");
+}
+
+# void opnline(int s, double bits)
+# opnline($s, $bits) - select the colour for a new alignment line and
+# start a new path.
+#   $s    - raw alignment score
+#   $bits - bit score
+# With $have_zdb the bit score is converted by bit_to_E() and compared
+# with @elinval; with $have_bits it is compared with @blinval; otherwise
+# the raw score is compared with @ilinval.
+# NOTE(review): $have_zdb and $have_bits are read here as package
+# variables - confirm the driver script sets them as globals.
+sub opnline {
+    # both arguments are needed; a bare shift would leave $bits undefined
+    my ($s, $bits) = @_;
+
+    my $e_val;
+
+    if ($have_zdb) {
+        $e_val = bit_to_E($bits);
+        if ($e_val < $elinval[0]) {linetype(0);}
+        elsif ($e_val < $elinval[1]) {linetype(1);}
+        elsif ($e_val < $elinval[2]) {linetype(2);}
+        elsif ($e_val < $elinval[3]) {linetype(3);}
+        else {linetype(4);}
+    }
+    elsif ($have_bits) {
+        if ($bits >= $blinval[0]) {linetype(0);}
+        elsif ($bits >= $blinval[1]) {linetype(1);}
+        elsif ($bits >= $blinval[2]) {linetype(2);}
+        elsif ($bits >= $blinval[3]) {linetype(3);}
+        else {linetype(4);}
+    }
+    else {
+        if ($s > $ilinval[0]) {linetype(0);}
+        elsif ($s > $ilinval[1]) {linetype(1);}
+        elsif ($s > $ilinval[2]) {linetype(2);}
+        else {linetype(3);}
+    }
+
+    print("newpath\n");
+}
+
+# void newline()
+# newline() - reset the colour to black and begin a new path.
+sub newline {
+    print("0 0 0 setrgbcolor\n", " newpath\n");
+}
+
+# void clsline(long x,long y,int s)
+# clsline(...) - stroke the current path; any arguments are ignored.
+sub clsline {
+    print "stroke\n";
+}
+
+# void move(int x, int y)
+# move($x, $y) - emit a moveto to the given page coordinates
+# (printed as integers).
+sub move {
+    my ($px, $py) = @_;
+    my $cmd = sprintf("%d %d moveto\n",$px,$py);
+    print($cmd);
+}
+
+# void sxy_move(int x, int y)
+# sxy_move($x, $y) - emit a moveto for sequence coordinates, converted to
+# page coordinates with SX()/SY().
+sub sxy_move {
+    my ($seq_x, $seq_y) = @_;
+    my ($px, $py) = (SX($seq_x), SY($seq_y));
+    printf("%d %d moveto\n",$px,$py);
+}
+
+# void draw(int x, int y)
+# draw($x, $y) - emit a lineto to the given page coordinates
+# (printed as integers).
+sub draw {
+    my ($px, $py) = @_;
+    my $cmd = sprintf("%d %d lineto\n",$px,$py);
+    print($cmd);
+}
+
+# void sxy_draw(int x, int y)
+# sxy_draw($x, $y) - emit a lineto for sequence coordinates, converted to
+# page coordinates with SX()/SY().
+sub sxy_draw {
+    my ($seq_x, $seq_y) = @_;
+    my ($px, $py) = (SX($seq_x), SY($seq_y));
+    printf("%d %d lineto\n",$px,$py);
+}
+
+#void draw_str(char *str)
+# draw_str($str) - show $str at the current point; parentheses are
+# backslash-escaped so the PostScript string stays balanced.
+sub draw_str
+{
+    my ($text) = @_;
+
+    $text =~ s/([()])/\\$1/g;
+
+    printf("(%s) show\n",$text);
+}
+
+#void cal_coord(int n0, int n1, long *a_start0, long *a_stop0, long *a_start1, long *a_stop1 )
+# empty stub: accepts any arguments and does nothing
+sub cal_coord {}
diff --git a/scripts/lavplt_svg.pl b/scripts/lavplt_svg.pl
new file mode 100755
index 0000000..ccc1c7a
--- /dev/null
+++ b/scripts/lavplt_svg.pl
@@ -0,0 +1,461 @@
+
+################################################################
+# copyright (c) 2012, 2014 by William R. Pearson and The Rector &
+# Visitors of the University of Virginia */
+################################################################
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under this License is distributed on an "AS
+# IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+################################################################
+
+#define SX(x) (int)((double)(x)*fxscal+fxoff+6)
+# Map a sequence x position to an SVG x coordinate.  Note that this
+# version adds 18, not the 6 of the C macro quoted above.
+sub SX {
+    my ($seq_x) = @_;
+    my $pixel_x = $seq_x * $fxscal + $fxoff + 18;
+    return int($pixel_x);
+}
+
+#define SY(y) (max_y + 24 - (int)((double)(y)*fyscal+fyoff))
+# Map a sequence y position to an SVG y coordinate; the scaled height is
+# subtracted from $max_y + 24, so larger positions get smaller y values.
+sub SY {
+    my ($seq_y) = @_;
+    my $plot_height = int($seq_y * $fyscal + $fyoff);
+    return $max_y + 24 - $plot_height;
+}
+
+# $y_delta used widely to offset for x_domain annotation
+my $y_delta = 0;
+
+#void openplt(long n0, long n1, int sq0off, int sq1off, char *xtitle, char *ytitle)
+# Emit the SVG prologue and the static parts of the plot: frame, axes,
+# legend and, when annotations are supplied, the annotation grids.
+#   $n0, $n1               - sequence lengths; $n0 is passed to xaxis(),
+#                            $n1 to yaxis()
+#   $sq0off, $sq1off       - sequence offsets
+#   $xtitle, $ytitle       - axis titles
+#   $x_annot_r, $y_annot_r - optional array refs of annotation hashes; the
+#                            longest {sdescr} sizes the extra margins
+#   $have_zdb, $have_bits  - passed through to legend()
+# Side effects: sets $pmaxx, $pmaxy, $fxscal, $fyscal, $fxoff, $fyoff and
+# $y_delta, may replace @elinval, and shrinks $max_x/$max_y when
+# $y_annot_r is given.
+sub openplt
+{
+ my ($n0, $n1, $sq0off, $sq1off, $xtitle, $ytitle, $x_annot_r, $y_annot_r,$have_zdb, $have_bits) = @_;
+
+ # E() thresholds may be overridden by $lvstr or, failing that, by the
+ # LINEVAL environment variable (whitespace-separated values)
+ if ($lvstr) {
+ @elinval = split(/\s+/,$lvstr);
+ }
+ elsif ($ENV{LINEVAL}) {
+ @elinval = split(/\s+/,$ENV{LINEVAL});
+ }
+
+ # overall SVG canvas size, before any room is added for annotations
+ my ($xbound, $ybound) = ($max_x + 24, $max_y + 48);
+ $y_delta = 0;
+
+ # x annotations: grow the canvas by 6 units per character of the
+ # longest description, plus a fixed 24
+ if ($x_annot_r) {
+ my $x_comments = 0;
+ for my $annot (@$x_annot_r) {
+ if (length($annot->{sdescr}) > $x_comments) {
+ $x_comments = length($annot->{sdescr})
+ }
+ }
+ $ybound += 24 + 6 * $x_comments;
+ $y_delta += 24;
+ }
+
+ # y annotations: widen the canvas and shrink the plot area by the same
+ # amount (both $max_x and $max_y are reduced)
+ if ($y_annot_r) {
+ my $y_comments = 0;
+ for my $annot (@$y_annot_r) {
+ if (length($annot->{sdescr}) > $y_comments) {
+ $y_comments = length($annot->{sdescr})
+ }
+ }
+ $xbound += 6 * $y_comments;
+ $max_x -= 6 * $y_comments;
+ $max_y -= 6 * $y_comments
+ }
+
+ print("<?xml version=\"1.0\" standalone=\"no\"?>\n");
+ print("<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\" \n");
+ print("\"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n\n");
+
+ print("<svg width=\"$xbound\" height=\"$ybound\" version=\"1.1\"\n");
+ print("xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n\n");
+
+ ($pmaxx, $pmaxy) = ($n0, $n1);
+
+ # NOTE(review): $fxscal divides by $n1 and $fyscal by $n0, while the
+ # frame below uses SX($n0+1) and SY($n1+1); both scales are set to the
+ # smaller value next, so this only affects which one is chosen -- confirm
+ $fxscal = ($max_x-1)/$n1;
+ $fyscal = ($max_y-1)/$n0;
+
+ # use one common scale for both axes (the smaller of the two)
+ if ($fxscal > $fyscal) {$fxscal = $fyscal;}
+ else {$fyscal = $fxscal;}
+
+ # enlarge the y scale if the plot would be under a fifth of $max_y
+ if ($fyscal * $n0 < $max_y/5.0) {
+ $fyscal = ($max_y-1)/($n0*5.0);
+ }
+
+ $fxscal *= 0.9; $fxoff = ($max_x-1)/11.0;
+ $fyscal *= 0.9; $fyoff = ($max_y-1)/11.0;
+ if ($x_annot_r) {$fyoff -= (48 + $y_delta);}
+
+ # draw the plot frame
+ printf("<rect x=\"%d\" y=\"%d\" width=\"%d\" height=\"%d\"\n",
+ SX(0),SY($n1+1), SX($n0+1)-SX(0), SY(0) - SY($n1+1));
+ print("stroke=\"black\" stroke-width=\"2.0\" fill=\"none\" />\n");
+
+ # NOTE(review): xaxis() receives $sq1off and yaxis() receives $sq0off,
+ # whereas xgrid()/ygrid() below use $sq0off/$sq1off for the same axes --
+ # confirm this pairing is intended
+ my $n_div = 11;
+ xaxis($n0, $sq1off, $xtitle, $n_div);
+
+ # allow more y divisions when the two lengths differ
+ $n_div = 21 unless ($n0 == $n1);
+ yaxis($n1, $sq0off, $ytitle, $n_div);
+ legend($have_zdb, $have_bits, ($x_annot_r));
+
+ if ($x_annot_r) {xgrid($x_annot_r, $n0, $sq0off, $n1, $sq1off);}
+ if ($y_annot_r) {ygrid($y_annot_r, $n0, $sq0off, $n1, $sq1off);}
+}
+
+# void drawdiag(long n0, long n1)
+# Draw the black diagonal running from the plot origin to ($n0+1, $n1+1).
+sub drawdiag
+{
+    my ($len0, $len1) = @_;
+
+    my @origin = (SX(0), SY(0));
+    my @corner = (SX($len0 + 1), SY($len1 + 1));
+
+    newline(qq(stroke="black" stroke-width="1.5"));
+    move(@origin);
+    draw(@corner);
+    clsline($len0, $len1, 10000);
+}
+
+# tick array - candidate tick intervals in increasing order; xaxis() and
+# yaxis() pick the first one giving fewer than $n_div divisions
+my @tarr = (10,20,50,100,200,500,1000,2000,5000,10000,20000,50000,100000,200000,500000,1000000);
+# same value as the last (largest) entry of @tarr
+my $MAX_INTERVAL=1000000;
+
+# void xaxis(long n, int offset, char *title)
+# Draw tick marks, numeric labels and the title along the bottom edge.
+#   $n      - axis length in sequence units (the title is centred at $n/2)
+#   $offset - sequence offset; labels are shifted by ($offset-1)
+#   $title  - axis title text
+#   $n_div  - upper bound on the number of tick divisions
+sub xaxis {
+    my ($n, $offset, $title, $n_div) = @_;
+
+    my $tick = 6;
+
+    # pick the smallest interval in @tarr that gives fewer than $n_div
+    # divisions; if none qualifies, fall back to the largest interval
+    my $js = $tarr[-1];
+    for my $interval (@tarr) {
+        if ($n / $interval < $n_div) {
+            $js = $interval;
+            last;
+        }
+    }
+
+    # The C original used integer arithmetic here.  With Perl's float
+    # division the last label was placed at $n rather than on the last
+    # tick, and labels were off by $jo, so truncate explicitly.
+    my $jm = int($n / $js);
+
+    # jo is the offset of the first residue within a tick interval
+    my $jo = ($offset - 1) % $js;
+
+    # jl is the label base: ($offset-1) rounded down to a multiple of $js
+    my $jl = ($offset - 1) - $jo;
+
+    newline("stroke=\"black\" stroke-width=\"1.5\"");
+    for (my $i = 1; $i <= $jm; $i++) {
+        move(SX($i*$js - $jo), SY(0));
+        draw(SX($i*$js - $jo), SY(0)+$tick);
+    }
+    clsline($n, $n, 10000);
+
+    my $numstr = sprintf("%ld", $js + $jl);
+    if (length($numstr) > 4) {
+        # labels of five or more digits: only the first and last tick
+        printf("<text x=\"%d\" y=\"%d\" text-anchor=\"middle\">%s</text>\n",SX($js-$jo),SY(0)+$tick+16,$numstr);
+
+        $numstr = sprintf("%ld", $jm*$js + $jl);
+        printf("<text x=\"%d\" y=\"%d\" text-anchor=\"middle\">%s</text>\n",SX($jm*$js-$jo),SY(0)+$tick+16,$numstr);
+    }
+    else {
+        for (my $i = 1; $i <= $jm; $i++) {
+            $numstr = sprintf("%ld", $i*$js + $jl);
+            printf("<text x=\"%d\" y=\"%d\" text-anchor=\"middle\">%s</text>\n",SX($i*$js-$jo),SY(0)+$tick+16,$numstr);
+        }
+    }
+
+    printf("<text x=\"%d\" y=\"%d\" text-anchor=\"middle\">%s</text>\n",SX($n/2),SY(0)-$tick+42, $title);
+}
+
+# Draw the x-axis annotations: for each annotation lying entirely inside
+# the plotted range, two dashed vertical lines (begin and end), a
+# coloured block above the plot and a rotated description label.
+sub xgrid {
+    my ($annot_arr_r, $n0, $sq0_off, $n1, $sq1_off) = @_;
+
+    my $sq_off = $sq0_off;
+
+    my $show_block = 1;
+    my $text_offset = $show_block ? 24 : 8;
+
+    # last sequence position shown on the x axis
+    my $last_pos = $sq_off + $n0 - 1;
+    my $dash_fmt = qq(stroke="black" stroke-width="1.5" stroke-opacity="0.33" stroke-dasharray="%s");
+
+    foreach my $annot (@$annot_arr_r) {
+        next unless $annot->{beg} >= $sq_off;
+        next if ($annot->{end} > $last_pos);
+        last if ($annot->{beg} > $last_pos);
+
+        my $x_beg = SX($annot->{beg} - $sq_off);
+        my $x_end = SX($annot->{end} - $sq_off);
+        my ($y_base, $y_top) = (SY(0), SY($n1));
+
+        # begin boundary
+        newline(sprintf($dash_fmt, "3,6"));
+        move($x_beg, $y_base);
+        draw($x_beg, $y_top);
+        clsline();
+
+        # end boundary
+        newline(sprintf($dash_fmt, "6,3"));
+        move($x_end, $y_base);
+        draw($x_end, $y_top);
+        clsline();
+
+        if ($show_block) {
+            draw_block($x_beg, $y_top - 18, $x_end - $x_beg, 12,
+                       $annot_names{$annot->{sname}});
+        }
+
+        # show rotated label, centred on the annotation
+        my $xpos = SX(($annot->{end} - $annot->{beg})/2 + $annot->{beg} - $sq_off) + 4;
+        my $ypos = $y_top - $text_offset;
+        printf(qq{<text x="0" y="0" text-anchor="left" transform="translate($xpos, $ypos) rotate(-90,0,0)">%s</text>\n}, $annot->{sdescr});
+    }
+}
+
+# Draw the y-axis annotations: for each annotation lying entirely inside
+# the plotted range, two dashed horizontal lines (begin and end), a
+# coloured block right of the plot and a description label.
+sub ygrid {
+    my ($annot_arr_r, $n0, $sq0_off, $n1, $sq1_off) = @_;
+
+    my $sq_off = $sq1_off;
+
+    my $show_block = 1;
+    my $text_offset = $show_block ? 24 : 8;
+
+    # last sequence position shown on the y axis
+    my $last_pos = $sq_off + $n1 - 1;
+    my $dash_fmt = qq(stroke="black" stroke-width="1.5" stroke-opacity="0.33" stroke-dasharray="%s");
+
+    foreach my $annot (@$annot_arr_r) {
+        next unless $annot->{beg} >= $sq_off;
+        next if ($annot->{end} > $last_pos);
+        last if ($annot->{beg} > $last_pos);
+
+        my $y_beg = SY($annot->{beg} - $sq_off);
+        my $y_end = SY($annot->{end} - $sq_off);
+        my ($x_left, $x_right) = (SX(0), SX($n0));
+
+        # begin boundary
+        newline(sprintf($dash_fmt, "3,6"));
+        move($x_left, $y_beg);
+        draw($x_right, $y_beg);
+        clsline();
+
+        # end boundary
+        newline(sprintf($dash_fmt, "6,3"));
+        move($x_left, $y_end);
+        draw($x_right, $y_end);
+        clsline();
+
+        my $xpos = $x_right + $text_offset;
+        my $ypos = SY(($annot->{end} - $annot->{beg})/2 + $annot->{beg} - $sq_off) + 4;
+
+        if ($show_block) {
+            draw_block($x_right + 6, $y_end, 12, $y_beg - $y_end,
+                       $annot_names{$annot->{sname}});
+        }
+        printf(qq{<text x="$xpos" y="$ypos" text-anchor="left">%s</text>\n}, $annot->{sdescr});
+    }
+}
+
+#void yaxis(long n, int offset, char *title)
+# Draw tick marks, numeric labels and the rotated title along the left
+# edge of the plot.
+#   $n      - axis length in sequence units
+#   $offset - sequence offset; labels are shifted by ($offset-1)
+#   $title  - axis title text
+#   $n_div  - upper bound on the number of tick divisions
+sub yaxis {
+    my ($n, $offset, $title, $n_div) = @_;
+
+    my $tick = 6;
+
+    # pick the smallest interval in @tarr that gives fewer than $n_div
+    # divisions; if none qualifies, fall back to the largest interval
+    my $js = $tarr[-1];
+    for my $interval (@tarr) {
+        if ($n / $interval < $n_div) {
+            $js = $interval;
+            last;
+        }
+    }
+
+    # Tick count for the chosen interval.  The original fallback used
+    # $n/5000 together with the largest interval, which put ticks far
+    # outside the plot, and the float division misplaced the last label.
+    my $jm = int($n / $js);
+
+    # jo is the offset of the first residue within a tick interval
+    my $jo = ($offset - 1) % $js;
+    # jl is the label base: ($offset-1) rounded down to a multiple of $js
+    my $jl = ($offset - 1) - $jo;
+
+    newline("stroke=\"black\" stroke-width=\"1.5\"");
+    for (my $i = 1; $i <= $jm; $i++) {
+        move(SX(0),SY($i*$js-$jo));
+        draw(SX(0)-$tick,SY($i*$js-$jo));
+    }
+    clsline($n,$n,10000);
+
+    my $numstr = sprintf("%d",$js+$jl);
+    if (length($numstr) > 4) {
+        # labels of five or more digits: only the first and last tick
+        printf("<text x=\"%d\" y=\"%d\" text-anchor=\"end\">%s</text>\n",SX(0)-$tick-4,SY($js-$jo)+4,$numstr);
+
+        $numstr = sprintf("%ld",$jm*$js+$jl);
+        printf("<text x=\"%d\" y=\"%d\" text-anchor=\"end\">%s</text>\n",SX(0)-$tick-4,SY($jm*$js-$jo)+4,$numstr);
+    }
+    else {
+        for (my $i = 1; $i <= $jm; $i++) {
+            $numstr = sprintf("%ld",$i*$js+$jl);
+            printf("<text x=\"%d\" y=\"%d\" text-anchor=\"end\">%s</text>\n",SX(0)-$tick-4,SY($i*$js-$jo)+4,$numstr);
+        }
+    }
+
+    # the title is drawn inside a group rotated by -90 degrees about the
+    # fixed point (142, 142)
+    printf(qq(<g transform="rotate(-90, 142, 142)">\n));
+    printf(qq(<text x="0" y="12" text-anchor="middle">%s</text>\n),$title);
+    print("</g>\n");
+}
+
+# Emit a filled SVG rectangle at ($x, $y) of size $w x $h.  $color is an
+# index into @block_colors and wraps around when it exceeds the palette.
+sub draw_block {
+    my ($x, $y, $w, $h, $color) = @_;
+
+    my $palette_ix = $color % scalar(@block_colors);
+    my $svg_color = $block_colors[$palette_ix];
+
+    print(sprintf('<rect x="%s" y="%s" width="%s" height="%s" fill="%s" stroke="white" stroke-width="1" />',
+                  $x, $y, $w, $h, $svg_color));
+}
+
+# Draw the colour legend below the plot: one short line sample per
+# significance class, each followed by its threshold.
+#   $have_zdb  - true: thresholds are E() values from @elinval
+#   $have_bits - true (and no $have_zdb): bit scores from @blinval
+#                otherwise raw scores from @ilinval (four classes only)
+#   $annot_flg - true when x annotations are shown; moves the legend down
+sub legend
+{
+    my ($have_zdb, $have_bits, $annot_flg) = @_;
+
+    # sample positions: x is absolute, y is subtracted from the baseline
+    my @xpos = (144,144,288,288,432);
+    my @ypos = (36,18,36,18,27);
+
+    my $y_off = $annot_flg ? 120 : 66;
+    my $last = ($have_zdb || $have_bits) ? 5 : 4;
+
+    # The bits branch used the undefined "$_delta" instead of $y_delta,
+    # so its caption ignored the annotation offset.
+    if ($have_zdb) {printf("<text x=\"%d\" y=\"%d\">E(): </text>",54,$max_y + $y_off - 24 + $y_delta);}
+    elsif ($have_bits) {printf("<text x=\"%d\" y=\"%d\">bits: </text>",54,$max_y + $y_off - 24 + $y_delta);}
+
+    for (my $i = 0; $i < $last; $i++) {
+        my $line_y = $max_y + $y_off - $ypos[$i] + $y_delta;
+
+        newline(sprintf("stroke-width=\"1.5\" stroke=\"%s\"",$line_colors[$i]));
+        move($xpos[$i]-48, $line_y);
+        draw($xpos[$i]+12, $line_y);
+        clsline(1000,1000,10000);
+
+        # "<" and ">" are written as entities: a literal "<" inside
+        # <text> content makes the SVG malformed XML
+        my $numstr;
+        if ($have_zdb) {
+            if ($i==4) {$numstr = sprintf("&gt;%.1lg",$elinval[3]);}
+            else {$numstr = sprintf("&lt;%.1lg",$elinval[$i]);}
+        }
+        elsif ($have_bits) {
+            if ($i==4) {$numstr = sprintf("&lt;%.1lf",$blinval[3]);}
+            else {$numstr = sprintf("&gt;=%.1lf",$blinval[$i]);}
+        }
+        else {
+            if ($i==3) {$numstr = sprintf("&lt;%d",$ilinval[3]);}
+            else {$numstr = sprintf("&gt;%d",$ilinval[$i]);}
+        }
+
+        printf("<text align=\"center\" x=\"%d\" y=\"%d\">%s</text>\n",$xpos[$i] + 18, $line_y + 4,$numstr);
+    }
+}
+
+# void linetype(int type)
+# Emit the stroke attribute for line class $type, using @line_colors.
+sub linetype
+{
+    my ($type) = @_;
+    printf(qq( stroke="%s"), $line_colors[$type]);
+}
+
+# void closeplt()
+# Close the SVG document.
+sub closeplt
+{
+    my $svg_close = "</svg>\n";
+    print($svg_close);
+}
+
+# void opnline(int s, double bits)
+# Open an SVG <path> for one alignment: write a comment with the scores,
+# then the element start, the stroke colour for the alignment's
+# significance class, and the opening of the "d" attribute.
+#   $s    - raw alignment score
+#   $bits - alignment bit score
+sub opnline
+{
+    my ($s, $bits) = @_;
+
+    my $type;
+
+    if ($have_zdb) {
+        # E()-value classes: first threshold the value falls below, else 4
+        my $e_val = bit_to_E($bits);
+        printf("<!-- score: %d; bits: %.1g; E(): %.1g -->\n",$s,$bits,$e_val);
+        $type = 4;
+        for my $ix (0 .. 3) {
+            if ($e_val < $elinval[$ix]) { $type = $ix; last; }
+        }
+    }
+    elsif ($have_bits) {
+        # bit-score classes: first threshold reached, else 4
+        printf("<!-- score: %d; bits: %.1g -->\n",$s,$bits);
+        $type = 4;
+        for my $ix (0 .. 3) {
+            if ($bits >= $blinval[$ix]) { $type = $ix; last; }
+        }
+    }
+    else {
+        # raw-score classes: first threshold exceeded, else 3
+        printf("<!-- score: %d -->\n",$s);
+        $type = 3;
+        for my $ix (0 .. 2) {
+            if ($s > $ilinval[$ix]) { $type = $ix; last; }
+        }
+    }
+
+    print("<path ");
+    linetype($type);
+    print(" d=\"");
+}
+
+# void newline(char *options)
+# Open an SVG <path> element up to the start of its "d" attribute.
+# $options holds the presentation attributes; a black stroke is used
+# when it is empty or missing.
+sub newline
+{
+    my ($options) = @_;
+
+    unless ($options) {
+        print("<path stroke=\"black\" d=\"");
+        return;
+    }
+    printf("<path %s d=\"", $options);
+}
+
+# void clsline(long x, long y, int s)
+# Close the "d" attribute and the <path> element; the arguments are not
+# used.
+sub clsline
+{
+    print(qq(" fill="none" />\n));
+}
+
+# void move(int x, int y)
+# Append an absolute "move to" command to the path data being written.
+sub move
+{
+    my @pixel_xy = @_[0, 1];
+    printf(" M %d %d", @pixel_xy);
+}
+
+# void sxy_move(int x, int y)
+# move() to a sequence position, mapped through SX()/SY().
+sub sxy_move
+{
+    my ($seq_x, $seq_y) = @_;
+    my @pixel_xy = (SX($seq_x), SY($seq_y));
+    move(@pixel_xy);
+}
+
+# void draw(int x, int y)
+# Append an absolute "line to" command to the path data being written.
+sub draw
+{
+    my @pixel_xy = @_[0, 1];
+    printf(" L %d %d", @pixel_xy);
+}
+
+# void sxy_draw(int x, int y)
+# draw() to a sequence position, mapped through SX()/SY().
+sub sxy_draw
+{
+    my ($seq_x, $seq_y) = @_;
+    my @pixel_xy = (SX($seq_x), SY($seq_y));
+    draw(@pixel_xy);
+}
+
+# void cal_coord(int n0, int n1, long *a_start0, long *a_stop0, long *a_start1, long *a_stop1 )
+# No-op: the body is empty in this Perl version.
+sub cal_coord {
+    return;
+}
+
diff --git a/scripts/links2sql.pl b/scripts/links2sql.pl
new file mode 100755
index 0000000..bdcc301
--- /dev/null
+++ b/scripts/links2sql.pl
@@ -0,0 +1,61 @@
+#!/usr/bin/perl -w
+
+################################################################
+# copyright (c) 2014 by William R. Pearson and The Rector &
+# Visitors of the University of Virginia */
+################################################################
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under this License is distributed on an "AS
+# IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+################################################################
+
+use strict;
+use DBI;
+use Getopt::Long;
+
+use vars qw($host $db $user $password $table $tab_file);
+
+# command-line options: database connection settings, the target table,
+# and the name of the temporary tab-delimited file
+GetOptions(
+    'host=s'=>\$host,
+    'db=s'=>\$db,
+    'user=s'=>\$user,
+    'pass=s'=>\$password,
+    'table=s'=>\$table,
+    'file=s'=>\$tab_file,
+    );
+
+$tab_file ||= "link_tmp.tab";
+
+my $dbh = DBI->connect("dbi:mysql:host=$host:$db",
+                       $user, $password,
+                       { RaiseError => 1, AutoCommit => 1}
+    ) or die $DBI::errstr;
+
+# start from an empty link table
+$dbh->do(qq(drop table if exists $table;));
+$dbh->do(qq(create table $table (seed_acc varchar(20) not NULL, link_acc varchar(20) not NULL, key seed_acc (seed_acc), key link_acc (link_acc));));
+
+# Fail immediately if the temporary file cannot be created; the original
+# ignored the result of open() and went on printing to a closed handle.
+open(my $tab_fh, '>', $tab_file)
+    or die "cannot open $tab_file for writing: $!";
+
+# each input line is "seed<whitespace>hit;hit;..." -- write one
+# "seed<TAB>hit" row for every hit other than the seed itself
+while (my $seed_line = <> ) {
+    chomp($seed_line);
+    my ($seed, $hit_line) = split(/\s+/,$seed_line);
+    # blank lines and seeds without a hit list have nothing to write
+    next unless (defined($seed) && defined($hit_line));
+    for my $hit (split(/;/,$hit_line)) {
+        if ($hit ne $seed) {print $tab_fh "$seed\t$hit\n";}
+    }
+}
+# a failed close means buffered rows never reached the file
+close($tab_fh) or die "cannot close $tab_file: $!";
+
+$dbh->do(qq(load data local infile '$tab_file' into table $table;));
+
+unlink($tab_file);
+
+$dbh->disconnect();
diff --git a/scripts/m8_btop_msa.pl b/scripts/m8_btop_msa.pl
new file mode 100755
index 0000000..9cf6da6
--- /dev/null
+++ b/scripts/m8_btop_msa.pl
@@ -0,0 +1,412 @@
+#!/usr/bin/perl -w
+
+################################################################
+# copyright (c) 2014,2015 by William R. Pearson and The Rector &
+# Visitors of the University of Virginia */
+################################################################
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under this License is distributed on an "AS
+# IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+################################################################
+
+################################################################
+# m8_btop_msa.pl --query query.file blast_tab_btop_file
+################################################################
+# m8_btop_msa.pl takes a query sequence and a blast tabular format
+# file with a BTOP field, and constructs a query-driven multiple
+# sequence alignment of the subject sequences that can be used as
+# input to psiblast with the "--in_msa msa.file" option.
+#
+# (because BLAST BTOP encoding provides the mismatched residues, the
+# library sequences are not required to produce the MSA -- they are
+# available in the BTOP string)
+#
+# The BTOP alignment encoding file generated from "blastp/n" or
+# "blast_formatter" using the command: blast_formatter -archive
+# blast_output.asn -outfmt '7 qseqid sseqid pident length mismatch
+# gapopen qstart qend sstart send evalue bitscore score btop' >
+# blast_output.tab_annot
+#
+# the raw score shown above is used by the annot_blast_btop2.pl
+# program, if present, but is not required by m8_btop_msa.pl
+#
+################################################################
+
+use strict;
+use IPC::Open2;
+use Pod::Usage;
+use Getopt::Long;
+# use Data::Dumper;
+
+# read lines of the form:
+# gi|121694|sp|P20432.1|GSTT1_DROME gi|121694|sp|P20432|GSTT1_DROME 100.00 209 0 0 1 209 1 209 6e-156 433 1113 209
+# gi|121694|sp|P20432.1|GSTT1_DROME gi|1170090|sp|P04907|GSTF3_MAIZE 26.77 198 123 7 4 185 6 197 2e-08 51.2 121 FL1YG ... 1NKRA1YW1
+# gi|121694|sp|P20432.1|GSTT1_DROME gi|81174731|sp|P0ACA5|SSPA_ECO57 39.66 58 32 2 43 100 49 103 8e-06 43.9 102 EDFLLI ... V-I-NEQS3FM
+# gi|121694|sp|P20432.1|GSTT1_DROME gi|121695|sp|P12653|GSTF1_MAIZE 27.62 181 107 7 32 203 34 199 9e-05 40.8 94 LI1LF ... N-1AS1CLLM1
+
+# and report the domain content ala -m 8CC
+
+# option defaults: hits with an E()-value above $evalue are skipped
+my ($shelp, $help, $evalue) = (0, 0, 0.001);
+my ($query_file, $bound_file) = ("","");
+# NOTE(review): $out_field_str and $query_lib_r are not used anywhere in
+# the visible part of this script
+my ($out_field_str) = ("");
+my $query_lib_r = 0;
+
+# --query/--query_file and --bound_file/--bound/--seqbdr are synonyms
+GetOptions(
+ "query=s" => \$query_file,
+ "query_file=s" => \$query_file,
+ "evalue=f" => \$evalue,
+ "bound_file=s" => \$bound_file,
+ "bound=s" => \$bound_file,
+ "seqbdr=s" => \$bound_file,
+ "h|?" => \$shelp,
+ "help" => \$help,
+ );
+
+pod2usage(1) if $shelp;
+pod2usage(exitstatus => 0, verbose => 2) if $help;
+# show usage when there is nothing to read: STDIN is neither a file nor a
+# pipe and no input file was named
+unless (-f STDIN || -p STDIN || @ARGV) {
+ pod2usage(1);
+}
+
+# NOTE(review): @hit_list is only referenced from a commented-out line
+my @hit_list = ();
+# parallel arrays: row names and row alignments (array refs) of the MSA
+my @multi_align = ();
+my @multi_names = ();
+
+################
+# get query sequence, and insert into MSA
+#
+my ($query_acc, $query_seq_r, $query_len);
+if ($query_file) {
+ ($query_acc, $query_seq_r) = parse_query_lib($query_file);
+ $query_len = scalar(@$query_seq_r)-1; # -1 for ' ' 1: offset
+}
+
+if (! $query_file || !$query_len) {
+ die "query sequence required";
+}
+
+# the query is the first MSA row; a BTOP consisting only of the query
+# length encodes an identical, full-length alignment
+push @multi_names, $query_acc;
+push @multi_align, btop2alignment($query_seq_r, $query_len, {BTOP=>$query_len, q_start=>1, q_end=>$query_len});
+my $max_sseqid_len = length($query_acc);
+
+################
+# get sequence boundaries if available
+#
+my $seq_bound_hr = 0;
+
+# NOTE(review): parse_bound_file() is not defined among the subs visible
+# in this script -- confirm it is provided before relying on --bound_file
+if ($bound_file) {
+ $seq_bound_hr = parse_bound_file($bound_file)
+}
+
+# column names of the tabular input, in order
+my @tab_fields = qw(q_seqid s_seqid percid alen mismatch gopen q_start q_end s_start s_end evalue bits score BTOP);
+
+# keep the complete column list so that every "# Fields:" header can be
+# applied to a fresh copy
+my @all_tab_fields = @tab_fields;
+
+# read the tabular hits and add one MSA row for each hit that passes the
+# E()-value threshold
+while (my $line = <>) {
+    if ($line =~ m/^# Fields:/) {
+        # The original popped two fields and re-pushed BTOP on every
+        # header lacking the raw score, so a second such header in the
+        # same input also dropped "bits" and shifted later columns.
+        # Rebuild the list from the full set each time instead.
+        if ($line =~ m/bit score, score, BTOP/) {
+            @tab_fields = @all_tab_fields;
+        }
+        else {
+            # raw score missing
+            @tab_fields = grep { $_ ne 'score' } @all_tab_fields;
+        }
+        next;
+    }
+    next if ($line =~ m/^#/);
+    chomp $line;
+    next unless $line;
+
+    my %hit_data = ();
+
+    @hit_data{@tab_fields} = split(/\t/,$line);
+
+    next if ($hit_data{evalue} > $evalue);
+
+    # track the longest name for the output column width
+    if (length($hit_data{s_seqid}) > $max_sseqid_len) {
+        $max_sseqid_len = length($hit_data{s_seqid});
+    }
+
+    if ($bound_file) {
+        # The original looked boundaries up under $hit_data{subj_acc}, a
+        # key that is never filled from @tab_fields, so no bounded hit
+        # was ever added.  Use the subject id column instead.
+        # NOTE(review): confirm parse_bound_file() keys its hash by the
+        # same subject id string.
+        my $s_bound = $seq_bound_hr->{$hit_data{s_seqid}};
+        if (defined($s_bound)) {
+            push @multi_names, $hit_data{s_seqid};
+            push @multi_align, bound_btop2alignment($query_seq_r, $query_len, \%hit_data, @{$s_bound}{qw(start end)});
+        }
+    }
+    else { # no sequence boundaries
+        push @multi_names, $hit_data{s_seqid};
+        push @multi_align, btop2alignment($query_seq_r, $query_len, \%hit_data);
+    }
+}
+
+# final MSA output
+$max_sseqid_len += 4;
+
+print "SSEARCHm8 multiple sequence alignment\n\n\n";
+
+# print the alignment in blocks of 60 columns; the rows are 0-based
+# arrays, so the last valid column index is $query_len-1
+my $i_pos = 0;
+while ($i_pos < $query_len) {
+    my $i_end = $i_pos + 59;
+    # Clamp the final block.  The test was ">", which let
+    # $i_end == $query_len through and sliced one undefined element past
+    # the end of each row.
+    if ($i_end >= $query_len) {$i_end = $query_len-1;}
+    for (my $n = 0; $n < scalar(@multi_names); $n++) {
+        printf("%-".$max_sseqid_len."s %s\n",$multi_names[$n],join("",@{$multi_align[$n]}[$i_pos .. $i_end]));
+    }
+    $i_pos += 60;
+    print "\n\n";
+}
+
+
+# input: a blast BTOP string of the form: "1VA160TS7KG10RK27"
+# returns a list_ref of tokens: (1, "VA", 60, "TS", 7, "KG", 10, "RK", 27)
+#
+# Runs of digits are kept as single tokens; everything between them is
+# cut into two-character pieces (a trailing odd character, if any, is
+# kept as a piece of its own).
+sub decode_btop {
+    my ($btop_str) = @_;
+
+    # split on digit runs, keeping the digits themselves as pieces
+    my @pieces = split(/(\d+)/, $btop_str);
+
+    # a string that starts with digits yields an empty first piece
+    shift(@pieces) unless $pieces[0];
+
+    my @decoded = ();
+    foreach my $piece (@pieces) {
+        push @decoded,
+            ($piece =~ m/^\d+$/) ? $piece
+                                 : grep { $_ } split(/(..)/, $piece);
+    }
+
+    return \@decoded;
+}
+
+
+# Convert one hit's BTOP string into an MSA row aligned to the query.
+#   $query_seq_r - array ref of query residues, 1-based (index 0 unused)
+#   $query_len   - query length
+#   $hit_data_hr - hash ref with at least {BTOP}, {q_start} and {q_end}
+# Returns an array ref with one entry per query position: '-' outside
+# the aligned region, the query residue at identities, and the second
+# character of the BTOP pair elsewhere.  Pairs whose first character is
+# '-' (gap in the query) add no column.
+sub btop2alignment {
+    my ($query_seq_r, $query_len, $hit_data_hr) = @_;
+
+    # $query_seq_r is 1: based
+    my @alignment = ();
+
+    # the left unaligned region gets "-"
+    for (my $i=1; $i < $hit_data_hr->{q_start}; $i++) {
+        push @alignment, "-";
+    }
+
+    my $btop_align_r = decode_btop($hit_data_hr->{BTOP});
+
+    my ($seq0, $seq1) = ("","");
+    my $qix = $hit_data_hr->{q_start};
+    for my $btop (@{$btop_align_r}) {
+        if ($btop =~ m/^\d+$/) { # matching query sequence, add it up
+            for (my $i=0; $i < $btop; $i++) {
+                push @alignment, $query_seq_r->[$qix++];
+            }
+        }
+        else { # could be: TS/-S/T-
+            ($seq0, $seq1) = split(//,$btop);
+            if ($seq0 ne '-') {
+                push @alignment, $seq1;
+                $qix++;
+            }
+        }
+    }
+    # all done with alignment, double check that $qix = $hit_data_hr->{q_end}
+    unless ($qix == $hit_data_hr->{q_end}+1) {
+        # Parenthesised: "." and "+" share precedence, so the unbracketed
+        # form added 1 to the whole concatenated string and warned with a
+        # bare number.
+        warn "$qix != ".($hit_data_hr->{q_end}+1);
+    }
+
+    # the right unaligned region gets "-"
+    for (my $i = $hit_data_hr->{q_end}+1; $i <= $query_len; $i++) {
+        push @alignment, "-";
+    }
+
+    return \@alignment;
+}
+
+# Like btop2alignment(), but only subject positions inside
+# [$sb_start, $sb_end] contribute residues; aligned columns outside that
+# range are written as '-'.
+#   $query_seq_r - array ref of query residues, 1-based (index 0 unused)
+#   $query_len   - query length
+#   $hit_data_hr - hash ref with {BTOP}, {q_start}, {q_end}, {s_start}
+#   $sb_start, $sb_end - inclusive subject coordinate range to keep
+# Returns an array ref with one entry per query position.
+sub bound_btop2alignment {
+    my ($query_seq_r, $query_len, $hit_data_hr, $sb_start, $sb_end) = @_;
+
+    # $query_seq_r is 1: based
+    my @alignment = ();
+
+    # the left unaligned region gets "-"
+    for (my $i=1; $i < $hit_data_hr->{q_start}; $i++) {
+        push @alignment, "-";
+    }
+
+    my $btop_align_r = decode_btop($hit_data_hr->{BTOP});
+
+    my ($seq0, $seq1) = ("","");
+    # current positions in the query ($qix) and the subject ($six)
+    my ($qix, $six) = @{$hit_data_hr}{qw(q_start s_start)};
+
+    for my $btop (@{$btop_align_r}) {
+        if ($btop =~ m/^\d+$/) { # matching query sequence, add it up
+            for (my $i=0; $i < $btop; $i++) {
+                if ($six >= $sb_start && $six <= $sb_end) {
+                    push @alignment, $query_seq_r->[$qix];
+                }
+                else {
+                    push @alignment, '-';
+                }
+                $qix++; $six++;
+            }
+        }
+        else { # could be: TS/-S/T-
+            ($seq0, $seq1) = split(//,$btop);
+            if ($seq1 eq '-') { # gap in subject
+                push @alignment, '-';
+                $qix++;
+            }
+            elsif ($seq0 ne '-') { # mismatch
+                if ($six >= $sb_start && $six <= $sb_end) {
+                    push @alignment, $seq1;
+                }
+                else {
+                    push @alignment, '-';
+                }
+                $qix++;
+                $six++;
+            }
+            else { # gap in query, consume $six
+                $six++;
+            }
+        }
+    }
+    # all done with alignment, double check that $qix = $hit_data_hr->{q_end}
+    unless ($qix == $hit_data_hr->{q_end}+1) {
+        # Parenthesised: "." and "+" share precedence, so the unbracketed
+        # form added 1 to the whole concatenated string and warned with a
+        # bare number.
+        warn $qix." != ".($hit_data_hr->{q_end}+1);
+    }
+
+    # the right unaligned region gets "-"
+    for (my $i = $hit_data_hr->{q_end}+1; $i <= $query_len; $i++) {
+        push @alignment, "-";
+    }
+
+    return \@alignment;
+}
+
+# Read the query sequence from a FASTA file.
+#   $query_file - path of the FASTA file
+# Returns ($accession, \@seq): the text of the header line up to the
+# first whitespace (without the leading ">"), and an array ref of
+# upper-cased residues that is 1-based (element 0 is an empty string).
+# If the file holds several entries, their residues are concatenated and
+# the last header is the one returned.
+# Dies if the file cannot be opened.
+sub parse_query_lib {
+    my ($query_file) = @_;
+
+    # three-argument open, checked: the original ignored a failed open
+    # and silently produced an empty query
+    open(my $qfd, '<', $query_file)
+        or die "cannot open query file $query_file: $!";
+
+    my ($header, $sequence) = ("","");
+    while (my $entry = <$qfd>) { # one line at a time
+        chomp $entry;
+        if ($entry =~ m/^>/) {
+            $header = $entry;
+        }
+        else {
+            $sequence .= $entry
+        }
+    }
+    close($qfd);
+
+    $sequence =~ s/[^A-Za-z\*]//g; # remove everything but letters and '*'
+    $sequence = uc($sequence);
+
+    $header =~ s/^>//;
+    $header =~ s/\s.*$//;
+    my @seq = split(//,$sequence);
+    unshift @seq,""; # @seq is now 1-based
+
+    return ($header, \@seq);
+}
+
+# Read the residues of a sequence file, ignoring header lines.
+#   $query_file - path of the sequence file
+# Lines starting with ">" or ";" are skipped; every character other than
+# a letter or '*' is removed and the rest is upper-cased.
+# Returns an array ref of single residues (0-based, unlike
+# parse_query_lib()).  Dies if the file cannot be opened.
+sub parse_query_file {
+    my ($query_file) = @_;
+
+    my $seq_data = "";
+
+    # three-argument open, checked: the original ignored a failed open
+    # and silently returned an empty sequence
+    open(my $qfd, '<', $query_file)
+        or die "cannot open query file $query_file: $!";
+    while (my $line = <$qfd>) {
+        next if $line =~ m/^>/;
+        next if $line =~ m/^;/;
+        chomp $line;
+        $line =~ s/[^A-Za-z\*]//g;
+        $seq_data .= $line
+    }
+    close($qfd);
+
+    $seq_data = uc($seq_data);
+
+    my @seq = split(//,$seq_data);
+
+    return \@seq;
+}
+
+__END__
+
+=pod
+
+=head1 NAME
+
+m8_btop_msa.pl
+
+=head1 SYNOPSIS
+
+ m8_btop_msa.pl --query query.fasta [--evalue 0.001] [--bound_file bound.file] blast_tab_btop_file
+
+=head1 OPTIONS
+
+ -h short help
+ --help include description
+
+ --query -- fasta query sequence (required)
+ -- same as --query_file
+
+ --evalue -- hits with an E()-value above this threshold are skipped
+ (default 0.001)
+
+ --bound_file -- file of subject sequence boundaries
+ -- same as --bound, --seqbdr
+
+
+=head1 DESCRIPTION
+
+C<annot_blast_btop2.pl> runs the script specified by
+C<--ann_script/--q_ann_script> to annotate functional sites domain
+content of the sequences specified by the subject/query seqid field of
+blast tabular format (-outfmt 6 or 7) or FASTA blast tabular format
+(-m 8). The C<--ann_script/--q_ann_script> file is run to produce
+domain boundary coordinates. For searches against SwissProt
+sequences, C<--ann_script ann_feats_up_www2.pl> will acquire features
+and domains from Uniprot. C<--ann_script ann_pfam_www.pl --neg> will
+get domain information from Pfam, and score non-domain (NODOM)
+regions.
+
+The tab file is read and parsed, and then the subject/query seqid is used to
+capture domain locations in the subject/query sequence. If the domains
+overlap the aligned region, the domain names are appended to the
+input.
+
+If a C<--query_file> is specified and two additional fields, C<score>
+and C<btop> are present, C<annot_blast_btop2.pl> calculates
+sub-alignment scores, including fraction identity, bit score, and
+Q-value (-log10(E-value)), partitioning the alignment score, identity,
+and bit score across the overlapping domains.
+
+The C<--out_fields> specifies the blast tabular fields that can be
+returned. By default, C<q_seqid s_seqid percid alen mismatch gopen
+q_start q_end s_start s_end evalue bits> (but not C<score> and
+C<BTOP>) are shown.
+
+=head1 AUTHOR
+
+William R. Pearson, wrp at virginia.edu
+
+=cut
diff --git a/scripts/m9B_btop_msa.pl b/scripts/m9B_btop_msa.pl
new file mode 100755
index 0000000..5c72854
--- /dev/null
+++ b/scripts/m9B_btop_msa.pl
@@ -0,0 +1,654 @@
+#!/usr/bin/perl -w
+
+################################################################
+# copyright (c) 2014,2015 by William R. Pearson and The Rector &
+# Visitors of the University of Virginia */
+################################################################
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under this License is distributed on an "AS
+# IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+################################################################
+
+################################################################
+# m9B_btop_msa.pl --query query.file blast_tab_btop_file
+################################################################
+# m9B_btop_msa.pl takes a query sequence and a fasta -m 9b
+# file with a BTOP field, and constructs a query-driven multiple
+# sequence alignment of the subject sequences that can be used as
+# input to psiblast with the "--in_msa msa.file" option.
+#
+# (because BLAST BTOP encoding provides the mismatched residues, the
+# library sequences are not required to produce the MSA -- they are
+# available in the BTOP string)
+#
+# The BTOP alignment encoding file generated from "blastp/n" or
+# "blast_formatter" using the command: blast_formatter -archive
+# blast_output.asn -outfmt '7 qseqid sseqid pident length mismatch
+# gapopen qstart qend sstart send evalue bitscore score btop' >
+# blast_output.tab_annot
+#
+################################################################
+
+use strict;
+use IPC::Open2;
+use Pod::Usage;
+use Getopt::Long;
+# use Data::Dumper;
+
+# read lines of the form:
+# gi|121694|sp|P20432.1|GSTT1_DROME gi|121694|sp|P20432|GSTT1_DROME 100.00 209 0 0 1 209 1 209 6e-156 433 1113 209
+# gi|121694|sp|P20432.1|GSTT1_DROME gi|1170090|sp|P04907|GSTF3_MAIZE 26.77 198 123 7 4 185 6 197 2e-08 51.2 121 FL1YG ... 1NKRA1YW1
+# gi|121694|sp|P20432.1|GSTT1_DROME gi|81174731|sp|P0ACA5|SSPA_ECO57 39.66 58 32 2 43 100 49 103 8e-06 43.9 102 EDFLLI ... V-I-NEQS3FM
+# gi|121694|sp|P20432.1|GSTT1_DROME gi|121695|sp|P12653|GSTF1_MAIZE 27.62 181 107 7 32 203 34 199 9e-05 40.8 94 LI1LF ... N-1AS1CLLM1
+
+# and report the domain content ala -m 8CC
+
+# Option defaults. $evalue is the E()-value threshold above which hits
+# are skipped; $qvalue is the minimum domain Q-value required for a
+# domain to move the boundaries when --domain_bound is given.
+my ($shelp, $help, $evalue, $qvalue, $domain_bound) = (0, 0, 0.001, 30.0,0);
+my ($query_file, $bound_file_in, $bound_file_only, $bound_file_out, $masked_lib_out) = ("","","","","");
+my $query_lib_r = 0;
+
+# Several options are aliases that store into the same variable
+# (e.g. --query/--query_file, --evalue/--expect, --bound_in/--bound_file_in).
+GetOptions(
+ "query=s" => \$query_file,
+ "query_file=s" => \$query_file,
+ "evalue=f" => \$evalue,
+ "expect=f" => \$evalue,
+ "qvalue=f" => \$qvalue,
+ "bound_file_in=s" => \$bound_file_in,
+ "bound_file_only=s" => \$bound_file_only,
+ "bound_file_out=s" => \$bound_file_out,
+ "masked_library_out=s" => \$masked_lib_out,
+ "masked_lib_out=s" => \$masked_lib_out,
+ "domain_bound" => \$domain_bound,
+ "domain" => \$domain_bound,
+ "bound_in=s" => \$bound_file_in,
+ "bound_only=s" => \$bound_file_only,
+ "bound_out=s" => \$bound_file_out,
+ "h|?" => \$shelp,
+ "help" => \$help,
+ );
+
+# -h/-? prints the short usage; --help prints the full POD and exits 0
+pod2usage(1) if $shelp;
+pod2usage(exitstatus => 0, verbose => 2) if $help;
+# with no file argument, input must come from a file or pipe on STDIN
+unless (-f STDIN || -p STDIN || @ARGV) {
+ pod2usage(1);
+}
+
+# names given to the whitespace-separated values of the second
+# tab-delimited field of each hit line (see the main loop below)
+my @m9_field_names = qw(percid perc_sim raw_score a_len q_start q_end qc_start qc_end s_start s_end sc_start sc_end gap_q gap_l fs);
+
+my @hit_list = ();
+# %multi_align maps a sequence name to its alignment row (array ref of
+# residues/'-'); @multi_names keeps the names in output order
+my %multi_align = ();
+my @multi_names = ();
+
+################
+# get query sequence, and insert into MSA
+#
+my ($query_acc, $query_seq_r, $query_len);
+if ($query_file) {
+ ($query_acc, $query_seq_r) = parse_query_lib($query_file);
+ $query_len = scalar(@$query_seq_r)-1; # -1 for ' ' 1: offset
+}
+
+if (! $query_file || !$query_len) {
+ die "query sequence required";
+}
+
+# the query is the first MSA row: a BTOP consisting only of the number
+# $query_len reproduces the full query sequence
+push @multi_names, $query_acc;
+$multi_align{$query_acc} = btop2alignment($query_seq_r, $query_len, {BTOP=>$query_len, q_start=>1, q_end=>$query_len}, 0);
+my $max_sseqid_len = length($query_acc);
+
+################
+# get sequence boundaries if available
+#
+# $seq_bound_hr maps accession => {start, end}; @seq_bound_accs records
+# the accessions in the order they were added (it is also appended to
+# by parse_bound_file())
+my $seq_bound_hr = 0;
+my @seq_bound_accs = ();
+
+if ($bound_file_in) {
+ $seq_bound_hr = parse_bound_file($bound_file_in);
+}
+elsif ($bound_file_only) {
+ $seq_bound_hr = parse_bound_file($bound_file_only);
+}
+# no input boundary file: start from an empty table when boundaries
+# will be derived from domains or written out
+elsif ($domain_bound) {
+ my %seq_bound = ();
+ $seq_bound_hr = \%seq_bound;
+}
+elsif ($bound_file_out) {
+ my %seq_bound = ();
+ $seq_bound_hr = \%seq_bound;
+}
+
+################
+# skip down to "The best scores are:"
+#
+# of the values returned here, only $query_descr is used by the code below
+my ($q_num, $query_descr, $q_len, $lib_cnt, $lib_len, $best_yes) = skip_to_results();
+warn "Cannot find the best scores are:" unless $query_descr;
+
+my ($tmp, $gi, $q_db, $q_acc, $q_id);
+
+# query identifiers are '|'-delimited, either gi|number|db|acc|id
+# or db|acc|id
+if ($query_descr =~ /^gi\|\d+\|/) {
+ ($tmp, $gi, $q_db,$q_acc, $q_id) = split(/\|/,$query_descr);
+}
+else {
+ ($q_db,$q_acc, $q_id) = split(/\|/,$query_descr);
+}
+
+# remove a trailing ".<digits>" version suffix from the accession
+$q_acc =~ s/\.\d+$//;
+
+# Main loop: read the hit lines that follow "The best scores are:",
+# one hit per line, until a line containing ">>>" is found.  Each hit
+# passing the E()-value threshold becomes one row of the MSA,
+# optionally restricted to sequence boundaries.
+while (my $line = <>) {
+ chomp $line;
+ next unless ($line);
+
+ last if $line =~ m/>>>/;
+ next if $line =~ m/^\+\-/; # skip over HSPs
+ chomp ($line);
+
+ my %hit_data =();
+
+ # tab-delimited fields: description/scores, alignment coordinates,
+ # BTOP alignment encoding, domain annotations
+ my ($left, $right, $align_f, $annot_f) = split(/\t/,$line);
+
+ $align_f= 'NULL' unless $align_f;
+ $annot_f= 'NULL' unless $annot_f;
+
+ # the sequence identifier is either "db:id acc" or "db|acc|id"
+ my @fields = split(/\s+/,$left);
+ my ($ldb, $l_id, $l_acc) = ("","","");
+ if ($fields[0] =~ m/:/) {
+ ($ldb, $l_id) = split(/:/,$fields[0]);
+ ($l_acc) = $fields[1];
+ }
+ else {
+ ($ldb, $l_acc,$l_id) = split(/\|/,$fields[0]);
+ }
+
+ # coordinates come from the second field; bits and E()-value are the
+ # last two whitespace-separated values of the first field
+ @hit_data{@m9_field_names} = split(/\s+/,$right);
+ @hit_data{qw(bits evalue)} = @fields[-2,-1];
+
+ #
+ # currently preselbdr files have $ldb|$l_acc, not full s_seqid, so construct it
+ #
+ my ($s_seqid, $subj_acc) = (join('|',($ldb, $l_acc, $l_id)), "$ldb|$l_acc");
+ @hit_data{qw(s_seqid subj_acc)} = ($s_seqid, $subj_acc);
+ @hit_data{qw(query_id query_acc)} = ($query_descr, $q_acc);
+ $hit_data{BTOP} = $align_f;
+
+ next if ($hit_data{evalue} > $evalue);
+
+ # track the longest name for output formatting
+ if (length($s_seqid) > $max_sseqid_len) {
+ $max_sseqid_len = length($s_seqid);
+ }
+
+ my $have_dom = 0;
+ if ($domain_bound) {
+ my $hit_doms_ar = parse_hit_domains($annot_f);
+ # scan from left to right to make domain boundaries based on $qvalue
+ # (the bounds start swapped -- left at s_end, right at s_start --
+ # and are widened by each qualifying domain)
+ my ($left_bound, $right_bound) = @hit_data{qw(s_end s_start)};
+ foreach my $dom_r ( @$hit_doms_ar ) {
+ next unless $dom_r->{target} eq 'subj';
+ next if $dom_r->{virtual};
+ next unless $dom_r->{qval} > $qvalue;
+
+ if ($dom_r->{s_start} < $left_bound) {
+ $left_bound = $dom_r->{s_start};
+ $have_dom = 1;
+ }
+
+ if ($dom_r->{s_end} > $right_bound) {
+ $right_bound = $dom_r->{s_end};
+ $have_dom = 1;
+ }
+ }
+
+ # record (or overwrite) the domain-derived boundaries for this subject
+ if ($have_dom) {
+ if (exists($seq_bound_hr->{$subj_acc})) {
+ @{$seq_bound_hr->{$subj_acc}}{qw(start end)} = ($left_bound, $right_bound);
+ }
+ else {
+ $seq_bound_hr->{$subj_acc} = {start=>$left_bound, end=>$right_bound};
+ push @seq_bound_accs, $subj_acc;
+ }
+ }
+ }
+
+ # must have separate @hit_list that can be sorted, for searches with multiple alignment results
+
+ # case 1: only sequences with known boundaries enter the MSA
+ if ($bound_file_only || $have_dom) {
+ if (exists($seq_bound_hr->{$subj_acc})) {
+ my ($status, $alignment) = bound_btop2alignment($query_seq_r, $query_len, \%hit_data, @{$seq_bound_hr->{$subj_acc}}{qw(start end)});
+ if ($status) { # alignment is within boundary
+ push @multi_names, $s_seqid;
+ $multi_align{$s_seqid} = $alignment;
+ }
+ # do not delete entry, because it needs to be preserved
+ }
+ }
+ # case 2: known boundaries are applied; hits without boundaries are
+ # added whole and their alignment coordinates become new boundaries
+ elsif ($bound_file_in) {
+ if (exists($seq_bound_hr->{$subj_acc})) {
+ my ($status, $alignment) = bound_btop2alignment($query_seq_r, $query_len, \%hit_data, @{$seq_bound_hr->{$subj_acc}}{qw(start end)});
+ if ($status) {
+ push @multi_names, $s_seqid;
+ $multi_align{$s_seqid} = $alignment;
+# push @multi_align, $alignment;
+ }
+ }
+ else {
+ push @multi_names, $s_seqid;
+# push @multi_align, btop2alignment($query_seq_r, $query_len, \%hit_data, );
+ $multi_align{$s_seqid} = btop2alignment($query_seq_r, $query_len, \%hit_data);
+ @{$seq_bound_hr->{$subj_acc}}{qw(start end)} = @hit_data{qw(s_start s_end)};
+ push @seq_bound_accs, $subj_acc;
+ }
+ }
+ else { # no sequence boundaries
+ push @multi_names, $s_seqid;
+ $multi_align{$s_seqid} = btop2alignment($query_seq_r, $query_len, \%hit_data);
+# push @multi_align, btop2alignment($query_seq_r, $query_len, \%hit_data);
+ # remember the alignment coordinates so they can be written to --bound_file_out
+ if (!$have_dom && ($bound_file_out)) {
+ @{$seq_bound_hr->{$subj_acc}}{qw(start end)} = @hit_data{qw(s_start s_end)};
+ push @seq_bound_accs, $subj_acc;
+ }
+ }
+}
+
+# final MSA output
+# the name column is as wide as the longest sequence name plus two spaces
+$max_sseqid_len += 2;
+
+print "SSEARCHm9B multiple sequence alignment\n\n\n";
+
+# Print the alignment as interleaved blocks of 60 columns.  Every
+# alignment row is built with exactly $query_len columns (array indices
+# 0 .. $query_len-1), so the last block must stop at $query_len-1;
+# testing with '>=' keeps the slice from reaching the non-existent
+# index $query_len when the query length is 59 modulo 60.
+my $i_pos = 0;
+for (my $j = 0; $j < $query_len/60; $j++) {
+    my $i_end = $i_pos + 59;
+    if ($i_end >= $query_len) {$i_end = $query_len-1;}
+    for my $acc (@multi_names) {
+        next unless $acc;
+        printf("%-".$max_sseqid_len."s %s\n",$acc,join("",@{$multi_align{$acc}}[$i_pos .. $i_end]));
+    }
+    $i_pos += 60;
+    print "\n\n";
+}
+
+################
+# if bound_file_out provide it
+# one line per accession: accession<tab>start<tab>end, in the order the
+# boundaries were collected
+if ($bound_file_out) {
+    open(my $bound_fd, ">", $bound_file_out) || die "cannot open $bound_file_out: $!";
+    for my $s_acc ( @seq_bound_accs ) {
+        print $bound_fd join("\t", ($s_acc, @{$seq_bound_hr->{$s_acc}}{qw(start end)})),"\n";
+    }
+    # a failed close on a file opened for writing means data was lost
+    close($bound_fd) || die "cannot write $bound_file_out: $!";
+}
+
+# FASTA-format library of the (boundary-restricted) MSA sequences
+if ($masked_lib_out) {
+    open(my $masked_fd, ">", $masked_lib_out) || die "cannot open $masked_lib_out: $!";
+
+    for my $s_acc ( @multi_names ) {
+        print $masked_fd ">$s_acc\n";
+        my $seq_lines = join('',@{$multi_align{$s_acc}});
+
+        # currently, simply remove all '-' insertions --
+        # other options would be to make external '-'s 'X's
+        $seq_lines =~ s/\-//g;
+
+        # wrap at 60 residues; the lookahead only inserts a newline when
+        # more residues follow, so a sequence whose length is a multiple
+        # of 60 is not followed by an empty line
+        $seq_lines =~ s/(.{60})(?=.)/$1\n/g;
+        print $masked_fd "$seq_lines\n";
+    }
+    close($masked_fd) || die "cannot write $masked_lib_out: $!";
+}
+
+# decode_btop($btop_str)
+#
+# input: a blast BTOP string of the form: "1VA160TS7KG10RK27"
+# returns a list_ref of tokens: (1, "VA", 160, "TS", 7, "KG", 10, "RK", 27)
+#
+# Runs of digits are kept as one token (a count of identical
+# positions); the text between them is cut into two-character
+# (query residue, subject residue) tokens.
+#
+sub decode_btop {
+    my ($btop_str) = @_;
+
+    # the capturing split keeps the numbers as well as the text between them
+    my @pieces = split(/(\d+)/,$btop_str);
+
+    # a string that begins with a number produces an empty leading piece
+    shift @pieces unless $pieces[0];
+
+    my @decoded = ();
+
+    foreach my $piece (@pieces) {
+        if ($piece =~ m/^\d+$/) {
+            push @decoded, $piece;
+        }
+        else {
+            # split on character pairs, dropping the empty separators
+            push @decoded, grep { $_ } split(/(..)/,$piece);
+        }
+    }
+
+    return \@decoded;
+}
+
+# parse_hit_domains($annot_str)
+#
+# Parses the domain annotation field of a hit line.  $annot_str is a
+# series of '|'-introduced entries that look like:
+#   |RX:6-65:6-65:s=311;b=125.4;I=1.000;Q=339.6;C=C.HTH~1
+#   |XR:6-65:6-65:s=311;b=125.4;I=1.000;Q=339.6;C=C.HTH~1
+#   |RX:66-297:66-297:s=1200;b=483.7;I=1.000;Q=1409.6;C=NODOM~0
+#   |XR:66-297:66-297:s=1200;b=483.7;I=1.000;Q=1409.6;C=NODOM~0
+#
+# Returns a reference to a list of hashes, one per non-NODOM entry,
+# with the keys: dom, virtual, bits, percid, qval, target ('query' for
+# RX entries, 'subj' otherwise), q_start, q_end, s_start, s_end, score.
+# An empty annotation string gives a reference to an empty list, so the
+# result can always be dereferenced.
+#
+sub parse_hit_domains {
+    my ($annot_str) = @_;
+
+    return [] unless ($annot_str);
+
+    my @hit_annots = ();
+
+    my @annots = split(/\|/,$annot_str);
+    shift @annots;	# remove first blank
+
+    for my $annot ( @annots ) {
+        my %dom_info = ();
+
+        # parse an entry:
+        # RX:6-65:6-65:s=311;b=125.4;I=1.000;Q=339.6;C=C.HTH~1
+        my @d_fields = split(";",$annot);
+        # pad truncated entries so that missing values are reported by
+        # the "missing ..." warnings below, not as uninitialized values
+        push @d_fields, '' while (scalar(@d_fields) < 5);
+
+        ($dom_info{dom}) = ($d_fields[4] =~ m/C=(.+?)~?\d*v?$/); # also remove virtual domain symbols
+        next if (defined($dom_info{dom}) && $dom_info{dom} =~ m/NODOM/);
+
+        ################
+        # parse @d_fields
+        # a trailing 'v' on the domain field marks a virtual domain
+        $dom_info{virtual} = ($d_fields[4] =~ m/v$/) ? 1 : 0;
+
+        ($dom_info{bits}) = ($d_fields[1] =~ m/b=(\-?\d+\.?\d*)/);
+        unless (defined($dom_info{bits})) {
+            warn "missing bits info - annot: $annot\n annot_str: $annot_str";
+            $dom_info{bits} = '\N';
+        }
+        ($dom_info{percid}) = ($d_fields[2] =~ m/I=(\-?[\d\.]+)/);
+        unless (defined($dom_info{percid})) {
+            warn "missing percid info - annot: $annot\n annot_str: $annot_str";
+            $dom_info{percid} = '\N';
+        }
+
+        ($dom_info{qval}) = ($d_fields[3] =~ m/Q=([\d\.]+)/);
+        # a missing Q-value is treated as 0 so that numeric comparisons
+        # by the caller do not operate on undef
+        $dom_info{qval} = 0 unless (defined($dom_info{qval}));
+
+        ################
+        # parse @c_fields: type, query range, subject range, raw score
+        my @c_fields = split(":",$d_fields[0]);
+        push @c_fields, '' while (scalar(@c_fields) < 4);
+
+        if ($c_fields[0] =~ m/RX/) {$dom_info{target} = 'query';}
+        else {$dom_info{target} = 'subj';}
+
+        @dom_info{qw(q_start q_end)} = ($c_fields[1] =~ m/(\d+)\-(\d+)/);
+        @dom_info{qw(s_start s_end)} = ($c_fields[2] =~ m/(\d+)\-(\d+)/);
+        ($dom_info{score}) = ($c_fields[3] =~ m/s=(\-?\d+)/);
+        unless (defined($dom_info{score})) {
+            warn "missing score info - annot: $annot\n annot_str: $annot_str";
+            $dom_info{score} = '\N';
+        }
+
+        push @hit_annots, \%dom_info;
+    }
+
+    return \@hit_annots;
+}
+
+
+# btop2alignment($query_seq_r, $query_len, $hit_data_hr, $seq_bound_hr)
+#
+# Builds one MSA row from a BTOP-encoded pairwise alignment.
+#   $query_seq_r  - ref to the 1-based list of query residues
+#   $query_len    - length of the query
+#   $hit_data_hr  - hash ref providing q_start, q_end and BTOP
+#   $seq_bound_hr - accepted for compatibility with callers; not used
+#
+# Returns a ref to a list with one entry per query position: the
+# aligned subject residue, or '-' where the query is unaligned or
+# aligned to a gap.  Subject residues aligned to a gap in the query are
+# dropped, so the row always follows the query coordinates.
+#
+sub btop2alignment {
+    my ($query_seq_r, $query_len, $hit_data_hr, $seq_bound_hr) = @_;
+
+    # $query_seq_r is 1: based
+    my @alignment = ();
+
+    # the left unaligned region gets "-";
+    for (my $i=1; $i < $hit_data_hr->{q_start}; $i++) {
+        push @alignment, "-";
+    }
+
+    my $btop_align_r = decode_btop($hit_data_hr->{BTOP});
+
+    my ($seq0, $seq1) = ("","");
+    my $qix = $hit_data_hr->{q_start};
+    for my $btop (@{$btop_align_r}) {
+        if ($btop =~ m/^\d+$/) {  # matching query sequence, add it up
+            for (my $i=0; $i < $btop; $i++) {
+                push @alignment, $query_seq_r->[$qix++];
+            }
+        }
+        else {	# could be: TS/-S/T-
+            ($seq0, $seq1) = split(//,$btop);
+            # anything but a gap in the query consumes a query position;
+            # $seq1 is the subject residue, or '-' for a gap in the subject
+            if ($seq0 ne '-') {
+                push @alignment, $seq1;
+                $qix++;
+            }
+        }
+    }
+    # all done with alignment, double check that $qix = $hit_data_hr->{q_end}
+    unless ($qix == $hit_data_hr->{q_end}+1) {
+        # parentheses are required: '.' and '+' have the same precedence,
+        # so without them the message string would be added to 1
+        warn "$qix != ".($hit_data_hr->{q_end}+1);
+    }
+
+    # the right unaligned region also gets "-"
+    for (my $i = $hit_data_hr->{q_end}+1; $i <= $query_len; $i++) {
+        push @alignment, "-";
+    }
+
+    return \@alignment;
+}
+
+################
+# generates MSA alignment entry between $sb_start and $sb_end
+# if there are no aligned residues between these locations, return $status=0
+#
+# bound_btop2alignment($query_seq_r, $query_len, $hit_data_hr, $sb_start, $sb_end)
+#   $query_seq_r - ref to the 1-based list of query residues
+#   $query_len   - length of the query
+#   $hit_data_hr - hash ref providing q_start, q_end, s_start and BTOP
+#   $sb_start, $sb_end - subject coordinates delimiting the region to keep
+#
+# Returns ($status, $alignment_r): $status is 1 if at least one aligned
+# residue lies inside the boundaries; $alignment_r is the MSA row, with
+# '-' at every position outside the boundaries.
+
+sub bound_btop2alignment {
+    my ($query_seq_r, $query_len, $hit_data_hr, $sb_start, $sb_end) = @_;
+
+    # $query_seq_r is 1: based
+    my @alignment = ();
+
+    my $have_aligned_res = 0;
+
+    # the left unaligned region gets "-";
+    for (my $i=1; $i < $hit_data_hr->{q_start}; $i++) {
+        push @alignment, "-";
+    }
+
+    my $btop_align_r = decode_btop($hit_data_hr->{BTOP});
+
+    my ($seq0, $seq1) = ("","");
+    # $qix/$six track the current query/subject coordinates
+    my ($qix, $six) = @{$hit_data_hr}{qw(q_start s_start)};
+
+    for my $btop (@{$btop_align_r}) {
+        if ($btop =~ m/^\d+$/) {  # matching query sequence, add it up
+            for (my $i=0; $i < $btop; $i++) {
+                if ($six >= $sb_start && $six <= $sb_end) {
+                    push @alignment, $query_seq_r->[$qix];
+                    $have_aligned_res=1;
+                }
+                else {
+                    push @alignment, '-';
+                }
+                $qix++; $six++;
+            }
+        }
+        else {	# could be: TS/-S/T-
+            ($seq0, $seq1) = split(//,$btop);
+            if ($seq1 eq '-') {  # gap in subject
+                push @alignment, '-';
+                $qix++;
+            }
+            elsif ($seq0 ne '-') {  # mismatch
+                if ($six >= $sb_start && $six <= $sb_end) {
+                    $have_aligned_res=1;
+                    push @alignment, $seq1;
+                }
+                else {
+                    push @alignment, '-';
+                }
+                $qix++;
+                $six++;
+            }
+            else {  # gap in query, consume $six
+                $six++;
+            }
+        }
+    }
+    # all done with alignment, double check that $qix = $hit_data_hr->{q_end}
+    unless ($qix == $hit_data_hr->{q_end}+1) {
+        # parentheses are required: '.' and '+' have the same precedence,
+        # so without them the message string would be added to 1
+        warn $qix." != ".($hit_data_hr->{q_end}+1);
+    }
+
+    # the right unaligned region also gets "-"
+    for (my $i = $hit_data_hr->{q_end}+1; $i <= $query_len; $i++) {
+        push @alignment, "-";
+    }
+
+    return ($have_aligned_res, \@alignment);
+}
+
+# parse_query_lib($query_file)
+#
+# Reads the query sequence from a FASTA file.  Only the first entry is
+# used; reading stops at a second '>' header so that the returned
+# header and sequence always belong to the same entry.
+#
+# Returns ($header, $seq_r):
+#   $header - the first header line without '>' and without anything
+#             after the first whitespace
+#   $seq_r  - ref to a 1-based list of upper-case residues (element 0
+#             is an empty placeholder)
+# Dies if the file cannot be opened.
+#
+sub parse_query_lib {
+    my ($query_file) = @_;
+
+    open(my $qfd, '<', $query_file) || die "cannot open query file $query_file: $!";
+
+    my ($header, $sequence) = ("","");
+    while (my $entry = <$qfd>) {	# read the file one line at a time
+        chomp $entry;
+        if ($entry =~ m/^>/) {
+            last if ($header);	# second entry: the query is complete
+            $header = $entry;
+        }
+        else {
+            $sequence .= $entry;
+        }
+    }
+    close($qfd);
+
+    $sequence =~ s/[^A-Za-z\*]//g;	# remove everything but letters and '*'
+    $sequence = uc($sequence);
+
+    $header =~ s/^>//;
+    $header =~ s/\s.*$//;
+
+    my @seq = split(//,$sequence);
+    unshift @seq,"";	# @seq is now 1-based
+
+    return ($header, \@seq);
+}
+
+# parse_bound_file($bound_file)
+#
+# Reads a tab-delimited boundary file with lines of the form
+#   accession<tab>start<tab>end
+# Lines starting with '#', blank lines and lines without three fields
+# are skipped (the latter with a warning).  The first boundary seen for
+# an accession wins; later ones produce a warning.
+#
+# Returns a ref to a hash accession => {start=>..., end=>...}, or 0 if
+# the file cannot be opened.  Each new accession is also appended to
+# the file-level list @seq_bound_accs.
+#
+sub parse_bound_file {
+    my ($bound_file) = @_;
+
+    my %seq_bound = ();
+
+    my $qfd;
+    unless (open($qfd, '<', $bound_file)) {
+        warn "cannot open boundary file $bound_file: $!";
+        return 0;
+    }
+
+    while (my $line = <$qfd>) {
+        next if ($line =~ m/^#/);
+        chomp $line;
+        next unless ($line =~ m/\S/);	# ignore blank lines
+
+        my ($acc, $start, $end) = split(/\t/,$line);
+        unless (defined($start) && defined($end)) {
+            warn "malformed boundary line: $line";
+            next;
+        }
+
+        if (!defined($seq_bound{$acc})) {
+            $seq_bound{$acc} = {start=>$start, end=>$end};
+            push @seq_bound_accs, $acc;
+        }
+        else {
+            warn "multiple boundaries for $acc";
+        }
+    }
+    close($qfd);
+
+    return \%seq_bound;
+}
+
+# skip_to_results()
+#
+# Reads the input (<>) up to the list of hits.  First looks for the
+# query line, e.g. "  1>>>query_id description - 218 aa", then reads
+# the two following lines (the second one holds "N residues in M
+# sequences"), and finally skips to "The best scores are:".
+#
+# Returns the six values unpacked by the caller:
+#   ($q_num, $query_desc, $q_len, $l_num, $l_len, $best_yes)
+# where $l_num/$l_len are the number of library sequences/residues and
+# $best_yes is 1 if "The best scores are:" was found.  Returns (0,"")
+# if no query line is found before the end marker ">>>///" or the end
+# of the input.
+#
+sub skip_to_results {
+
+    my ($q_num, $query_desc, $q_len, $l_num, $l_len);
+    my $have_query = 0;
+
+    while (my $line = <>) {
+        if ($line =~ m/^\s*(\d+)>>>(\S+)\s.+ \- (\d+) aa$/) {
+            ($q_num,$query_desc, $q_len) = ($1,$2,$3);
+            $line = <>;	# skip Library:
+            $line = <>;	# 153571012 residues in 291716 sequences
+            if (defined($line)) {	# input may end right after the query line
+                ($l_len, $l_num) = ($line =~ m/^\s+(\d+)\s+residues in\s+(\d+)/);
+            }
+            $have_query = 1;
+            last;
+        }
+        elsif ($line =~ m/>>>\/\/\//) {last;}
+    }
+
+    return (0,"") unless ($have_query);
+
+    my $best_yes = 0;
+    while (my $line = <>) {
+        if ($line =~ m/^The best scores are:/) {
+            $best_yes = 1;
+            last;
+        }
+        last if ($line =~ m/^!! No sequences/);
+    }
+
+    return ($q_num, $query_desc, $q_len, $l_num, $l_len, $best_yes);
+}
+
+__END__
+
+=pod
+
+=head1 NAME
+
+ m9B_btop_msa.pl
+
+=head1 SYNOPSIS
+
+ m9B_btop_msa.pl --query_file query.fasta [--bound_file_in seqbdr.tab] fasta_m9_output.file
+
+=head1 OPTIONS
+
+ -h short help
+ --help include description
+
+ --query_file -- query sequence file
+ -- same as --query
+ (only one sequence per file)
+
+ --bound_file_in -- tab delimited accession<tab>start<tab>end that
+ specifies MSA boundaries WITHIN alignment.
+ Additional hits use alignment (or domain)
+ boundaries.
+
+ --bound_file_only -- tab delimited accession<tab>start<tab>end that
+ specifies MSA boundaries WITHIN alignment.
+ Only sequences in --bound_file_only will be in the MSA.
+
+ --bound_file_out -- "--bound_file" for next iteration of psisearch2
+
+ --domain_bound parse domain annotations (-V) from m9B file
+ --domain
+
+ --masked_lib_out -- FASTA format library of MSA sequences
+
+=head1 DESCRIPTION
+
+C<m9B_btop_msa.pl> takes a fasta36/ssearch36 -m 9B output file, which
+includes a BTOP encoded alignment string, and produces the multiple
+sequence alignment (MSA) implied by the query sequence, alignment
+boundaries, and pairwise alignments. The alignment does not allow
+gaps in the query sequence, only in the subject sequences.
+
+The C<--query_file> must be specified, and the query sequence is
+provided as the first sequence in the MSA.
+
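+If C<--bound_file_in> or C<--bound_file_only> is provided, then the
+ends of the alignments are reduced to the coordinates specified by the
+boundary file. With C<--bound_file_only>, only sequences listed in
+the boundary file are included in the MSA; with C<--bound_file_in>,
+hits that are not listed are included using their alignment
+boundaries.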
+
+Output: A clustal-like interleaved multiple sequence alignment that
+can be used as input (using the C<-in_msa> option) to C<psiblast>.
+
+=head1 AUTHOR
+
+William R. Pearson, wrp at virginia.edu
+
+=cut
diff --git a/scripts/plot_domain2t.cgi b/scripts/plot_domain2t.cgi
new file mode 100755
index 0000000..0aa0c53
--- /dev/null
+++ b/scripts/plot_domain2t.cgi
@@ -0,0 +1,667 @@
+#!/usr/bin/perl -w
+
+# plot_domain2.pl - produce SVG plot for aligned domains
+# version2 plots both n0 and n1 sequences, with 2 axes
+#
+# args:
+# q_cstop - n0 - query length
+# l_cstop - n1 - lib length
+# q_name - query_acc
+# l_name - library_acc
+# l_astart= lib start (need q_start, q_astop, l_astart, l_astop)
+# l_astop= lib stop
+# pgm = program used
+# regions -- same as annotations on alignment
+# doms -- domains on (library) sequence
+#
+# l_annot - script to run to annotate library domain (separate from alignment)
+# q_annot - script to run to annotate query domain (separate from alignment)
+#
+
+# 9-May-2013 -- modify to accommodate reverse-complement coordinates
+
+use strict;
+use Getopt::Long;
+use Pod::Usage;
+
+use CGI qw(header param end_html);
+#use URI::Escape;
+
+# package globals shared by the plotting subroutines
+use vars qw($pminx $pmaxx $pminy $pmaxy $lvstr $max_x $max_y
+	    $fxscal $fyscal $fxoff $fyoff $x_rev $y_rev
+	    @block_colors
+	    $annot_color %annot_names %color_names);
+
+# fill colors for domain/region blocks, selected by color number modulo
+# the length of this list
+# NOTE(review): "mediumplum" does not appear to be an SVG color keyword
+# ("plum" and "mediumpurple" are) -- confirm the intended color
+@block_colors = qw( slategrey lightgreen lightblue pink cyan tan gold plum mediumplum );
+
+$annot_color = 1;
+%annot_names = ();
+
+# annotation script command lines, selected by a numeric index
+# (indices 0 and 1 have no script)
+my @annot_scripts = ("", "",
+ "ann_feats2ipr.pl",
+ "ann_feats2l.pl",
+ "ann_feats2ipr.pl",
+ "ann_pfam26.pl",
+ "ann_pfam26.pl --pfacc",
+ "ann_pdb_cath.pl",
+ "ann_pdb_cath.pl --class",
+ );
+
+# $max_x, $max_y define the maximum plotting area
+# the actual bounding box/view area will be larger if annotation comments are available
+($max_x,$max_y)=(540,24);
+
+# candidate x-axis lengths; openplt() picks the first one that is at
+# least as long as the combined axis ($comb_xax)
+my @xax_len = (200, 300, 500, 1000, 2000, 3000, 5000, 10000, 20000, 30000, 50000);
+my $max_xax = -1;
+my $comb_xax = 0; # comb_xax captures the length of the two sequences minus the offset
+# x-offsets of the query axis ($x0c_off), the library axis ($x1c_off)
+# and of the alignment/region boxes ($xdc_off); set by calc_offsets()
+my ($x0c_off, $x1c_off, $xdc_off) = (0,0,0);
+
+my ($x0f3, $x1f3) = (1,1); # set to 3 for fastx (x0f3) or tfastx (x1f3)
+
+# tick array - candidate tick increments for the axes
+my @tarr = (50, 100,200,500,1000,2000,5000,10000,20000);
+my $MAX_INTERVAL=20000;
+
+# CGI request object; all plot parameters arrive as request parameters
+my $q = CGI->new;
+
+my %dom_colors = ();
+my $max_color = 0;
+
+my @arg_names = $q->param();
+
+# param() is forced into scalar context: in list context a repeated
+# parameter returns several values, which would shift the key/value
+# pairs of %args and allow a request to overwrite other arguments
+my %args = map { $_ => scalar($q->param($_)) } @arg_names;
+
+# translated searches plot one sequence in codons: fx/fy translate the
+# query ($x0f3), tfx/tfy the library ($x1f3)
+if (defined($args{pgm})) {
+    if ($args{pgm} =~ m/^f[xy]$/) { $x0f3 = 3;}
+    elsif ($args{pgm} =~ m/^tf[xy]/) { $x1f3 = 3;}
+}
+
+#unless ($ENV{DOCUMENT_ROOT}) {
+# %args = map { $_ => uri_unescape($args{$_}) } keys %args;
+#}
+
+# array refs describing the aligned regions and the query/library
+# domains; each defaults to an empty list when the parameter is absent
+my ($region_info_r, $q_dom_info_r, $l_dom_info_r);
+
+if ($args{regions}) {
+ $region_info_r = parse_regions($args{regions});
+}
+else {$region_info_r = [];}
+
+if ($args{doms}) {
+ ($q_dom_info_r, $l_dom_info_r) = parse_domains($args{doms});
+} else {
+ $q_dom_info_r = [];
+ $l_dom_info_r = [];
+}
+
+# query annotations; stays empty because the code that would fill it
+# is commented out below
+my @q_annots = ();
+# unless (scalar(@{$q_dom_info_r})) {
+# my $q_annot_script = "";
+# if (defined($args{q_annot}) && $args{q_annot}) {
+# $q_annot_script = $annot_scripts[$args{q_annot}];
+# }
+# if (!$q_annot_script && defined($args{l_annot}) && $args{l_annot}) {
+# $q_annot_script = $annot_scripts[$args{l_annot}];
+# }
+
+# if ($q_annot_script) {
+# open(S_IN, '-|',"./$q_annot_script --lav \'sp|$args{q_name}'") || die "cannot open $args{q_name}\n";
+# while (my $s_line = <S_IN>) {
+# next if ($s_line =~ m/^#/);
+# next if ($s_line =~ m/^>/);
+# chomp($s_line);
+# my %q_data = ();
+# @q_data{qw(beg end descr)} = split(/\t/,$s_line);
+# if ($dom_colors{$q_data{descr}}) {
+# $q_data{color} = $dom_colors{$q_data{descr}}
+# } else {
+# $q_data{color} = ++$max_color;
+# $dom_colors{$q_data{descr}} = $max_color;
+# }
+# push @q_annots, \%q_data;
+# }
+# }
+# }
+
+
+# draw the plot: SVG header and the two sequence axes, the alignment
+# box, the aligned regions, and then the domain blocks
+openplt(($args{q_cstop}-$args{q_cstart})+1, ($args{l_cstop}-$args{l_cstart})+1, $q_dom_info_r, $l_dom_info_r);
+draw_align(\%args);
+if (scalar(@{$region_info_r})) {
+ draw_regions($region_info_r, $args{l_cstop});
+}
+
+my $q_annot_script = "";
+# library domains are drawn at y offset -12, query domains at 48
+if ($args{doms}) {
+ if (scalar(@{$l_dom_info_r})) {
+ draw_doms($l_dom_info_r, $x1c_off, -12, $args{l_cstart}, $args{l_cstop});
+ }
+ if (scalar(@{$q_dom_info_r})) {
+ draw_doms($q_dom_info_r, $x0c_off, 48, $args{q_cstart}, $args{q_cstop});
+ }
+ elsif (scalar(@q_annots)) {
+ draw_doms(\@q_annots, $x0c_off, 48, $args{q_cstart}, $args{q_cstop});
+ }
+}
+
+# closeplt() ignores its argument
+closeplt($args{l_cstop});
+
+exit(0);
+
+# have all the data (and length of sequence), scale it and color it
+
+# SX($xx): convert a sequence x coordinate to an SVG x coordinate --
+# scale by $fxscal, shift by $fxoff plus a fixed margin of 18, and
+# truncate to an integer
+# (C original: #define SX(x) (int)((double)(x)*fxscal+fxoff+6))
+sub SX {
+    my ($seq_x) = @_;
+    my $svg_x = $seq_x*$fxscal + $fxoff + 18;
+    return int($svg_x);
+}
+
+# SY($yy): convert a plot y coordinate to an SVG y coordinate; SVG y
+# grows downwards, so the scaled value is subtracted from $max_y
+sub SY {
+    my ($plot_y) = @_;
+    my $scaled_y = int($plot_y*$fyscal + $fyoff);
+    return $max_y - $scaled_y;
+}
+
+my $y_delta = 0;
+
+# openplt($n0, $n1, $q_dom_info_r, $l_dom_info_r)
+#
+# Starts the SVG document and draws the axes and the two sequence lines.
+#   $n0, $n1 - signed lengths of the query and library coordinate
+#              ranges (negative for reversed coordinates)
+#   $q_dom_info_r, $l_dom_info_r - array refs of query/library domains;
+#              only used to decide how much vertical space is needed
+# Sets the file-level scaling and offset variables ($fxscal, $fyscal,
+# $fxoff, $fyoff, $max_xax, $comb_xax, $x0c_off, $x1c_off, $xdc_off).
+#
+# (C original: void openplt(long n0, long n1, int sq0off, int sq1off, char *xtitle, char *ytitle))
+sub openplt
+{
+    my ($n0, $n1, $q_dom_info_r, $l_dom_info_r) = @_;
+
+    my ($xbound, $ybound) = ($max_x + 24, 48);
+    my ($x0_rev, $x1_rev) = (0,0);
+
+    # add room for a row of query domains (or annotations) and for a
+    # row of library domains
+    if (scalar(@{$q_dom_info_r}) || scalar(@q_annots)) {
+        $ybound += 14;
+    }
+    $ybound += 14 if (scalar(@{$l_dom_info_r}));
+
+    # reversed coordinates give a negative length; convert to a positive one
+    if ($n0 < 0) {$x0_rev=1; $n0 = 2 - $n0;}
+    if ($n1 < 0) {$x1_rev=1; $n1 = 2 - $n1;}
+
+    print $q->header('image/svg+xml') if ($ENV{DOCUMENT_ROOT});
+    print("<?xml version=\"1.0\" standalone=\"no\"?>\n");
+    print("<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\" \n");
+    print("\"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n\n");
+
+    # l_name comes from the request: runs of '-' are collapsed so the
+    # value cannot terminate the XML comment ("--" is not allowed
+    # inside a comment)
+    my $l_name = defined($args{l_name}) ? $args{l_name} : "";
+    $l_name =~ s/-{2,}/-/g;
+    print(qq(<!-- l_name=$l_name -->\n));
+    print("<svg width=\"$xbound\" height=\"$ybound\" version=\"1.1\"\n");
+#   print("<svg version=\"1.1\"\n");
+    print("xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n\n");
+
+# simple things first, if query is shorter and inside, or library is
+# shorter and inside, then just use the longer sequence.
+
+    ($x0c_off, $x1c_off, $xdc_off) = (0,0,0);
+
+    ($comb_xax, $xdc_off, $x0c_off, $x1c_off) = calc_offsets($n0, $x0f3, $x0_rev, $n1, $x1f3, $x1_rev, \%args);
+
+    # use the first axis length that holds the combined axis, or the
+    # longest available one if none does
+    $max_xax = $xax_len[$#xax_len];
+    for my $axis_len (@xax_len) {
+        if ($comb_xax <= $axis_len) {
+            $max_xax = $axis_len;
+            last;
+        }
+    }
+
+    $fxscal = ($max_x-1)/$max_xax;
+    $fyscal = 1;
+
+    $fxscal *= 0.9; $fxoff = 24;
+    $fyoff = -14;
+    if (scalar(@{$q_dom_info_r}) || scalar(@q_annots)) {$fyoff -= 12;}
+
+    xaxis2($n0/$x0f3,$args{q_cstart}/$x0f3, $x0_rev, $n1/$x1f3, $args{l_cstart}/$x1f3, $x1_rev);
+
+    # query sequence line
+    newline(qq(stroke="black" stroke-width="1.5"));
+    move(SX($x0c_off), SY(15));
+    draw(SX($x0c_off+($n0/$x0f3)),SY(15));
+    clsline($n0,0);
+
+    # library sequence line
+    newline(qq(stroke="black" stroke-width="1.5"));
+    move(SX($x1c_off), SY(9));
+    draw(SX($x1c_off+($n1/$x1f3)),SY(9));
+
+    clsline($n1,0);
+}
+
+# closeplt(): finish the SVG document; any arguments are ignored
+sub closeplt {
+    print qq(</svg>\n);
+}
+
+# calc_offsets($n0, $x0f3, $x0_rev, $n1, $x1f3, $x1_rev, $args_r)
+#
+# Works out how the query (n0) and library (n1) axes are shifted so
+# that the aligned regions line up, and how long the combined axis is.
+#   $n0, $n1     - query and library lengths
+#   $x0f3, $x1f3 - 3 if that sequence is translated DNA, otherwise 1;
+#                  lengths and coordinates are divided by it
+#   $x0_rev, $x1_rev - accepted but not used here
+#   $args_r      - hash ref with q_astart, q_astop, q_cstart, q_cstop,
+#                  l_astart, l_astop, l_cstart, l_cstop
+#
+# Returns ($comb_xax, $xdc_off, $x0c_off, $x1c_off): the combined axis
+# length, the offset of the alignment boxes, and the offsets of the
+# query and library axes.
+#
+sub calc_offsets {
+    my ($n0, $x0f3, $x0_rev, $n1, $x1f3, $x1_rev, $args_r) = @_;
+
+    my ($comb_xax, $x0c_off, $x1c_off, $xdc_off) = (0,0,0,0);
+
+    my ($n0f, $n1f, $q_start_d, $q_stop_d, $l_start_d, $l_stop_d) =
+	(
+	 $n0/$x0f3,	# $n0f
+	 $n1/$x1f3,	# $n1f -- the library length is scaled by the library factor
+	 abs($args_r->{q_astart} - $args_r->{q_cstart})/$x0f3,	# $q_start_d
+	 abs($args_r->{q_astop} - $args_r->{q_cstop})/$x0f3,	# $q_stop_d
+	 abs($args_r->{l_astart} - $args_r->{l_cstart})/$x1f3,	# $l_start_d
+	 abs($args_r->{l_astop} - $args_r->{l_cstop})/$x1f3	# $l_stop_d
+	);
+
+    if (($n1f >= $n0f) && ($l_start_d >= $q_start_d) && ($l_stop_d >= $q_stop_d)) {
+        # n1 is longer and n0 is contained
+        $comb_xax = $n1f;	# $comb_xax : combined x-axis, in amino-acids if translated
+        $x0c_off = $l_start_d - $q_start_d;
+    }
+    elsif (($n1f < $n0f) && ($l_start_d <= $q_start_d) && ($l_stop_d <= $q_stop_d)) {
+        # n0 is longer and n1 is contained
+        $comb_xax = $n0f;
+        $xdc_off = $x1c_off = $q_start_d - $l_start_d;
+    }
+    # some kind of extension is necessary
+    elsif ($l_start_d >= $q_start_d) {
+        $x0c_off = $l_start_d - $q_start_d;
+        $comb_xax = $n0f + $x0c_off;
+    }
+    else {
+        $xdc_off = $x1c_off = $q_start_d - $l_start_d;
+        $comb_xax = $n1f + $x1c_off;
+    }
+
+    return ($comb_xax, $xdc_off, $x0c_off, $x1c_off);
+}
+
+# draw_trapz($start0, $stop0, $start1, $stop1, $color, $text)
+#
+# Draws a filled trapezoid connecting the range $start0..$stop0 on the
+# query line (y=20, shifted by $x0c_off) to the range $start1..$stop1
+# on the library line (y=10, shifted by $x1c_off).  $color is taken
+# modulo the number of block colors.  $text is accepted but not drawn.
+sub draw_trapz {
+    my ($start0, $stop0, $start1, $stop1, $color, $text) = @_;
+
+    $color = ($color % scalar(@block_colors));
+    my $svg_color = $block_colors[$color];
+
+    newline(qq(stroke="black" stroke-width="1.5"));
+
+    move(SX($start0+$x0c_off), SY(20));
+    draw(SX($stop0+$x0c_off),SY(20));
+    draw(SX($stop1+$x1c_off),SY(10));
+    # the fourth corner is the library start, not the query start
+    draw(SX($start1+$x1c_off),SY(10));
+    # NOTE(review): the path returns to the first corner with a move,
+    # so the closing edge is filled but not stroked -- confirm intent
+    move(SX($start0+$x0c_off), SY(20));
+
+    print(qq(" fill="$svg_color" />\n));
+
+
+#    print (qq(<rect x="$x" y="$y" width="$w" height="$h" fill="$svg_color" stroke="white" stroke-width="1" />\n));
+#    print (qq(<text x="$tx" y="$ty" font-size="9" font-family="sans-serif" fill="white" text-anchor="middle">$text</text>\n));
+}
+
+# draws a colored solid block, and labels it, to indicate domain
+#
+# draw_block($x, $y, $w, $h, $color, $text, $Q)
+#   $x, $y, $w, $h - SVG position and size of the rectangle
+#   $color - color number, taken modulo the number of block colors
+#   $text  - label; only the first 10 characters are shown
+#   $Q     - Q-value; blocks with $Q < 30 get a wider white outline
+sub draw_block {
+    my ($x, $y, $w, $h, $color, $text, $Q) = @_;
+
+    $color = ($color % scalar(@block_colors));
+    my $svg_color = $block_colors[$color];
+    my $tx = $x + int($w/2);
+    my $ty = $y + 9;
+
+    $text = substr($text,0,10);
+    # the label originates from request parameters: escape XML markup
+    # characters (after truncation, so that no entity is cut in half)
+    $text =~ s/&/&amp;/g;
+    $text =~ s/</&lt;/g;
+    $text =~ s/>/&gt;/g;
+
+    my $stroke_width = 0.5;
+    if ($Q < 30.0) {$stroke_width = 2;}
+
+    print (qq(<rect x="$x" y="$y" width="$w" height="$h" fill="$svg_color" stroke="white" stroke-width="$stroke_width" />\n));
+    print (qq(<text x="$tx" y="$ty" font-size="9" font-family="sans-serif" fill="white" text-anchor="middle">$text</text>\n));
+}
+
+# draw_regions($annot_arr_r, $n1)
+# Draws one labelled block per aligned region, positioned from the
+# region's library coordinates (beg1/end1); $n1 is not used.
+sub draw_regions {
+    my ($annot_arr_r, $n1) = @_;
+
+    foreach my $region ( @{$annot_arr_r} ) {
+        my $left = SX($region->{beg1} - $args{l_cstart}+$xdc_off);
+        my $width = SX($region->{end1}+$xdc_off)-SX($region->{beg1}+$xdc_off);
+
+        draw_block($left, SY(18), $width, 12,
+                   $region->{color}, $region->{descr}, $region->{Q});
+    }
+}
+
+# draw_doms($annot_arr_r, $xc_off, $y_off, $xc_start, $xc_stop)
+# Draws one labelled block per domain (beg/end), shifted horizontally
+# by $xc_off and placed at height $y_off; a fixed Q of 100.0 is passed
+# to draw_block().  $xc_start and $xc_stop are not used.
+sub draw_doms {
+    my ($annot_arr_r, $xc_off, $y_off, $xc_start, $xc_stop) = @_;
+
+    foreach my $dom ( @{$annot_arr_r} ) {
+        my $left = SX($dom->{beg}+$xc_off);
+        my $width = SX($dom->{end}+$xc_off)-SX($dom->{beg}+$xc_off);
+
+        draw_block($left, SY($y_off), $width, 12,
+                   $dom->{color}, $dom->{descr}, 100.0);
+    }
+}
+
+# draw_align($arg_r)
+# Draws the unfilled, black-outlined box that marks the aligned part
+# of the library sequence.  The coordinates are read from the
+# file-level %args; $arg_r is accepted but not used.
+sub draw_align {
+    my $arg_r = shift;
+
+    my $left = SX($args{l_astart} - $args{l_cstart} + $xdc_off);
+    my $top = SY(21);
+    my $width = SX($args{l_astop}+$xdc_off) - SX($args{l_astart}+$xdc_off);
+    my $height = 18;
+
+    print (qq(<rect x="$left" y="$top" width="$width" height="$height" stroke="black" fill-opacity="0" stroke-width="1" />\n));
+}
+
+# newline($options): open an SVG <path> element up to the start of its
+# d="..." attribute; $options supplies the other attributes and
+# defaults to a black stroke
+# (C original: void newline(char *options))
+sub newline
+{
+    my ($options) = @_;
+
+    my $attrs = $options ? $options : 'stroke="black"';
+    print('<path ' . $attrs . ' d="');
+}
+
+# clsline($x, $y, $s): close the <path> element opened by newline();
+# the arguments are accepted but not used
+# (C original: void clsline(long x, long y, int s))
+sub clsline
+{
+    my ($x, $y, $s) = @_;
+
+    print(qq(" fill="none" />\n));
+}
+
+# move($x, $y): append a "move to" command to the current path data;
+# the coordinates are printed as integers
+# (C original: void move(int x, int y))
+sub move
+{
+    my ($to_x, $to_y) = @_;
+    printf(' M %d %d', $to_x, $to_y);
+}
+
+# draw($x, $y): append a "line to" command to the current path data;
+# the coordinates are printed as integers
+# (C original: void draw(int x, int y))
+sub draw
+{
+    my ($to_x, $to_y) = @_;
+    printf(' L %d %d', $to_x, $to_y);
+}
+
+# void xaxis(long n, int offset, char *title)
+# coordinates in amino acids - modify for final axes
+#
+# xaxis2($n0, $offset0, $x0_rev, $n1, $offset1, $x1_rev)
+# Draws the tick marks and tick labels of both axes: up-ticks at
+# SY(26) for sequence 0 and down-ticks at SY(0) for sequence 1.
+#   $n0, $n1           - axis lengths
+#   $offset0, $offset1 - coordinate of the first plotted position
+#   $x0_rev, $x1_rev   - true if that axis has reversed coordinates
+# Labels are multiplied by $x0f3/$x1f3 before printing.
+sub xaxis2 {
+ my ($n0, $offset0, $x0_rev, $n1, $offset1,$x1_rev) = @_;
+
+ my ($v_offset0, $v_offset1) = ($offset0, $offset1);
+
+ my ($i, $jm, $tick_length, $max_ticks, $tick_inc);
+ my ($js, $jl0, $jl1);
+ my ($sgn0, $sgn1) = (1,1);
+
+ my $num_len;
+ my $numstr;
+
+ if ($x0_rev) {
+ $sgn0 = -1;
+ $v_offset0 = $offset0 - $n0;
+ }
+
+ if ($x1_rev) {
+ $sgn1 = -1;
+ $v_offset1 = $offset1 - $n1;
+ }
+
+ # for translated-DNA/protein searches, both $n0 and $n1 are in amino-acids
+ my $n_max = $n1;
+ $n_max = $n0 if ($n0 > $n1);
+ my $offset = 0;
+
+ $tick_length = 2;
+
+ # search for the correct increment for the tick array */
+ # @tarr[] has the list of axis increments we might use
+ # we want a tick increment that gives < 6 ticks
+ for ($i=0; $i< @tarr; $i++) {
+ # seek to divide into 6 or fewer divisions */
+ if (($max_ticks = $n_max/$tarr[$i]) <= 6) {goto found;}
+ }
+
+ # these happen only if no tick increment was found
+ # point $i to the last element
+ $i = scalar(@tarr)-1;
+
+ # $max_ticks is the number of increments for longest sequence
+
+ $max_ticks = ($n_max)/$tarr[-1];
+ # $i is reset to -1, which also indexes the last element of @tarr
+ $i = -1;
+
+ found:
+ # NOTE(review): max() is not defined in the visible part of this file
+ # and List::Util is not imported -- presumably defined further down; confirm
+ $max_ticks += max($offset0, $offset1)/$tarr[$i];
+ $tick_inc = $tarr[$i];
+
+ # jo is the offset for the coordinate system, e.g. if we are
+ # plotting an alignment from 101 - 300 rather than 1 - 400 we may
+ # show partial sequences in alignments, it should be kept, but is is
+ # different from the axis shift ($x0c_off, $x1c_off)
+
+ my ($xx0c_off, $xx1c_off) = ($x0c_off, $x1c_off);
+
+ # axis 0, forward coordinates: ticks first (one path), then labels
+ unless ($x0_rev) {
+# $xx0c_off -= $offset0;
+# draw up-tick
+ newline("stroke=\"black\" stroke-width=\"1.5\"");
+ for ($i=1; $i<=$max_ticks; $i++) {
+ last if ($i*$tick_inc > $n0 + $v_offset0);
+ next if ($i*$tick_inc < $v_offset0);
+
+ move(SX($i*$tick_inc + $xx0c_off - $v_offset0),SY(26));
+ draw(SX($i*$tick_inc + $xx0c_off - $v_offset0),SY(26)+$tick_length);
+ }
+ clsline($n_max,$n_max,10000);
+
+ for ($i=1; $i<=$max_ticks; $i++) {
+ last if ($i*$tick_inc > $n0 + $v_offset0);
+ next if ($i*$tick_inc < $v_offset0);
+ $numstr = sprintf("%ld",$i*$tick_inc*$x0f3 );
+ printf(qq(<text x="%d" y="%d" font-family="sans-serif" font-size='9' text-anchor="middle">%s</text>\n),
+ SX($i*$tick_inc+$xx0c_off - $v_offset0),SY(28)+$tick_length-1,$numstr);
+ }
+ }
+ # axis 0, reversed coordinates: count ticks down from the right
+ else {
+ $xx0c_off += $offset0;
+ # if $x0_rev need to know $x0_max_ticks
+ my $x0_max_ticks = int(($n0+$offset0)/$tick_inc);
+
+ newline("stroke=\"black\" stroke-width=\"1.5\"");
+ for ($i=$x0_max_ticks; $i>0; $i--) {
+ move(SX($xx0c_off - $i*$tick_inc),SY(26));
+ draw(SX($xx0c_off - $i*$tick_inc),SY(26)+$tick_length);
+ }
+ clsline($n_max,$n_max,10000);
+
+ # now put in the numbers, using the same reverse counting
+ # NOTE(review): the labels subtract $v_offset0 but the ticks above
+ # do not -- confirm that this difference is intended
+ for ($i=$x0_max_ticks; $i>0; $i--) {
+ $numstr = sprintf("%ld",$i*$tick_inc*$x0f3);
+ printf(qq(<text x="%d" y="%d" font-family="sans-serif" font-size='9' text-anchor="middle">%s</text>\n),
+ SX($xx0c_off - $i*$tick_inc - $v_offset0),SY(28)+$tick_length-1,$numstr);
+ }
+ }
+
+ # axis 1, forward coordinates
+ unless ($x1_rev) {
+# $xx1c_off -= $offset1;
+ # draw down-tick
+ newline("stroke=\"black\" stroke-width=\"1.5\"");
+ for ($i=1; $i<=$max_ticks; $i++) {
+ last if ($i*$tick_inc > $n1 + $v_offset1);
+ next if ($i*$tick_inc < $v_offset1);
+ move(SX($i*$tick_inc + $xx1c_off - $v_offset1),SY(0));
+ draw(SX($i*$tick_inc + $xx1c_off - $v_offset1),SY(0)+$tick_length);
+ }
+ clsline($n_max,$n_max,10000);
+
+ # $num_len is computed but not used afterwards
+ $numstr = sprintf("%ld",$tick_inc*$x0f3);
+ $num_len = length($numstr);
+
+ for ($i=1; $i<=$max_ticks; $i++) {
+ last if ($i*$tick_inc > $n1 + $offset1);
+ next if ($i*$tick_inc < $offset1);
+ $numstr = sprintf("%ld",$i*$tick_inc*$x1f3);
+ printf(qq(<text x="%d" y="%d" font-family="sans-serif" font-size='9' text-anchor="middle">%s</text>\n),
+ SX($i*$tick_inc+$xx1c_off-$v_offset1),SY(0)+$tick_length+8,$numstr);
+ }
+ }
+ # axis 1, reversed coordinates
+ else {
+ my $x1_max_ticks = int($n1/$tick_inc);
+ $xx1c_off += $offset1;
+
+ # draw down-tick
+ newline("stroke=\"black\" stroke-width=\"1.5\"");
+ for ($i=$x1_max_ticks; $i>0; $i--) {
+ move(SX($xx1c_off - $i*$tick_inc),SY(0));
+ draw(SX($xx1c_off - $i*$tick_inc),SY(0)+$tick_length);
+ }
+ clsline($n_max,$n_max,10000);
+
+ # $num_len is computed but not used afterwards
+ $numstr = sprintf("%ld",$tick_inc*$x0f3);
+ $num_len = length($numstr);
+
+ for ($i=$x1_max_ticks; $i>0; $i--) {
+ $numstr = sprintf("%ld",$i*$tick_inc*$x1f3);
+ printf(qq(<text x="%d" y="%d" font-family="sans-serif" font-size='9' text-anchor="middle">%s</text>\n),
+ SX($xx1c_off - $i*$tick_inc),SY(0)+$tick_length+8,$numstr);
+ }
+ }
+}
+
+# xaxis_d($n, $offset) -- Perl port of: void xaxis(long n, int offset, char *title)
+# Draws tick marks and numeric labels along the x axis as SVG.
+#   $n      - axis length in sequence units
+#   $offset - coordinate of the axis origin, added to the printed labels
+# Relies on the file-level @tarr, SX(), SY(), newline(), move(), draw() and
+# clsline(); @tarr is presumably sorted ascending -- TODO confirm.
+sub xaxis_d {
+  my ($n, $offset) = @_;
+
+  my $tick = 2;   # tick length
+
+  # pick the first increment in @tarr that gives fewer than 6 divisions;
+  # fall back to the last element when none qualifies
+  my $t_idx = scalar(@tarr) - 1;
+  for my $k (0 .. $#tarr) {
+    if ($n / $tarr[$k] < 6) {
+      $t_idx = $k;
+      last;
+    }
+  }
+
+  # js is the tick increment
+  my $js = $tarr[$t_idx];
+
+  # jm is the number of whole increments that fit in $n.  The C original
+  # used integer division; without int() the "last" label below was placed
+  # at $n - $jo instead of on the final tick.
+  my $jm = int($n / $js);
+
+  # jo is the part of the offset that is not a whole number of increments
+  my $jo = $offset % $js;
+
+  # jl is the label base: the offset rounded down to a multiple of $js (the
+  # C code got this from integer division; Perl's "/" is floating point, so
+  # the old $offset/$js*$js was just $offset and labels were off by $jo).
+  my $jl = $offset - $jo;
+
+  my $text_fmt = qq(<text x="%d" y="%d" font-family="sans-serif" font-size='9' text-anchor="middle">%s</text>\n);
+
+  newline("stroke=\"black\" stroke-width=\"1.5\"");
+  for my $i (1 .. $jm) {
+    move(SX($i*$js - $jo),SY(0));
+    draw(SX($i*$js - $jo),SY(0)+$tick);
+  }
+  clsline($n,$n,10000);
+
+  # labels wider than 4 characters: only the first and last ticks are labeled
+  my @labeled = (length(sprintf("%ld",$js + $jl)) > 4) ? (1, $jm) : (1 .. $jm);
+  for my $i (@labeled) {
+    printf($text_fmt, SX($i*$js-$jo), SY(0)+$tick+8, sprintf("%ld",$i*$js+$jl));
+  }
+}
+
+# parse_regions($region_str)
+# Split a newline-separated annotation string into its "Region" records and
+# return a reference to a list of hashes (descr, color, beg0/end0/beg1/end1,
+# plus every key=value score).  Also updates %dom_colors and $max_color.
+sub parse_regions {
+  my ($region_text) = @_;
+
+  my @kept = ();
+
+  foreach my $rec (split(/\n\s*/, $region_text)) {
+    $rec =~ s/^\s+//;
+    if ($rec !~ m/^Region/) { next; }
+
+    my @parts = split(/\s+:\s*/, $rec);
+    my ($descr, $color) = @parts[-2,-1];
+    my %info = (descr => $descr, color => $color);
+
+    # remember the first color seen for this description, and the largest
+    $dom_colors{$descr} = $color unless defined($dom_colors{$descr});
+    $max_color = $color if ($color > $max_color);
+
+    # the second field holds the ";"-separated key=value scores
+    foreach my $pair (split(/;\s*/, $parts[1])) {
+      my ($name, $val) = split(/=/, $pair);
+      $info{$name} = $val;
+    }
+
+    # this line hides low-score NODOMs
+    next if ($info{color}==0 && $info{Q} < 30.0);
+
+    # the first field carries both coordinate ranges
+    @info{qw(beg0 end0 beg1 end1)} = ($parts[0] =~ m/^Region:\s*(\d+)-(\d+):(\d+)-(\d+)$/);
+
+    push @kept, \%info;
+  }
+  return \@kept;
+}
+
+# parse_domains($domain_str)
+# Parse the tab-separated "qDomain"/"lDomain" lines of $domain_str (lines
+# whose last field contains NODOM are dropped) and return two list
+# references, (query domains, library domains) -- "q"/"l" naming presumed.
+# Each entry is a hash with beg, end, descr and color; beg/end are shifted
+# and clipped using %args.  Also updates %dom_colors and $max_color.
+sub parse_domains {
+  my ($domain_text) = @_;
+
+  my (@q_doms, @l_doms);
+  # per sequence (q or l): [ result list, %args start key, %args stop key ]
+  my %side = (
+    q => [ \@q_doms, 'q_cstart', 'q_cstop' ],
+    l => [ \@l_doms, 'l_cstart', 'l_cstop' ],
+  );
+
+  foreach my $rec (split(/\n\s*/, $domain_text)) {
+    $rec =~ s/^\s+//;
+    if ($rec !~ m/^[ql]Domain/) { next; }
+
+    my @cols = split(/\t/, $rec);
+    if ($cols[-1] =~ m/NODOM/) { next; }
+
+    my %info = ();
+    @info{qw(beg end)} = $cols[1] =~ m/(\-?\d+)\-(\-?\d+)/;
+    @info{qw(descr color)} = split(/ :/,$cols[-1]);
+
+    # remember the first color seen for this description, and the largest
+    $dom_colors{$info{descr}} = $info{color} unless defined($dom_colors{$info{descr}});
+    $max_color = $info{color} if ($info{color} > $max_color);
+
+    # $rec starts with "q" or "l" (tested above), hence so does $cols[0]
+    my ($bucket, $start_key, $stop_key) = @{$side{substr($cols[0], 0, 1)}};
+
+    # shift by the start coordinate + 1, then clip to 1 .. stop
+    $info{beg} -= $args{$start_key} + 1;
+    $info{end} -= $args{$start_key} + 1;
+    if ($info{end} < 1) { next; }
+    if ($info{beg} < 1) { $info{beg} = 1; }
+    if ($info{beg} > $args{$stop_key}) { next; }
+    if ($info{end} > $args{$stop_key}) { $info{end} = $args{$stop_key}; }
+
+    push @{$bucket}, \%info;
+  }
+
+  return (\@q_doms, \@l_doms);
+}
+
+# max($x0, $x1)
+# Return the numerically larger argument ($x0 when they compare equal).
+sub max {
+  my ($first, $second) = @_;
+  return ($first >= $second) ? $first : $second;
+}
diff --git a/scripts/summ_domain_ident.pl b/scripts/summ_domain_ident.pl
new file mode 100755
index 0000000..85e9c55
--- /dev/null
+++ b/scripts/summ_domain_ident.pl
@@ -0,0 +1,97 @@
+#!/usr/bin/perl -w
+
+################################################################
+# copyright (c) 2014 by William R. Pearson and The Rector &
+# Visitors of the University of Virginia */
+################################################################
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under this License is distributed on an "AS
+# IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+################################################################
+
+# summ_domain_ident.pl takes a -m 8CC result file with query-annotated
+# domains, and produces a tab-delimited summary of identity across the domains
+# parse:
+# sp|P09488|GSTM1_HUMAN gi|121735|sp|P09488.3|GSTM1_HUMAN 100.00 218 0 0 1 218 1 218 2.9e-113 408.2 218M |RX:1-12:1-12:s=64;b=25.0;I=1.000;Q=47.5;C=exon_1|RX:13-37:13-37:s=128;b=49.9;I=1.000;Q=121.4;C=exon_2|RX:38-59:38-59:s=125;b=48.7;I=1.000;Q=117.9;C=exon_3|RX:60-86:60-86:s=145;b=56.5;I=1.000;Q=141.0;C=exon_4|RX:87-120:87-120:s=185;b=72.1;I=1.000;Q=187.2;C=exon_5|RX:121-152:121-152:s=174;b=67.8;I=1.000;Q=174.5;C=exon_6|RX:153-189:153-189:s=197;b=76.8;I=1.000;Q=201.0;C=exon_7|RX:190-21 [...]
+
+
+
+use strict;
+use Getopt::Long;
+use Pod::Usage;
+
+my ($shelp, $help) = (0, 0);
+
+GetOptions(
+  "h|?" => \$shelp,
+  "help" => \$help,
+  );
+
+pod2usage(1) if $shelp;
+pod2usage(exitstatus => 0, verbose => 2) if $help;
+pod2usage(1) unless @ARGV;
+
+my $first_line =1;  # true until the header row has been printed
+
+# names given, in order, to the ';'-separated parts of one annotation
+my @a_field_names = qw( score_field bits id qval comment );
+my @domain_names = ();  # output column order, filled from the first data line only
+
+while (my $line = <>) {
+  next if $line =~ m/^#/;
+  chomp($line);
+
+  # tab-separated fields: [1] subject, [2] percent identity, [-1] annotations
+  my @fields = split(/\t/,$line);
+  $fields[1] =~ s/^gi\|\d+\|//;   # drop a leading gi|NNN|
+  $fields[1] =~ s/\.\d+\|/\|/;    # e.g. P09488.3| -> P09488|
+
+  my @annots = split(/\|/,$fields[-1]);
+  shift @annots; # first is blank
+
+  # $annots[...]:
+  # RX:1-12:1-12:s=64;b=25.0;I=1.000;Q=47.5;C=exon_1
+
+  my %dom_ids = ();  # domain name (C= value) => identity (I= value) for this line
+
+  for my $annot (@annots) {
+    my %a_fields = ();
+    @a_fields{@a_field_names} = split(/;/,$annot);
+    $a_fields{'id'} =~ s/^I=//;
+    $a_fields{'comment'} =~ s/^C=//;
+    $dom_ids{$a_fields{'comment'}} = $a_fields{'id'};
+
+    if ($first_line) {
+      push @domain_names, $a_fields{'comment'};
+    }
+  }
+  if ($first_line) {
+    print join("\t",("subj_acc ","ident",@domain_names)),"\n";
+    $first_line = 0;
+  }
+
+  # append '+' when 100 * domain identity >= $fields[2], '-' otherwise
+  for my $dom ( @domain_names ) {
+    if (defined($dom_ids{$dom})) {
+      if (100.0 * $dom_ids{$dom} >= $fields[2]) {
+        $dom_ids{$dom} .= '+';
+      }
+      else {
+        $dom_ids{$dom} .= '-';
+      }
+    }
+    else {
+      $dom_ids{$dom} = '';
+    }
+  }
+
+  print join("\t",($fields[1],$fields[2],@dom_ids{@domain_names})),"\n";
+}
diff --git a/scripts/test_ann_scripts.sh b/scripts/test_ann_scripts.sh
new file mode 100755
index 0000000..eb9f299
--- /dev/null
+++ b/scripts/test_ann_scripts.sh
@@ -0,0 +1,30 @@
+#!/bin/sh
+
+## a script to test the annotation scripts ann_*.pl
+## acc_examples contains:
+# P09488 -- new NCBI format
+# sp|P09488 --more traditional format
+# up|P09488|GSTM1_HUMAN
+# SP:GSTM1_HUMAN P09488 ---ebi searches with accession
+# SP:GSTM1_HUMAN -- ebi searches without accession
+##
+
+if [ ! $1=='' ]; then
+ script_file=$1
+else
+ script_file=ann_script_list
+fi
+
+if [ ! $1=='' ]; then
+ ex_file=$1
+else
+ ex_file=acc_examples
+fi
+
+for script in `cat $script_file `; do
+ for acc_type in `cat $ex_file`; do
+ echo $script ${acc_type}
+ $script ${acc_type}
+ done
+ echo '***DONE***' $script `date`
+done
diff --git a/seq/bovgh.seq b/seq/bovgh.seq
new file mode 100644
index 0000000..f30b344
--- /dev/null
+++ b/seq/bovgh.seq
@@ -0,0 +1,38 @@
+>BOVGH bovine growth hormone (presomatotropin) gene and flanks.
+ AAAACCTATG GGGTGGGCTC TCAAGCTGAG ACCCTGTGTG CACAGCCCTC TGGCTGGTGG
+ CAGTGGAGAC GGGATNNNAT GACAAGCCTG GGGGACATGA CCCCAGAGAA GGAACGGGAA
+ CAGGATGAGT GAGAGGAGGT TCTAAATTAT CCATTAGCAC AGGCTGCCAG TGGTCCTTGC
+ ATAAATGTAT AGAGCACACA GGTGGGGGGA AAGGGAGAGA GAGAAGAAGC CAGGGTATAA
+ AAATGGCCCA GCAGGGACCA ATTCCAGGAT CCCAGGACCC AGTTCACCAG ACGACTCAGG
+ GTCCTGTGGA CAGCTCACCA GCTATGATGG CTGCAGGTAA GCTCGCTAAA ATCCCCTCCA
+ TTCGCGTGTC CTAAAGGGGT AATGCGGGGG GCCCTGCCGA TGGATGTGTT CAGAGCTTTG
+ GGCTTTAGGG CTTCCGAATG TGAACATAGG TATCTACACC CAGACATTTG GCCAAGTTTG
+ AAATGTTCTC AGTCCCTGGA GGGAAGGGTA GGTGGGGGCT GGCAGGAGAT CAGGCGTCTA
+ GCTCCCTGGG GCCCTCCGTC GCGGCCCTCC TGGTCTCTCC CTAGGCCCCC GGACCTCCCT
+ GCTCCTGGCT TTCGCCCTGC TCTGCCTGCC CTGGACTCAG GTGGTGGGCG CCTTCCCAGC
+ CATGTCCTTG TCCGGCCTGT TTGCCAACGC TGTGCTCCGG GCTCAGCACC TGCATCAGCT
+ GGCTGCTGAC ACCTTCAAAG AGTTTGTAAG CTCCCGAGGG ATGCGTCCTA GGGGTGGGGA
+ GGCAGGAAGG GGTGAATCCA CACCCCCTCC ACACAGTGGG AGGAAACTGA GGAGTTCAGC
+ CGTATTTTAT CCAAGTAGGG ATGTGGTTAG GGGAGCAGAA ACGGGGGTGT GTGGGGTGGG
+ GAGGGTTCCG AATAAGGCGG GGAGGGGAAC CGCGCACCAG CTTAGACCTG GGTGGGTGTG
+ TTCTTCCCCC AGGAGCGCAC CTACATCCCG GAGGGACAGA GATACTCCAT CCAGAACACC
+ CAGGTTGCCT TCTGCTTCTC TGAAACCATC CCGGCCCCCA CGGGCAAGAA TGAGGCCCAG
+ CAGAAATCAG TGAGTGGCAA CCTCGGACCG AGGAGCAGGG GACCTCCTTC ATCCTAAGTA
+ GGCTGCCCCA GCTCTCCGCA CCGGGCCTGG GGCGGCCTTC TCCCCGAGGT GGCGGAGGTT
+ GTTGGATGGC AGTGGAGGAT GATGGTGGGC GGTGGTGGCA GGAGGTCCTC GGGCAGAGGC
+ CGACCTTGCA GGGCTGCCCC AAGCCCGCGG CACCCACCGA CCACCCATCT GCCAGCAGGA
+ CTTGGAGCTG CTTCGCATCT CACTGCTCCT CATCCAGTCG TGGCTTGGGC CCCTGCAGTT
+ CCTCAGCAGA GTCTTCACCA ACAGCTTGGT GTTTGGCACC TCGGACCGTG TCTATGAGAA
+ GCTGAAGGAC CTGGAGGAAG GCATCCTGGC CCTGATGCGG GTGGGGATGG CGTTGTGGGT
+ CCCTTCCATG CTGGGGGCCA TGCCCGCCCT CTCCTGGCTT AGCCAGGAGA ATGCACGTGG
+ GCTTGGGGAG ACAGATCCCT GCTCTCTCCC TCTTTCTAGC AGTCCAGCCT TGACCCAGGG
+ GAAACCTTTT CCCCTTTTGA AACCTCCTTC CTCGCCCTTC TCCAAGCCTG TAGGGGAGGG
+ TGGAAAATGG AGCGGGCAGG AGGGAGCTGC TCCTGAGGGC CCTTCGGCCT CTCTGTCTCT
+ CCCTCCCTTG GCAGGAGCTG GAAGATGGCA CCCCCCGGGC TGGGCAGATC CTCAAGCAGA
+ CCTATGACAA ATTTGACACA AACATGCGCA GTGACGACGC GCTGCTCAAG AACTACGGTC
+ TGCTCTCCTG CTTCCGGAAG GACCTGCATA AGACGGAGAC GTACCTGAGG GTCATGAAGT
+ GCCGCCGCTT CGGGGAGGCC AGCTGTGCCT TCTAGTTGCC AGCCATCTGT TGTTTGCCCC
+ TCCCCCGTGC CTTCCTTGAC CCTGGAAGGT GCCACTCCCA CTGTCCTTTC CTAATAAAAT
+ GAGGAAATTG CATCGCATTG TCTGAGTAGG TGTCATTCTA TTCTGGGGGG TGGGGTGGGG
+ CAGGACAGCA AGGGGGAGGA TTGGGAAGAC AATAGCAGGC ATGCTGGGGA TGCGGTGGGC
+ TCTATGGGTA CCCAGGTGCT GAAGAATTGA CCCGGTTCCT CCTGGG
diff --git a/seq/bovprl.seq b/seq/bovprl.seq
new file mode 100644
index 0000000..c7f4f11
--- /dev/null
+++ b/seq/bovprl.seq
@@ -0,0 +1,17 @@
+>BOVPRL GenBank entry BOVPRL from omam file. 907 nucleotides.
+TGCTTGGCTGAGGAGCCATAGGACGAGAGCTTCCTGGTGAAGTGTGTTTCTTGAAATCAT
+CACCACCATGGACAGCAAAGGTTCGTCGCAGAAAGGGTCCCGCCTGCTCCTGCTGCTGGT
+GGTGTCAAATCTACTCTTGTGCCAGGGTGTGGTCTCCACCCCCGTCTGTCCCAATGGGCC
+TGGCAACTGCCAGGTATCCCTTCGAGACCTGTTTGACCGGGCAGTCATGGTGTCCCACTA
+CATCCATGACCTCTCCTCGGAAATGTTCAACGAATTTGATAAACGGTATGCCCAGGGCAA
+AGGGTTCATTACCATGGCCCTCAACAGCTGCCATACCTCCTCCCTTCCTACCCCGGAAGA
+TAAAGAACAAGCCCAACAGACCCATCATGAAGTCCTTATGAGCTTGATTCTTGGGTTGCT
+GCGCTCCTGGAATGACCCTCTGTATCACCTAGTCACCGAGGTACGGGGTATGAAAGGAGC
+CCCAGATGCTATCCTATCGAGGGCCATAGAGATTGAGGAAGAAAACAAACGACTTCTGGA
+AGGCATGGAGATGATATTTGGCCAGGTTATTCCTGGAGCCAAAGAGACTGAGCCCTACCC
+TGTGTGGTCAGGACTCCCGTCCCTGCAAACTAAGGATGAAGATGCACGTTATTCTGCTTT
+TTATAACCTGCTCCACTGCCTGCGCAGGGATTCAAGCAAGATTGACACTTACCTTAAGCT
+CCTGAATTGCAGAATCATCTACAACAACAACTGCTAAGCCCACATTCCATCCTATCCATT
+TCTGAGATGGTTCTTAATGATCCATTCCCTGGCAAACTTCTCTGAGCTTTATAGCTTTGT
+AATGCATGCTTGGCTCTAATGGGTTTCATCTTAAATAAAAACAGACTCTGTAGCGATGTC
+AAAATCT
diff --git a/seq/dna_test_s.nlib b/seq/dna_test_s.nlib
new file mode 100644
index 0000000..e8de1cd
--- /dev/null
+++ b/seq/dna_test_s.nlib
@@ -0,0 +1,47 @@
+>RABGLTR Oryctolagus cuniculus glutathione S-transferase mRNA, complete cds.
+ CGGCAGCTCC TGTGGACTCA GAGGAGCTGC ACCATGCCCA TGACGCTGGG TTACTGGGAC
+ GTCCGTGGGC TGGCTCTGCC AATCCGCATG CTCCTGGAAT ACACGGACAC CAGCTATGAG
+ GAAAAGAAAT ACACCATGGG GGATGCTCCC AACTATGACC AAAGCAAGTG GCTGAGTGAG
+ AAGTTCACCC TGGGCCTGGA CTTTCCCAAT CTGCCCTACC TAATTGATGG GACTCACAAG
+ CTCACGCAGA GCAACGCCAT CCTGCGCTAC CTGGCCCGCA AGCACGGCCT GTGTGGGGAG
+ ACGGAAGAGG AGAGGATTCG CGTGGACATT CTGGAGAATC AGCTGATGGA CAACCGCTTC
+ CAACTTGTAA ACGTCTGCTA CAGTCCCGAC TTTGAGAAGC TCAAGCCCGA GTACCTGAAG
+ GGGCTCCCTG AGAAGCTGCA GCTGTACTCG CAGTTCCTGG GAAGCCTCCC CTGGTTCGCA
+ GGGGACAAGA TCACCTTCGC CGATTTCCTT GTCTACGACG TTCTTGACCA GAACCGGATA
+ TTTGTGCCTG GGTGCCTGGA CGCGTTCCCA AACCTGAAGG ACTTTCATGT CCGCTTTGAG
+ GGCCTGCCGA AGATCTCTGC CTACATGAAG TCCAGCCGCT TTATCCGAGT CCCTGTGTTT
+ TTAAAGAAGG CCACGTGGAC GGGAATATAG GGCCCTGGAA GGAGGTGGGC CATCCCCTGG
+ GAGCTCAGGT CTCCCAGCCT CTTGCTCATC TTCCTCAACC TTCCCAAAAA CAAAAGCCTA
+ CTGCCTGCTT GTGTTCTGAG CCAGCCCCTC CCATGCAGGC TCTGGCCAGC TCAGAAACCC
+ ACCCTTCTAG CCATGGGCTC TCTAAGGCTG CTCTTCCCGG ACTAAGCAGA CCCCACGGGC
+ CACATCTCTC TTCGTGGGCT CCGTTTGATC TCCCCGACTG CCAGAATCAT GGTTGTACCT
+ GCTGCGGCCC TATTCCCAGG CGGGACTCCC CAGTGCTGTT TGGTCCCCAG GAGGGCCTGA
+ CCTCAGCCAG GGCCCTTCTT ACCCCTCCCT GTGTTGCACT GGAGTGGGCG CTGACTGTGC
+ AGACCTTGGG GGGGTTTCTT TGTTCTGCTG CCCACAGCAT GGCTGGGTGG GGCAGGATTA
+ GTGTGGGGGG AGTTGGGTGC TCAGGCAGGG CTATGAGGGA TCTTGTTCAT TTCCGGGCCC
+ TATCCATGTG CTCTGCTCCT CGCCCTGGGT TTTCTCCTCT GCCCGGGTTC CTCGTTCCTT
+ CACCCTGGAG GGAGGCCAGG GCCACGTGCA GCCGTGCCGG GTTCTGAGAG CGCTGGGCTG
+ ATGGGGACGG GGCTGAGCAG GCTTGAGCAG ACCCCTCTGT CACCATCTCC CGGAAGCTTT
+ CAGCTGATAC AGATGCTCCT CGTCTATAGT TTCAGGATGT TTCTCAATAA AACATCCCAC
+ TGT
+>EYKX4VC01BO0UO length=222 xy=0577_3838 region=1 run=R_2007_11_07_16_15_57_
+AGACCTGGGACAGCGGCGGGCTGCTGAAGCCGCAGGCGATAGAGGACAAACTGCAGTACC
+GCTTCTGGCTGCACTATGCCGAAGGCTCGCTGATGCCGCTGCTGTTAATGAAGCTGGTGT
+TCGCCAGCCTGGGTAAACCCCCTGTGCCCTTTGGCGTCCGCTCGCTGGGCGCCCTGCTGG
+GCAAGGGCATTCAGAAAGCGTGGCTGGATCCCCAGCTGGCCA
+>mgstm1 reverse complement
+AAGTGTGTTTCAGACTTTATTGTAGACGAGACAGACCTGGGGAGGCTACTCCACCAGGAACAGGCTGGCACTCAA
+GTATTGACTTCGGGGTAACTCTAGGGAGGGCTAGCACTAAGATAGTGTTGACCATCGGGGTAATTCTAGGAAGCG
+TGAGTTCAGGACAGACCTCAGTTCGCAGAAACGGGCTGTGAGGTTGGGTCAGGGAGCCAATGAAGAAGGGGCCAT
+gtgaaagaaactggggagaatgaaggctgtgtggacttgactgggaagagggtgaggagatggggctgaccaagc
+tgcagaagggagcgggaaggagagagaaccaggagccacagtgcagaaggccagggtgctgtccccacccagggc
+CTGCAGGATCCCCAGTGTGGACAGGTCCTCCTAGTGAGTGCCCGTGTAGCAAGGGCCTACTTGTTACTCCAGTGG
+GCCATCTTTGAAAATATAGGTGTTGCGATGTAGCGGCTACTCTTCATGTAGGCAGAGATCTTCTTGAGGCCCTCG
+AAGCGGGCCAGGAAGTCCCTCAGGTTTGGGAAGGCGTCCAGGCACTTGGGCTCAAACATACGGTACTGGTCAAGA
+ATGTCATAAGCAAGGAAATCCACATAGGTGACCTTGTCCCCTGCAAACCATGGCCTCTTGCCCAGGAACTCAGAG
+tagagcttcattttctcagggatggtcttcaagaactctggcttctgcttctcaaagtcagggttgtaacagagc
+atgatgagctgcatgcgggtgtccatgacctggttctccacaatgtctgcacggatcctctcctcctctgtctct
+ccatccaggtggtgctttcgggcaaggtagcgcaggatggcattgctctgggtgatcttgtgtgatccatcgatc
+AAGTAAGGCAGATTGGGAAAGTCCAGGCCCAGCTTGAACTTCTCATTCAGCCACTGGCTTCTGTCAAAGTCGGGA
+GCGTCACCCATGGTGTATCTCTTCTCATCATAGCTTGAGTCTGTGTATTCCAGGAGCATGCGGATCGGGTGTGTC
+AGTCCGCGGACGTTCCAGTATCCCAGTATCATAGGCATGGTGCTGGTGCTGTGGTCTTCTCAAACTGGCTTCAGC
diff --git a/seq/dyr_human.aa b/seq/dyr_human.aa
new file mode 100644
index 0000000..bb5fcdc
--- /dev/null
+++ b/seq/dyr_human.aa
@@ -0,0 +1,4 @@
+>gi|118992|sp|P00374.2|DYR_HUMAN RecName: Full=Dihydrofolate reductase
+MVGSLNCIVAVSQNMGIGKNGDLPWPPLRNEFRYFQRMTTTSSVEGKQNLVIMGKKTWFSIPEKNRPLKGRINLVLSREL
+KEPPQGAHFLSRSLDDALKLTEQPELANKVDMVWIVGGSSVYKEAMNHPGHLKLFVTRIMQDFESDTFFPEIDLEKYKLL
+PEYPGVLSDVQEEKGIKYKFEVYEKND
diff --git a/seq/egmsmg.aa b/seq/egmsmg.aa
new file mode 100644
index 0000000..e3ec155
--- /dev/null
+++ b/seq/egmsmg.aa
@@ -0,0 +1,19 @@
+>EGMSMG Epidermal growth factor precursor - Mouse
+MPWGRRPTWLLLAFLLVFLKISILSVTAWQTGNCQPGPLERSERSGTCAGPAPFLVFSQGKSISRIDPDG
+TNHQQLVVDAGISADMDIHYKKERLYWVDVERQVLLRVFLNGTGLEKVCNVERKVSGLAIDWIDDEVLWV
+DQQNGVITVTDMTGKNSRVLLSSLKHPSNIAVDPIERLMFWSSEVTGSLHRAHLKGVDVKTLLETGGISV
+LTLDVLDKRLFWVQDSGEGSHAYIHSCDYEGGSVRLIRHQARHSLSSMAFFGDRIFYSVLKSKAIWIANK
+HTGKDTVRINLHPSFVTPGKLMVVHPRAQPRTEDAAKDPDPELLKQRGRPCRFGLCERDPKSHSSACAEG
+YTLSRDRKYCEDVNECATQNHGCTLGCENTPGSYHCTCPTGFVLLPDGKQCHELVS
+CPGNVSKCSHGCVLTSDGPRCICPAGSVLGRDGKTCTGCSSPDNGGCSQICLPLRPGSWECDCFPGYDLQ
+SDRKSCAASGPQPLLLFANSQDIRHMHFDGTDYKVLLSRQMGMVFALDYDPVESKIYFAQTALKWIERAN
+MDGSQRERLITEGVDTLEGLALDWIGRRIYWTDSGKSVVGGSDLSGKHHRIIIQERISRPRGIAVHPRAR
+RLFWTDVGMSPRIESASLQGSDRVLIASSNLLEPSGITIDYLTDTLYWCDTKRSVIEMANLDGSKRRRLI
+QNDVGHPFSLAVFEDHLWVSDWAIPSVIRVNKRTGQNRVRLQGSMLKPSSLVVVHPLAKPGADPCLYRNG
+GCEHICQESLGTARCLCREGFVKAWDGKMCLPQDYPILSGENADLSKEVTSLSNST
+QAEVPDDDGTESSTLVAEIMVSGMNYEDDCGPGGCGSHARCVSDGETAECQCLKGFARDGNLCSDIDECV
+LARSDCPSTSSRCINTEGGYVCRCSEGYEGDGISCFDIDECQRGAHNCAENAACTNTEGGYNCTCAGRPS
+SPGRSCPDSTAPSLLGEDGHHLDRNSYPGCPSSYDGYCLNGGVCMHIESLDSYTCNCVIGYSGDRCQTRD
+LRWWELRHAGYGQKHDIMVVAVCMVALVLLLLLGMWGTYYYRTRKQLSNPPKNPCDEPSGSVSSSGPDSS
+SGAAVASCPQPWFVVLEKHQDPKNGSLPADGTNGAVVDAGLSPSLQLGSVHLTSWRQKPHIDGMGTGQSC
+WIPPSSDRGPQEIEGNSHLPSYRPVGPEKLHSLQSANGSCHERAPDLPRQTEPVK
diff --git a/seq/grou_drome.pseg b/seq/grou_drome.pseg
new file mode 100644
index 0000000..618f592
--- /dev/null
+++ b/seq/grou_drome.pseg
@@ -0,0 +1,14 @@
+>gi|121620|sp|P16371|GROU_DROME GROUCHO PROTEIN (ENHANCER OF SPLIT M9/10)
+MYPSPVRHpaaggpppqgpIKFTIADTLERIKEEFNFLQAHYHSIKLECEKLSNEKTEMQ
+RHYVMYYEMSYGLNVEMHKQTEIAKRLNTLINQLLPFLQADHQQQVLQAVERAKQVTMQE
+LNLIIGQQIHAqqvpggppqpmgALNPFGALGATMGLPHGPQGLLNKPPEHHRPDIKPTG
+LEGPAAAEERLRNSVSPADREKYRTRSPLDIENDSKRRKDEKLQEDEGEKSDQDLVVDVA
+NEMESHSPRPNGEHVSMEVRDRESLNGERLEKPSSSGIKQErppsrsgssssrstpsLKT
+KDMEKPGTPGakartptpnaaapapgvnpkqmmpqgpppagypgapyqrpaDPYQRPPSD
+PAYGRPPPMPYDPHAHVRTNGIPHPSALTGGKPAYSFHMNGEGSLQPVPFPPDALVGVGI
+PRHARQINTLSHGEVVCAVTISNPTKYVYTGGKGCVKVWDISQPGNKNPVSQLDCLQRDN
+YIRSVKLLPDGRTLIVGGEASNLSIWDLASPTPRIKAELTSAAPACYALAISPDSKVCFS
+CCSDGNIAVWDLHNEILVRQFQGHTDGASCIDISPDGSRLWTGGLDNTVRSWDLREGRQL
+QQHDFSSQIFSLGYCPTGDWLAVGMENSHVEVLHASKPDKYQLHLHESCVLSLRFAACGK
+WFVSTGKDNLLNAWRTPYGASIFQSKETSSVLSCDISTDDKYIVTGSGDKKATVYEVIY
+
diff --git a/seq/gst.nlib b/seq/gst.nlib
new file mode 100644
index 0000000..2f04a8e
--- /dev/null
+++ b/seq/gst.nlib
@@ -0,0 +1,284 @@
+>pGT875 | 266
+GCTGAAGCCAGTTTGAGAAGACCACAGCACCAGCACCATGCCTATGATACTGGGATACTG
+GAACGTCCGCGGACTGACACACCCGATCCGCATGCTCCTGGAATACACAGACTCAAGCTA
+TGATGAGAAGAGATACACCATGGGTGACGCTCCCGACTTTGACAGAAGCCAGTGGCTGAA
+TGAGAAGTTCAAGCTGGGCCTGGACTTTCCCAATCTGCCTTACTTGATCGATGGATCACA
+CAAGATCACCCAGAGCAATGCCATCCTGCGCTACCTTGCCCGAAAGCACCACCTGGATGG
+AGAGACAGAGGAGGAGAGGATCCGTGCAGACATTGTGGAGAACCAGGTCATGGACACCCG
+catgcagctcatcatgctctgttacaaccctgactttgagaagcagaagccagagttctt
+gaagaccatccctgagaaaatgaagctctactctgagttcctgggcaagaggccatggtt
+tgcaggggacaaggtcacctatgtggatttccttgcttatgacattcttgaccagtaccg
+tatgtttgagcccaagtgcctggacgccttcccaaacctgagggacttcctggcccgctt
+cgagggcctcaagaagatctctgcctacatgaagagtagccgctacatcgcaacacctat
+ATTTTCAAAGATGGCCCACTGGAGTAACAAGTAGGCCCTTGCTACACGGGCACTCACTAG
+GAGGACCTGTCCACACTGGGGATCCTGCAGGCCCTGGGTGGGGACAGCACCCTGGCCTTC
+TGCACTGTGGCTCCTGGTTCTCTCTCCTTCCCGCTCCCTTCTGCAGCTTGGTCAGCCCCA
+TCTCCTCACCCTCTTCCCAGTCAAGTCCACACAGCCTTCATTCTCCCCAGTTTCTTTCAC
+ATGGCCCCTTCTTCATTGGCTCCCTGACCCAACCTCACAGCCCGTTTCTGCGAACTGAGG
+TCTGTCCTGAACTCACGCTTCCTAGAATTACCCCGATGGTCAACACTATCTTAGTGCTAG
+CCCTCCCTAGAGTTACCCCGAAGTCAATACTTGAGTGCCAGCCTGTTCCTGGTGGAGTAG
+CCTCCCCAGGTCTGTCTCGTCTACAATAAAGTCTGAAACACACTT
+>RABGLTR Oryctolagus cuniculus glutathione S-transferase mRNA, complete cds.
+ CGGCAGCTCC TGTGGACTCA GAGGAGCTGC ACCATGCCCA TGACGCTGGG TTACTGGGAC
+ GTCCGTGGGC TGGCTCTGCC AATCCGCATG CTCCTGGAAT ACACGGACAC CAGCTATGAG
+ GAAAAGAAAT ACACCATGGG GGATGCTCCC AACTATGACC AAAGCAAGTG GCTGAGTGAG
+ AAGTTCACCC TGGGCCTGGA CTTTCCCAAT CTGCCCTACC TAATTGATGG GACTCACAAG
+ CTCACGCAGA GCAACGCCAT CCTGCGCTAC CTGGCCCGCA AGCACGGCCT GTGTGGGGAG
+ ACGGAAGAGG AGAGGATTCG CGTGGACATT CTGGAGAATC AGCTGATGGA CAACCGCTTC
+ CAACTTGTAA ACGTCTGCTA CAGTCCCGAC TTTGAGAAGC TCAAGCCCGA GTACCTGAAG
+ GGGCTCCCTG AGAAGCTGCA GCTGTACTCG CAGTTCCTGG GAAGCCTCCC CTGGTTCGCA
+ GGGGACAAGA TCACCTTCGC CGATTTCCTT GTCTACGACG TTCTTGACCA GAACCGGATA
+ TTTGTGCCTG GGTGCCTGGA CGCGTTCCCA AACCTGAAGG ACTTTCATGT CCGCTTTGAG
+ GGCCTGCCGA AGATCTCTGC CTACATGAAG TCCAGCCGCT TTATCCGAGT CCCTGTGTTT
+ TTAAAGAAGG CCACGTGGAC GGGAATATAG GGCCCTGGAA GGAGGTGGGC CATCCCCTGG
+ GAGCTCAGGT CTCCCAGCCT CTTGCTCATC TTCCTCAACC TTCCCAAAAA CAAAAGCCTA
+ CTGCCTGCTT GTGTTCTGAG CCAGCCCCTC CCATGCAGGC TCTGGCCAGC TCAGAAACCC
+ ACCCTTCTAG CCATGGGCTC TCTAAGGCTG CTCTTCCCGG ACTAAGCAGA CCCCACGGGC
+ CACATCTCTC TTCGTGGGCT CCGTTTGATC TCCCCGACTG CCAGAATCAT GGTTGTACCT
+ GCTGCGGCCC TATTCCCAGG CGGGACTCCC CAGTGCTGTT TGGTCCCCAG GAGGGCCTGA
+ CCTCAGCCAG GGCCCTTCTT ACCCCTCCCT GTGTTGCACT GGAGTGGGCG CTGACTGTGC
+ AGACCTTGGG GGGGTTTCTT TGTTCTGCTG CCCACAGCAT GGCTGGGTGG GGCAGGATTA
+ GTGTGGGGGG AGTTGGGTGC TCAGGCAGGG CTATGAGGGA TCTTGTTCAT TTCCGGGCCC
+ TATCCATGTG CTCTGCTCCT CGCCCTGGGT TTTCTCCTCT GCCCGGGTTC CTCGTTCCTT
+ CACCCTGGAG GGAGGCCAGG GCCACGTGCA GCCGTGCCGG GTTCTGAGAG CGCTGGGCTG
+ ATGGGGACGG GGCTGAGCAG GCTTGAGCAG ACCCCTCTGT CACCATCTCC CGGAAGCTTT
+ CAGCTGATAC AGATGCTCCT CGTCTATAGT TTCAGGATGT TTCTCAATAA AACATCCCAC
+ TGT
+>BTGST Bovine GST mRNA for gluthathione S-transferase, class-pi.
+ CGGCTCAGGC CGCCGCCGAG CGCGCTGGAA CTTTGCTGCC GCCGCCACCT TTACCGACTT
+ CCCCGACTCC AGGATGCCTC CCTACACCAT CGTCTACTTC CCGGTTCAAG GGCGCTGCGA
+ GGCCATGCGC ATGCTGCTGG CCGACCAGGG CCAGAGCTGG AAGGAGGAGG TCGTAGCCAT
+ GCAGAGCTGG CTGCAGGGCC CACTCAAGGC CTCCTGCCTG TACGGGCAGC TCCCCAAGTT
+ CCAGGACGGA GACCTCACGC TGTACCAGTC CAATGCCATC CTGCGGCACC TGGGCCGCAC
+ CCTCGGGCTG TATGGGAAGG ACCAGCAGGA GGCGGCCCTG GTGGACATGG TGAATGACGG
+ TGTAGAGGAC CTTCGCTGCA AATACGTCTC CCTCATTTAC ACCAACTACG AGGCGGGCAA
+ GGAGGACTAT GTGAAGGCGC TGCCCCAGCA CCTGAAGCCT TTCGAGACCC TGCTGTCCCA
+ GAACAAGGGT GGCCAGGCCT TCATCGTGGG CGACCAGATC TCCTTTGCGG ACTACAACCT
+ GCTGGACCTG CTTCGGATTC ACCAGGTCCT GGCCCCCAGC TGTCTGGACT CCTTCCCCCT
+ GCTCTCAGCC TACGTGGCCC GTCTCAACTC CCGGCCCAAG CTCAAGGCCT TCCTGGCCTC
+ CCCCGAGCAC ATGAACCGGC CCATCAACGG CAATGGGAAA CAGTGAGGGC TTGCAGCACT
+ CTCTGCTCGA GGCAGGGGGC TGCCTGCTCT TCCCTTTCCC CAGGACCAAT AAAACTTCCA
+ AGAGAGAAAA AAAAAAAAAA AAAAAAAAA
+>OCDHPR Rabbit mRNA for dihydropyridine (DHP) receptor (from skeletal
+ TTCCACCTAC ATGTTGGCCT GGACAGCAGG GAGCCGAGGG GAGGCTAATT TTACTGCTGG
+ GAGCAGCTAG CATAATCCTC CCGCCCCCAC CCCGCTGGCT CAGCAGGGCA GGCTTCGCCC
+ GGCAAGCTCA GCGGCCCAGT CCCCAAGGCG GGGAACACTG GGGACGCAGG GAAGAGAGGG
+ CCGCGGGGTG GGGGAGCAGC AGGAAGCGCC GTGGCCAGGG AAGCCATGGA GCCATCCTCA
+ CCCCAGGATG AGGGCCTGAG GAAGAAACAG CCCAAGAAGC CCCTGCCCGA GGTCCTGCCC
+ AGGCCGCCGC GGGCTCTGTT CTGCCTGACC CTGCAGAACC CGCTGAGGAA GGCGTGCATC
+ AGCATCGTGG AATGGAAACC CTTCGAGACC ATCATCCTGC TCACCATCTT TGCCAACTGT
+ GTGGCCCTGG CCGTGTACCT GCCCATGCCC GAGGATGACA ACAACTCCCT GAACCTGGGC
+ CTGGAGAAGC TGGAGTACTT CTTCCTCACC GTCTTCTCCA TCGAAGCCGC CATGAAGATC
+ ATCGCCTACG GCTTCCTGTT CCACCAGGAC GCCTACCTGC GCAGCGGCTG GAACGTGCTG
+ GACTTCATCA TCGTCTTCCT GGGGGTCTTC ACGGCGATTC TGGAACAGGT CAACGTCATC
+ CAGAGCAACA CGGCCCCGAT GAGCAGCAAA GGAGCCGGCC TGGACGTCAA GGCCCTGAGG
+ GCCTTCCGTG TGCTCAGACC CCTCCGGCTG GTGTCGGGGG TGCCTAGTTT GCAGGTGGTC
+ CTCAACTCCA TCTTCAAGGC CATGCTCCCC CTGTTCCACA TCGCCCTGCT CGTCCTCTTC
+ ATGGTCATCA TCTACGCCAT CATCGGGCTG GAGCTCTTCA AGGGCAAGAT GCACAAGACC
+ TGCTACTACA TCGGGACAGA CATCGTGGCC ACAGTGGAGA ATGAGAAGCC CTCGCCCTGC
+ GCTAGGACGG GCTCGGGGCG CCCCTGCACC ATCAACGGCA GCGAGTGCCG GGGCGGCTGG
+ CCGGGGCCCA ACCACGGCAT CACGCACTTC GACAACTTCG GCTTCTCCAT GCTCACCGTG
+ TACCAGTGCA TCACCATGGA GGGCTGGACA GATGTCCTCT ACTGGGTCAA CGATGCCATC
+ GGGAACGAGT GGCCCTGGAT CTACTTTGTC ACTCTCATCC TGCTGGGGTC CTTCTTCATC
+ CTCAACCTGG TGCTGGGCGT CCTGAGTGGG GAATTCACCA AGGAGCGGGA GAAGGCCAAG
+ TCCAGGGGAA CCTTCCAGAA GCTGCGGGAG AAGCAGCAGC TGGAGGAGGA CCTTCGGGGC
+ TACATGAGCT GGATCACGCA GGGCGAGGTC ATGGACGTGG AGGACCTGAG AGAAGGAAAG
+ CTGTCCTTGG AAGAGGGAGG CTCCGACACG GAAAGCCTGT ACGAAATCGA GGGCTTGAAC
+ AAAATCATCC AGTTCATCCG ACACTGGAGG CAGTGGAACC GTGTCTTTCG CTGGAAGTGC
+ CATGACCTGG TGAAGTCGAG AGTCTTCTAC TGGCTGGTCA TCCTGATCGT GGCCCTCAAC
+ ACCCTGTCCA TCGCCTCGGA GCACCACAAC CAGCCGCTCT GGCTGACCCA CTTGCAAGAC
+ ATCGCCAATC GAGTGCTGCT GTCACTCTTC ACCATCGAGA TGCTGCTGAA GATGTACGGG
+ CTGGGCCTGC GCCAGTACTT CATGTCCATC TTCAACCGCT TCGACTGCTT CGTGGTGTGC
+ AGCGGCATCC TGGAGCTGCT GCTGGTGGAG TCGGGCGCCA TGACGCCGCT GGGCATCTCC
+ GTGTTGCGCT GCATCCGCCT CCTGAGGCTC TTCAAGATCA CCAAGTACTG GACGTCGCTC
+ AGCAACCTGG TGGCCTCCCT GCTCAACTCC ATCCGCTCCA TCGCCTCGCT GCTGCTGCTG
+ CTCTTCCTCT TCATCATCAT CTTCGCCCTG CTGGGCATGC AGCTCTTCGG GGGGCGGTAC
+ GACTTCGAGG ACACGGAAGT GCGACGCAGC AACTTCGACA ACTTCCCCCA GGCCCTCATC
+ AGCGTCTTCC AGGTGCTGAC GGGTGAGGAC TGGAACTCCG TGATGTACAA CGGGATCATG
+ GCCTACGGAG GCCCGTCCTA CCCGGGCGTT CTCGTGTGCA TCTATTTCAT CATCCTTTTT
+ GTCTGCGGCA ACTATATCCT GCTGAATGTC TTCCTGGCCA TCGCCGTGGA CAACCTGGCC
+ GAGGCGGAGA GCCTGACTTC CGCGCAAAAG GCCAAGGCCG AGGAGAGGAA ACGCAGGAAG
+ ATGTCCAGGG GTCTCCCTGA CAAGACAGAG GAGGAGAAGT CTGTGATGGC CAAGAAGCTG
+ GAGCAGAAGC CCAAGGGGGA GGGCATCCCC ACCACTGCCA AGCTCAAGGT CGATGAGTTC
+ GAATCTAACG TCAACGAGGT GAAGGACCCC TACCCTTCAG CTGACTTCCC AGGGGATGAT
+ GAGGAGGACG AGCCTGAGAT CCCAGTGAGC CCCCGACCGC GCCCGCTGGC CGAGCTGCAG
+ CTCAAAGAGA AGGCAGTGCC CATCCCGGAA GCCAGCTCCT TCTTCATCTT CAGTCCCACC
+ AATAAGGTCC GTGTCCTGTG TCACCGCATC GTCAACGCCA CCTGGTTCAC CAACTTCATC
+ CTGCTCTTCA TCCTGCTCAG CAGTGCTGCG CTGGCCGCCG AGGACCCCAT CCGGGCGGAG
+ TCCGTGAGGA ATCAGATCCT TGGATATTTT GATATTGCCT TCACCTCTGT CTTCACTGTG
+ GAGATTGTCC TCAAGATGAC GACCTACGGC GCCTTCCTGC ACAAGGGCTC CTTCTGCCGC
+ AACTACTTCA ACATCCTGGA CCTGCTGGTG GTGGCTGTGT CTCTCATCTC CATGGGTCTC
+ GAGTCCAGCA CCATCTCCGT GGTAAAGATC CTGAGAGTGC TAAGGGTGCT CCGGCCCCTG
+ CGAGCCATCA ACAGAGCCAA AGGGTTGAAG CACGTGGTCC AGTGCGTGTT CGTGGCCATC
+ CGCACCATCG GGAACATCGT CCTGGTCACC ACGCTCCTGC AGTTCATGTT CGCCTGCATT
+ GGTGTCCAGC TCTTCAAGGG CAAGTTCTTC AGCTGCAACG ACCTATCCAA GATGACAGAA
+ GAGGAGTGCA GGGGCTACTA CTATGTGTAC AAGGACGGGG ACCCCACGCA GATGGAGCTG
+ CGCCCCCGCC AGTGGATACA CAATGACTTC CACTTTGACA ACGTGCTGTC GGCCATGATG
+ TCGCTCTTCA CGGTGTCCAC CTTCGAGGGA TGGCCCCAGC TGCTGTACAG GGCCATAGAC
+ TCCAACGAGG AGGACATGGG CCCCGTTTAC AACAACCGAG TGGAGATGGC CATCTTCTTC
+ ATCATCTACA TCATCCTCAT TGCCTTCTTC ATGATGAACA TCTTTGTGGG CTTTGTCATC
+ GTCACCTTCC AGGAGCAGGG GGAGACAGAG TACAAGAACT GCGAGCTGGA CAAGAACCAG
+ CGCCAGTGTG TGCAGTATGC CCTGAAGGCC CGCCCACTTC GGTGCTACAT CCCCAAGAAC
+ CCATACCAGT ACCAGGTGTG GTACGTCGTC ACCTCCTCCT ACTTTGAATA CCTGATGTTC
+ GCCCTCATCA TGCTCAACAC CATCTGCCTG GGCATGCAGC ACTACCACCA GTCGGAGGAG
+ ATGAACCACA TCTCGGACAT CCTCAACGTG GCCTTCACCA TCATCTTCAC ACTGGAGATG
+ ATCCTCAAGC TCTTGGCGTT CAAGGCCAGG GGCTATTTCG GAGACCCCTG GAATGTGTTC
+ GACTTCCTGA TCGTCATCGG CAGCATCATT GACGTCATCC TCAGCGAGAT CGACACTTTC
+ CTGGCCTCCA GCGGGGGACT GTATTGCCTG GGTGGCGGCT GCGGGAACGT TGACCCAGAC
+ GAGAGCGCCC GCATCTCCAG TGCCTTCTTC CGCCTGTTCC GGGTCATGAG GCTGATCAAG
+ CTGCTGAGTC GGGCCGAGGG CGTGCGCACG CTGCTGTGGA CGTTCATCAA GTCCTTCCAG
+ GCCCTGCCCT ACGTGGCCCT GCTCATCGTC ATGCTGTTCT TCATCTACGC CGTCATCGGC
+ ATGCAGATGT TTGGAAAGAT CGCCCTGGTG GACGGGACCC AGATCAACCG CAACAACAAC
+ TTCCAGACCT TCCCGCAGGC CGTGCTGCTG CTCTTCAGGT GTGCGACAGG GGAGGCGTGG
+ CAAGAGATCC TGCTGGCCTG CAGCTACGGG AAGTTGTGCG ACCCAGAGTC AGACTACGCC
+ CCGGGCGAGG AGTACACGTG TGGCACCAAC TTCGCCTACT ACTACTTCAT CAGCTTCTAC
+ ATGCTCTGCG CCTTCCTGAT CATCAACCTC TTCGTGGCTG TCATCATGGA CAACTTTGAC
+ TACCTGACAC GCGACTGGTC CATCCTGGGC CCTCACCACC TGGACGAGTT CAAGGCTATC
+ TGGGCAGAGT ATGACCCAGA GGCCAAGGGG CGAATCAAGC ACCTGGACGT GGTGACCCTG
+ CTGAGAAGGA TCCAGCCCCC TCTGGGCTTC GGGAAGTTCT GTCCACACCG GGTGGCCTGT
+ AAGCGCCTGG TGGGCATGAA CATGCCCCTG AACAGTGACG GCACGGTCAC CTTCAATGCC
+ ACGCTCTTTG CCCTGGTGCG CACGGCCCTC AAGATCAAGA CAGAAGGTAA CTTTGAGCAG
+ GCCAACGAGG AGCTGAGGGC CATCATCAAG AAGATCTGGA AGAGAACCAG CATGAAGCTG
+ CTGGACCAGG TCATCCCTCC CATAGGAGAT GACGAGGTGA CCGTGGGGAA GTTCTACGCC
+ ACATTCCTCA TCCAGGAGCA CTTCCGGAAG TTCATGAAGC GCCAGGAGGA ATATTATGGG
+ TATCGGCCCA AGAAGGACAC CGTGCAGATC CAGGCTGGGC TGCGGACCAT AGAGGAGGAG
+ GCGGCCCCTG AGATCCGCCG CACCATCTCA GGAGACCTGA CCGCCGAGGA GGAGCTGGAG
+ AGAGCCATGG TGGAGGCTGC GATGGAGGAG AGGATCTTCC GGAGGACGGG AGGCCTGTTT
+ GGCCAGGTGG ACACCTTCCT GGAAAGGACC AACTCCCTGC CCCCGGTGAT GGCCAACCAA
+ AGACCGCTCC AGTTTGCTGA GATAGAAATG GAAGAGCTTG AGTCGCCTGT CTTCTTGGAG
+ GACTTCCCTC AAGATGCAAG AACCAACCCT CTCGCTCGTG CCAATACCAA CAACGCCAAT
+ GCCAATGTTG CCTATGGCAA CAGCAACCAT AGCAACAACC AGATGTTTTC CAGCGTCCAC
+ TGTGAAAGGG AGTTCCCGGG AGAGGCGGAG ACACCGGCTG CCGGACGAGG AGCCCTCAGC
+ CACTCCCACA GGGCCCTGGG ACCTCACAGC AAGCCCTGTG CTGGAAAACT GAATGGGCAG
+ CTGGTCCAGC CGGGGATGCC CATCAACCAG GCACCTCCTG CCCCCTGCCA GCAGCCTAGC
+ ACGGATCCCC CAGAGCGCGG GCAGAGGAGG ACCTCCCTGA CAGGGTCTCT GCAAGACGAA
+ GCACCCCAGA GGAGGAGCTC CGAGGGGAGC ACCCCCAGGC GCCCGGCTCC TGCTACAGCT
+ CTGCTGATCC AAGAGGCTCT GGTTCGAGGG GGCCTGGACA CCTTGGCAGC TGATGCTGGC
+ TTCGTCACGG CAACAAGCCA GGCCCTGGCA GACGCCTGTC AGATGGAACC GGAGGAAGTA
+ GAGGTCGCAG CCACAGAGCT ACTGAAAGCG CGAGAGTCTG TCCAGGGCAT GGCCAGTGTC
+ CCGGGAAGCC TGAGCCGCAG GTCCTCCCTG GGCAGCCTTG ACCAGGTCCA GGGCTCCCAG
+ GAAACCCTTA TTCCTCCCAG GCCGTGATGG CTGTGGTGTC CACATGACCA AGGCGAGAGG
+ GACAGTGCGT GCAGAAGCTC AGCCCTGCAT GGCAGCCTCC CTCTGTCTCA GCCCTCCTGC
+ TGAGCTGGGG CGGTCTGGAA CCGCACCAGG AAGCCAGGAG CCTCCCCTGG CCAGCAAGAG
+ GCATGATTCT AAAGCCATCC AGAAAGGCCT GGTCAGTGCC ACTCCCCAGC AGGACATTAA
+ AGTCTCTAGG TCTGTGGCAC TGG
+>RABALP1A Rabbit dihydropyridine-sensitive calcium channel alpha-1 subunit
+ TTCCACCTAC ATGTTGGCCT GGACAGCAGG GAGCCGAGGG GAGGCTAATT TTACTGCTGG
+ GAGCAGCTAG CATAATCCTC CCGCCCCCAC CCCGCTGGCT CAGCAGGGCA GGCTTCGCCC
+ GGCAAGCTCA GCGGCCCAGT CCCCAAGGCG GGGAACACTG GGGACGCAGG GAAGAGAGGG
+ CCGCGGGGTG GGGGAGCAGC AGGAAGCGCC GTGGCCAGGG AAGCCATGGA GCCATCCTCA
+ CCCCAGGATG AGGGCCTGAG GAAGAAACAG CCCAAGAAGC CCCTGCCCGA GGTCCTGCCC
+ AGGCCGCCGC GGGCTCTGTT CTGCCTGACC CTGCAGAACC CGCTGAGGAA GGCGTGCATC
+ AGCATCGTGG AATGGAAACC CTTCGAGACC ATCATCCTGC TCACCATCTT TGCCAACTGT
+ GTGGCCCTGG CCGTGTACCT GCCCATGCCC GAGGATGACA ACAACTCCCT GAACCTGGGC
+ CTGGAGAAGC TGGAGTACTT CTTCCTCACC GTCTTCTCCA TCGAAGCCGC CATGAAGATC
+ ATCGCCTACG GCTTCCTGTT CCACCAGGAC GCCTACCTGC GCAGCGGCTG GAACGTGCTG
+ GACTTCATCA TCGTCTTCCT GGGGGTCTTC ACGGCGATTC TGGAACAGGT CAACGTCATC
+ CAGAGCAACA CGGCCCCGAT GAGCAGCAAA GGAGCCGGCC TGGACGTCAA GGCCCTGAGG
+ GCCTTCCGTG TGCTCAGACC CCTCCGGCTG GTGTCGGGGG TGCCTAGTTT GCAGGTGGTC
+ CTCAACTCCA TCTTCAAGGC CATGCTCCCC CTGTTCCACA TCGCCCTGCT CGTCCTCTTC
+ ATGGTCATCA TCTACGCCAT CATCGGGCTG GAGCTCTTCA AGGGCAAGAT GCACAAGACC
+ TGCTACTACA TCGGGACAGA CATCGTGGCC ACAGTGGAGA ATGAGAAGCC CTCGCCCTGC
+ GCTAGGACGG GCTCGGGGCG CCCCTGCACC ATCAACGGCA GCGAGTGCCG GGGCGGCTGG
+ CCGGGGCCCA ACCACGGCAT CACGCACTTC GACAACTTCG GCTTCTCCAT GCTCACCGTG
+ TACCAGTGCA TCACCATGGA GGGCTGGACA GATGTCCTCT ACTGGGTCAA CGATGCCATC
+ GGGAACGAGT GGCCCTGGAT CTACTTTGTC ACTCTCATCC TGCTGGGGTC CTTCTTCATC
+ CTCAACCTGG TGCTGGGCGT CCTGAGTGGG GAATTCACCA AGGAGCGGGA GAAGGCCAAG
+ TCCAGGGGAA CCTTCCAGAA GCTGCGGGAG AAGCAGCAGC TGGAGGAGGA CCTTCGGGGC
+ TACATGAGCT GGATCACGCA GGGCGAGGTC ATGGACGTGG AGGACCTGAG AGAAGGAAAG
+ CTGTCCTTGG AAGAGGGAGG CTCCGACACG GAAAGCCTGT ACGAAATCGA GGGCTTGAAC
+ AAAATCATCC AGTTCATCCG ACACTGGAGG CAGTGGAACC GTGTCTTTCG CTGGAAGTGC
+ CATGACCTGG TGAAGTCGAG AGTCTTCTAC TGGCTGGTCA TCCTGATCGT GGCCCTCAAC
+ ACCCTGTCCA TCGCCTCGGA GCACCACAAC CAGCCGCTCT GGCTGACCCA CTTGCAAGAC
+ ATCGCCAATC GAGTGCTGCT GTCACTCTTC ACCATCGAGA TGCTGCTGAA GATGTACGGG
+ CTGGGCCTGC GCCAGTACTT CATGTCCATC TTCAACCGCT TCGACTGCTT CGTGGTGTGC
+ AGCGGCATCC TGGAGCTGCT GCTGGTGGAG TCGGGCGCCA TGACGCCGCT GGGCATCTCC
+ GTGTTGCGCT GCATCCGCCT CCTGAGGCTC TTCAAGATCA CCAAGTACTG GACGTCGCTC
+ AGCAACCTGG TGGCCTCCCT GCTCAACTCC ATCCGCTCCA TCGCCTCGCT GCTGCTGCTG
+ CTCTTCCTCT TCATCATCAT CTTCGCCCTG CTGGGCATGC AGCTCTTCGG GGGGCGGTAC
+ GACTTCGAGG ACACGGAAGT GCGACGCAGC AACTTCGACA ACTTCCCCCA GGCCCTCATC
+ AGCGTCTTCC AGGTGCTGAC GGGTGAGGAC TGGAACTCCG TGATGTACAA CGGGATCATG
+ GCCTACGGAG GCCCGTCCTA CCCGGGCGTT CTCGTGTGCA TCTATTTCAT CATCCTTTTT
+ GTCTGCGGCA ACTATATCCT GCTGAATGTC TTCCTGGCCA TCGCCGTGGA CAACCTGGCC
+ GAGGCCGAGA GCCTGACTTC CGCGCAAAAG GCCAAGGCCG AGGAGAGGAA ACGTAGGAAG
+ ATGTCCAGGG GTCTCCCTGA CAAGAGAGAG GAGGAGAAGT CTGTGATGGC CAAGAAGCTG
+ GAGCAGAAGC CCAAGGGGGA GGGCATCCCC ACCACTGCCA AGCTCAAGGT CGATGAGTTC
+ GAATCTAACG TCAACGAGGT GAAGGACCCC TACCCTTCAG CTGACTTCCC AGGGGATGAT
+ GAGGAGGACG AGCCTGAGAT CCCAGTGAGC CCCCGACCGC GCCCGCTGGC CGAGCTGCAG
+ CTCAAAGAGA AGGCAGTGCC CATCCCGGAA GCCAGCTCCT TCTTCATCTT CAGTCCCACC
+ AATAAGGTCC GTGTCCTGTG TCACCGCATC GTCAACGCCA CCTGGTTCAC CAACTTCATC
+ CTGCTCTTCA TCCTGCTCAG CAGTGCTGCG CTGGCCGCCG AGGACCCCAT CCGGGCGGAG
+ TCCGTGAGGA ATCAGATCCT TGGATATTTT GATATTGCCT TCACCTCTGT CTTCACTGTG
+ GAGATTGTCC TCAAGATGAC AACCTACGGC GCCTTCCTGC ACAAGGGCTC CTTCTGCCGC
+ AACTACTTCA ACATCCTGGA CCTGCTGGTG GTGGCCGTGT CTCTCATCTC CATGGGTCTC
+ GAGTCCAGCA CCATCTCCGT GGTAAAGATC CTGAGAGTGC TAAGGGTGCT CCGGCCCCTG
+ CGAGCCATCA ACAGAGCCAA AGGGTTGAAG CACGTGGTCC AGTGCGTGTT CGTGGCCATC
+ CGCACCATCG GGAACATCGT CCTGGTCACC ACGCTCCTGC AGTTCATGTT CGCCTGCATC
+ GGTGTCCAGC TCTTCAAGGG CAAGTTCTTC AGCTGCAATG ACCTATCCAA GATGACAGAA
+ GAGGAGTGCA GGGGCTACTA CTATGTGTAC AAGGACGGGG ACCCCACGCA GATGGAGCTG
+ CGCCCCCGCC AGTGGATACA CAATGACTTC CACTTTGACA ACGTGCTGTC GGCCATGATG
+ TCGCTCTTCA CGGTGTCCAC CTTCGAGGGA TGGCCCCAGC TGCTGTACAG GGCCATAGAC
+ TCCAACGAGG AGGACATGGG CCCCGTTTAC AACAACCGAG TGGAGATGGC CATCTTCTTC
+ ATCATCTACA TCATCCTCAT TGCCTTCTTC ATGATGAACA TCTTTGTGGG CTTTGTCATC
+ GTCACCTTCC AGGAGCAGGG GGAGACAGAG TACAAGAACT GCGAGCTGGA CAAGAACCAG
+ CGCCAGTGTG TGCAGTATGC CCTGAAGGCC CGCCCACTTC GGTGCTACAT CCCCAAGAAC
+ CCATACCAGT ACCAGGTGTG GTACGTCGTC ACCTCCTCCT ACTTTGAATA CCTGATGTTC
+ GCCCTCATCA TGCTCAACAC CATCTGCCTG GGCATGCAGC ACTACCACCA GTCGGAGGAG
+ ATGAACCACA TCTCAGACAT CCTCAATGTG GCCTTCACCA TCATCTTCAC GCTGGAGATG
+ ATTCTCAAGC TCTTGGCGTT CAAGGCCAGG GGCTATTTCG GAGACCCCTG GAATGTGTTC
+ GACTTCCTGA TCGTCATCGG CAGCATCATT GACGTCATCC TCAGCGAGAT CGACACTTTC
+ CTGGCCTCCA GCGGGGGACT GTATTGCCTG GGTGGCGGCT GCGGGAACGT TGACCCAGAC
+ GAGAGCGCCC GCATCTCCAG TGCCTTCTTC CGCCTGTTCC GGGTTATGAG GCTGATCAAG
+ CTGCTGAGTC GGGCCGAGGG CGTGCGCACG CTGCTGTGGA CGTTCATCAA GTCCTTCCAG
+ GCCCTGCCCT ACGTGGCCCT GCTCATCGTC ATGCTGTTCT TCATCTACGC CGTCATCGGC
+ ATGCAGATGT TTGGAAAGAT CGCCCTGGTG GACGGGACCC AGATCAACCG CAACAACAAC
+ TTCCAGACCT TCCCGCAGGC CGTGCTGCTG CTCTTCAGGT GTGCGACAGG GGAGGCGTGG
+ CAAGAGATCC TGCTGGCCTG CAGCTACGGG AAGTTGTGCG ACCCAGAGTC AGACTACGCC
+ CCGGGCGAGG AGTACACGTG TGGCACCAAC TTCGCCTACT ACTACTTCAT CAGCTTCTAC
+ ATGCTCTGCG CCTTCCTGAT CATCAACCTC TTCGTGGCTG TCATCATGGA CAACTTTGAC
+ TACCTGACAC GCGACTGGTC CATCCTGGGC CCTCACCACC TGGACGAGTT CAAGGCCATC
+ TGGGCAGAGT ATGACCCAGA GGCCAAGGGG CGAATCAAGC ACCTGGACGT GGTGACCCTG
+ CTGAGAAGGA TCCAGCCCCC TCTGGGCTTC GGGAAGTTCT GTCCACACCG GGTGGCCTGT
+ AAGCGCCTGG TGGGCATGAA CATGCCCCTG AACAGTGACG GCACGGTCAC CTTCAATGCC
+ ACGCTCTTTG CCCTGGTGCG CACGGCCCTC AAGATCAAGA CAGAAGGTAA CTTCGAGCAG
+ GCCAACGAGG AGCTGAGGGC CATCATCAAG AAGATCTGGA AGAGAACCAG CATGAAGCTA
+ CTGGACCAGG TCATCCCTCC CATAGGAGAT GACGAGGTGA CCGTGGGGAA GTTCTACGCC
+ ACATTCCTCA TCCAGGAGCA CTTCCGGAAG TTCATGAAGC GCCAGGAGGA ATATTATGGG
+ TATCGGCCCA AGAAGGACAC CGTGCAGATC CAGGCTGGGC TGCGGACCAT AGAGGAGGAG
+ GCGGCCCCTG AGATCCGCCG CACCATCTCA GGAGACCTGA CCGCCGAGGA GGAGCTGGAG
+ AGAGCCATGG TGGAGGCTGC GATGGAGGAG AGGATCTTCC GGAGGACCGG AGGCCTGTTT
+ GGCCAGGTGG ACACCTTCCT GGAAAGGACC AACTCCCTAC CCCCGGTGAT GGCCAACCAA
+ AGACCGCTCC AGTTTGCTGA GATAGAAATG GAAGAGCTTG AGTCGCCTGT CTTCTTGGAG
+ GACTTCCCTC AAGACGCAAG AACCAACCCT CTCGCTCGTG CCAATACCAA CAACGCCAAT
+ GCCAATGTTG CCTATGGCAA CAGCAACCAT AGCAACAACC AGATGTTTTC CAGCGTCCAC
+ TGTGAAAGGG AGTTCCCGGG AGAGGCGGAG ACACCGGCTG CCGGACGAGG AGCCCTCAGC
+ CACTCCCACA GGGCCCTGGG ACCTCACAGC AAGCCCTGTG CTGGAAAACT GAATGGGCAG
+ CTGGTCCAGC CGGGAATGCC CATCAACCAG GCACCTCCTG CCCCCTGCCA GCAGCCTAGC
+ ACAGATCCCC CAGAGCGCGG GCAGAGGAGG ACCTCCCTGA CAGGGTCTCT GCAAGACGAA
+ GCACCCCAGA GGAGGAGCTC CGAGGGGAGC ACCCCCAGGC GCCCGGCTCC TGCTACAGCT
+ CTGCTGATCC AAGAGGCTCT GGTTCGAGGG GGCCTGGACA CCTTGGCAGC TGATGCTGGC
+ TTCGTCATGG CAACAAGCCA GGCCCTGGTA GACGCCTGTC AGATGGAACC GGAGGAAGTA
+ GAGGTCGCAG CCACAGAGCT ACTGAAAGAG CGAGAGTCCG TCCAGGGCAT GGCCAGTGTC
+ CCGGGAAGCC TGAGCCGCAG GTCCTCCCTG GGCAGCCTTG ACCAGGTCCA GGGCTCCCAG
+ GAAACCCTTA TTCCTCCCAG GCCGTGATGG CTGTGCAGTG TCCACATGAC CAAGGCGAGA
+ GGGACAGTGC GTGCAGAAGC TCAGCCCTGC ATGGCAGCCT CCCTCTGTCT CAGCCCTCCT
+ GCTGAGCTGG GGCGGTCTGG AACCGACCAG GAAGCCAGGA GCCTCCCCTG GCCAGCAAGA
+ GGCATGATTC TAAAGCCATC CAGAAAGGCC TGGTCAGTGC CACTCCCCAG CAGGACATTA
+ AAGTCTCTAG GTCTGTGGCA
+>RABGSTB Oryctolagus cuniculus glutathione S-transferase mRNA, complete cds.
+ CAGAAACCAC CACTATGGCA GGGAAGCCCA AGCTTCACTA CTTCAATGCA CGGGGCAGAA
+ TGGAGTCTAT CCGGTGGCTC CTGACTGCAG CTGGGGTAGA GTTTGAAGAG AAATGTATGA
+ AAACTCGAGA AGACCTGGAA AAGTTAAGAA AAGATGGGGT ATTGATGTTC CAGCAAGTGC
+ CCATGGTTGA GATTGATGGG ATGAAGCTGG TGCAGACCAG AGCCATTTTC AACTACATTG
+ CAGACAAGCA CAACCTGTAT GGGAAAGACA TAAAGGAGAG AGCCCTGATT GATATGTATA
+ CAGAAGGCAT AGTAGATTTG AATGAATTGA TTCTTACTCG TCCATTCCTT CCACCGGAGG
+ AACAAGAGGC AAAACTTGCT CAGATCAAAG ATAAAGCAAA AAACCGTTAT TTTCCTGCCT
+ TTGAAAAGGT GTTGAAGAGC CACGGACAAG ACTACCTTGT TGGCAACAAG CTGAGCAAGG
+ CTGACATTCT CCTGGTTGAA CTTCTCTACA ACGTGGAAGA GCTCAACCCC GGCGCGACTG
+ CCAGCTTCCC TCTGCTGCAG GCCCTGAAAA CCAGGATCAG CAATCTCCCC ACCGTGAAGA
+ AGTTTCTGCA GCCTGGCAGC CAGAGGAATC CGCCTGATGA TGAGAAATGC AGAGAAGAAG
+ CAAAAATCAT TTTCCATTAA GAAGGCAAAG ATACCAAGCA CAGGCAAGAC CAGCCTCTGA
+ CCCCCTGCAG CGATGAAGTA CTTTAAATAA ATAGTGATCC TGATTGTCAT AAGGCATATT
+ ACGTTTTCTA AGTATTGTGT AAATTTAATT AAAAACCACC CATGTAGATT TAGTTGCAAT
+ ACATGGTACT TGGTTTTGAT CAAATACAAA ATTATGAGCA CCTCCTAGGA TGTCCCTTTG
+ AA
diff --git a/seq/gst.seq b/seq/gst.seq
new file mode 100644
index 0000000..0aa82e2
--- /dev/null
+++ b/seq/gst.seq
@@ -0,0 +1,20 @@
+>gi|193547|gb|J04632|MUSGLUTA Mouse glutathione S-transferase class mu (GST1-1) mRNA, complete cds
+CCTGCCTTCCGCTTTAGGGTCTGCTGCTCTGGTTACAGACCTAGGAAGGGGAGTGCCTAATTGGGATTGG
+TGCAGGGTTGGGAGGGACCCGCTGTTTTGTCCTGCCCACGTTTCTCTAGTAGTCTGTATAAAGTCACAAC
+TCCAAACACACAGGTCAGTCCTGCTGAAGCCAGTTTGAGAAGACCACAGCACCAGCACCATGCCTATGAT
+ACTGGGATACTGGAACGTCCGCGGACTGACACACCCGATCCGCATGCTCCTGGAATACACAGACTCAAGC
+TATGATGAGAAGAGATACACCATGGGTGACGCTCCCGACTTTGACAGAAGCCAGTGGCTGAATGAGAAGT
+TCAAGCTGGGCCTGGACTTTCCCAATCTGCCTTACTTGATCGATGGATCACACAAGATCACCCAGAGCAA
+TGCCATCCTGCGCTACCTTGCCCGAAAGCACCACCTGGATGGAGAGACAGAGGAGGAGAGGATCCGTGCA
+GACATTGTGGAGAACCAGGTCATGGACACCCGCATGCAGCTCATCATGCTCTGTTACAACCCTGACTTTG
+AGAAGCAGAAGCCAGAGTTCTTGAAGACCATCCCTGAGAAAATGAAGCTCTACTCTGAGTTCCTGGGCAA
+GAGGCCATGGTTTGCAGGGGACAAGGTCACCTATGTGGATTTCCTTGCTTATGACATTCTTGACCAGTAC
+CGTATGTTTGAGCCCAAGTGCCTGGACGCCTTCCCAAACCTGAGGGACTTCCTGGCCCGCTTCGAGGGCC
+TCAAGAAGATCTCTGCCTACATGAAGAGTAGCCGCTACATCGCAACACCTATATTTTCAAAGATGGCCCA
+CTGGAGTAACAAGTAGGCCCTTGCTACACGGGCACTCACTAGGAGGACCTGTCCACACTGGGGATCCTGC
+AGGCCCTGGGTGGGGACAGCACCCTGGCCTTCTGCACTGTGGCTCCTGGTTCTCTCTCCTTCCCGCTCCC
+TTCTGCAGCTTGGTCAGCCCCATCTCCTCACCCTCTTCCCAGTCAAGTCCACACAGCCTTCATTCTCCCC
+AGTTTCTTTCACATGGCCCCTTCTTCATTGGCTCCCTGACCCAACCTCACAGCCCGTTTCTGCGAACTGA
+GGTCTGTCCTGAACTCACGCTTCCTAGAATTACCCCGATGGTCAACACTATCTTAGTGCTAGCCCTCCCT
+AGAGTTACCCCGAAGTCAATACTTGAGTGCCAGCCTGTTCCTGGTGGAGTAGCCTCCCCAGGTCTGTCTC
+GTCTACAATAAAGTCTGAAACACACTT
diff --git a/seq/gstm1_human.vaa b/seq/gstm1_human.vaa
new file mode 100644
index 0000000..b0cad06
--- /dev/null
+++ b/seq/gstm1_human.vaa
@@ -0,0 +1,2 @@
+>sp|P09488|GSTM1_HUMAN GLUTATHIONE S-TRANSFERASE MU 1 (EC 2.5.1.18) (GSTM1-1) (HB SUBUNI
+MPMILGYWDIRGLAHAIRLLLEYTDSSYEEKKYTMGDAPDYDRSQWLNEKFKLGLDFPNLPYLIDGAHKITQSNAILCYIARKHNLCGETEEEKIRVDILENQTMDNHMQLGMICYNPEFEKLKPKYLEELPEKLKLYSEFLGKRPWFAGNKITFVDFLVYDVLDLHRIFEPNCLDAFPNLKDFISRFEGLEKISAYMKSSRFLPRPVFTKMAVWGNK
diff --git a/seq/gstm1b_human.nt b/seq/gstm1b_human.nt
new file mode 100644
index 0000000..b8484a7
--- /dev/null
+++ b/seq/gstm1b_human.nt
@@ -0,0 +1,17 @@
+>gi|183668|gb|J03817.1|HUMGSTM1B Human glutathione transferase M1B (GST1) mRNA, complete cds
+GCACCAACCAGCACCATGCCCATGATACTGGGGTACTGGGACATCCGCGGGCTGGCCCACGCCATCCGCC
+TGCTCCTGGAATACACAGACTCAAGCTATGAGGAAAAGAAGTACACGATGGGGGACGCTCCTGATTATGA
+CAGAAGCCAGTGGCTGAATGAAAAATTCAAGCTGGGCCTGGACTTTCCCAATCTGCCCTACTTGATTGAT
+GGGGCTCACAAGATCACCCAGAGCAACGCCATCTTGTGCTACATTGCCCGCAAGCACAACCTGTGTGGGG
+AGACAGAAGAGGAGAAGATTCGTGTGGACATTTTGGAGAACCAGACCATGGACAACCATATGCAGCTGGG
+CATGATCTGCTACAATCCAGAATTTGAGAAACTGAAGCCAAAGTACTTGGAGGAACTCCCTGAAAAGCTA
+AAGCTCTACTCAGAGTTTCTGGGGAAGCGGCCATGGTTTGCAGGAAACAAGATCACTTTTGTAGATTTTC
+TCGTCTATGATGTCCTTGACCTCCACCGTATATTTGAGCCCAACTGCTTGGACGCCTTCCCAAATCTGAA
+GGACTTCATCTCCCGCTTTGAGGGCTTGGAGAAGATCTCTGCCTACATGAAGTCCAGCCGCTTCCTCCCA
+AGACCTGTGTTCTCAAAGATGGCTGTCTGGGGCAACAAGTAGGGCCTTGAAGGCAGGAGGTGGGAGTGAG
+GAGCCCATACTCAGCCTGCTGCCCAGGCTGTGCAGCGCAGCTGGACTCTGCATCCCAGCACCTGCCTCCT
+CGTTCCTTTCTCCTGTTTATTCCCATCTTTACTCCCAAGACTTCATTGTCCCTCTTCACTCCCCCTAAAC
+CCCTGTCCCATGCAGGCCCTTTGAAGCCTCAGCTACCCACTATCCTTCGTGAACATCCCCTCCCATCATT
+ACCCTTCCCTGCACTAAAGCCAGCCTGACCTTCCTTCCTGTTAGTGGTTGTGTCTGCTTTAAAGCCTGCC
+TGGCCCCTCGCCTGTGGAGCTCAGCCCCGAGCTGTCCCCGTGTTGCATGAAGGAGCAGCATTGACTGGTT
+TACAGGCCCTGCTCCTGCAGCATGGTCCCTGCCTAGGCCTACCTGATGGAAGTAAAGCCTCAACCAC
diff --git a/seq/gstm1b_human_fs.nt b/seq/gstm1b_human_fs.nt
new file mode 100644
index 0000000..b6fbaa8
--- /dev/null
+++ b/seq/gstm1b_human_fs.nt
@@ -0,0 +1,17 @@
+>gi|183668|gb|J03817.1|HUMGSTM1B Human glutathione transferase M1B (GST1) mRNA, complete cds
+GCACCAACCAGCACC
+ATGCCCATGATACTGGGGTACTGGGACATCCGCGGGCTGGCCCACGCCATCCGCCTGCTCCTGGAATAACAGACTCAAGCTATGAGGAAAAGAAGTCACACGATGGGGGACGCTCCTGATTATGA
+CAGAAGCCAGTGGCTGATGAAAAATTCAAGCTGGGCCTGGACTTTCCCAATCTGCCCTACTTGATTGAT
+GGGGCTCACAAGATCACCCAGAGCAACGCCATCTTGTGCTACATTGCCCGCAAGCACAACCTGTGTGGGG
+AGACAGAAGAGGAGAAGATTCGTGTGGACATTTTGGAGAACCAGACCATGGACAACCATATGCAGCTGGG
+CATGATCTGCTACAATCCAGAATTTGAGAAACTGAAGCCAAAGTACTTGGAGGAACTCCCTGAAAAGCTA
+AAGCTCTACTCAGAGTTTCTGGGGAAGCGGCCATGGTTTGCAGGAAACAAGATCACTTTTGTAGATTTTC
+TCGTCTATGATGTCCTTGACCTCCCACCGTATATTTGAGCCCAACTGCTTGGACGCCTTCCCAAATCTGAA
+GGACTTCATCTCCCGCTTTGAGGGCTTGGAGAAGATCTCTGCCTACATGAAGTCCAGCCGCTTCCTCCCA
+AGACCTGTGTTCTCAAAGATGGCTGTCTGGGGCAACAAGTAGGGCCTTGAAGGCAGGAGGTGGGAGTGAG
+GAGCCCATACTCAGCCTGCTGCCCAGGCTGTGCAGCGCAGCTGGACTCTGCATCCCAGCACCTGCCTCCT
+CGTTCCTTTCTCCTGTTTATTCCCATCTTTACTCCCAAGACTTCATTGTCCCTCTTCACTCCCCCTAAAC
+CCCTGTCCCATGCAGGCCCTTTGAAGCCTCAGCTACCCACTATCCTTCGTGAACATCCCCTCCCATCATT
+ACCCTTCCCTGCACTAAAGCCAGCCTGACCTTCCTTCCTGTTAGTGGTTGTGTCTGCTTTAAAGCCTGCC
+TGGCCCCTCGCCTGTGGAGCTCAGCCCCGAGCTGTCCCCGTGTTGCATGAAGGAGCAGCATTGACTGGTT
+TACAGGCCCTGCTCCTGCAGCATGGTCCCTGCCTAGGCCTACCTGATGGAAGTAAAGCCTCAACCAC
diff --git a/seq/gstt1_drome.aa b/seq/gstt1_drome.aa
new file mode 100644
index 0000000..27c46dd
--- /dev/null
+++ b/seq/gstt1_drome.aa
@@ -0,0 +1,4 @@
+>gi|121694|sp|P20432.1|GSTT1_DROME GLUTATHIONE S-TRANSFERASE 1-1 (EC 2.5.1.18) (CLASS-THETA). - DROS
+MVDFYYLPGSSPCRSVIMTAKAVGVELNKKLLNLQAGEHLKPEFLKINPQHTIPTLVDNGFALWESRAIQVYLVEKYG
+KTDSLYPKCPKKRAVINQRLYFDMGTLYQSFANYYYPQVFAKAPADPEAFKKIEAAFEFLNTFLEGQDYAAGDSLTVA
+DIALVATVSTFEVAKFEISKYANVNRWYENAKKVTPGWEENWAGCLEFKKYFE
diff --git a/seq/gstt1_pssm.asn1 b/seq/gstt1_pssm.asn1
new file mode 100644
index 0000000..98c0fd2
Binary files /dev/null and b/seq/gstt1_pssm.asn1 differ
diff --git a/seq/gtm1_human.aa b/seq/gtm1_human.aa
new file mode 100644
index 0000000..03c7d46
--- /dev/null
+++ b/seq/gtm1_human.aa
@@ -0,0 +1,4 @@
+>sp|P09488|GSTM1_HUMAN GLUTATHIONE S-TRANSFERASE MU 1 (EC 2.5.1.18) (GSTM1-1) (HB SUBUNI
+MPMILGYWDIRGLAHAIRLLLEYTDSSYEEKKYTMGDAPDYDRSQWLNEKFKLGLDFPNLPYLIDGAHKITQSNAILCY
+IARKHNLCGETEEEKIRVDILENQTMDNHMQLGMICYNPEFEKLKPKYLEELPEKLKLYSEFLGKRPWFAGNKITFVD
+FLVYDVLDLHRIFEPKCLDAFPNLKDFISRFEGLEKISAYMKSSRFLPRPVFSKMAVWGNK
diff --git a/seq/gtt1_drome.aa b/seq/gtt1_drome.aa
new file mode 100644
index 0000000..388b4df
--- /dev/null
+++ b/seq/gtt1_drome.aa
@@ -0,0 +1,4 @@
+>GTT1_DROME GLUTATHIONE S-TRANSFERASE 1-1 (EC 2.5.1.18) (CLASS-THETA). - DROS
+MVDFYYLPGSSPCRSVIMTAKAVGVELNKKLLNLQAGEHLKPEFLKINPQHTIPTLVDNGFALWESRAIQVYLVEKYG
+KTDSLYPKCPKKRAVINQRLYFDMGTLYQSFANYYYPQVFAKAPADPEAFKKIEAAFEFLNTFLEGQDYAAGDSLTVA
+DIALVATVSTFEVAKFEISKYANVNRWYENAKKVTPGWEENWAGCLEFKKYFE
diff --git a/seq/h10_human.aa b/seq/h10_human.aa
new file mode 100644
index 0000000..04bcf0c
--- /dev/null
+++ b/seq/h10_human.aa
@@ -0,0 +1,4 @@
+>H10_HUMAN | 90538 | HISTONE H1' (H1.0) (H1(0)).
+TENSTSAPAAKPKRAKASKKSTDHPKYSDMIVAAIQAEKNRAGSSRQSIQKYIKSHYKVGENADSQIKLSIKRLV
+TTGVLKQTKGVGASGSFRLAKSDEPKKSVAFKKTKKEIKKVATPKKASKPKKAASKAPTKKPKATPVKKAKKKLA
+ATPKKAKKPKTVKAKPVKASKPKKAKPVKPKAKSSAKRAGKKK
diff --git a/seq/hahu.aa b/seq/hahu.aa
new file mode 100644
index 0000000..b96234d
--- /dev/null
+++ b/seq/hahu.aa
@@ -0,0 +1,4 @@
+>HAHU | 1114 | Hemoglobin alpha chain - Human, chimpanzee, and pygmy chimpanzee
+VLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHGKKVADALTNAV
+AHVDDMPNALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHLPAEFTPAVHASLDKFLASVSTVLTSKY
+R
diff --git a/seq/hsgstm1b.gcg b/seq/hsgstm1b.gcg
new file mode 100644
index 0000000..9a88985
--- /dev/null
+++ b/seq/hsgstm1b.gcg
@@ -0,0 +1,214 @@
+ FROMSTADEN of: hsgstm1b.g check: 1769 from: 1 to: 5183
+
+ <---No Contig Comments--->
+
+hsgstm1b.gcg Length: 5183 October 12, 1994 10:58 Type: N Check: 1769 ..
+
+ 1 GCACCAACCA GCACCATGCC CATGATACTG GGGTACTGGG ACATCCGTGG
+
+ 51 GGTAAGCGAG GGTCCTCTGG TGGGTGGGAC AGGGGGCGGA GGCGGGGATG
+
+ 101 TGTGGAGTAG CTGCAGGACT GGCTCTAGGG ACCGTTCCTC TTCAGGGCTG
+
+ 151 CCCGCCTCAG AAGGGCCTGT GCATGACGCT GTGTGTGTGT TTGGGGGTGG
+
+ 201 GGGCGGGTAG AGGAGGCGAC GGGTACGTGC AGTATAGACT AGGGCTGGCC
+
+ 251 TGGTGCAGAG AAAGTCACCA AGTCAGGGAC CCTCCATCTC TGACCCGAGC
+
+ 301 CGCGGCCATC TCTCCCAGCT GGCCCACGCC ATCCGCCTGC TCCTGGAATA
+
+ 351 CACAGACTCA AGCTACGAGG AAAAGAAGTA TACGATGGGG GACGGTAATG
+
+ 401 ACACCCTTGT GTCCGGGCTC TGCACTCACG CTGAGTTGGC ACCAAGCAAC
+
+ 451 CCATGGTGGC CACCTGTCGT ACCTCTGCAG GCCTCCCCTG CTGGAGCTGC
+
+ 501 AGGCTGTCCC TTCCCTGAGC CCCGGTGAGG AGTCCTGTGG CCTTGCAAGG
+
+ 551 CAGAATGCTG GGGCGGGATA GTGGGTCCCT GTTTAATTGG GTTGGGTGTC
+
+ 601 CTCAGAGCTT CCCAAACCCT GGAAGCCTTA GCCGTGTGGG GTCCAGAGCC
+
+ 651 TCAGCGGGAT TATTTGTCCC TGAACCCTGG GATGTGGGAC TGAGTGGTCA
+
+ 701 GATTCTAGAT CCACCTGTCT CAGGGATCTT GCCACTGGTT CCTTGGGAGG
+
+ 751 GTCCCCGGAA GGAGGGCTGG GCTCTGGGGA GGTTTGTTTT CACTTCTTCT
+
+ 801 TCCCCACGGC AGCTCCTGAC TATGACAGAA CGCAGTGGCT GAATGAAAAA
+
+ 851 TTCAAGCTGG GCCTGGACTT TCCCAATGTA GGTGCAGGGG GAAGGGGCGG
+
+ 901 TTTTGGGGGA AAGTGCGACG TGTCTCTGAC TGCATCTCCT CTCCCCAGAT
+
+ 951 TAGAGGTGTT CGGATCAGGA GTCTTCTGCC CAATTCCTGG TTGTCTACAC
+
+ 1001 AGCCCCTGCA TGATGTTCTG TGTCCCAGCT CATTTGTTCA TGTGACAGTA
+
+ 1051 TTTCTATGTC AGGCCTGCAT GAGCGGGCAC AGTGAGTCTG GTCTCCCCTT
+
+ 1101 GCATATAGGA AGGGGATGCT GGGGAGCCTG CTGGCCCCAA CTGAGCTTCC
+
+ 1151 CCGGTTTCCC ATCTATCCAG CTGCCCTACT TGATTGATGG GGCTCACAAG
+
+ 1201 ATCACCCAGA GCAACGCCAT CTTGTGCTAC ATTGCCCGCA AGCACAACCT
+
+ 1251 GTGTGAGTGT GGGTGGCTGC AATGTGTGGG GGGAAGGTGG CCTCCTCCTT
+
+ 1301 GGCTGGGCTG TGATGCTGAG ATTGAGTCTG TGTTTTGTGG GTGGCAGGTG
+
+ 1351 GGGAGACAGA AGAGGAGAAG ATTCGTGTGG ACATTTTGGA GAACCAGACC
+
+ 1401 ATGGACAACC ATATGCAGCT GGGCATGATC TGCTACAATC CAGAATTTGT
+
+ 1451 GAGTGTCCCC AGTGAGCTGC ATCTGACAGA GTTTGGATTT GGGGCCAGGA
+
+ 1501 CTCTTGCATC CTGCACACAT TGGTCTTAAG TCCCTGGTAC CATTCATCCT
+
+ 1551 CCAAGTGCTT TCCCATCATC TAGCAGTATC TCTACGACTC CAATGTCATG
+
+ 1601 TCAACAAAAG CAGAGGCAAT TCCCAACCAA CCTTAGGACA CGATTCCAGG
+
+ 1651 CATTCCCAGG GTAGAAATTT CAGTTCCTGT ATGGTAAAGT TTGTGTTCAG
+
+ 1701 AATCTCCTTC ATCAGCTCTG GCCTCTGACT TCTGTCCTGG GTCATTTCTG
+
+ 1751 TCAGCCAGTT CACATCACCT GCCTGCTCCT AGAATATGCA GACTCAAGTA
+
+ 1801 GAAGACTCAG GAATGTAATG GCACCCTCGA ATTGCATCTT CTCCTCAACA
+
+ 1851 GTTTTCTGAG TGCTGTCATT GACATGCACA GGGATCTGCG CATCTTCATA
+
+ 1901 ACAGACAGCT CAGAGGCAGT CAGAGGGCCT TTATTCCTCT CCCTCCTTCC
+
+ 1951 TTTCAACTTG AACTTCTCAT CTCCCTGGAA ACTAGTCAAC GTTCATTGTT
+
+ 2001 TTCTTCTGCC ACCCCATTAG AAGGAACTTT CTACTTTCCC TGAGCTCCCT
+
+ 2051 TAGTTCTTTG CATCCTTGAT TCTGCTGGTC TGGATCCAGA GGCTGCCAGG
+
+ 2101 TGCTTGGGCG CTCCTGGGGC TGACCCAGAG GCTATTGGGA GGTCAGTGAG
+
+ 2151 GACAGATTCA GGGACAGCAT CTCATTCCTC TCTGCCTTCT GATCAGTTTA
+
+ 2201 GATAGGGTCT GACACTCAGT CAGAGTCTAA AATGCTGAGT ATCCAATTGA
+
+ 2251 AGCCTGCACT GCCCCAGTTC CAGACTTGGG GAAGATGGCT GCTTGCCCGT
+
+ 2301 GCCAGCCTGG CCGTCCACAG CCCCGGGGAG GCCACGTCTG TGCAGGGAGC
+
+ 2351 TTTTGTCCGA GGGTGGTGAC AGCTGTTTTC TGCCTCAGGA GAAACTGAAG
+
+ 2401 CCAAAGTACT TGGAGGAACT CCCTGAAAAG CTAAAGCTCT ACTCAGAGTT
+
+ 2451 TCTGGGGAAG CGGCCATGGT TTGCAGGAAA CAAGGTAAAG GAGGAGTGAT
+
+ 2501 ATGGGGAATG AGATCTGTTT TGCTTCACGT GTTATGGAGG TTCCAGCCCA
+
+ 2551 CACATTCTTG GCCTTCTGCA GATCACTTTT GTAGATTTTC TCGTCTATGA
+
+ 2601 TGTCCTTGAC CTCCACCGTA TATTTGAGCC CAACTGCTTG GACGCCTTCC
+
+ 2651 CAAATCTGAA GGACTTCATC TCCCGCTTTG AGGTGATGCC CCCAATCCTC
+
+ 2701 CCTTCTCTTT GATGCCCCTT GTTCCGTTAC CTCCTTTCAG ATGCTTTCCC
+
+ 2751 ATGCCTGGAG CTACACACAG AATAACTCGC ATGTATTGAG TACTGGTTTC
+
+ 2801 ATGCCACGAA CCGTACCCCA GCACATTATA CCTATTGTGT GAAATTTGAA
+
+ 2851 TTTTATAACA TTCCAGTAAG GTAACAGAAT TATCTCGCCC ATTTTAGAGA
+
+ 2901 TAAGGAAACT AAGAATGAGA GGGTCGGTCC TCTGCTCAGG GTCCCAGAGC
+
+ 2951 TAGTGGAGGC AGTGCTGGGC CCCTGTGAGC CTCTGGATCT ATGGGTGGCA
+
+ 3001 GTCAGGCTCT CCCATTCGAC AGAGAAAAAG CCTTAGCGTT CACCTAGCCT
+
+ 3051 GGGTTTCACA GCCCAGGACA CTTTGGAAGA GGCAGAGAAC TTCATGACCA
+
+ 3101 TAGATGGAGC TGGCAATAGT AGGACTGACA CAACGGTGAC ATTGATGTCT
+
+ 3151 AGTACTGAAC CCACAGGCAA TCTCATAGCT ACCTCCAGAA GCTTTGCATG
+
+ 3201 ATTGGACCCC AGTGTGGGAA TCCTGAGAGC CAGGGCTGTG GCTGTAGCTG
+
+ 3251 GATTAAGGTA CATATGTGGG TGTCCCTGTT GAAGGAGTAT ATGTTGAAAT
+
+ 3301 GCCCGGTGCT GGGGCACTTA CTTACTCCAC CACTATCTTT TTTTTTTTTT
+
+ 3351 TTTTTTTTTT TTTGTGCTGG AGTCTTGCTC TGTTGCCCAG GCTGGAGTTC
+
+ 3401 AATGGAGTGA TCTTGGCTCA CTGCAACCTC CGCCTCCTGG GTTCAAGCGA
+
+ 3451 TTCTACTGCC TCAGCTGCAC GATTAGTTGG GATTACAGGT GTGCACCACC
+
+ 3501 ACGTCTGGCT AATTTTTGTA TTTTTAGTAG AGATGGGGTT TTGCCATGTT
+
+ 3551 GGTCAGGCTG GTCTTCGAAC TCCTGACCTC AGGTGATCTA CCCACATCAG
+
+ 3601 CCTCCCTCAG ATCGTGTCTT GCTGTTGCCC AGGCTGGAGC AGCAGTTGCG
+
+ 3651 TGACCTCGGA CTTACTGCAA CCTCTGCTCC CGGGTTCAAA CAATTCTCTG
+
+ 3701 CCTCAGCCTC CCGAGTAGCT GGGAATTACA AGTGTCTATC ACCACGCCCA
+
+ 3751 GCTAATTTTT CTATTTTTAG TAGAGATGGG CTTTTCACCA TGTTGGCCAG
+
+ 3801 GTGGTCTTGA ACTCCTGACC TCGGTGATCC ACCCACCTCG GCTTCCCACA
+
+ 3851 TCTGAGTGTC ATGTAGCCTG ATCTGCAGCA GGGCTGTAGA TGCCATGGGT
+
+ 3901 TAGGGCACAG TGAGATTTTG CTCAGGTATT AGATGGAGAA CTTTGGACTT
+
+ 3951 TCTGCTTTAA GGGGAATGTT TAGAGCCTAG TCTCgTTTGA TTTTCTTGTG
+
+ 4001 CACTGCCACC CCCCATTCCA CTTTCATCCA GGTTTACTGA GACATTGGGG
+
+ 4051 TGAGTGTGTT CAGAGCCCCT TTGTTCTGCT GCAGGTCCCT TCTGTGTCTC
+
+ 4101 TATACCCAGA CAAGCCAAGA GCCTCCCTGT GGAAAAGGAG ACTGTTTGTG
+
+ 4151 CAGTCAAGGA GTGACAGGGC CTGGTGTGAG GGGTGGTGGG GCAGAAGAAG
+
+ 4201 AAGAGAATTT GTCAGGAAGA GGCCAGAACT GGAGAGAGAC AGAACCAGGC
+
+ 4251 TACACYGCAA GTTCTATTCC CCTTACAAGG TATCTAAACG TAAGGAAGTT
+
+ 4301 GCTGAACTTC TGTTCCACAT GAGAATGGTG ATAATAGATT CAGCCTTGCA
+
+ 4351 GAGCAGTCGA GTGGTTTTCT AAGCTTACGT TGTAATTTGT GTTGGTACAG
+
+ 4401 AGCACCCAGC ACCGTGTAGA ATCTTCGTAA GTGTTAGCTG TTACTGTGGT
+
+ 4451 ACAACATTAC CTAAAGGAAG TTGGAAGAGT TAACTCAGCA AATCTGGGGA
+
+ 4501 CCCTAAGAAG CTGTGTGATG CCTCAGCACT TGAGCCCACA TGGAAAGGCT
+
+ 4551 GTGCCAGGGC CCTGACCTGC TGTGTCTGCA GTGGGGTTGT CCCACCGCTC
+
+ 4601 ATGGGCAGCT GACCTTGAGT TCTGGCCTTA TTTTCCCCCC TCTCAGGGCT
+
+ 4651 TGGAGAAGAT CTCTGCCTAC ATGAAGTCCA GCCGCTTCCT CCCAAGACCT
+
+ 4701 GTGTTCTCAA AGATGGCTGT CTGGGGCAAC AAGTAGGGCC TTGAAGGCAG
+
+ 4751 GAGGTGGGAG TGAGGAGCCC ATACTCAGCC TGCTGCCCAG GCTGTGCAGC
+
+ 4801 GCAGCTGGAC TCTGCATCCC AGCACCTGCC TCCTCGTTCC TTTCTCCTGT
+
+ 4851 TTATTCCCAT CTTTACTCCC AAGACTTCAT TGTCCCTCTT CACTCCCCCT
+
+ 4901 AAACCCCTGT CCCATGCAGG CCCTTTGAAG CCTCAGCTAC CCACTATCCT
+
+ 4951 TCGTGAACAT CCCCTCCCAT CATTACCCTT CCCTGCACTA AAGCCAGCCT
+
+ 5001 GACCTTCCTT CCTGTTAGTG GTTGTGTCTG CTTTAAAGCC TGCCTGGCCC
+
+ 5051 CTCGCCTGTG GAGCTCAGCC CCGAGCTGTC CCCGTGTTGC ATGAAGGAGC
+
+ 5101 AGCATTGACT GGTTTACAGG CCCTGCTCCT GCAGCATGGT CCCTGCCTAG
+
+ 5151 GCCTACCTGA TGGAAGTAAA GCCTCAACCA CAc
+
diff --git a/seq/hsgstm1b.seq b/seq/hsgstm1b.seq
new file mode 100644
index 0000000..8a7123c
--- /dev/null
+++ b/seq/hsgstm1b.seq
@@ -0,0 +1,40 @@
+>gi|31932|emb|X68676|HSGSTM1B H.sapiens GSTM1b gene for glutathione S-transferase
+ATGCCCATGATACTGGGGTACTGGGACATCCGTGGGGTAAGCGAGGGTCCTCTGGTGGGTGGGACAGGGG
+GCGGAGGCGGGGATGTGTGGAGTAGCTGCAGGACTGGCTCTAGGGACCGTTCCTCTTCAGGGCTGCCCGC
+CTCAGAAGGGCCTGTGCATGACGCTGTGTGTGTGTTTGGGGGTGGGGGCGGGTAGAGGAGGCGACGGGTA
+CGTGCAGTATAGACTAGGGCTGGCCTGGTGCAGAGAAAGTCACCAAGTCAGGGACCCTCCATCTCTGACC
+CGAGCCGCGGCCATCTCTCCCAGCTGGCCCACGCCATCCGCCTGCTCCTGGAATACACAGACTCAAGCTA
+CGAGGAAAAGAAGTATACGATGGGGGACGGTAATGACACCCTTGTGTCCGGGCTCTGCACTCACGCTGAG
+TTGGCACCAAGCAACCCATGGTGGCCACCTGTCGTACCTCTGCAGGCCTCCCCTGCTGGAGCTGCAGGCT
+GTCCCTTCCCTGAGCCCCGGTGAGGAGTCCTGTGGCCTTGCAAGGCAGAATGCTGGGGCGGGATAGTGGG
+TCCCTGTTTAATTGGGTTGGGTGTCCTCAGAGCTTCCCAAACCCTGGAAGCCTTAGCCGTGTGGGGTCCA
+GAGCCTCAGCGGGATTATTTGTCCCTGAACCCTGGGATGTGGGACTGAGTGGTCAGATTCTAGATCCACC
+TGTCTCAGGGATCTTGCCACTGGTTCCTTGGGAGGGTCCCCGGAAGGAGGGCTGGGCTCTGGGGAGGTTT
+GTTTTCACTTCTTCTTCCCCACGGCAGCTCCTGACTATGACAGAACGCAGTGGCTGAATGAAAAATTCAA
+GCTGGGCCTGGACTTTCCCAATGTAGGTGCAGGGGGAAGGGGCGGTTTTGGGGGAAAGTGCGACGTGTCT
+CTGACTGCATCTCCTCTCCCCAGATTAGAGGTGTTCGGATCAGGAGTCTTCTGCCCAATTCCTGGTTGTC
+TACACAGCCCCTGCATGATGTTCTGTGTCCCAGCTCATTTGTTCATGTGACAGTATTTCTATGTCAGGCC
+TGCATGAGCGGGCACAGTGAGTCTGGTCTCCCCTTGCATATAGGAAGGGGATGCTGGGGAGCCTGCTGGC
+CCCAACTGAGCTTCCCCGGTTTCCCATCTATCCAGCTGCCCTACTTGATTGATGGGGCTCACAAGATCAC
+CCAGAGCAACGCCATCTTGTGCTACATTGCCCGCAAGCACAACCTGTGTGAGTGTGGGTGGCTGCAATGT
+GTGGGGGGAAGGTGGCCTCCTCCTTGGCTGGGCTGTGATGCTGAGATTGAGTCTGTGTTTTGTGGGTGGC
+AGGTGGGGAGACAGAAGAGGAGAAGATTCGTGTGGACATTTTGGAGAACCAGACCATGGACAACCATATG
+CAGCTGGGCATGATCTGCTACAATCCAGAATTTGTGAGTGTCCCCAGTGAGCTGCATCTGACAGAGTTTG
+GATTTGGGGCCAGGACTCTTGCATCCTGCACACATTGGTCTTAAGTCCCTGGTACCATTCATCCTCCAAG
+TGCTTTCCCATCATCTAGCAGTATCTCTACGACTCCAATGTCATGTCAACAAAAGCAGAGGCAATTCCCA
+ACCAACCTTAGGACACGATTCCAGGCATTCCCAGGGTAGAAATTTCAGTTCCTGTATGGTAAAGTTTGTG
+TTCAGAATCTCCTTCATCAGCTCTGGCCTCTGACTTCTGTCCTGGGTCATTTCTGTCAGCCAGTTCACAT
+CACCTGCCTGCTCCTAGAATATGCAGACTCAAGTAGAAGACTCAGGAATGTAATGGCACCCTCGAATTGC
+ATCTTCTCCTCAACAGTTTTCTGAGTGCTGTCATTGACATGCACAGGGATCTGCGCATCTTCATAACAGA
+CAGCTCAGAGGCAGTCAGAGGGCCTTTATTCCTCTCCCTCCTTCCTTTCAACTTGAACTTCTCATCTCCC
+TGGAAACTAGTCAACGTTCATTGTTTTCTTCTGCCACCCCATTAGAAGGAACTTTCTACTTTCCCTGAGC
+TCCCTTAGTTCTTTGCATCCTTGATTCTGCTGGTCTGGATCCAGAGGCTGCCAGGTGCTTGGGCGCTCCT
+GGGGCTGACCCAGAGGCTATTGGGAGGTCAGTGAGGACAGATTCAGGGACAGCATCTCATTCCTCTCTGC
+CTTCTGATCAGTTTAGATAGGGTCTGACACTCAGTCAGAGTCTAAAATGCTGAGTATCCAATTGAAGCCT
+GCACTGCCCCAGTTCCAGACTTGGGGAAGATGGCTGCTTGCCCGTGCCAGCCTGGCCGTCCACAGCCCCG
+GGGAGGCCACGTCTGTGCAGGGAGCTTTTGTCCGAGGGTGGTGACAGCTGTTTTCTGCCTCAGGAGAAAC
+TGAAGCCAAAGTACTTGGAGGAACTCCCTGAAAAGCTAAAGCTCTACTCAGAGTTTCTGGGGAAGCGGCC
+ATGGTTTGCAGGAAACAAGGTAAAGGAGGAGTGATATGGGGAATGAGATCTGTTTTGCTTCACGTGTTAT
+GGAGGTTCCAGCCCACACATTCTTGGCCTTCTGCAGATCACTTTTGTAGATTTTCTCGTCTATGATGTCC
+TTGACCTCCACCGTATATTTGAGCCCAACTGCTTGGACGCCTTCCCAAATCTGAAGGACTTCATCTCCCG
+CTTTGAG
diff --git a/seq/humgstd.seq b/seq/humgstd.seq
new file mode 100644
index 0000000..5072afb
--- /dev/null
+++ b/seq/humgstd.seq
@@ -0,0 +1,20 @@
+>HUMGSTD Human glutathione transferase class mu (GST1) mRNA, complete cds.
+ GCACCAACCA GCACCATGCC CATGATACTG GGGTACTGGG ACATCCGCGG GCTGGCCCAC
+ GCCATCCGCC TGCTCCTGGA ATACACAGAC TCAAGCTATG AGGAAAAGAA GTACACGATG
+ GGGGACGCTC CTGATTATGA CAGAAGCCAG TGGCTGAATG AAAAATTCAA GCTGGGCCTG
+ GACTTTCCCA ATCTGCCCTA CTTGATTGAT GGGGCTCACA AGATCACCCA GAGCAACGCC
+ ATCTTGTGCT ACATTGCCCG CAAGCACAAC CTGTGTGGGG AGACAGAAGA GGAGAAGATT
+ CGTGTGGACA TTTTGGAGAA CCAGACCATG GACAACCATA TGCAGCTGGG CATGATCTGC
+ TACAATCCAG AATTTGAGAA ACTGAAGCCA AAGTACTTGG AGGAACTCCC TGAAAAGCTA
+ AAGCTCTACT CAGAGTTTCT GGGGAAGCGG CCATGGTTTG CAGGAAACAA GATCACTTTT
+ GTAGATTTTC TCGTCTATGA TGTCCTTGAC CTCCACCGTA TATTTGAGCC CAACTGCTTG
+ GACGCCTTCC CAAATCTGAA GGACTTCATC TCCCGCTTTG AGGGCTTGGA GAAGATCTCT
+ GCCTACATGA AGTCCAGCCG CTTCCTCCCA AGACCTGTGT TCTCAAAGAT GGCTGTCTGG
+ GGCAACAAGT AGGGCCTTGA AGGCAGGAGG TGGGAGTGAG GAGCCCATAC TCAGCCTGCT
+ GCCCAGGCTG TGCAGCGCAG CTGGACTCTG CATCCCAGCA CCTGCCTCCT CGTTCCTTTC
+ TCCTGTTTAT TCCCATCTTT ACTCCCAAGA CTTCATTGTC CCTCTTCACT CCCCCTAAAC
+ CCCTGTCCCA TGCAGGCCCT TTGAAGCCTC AGCTACCCAC TATCCTTCGT GAACATCCCC
+ TCCCATCATT ACCCTTCCCT GCACTAAAGC CAGCCTGACC TTCCTTCCTG TTAGTGGTTG
+ TGTCTGCTTT AAAGCCTGCC TGGCCCCTCG CCTGTGGAGC TCAGCCCCGA GCTGTCCCCG
+ TGTTGCATGA AGGAGCAGCA TTGACTGGTT TACAGGCCCT GCTCCTGCAG CATGGTCCCT
+ GCCTAGGCCT ACCTGATGGA AGTAAAGCCT CAACCAC
diff --git a/seq/lcbo.aa b/seq/lcbo.aa
new file mode 100644
index 0000000..bd7477b
--- /dev/null
+++ b/seq/lcbo.aa
@@ -0,0 +1,5 @@
+>LCBO - Prolactin precursor - Bovine
+MDSKGSSQKGSRLLLLLVVSNLLLCQGVVSTPVCPNGPGNCQVSLRDLFDRAVMVSHYIHDLSS
+EMFNEFDKRYAQGKGFITMALNSCHTSSLPTPEDKEQAQQTHHEVLMSLILGLLRSWNDPLYHL
+VTEVRGMKGAPDAILSRAIEIEEENKRLLEGMEMIFGQVIPGAKETEPYPVWSGLPSLQTKDED
+ARYSAFYNLLHCLRRDSSKIDTYLKLLNCRIIYNNNC*
diff --git a/seq/m1r.aa b/seq/m1r.aa
new file mode 100644
index 0000000..5beac76
--- /dev/null
+++ b/seq/m1r.aa
@@ -0,0 +1,5 @@
+>test | 40001 90043 | mgstm1
+MGCEN,
+MIDYP,
+MLLAY,
+MLLGY
diff --git a/seq/m2.aa b/seq/m2.aa
new file mode 100644
index 0000000..89b65f7
--- /dev/null
+++ b/seq/m2.aa
@@ -0,0 +1,5 @@
+>tests from mgstm1
+MILGYW,
+MLLEYT,
+MGDAPD,
+MLCYNP
diff --git a/seq/mchu.aa b/seq/mchu.aa
new file mode 100644
index 0000000..d6ca8e5
--- /dev/null
+++ b/seq/mchu.aa
@@ -0,0 +1,3 @@
+>sp|P62158|CALM_HUMAN Calmodulin; CaM
+MADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMINEVDADGNGTIDFPEFLTMMARKMKDT
+DSEEEIREAFRVFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIREADIDGDGQVNYEEFVQMMTAK
diff --git a/seq/mgstm1.3nt b/seq/mgstm1.3nt
new file mode 100644
index 0000000..ad02f34
--- /dev/null
+++ b/seq/mgstm1.3nt
@@ -0,0 +1,60 @@
+>pGT875 | 266
+ATGCCTATGATACTGGGATACTGGGTCCGCGGACTGACACACCCGATCCGCATGCTC
+CTGGAATACACAGACTCAAGCTATGATGAGAAGAGATACACCATGGGTGACGCTCCCGAC
+TTTGACAGAAGCCAGTGGCTGAAATGAGAAGTTCAAGCTGGGCCTGGACTTTCCCAATCT
+GCCTTACTTGATCGATGGATCACACAAGATCACCCAGAGCAATGCCATCCTGCGCTACCT
+TGCCCGAAAGCACCACCTGGATGGAGAGACAGAGGAGGAGAGGATCCGTGCAGACATTGT
+GGAGAACCAGGTCATGGACACCCGCATGCAGCTCATCATGCTCTGTTACAACCCTGACTT
+TGAGAAGCAGAAGCCAGAGTTCTTGAAGACCATCCCTGAGAAAATGAAGCTCTACTCTGA
+GTTCCTGGGCAAGAGGCCATGGTTTTGCAGGGGACAAGGTCACCTATGTGGATTTCCTTG
+CTTATGACATTCTTGACCAGTACCGTATGTTTGAGCCCAAGTGCCTGGACGCCTTCCCAA
+ACCTGAGGGACTTCCTGGCCCGCTTCGAGGGCCTCAAGAAGATCTCTGCCTACATGAAGA
+GTAGCCGCTACATCGCAACACCTATATTTTCAAAGATGGCCCACTGGAGTAACAAGTAGG
+CCCTTGCTACACGGGCACTCACTAGGAGGACCTGTCCACACTGGGGATCCTGCAGGCCCT
+GGGTGGGGACAGCACCCTGGCCTTCTGCACTGTGGCTCCTGGTTCTCTCTCCTTCCCGCT
+CCCTTCTGCAGCTTGGTCAGCCCCATCTCCTCACCCTCTTCCCAGTCAAGTCCACACAGC
+CTTCATTCTCCCCAGTTTCTTTCACATGGCCCCTTCTTCATTGGCTCCCTGACCCAACCT
+CACAGCCCGTTTCTGCGAACTGAGGTCTGTCCTGAACTCACGCTTCCTAGAATTACCCCG
+ATGGTCAACACTATCTTAGTGCTAGCCCTCCCTAGAGTTACCCCGAAGTCAATACTTGAG
+TGCCAGCCTGTTCCTGGTGGAGTAGCCTCCCCAGGTCTGTCTCGTCTACAATAAAGTCTG
+AAACACACTT
+NNNNNNNNNN
+GCTGAAGCCAGTTTGAGAAGACCACAGCACCAGCACCATGCCTATGATACTGGGATACTG
+GAACGTCCGCGGACTGACACACCCGATCCGCATGCTCCTGGAATACACAGACTCAAGCTA
+TGATGAGAAGAGATACACCATGGGTGACGCTCCCGACTTTGACAGAAGCCAGTGGCTGAA
+TGAGAAGTTCAAGCTGGGCCTGGACTTTCCCAATCTGCCTTACTTGATCGATGGATCACA
+CAAGATCACCCAGAGCAATGCCATCCTGCGCTACCTTGCCCGAAAGCACCACCTGGATGG
+AGAGACAGAGGAGGAGAGGATCCGTGCAGACATTGTGGAGAACCAGGTCATGGACACCCG
+CATGCAGCtCATCATGCTCTGTTACAACCCTGACTTTGAGAAGCAGAAGCCAGAGTTCTT
+GAAGACCATCCCTGAGAAAATGAAGCTCTACTCTGAGTTCCTGGGCAAGAGGCCATGGTT
+TGCAGGGGACAAGGTCACCTATGTGGATTTCCTTGCTTATGACATTCTTGACCAGTACCG
+TATgTTTGAGCCCAAGTGCCTGGACGCCTTCCCAAACCTGAGGGACTTCCTGGCCCGCTT
+CGAGGGCCTCAAGAAGATCTCTGCCTACATGAAGAGTAGCCGCTACATCGCAACACCTAT
+ATTTTCAAAGATGGCCCACTGGAGTAACAAGTAGGCCCTTGCTACACGGGCACTCACTAG
+GAGGACCTGTCCACACTGGGgATCCTGCAGGCCCTGGGTGGGGACAGCACCCTGGCCTTC
+TGCACTGTGGCTCCTGGTTCTCTCTCCTTCCCGCTCCCTTCTGCAGCTTGGTCAGCCCCA
+TCTCCTCACCCTCTTCCCAGTCAAGTCCACACAGCCTTCATTCTCCCCAGTTTCTTTCAC
+ATGGCCCCTTCTTCATTGGCTCCCTGACCCAACCTCACAGCCCGTTTCTGCGAACTGAGG
+TCTGTCCTGAACTCACGCTTCCTAGAATTACCCCGATGGTCAACACTATCTTAGTGCTAG
+CCCTCCCTAGAGTTACCCCGAAGTCAATACTTGAGTGCCAGCCTGTTCCTGGTGGAGTAG
+CCTCCCCAGGTCTGTCTCGTCTACAATAAAGTCTGAAACACACTT
+NNNNNNNNNN
+GCTGAAGCCTAGTTTGAGAAGACCACCAGCACCACCACCATGCCTATGATATGGGATACTG
+GAAAGTCCGCGGACTGACACACCCGATCCGCATGCTCCTGGAATACACAGACCCAAGTTA
+TGATGAGAAGAGATACACTATGGGTGACGGCTCCCGACTTTGACAGACAGTGGCTGA
+ATGAGAAGNTTCAAGCTGGGCCTGGAATTTCCCTAATCTGCCTTACTTGATCGATGGATCA
+CACAAGATCACCCAGAGAATGCCATCCTGCGCTACCTGGCCACAAAGCCCACCTGGAGGA
+GATGACAGAGGAGGAGAGGATCCGTGCAGACATTGTGGAGAACCAGATAGCATGGAAACC
+CGCTGCAGCNNNNCATGCTCTCGTTACAACCTTGACTTTGAGAAGCAGAAGCCAGAGTTC
+TTGAAGACCATCCCTGAGAAAATGAGCTCTACTCTGAGTTCCTGGGATGCAAGAGGCCATGGT
+TTGCATGGGACAAGTGTCACCTATGTGGATTTCTTTGCTTATGACATTCTTGACCAGTAC
+CGTATGTTTGAGCCAAGTGCCTGGACGCCTTCCCAAACCTGAGGTGACTTCCTGGCCCGC
+TTCGAGGGCCTCAAGAAGATCTCTGCTCTACATGAAGAGTAGCCGGTACATCGGCACAGC
+TCATATTTACAAAGATGGCCCACTGGAGTAACAAGCAGGCCCTTGCTACACGGCACTCAC
+TAGGAGGACCTGTCCNNACTGGTGGCTCCTGCAGTCCCTGTGTGGGGACAAGCACCCTGG
+CCTTCTGCACTGTGGCTCCTGGTTCCTCTCCTCCCGCTCCCTTCTGCAGTTGGTCAGCCC
+CATCTCCTCACCCTCTTCCCAGTCAAGGCCACACGCCTTCATTCGTCCCCGTCTTCTTTC
+ACATGGCCTCCTTCTTCGATTGGCTCCCTGACCCACACCTCACAGCCCGTTTCTGCGAAC
+TGAGGTCTGTCTGAACTCACGCTTCCTAGAATTACCCCGATGGTCAACCACTATCTTAGT
+GCTAGCCCTGCCCTAGAGTTACCCGAAGTCAATACTTGAAGTGCCAGCCTGCTTCCTGGT
+GGTAGTAGCCTCCCCAGGTCGGTCTCGTCTACAATAAAGTCATGAAACACACT
diff --git a/seq/mgstm1.aa b/seq/mgstm1.aa
new file mode 100644
index 0000000..7c7f121
--- /dev/null
+++ b/seq/mgstm1.aa
@@ -0,0 +1,5 @@
+>sp|P10649|GSTM1_MOUSE Glutathione S-transferase Mu 1; GST 1-1; GST class-mu 1; Glutathione S-transferase GT8.7; pmGT10
+MPMILGYWNVRGLTHPIRMLLEYTDSSYDEKRYTMGDAPDFDRSQWLNEKFKLGLDFPNL
+PYLIDGSHKITQSNAILRYLARKHHLDGETEEERIRADIVENQVMDTRMQLIMLCYNPDF
+EKQKPEFLKTIPEKMKLYSEFLGKRPWFAGDKVTYVDFLAYDILDQYRMFEPKCLDAFPN
+LRDFLARFEGLKKISAYMKSSRYIATPIFSKMAHWSNK
diff --git a/seq/mgstm1.aaa b/seq/mgstm1.aaa
new file mode 100644
index 0000000..5a9fef5
--- /dev/null
+++ b/seq/mgstm1.aaa
@@ -0,0 +1,8 @@
+>GT8.7 | 266 40001 90043 | transl. of pa875.con, 19 to 675
+MPMILGY at WNVRGLT#HPIRMLLEY at T#DS*S*Y at DEKR
+Y at T#MGDAPDFDRS*QWLNEKFKLGLDFPNLPY at LI
+DGS*HKIT#QSNAILRY at LARKHHLDGET#EEERIR
+ADIVENQVMDT#RMQLIMLCY at NPDFEKQKPEFL
+KT#IPEKMKLY at SEFLGKRPWFAGDKVT#Y at VDFLA
+Y at DILDQY@RMFEPKCLDAFPNLRDFLARFEGLK
+KISAY at MKSSRY@IAT#PIFSKMAHWSNK
diff --git a/seq/mgstm1.e05 b/seq/mgstm1.e05
new file mode 100644
index 0000000..bddfc21
--- /dev/null
+++ b/seq/mgstm1.e05
@@ -0,0 +1,20 @@
+>pGT875 | 266 with an average of 5% of residues modified by mutr.
+GCTGAAGCCTAGTTTGAGAAGACCACCAGCACCACCACCATGCCTATGATATGGGATACTG
+GAAAGTCCGCGGACTGACACACCCGATCCGCATGCTCCTGGAATACACAGACCCAAGTTA
+TGATGAGAAGAGATACACTATGGGTGACGGCTCCCGACTTTGACAGACAGTGGCTGA
+ATGAGAAGNTTCAAGCTGGGCCTGGAATTTCCCTAATCTGCCTTACTTGATCGATGGATCA
+CACAAGATCACCCAGAGAATGCCATCCTGCGCTACCTGGCCACAAAGCCCACCTGGAGGA
+GATGACAGAGGAGGAGAGGATCCGTGCAGACATTGTGGAGAACCAGATAGCATGGAAACC
+CGCTGCAGCNNNNCATGCTCTCGTTACAACCTTGACTTTGAGAAGCAGAAGCCAGAGTTC
+TTGAAGACCATCCCTGAGAAAATGAGCTCTACTCTGAGTTCCTGGGATGCAAGAGGCCATGGT
+TTGCATGGGACAAGTGTCACCTATGTGGATTTCTTTGCTTATGACATTCTTGACCAGTAC
+CGTATGTTTGAGCCAAGTGCCTGGACGCCTTCCCAAACCTGAGGTGACTTCCTGGCCCGC
+TTCGAGGGCCTCAAGAAGATCTCTGCTCTACATGAAGAGTAGCCGGTACATCGGCACAGC
+TCATATTTACAAAGATGGCCCACTGGAGTAACAAGCAGGCCCTTGCTACACGGCACTCAC
+TAGGAGGACCTGTCCNNACTGGTGGCTCCTGCAGTCCCTGTGTGGGGACAAGCACCCTGG
+CCTTCTGCACTGTGGCTCCTGGTTCCTCTCCTCCCGCTCCCTTCTGCAGTTGGTCAGCCC
+CATCTCCTCACCCTCTTCCCAGTCAAGGCCACACGCCTTCATTCGTCCCCGTCTTCTTTC
+ACATGGCCTCCTTCTTCGATTGGCTCCCTGACCCACACCTCACAGCCCGTTTCTGCGAAC
+TGAGGTCTGTCTGAACTCACGCTTCCTAGAATTACCCCGATGGTCAACCACTATCTTAGT
+GCTAGCCCTGCCCTAGAGTTACCCGAAGTCAATACTTGAAGTGCCAGCCTGCTTCCTGGT
+GGTAGTAGCCTCCCCAGGTCGGTCTCGTCTACAATAAAGTCATGAAACACACT
diff --git a/seq/mgstm1.eeq b/seq/mgstm1.eeq
new file mode 100644
index 0000000..3413d53
--- /dev/null
+++ b/seq/mgstm1.eeq
@@ -0,0 +1,20 @@
+>mgstm1 | 266
+ATGCCTATGATACTGGATACTGGAACGTCCGCGGACTGACACACCCGATCCGCATGCTC
+CTGGAATACACAGACTCAAGCTATAGATGAGAAGAGATACACCATGGGTGACGCTCCCGAC
+TTTGACAGAAGCCAGTGGCTGAAATGAGAAGTTCAAGCCTGGGCCTGGACTTTCCCAATCT
+GCCTTACTTATCGATGGATCACACAAGATCACCCAGAGCAATGCCATCCTGCGCTACCT
+TGCCCGAAAGCACCACCTGGATGGAGAGACAGAGGAGGAGAGGACCGTGCAGACATTGT
+GGAGAAGGCAGGTCATGGACACCCGCATGCAGCTCATCATGCTCTGTTACAACCCTGACTT
+TGAGAAGCAGAAGCCAGAGTTCTTGAAGACCATCCCTGAGAAAATGAAGCTCTACTCTGA
+GTTCCTGGCAAGAGGCCATGGTTTTGCAGGGGACAAGGTCACCTATGTGGATTTCCTTG
+CTTATGACATTCTTGACCAGTACCGTTGTTTGAGCCCAAGTGCCTGGACGCCTTCCCAA
+ACCTGAGGGACTTCCTTTGGCCCGCTTCGAGGGCCTCAAGAAGATCTCTGCCTACATGAAGA
+GTAGCCGCTACATCGCAACACCTATATTTTCAAAGATCCCACTGGAGTAACAAGTAGG
+CCCTTGCTACACGGGCACACTCACTAGGAGGACCTGTCCACACTGGGGATCCTGCAGGCCCT
+GGGTGGGGACAGCACCCTGGCCTTCACTGTGGCTCCTGGTTCTCTCTCCTTCCCGCT
+CCCTTCTGCAGCTTGTTTGTCAGCCCCATCTCCTCACCCTCTTCCCAGTCAAGTCCACACAGC
+CTTCATTCTCCCCAGTTTCTTTCACATGGCCCCTTCTTCTTGGCTCCTGACCCAACCT
+CACAGCCCGTTTCTGCGAATGAGGTCTGTCCTGAACTCACGCTTCCTAGAATTACCCCG
+ATGGTCAACACTATCTTAGTGCTAGCACCTCCCTAGAGTTACCCCGAAGTCAATACTTGAG
+TGCCAGCCTGTTCCTGGTGGAGTAGCCTCCCCAGGTCTGTCTCGTCTACAATAAAGTCTGC
+AAACACACTT
diff --git a/seq/mgstm1.esq b/seq/mgstm1.esq
new file mode 100644
index 0000000..056e881
--- /dev/null
+++ b/seq/mgstm1.esq
@@ -0,0 +1,20 @@
+>mgstm1e
+ATGCCTATGATACTGGGATACTGGGTCCGCGGACTGACACACCCGATCCGCATGCTC
+CTGGAATACACAGACTCAAGCTATGATGAGAAGAGATACACCATGGGTGACGCTCCCGAC
+TTTGACAGAAGCCAGTGGCTGAAATGAGAAGTTCAAGCTGGGCCTGGACTTTCCCAATCT
+GCCTTACTTGATCGATGGATCACACAAGATCACCCAGAGCAATGCCATCCTGCGCTACCT
+TGCCCGAAAGCACCACCTGGATGGAGAGACAGAGGAGGAGAGGATCCGTGCAGACATTGT
+GGAGAACCAGGTCATGGACACCCGCATGCAGCTCATCATGCTCTGTTACAACCCTGACTT
+TGAGAAGCAGAAGCCAGAGTTCTTGAAGACCATCCCTGAGAAAATGAAGCTCTACTCTGA
+GTTCCTGGGCAAGAGGCCATGGTTTTGCAGGGGACAAGGTCACCTATGTGGATTTCCTTG
+CTTATGACATTCTTGACCAGTACCGTATGTTTGAGCCCAAGTGCCTGGACGCCTTCCCAA
+ACCTGAGGGACTTCCTGGCCCGCTTCGAGGGCCTCAAGAAGATCTCTGCCTACATGAAGA
+GTAGCCGCTACATCGCAACACCTATATTTTCAAAGATGGCCCACTGGAGTAACAAGTAGG
+CCCTTGCTACACGGGCACTCACTAGGAGGACCTGTCCACACTGGGGATCCTGCAGGCCCT
+GGGTGGGGACAGCACCCTGGCCTTCTGCACTGTGGCTCCTGGTTCTCTCTCCTTCCCGCT
+CCCTTCTGCAGCTTGGTCAGCCCCATCTCCTCACCCTCTTCCCAGTCAAGTCCACACAGC
+CTTCATTCTCCCCAGTTTCTTTCACATGGCCCCTTCTTCATTGGCTCCCTGACCCAACCT
+CACAGCCCGTTTCTGCGAACTGAGGTCTGTCCTGAACTCACGCTTCCTAGAATTACCCCG
+ATGGTCAACACTATCTTAGTGCTAGCCCTCCCTAGAGTTACCCCGAAGTCAATACTTGAG
+TGCCAGCCTGTTCCTGGTGGAGTAGCCTCCCCAGGTCTGTCTCGTCTACAATAAAGTCTG
+AAACACACTT
diff --git a/seq/mgstm1.gcg b/seq/mgstm1.gcg
new file mode 100644
index 0000000..8d75321
--- /dev/null
+++ b/seq/mgstm1.gcg
@@ -0,0 +1,13 @@
+GT8.7 transl. of pa875.con, 19 to 675
+ gt87 Length: 217 July 31, 1996 19:51 Type: P Check: 9358 ..
+
+ 1 PMILGYWNVR GLTHPIRMLL EYTDSSYDEK RYTMGDAPDF DRSQWLNEKF
+
+ 51 KLGLDFPNLP YLIDGSHKIT QSNAILRYLA RKHHLDGETE EERIRADIVE
+
+ 101 NQVMDTRMQL IMLCYNPDFE KQKPEFLKTI PEKMKLYSEF LGKRPWFAGD
+
+ 151 KVTYVDFLAY DILDQYRMFE PKCLDAFPNL RDFLARFEGL KKISAYMKSS
+
+ 201 RYIATPIFSK MAHWSNK
+
diff --git a/seq/mgstm1.lc b/seq/mgstm1.lc
new file mode 100644
index 0000000..b6ac6b2
--- /dev/null
+++ b/seq/mgstm1.lc
@@ -0,0 +1,8 @@
+>GT8.7 | 40001 ! 90043 | transl. of pa875.con, 19 to 675
+MPMILGYWNVRGLTHPIRMLLEYTDSSYDEKR
+ytmgdapdfdrsqwlnekfklgldfpnlpyli
+DGSHKITQSNAILRYLARKHHLDGETEEERIR
+adivenqvmdtrmqlimlcynpdfekqkpefl
+KTIPEKMKLYSEFLGKRPWFAGDKVTYVDFLA
+ydildqyrmfepkcldafpnlrdflarfeglk
+KISAYMKSSRYIATPIFSKMAHWSNK
diff --git a/seq/mgstm1.nt b/seq/mgstm1.nt
new file mode 100644
index 0000000..3f266a3
--- /dev/null
+++ b/seq/mgstm1.nt
@@ -0,0 +1,12 @@
+>pGT875
+ATGCCTATGATACTGGGATACTGGAACGTCCGCGGACTGACACACCCGATCCGCATGCTC
+CTGGAATACACAGACTCAAGCTATGATGAGAAGAGATACACCATGGGTGACGCTCCCGAC
+TTTGACAGAAGCCAGTGGCTGAATGAGAAGTTCAAGCTGGGCCTGGACTTTCCCAATCTG
+CCTTACTTGATCGATGGATCACACAAGATCACCCAGAGCAATGCCATCCTGCGCTACCTT
+GCCCGAAAGCACCACCTGGATGGAGAGACAGAGGAGGAGAGGATCCGTGCAGACATTGTG
+GAGAACCAGGTCATGGACACCCGCATGCAGCTCATCATGCTCTGTTACAACCCTGACTTT
+GAGAAGCAGAAGCCAGAGTTCTTGAAGACCATCCCTGAGAAAATGAAGCTCTACTCTGAG
+TTCCTGGGCAAGAGGCCATGGTTTGCAGGGGACAAGGTCACCTATGTGGATTTCCTTGCT
+TATGACATTCTTGACCAGTACCGTATGTTTGAGCCCAAGTGCCTGGACGCCTTCCCAAAC
+CTGAGGGACTTCCTGGCCCGCTTCGAGGGCCTCAAGAAGATCTCTGCCTACATGAAGAGT
+AGCCGCTACATCGCAACACCTATATTTTCAAAGATGGCCCACTGGAGTAACAAGTAG
diff --git a/seq/mgstm1.nt1 b/seq/mgstm1.nt1
new file mode 100644
index 0000000..db5e404
--- /dev/null
+++ b/seq/mgstm1.nt1
@@ -0,0 +1,12 @@
+>pGT875 | 266
+ATGCCTATGATACTGGGATACTGGAACGTCCGCGGACTGACACACCCGATCCGCATGCTC
+CTGGAATACACAGACTCAAGCTATGATGAGAAGAGATACACCATGGGTGACGCTCCCGAC
+TTTGACAGAAGCCAGTGGCTGAATGAGAAGTTCAAGCTGGGCCTGGACTTTCCCAATCTG
+CCTTACTTGATCGATGGATCACACAAGATCACCCAGAGCAATGCCATCCTGCGCTACCTT
+GCCCGAAAGCACCACCTGGATGGAGAGACAGAGGAGGAGAGGATCCGTGCAGACATTGTG
+GAGAACCAGGTCATGGACACCCGCATGCAGCtCATCATGCTCTGTTACAACCCTGACTTT
+GAGAAGCAGAAGCCAGAGTTCTTGAAGACCATCCCTGAGAAAATGAAGCTCTACTCTGAG
+TTCCTGGGCAAGAGGCCATGGTTTGCAGGGGACAAGGTCACCTATGTGGATTTCCTTGCT
+TATGACATTCTTGACCAGTACCGTATgTTTGAGCCCAAGTGCCTGGACGCCTTCCCAAAC
+CTGAGGGACTTCCTGGCCCGCTTCGAGGGCCTCAAGAAGATCTCTGCCTACATGAAGAGT
+AGCCGCTACATCGCAACACCTATATTTTCAAAGATGGCCCACTGGAGTAACAAGTAG
diff --git a/seq/mgstm1.nt12r b/seq/mgstm1.nt12r
new file mode 100644
index 0000000..f4cf7c5
--- /dev/null
+++ b/seq/mgstm1.nt12r
@@ -0,0 +1,26 @@
+>rev-comp
+CTACTTGTTACTCCAGTGGGCCATCTTTGAAAATATAGGTGTTGCGATGTAGCGG
+CTACTCTTCATGTAGGCAGAGATCTTCTTGAGGCCCTCGAAGCGGGCCAGGAAGTCCCTC
+AGGTTTGGGAAGGCGTCCAGGCACTTGGGCTCAAAcATACGGTACTGGTCAAGAATGTCA
+TAAGCAAGGAAATCCACATAGGTGACCTTGTCCCCTGCAAACCATGGCCTCTTGCCCAGG
+AACTCAGAGTAGAGCTTCATTTTCTCAGGGATGGTCTTCAAGAACTCTGGCTTCTGCTTC
+TCAAAGTCAGGGTTGTAACAGAGCATGATGaGCTGCATGCGGGTGTCCATGACCTGGTTC
+TCCACAATGTCTGCACGGATCCTCTCCTCCTCTGTCTCTCCATCCAGGTGGTGCTTTCGG
+GCAAGGTAGCGCAGGATGGCATTGCTCTGGGTGATCTTGTGTGATCCATCGATCAAGTAA
+GGCAGATTGGGAAAGTCCAGGCCCAGCTTGAACTTCTCATTCAGCCACTGGCTTCTGTCA
+AAGTCGGGAGCGTCACCCATGGTGTATCTCTTCTCATCATAGCTTGAGTCTGTGTATTCC
+AGGAGCATGCGGATCGGGTGTGTCAGTCCGCGGACGTTCCAGTATCCCAGTATCATAGGC
+AT
+NNN
+CTACTTGTTACTCCAGTGGGCCATCTTTGAAAATATAGGTGTTGCGATGTAGCGG
+CTACTCTTCATGTAGGCAGAGATCTTCTTGAGGCCCTCGAAGCGGGCCAGGAAGTCCCTC
+AGGTTTGGGAAGGCGTCCAGGCACTTGGGCTCAAAcATACGGTACTGGTCAAGAATGTCA
+TAAGCAAGGAAATCCACATAGGTGACCTTGTCCCCTGCAAACCATGGCCTCTTGCCCAGG
+AACTCAGAGTAGAGCTTCATTTTCTCAGGGATGGTCTTCAAGAACTCTGGCTTCTGCTTC
+TCAAAGTCAGGGTTGTAACAGAGCATGATGaGCTGCATGCGGGTGTCCATGACCTGGTTC
+TCCACAATGTCTGCACGGATCCTCTCCTCCTCTGTCTCTCCATCCAGGTGGTGCTTTCGG
+GCAAGGTAGCGCAGGATGGCATTGCTCTGGGTGATCTTGTGTGATCCATCGATCAAGTAA
+GGCAGATTGGGAAAGTCCAGGCCCAGCTTGAACTTCTCATTCAGCCACTGGCTTCTGTCA
+AAGTCGGGAGCGTCACCCATGGTGTATCTCTTCTCATCATAGCTTGAGTCTGTGTATTCC
+AGGAGCATGCGGATCGGGTGTGTCAGTCCGCGGACGTTCCAGTATCCCAGTATCATAGGC
+AT
diff --git a/seq/mgstm1.nt13 b/seq/mgstm1.nt13
new file mode 100644
index 0000000..4c6527e
--- /dev/null
+++ b/seq/mgstm1.nt13
@@ -0,0 +1,36 @@
+>pGT875 | 266
+ATGCCTATGATACTGGGATACTGGAACGTCCGCGGACTGACACACCCGATCGCATGCTC
+CTGGAATACACAGACTCAAGCTATGATGAGAAGAGATACACCATGGGTGACGCTCCCGAC
+TTTGACAGAAGCCAGTGGCTGAATGAGAAGTTCAAGCTGGGCGACTTTCCCAATCTG
+CCTTACTTGATCGATGGATCACACAAGATCACCCAGAGCAATGCCATCCTGCGCTACCTT
+GCCCGAAAGCACCACCTGGATGGAGAGACAGAGGAGGAGAGGATCCGTGCAGACATTGTG
+GAGAACCAGGTCATGGACACCCGCATGCAGCtCATCATCCGCTCTGTTACAACCCTGACTTT
+GAGAAGCAGAAGCCAGAGTTCTTGAAGACCATCCCTGAGAAAATGAAGCTCTACTCTGAG
+TTCCTGGGCAAGAGGCCATGGTTTGCAGGGGACAAGGTCACCTATGTGGATTTCCTTGCT
+TATGACATTCTTGACCAGTACCGTATgTTTGAGCCCAAGTCTGGACGCCTTCCCAAAC
+CTGAGGGACTTCCTGGCCCGCTTCGAGGGCCTCAAGAAGATCTCTGCCTACATGAAGAGT
+AGCCGCTACATCGCAACACCTATATTTTCAAAGATGGCCCACTGGAGTAACAAGTAG
+NNNNNNNNN
+ATGCCTATGATACTGGGATACTGGAACGTCCGCGGACTGACACACCCGATCCGCATGCTC
+CTGGAATACACAGACTCAAGCTATGATGAGAAGAGATACACCATGGGTGACGCTCCCGAC
+TTTGACAGAAGCCAGTGGCTGAATGAGAAGTTCAAGCTGGGCCTGGACTTTCCCAATCTG
+CCTTACTTGATCGATGGATCACACAAGATCACCCAGAGCAATGCCATCCTGCGCTACCTT
+GCCCGAAAGCACCACCTGGATGGAGAGACAGAGGAGGAGAGGATCCGTGCAGACATTGTG
+GAGAACCAGGTCATGGACACCCGCATGCAGCtCATCATGCTCTGTTACAACCCTGACTTT
+GAGAAGCAGAAGCCAGAGTTCTTGAAGACCATCCCTGAGAAAATGAAGCTCTACTCTGAG
+TTCCTGGGCAAGAGGCCATGGTTTGCAGGGGACAAGGTCACCTATGTGGATTTCCTTGCT
+TATGACATTCTTGACCAGTACCGTATgTTTGAGCCCAAGTGCCTGGACGCCTTCCCAAAC
+CTGAGGGACTTCCTGGCCCGCTTCGAGGGCCTCAAGAAGATCTCTGCCTACATGAAGAGT
+AGCCGCTACATCGCAACACCTATATTTTCAAAGATGGCCCACTGGAGTAACAAGTAG
+NNNNNNNNN
+ATGCCTATGATACTGGGATACTGGGTCCGCGGACTGACACACCCGATCCGCATGCTC
+CTGGAATACACAGACTCAAGCTATGATGAGAAGAGATACACCATGGGTGACGCTCCCGAC
+TTTGACAGAAGCCAGTGGCTGAAATGAGAAGTTCAAGCTGGGCCTGGACTTTCCCAATCT
+GCCTTACTTGATCGATGGATCACACAAGATCACCCAGAGCAATGCCATCCTGCGCTACCT
+TGCCCGAAAGCACCACCTGGATGGAGAGACAGAGGAGGAGAGGATCCGTGCAGACATTGT
+GGAGAACCAGGTCATGGACACCCGCATGCAGCTCATCATGCTCTGTTACAACCCTGACTT
+TGAGAAGCAGAAGCCAGAGTTCTTGAAGACCATCCCTGAGAAAATGAAGCTCTACTCTGA
+GTTCCTGGGCAAGAGGCCATGGTTTTGCAGGGGACAAGGTCACCTATGTGGATTTCCTTG
+CTTATGACATTCTTGACCAGTACCGTATGTTTGAGCCCAAGTGCCTGGACGCCTTCCCAA
+ACCTGAGGGACTTCCTGGCCCGCTTCGAGGGCCTCAAGAAGATCTCTGCCTACATGAAGA
+GTAGCCGCTACATCGCAACACCTATATTTTCAAAGATGGCCCACTGGAGTAACAAGTAG
diff --git a/seq/mgstm1.nt13r b/seq/mgstm1.nt13r
new file mode 100644
index 0000000..0768c03
--- /dev/null
+++ b/seq/mgstm1.nt13r
@@ -0,0 +1,35 @@
+>rev-comp
+CTACTTGTTACTCCAGTGGGCCATCTTTGAAAATATAGGTGTTGCGATGTAGCGGCTACT
+CTTCATGTAGGCAGAGATCTTCTTGAGGCCCTCGAAGCGGGCCAGGAAGTCCCTCAGGTT
+TGGGAAGGCGTCCAGGCACTTGGGCTCAAACATACGGTACTGGTCAAGAATGTCATAAGC
+AAGGAAATCCACATAGGTGACCTTGTCCCCTGCAAAACCATGGCCTCTTGCCCAGGAACT
+CAGAGTAGAGCTTCATTTTCTCAGGGATGGTCTTCAAGAACTCTGGCTTCTGCTTCTCAA
+AGTCAGGGTTGTAACAGAGCATGATGAGCTGCATGCGGGTGTCCATGACCTGGTTCTCCA
+CAATGTCTGCACGGATCCTCTCCTCCTCTGTCTCTCCATCCAGGTGGTGCTTTCGGGCAA
+GGTAGCGCAGGATGGCATTGCTCTGGGTGATCTTGTGTGATCCATCGATCAAGTAAGGCA
+GATTGGGAAAGTCCAGGCCCAGCTTGAACTTCTCATTTCAGCCACTGGCTTCTGTCAAAG
+TCGGGAGCGTCACCCATGGTGTATCTCTTCTCATCATAGCTTGAGTCTGTGTATTCCAGG
+AGCATGCGGATCGGGTGTGTCAGTCCGCGGACCCAGTATCCCAGTATCATAGGCATNNNN
+NNNNNCTACTTGTTACTCCAGTGGGCCATCTTTGAAAATATAGGTGTTGCGATGTAGCGG
+CTACTCTTCATGTAGGCAGAGATCTTCTTGAGGCCCTCGAAGCGGGCCAGGAAGTCCCTC
+AGGTTTGGGAAGGCGTCCAGGCACTTGGGCTCAAAcATACGGTACTGGTCAAGAATGTCA
+TAAGCAAGGAAATCCACATAGGTGACCTTGTCCCCTGCAAACCATGGCCTCTTGCCCAGG
+AACTCAGAGTAGAGCTTCATTTTCTCAGGGATGGTCTTCAAGAACTCTGGCTTCTGCTTC
+TCAAAGTCAGGGTTGTAACAGAGCATGATGaGCTGCATGCGGGTGTCCATGACCTGGTTC
+TCCACAATGTCTGCACGGATCCTCTCCTCCTCTGTCTCTCCATCCAGGTGGTGCTTTCGG
+GCAAGGTAGCGCAGGATGGCATTGCTCTGGGTGATCTTGTGTGATCCATCGATCAAGTAA
+GGCAGATTGGGAAAGTCCAGGCCCAGCTTGAACTTCTCATTCAGCCACTGGCTTCTGTCA
+AAGTCGGGAGCGTCACCCATGGTGTATCTCTTCTCATCATAGCTTGAGTCTGTGTATTCC
+AGGAGCATGCGGATCGGGTGTGTCAGTCCGCGGACGTTCCAGTATCCCAGTATCATAGGC
+ATNNNNNNNNNCTACTTGTTACTCCAGTGGGCCATCTTTGAAAATATAGGTGTTGCGATG
+TAGCGGCTACTCTTCATGTAGGCAGAGATCTTCTTGAGGCCCTCGAAGCGGGCCAGGAAG
+TCCCTCAGGTTTGGGAAGGCGTCCAGACTTGGGCTCAAAcATACGGTACTGGTCAAGAAT
+GTCATAAGCAAGGAAATCCACATAGGTGACCTTGTCCCCTGCAAACCATGGCCTCTTGCC
+CAGGAACTCAGAGTAGAGCTTCATTTTCTCAGGGATGGTCTTCAAGAACTCTGGCTTCTG
+CTTCTCAAAGTCAGGGTTGTAACAGAGCGGATGATGaGCTGCATGCGGGTGTCCATGACC
+TGGTTCTCCACAATGTCTGCACGGATCCTCTCCTCCTCTGTCTCTCCATCCAGGTGGTGC
+TTTCGGGCAAGGTAGCGCAGGATGGCATTGCTCTGGGTGATCTTGTGTGATCCATCGATC
+AAGTAAGGCAGATTGGGAAAGTCGCCCAGCTTGAACTTCTCATTCAGCCACTGGCTTCTG
+TCAAAGTCGGGAGCGTCACCCATGGTGTATCTCTTCTCATCATAGCTTGAGTCTGTGTAT
+TCCAGGAGCATGCGATCGGGTGTGTCAGTCCGCGGACGTTCCAGTATCCCAGTATCATAG
+GCAT
diff --git a/seq/mgstm1.nt1r b/seq/mgstm1.nt1r
new file mode 100644
index 0000000..b7f376c
--- /dev/null
+++ b/seq/mgstm1.nt1r
@@ -0,0 +1,13 @@
+>mgstm1-rev
+CTACTTGTTACTCCAGTGGGCCATCTTTGAAAATATAGGTGTTGCGATGTAGCGG
+CTACTCTTCATGTAGGCAGAGATCTTCTTGAGGCCCTCGAAGCGGGCCAGGAAGTCCCTC
+AGGTTTGGGAAGGCGTCCAGGCACTTGGGCTCAAAcATACGGTACTGGTCAAGAATGTCA
+TAAGCAAGGAAATCCACATAGGTGACCTTGTCCCCTGCAAACCATGGCCTCTTGCCCAGG
+AACTCAGAGTAGAGCTTCATTTTCTCAGGGATGGTCTTCAAGAACTCTGGCTTCTGCTTC
+TCAAAGTCAGGGTTGTAACAGAGCATGATGaGCTGCATGCGGGTGTCCATGACCTGGTTC
+TCCACAATGTCTGCACGGATCCTCTCCTCCTCTGTCTCTCCATCCAGGTGGTGCTTTCGG
+GCAAGGTAGCGCAGGATGGCATTGCTCTGGGTGATCTTGTGTGATCCATCGATCAAGTAA
+GGCAGATTGGGAAAGTCCAGGCCCAGCTTGAACTTCTCATTCAGCCACTGGCTTCTGTCA
+AAGTCGGGAGCGTCACCCATGGTGTATCTCTTCTCATCATAGCTTGAGTCTGTGTATTCC
+AGGAGCATGCGGATCGGGTGTGTCAGTCCGCGGACGTTCCAGTATCCCAGTATCATAGGC
+AT
diff --git a/seq/mgstm1.nts b/seq/mgstm1.nts
new file mode 100644
index 0000000..4d828c3
--- /dev/null
+++ b/seq/mgstm1.nts
@@ -0,0 +1,9 @@
+>mgstm1
+GCACCATGCCTATGAT,
+GATACACCA,
+CCATCCTGCGCTACCTTGCC,
+aaggtcacctatgtggatttccttgcttat,
+CCTGTCCACACTGGG,
+TCAAGTCCACACAGCC,
+TCACGCTTCCTA,
+CAATACTTGAGTGCCAGCC
diff --git a/seq/mgstm1.raa b/seq/mgstm1.raa
new file mode 100644
index 0000000..9bf8248
--- /dev/null
+++ b/seq/mgstm1.raa
@@ -0,0 +1,5 @@
+>mgstm1.aa shuffled
+LEGLPLKPCK RPQDRFSEDR VILFESFTYG FILAAWNMGY NEAEDMDRSH YLLTKELPKS
+YGGRRYYAPD FTYLFLILRN PPVKRAAPDR GNTMLQIFMA FLDDQYVMQD AFLPIGDGLK
+DKPMRSNMKY ITHNVYIDED IVRCKWIFAD EMSTPLLLWL MHKQKPGHRF LEKSWSHTRR
+EEEYNSIIDL KKSYKYLKNM AELKITSQTI FFDKDAE
diff --git a/seq/mgstm1.rev b/seq/mgstm1.rev
new file mode 100644
index 0000000..7903f42
--- /dev/null
+++ b/seq/mgstm1.rev
@@ -0,0 +1,16 @@
+>mgstm1 reverse complement
+AAGTGTGTTTCAGACTTTATTGTAGACGAGACAGACCTGGGGAGGCTACTCCACCAGGAACAGGCTGGCACTCAA
+GTATTGACTTCGGGGTAACTCTAGGGAGGGCTAGCACTAAGATAGTGTTGACCATCGGGGTAATTCTAGGAAGCG
+TGAGTTCAGGACAGACCTCAGTTCGCAGAAACGGGCTGTGAGGTTGGGTCAGGGAGCCAATGAAGAAGGGGCCAT
+gtgaaagaaactggggagaatgaaggctgtgtggacttgactgggaagagggtgaggagatggggctgaccaagc
+tgcagaagggagcgggaaggagagagaaccaggagccacagtgcagaaggccagggtgctgtccccacccagggc
+CTGCAGGATCCCCAGTGTGGACAGGTCCTCCTAGTGAGTGCCCGTGTAGCAAGGGCCTACTTGTTACTCCAGTGG
+GCCATCTTTGAAAATATAGGTGTTGCGATGTAGCGGCTACTCTTCATGTAGGCAGAGATCTTCTTGAGGCCCTCG
+AAGCGGGCCAGGAAGTCCCTCAGGTTTGGGAAGGCGTCCAGGCACTTGGGCTCAAACATACGGTACTGGTCAAGA
+ATGTCATAAGCAAGGAAATCCACATAGGTGACCTTGTCCCCTGCAAACCATGGCCTCTTGCCCAGGAACTCAGAG
+tagagcttcattttctcagggatggtcttcaagaactctggcttctgcttctcaaagtcagggttgtaacagagc
+atgatgagctgcatgcgggtgtccatgacctggttctccacaatgtctgcacggatcctctcctcctctgtctct
+ccatccaggtggtgctttcgggcaaggtagcgcaggatggcattgctctgggtgatcttgtgtgatccatcgatc
+AAGTAAGGCAGATTGGGAAAGTCCAGGCCCAGCTTGAACTTCTCATTCAGCCACTGGCTTCTGTCAAAGTCGGGA
+GCGTCACCCATGGTGTATCTCTTCTCATCATAGCTTGAGTCTGTGTATTCCAGGAGCATGCGGATCGGGTGTGTC
+AGTCCGCGGACGTTCCAGTATCCCAGTATCATAGGCATGGTGCTGGTGCTGTGGTCTTCTCAAACTGGCTTCAGC
diff --git a/seq/mgstm1.seq b/seq/mgstm1.seq
new file mode 100644
index 0000000..bbd1363
--- /dev/null
+++ b/seq/mgstm1.seq
@@ -0,0 +1,20 @@
+>pGT875 | 266
+gctgaagccagtttgagaagaccacagcaccagcaccATGCCTATGATACTGGGATACTG
+GAACGTCCGCGGACTGACACACCCGATCCGCATGCTCCTGGAATACACAGACTCAAGCTA
+TGATGAGAAGAGATACACCATGGGTGACGCTCCCGACTTTGACAGAAGCCAGTGGCTGAA
+TGAGAAGTTCAAGCTGGGCCTGGACTTTCCCAATCTGCCTTACTTGATCGATGGATCACA
+CAAGATCACCCAGAGCAATGCCATCCTGCGCTACCTTGCCCGAAAGCACCACCTGGATGG
+AGAGACAGAGGAGGAGAGGATCCGTGCAGACATTGTGGAGAACCAGGTCATGGACACCCG
+CATGCAGCtCATCATGCTCTGTTACAACCCTGACTTTGAGAAGCAGAAGCCAGAGTTCTT
+GAAGACCATCCCTGAGAAAATGAAGCTCTACTCTGAGTTCCTGGGCAAGAGGCCATGGTT
+TGCAGGGGACAAGGTCACCTATGTGGATTTCCTTGCTTATGACATTCTTGACCAGTACCG
+TATgTTTGAGCCCAAGTGCCTGGACGCCTTCCCAAACCTGAGGGACTTCCTGGCCCGCTT
+CGAGGGCCTCAAGAAGATCTCTGCCTACATGAAGAGTAGCCGCTACATCGCAACACCTAT
+ATTTTCAAAGATGGCCCACTGGAGTAACAAGTAGGCCCTTGCTACACGGGCACTCACTAG
+GAGGACCTGTCCACACTGGGgATCCTGCAGGCCCTGGGTGGGGACAGCACCCTGGCCTTC
+TGCACTGTGGCTCCTGGTTCTCTCTCCTTCCCGCTCCCTTCTGCAGCTTGGTCAGCCCCA
+TCTCCTCACCCTCTTCCCAGTCAAGTCCACACAGCCTTCATTCTCCCCAGTTTCTTTCAC
+ATGGCCCCTTCTTCATTGGCTCCCTGACCCAACCTCACAGCCCGTTTCTGCGAACTGAGG
+TCTGTCCTGAACTCACGCTTCCTAGAATTACCCCGATGGTCAACACTATCTTAGTGCTAG
+CCCTCCCTAGAGTTACCCCGAAGTCAATACTTGAGTGCCAGCCTGTTCCTGGTGGAGTAG
+CCTCCCCAGGTCTGTCTCGTCTACAATAAAGTCTGAAACACACTT
diff --git a/seq/mgstm1_genclone.seq b/seq/mgstm1_genclone.seq
new file mode 100644
index 0000000..7a54610
--- /dev/null
+++ b/seq/mgstm1_genclone.seq
@@ -0,0 +1,2088 @@
+>gi|22316163|emb|AL671877.15| Mouse DNA sequence from clone RP23-214E3 on chromosome 3 Contains the gene for a novel protein (0610005A07Rik), the Gstm6 gene for glutathione S-transferase mu 6, a glutathione S-transferase mu 6 (Gstm6) pseudogene, the Gstm3 gene for glutathione S-transferase mu 3, the Gstm2 gene for glutathione S-transferase mu 2, the Gstm1 gene for glutathione S-transferase mu 1, a novel gene, a ubiquitin-conjugating enzyme E2 variant 1 (Ube2v1) pseudogene, a glutathione [...]
+GAATTCTTTAGCAGAATGCCACTAGCTGTGGCAAATGCTACTTATGTGTTTAGTCATGGCCAGTCCTCTC
+AACTCAAGTCACTAAACTGCACTTTAGCTACAGAAGGAGCCTAATGCCCCAAATTAAATTCTCACGGGGC
+TTTTCCCAGGCCAGAGTCTGCCACAAGGCACAAGTGCATGTACACACATATGCATGGGTCTCTGAATCAT
+TGTTAAGGATACGTGCTGGAGGGCTGGTGAGATGGCTCAGCAGGTAAGAGCACTTACTGCTCTTCCGAAG
+GTTCTGAGTTCAAATCCCAGCAACCACATGGTGGCTCACAACCATCTGTAATGAGATCTGACACCCTCCT
+GTTGACCAGAGTGAGCAGAGGTCCTAAAATTCAATTCCCAAACACCACATGAAGTTCACAACCATCTGTA
+CAGCTACAGTGTACTCACATACATAAAATAATTTTTAAAAAATCTTTAAAAAAAAAAGATACGTGCTGGA
+TAACTTTATGTCACCCCACAAAAGCTAGAGTCATCTGAGAGGAGGGAATTTGCAGCTGTGGGGGCCCCCA
+TAAGATTGGCTCATAGGCCATTAATGATTAATGATTGTTTAATGATTAATGATTAATTAATGATGTGGGA
+GGCCTCTGGGCTGGTGGTCTTGGATGCTATAAGAAAGCACAGTGAGCCAGCCATGGAGAGCAAACTAGTA
+AGGAGCACTCCTCCATGGCCTCTGCATCAGTTCCTGCCCTGACTTCTCTCAGTGATGGACTGTGATGTAG
+AACTGTAAGCTGAAATAAACCCTTTCTTCCTGGAGTTGCTTTTAGTTATAGTGATCTGTCAAAGCAATAA
+GCAAGACAGGGCATATCATTGTCTTCATAACTATGAACAGGAAGATATATAATTTCTGACTATAATTGTG
+AAAATAATTTTTAAGAAAAACTCATGCAAATTAAAATAACAATCTTAAAAGTATAGGATGGGGTCTGGAT
+AGATGGCTCAGTAGTTAAGAGTATTTGCTGCTCTTGCAGAGGATGGGGAGTTTTACTCTCAGTGTCCATC
+TGGAGGCTCACAACCGTCGATAATTACAGTTCCAGAGGATCTGCAACAGTCTTCTGACCTCCACAGGCAA
+AACACTCATTCTTAAAACAAATTTAAAATATATTGAACAAGTACTAGAACTTTAGTGTATACCCACCTGG
+CCCTTCCTGACTATATTTTTTAAAAGTAGTTGACATCTTCCTACAGACTGGACCCTCTGCATAGCACGGT
+CCTTCTGTAGACTAGTGATTCAGGAAATAGTGGCCACACGGTCCTGTTTCTATATTCTCCAAAGAGTGGA
+GAGGAAGGCAGGGGCAGCAGGAGAGGAGGAGCCATGATGGCCACTGCTCTGAGAGCCTGCTGGCCCAATT
+CTCCCCTTCCTTGAGCAGAGCTGTTCTCCTGCAGTGTCCCCAGGAACCAGAGAAGAGGTCTGAAACAATG
+GTGAGTAGCGACCTCACAAGGACTCACTCCAGAGTTGCCAGACCCTCTGGATCTCTGAGTGCTTGGTCCC
+AACTGCACAGTGTGGACAGTTCAAATTGTTCAAAGGGACTCTGCAGGAGCAGCAAAATGATCTGGGTAAA
+ATGTGGCCAGGGCCTAGAGAGTGCCCTTGGGACTGTGTGTCCTAGCTCCTTCTAGATCTGTAGCTAACAT
+TCTAATATTCTCAAGGCTAATTCATAGAAATTTCAAATAAGATCAGGTCGGAATGTAAGATTTTTAACAT
+ACAAATGCTTAGATATAAATGCTGACAACATGGGGTGCCTCCGGTGGAAACGCCTTCATCCAAACTTAGA
+AAATCCGGAATCTAAAGAGAAGAAAATAAAATCTCATTCGTACAGTGCTGGCAAACTAGAATTCAGTAAT
+CAAAATTTCCCACATGTTTCAGATATTAAAAATAAGGCAAAAATAAAGGTTTTCAGAAAGCAGAAAAGCA
+ACAAGATCTGAAATTACAGAAAACAAAAAAACATTTCTAAGAAGAAAAATGGGGAAGCTTAAAATTCTTT
+TTGTTGTTATTTATTTTTATTTTATGTGTATTGACGTTTTGCCTGCATGTGTGTCTGTGCACCACGTGTG
+TGCAGTGCCCTCAGAGGCCAGAAGAGGTAAGCAAATCCTCTGGAACTGAAGTTACAGATGGGTGTGAGCC
+ACTAGGTGGATGCTGGGACCTGAAACCAGATCTTCTAGAAGAACAACCAGTGCTCTTAATGGCTGGATCA
+ACTCTCCGGCTACAAAACAGGAAACTTTTGCCAATATGCTTTTTAATGGGATAGCTGGTTTTCTAAAAAC
+ATTCTAAATGGAGAAAATATTAGTCCACCAAGCTAAACATAACTTTAGAAAAGGTTGTCTGGCCTGCCCA
+TGGCCATATGCAGCAAGCCCTGGGGACTCTTTTGGAGATGTCGTTCACATACCAAGGCCACAGGATAACT
+GCAGCTAGTTAGGAGTCTAACTTGAATAGATACACTGTGAAGTGACTTTTAAGTTACACCCTTTGTCTCT
+CTGTCTCTCTCTCTCTCTCTCTTTCCCCAAGATAGGGTCTCTCTGTATAGCCCTGGCTGTCCTGGAACTC
+ATTCTGTAGACCAGGCTAGCCTTGAACTCAGAAATCCACCTGCCTCTGCTTCCCAAATGCTGGGATTAAA
+GGCATGCGCCACCACTGCCCGGCTGTATATCACTCTAAAAACATAGAATTTAGGACTGGAGAGATGGCTC
+AGTGGATAAGAGCTCTGACTATTCTTCCAGAGGTCCTGAGTTCAATTCTCAGAAACCACATGGTGGCTCA
+CAACCATCTGTAATGGAATCTGATGCCCTCTTCTGGTGTGTCTGAAGACAGCTACAGTGTACTCACTTAA
+AAAAACAAACAAACATAGAATTTAGTACCTTAGTCTAGAGATTCATTTGTCTGGCCACTCCTTCACAGAG
+TTGCTGTTCACTGTAGAAAGGTACAAAAGCATCAGCCTTTACTCGTTGCTTTAGACTTCAGTCTTGTGAA
+GATTCCCACATCTGCATAGATTATGAAATGTTTGTGGTTTTCTCTTACATTACTTGGGCTCCTAGATCCA
+ACCAGCAATCCCAAATGAGAACCAACCTCTAACCACTTGACACCCAATGTGTGGTTCGTCCTTGCTGACT
+GGAGCAGGGCTCTGCCCACCCCTAACCTCATCATAGGGGGCTGCTCCAGCTGAGATCCTGGGCCTCAAGC
+AAAAGATGAGGCTGCCTTCAGGTCTTTCTGTGTTGCCTCTCAGAATCAGTGCAGCTACAGGAAAGAAACT
+GTAGCGATCTTTCTAAGTCTGCTGTGTGCACTGGCCTGCAGACTGGCCAGTTTATGGCTTCTGTTATAAC
+AGCATTTCTCTTCTCTTCTCTTCTCTTCTCTTCTCTTCTCTTCTCTTCTCTTCTCTTCTCTTCTCTTCTC
+TTCTCTTCTCTTCTCTTCTCTTCTCTTCTCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCC
+TTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCTCTTTTCTTTTCTTTTCTTTTCCTTTT
+AAGACAAAAGTTTGCTGTGGATCCCAAGAAGAAAACAAAATAAAAACAAACAAACAAAAAACCCAGCAAA
+GCAAGGACAGCAAAAAGGTGACGTTCCCCTCCCGAGTTCCATCATAAGAGTTTGAGTTTGGTTTTTCTGC
+TTTCTGAGAGCACAAGAGGGCTTGGCAGCTGCTGAGACCTGGGAGACCTGAGGAAGCAACATTCTGTGTC
+CTGCCAGCCTTGAGGAGTCTGTGCAAAAGGACTTTCCTCACTCACTACTGCCTGGCCAGTTCTGGCAAAC
+TCCCTTCCCACTATCACTCGCCCAGTGTCTCAGATGAGCAGGTCTGTCTGTGGCTGGAAGCTACTGGATT
+TCTGTAGGGAACTGGAGGTCTCATTCATACAGGCAAACACCTTTAATCTCCTATATCGACAGGGGTCTTT
+TATTGTTTCAGTCAGTTCCCAAGAGTGAATTTGGATCCTAGCTGTGACACTCCATATCTAGTTTAAGAAC
+AGCTCTGGCTTCCTAATCCGTTTCTGCTCACACAATCCTGTCTGCAGTCCCGTTTCTCCCAGCTTTATCC
+TGGATTCTGTTCCACGTTGGCAGTTTTCCTTTGTGTTTAATGATCTCTCCTGACCCACTGCAAATGTTTC
+AAGTGTAATGAAATTTTATCAGCAAAACATTGCACAGGGTTGATACAAAGAGCCGTCTCTGCTGGCCCCT
+GCCCTTTGCACTGAGAACTGGCCAGGTAGGATCTCTGGCTAGTTTTCTCAGGCAAATCTTACCACGATTT
+TAAAATGTTTTTTTAAAAAACACGTCGTAGCCGGGCGTGGTGGCGCACGCCTTTAAACCCAGCACTCGGG
+AGGCCGAGGCAGGCGGATTTCTGAGTTCGAGGCCAGCCTGGTCTACAAAGTGAGTTCCAGGACATCCAGG
+GCTATACAGAGAAACCCTGTCTCAAAATAAATAAATAAATAAATAAATAAATAAATAAATAAATAAATAA
+ATAAATACAAACACAATGTAAGGTGCCTCTTTAGTATGGGATAGCGTTTCTGCCTCATAAAAATGCATCT
+TAAACTCCTTTCTTAATGAGTCCCCTTGGGCTCCAGAGGGGGGCGCTGTTTTGAGTATTTTGATTGTGCG
+GCTCTTGAATGTGGGGCATGATGTTTTCTTCTCCCATGGATAGCTTTTGAATTCCTGCTTGTCTTGCTGC
+TGTTGATGTTCCTTCTCTGAGGGGGACACATAGTGTATTGAGGCTTTGTGTACAGAGGGCCAATCAGAAT
+GGGGCCTGAAAAGACAGTTAGGCAATAAGTGTGGGTAACAACTCTTGTGAGTGATAGGAAATCCTTTCTT
+CTTTTGCCTCGTATTGGAAGTCCTTTGAATTTTGACAGAGAACATTATTTCCTTGTTGGAGAGCTTGATT
+TTTTTTTTTTTTTTGATCCTTTAAAATAAATCTTAAAAAAAAAATTTTTTTTTTAAAGCAGGAGCTGTGT
+GCATGCTAGACACGTGCTCTACCAGACAGATTTCTGACACCCTCTCATTATTATTTGTTAGGCTCTCACA
+CATAAATAACGGTTATAGTTTACATTGCTTGTTTAAGCATGCCTCTGGTTTAAAATTCTGCTTCATTTGT
+AGAGCATGACAACAACTTTGCTTTATTCTCCCCCTACCAGCTGAAACGAATTACAAACTCACATGAGAAA
+CAATTTAGGGTCTGGGGATATAGCTAAATGATACGGAACACTACATGCATGAGTTCTGGGTTCAACCTGC
+ATGCTTGTGCGCGTGCATGCACACACACACACACACACACATATACACACACATACACACACAAGAGTTG
+TCAACACATTTTAAAAGACAAACCCTTTGTAGATTTGTAGTGGTCTGTCTTCCACTTTAGCAAAACACCT
+GAGGTAATCAACTTGAAAAGAGGAAAGGTGGGGGCTGGAGAGATGGCTCAGTGGTTAAGAGCACTGTTTG
+CTGCTCTTTCAAAGGTACTGAGTCCAATCCCCAGTAACCATGTGGTGGCTCACAAACATCTATACTAGGA
+TCTGGCGCCCTCTTCTGGCATATGAAGATAAAGCATGCACGTCCATTAAATAAATAAACAAAGAAACCAA
+AAAACCCAAAAAGATGAAAAGAGTATTTTGGTTACATTTGGGATGTCTTAGTCCATGGTTCACAGCCACT
+GCTTCGGAGCCTGCAGCAAGACAATTCATCACATTGAGAGCATGTACCCGAGGAAGCTCCTTCACTTTAT
+GACTGAAAATGAGAGAGACAGAGACAGAGTCAGAGAGAGAGAGACAGACAGAGAGAGACAGAGAGAGAGT
+TACACACTGAGAGAGGAAGAGAGATCAGAGTCCCACAATTCCTTTCAAGGCGGTATTCCTATTCCATAAA
+AGCTCCCACTATATTCTACCTCTTAACTTTTCTACCAGTTCCCAGTCACACCAAGCTAAGGACCATCATC
+CTTAACATGTGGGCCTTTGGAAAACATTCCAGATCAAAACTAAAGAACCACTCTAAATTCAAACCAGAGA
+AAGTTTTTGTTTTGTTGAAATGGGATCTCACAACCCCGAAAGCTGGCCTTGAACTAGTGATGTTGCAGAG
+GGTGGGCCTGAGTTCCTGTGTCTCCTGATTCCACTTCCCAGGTGCTGGATGTATGGGTACATCACCACGC
+CTGGCCAGAGAACTCTTGTTTCCATTTGTTGCAGTTTGGATGTCGAATATCCCAAGGTCTGCATAATAAA
+GGCTTGGCGCTCAGAAAGTCCTATTGGGAGGTGATAAACCTGTGAGAGGTAGAGCCCGAGGGAGATTATT
+ATGGTCTGGATAAGAAATGCCCTTCCAAAGAGGTTATGATGCTAATCTTGTCAGCTTAACCCACCTGCAG
+GGAGAGAACCTTAGATGAAGAACTGCCACCATCAGGTTAGCCTGCAGACATAGTTGTGGGGCATAACCTT
+GATTGTGGTTACCACAGAAGGGCCTAGCCCATTGTTGGTGGTACAATCTCTAGGTGGTTGACCTGGCATT
+ATAGCATGGTAATAGAGTCTTGGAGAGATGACACCTTACAGGTGGTCCTTGGGACTTTGGAGGCACACCT
+TTAAAAGGCACCCCACATTCTTCCTCCTTTGCTCTCTGGTGTAAACGGAGTGGTTTTCTCTGCCATATAG
+TCTTGGTGTGATGTATTCCTCATAACAGGTCCAAATGATGGGGCCAATCAGCTAAGGACTGGAACCTCCA
+TGACTATGAACTAAAAATTTATCTTTTTGAGTTAATTATCTTAGGCATTGCTTTACAATAACACAAAGTT
+GAGCAAGGCACCACCTTAGCTAAAAATCTGTGCACTGATGTAAATGAACGAATTACCAACTATACAATTA
+AGGAGTCATATCTACCTTTTGAGATTCAGGTTTTGTCAGTTCTTGGGGAAATTGGAAATTAAGTTCCAAT
+CTTGTTCTTGGCACCTTCTAGGAGGTCATCTGACCCAAAATGATGCTGAGCATCTCCTTCCAGACACTGG
+CTGTCAGAAACGTACTGAAGTGGATGATGAACACAAACTTCGCAGTTCCCACAAGACGTGCATGGCTGCA
+GACATCACTGCTGACGCTCTGGGAGAAGAGTGAAAAGGTTATGTGTAGTGGGTCCTGCCTATTATGTTAA
+TTTGGGTCTCCAAAATTGTTTGCATGAGAATCTGCACATCCACATGTAAGGCAATTGAGGGACCCTGCTC
+CCAGTTGGTTCTGATTGGTAAATAAAGTTGCTGGCGACCAATGGCTGGGTAGGACAGGACAGACAAAGGC
+GGGACTTTTAGTATTCCTGGATGAGGGCAAGGTACCAAGAAGGAAGGGAGGAGATCTGCTATGCTGGGGG
+TAGGGGTGGGAAGGAGAGAAGAGATGCCACTCCTGAGAGGGACAGAGAGCTTAGCCGCCCTGTAGGAGCT
+TGGGGAACTCAGCCCAAGAGGGCTGCACATCTGGGTCCAGGGCAGCCAAGATGGAATATAGGTTTTAGTA
+TGTAGTAACTCGGGAATATCGGGGCGGGGGGGGGGAGGGGAGGGAGGTGGATTAGCCACATGGCAGTTAG
+GAAGCGTCCCAGTCATTGAGCTGATTAAGTCATATCAAAATATAAAGGCTGTGTGAGTGTGTCTTTCTTT
+TGGAAACCCAGAACACTGGGGTGGGGTGGGTAGGGAGGAACCTGCTGCCAGGATCAATTAATTGGCAATT
+AATGCCAAGAGTCTTCTCTGTTTGACTCTATAAATTAAACCATGGGTTTTTAATGAGCACAAATCCATAT
+GAGCTAAATCCTTTTTGGATGACCTCTGCTCTTTTGAATTGCATGAAAGTCCACAGGTCCCCTTGATAAA
+ACAAGAACACATGAGTCACCACCAGTAACAGTGAGGTGAAATTCATCCCGGGCCTGGCGCATGAGCAAGG
+GTGAGCAGGAATATAGAAGCCTCTTCATTCTACGCATGAGCTGCGGAGACCATTTTCATTAGAGAATGCC
+TTCGGAAAGCAGTTGCTAATTACCAGTTGGACCATAGTCCCAGCCACTGGATAGGTCTTTGAGAGATCAC
+AGTGTATCTAGAGGTGGTGTAGAAAAGGTAACATTTACTGAATTTTACCAACAGAGAAGCTAACAGAGAA
+ACTTACAGTAGGGTCAAAGGGCATCCACAAGAGGAGCATGTTTCAAAGCATTGGAAGGTTTCATGACACC
+GTCAGACCAAGAGCACATGTTCTCTCTACAGCTCTCTTATAAAAACCCAGGGCACAGTCTGTGCCCCACA
+AAAGCCAAAGTCATCCACCCATCCTTGAGGGGTCGACCGACATACCAGATGAATAGTTAAAAGCAGAAAA
+GGATTAATTCAATGCAGCCACATTAAAAACAAGGACAAAGACCCAGAAACTTCTTCTGGTCCATCTTTGG
+GATCCTGAAATGAGATTAAAGCTTAAATGGCTGGTCAGGAAGATGATCATCTAAGCAGTGTCTGTCAAGG
+TGTCTGCTATTGATGCATTGTCTGTGGCCGGTGCTCTAGCAGGTCCAAGCATGACAAACATGGTTCCGAC
+AGGTTTTTCCCCTTCCCCTGCCCTCTCCCTTGCTAAACCATTAGATTCTATTCCTACAGATTCCATTGTT
+AGTTTGCTCTGTCCTTATGGGACCAGCAATCCAAGGAACAGCTGATTTCATATACACACAAGGAGCAGGT
+TCTAGTCTATATAACACCAGTTCGCCCCAGTCCCCAGAAGCCAGAGAGGGAGTCAGTGGTGTTCCATATG
+GTCCTTTAGCTGAACAGCCTGGTCGTGGCGCACTCTCAATAGAGGATATAGTGTCCAATGTTCTTCTTTC
+TTCACTCATCATACACCATCAGCCAGGAAAGGGGCTTTGTACTGGATGAAAATTTCTCCTGTAGCTGTGG
+TAGTGAATTGTTGACCGTTGTCCTGTGCCTTTAACCCCTAGGGTGGCCTGGGACTTCTGATACTGGTTGA
+AGTCCAGACCATTAAAATAGCCAGTGGCGTGGAGTTTTTGATCCAGTTCCAAATAAGATGACTCTGTACA
+GAGTGTGACACCTTGAAGAGATCGTTGAAGAGACTATAGAAGCATCAAAGGCTTTGTGCTCCGGTTGTAG
+AACACGGTTGTTGTTAAGTGCTGGATCCTGGCAGCTCTTAATTCATTCCTTCAGTCAACACATTAACAGT
+GCCTTGGAATATGATTTTTTACCAAGTAAAAGAATTGTACTAGAAATTAATATGATATGAGAAGTTTATT
+TCACCTGAGAAACTAGAGAACATTCAGAGGGTGATGGGGAATTCCCTAGAACCCTGTGTCTGGCTGCTGG
+TACTGGAGTGAAAACTTCAGCTTGTGTCCACCACAGGAAAGCTCATGGAGGCTGAGGTGTGCTAAAAGAA
+ACAGAATCTTAGCCAGCCCTTCACCAGGCCAGGTGAGCATCTTAGCCAGCCCTCCTCCAGGCCAGGTGAG
+CATCTTAGCCAGCCCTCCACCAGGTTAGGTGAGCATCTTAGCCAGCCCTTCACCAGGCCAGGTGAGCATC
+TTAGCCAGCCCTTCACCAGGCCAGGTGAGCATCTTAGCCAGCCCTTCACCAGGCCAGGTGAGCATCTTAG
+CCAGCCCTTCACCAGGCCAGGTGAGCATCTTAGCCAGCCCTCCACCAGGTTAGGTGAGCATCTTAGCCAG
+CCCTCCTCCAGGCCAGGTGAGCATCTTAGCCAGCCCTCCTCCAGGCCAGGTGAGCATCTTAGCCAGCCCT
+CCACCAGGTTAGGTGAGCATCTTAGCCAGCCCTTCACCAGGCCAGGTGAGCATCTTAGCCAGCCCTCCTC
+CAGGCCAGGTGAGCATCTTAGCCAGCCCTCCACCAGGTTAGGTGAGCATCTTAGCCAGCCCTCCACCAGG
+TTAGGTGTGTATGCAAAGCCAAACAATGAGCCTTTCTTGAGAATGGCTTGAAAGTGTGATGTTTACAAAC
+AAAAACATAAGTTTATCTTTTATGAAAAAATTCTATAAAGAGAAAAATCATGAAATAAAAAGGTTGAAAC
+CCTTGGAAACCTTTACTTTAGATAACAACATTTTGTCAGGGTCTGATGTGATCTGGTAGTTTCAGTTTCC
+CATTCTCTCCTAAAAAGAGTGTACACATTTTATTTCGTCTTACACATCACATATCCATTTTTATCTGCTT
+GTTTACAAATAACATATAATGTATTGGGATTGAAATAGGCAATCTAAAGGAATGTGCATCACCCATGCCC
+CTCTTCCTTAGACTTTAGTTAAAGGTGGCAGTACTAACTAATAATAGCCTCTTTGGGAGGATAAAATATA
+TTTTTCTCCTGTTAGAAAACTGTAGCGTGTAGTTTTAAAAATGATCCATGCTAAATTGGTAATGCACTGA
+CATTGGTGGTCTCGCCCACCCCCATGGCCAGCTCACTGCCAAGCCGCTTGCACTTCCCAGGCTATCTGAC
+CCCTGTGGCTAGCCTGCAGAATGCCAGCTATGGCCACTCAGTTTCCCTTAGCCCAGTAAAATAATAAAAC
+AAGTGTTCAACACGTGATGGATTTGAAGTCGGATATTTAAATCAATATGATCAGTGCCTGTTCAACGATT
+GACATAAAATGAAGTCTCTGTCTCTCTCTCTCTGTCTCTCTGTCTCTCTCTCTCTCTGTCTCTCTGTCTC
+TCTCTCTCTCTCTGTCTCTCTCTCTGTCTCTCTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTG
+TGTGTTTAATGTTTGGCGTTTAAATTGACTAAGCCCTGGGGAAATGTTAGATAAACAGTTTTACCCAGAG
+AAAGTCCTAGGCCTTGATCCACCAAGTGGAAACAAGAACAAGCAGTAACAATTAGTCAAGAACGAAGAGG
+AGAAACATCAAAGAATCATTATAATTATAAACTTCAAAGGTACTGTGAAGTGGAACAATGGACTCCAAGT
+GCCTTGGTGACACATGAAAGGGGCCACATGTCTTCCTGGTGACACCATAAATGTATTCATAGAACAGGCA
+CTTAATATCTTAAAACACCACACAGCTGGTCATTCCCTGGGCCCCAGACTGTATCTTTTGGGGATCGTTG
+AAAAAACAGTAACAAAAAAAAAAAAAAAAAAAAGGAAGACAATATATTTAAAGGCAACCATAGTGGTGTA
+ATGAACGCTAAAAGCTTAGAGGTGGCCAGTAGTGTGGAAAGGATTAGGAAAGTTGTACTTTATTTACAGG
+AAGGAATAATAGATGATGGCTAAGTCTTATAAATGTACATGTGAGCAGAATGATAGTATGTCCCACTTTA
+AAAAATATGTATACACCACAGGGAGATAGAGTGGTTTACTGGATAAAGTGTTTCACATGAGGTCCTGCAT
+TCAGATCCTAATATAAGCTAGGCTTGGCTTGGGCACCTGTAGTCCCATGTCTGGGAGACAGAGGATCCTA
+GAAGTTCACTCCCCAGCCACAGTAGCCAACAGATTCAGTTAATGATTCTATTTCAAAACTTAAGGTGGAG
+AACAATAGAGAAAGGCACCTAGTGTCTACCTCTAACCTCCACGTGTTCACACACAGACAATTGTACCTAA
+ACACACTTGTGTGCACACACACATCATGCACACACAGAAACACACACACAGAAACACACACACACATCAC
+ATATAACACATACACTTATGGTCTTCTGTTTTTTTTTTTCTTTCTTATACAAGAGATCCAACCCAGAGTT
+CTACATATGCTAGGCAAATACTCTACCACGAGCCACACAGCCAGCTCCAATATGTTTTAAGTAACATGCT
+AAAGAAATAATGAGCAGAACCTCTAAGGAGCAGAGGAAATGCTGAGTGAGCAGTCTTGAGTGGATGTGTA
+GTTTGACATGGGCACAGCCACATAGAGGACTACAGGACCTCCTGGGACTATTGGCAGAACCATTACTCAT
+ATGGCCCAGTCTCAAGAGGAACAGCAAAGCTTTTCTTGGGAGTCTCAATGTGAGTCTTCATGAACAATTG
+TTCAGTGACTGCAGCTCCCTCCTCCAGGATGTTCACAGCCTGAGATTGCTTGACTACAGAGACCTAGATT
+AGCTCACAGCCCCCCTCCACCCCCCCCCCAGCCCCTATACTCTTCCTAATAATCACACTGAAGATCCAGG
+TTACAATGTTGTTAGAACCCTACCCAACCCCAGCTTCACTGTATGTGTGTCTGTCATTTCTTCATTCCCT
+CGCTGACATTGCCATCCCAAGCCACACTGATCCCAGGAAGAGCATATGATTAAAGCAGGCATTATTTAAA
+GTCAAGAGAATCTGAAGCAATGTCGCTAAATAGGCATTTATTGAACCCCTACTATGTTCAGGGCATTTCC
+CAGGTGGTAAACACTTAAAACACTAGTGAACAAGAAGAAACTAAAATACATGCCATTTTGTCATTAGCAT
+CTATGGAAAGAAGCATACAATAAATGTGAATAATAAGGTTATATATTTCCACTGATGGGTGGAACTCACA
+TGCAATAAATGTGTAGCTATTAAGTGTACGTCTGTGTGAGTTTCGAATCTGTATTCTTTGTGTGATTACC
+ATCTAGATCTAGATAAGGAATGTTCTCAGCACTGCATAAGCTCCTGTCAGCAAATCACCCTGCCAAAGTC
+AACATGCATTTGGCCCATCACACTTCCCATCTATAGACAATGATAAAGAATGCACATCCCACCTGTCTGG
+ATCAACATGATGCCATTGTCAACATTAGTCAACATTATGTCTACATGCTGTGGTGTAGAACATCACTACA
+TTTATCTTAACTTATAATATACTACTTCGTTATATGATAATGGCTTTATACGAATAAACTATACCTGATT
+TACTCATTTTCTGCTGATGAAATTGCAGAGTAAGCAGTTTCCCACATTCTTCCCTCAGTCTTTCATGGCT
+TTTGGGAGACAAAAGCTTTCATTCTGGTTGGGAGACAGACCTAGGTAAGAGGTCACTACTTGCTCAACAC
+TAGTTGATATTGTCAATGTCCCATAGTGGTTATACTCCCAGCGTTACACAAGTTCCATTATTTCACAAAT
+TACCAGCCATTGTCAGCCTGCTGCATCTTAGCCATTCTACTTGGCCTACGTTAAGCTACTACTGTGTTTA
+CACTAACTCTGCTAGTTAATGGTGCTTGTTGTCATTTTACAGGCTAAAAGGCTACTTGGATGTTCACTTT
+TAGGAAATACCTGTTTAATACCTGTTGATTCATTTTGTAAGGTATCTTTCTGGTTATATTACTTTTGAAT
+AAGAGGTCTTTGTTGAGTAGCCATGTTCTCCCATCCAAGCATGGGCCTGAATGCCTACCCATAAACCAGC
+TCTTTCTGGAACCATGAATGGGAATCACTGAGAAACTGAACCTCCTAGAGCTTGCTCTCGGGTGTTATTT
+TAAATTGCTTCAGGAATACATGAAGCAAATGCAGAACTGGTGAAGGAGGTATCTAAATAGACAAGGTTAT
+ATACACCAACAAAGTACATAAAAGGATGTGAGGTACCGCAAAGCAGGTCTCCCTACTGGCTTTTGACCTA
+GAGACTTCAGAAGGGCCAAAGCAACAGCCTTCAATACTGAAGATCATCTACAAGGGGTTTTGAATCCATT
+GGAGTCGGGCTCCTTACCAGCTGTCTGTTGGTGTAACAACCGACCAAAGAAAAACAATGTAAGCGGAGAA
+TGATTTATTCTGGTTCATGGTGCAGGAAGAACAGCTCACAGGAGCAGGGAGAGGGGTAGGAGAGGGTCAC
+AGTTGCAATGACAAGTGCTGGAGTCACGGCACACCTAGCTCATAGCTTGTCTCAGGAAACAGAGCAAGCA
+GAAGAGTGTTTTGGCTACAGTGGTCAACACACCTAGCATTCTACAGTCGCCATATGGACCCCACCTGCCG
+GAGAGCACACAGCTGCCCAAAATAGCCATGAGCTGGGAACCAAGTATCTGAACACATGAGTCTCTAAGTG
+GTGGTGGTGGGGCAGGGGGACTTTAGCTTTAGAAAGGGCTACTTTGCTAATGAAAAGGAACTGTTAGGGA
+TTTTAAGAGACTGTGTTGGGCAGAGATTGGATGACCAAAAACGAGAGCCGGTGAGGCTTGGGATCTGGAA
+CAGGATGGAAGGAAAGAAGAAGGAAAAGTTGGCTGTGGGGCTGCAGGGATCCTGCCTGCACCCCAGGGCT
+CCTGGCAAACTCAGGAGATCTGGTATCTTTATCCTACAGCCCACCCTGTCAAGGTCCTAATTGCTGCCCC
+AAGTTGCCATCTTTGTGAACATGGGTCTTGGGAGGAAGCGACTGGTCTTCATGTAGTCGGAGATCTTCTT
+CAGGCCCTGCGTGTGAGGACAGAGAGGGAGGTCAGGCCAGGGGCCCCACTGGGATAAGCAGCCCCACTCC
+AGAAACAGCTTTCCCTGTAGACTCTAATATTAGAGTTCTAGAAAGGGGAATCCTCGGTGGCCTATGCGAA
+GATGCCTGTGCATCGGGATCGTGCCCATTATAGATTATGACCTACGCTCAGAAAACACACTCCATCCTCC
+AAGATCAGTCTCACTGTGCCAACGTGTCAGCAAAGAGGTTATGTGGCTTCCTCCTGCTGGCACAGCTAGT
+CCATGAGACAGGACCTGAGTCAAGCTGGCTCTATAGCTCCCAATCTCTTCCCTCTCTACCCAGGCCTTTC
+CTGTCTTCCCACAGGAGACCCTGCCACTCCTTGCCTTCACAGACACGATCCTTTCTCACAGGGAACATCC
+TTGCTGGTTTACACAGAAACCTTCTGTAGCAGGACAAAGATGTCCATGACAGGCCAGTGCAGCAGGACAG
+AGAACCCAGACCTGAGCCAGGGACCCAGCAAACCCAGGATCAAATGCACAAGGGGAGGAGGGCCAGCTAA
+GGATGCTGAGCCAGCTCTAATTGTTCTCCTGCAAGTAGAGGTCCAACCCGCTCTCTGTGGAGCCTCAGGC
+CAATCTCACTGAGTCCCAAAGCTGCCTCATATCCAGCCCTGCTCCATGGGGCCATGCAAAGAGACACTGG
+GAGTGTTTGTTCCATCTTACCAGGAGAGCCTGCTCAGATAAATTCTGCATGGTTCAGAGACCTAGAGCTA
+ACTGCAATCAAGAAACCAGGTGCAGCAGGCCTCAGGCTGCAGCCCTCAGCTCTTCTATCCTTCCCTATCA
+TGACCACAGCAGCCCTGGTACACTTTCAAGGCCACTGGGTCAGGACCAGACCCAGAATTGGTGAAACCAG
+CTGAGAAGATATATCACTTGATATAGGGACAAGATTCCAGACTGCAAGCCTGTTGTTCTAATGCCTGCAG
+TTCCCAAATCTCTGAAAGCATATTCTGTAGGTGCTAAGTAGACTCTGGGGGACCCATGACTAGCTATGCA
+GTGAGGGCCAAGCTGTGACTCACAGCTCCCGGGTGACTCTGTGCCGGTTACGTTCACTGCACTCTGCTAT
+ACTTGGTCTCTGAATGGAGTTCTTTCCATGCCTCACTGGGCCTGATACTGAGCCTCATACAGAGGAGGGG
+GCCAGTGAATAGGAGTCTAAATAAGTATATAAATACATTGTAGAAATGGCCTCATGGAGACATGGGGAAA
+AGAAAAGGGCAAAGGCCCTATAACCATACAACTCTTTAAGATAAAATCCTGAATAGGGCTCACATACACT
+CTCAAATCTAGCACCAAAAACATCTCTGGGGTTTGGCATGTGCTCAGATGCGGATGGACCTGAGTGAAAC
+TTTCTGAAGACCTGGTATTGTCTACACGCGGCTATTTCACACAAATCCCACCATGCCTCCCTCTCTGAAT
+GTCTGCGGCTTACACACTCCTGCCACGTTCTGCCTTTCCCCAGCATTCTGGACTGAGTAACCTAGGCAGC
+ACAAGGACCCGAGGCTCTTCCTTTTGACACAAAATAGGAAATGTCTACTCCCTGCTGGCCTTAAAGGCCA
+CACACCCCCAGGGAGCTGGGCACAACCTCTGCTAGTAAGGGACTTAGCCTCTATAGTCTGTTTCCTTATC
+TGTCCCATAGGACTCATAATTCAATCTGCCCTGCAAGTTGGTTGTAATGATAATTTTTACCCAATAGATA
+ACGTATGCTGAGCCAGAAAGCAAGCACTCAGTACATGCTACTGGCTTTTGCTGATTTTAAGCTCAGTTCC
+AGGGCCAAGGGCCTTGCAAGCAGAGTGGGGAAGGGAGGACCTCAAAGAAGAACGAGGATTTAGAGCGTCC
+GTCACCTCAAAGCGCGCTATGAAGTCCTTCAGGTTTGGGAACGCATCCAGGCACTTGGCCTCAAACACTT
+GGTTCCTCTCAAGCACATCGTAAGCAATGAAATCCACAAAGGTGATCTGTAGGAAGCCGAGGGTGAGTGT
+GCTTGGATCAGGTAACCTGGGAAGGATGGCAACCCCTCCTCCCCACCCCTCCCTTTACCTTGTCCCCTGC
+GAACCATGGCCGCTTGCCCAGGAACTCAGAGTAAAGCCTCATCATTCCAGGGAGTTGCTCCAGGTACCCT
+GGCTTCAGCTTCTCCTAGAGCGGAACACACTGTGATCACCCTCAGTCACAAGATTCCGAGCAGAGGCCGC
+CCAAACAGGGCGACAAAGACCCCATCTGGGCAGAGGTGCAGCCACCTTCCCGAGACTAGGCCTGGGTTAA
+TTCTGCTTTAATTTATGATGTCACTATTATTAATTCATTATGTCTATTGATTAGAGTCCTACAGGCACCA
+GATCCAATATCAGAGGTGATGAGGATCTACACAGGGGCACGCTGTCCCTGAGACTCACCACAGGTGCTCA
+GAACAACTCTTGTGAGTCACACCCAGGACTGTGTGAGCTGGCTCTCGCATTCAGATGTAATGTGAGTCAG
+AGAAGCAAATAACTAAAGCATTCTGGAAGAGAAGTTGGATGTTCTGCAGGAAGAAGAACGGTAGGGCTCT
+GGGGCAGTTTCAGAAAGATGAAGTGTGATGGGGTGAAAGGAGATGAGGCAAGAAGAGGCTTAGCCAGCTG
+TAGACTGCCTTTTGGGAACACTTCCTTATTACTGTGAGTATCCCAGGCATGGCTCTGGGTATCTGGTTGG
+GAAGAGTCCCCAGGTGATGGCTCCCATCCATTTCCAGCTTACTCCACTTTTTCTCATTGTTGGAATGTGT
+ACTATAGAAGCAATGATGATGTAAGACATAACCGAGTAGGAGACATAACCCAGGCCAGAAATGGGTGATC
+AGAGCTGAGAGATTCTAAACACAGTCTTTACTACCCAGCTACCGACACTGGACTGTGGAGTGTTTGTTCT
+AGTTGCTGAAGTTCATTGTAAGTGCCATTTGACAAGACTCTATGTTGGCAGGGTGTGTGTGTGTGTGTGT
+GTGTGTGTGTGTGTGTGTGTGTGTGTACAAGAGTGTGTGTGTACGTGCATGGAAGGTTTGGTTTTGTTTT
+AGTGCTTTTGATTTTGAGACAGTGTAACCAGGAAGACCGTAACCTTAGTCCTTCTGTCTCAGTCTCCATA
+CTGTGTCACCATGACTCCGCCAGCAGTTCAGAGTATCATCTGTAGGTGAGCAGCGCTAAGACGTTAGATG
+CTAAGCAAACAGAGCAGATGAGAAGGCAAAGGTACAAGCTCTGGTTCCCCCACCTCATCCAGCCCAGAGC
+AGGACTCACAAAGTCAGCGTTGTAGCAAAGCCTCGCCAGCACCATGCGGTTGTCCATAAGCTGATTCTCC
+AGAATATCCACACGGATCCTTTCCTCCTCTGTCTCCCCACCTGTGAAAAACACAGCCCAGACTCACACTC
+AACATGCCACCCTGGCCAAGCCCAGCTGGTGGCCATCGGTCCCCACCCCTGCAGCCGGCCCCACTCACAC
+AGGTTGTGCTTGCGGCCAAGGTAGCGCAGGATGGCATTGCTCTGCGTGATCTTGTGTGACCCATCGATCA
+AGTAGGGCAGCTGAATGGACAAGCATGGAGAGTCAGATGGGATATGAAATCCCAGAATCCCTTCCTTGGG
+TGAGGACAGACACGAGGCCTGGCACTCAGTGTGCCTGCCATGGAAAACCTTCTATGAAAATGCTGCTCAC
+TGAACAGAGAGCTGGGATACAGCCCATCACAGAAAGATGGTGTGGACAACCAGGAATGCGAAGATGAAAG
+CCAAAGGACTGATCCCATCTGTATAAACAGGGACACCACCACCATCTCCACACCTCCCCTCTCCTGTACT
+TACGTTAGGAAAGTCCAGGCCCAGCTTGAATTTCTCATTCAGCCACTGGCTTTGGTCATAGTCAGGAGCT
+GTGGTGGACAAGATAATATGAAATCAAAGGACCCATGTGTTCAAACCCCTTTCCCAAGATTTTCCCAAGG
+GGTCAGAGACAAGACTCTCCAAAGCAGTGGACCTGCAAGCTGACTCACAAAGTCATACTGCAAGCTTGCA
+AGGATGAAGATGTTTGTAAGGGCTGTGAGCTTCTTCATCGCTAAAGCTTGTGAAAGATTTAGCCAATTGC
+TAGAAGAATGGAGCCCAGTTATAAATACAGAGTACACTCAAATCTTTCCTTAGCATCTTCCATTGGCAAA
+TCAAACCACACCATCAAGGGTTTTGGAAGGGGCATGTTACAGCTCCAACAAAAAAGTGTCAGAGCTGAGT
+GATGAGGTTGGTTGCTGGGCAACAATGGACAGGAGTTGGGCAGGGATCTGACTAGAAAGGATGCAGTCAC
+CATCACCCATGGTGTATCTCTTCTCCTCATAGCTTGAGTCTGTGTATTCCAGGAACAGGCGAATGGCATG
+GGCTAGCTGTAAGAGACAAGAGATGGTAAAATTCTTTACCCCTTCTACACGAGTCCTCATTCTCTGACCT
+AGTTTCACTATGTCCATGGATGGACCAGAACACACACACACACACACACACACACACACACACACACACA
+CACACATACACAGCTGCAAGAGCATGCCTCCATTGACTGCACGGATCCTTGTCCCTCCTCCAAGTCCTAC
+TCTGCATTCGGACCCCTCTCTCACCCCACGGATGTCCCAGTAACCCAGTGTCATGGGCATGGTGCCGAGG
+TCTGAACGGACAGGCTCAGTGTCAATCTAGCCTTGGCTGTTCAGAGAGCTGCCAGAAACGGATTGGCTTC
+GCTTTGTCAGGTCTTCAGACTGCCCTTCCCCACCTCCCAGTTCCAGCCAATACCGGCATCCAGGAACAAG
+GCCAGGGCCGCCCCCAGAGCCAAGGCTGCCTGAGCTCCATTGTTATGGTGGTGGCTTGGCTGAGTGCCAA
+GAGCGAAAGGCGGGGCTGGGTTTGCTAGCTTCTTATTGGTGGTGCTGGGTTTCCGCTGTAGGACTGCTAT
+TTTCTACCCTCTAGTCTAGATGGCAGCAGCATTCCCCAGTGGTGACAGTTCGGAAGGCCTAGGACCTTGA
+CAACTGTGTCCTGAGGGCAAAATTATCTTTTATACTGTGAATACTTAAAGAACGAAGAACTGGAAAACAT
+GATAATACAAATATATTCATGAATGCAAAACAAATTTTTCTGATGAAATTTATTTCATTTAAGTGTGTGT
+GTGTGTGTGTCCTGGATCACATAGTAAATGATGTATCTGGAGTTTTTTTTCTGTTTTTTATTCTGATATA
+ACATTAATCTTTGAAAAAATTCACAAAAGAAATGAGTGCCCACTTAATCACCTGTATACTGTAGCTGCTT
+CCTTGTTCGGGTGCCCTCCGCTAAACTCCGCCTTCCTTGGCTTGCCTTTCCCTCTCTCCTGATCTTCCCA
+CCCCCTACCTTTGACTCTTTCCAGCATGAGTACAAAATCTTATAACCAAGAGTCATTTCATTTTCTTTCC
+CACTAACACCATGCCGGGGATGCGTGAGACCCTGGAGTAGCTGTGATTGATAATCACCTCTGCCGTGCTG
+CTTCCTGGCCAGGCTTGCACTTGGACCTGTGACTTCTGACAATTGACGGAACATGTTTCTCTTTACAGAG
+CCTTGAAAGCCTTGTGCCCAATTCCCGAGGAACAAGTGTTCTTTGTCTTATGTTTCTTTTTTATCCTTTT
+CTCTTTTGCGATGGCTTTAGAGGTTGGCCTTGGGACCCTGTGTAACCCAGGCTGGTCTTGGATTAGGCTC
+AGTTCTCCTGTCTCAGCTCCCGAGTGCTAAGATTAAGGACATGGAGTGCCACACCTGTCTCTTTTAGGAA
+GCACCGCCAACTCACTCAGATCTCAACTCTTTTAAACACGTTTTATGTTTATGTTGGTTTCATTTGGTTG
+TGTTAAAATGTAGCTGAGGGTGAGGAAACCTGCCTTAGTTTGAGATAGCATTGCTCTGATTTTTTTCGAC
+CAACTTTTATTAATGTCTGATCTCAGTGACATTGTGGGGTGGGCAAGTTTCCAGCCACAAGGTACCCTTC
+ATGTTCTAGAGACCCGGTGAAGGCTGCCTGCACTGGGAGGTGTCCTTCTTAGTCCGCTTGGACCCCTGTG
+AACCAGGCTCTATTAGGGTCTCCAGACAGTCCTTAGAGGCAATAGTGAAAGCTGGAATGGCAGTCAAAGC
+AAGTGTGACTCAAGCCTTATGGGGACCAGACAGGCTCCCATATGCAGAGACCTCACTGGGGTATACTGCA
+GATAAGACCTGCCTCTGTTACACACGTGTTGCAAGTACAGAGCCTAAAGATGATGGGCTCCCCCAGCTCC
+TCAGCACAGCACAGCACAGTCACTGCTGGACAGGACAGTGAAGCCAACTTCAGGGACGGCCACTTCAGGA
+GGCCACTGAGTTGAAGGAAAAGGAGCTGGGGCAAGAATGGGGCAGGGATTAGGGAACAGCAAGATGGTGC
+CAGAGTAGCAGGGGGATGACTGCGACTGAGACTTTAGGGCTCCCATTCTGTCCCCTCTCTGCCTGTCTTC
+CCACAGGACACCCTGCCACTCCTTGCCTTCACAGACACGACCATTTCTCACAGGGAACATCCTTGCTGGT
+TTACACAGAAACGTCCTGTAACCAGACAAAGACGTCCATGACAGGCCAGTGCAGCAGGACAGAGAGCCCA
+GACCTGAGCCAGGGACCCAGCAAACCCAGGATCAAATGCACAAGGGGAGGAGGGCCAGCTAAGGATGCTG
+AGCCAGCTCTAATTGTTCTCTTGCAAGTAGAGGTCCAACCCGCTCTCTGTGGAGCCTCAGGTCAATCTCA
+CTGAGTCCCAAAGCTGCCTCATATCCAGCCCTGCTCCATGGGGCCATGCAAAGAGACACTGGGAGTGTTT
+GTTCCATCTTACCAGGAGAGCCTGCTCAGATAATTTCTGCAGGGTTCAGAGACCTACTGCTGAGGGTTAT
+CAAGAAACCAGGTACAGTGGGCCTTGGGCTGCAGCCCTCAGCTCTCCAGTTCATCCCTCTCATAGCCATA
+GGAAGCTTTGCACAGCCACCAATGCTATGATTCCTGCAACAGAACCAGAGCCAAAGAGAATCTGAGAAAG
+AGGAGCATTTGATGTAGAAAAGACTCCAGACTGCAAACCTGTTGTGCTGGTGCCTCCAGTCCCCTACTGT
+CTAACAGAGTAGCTTCTGTGTGTATCAAGCGGACTCTGGGGGACCCAGGGCTGTGGAAGTAAAAGTTACT
+GTGATTGCCGGTAGCCAGGAGGGGCCATGACTTGCATGCCTCTTGCTCTCTGCTGGACTGAGTCACTGGA
+TGGAAAGTCACCCACACAGCTCAATGTGCTTCACACATAGATAATCACCGAGTACCAATGAGAATACATA
+ATCCCATCTCCCAGGTAAAAAACCAGACACCCAGAGAACGGAGAAGGAGCGCTTGAGAGGGAAGGAAACT
+GCCCCATTTAATGTCAGGATTGCAGTCTGCCAACTAGGGAGCTGGAGAGACGGTTCAGTGGTTAAGAGCG
+GTTGCTAGACAACCATGAGGACCTGAGTTCTGATCCCGGTGTCCACGTAGCAAGCTAGGCATAGGTTCAG
+CCCACATGTAGCCATAGCATTGAGGAGTGGAGACAGGAGCATGGCTGGAGCTTGCTGGCTAAAAGTCCAT
+CTGAAATATGAGCTGGGCTACCAGATTCAGGTAGAGACCCTAAGGTGGAAAGTGGTAGAGAATCCCTGAC
+AGCTTGACGAACCTCCCTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGT
+GTGTGTGTGTGCGTGCGCGCATGTCTTCTACATGTCTGCTCAAATCTAGCACCAACTATATCCCCTGGGT
+CTCTGTTTCTCCCATTGGGGATTTAAGTCACTAAGGTAGTCTGCAGAGGGAATATGAATAGTGTCTTGTG
+ATCCTTCATCTCAGCCCTTTAATTCAACTTCCTAAGGCCCTCGGCTGACATGTTCACCCCTCCCAAGGAT
+CCTGGAGTGGGAAACACAGCCTATGCAATGACCTGAGTCCTCTCTTTTGGTTCAAAACAGGGGTATTCAC
+AACCTGCTAGCCTCAGACTTCAGAGACCAACAGAGAAGTCCTGCACAGCTATCACTAGTACCAGGTTCTC
+AAGCAAGTGACTGAATCTCTACAGTTGGCTTCCTTATCTGACCAGTGGGACAGGTAACTCTATCTGCCTT
+GTGGAAATGTTCTTATGATTCATTTGACCCCAGGTTGTAGGATGTGAGCACCCCGTCCTGGAATAAAGCG
+ATCAATGGTCATAGTTATTTTTCCTTTTGGCTCTAGATTTAGGAACAAATTGTGGAAAGAGATTCAGAGA
+GAAAGTATCCACTTGAGAAGAGAAGTGGTGCTCTTGGGACTGGAATGCATACAGAGGTCTGGTTTCCCTG
+TGCTTCTGCCGCCGACTTCTTCACTAATGTCAGAACCAGCATTTCGAGGCTTCCATGAATCATGGACTGA
+GTCAGAATCTCAGATGTGAGACAGGTTTGGGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTACTATTTTGG
+CTTAAGCTCACACATGGGCAGCCCACACAATAATAGCAGACATTTAGTTCTTCACCTCAGATAAACCCAG
+GGTTTGGCCACAATCTCCCACAACCTGCCATCAGCCCAGTCACACATACTATTCTTGGTACTCTGTAGCA
+CAAGATGGTGCCATTTTAGATGCTCCTGTGAGTGCCTGAGCAAACTATACTTCTATGCCTCACCTGCATA
+GAGGACCTTCCTGCCTTGGTTTTAGCTTGCTTTCTCTCTCTCTCTCTCTTTCTCCTTTCCATACTTGAAT
+TATAACTTCATCTAAGGACAGGGTCGTCAGGTAGAACACATGGGTAGGGAGTTTGTCTCTCCCTCCCCCG
+AGGGCCCTTCCAGACAGCATAGTCCTAAACAGAGGACTTCAAGTAACCACTTTCCTATCACAAATTAACT
+AAGCCCTCTGTTCTTCCAAAAGCCCTTATCCAGAAATGTCCTTGGGACTTTTCAAAAAGAGATACAAACC
+CACACAATCTCCTTTAGTTTAGTTGGGTGGTATCAGCTGACCTTGAACCCACTTTCTCCTATCACATCAT
+TCTCCCTCCCAGTTGTTCTCACGGGGCACCTTAGAGAACAAGCCTGCAACTTCGGACCTCACCATGCTGT
+GCTTCCTCCCTTCCCTCGGGTGTTGATAAGGCTTTTGCCTGAAGGGTAATTTGACTCAATACTTTTCTCT
+TGAGCCCAACAAAAAGGAAGCACCCTGGTGAATGGAGGAACCACAGATGGAATAAGGCTGAGACCCTGAA
+AGACTCCCTGGAGCTGCCCAGCCCCCTGGACTCTCCCCGTGAGAGAAGATGAACTGGGGACAGGCCCTTG
+CAGATGTATAGTTTGGGGCTCCTTCATGTCACATTCTCTGCCTCTTAGCCATCAAGACGTGAAATCAGTC
+TCTGCCATACACCTTGACAATCATAATATTCTGCCTAAGCACATGAGACCACACACACATATATACATAT
+ACACATAGTGTGTAAGTGTGTGTGTGTGTGCGCGCGCGTGCACGCTCGCGTGCGCATGTGTGCCTGGGTT
+TGCTTGCATGTGGAGGGCAGCAGTTGATATCAGGTGTCTTCCTCAAGCTCTCTCTTCCTTACTTTTTGAG
+ACAAGATCTCTCACTGAGCCAGGCTCTCATTGGTTTAGCTAAGCTGTCTATAGAGTAAGTTCAGAGAGTT
+TCCATTGCCCCAGCTCAGGGATAATAAATTCATATCACAACACTTGGCTTGATCTATGCAAAGGACTAAA
+CTCGTGTGTTTGCGCTTGCATGGCACACAAGTACTTTATAGACTGAGTCACATCCGCATCTTTTCCTGGT
+TCTTTTTAGTACTTCTGTTGGCTGTCAAACGCCATGTGAATGAATCCAAACATGCACACTCAGCTAGGAA
+CTATTGAAACGTATTGGGAAGTATTTTTCTAACTTCTGATATAGAATGATACTAGTATTAGTCTGGTTTA
+ATATTTCTGCCTACTTAACCTTTATATACTTCTATTAACTTACCTTTTTCCTGATTTTTTGTTGCATTTA
+TGTCTAAGTTGGTGTTTAATAAAGGTCAGTGTCTTAGTCAGGGTTTCTATTCCTGCACAAACATCATGAC
+CAAGAAGCAAGTTGGGAAGGAATGGGTTTATTCAGCTTACACTTCCACACTGCTGTTCATCACCAAGGAA
+GTCAGGACAGGAACTCACACAGGGTAGGAACTTGGAGGCAGGAGCTGATGCAGAAACCATGGAGAAGTGC
+TGCTTACTGCCTTGCTTCCCCTGGCTTGCTCAGCTTGCTTTCTTATAGAACCCAGGACTACCAGGCCAGG
+GATGGCACCACCCACAATGGGTCCTTCCCCCTTGATCACTAATTGAGAAAATGTCTTATGGCTGGATCTC
+ATGGAGGCATTTCCTCAAGGGAGGCTCCTTTCTCTGTGATAACTCCAGCTCCTGACAAGTTGACACACAA
+AGCCAGCCAGTAGAGTCAGCATAAAGGATTTTGATCTTACCAGATTAATAAACATTTCTCATCTATGAAT
+GTTTATCTTATAAGCTTTTCATTTTAATTAATCATTTATTTCACAGTGACCCGCACATCTGCCTATGAGT
+GCTTACTTCTTAACTGCTGTAACAGGCAGCAGCTGAGACTTGGCGTAAGGACAATGCCCAGCAGGGCCAT
+CCTCCAACATGTTCCAAGTTACAGGTAGGTCTCACAAGAAGTGGTTGACAAAGGCTAGAATCCATGATCT
+AGCCTCCATCAAAGGATGCTTCTTCAATGAATCTAGCTCCTTTGTGGGTGACCTGACAGCACCAATCACA
+CCCTGCTGCTCTGGGGCCTAGGGAAGTAGATCTGCCCATCTCCAACCATTCACTGGCAGCTGCACCTCTT
+AACTGCTTAGGTGAGCGAGGGCTTCTGGGTCAGACTACAACACCTCCACATGAAAAACCTCCTGTGGCCC
+ATTCTATGTTTTCTCAGGAAGCATGGTTATTTCCAGAGTCCTTACGCCAAACACTGGCACAAGCATCTCC
+AAAAGCAGATGTTCACAATTTCCCAAACGTTAACTTGCCAGACACTGCCAGATGTATCAACACAGCCAGA
+ATCTCCCCTCGATGAGCTGATGATTGAGTTTTTTGAAAATTCCTACAACGCAATCACCGACAGTAGGGAA
+TGCTTATAGTAAACCATCTTGAAAAGAGAGAAAGATGGCTTCCAGGTGAGGGTAGATGTAAAACCTGGTG
+ACTGTTCACCCTCTGCCCATCAGCCTAGCACCTCTCGGAAGCAAGACTCACAGGTGAGCATGGCCAACGG
+AGGGACTGCATGCACACTGCAGGATGACCACACTGCAGGATGATCACACTGCAGGATGACCACACAGCTT
+GATGATGACTGAGAATTCTGGTATAGAAGACACACCCACCTATAAGAACCCCAGGTGAACACTGCCAGGG
+CACATCATGATTTCTCTCTTTTTAAAAAGATTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGT
+GTGTGTATGTGTGTGTGTGTGTGTGTAAGCCCAAGCAGGCAGTAGAGAGCATCAGATTCCCTGGAGTTGT
+GATTATAGGCCCATTGTGAGCCACCATGTGGGTGGCGGGAACCAAATCCAGGCCCAAAACAAGAAAGGAA
+GTGCTCTGAAAAGTTCAGCCATATTTGCAGCCCCTGTGCGTGGTCATACATCCTGACTACCTGAGAGTCC
+ACCCCATGTTCCCATCACCTATTCATCCTTGTGAACCACAGAACAAAGACCACACAGCCACATTCATGTA
+GTCACTCCTGAGAATTCTAGGTAATACGACAGGACACAGAGCTAGTATGGATAACACTTTACTAGAGCTC
+ACACAAGACTGGGCCTTCCCACTCTGGAGATGGAGCCTGCATACTGGGGAGTATGGATCTTTGGGCTAAG
+CTGCTCAAAGTGCAGTGAAAGATTGTCTGGTTAGAAACATGGCAGAAAGAAACAAGGGAAGCAAAGGACT
+TTGCTGAGGACAGGGAGCTGACATTGGAGGCTTGAGTGAAAGGGGAGACAAGGAGTTCTTGTAGAGGGAA
+AGTTAATGAAAAGACAGAGGGAAAAAAATGGGAAGATTGATGAAGAGAAGTTACTGGAAGCTAGCTGTGG
+CCCGGTCAGTGTTCACGGTGACAGGCACACTGATAGATCTTCCCCACCAACACCGGCACTCCATGCATGG
+TCTTACTCATTGCCCCACGTGGCCTGTTTTAAGTACACAGGACTTGGAAGGAAGCGGCTGGTCTTCATAT
+AGGCAGAGATCTTCCTCAGGCCCTGCAGGTGAGAAGGAGAAACCAGGACCTCAGCTCTGCAAACCCACTC
+CAGGACCAGCCAAGAGACTTTAAAATCCAGAAGGAGAAAACCCACCCCACCACCCTGCAAAAGCAGCCCC
+ACCATCACACCAATCAGCTGAGGTAGCAGAGGTTAAGAAACTTCCTCAGTCTCCCATGGCTAAGAGGGTC
+AGCCCAGCACTGTGGCTGCCACTCTGCTCCCTGTGCACTCCCTCCTGCCTCCCATAGGGGGCACTGTCAC
+TCCTGGCTACAATGACATGCCTGTACTTAAAAGAAGCAAGGATCCAGAAAACCGAAAGGGTTGAGAACTG
+GCCTGTGTGGTTTGGAAGAGAGTCCAGGAGCCACCCAGTGACTCAGCAAACCCTGAGTGTATGCAGAGAA
+GGCCGAAGAGACTGCAGAGCATGAGTGGCCAGCTCCAGATGTTCACCAGAAGCTAACACGTCTCCACTTG
+AGCCCTAAGCACAATCCCACTGAGCCCCTACCAATCTCATCTCTAGCTCTGATGCAGTGGAGTGTGATAC
+AGGACACCTTGTCAGAGAGTGCACAGGTCATTTCCAAAGATCAAGAGAGAGCCAGAGCTACCAGTCCCCA
+ACAAGCCACAGCCATCTGTGTTCAGAACAAGGGCCTCTGCACTCCCCTCCTCAGCTATACCAGCTCGTGC
+ACACTCACAAGGCCACTCCATGTGGATCACAACCGCAGTGATGAGAGCTAACTAAAAAAAGACATGCATC
+TCTTAATGGCAAGTGGGAAAGGAGAAGTGCTTCTAGGCCGAGATCTGTTGCTATGGCAATGCCTTGAACA
+TACAAAAATGAAAAAAAGTACATCTTGTTGGTGTCAAACCAGAGAGTTCAGGGTGGATCATGGTTAACTG
+CGAGGTGAGAGCCAGGCAGCCATTAATATCACAGGCATCTCATGGTCTGTTGTGCCCCCCACCTACCGGC
+TTCACTCAGTCCTGGGATGGAGGGACAGTTTTACACGTCCAGCTGGGCCCGATACAGAGGGAAATTCCAA
+AGCATCTCAGTAAGGACAAAGTACGACCTCACTTCATAGGGGACACAATGGGAGCTGGAGGAATGGCGGG
+AGGAAAAAGCACACCCACCAAAAGAACTCAAGATGAAGTCTTCCACAGGAGCAGCCCTGAGTCACAGTAC
+TTCACAGGTCAGCCACAGGACTGACAACACCAATCAAATATTACCATGGAGTATGTCTTGGTCCTGCACA
+TCAGTCTGCCTTTCTCTCAGCCTAACGGTTGCAGGAACAAACACATTCTTCACATAATCTGCATACTGCA
+GGTCCTGGGCCGGAAACCCAAGCTGTTTAAAGGCCTGACTTTCTTTTTGGCACAAATCAGGAAGAGACAC
+TGAATGTTGCTCTTAGAAGCCAAAAGCAGTTGGGAGCCCAGAACAGCCTCTACTCGTGCTAGGATGCTGA
+GCAAGTAGCTTACCCTACACAGTCTCGTGTTCCTTATCTGTTCATCGTGACAGCTAATTCTACCCAGGTT
+GCCTGCGTGTGGCAATGACAAATTGACTCCGCAGATCTAGAAGGCTGAACATGGGTCCTGGCAGAAAGTC
+AGTGCTCAATAGATCAGAACTAGGGAAGCAACCTGGAAAGGGGGAAGGGAAGTAGGGCATCCAAGAGGAA
+GCAGGACCAGGGCATCACCTCAAAGCGGGCCATGAAGTCCTTCAGGTTTGGGAAGGCGTCCAGGCACGTG
+GGTTCAAACATTCGATGCTGATCAAGGACATCATAGACGAGGAAGTCTGCAAAGGTGATCTGCAGGAAGT
+GAGGCTCAGTGAGGTGGGATCCGGATGCCTGGGAAGAATGGCAGCTCCTCCTGCCCACCTGCTACCCTCT
+CTACCTTGTCCCCTGCAAACCATGGCTGCTTCCCCAGGAACTCCGAGTAGAGTTTCAGCTGATCTGGGAG
+GCCCTTCAAGAACTCTGGCTTCCGTTTCTCCTGAAGCAGAAAATAATAGCTGTCATAACCCTCATGCAGG
+CCCTGTGGGAGAAAGTGTGGCTTTGCCCAGCTGTGCAAAGGAACAGGCATCTTCCTAATTCTGGGCATGT
+GCCATTTAACGCTTCACCACTTTAATAATCAGACTCTCGATTGGTAGCAGGCTTCATCTCTCAGGGGTGC
+CTGGAATGGAAATGCCAATCTGTCCCTGAATCTGTCATGACTGATGTCTAGCTCAGCACTGCCTTCCAGG
+GTAGTGCCTTCCAGACAGGTTCTGAACAGAACAGTCCAGTGGAAGAAGGAAAGCCAAGTATTTAACAATC
+TCAAGAGAAGAGACTACGTGATTTCAAAGAGGGATCTCAAAGAATAGAATCAGTTGAACAGCATAGGGTG
+TTGAAAGAGACAGTGGTTCTCTAGCTGACCGTGAGCTCCAGCAAGAGGCCCCGAGTTTTGCTACCACTAA
+GAACTCAGCTCTGAGAAATCTGTTGGGTAGATTTCTAAATACCCATGACCTTTTTCCCCTCTGGTTTTCA
+TCTGTGAACCCCAGGGACAAGGTGGCTCATGGCAGTGGTCAAGGTCAAAGTTGAAAAGAATTGAACATAT
+GAATTTGCTGTGTAGGAAATGAATTTGCAGCCCTTGGTATCATAACAATACCGCAAAACTTCTTGGAAAC
+GGTATGTTGTACTTGTTAACAAAATGGGGGTGGGGCAAGAACTGTTGGCTGGATAGGAGAGCTTGATAAA
+TCAGTATGGGTGGGTTCACATTCCAAAGAGGACATGCATGGAACTGAACACAGGCCCTTCCTTGACTCCT
+GCTGCAAACCCTGTCTATACCACACAGTGCAGACTCACAAAGTCAGCGCTGTAGCAAAGCATGCCCATCT
+GAATTCGAGTGTCCATGACCTGTTTCTCCAAAATGTCCACACGGATCCTCTCCTCCTCTGTCTCTCCACC
+TGCAGCACACACAGCCCAGACTCACTCTCAGCAGCCCACCCTTGCCAAGACCAGGGTAGGTGGCCATGGT
+CCCCCACCCTGCAGCCAGCCCCACTCACACAGGTTGTGCTTCCGGCCAAGGTAGCGCAGGATGGCATTGC
+TCTGGGTGACCTTGTGTGACCCATCAATTAAGTAGGGCAGCTGGATGGACAAGCAGGGAAGGTCAGATGG
+GATATGAGGTCCCTCCATCCCTTCCCTGGGTGAGAGAAGACACAGGGCCTGGCACTCAGTGTGCCTGCCG
+TGGGAAGCCCTCCATGGGCACAGTGTGCACTGAATGAGAGAGCTGGGACACAGCCCATCACAGAACACTG
+TGCAGCCAGCAGGGATGGAGCAGGAGAGCAGAAACACTGATCCTAAACCCTCAGCCAGGCCAGGAGATGA
+GGTGAGATCACCATGTCTTCCCCAAACTTCCCCTTCACAGCACCTACATTGGGAAAGTCCAGGTCCAGCT
+TGAACTTGTCATTCAGCCACTGGCTTCTGTCATAGTCAGGAGCTGTGGTGGACAAGAGGATGTGAAAAGG
+ACGTCACGACGTCTAAGCCCCCTTCACAAGAAGTCAAAGGCAGGACCCCTTGTGCAAGGAATATGCAGTC
+TGACCCAGTAAGTCTTTCCAACCAAAGCTGCACACTTGAATAGCACAGCTATGTAGTGAGTAAAAGCCAA
+TCCAAGCCCAGGAGCTATTGAATGGCTATAGGCCTTTCAATGACTAACTAAGGCTTACAAAAGCACTAGG
+CCTGCAGTGCCTGTAGTGGGCAAACCTCACCCAATTACAGTCAGAATAGGGCCAGCATGTTACAACCATG
+ATTCTATATTGACAACCAAAGCTGCACCCTCAAGGAAGTTTCTGGAAGGGATAAGCTCTGGGTTTGCCTG
+AAGGAGATCCACAAAGCTAAGAAACAGAGAGATAGATAGAAGCTGCTGGGCACCAGTGGACAGAAGTTGG
+GGAAGAGTGTGACCAGAAGAATGTCATTACCGTCCCCCATGGCGTATCTCTTCTCTTCATAGCCTGTTTC
+TGTGTATTCCAGGAGCAGGCGGATGGCGTGACCCAGCTGTGAGAGATAGTGGGTAACATAAGTCCGTTAG
+TGAAATCCTTCACCACCTCTAGGGACGCCTTTACACCTTTCACTAAGAAGAGAGGAAGGGAGGCGGGGGA
+GAGGGAGAAAGGCAGGGACAGAGTTTGGACTGGGGGGTGTGGCTCTGACAGAGAGGGGGAGGGGGGGTCA
+AGGAAAACCCGATGGGAATCTTCTTCGTGGAACCAAGCCTGTAGTCCCGCTTTGTGTTCTGACCCTGGGC
+CCTTCCCGCCTAGCAGCCCAACCTCCCACTCACTCCACGGATATCCCAATAACCCAGAGTCACGGGCATG
+ATTGCTATTCCAGCCTCGAAGACTGTGCAGTCAGTCACTGGTCGGGAAAGACTCTGTAATGCTCAGCTGA
+TGAGTCTGAGAACAGGTCCGGAGCTGCTCTGCCCCTCCCCAAAGGAACCAACCGTGGAACCATAAAGCCC
+AACCCACTGTAACAAGTAAAAGCCAATCTAATTCCAGGAGCTATCATCCGAAGGCGGGTCTGAGCTTTCA
+GATACCGAAGGCAGTGATTGATTGGCAGGCCTGCTTCCATCTGCCCCCTGCTGGATTCTCAGGAAGTTGG
+CAGTATCCCTGGAGACAGGCCAGCTAGGACCTGTTCTGCCCATCTCCTTTCCTGGTAGATCAGTTTTTCA
+CACCTAGACTCTGGGAATCTGGGCTCAGATCTCAACATATGCATAAAAAGGCATTGCTTCAACCTGAGAG
+TCTGTTCTTGTACTCATGCTGGAAACAAGGGTTCTGCCAGTCTGACAAACTCCAGACTTCATTGTCCATA
+CAACCTGGAGGTACAGGAGCAGTGAAATGGGAATGACCCAGGACAGGGCCAAACAGAAAAGCAAGCAGTC
+TCTGTATTTCTCTTGGGGGATCTGAGGCTGTCTCAATTCCTTTCTTATTGCTGTGACAAACACCATGATC
+AAGGCAGCTTGCATGGCCATCATGGTGAGGAACACAGTAGCAGGGGAGCAGGCAGGCATGGGGCTAGAGC
+AGTAGCTGAGAGCTCGCATCTATCTGACTCACAAGCAGGAGGCAGGGGAGAGAGAGAAGGAGGGAGGGAT
+GGAGGGAGGGAAGGAGGGAGGGAGGTAGGAGAGGTAACTGGGAATTGCGTATGTTTTTGAAATCTCAAAG
+CCTACCCCAGTGACACACCTCCTCCAGCAAGGCCACACCTCCTAATCCTTCCCAAGCAGTTTCACCAACT
+GAGGACCAAGCACCCAACTGTATGAGCCTATAGGAGGTATTCTACAGGGTCATCAAGGTAGTGACTATAT
+AAATTAATAGGCACTTTGGCCATACTTGAGAGCCAAATCCAAGAATCAAACACAGAGAGGCCTTGTGAAA
+CTGAAAGAGTCCCACCACCATGGACAGAACCCCAAACTCAAAGGCACAGTAGCCTGGAGGGACCAGATAC
+CTCTCCTGATTCTAAGGGACAACAGAATGCTGTCCGGAAAGGACCTTGAAGGACTCTGCAGAGGACCAAG
+TCCCTTAGAGTCGAATTCTCAGCTATCTAACTCATTTCAGCCCTGGCCGCAGCTCCTGGCTTTTGTGTAT
+CTGGGCATAAGAGGGTTGCTCAATGTCCTTCATCATGGCACCTGTGGTTAAAACTCACAGACAAAATCCT
+ACCAATGTCAGAGAGCCCCGGTTCCCTTTAATCAGATTCTTAGCATCAGTGATAAATGAGTTTGAAAACA
+GACTTGAGGGGAAGGGGCATGGGAATGGCTCACTTGATAAGTATTTGCCAGACCAGTATTAAAATATCAG
+TACAACCCATGTGACAAAGCTTGCCGTGGTGGCAGGTGCTTCCCAGCGCTGGGGAGGTGAAGAAATACGG
+CTCCAGCAGGGTGAACTGCCTGGGGGCCAGTTCGGCTGACTCAACAGACTCCATGTTCAGTGAGAGAGCC
+GTCTCAAAAACTACGTACAAAGTGATTGATAAAGACACCAAATGTCAACCTCTAACCTCCACATACACAT
+GTGTACATGTGTGTTCACAGGAACACAGAGATATAACCCAGGGCTCAGCGACAGGGAGTCAAAATTCAGC
+TCAACGAGGGGGCAGAAGAATGAATGAGTAAGCACTTGATCATTGCTGTGAAGTTGGGAGTGATTGAGGC
+AAAGAGGGAGGACAGCGTGGAGCAGAGATCCTCTGGCTGCCCACAGGCTTCTGTTAGAAACTTGCTGATG
+GATCTGAACAGGCAGGACCAGGCACAGGCGAGCAGAGGTTAAGTCAAGGGTTCCATGCCCCGCTTTCATC
+TTGAACTACTCTTCCTCATTGGCTGAGCCTGTTTATTCCAGAGGCACAGAGAGGATCTGAACCAGCTGGG
+ATGAAGCCCACGACACAGGTCACAAGCAGTCGAGAGTTCTAGACACAGATTCTGCTCTCTGGGAGGTGAA
+GGGCCACTCTGGGGAGAAGTCAGTCTATCCTGACATTGGTGGGTCCTGAATAACAGTGTTCACTCAGTGC
+AGTTGTGTGTTATGAGACACTCTGGAGGCAGGATGTGTAGTAAACTGGATTGTAAGCAGCTTCGACGAGT
+TGGAAGGGGAGACACAGAGCCTGTGTAGACAAGCCACAGCCTGGCCCTGTCCAACAGGGGGCGTGAGATG
+GTGCAGAAAGGCTGGGCTAAGCAGGTTCTCAGCACCTGACTGGAGAGACTTCAGGATGTCTTTCTGGCTA
+CACAATGACTGATACTCAGTATTTTCTCTCTGGTAAGTTCAAGTGAAGGCCTCTGATCAGACCTTCCCCC
+TCGTCTAACTTGCACCCAGACCCTCACACATAGGCTGAGCTAAGTAAGGGGCAATGTTGCACAGGATGAC
+ATCATTCCAGGTGACCTTGGGCCTGATTATGTCACCAATAGGATGCTATTGGCCTTGACAGATGCCACTC
+TGTCCTGTGTGCATGATGGGAACTTCCTGCCTTGGCCTATTCTCTCTCTGTGTCTGTCTGTCTGTCTGTC
+TCCTTCCCATAATTGAGACACAGCTTCTTCTAAGGACAGGAGAGTAGGCAGGGAGCTTGTCTTCCCATCC
+CTGATAAAGGGCCCCTCTGAGTATCAAAGCCCTAAATGGAGGACTGCATCTAAAACAAGTGACCACTGTC
+CTATCTGTTGTGTACTAACAGGCCCTTTGTTCTTTCAGAAGCCCTTATTCAAAAATGTCCTCAAGCCTTT
+GCAAACACTTGTTTAAACCAGAAACACAAAATAACATAGTGCCTTTTATTCTAGTTATCTTAGTCTGGGT
+GATAGCAGCCAACCTTGGACCTGGTCCCTAGAGATGCAGCCCTTCCCCTCTACTTCCCCCACAGGGGATC
+TTAAAGGAACAGGCACTGCTCTCCCTAGAGGAGTCTGAAGCATCCTCCGTCAGACCCCACTGTGTTGTGC
+TGCCTGCCTGCCAATGGTTAACAAAACTCTTGCCTACAGTTAGCTAGACTCAGCACCCCACTTCTCCTCC
+TGTACCTAAAACCCCAACAAGGAGGAAGGAGGCTGGGACCCTGAAAGACTCCATGGACCAGAGATGGTCA
+GTCTCCTGGACTCTCCCCATGAGATGAAATGTGTCTAGATCAGTGATTCTTTGAGCTGAATCTCAGTCCA
+CTGGCCCTAGAAATAAGGGGGAAACTAGGCAAAGAAGTGTAATAAACAGTAAATAATGACAGACTGACAG
+AGCCAACTCCCATGCAAACAGGTATCTTTTTAAAAAGAAAGCCTGTCCCTTTTCCCAGGCTAGAGGCAGG
+AAACACAGTGAATGGAGGGAGCCCACTGGAGTCTTGCTACAGCAAATCGTGGTTTGTCTGCAGCACACAC
+GCTGCCTCTGGGCAAGGCCCTTGGGTGGATGTTTTCATCTCTTCTGAGCACAATGTTTAGGGATCTTTTT
+CATCTACAGTCACCATCTTATTTTGTCCTCTCCAGGACAGAAAGAAGGATGAAGTTGCTCTCAATCCCCT
+TCTTCTATAATCACGCTTAAATTTTCAAATCTAACATAGCTTTACCATAATTTGCCCTTTAATGGGAGCA
+TCTGAACTGGGGATAGGAATGGTAAACCACCTATTGCTATTTCCTTTCTTGCTTATTTGGTATATTTACT
+GTGGTAAATTTTGTGGCATCACATTTGACCTTGTGTCAATGCAGTATTACCTTGAGCATTAATGGTGGCA
+GTGCATTAATGGTGGCAGCTGACCCCACTCATCTTTCCCTTTTGTGACACTAGAGGCTTGTCAGTTTGTC
+TTTCCCTCCAGCCTCCAAAAGCCACTGCCTCTGTGATATCTGCTGTCCCCACATGTTGCAGCTTGAATAC
+GGAATGCCTCCCGGAGGACCGTACTCTGAACACTTGAGTAGAGGTGGTGGCTCCATCACTGGGTATGAGA
+ATGTGTGAGAGACTGAGACGTAGCTGCAGGAGGGGGATACCTGGGGCAGTTCCTTGCAGAGGTGTGGTCT
+ATGGTTCCTGTGTGTCACACTGTCTGCCTCTTCTCCGCCAAGAAGTGGAAAACAGCCTCTGTCATGTCCT
+GGCCATCATGATATTCTGCCTAAGCACATGGGGCCGAAGGACTGTGAGCCGAACTTCACAAAATGCCAGC
+CGAAATACATCTTTTCACCCTTAAGTTGTGTCCTGCTGGTATTTTATCAGAATGACGGGGAAAGTAGTTA
+ATATGAATAAAATTGTAAAGTGTGTGTGGATTGCTTGTTTGTTGTTTTGAGCAGTTTTAATGGCTCAAGT
+TTCATCTAAGTTTTATCATGGGTTAAAATAGCTCTCCTCTTTAACACTAAAAAATATTCTTCTGTGTTTC
+TACCCAGCGTTTTCTTTTTACATTCATTTGTAAATGGGTTCATAGGTTGCATCTCTGTATTATTCATCCG
+AATAATGCTTCTTTGAACATGGACACACAAATACATATTTGAGGAGCTAGGGAGATGCCTCAGTGGTTCA
+GAGCCCTTGCTGTTCCCAGGGGACCTCCATTTAATTCCCAGAATCCACATCTTCTGGCTCCAGGGCTCTG
+ATGCCCTCTACTGGTCTCTATGAGTAGAGACCTGCATGCGTGTGGCATATAAACACACACCCCAACACAT
+ATACATAAAAGTAAATCTTTTTAAAAAATACATATTTGAACTAGCCATGGTTGCACATACCTGTAACTCT
+AAGGATTCAGGAGGCTTAGGCAAGACTACAAGTCTGAGTCAGCCTGGGCTACCTAGATAATTCCAGGCCA
+GCTTGAAATAGAGAAACTATGTCTCAAAAAGCCAATATAGCTATTTAAGCCCACTATATATGGCACACAT
+AAATTATATATTATATGTGTGTATATGTTATATGTGTACCACAAGTGATAACTTTGCATTATTTTGAGTA
+TACACATAGAAAAGGAATTGATACATTAGTAATTCTATTGCATTATTTTGAGTATACACATAGAAAAGGA
+ATTGATACATTAGTAATTCTATTACTCATTTTATAAGGAACAGAGTGATTTTCACAGTAGCTGTAGCATT
+TTCCATTCCCAGAGACAACGCGAAACATTCCAATTCCTCCATACAATCACTATCTGTCATTTTCCCCTGA
+CGGTGACTAGTCATAAGTGTGTAAAGCAGATATTCAATGTGGTTTTTACATTTCTCTAATGATGAATGAC
+TTTTCAAACAACTTTTTTGTGTGACACAAAAAAGTAATGGTAATAAAAGCTATGGTAATATTTTCTCAAG
+AAAAAGAAAACTCCCTTTAGTCCTTTGCCCACGTATGTGTTGTTTGGTCTTCACTTTGCTGCAGTCACTG
+TGTGTGGAGCACATTCACTGGGACATGGACATTCATTGTATGTTAGCTATGACTTCTACAGATCCCCAGA
+AAAGACCAGGAAGGGCTGGGGAGGCGGCTCAGTCTGTAAAGCATTTACCTACCGTGCAAGCACAATGACC
+TGGGTGCAATTCCCAGCACCTCTGTAAAAGCCAGGCTAGGATGCATGTACCTGTCAGTTTCAGTGCTGAG
+GAGACAGGAAGAATCCAGGAGCTTGCTAGCCTGATAGTTTAGTTAAATAAGTAATGGCAGAGCTCCATGT
+GAGACAAAATCTCAAGGGAAAAAAAATGAAGGTAAAGGGTAGTTGAAGACATTTGATGTTGTCAACCTCT
+GATTTCCATATGCACATGCTCACACAGTCTCTCATTCTCTCTCTCCTCCTCTCCTTACTCTTCCTCCCTC
+CCTCTTTTATTCTCTATTAATATTTGTGTCTAGCTACTTAACTTATCACAGCTCCTTTTCCCTGTTATTT
+CATCTGTGCTTGTTAATGTTTAATACAGGTCAGCATAAAGGATTTTGATCTTACCAGATTAATAAACATT
+TCTCTTTGATGAACTCTTATCTTACAAACTTTGCCTTTTAATTAATCATTTATTTCACAGTGACCCACAC
+ATCTGCCTATGAGTGCTTACTTCTTAACTGCTGTAACAGGCAGCAGCTGAGACTTGGCGTAAGGACAATG
+CCCAGCAGGGCCATCCTCCAACATGTTCCAAGTTACAGGTAGGTTTCTCTCTGGTAAGCTCAAGTGGAGG
+CCTCTGATCAGACCTTCCCCCACGTCTAACTTGCACCCAGACCCTCACACATAGGCTGAGCTAAGTAAGG
+GGCAATGATGCACAGGATGACATCATTCCAGGTGACCTTGGGCCTGATTATGTCACCAATAGGATGCTAT
+TGGCCTTGACAGATGCCACTCTGTCCTGTGTGCATGATGGGAACTGACCCTAGGCATTCCCCGAGACCCT
+GATGTATGGAGAAGGAACACAGCTCCTTCTCTAAGACCAGGGATGTTCTTGGCTGAGCTGATAATGGCCT
+CGTGTTAGAGCCTTGAAGACTCTGTGGCTTCTGATCCTGAAAATCCAGGTTTTGCCACTGTACAGGGCTA
+TGGTTATCTGTCCCAAGGTCACTGTATGCTCAGATTATATTGTTCTTGAGACATTCTGTTTCCCTTTTGC
+AACATTGCTTCCTTAGCTTTTAGCTGACTACATCCCATTTCTCCACTGGAAATAGTATATAAGACGGCTA
+TATGTCTTCATGACTTTCCTCGGCATAAGACAAAGCTTGTGCAAGACCTGCTAGCCCCAGCCTTACCTTC
+ACTATTTCTATCTTTTCTATCTTTCTCATTTCCTGGCCACTTCCCACTTAGGACCCTGTCACTGAAGAAC
+AACTAGCTGGTTTAGGTCCATTACCTTTAGAAGCTAGAGGCAGTCAGGAGGAGCCCAGAACAGCTTCAGC
+GCATGCTAGGATGCTGAGCAAGTAGCTTACCCTACACAGTCTCAGGTCCCTATCTGTTCATCATGACAGC
+TAATTCTATCCAGGTTGCCTTCGTGTGGCAATGATAAATCGACTCCGCAGATCTAGGAGGCTGAACATGG
+GTCCTGGCAGAAAGTCAGTGTTCAATAGATCAAAACTAGGGAAGCAACCTGGAAAGTAGGGCATCCAAGA
+GGAAGCAGGACCAGGGCATCACCTCAAAGCGGGCCACGAAGTCCATCGGGTTTGGGAAGGCGTCCAGGCA
+CGTGGCCTCAAACATTCGATGCTGATCAAGGACATCATAGACGAGGAAGTCTGCAAAGGTGATCTGCAGG
+AAGTGAGGCTCAGTGAGGTGGGATCCGGATGCCTGGGAAGAATGGCAGCTCCTCCTGCCCACCTGCTACC
+CTCTCTACCTTGTCCCCTGCAAACCATGGCTGCTACCTAAGGAACTCAGAGCAGACCTTCAGCTGATCTG
+GGAGGCCCTATAAGAACTCTGGTTTTCACTTCTCCTAAGATAGAAAACAATTGTCACAACCCTTAGAACA
+CACAGGTGTTCTAGAGAAAGGCTGGCCTCATAGCGTTGTGCAGAAGAGAAGCCTCCTTTCTATACGGTGC
+TTGCACCAGTACATAGGTTTCTATGTAATGTGTTCCTGTTGTAATAATCAGAGTCCTGCTTGGTAGCAGA
+GTTGACTCAGAGGTGGTGACTGGAAGTTAGATGAGAATGCCTGTCCCCGAACCTGTCACCGCTGACATCC
+AAGCCAGGAATACATTTGGGTTGCTCATAAGGTCTGCACTTGAAAATCCAGTGGAACAGGGGAAGCAAGG
+TAGTTAACAAGAGACACAATTATTCTTGAAGAATGACACAGAGCAGGGTCAATATTGAGCAGCATGAGCA
+AAGTTGAAACACGTTTGATGTCGGAAGGGACAAATATCCTCAGGCTGACCATGAACTGCCAATCAGGAAG
+AGACCCTGATAATTTCTTCTACCAGGGACTACGCTCTGAGAAAACTGTTAGGTGGATTTCTGAATATATT
+AGGCTTATTTTATTTTCTTCTGGTTTGAATCTGTGCACTCAGGGAGCAGGCAGATGGTATGTGACTAACA
+AGGCAAGATGGTCCACTGTGGTGGTCAAGTTCAAAGCTGCAAAGAATTCTGAGTACATGAATGTATTATA
+TAGGAACCAAATGCCCAGTCACGGCTGCCACAACAATACCAAAGATTTCTAGGCATTGATATAAAGTATT
+TATTAATAAGAATTGGGTGTGTGTTGGCTGAATTGGGAAGCTTGAAGAATCAGAGCAATAAGTTCAGATA
+CTAAACAGGACAAGTGTAGATAATAACACAAGCCTTTAACTCCTCCAGCAAACTTCTGTCTGCCCCACAC
+AGTGAGGACTCACTAAATCAGGACTGTAGATCATGCTCATCTGCATGCCAGTGTTCATTGCCTGGTTCTC
+CAAAATCTACACACACATCCTCTCCTTCTCTATCTGCTCATCTGCAACACACACAGCACAGACTCACTCT
+CAGCATCCTGCCCCAGTCAAGCCAAGAGGGATGGCTTCTGTTCCTAACCCTGCAGCCAGCCTTAACCACA
+CAGGTCATCCTTATAGGCAGTGGGATACAGGATGTCATTGCTCTGGGTGATCTTGTGTGACCCATCAATT
+AAGTAGGACAGCTGGATAGACAAGCAGGGATGGTCAGATGGGATATGGGGTCTCAGCATCCCTTCCCTGA
+GTGAGAGCAGACACGGGGCCTGGCACTCAGTGTGCCTGCCATGGGAAGCTCTCCATGGGCACAGTGTGCA
+CTGAATGAGAGAGCTGGGACACAGCCCATCACAGAACACTGTGCAGCCAGCAGGGATGGAGCAGGAGAGC
+AGAAACACTGATCCTAAACCCTCAGCCAGACCAGGAGATGAGGTGAGATCACCGTGTCTTCCCCAAACCT
+CCCCTCCCCTGCACCTACATTGGGAAAATCCAGGCCCAGCTTGAAGTTTTCACTCAGCTGCTGGCTTCAT
+TCATAACAGCAGCTGTGATAGATAAGATAATGTGAAAAGCTCTCCTCCGTACATCCTTCCCTTCCCTGGG
+ATCCTCCCAAAATGTCAGGGCAAGTCCTCCTGAGGTATGGGAATCTGATCTGACTCATTCCATCTTACAT
+CCCACAATTTTACACGAGAATACTCCTTCAAAGGCTGTAGGCCCTACCTGACTAGGGTACACACAAAACA
+CAGGCAAGTGCTAAGCAAACCCTACATGACCACAGGTGGGGAGGGAGCCAGCGCTTCATTCCCAGGGTTA
+TTTATTGAAAACCAAAGCTGCACCTGCACCGCCTTGGAACTCTCAGGAAGGGATAAGCTCTGGGTTCGCC
+TGAAGGAGATCCACAAAGCTAAGAAACAGAGAGATAGGTAGACGTTTCCTGGCACCAGTGGACAGAAGTT
+GGGGAGGAGTGTGACCAGAAGAATGTCATTACCGTCCCCCATGGCGTATCTCTTCTCTTCATAGCCTGTT
+TCTGTGTATTCCAGGAGCAGGCGGATGGCGTGACCCAGCTGTGAAAGATAACAGGTAACAAAAGTCACAG
+TAAGATCATAAACGCATGACTTCATCTCCTCTAGGGAAGCCTCCATCTCTATACCTAACAAGAAAATGGG
+GACAGTCAGAGGCCTGCTGACAGGAAGAGACGGGAGACTCTAGAACATGCTGAAGAGGAGCTATCTCCTG
+GAACCAGCCTAACTCTTCAGTGTGTGCCAATCTAGGGCTCTTCCCACTCAGCAGCCCTACCTTCTACTCA
+CCTCACGGATATCCCAATAACCCAGAGACTTGGGCACGTTTTCTGCCCTAGCCTTGGAGACTGAGCAGAC
+AGGCTTCATTCCGTCTGGATGATTTCACTTTCAGCCGCCAGTGCATAGGGGGCGCTGTCTGCCTGCATGC
+AGTTCCTGAGCCCTCCCTATCTCACTCCCAAATGAACCAACCCTCATGGCCTGAAGCCCAACTTATCCAG
+CCGGTGATGAGATCCAGTCCAAACTTTACCATCTTCTGGGGCAGGTCTGGGTATTCAGATACCAAAGGCA
+GTGATTGGCAGGCCTGCTCTAGTGGCTCTCTGCCTACAGCCCCTCCCCCTGCTGGGTTCTCAGGAAGTTG
+GCAGTGCCCCCTGGAGATGGGCCAGATAGGCAGAGAATGAAGTGGTTCTCCTTCCTTTCATTGCTGGGCT
+CACTTTTCACACCTAGACTGTGAGAATTTGTCATCCAAGGAGGCTCTCCCCTCAATACAAGGACCAAAAA
+GTTATGGCCCCTAACCTGAGAGCTTAATCCAGCACGCATCACTGCTTCTGGGACTCTTCTAGTCTGAAAG
+CTATCAAAACTTCATTGTCTGCACAGCCCAGGGTTATGGGAATATGAAGTGACATGGGAATGACCCCAGG
+ACAGTGACAAACAGCAGAGCAAGCTGTCTCTGTGTTTCCTTTAGGGGATTTGGTTGGAGACTCATCCCTT
+TCATCATCAAGCCCATGGGCACATAAATCAGTCAGCGTTTTGGCCATGCTTGTGAGCTCAGTCCAAGACC
+CAGGCACAGAGGGCACAAGGACTTCAGGGTTTCCCAGCACCACTGACAGCGATTAGTCTGCGGTAGAAAC
+CTCACACTCACCAGCACAATGGAGGATGCGCAGAGCTCACAGTCAGAGTTAGCTTGGGTGGTAAAGCATG
+TGCTCAAGGGCACAGACAGGGCTGGCGAGGTGGCTCAGCGGGTAAGAGCACTGACTGCTCTTCTGAAGGT
+CCTGAGTTCAAATCCCAGTAACCACATGGTGGCTCACAACCACCCACAATGAGACGCCCTCTTCTGGTGT
+GCCTGGAGTCAGCAACAGTGTACTTGTGTATGATAATAAATACATCTTTAAAAAAAAAAAGAAAAAGAAA
+GAAAAAGGAAAAAAAAAGAACACAGATCTCTCCCCTGCTTCCAAGGGCTACAGAATCCTGTTTAGAGGAG
+ACATGGAGTATCTGCCTAGGTTCTAGGAAGCACAAGGCCTGTCCAGTTCCCTGGCATTAGCTATGTATGG
+CCCAGACTCCTGATGCATGCATGTCTAGGAATGAAAAAGGGTTTCAAATGCTCTACGTGACATTGTGGTA
+CCTGGGACTGGAGGTTAAAACTCAAATACCATATCTTACCAATGTTACTCTAGAGCCAAGTTTCTGTTGT
+TTAGGTGTTTAGTGTTTGTCTTGCTGACTGCTCTGTTGCTGAGAAGAGATCCTGTGACCATGGCAACTCT
+TAAAAGAAAAAGCATTTGATTGGAGCTTCTCACAGTTTCAGCGGTTTCGTCTATTTATTCAGGGAGCAAG
+GCAGCACACAGGCAGACAGGGTAGCTGAGAATCCTACATCTGGATTGGAAGGCAGCAGGAAGAAAGTACC
+CCTGGGGCTGGCTTAGGCTCTTGAGACTTCAAGCTGACCCCCACTAAAACACATCCTTCAACAAGGCCAT
+ACTGAATCCTTTCAAAGAGTATCCGTCCCTGGTGACTAAGCAGTCAATTCTGTGAGCCTATGGAGGCTAT
+TCTTATTTGAACCTCCACAGTATCACTGATAAATGAATTTTAAAACAAACTCAGCTGTAGGGGGTGGCGG
+TGCTGATGCTAAGGTCCTATTCCCCAATTGGTTCTTTTTTTTGTTTTTTTTTTTTTGTTGTTGTTGTTGT
+TGTTGTTTTGTTTTTCGAGACAGGGTTTCTCTGTGTAGCCCTGGCTGTCCTGGAACTCACTCTGTAGACT
+AGGCTGGCCTCGAACTCAGAAATCTGCCTGCCTCTGCCTCCCGAGTGCTGGGATTAAAGGCGTGCACCAC
+CACACCCGGCCCCAATTGGTTCTTGATCTATCAGTAAAGAAAGCTGGGGCCAACTGCTGAGCAGAAGGAA
+CAGGCAGAACTTCCGGGTCCCTGGAGGAGAGAGCAAGGAAGGAGAGGAAGAGTTTTTTTACCATGCTTTG
+GAGGGAGAAGAACCCGGCAGCCATGTGAGGTCTCTGGAGGAGCTGGAGCCTGTGGCCACTATTACAGGTG
+GGTGGTTAGGGATGTTTGGCAGGGGATAGGCGGAACTAGCCACTGACGTTTAGGGCAGGTGGGAGGAGCG
+GAGAAAAAAATATTAATAAGGGCACGCTTTTCCGGGTGGGAAATAGTAACACCCAGCAATTGTACAAGAA
+GGCAAGTGGAAAAGGAACAAAGTATGTGTGTGCCTCTTGTCTGAGGATTCAAGGGAAGCCCGGGGGGGGG
+GGGGGTGGTGGTAGCATGGTCACTTCCTGGAGCTAAAGGCAAGAAGAACGTAAAATTGCATACTTACACT
+CATGAAATGCTCTGTTGGTAACAACTTGCCAGGTGTTTTAGTTAGGGTTGCTCTTGCTGTGATAAAATAC
+AATGACCAAAAGCAACTTGAGGAAGGTTTGCATCAGCTCTCAGGTCACACTTTATCATGGAGGGAAGTCA
+GCGCAGGAACTGCAAGCAGGAACCCGGAGGCAGGAATTGAAGCAGAAGCCTGACACAGAAACACTGCTTA
+TTGTCTTGCTTGTCCTGGCTTTCTCAGTCTTCTTTCTTATACCACTCAGTGCTATCAGTGGATAGCACCC
+TCCTCCCCAGTGAGCTGGGTCCTCCCCCAGCAATCATGAAATAAATACCTTACAAATTTGCCTGATGGAG
+CCATTTTCTCACTTGGGTTCCTCTTCCCAAATAACTCTTACTTGTGTTAAGTTGGCAAAAACCTAACGAG
+CACATCAAACATGAGTTTGATTCCCTAGAACTGACTGAAAAGTAAGATATGGTAGGGTGCACCTATAATC
+CTAGTACTGGAGAGGTGGAGACAGGAGGCTTCCTGGATTGTCAGTCTAGCTGAACCAGTGAACTACAAAT
+TCAGTGACAGACCATGTCTTAAAATTAAGGTGGAGAGTAATTGAAGAAGACACCAGATTTCACACACATG
+TAAATACGTGTGTGTGTGTGTGTGTGTGTGCGCGCGCGCGCGCGGCGCGCTCTGATACCTTCATCTCATT
+TGGCATTTGAATTTTAAGGAGAAGCAATCGCAATGTCTCCCTTTCTTAAAGAAAAAAAGGGAAAGCAGAA
+GACTGATTGTGACAATTCAGTTTGTCTTCAAATGGCTTCTCAGTCTGCTAAGGCCAGCCTAGCTGCCTGA
+CACCAAGGACACTGTCCCTTGTACCAGTGTTCTGACTTAAAGTCAAAGGACCTTCTCTGAAGGTTCAAAA
+CCAAGAGGTGGAGGCAGGCAACTGGCCTGTGGTTGACTTTGCATGTCTAAGGAACGTGACTGGCATCAGC
+AGTAGAGAACCCGGGGAATAGGGGCCTCTTCAAAGGACAGAATAACTCTTCCAGGGGAGGAAAGGACTGA
+GCCAGAAGAAAGAGGTTGGAACTTAAAACTGAAGTGAGTTTGCAAGTCAAAGGACAAGGCTGCCACGCTG
+CAGCCCTTGGGCAGGACTTCAGATTGTGACCCAGTGCTCAGTAAATGTTTGACACCACAAAGATGTAGTA
+TAGGGAACAGGGGACAGGCAGAGACCATCTGAACACTCAAGTCAGAGAAGGTAGGTCTCTAAGAATGGAA
+GCACAGACTATGTCACAGGGGCTGGCTGAGTACACCACCTGTGGTAGAATCCACTGTGGCATTTGGATAT
+GACTATCCTGCCAGTCTCAGGTTTCTGGGCTGTGTCCAAACTCAGAAAGTAACTTGAAACTTTAAAGAGA
+ACTTCAGTGAAAGGTTGCCAAAAGCATGATAACAATAGAGTGTACACAGACACAAATTCACACACGCACA
+AACTCACACACATGAACACACATGCACACACACATTCATGCATGCTTGAATGAGCACACAAGCACACACA
+AACGTACCTACACACAAACGTACACACTCACTAACTCACACTTATGAACACAATACACACAGCACACAAA
+CACATGCATGCATGCTCAAATGTACACACAAACACATAAACACACACACACAAATGCATACACACAAACA
+CACAAGCACACACACACATACATACATTTCTCTGAACTGTTCTAGTCTCATATACCTTAAGCATTTGTCA
+TTCTAAAGTTTACACACCTTAGGTCAGGTGAGGGTAACACATGCCTTTGGTCCCAGCACTTGGGAGGCAG
+AGGCAGGCCTTATCTCTGAGTTTCAGGACAGCCAGGGATACACAGAGAAACTCTGTCTCAAAATCAAAAC
+AAAAATTTACATACCTTGAACTTAGAAAAAAGAAATTACTGAATTTAACTGAAAAAATATTTATTTAATG
+TTATGAATAATTAAAATAACTGATTTTAAAATAATTTTTCTTTATTTTTAAATCAAATTTTAAATTTTTA
+ATTACATTAAATTTTTAATTAATAAAAACTTTTTTAGTCAGGAACTCACTGTCAAAGAAGAATTTTTTGA
+AAGACTTAATTTTTATTTAAATAAAAACTCTTAAGTGGTTGCCACAGGTGGAGCAATGAAAACAAATTTT
+TCAAATGAAGGAGGACAAATGACTTCAAATTTGAAACCATTTTTTTGTATATACATTAGTGTTGAAATGA
+AGTGGTTCAAATATCGGAATGATGAATCAAGGGTTAAAAGAACTTAGGATTTGCCAGACAGGGGTGTGGG
+TGGTGGGTGTGTGGAACACACCTTTAATCCCAGCACTCAAGAGGCAGATCCAGGCAGATCTCTTGAGTTT
+GAGACCAGCCTGGTCTACAGAACAAGTTTCAGCACGGCCAAGGCTACACAGAAACCCTGTCTCAAAAAAA
+CAAGATAACAACACCTAAAAGAGATCAGGGATTTGAGTTGGCTCACCCCATCTATGAACTGCTGTAGCTT
+ATGAAGGCGCTAGTCCTGCAGGAACAGAGCTGCAGGACCTCGGTGACACCCACCGAGCAGCAACAGGATG
+GATATCTGAGTACCGATGTGTAACAGAAGCCAGAGTACTCAAGCAGGCCAATGACTAACATTAGCTTCCA
+GTCGGTGCCACAGGGCTCCTCTCAGGGACCCAGCAAGGCTTACTCATCACCTGCCGCCAGCATCCAGTAT
+CCTGGTGTGGGAGCGCACCTCTCCCCTTCCACATGTTCCAGGCTGGCCCCTCTCCCTCTTTCTGACCTTT
+CTGTCCTTTTCTGTTCTCTGTTCTGCCTTCTGTGCCCTCATAGCTCTGCTCTGATTTGTCTGCCTCGACC
+CCTTCTCTTCCTCCCCTTTCCCCAGATAAACCTCCTTTATACCAGGTCTGTCAGTCACATGGTAAGATTT
+CTCAGAGGGCACTTGGCATGGGCCCTCCAGTACCCTTGCCTGCTGCCACATTATATTTTATAAAAGCAAT
+GAACATTTGCAAGTAAAACTGTTTGGGCAAAGAGGTATATACCGATTGACACATCATGACACCCTATAGC
+TTCCACTGTGAGATGTTTTTGCTTTGTTTTGTTTTGTTGCTTTTGTTTTGTTTTGTTTTGTTTGGGGAAG
+AGGGTAACACAAAAGGACCAGGAGATGAGCGTGTTTGGGGTACATGATGTGAAATTCACAAAGGATCAAT
+AAAAAGTAAAATAAAATAAAATAGAAATAGAAATAAGGGTCCAACTATAGTTATGAAGCAGGAAGAAGAA
+AAGTCAATGAGATGAAAGGTTGTGTGTTCTCAAGAACTGGGTTCATGAAATTTCAAAGCCTCCCCAAACC
+TAGAGTTTTCCACCCACTGTCACCAATACGCACAGGTGTGGCATACACAGCCACCTGGGAGCAGTGGTGA
+GAATGGCTGTAATCACTGTCCTTCCTGCCCTGATCCCCTACACATGTGCTATCACAAAAACATATGGTGC
+TTGGTCCTTGTCACATCTGTGCTGGGACACAGTGTGCCAGGGAGAGAGTCAGAAGAAAGAAATGAGAGGG
+GGCAAGAAAGGGCTTGTGGAAACGACAGGGCTTGTGGAAAAGAGCGCTGGCTGCTTGGGAGCTCTGTGTA
+CTAGTTTTTGAAACATTTCTCAAGGTCTGGCACTGTAACAACAGCCCAAAAAAAAAAAAAAAAAAAGGTA
+AGTTGTGAAAAATGGAATATATTTATCACATGGTCCAGGACATACACACACACACACACACAAAACTATT
+GAAATAAACTTCCAACAAGAAAGACTCATCTTGTCTGGCATGGTATCTTATGCACTTGGAAGCAGAGGCA
+GGAGGATTGATGTAAGTTCCAGCCAAGCACATATAGTCAGTTATATCAAGCAAAACTTGTCTCCTAAAAA
+TAAAGACAAAAAAGAAGGGGGAGGAGGAGGAGGGGGAGGAGGAGGAGGAGGAAGAGGAGAAAACAGATGA
+CAGATTCATCTTTTATGAATAAGTTAAATTTAAATTTTAGGTTTTAGAAGGTCCAAACGCAGTCACAACC
+AAGTGAGGGGACCTAGGGTTACCTGGTCTGTGAACCAGGAATAACTCAGAAGCCACAGCTGGAGAAGCTC
+AGCCTTAGAAACGGGTGAGAGGCAGAGCAGAGCAGTTGCAGGCCTGGAAACAGCTGGGAGAAGGGGTAGT
+GGGGAGGGGCTAACAGCAGCAGTCTTGCCCTGGGGAGACCCTGGGGGCATCGGGCTGCACATTCCCACAG
+GGAGTGAGAGACTTGGGCAGAGACCTGACTCTACTAGGTGGAGAGACTAGTCAGCCTGGTCCTCCAGGAT
+ACAGGGTAGGACAGATGTGACTCTCAGAAAACCAACCCCTCTGACTGCCCCACGCCAGACCCCTGCAGCC
+CTCAGAAGGTGAGCATGTAGATTATGTGGTAAATTGCCATATGTTGTTTGAGCTCACTGAGAAGCTGACT
+CTGAAAGATCCAAGCATGTTTGTCTCATATCACAGAGCTTTGGGGGGCAGGGGATGGGTGTGCTGCTTGA
+AGTTTAATGATAAAAATAAATAAAATATTTTTAAAGTAAAGAACAGCTATGCAGAGACATCTGTATAGTA
+TAAGGCTGTTGGCCTGTGTGCCGGGGCTGTTCCTCAGCTAGCCCATCTAAGCTTAACTTTGCCATTGCTT
+CCCGTTTGCCTTTTCTTCTTGGGCAGGTAATGGAGCGACAAGCATTGACTCATCCTGCTAGCCTGGCACT
+GCTCACCCAGCAACTAAACACTGAGAACACAGGGACCCACTTCCATAGCCAGTTGATCCAATACGTGTGG
+GCTATCCCAACAGTCTCAGTTTCCCCCAAGGACTCCTCCGGGGATGGCACCCCATCTCTGCTACAGGAGG
+GAAGGAAAGAGCAAAACAGCCCTAGAGTAAAGACACACTTGGGAAATGCTAGAAGGTGAACCATGTGCAA
+CAGAGGATGCCATTGTGTAATCCCAACATGTAAAAAGGAGAGGCTGGAAGCTCAGCCAGAGGAGGCCATC
+CAAAATGAGAAAGTTCAATCCGGAACTTCAGTTGGCTGTCTCAGTTTTCCTAGCCACTTTTTCCCTGGAG
+TTGTGAGCCCTTGATTTTCTGACATCATCTCTGTTTGCTGCTATTGTTCCTGTGCCTAGAAGCCTCATCC
+AAGGGGTCAATAGCTTTTCCTCAGGGGGCTTCTGATAGATATTTTGAGTGACCAATTTGATGATATTCTG
+CTGTCTATCTATCTACACCTTGCTACTCTTTCAACTTGGCTTACTACGTAAGTGATAAATTCACTCATTC
+AAAACAAAGTACAAGCCGTAGATACACACGCACCACTCTTACAGAGGACCTGAATTTGGTCCCCAAAATT
+CACATGAGCAAGTTCACAACTGCTTTTAACTGCAGATCCCAGAGGATCTGACACCTTTGGCCCCATGCAA
+CCACGTGCAAGTATCCACAAACACACAACCGCGGACATCCATATAAATAAAAATAGGGCTGTGTGTGGAG
+GCACACTCTAAATGCCAAGGCTTGGGAGGCAGAGGCAGGTGGATCTCTCCGAGTTCCCATGTTTCCACTG
+CTAGAACAAGGGGGCTAAACATCTTCTGTACTTAAGTCTGCAATTACAGGATTAACACCTAAAGATATTT
+GGATGTCTCAGGTATTTTCTCTTCTGCTTTTTCTTGTTTACAGGGAGTGTTCAGTGTTCAAGGTCATTGG
+CATCTTGGCAGGTCCTTCATGGAATGTAAAAACTCTCTCTTACTTTCAGAGTATGTGAGACCACAGGGAG
+GAAAGACAGGAAAAATCCAAGTGATATGAAGTTTTCCAAGACTTGCAGGAGGAACAAAGTTTGGAAATTT
+GACTATACATGTACAGATAATCTTAATTTACTACACTGTGCCACATCCATACTGTGAATTAAATTCTTGT
+TAAAATTAGAAATTGAAAAGGATATAAATATGAAAATAAATATATTTTTGTTAAAGAGGGTAAATAAAAA
+TATCCGGTATTAGGTTGGATTTATATAATAAAAAAAGATTTTTGTGCTAAAAATAAAAGTCGAAGATGAT
+TTTTTTAACTTTAAATCAAGTTATAAATAGCCTGAAGAAGGCACACAGAGCCTGTGAGTGCTAAGTCTTA
+ATGATGATATATGTTTGATGACTAAAAGTTATTAACTACTGAAGTCAAAGTTCAGTATACTAGTATCAAA
+ATTATCTTTAAATCAATAGAGTTCTTCTAGGATATCTGTCTACTGTTCTATTGCTCTACTAAAATTTTTC
+TCATCAATATAAGGAGGAAAACTGAGTCTTAACAAAATCAAGACTTATTTAAAACTGGTGGGGTTTTTTT
+TCTTGTTGTTTTTTTTCATGAGCCCTATCAAAACATTACATGGAAGTTTAAATTACACAATTGGCTATGT
+TGTCCTTCTGGGTGTTCAACAAAAAAAGTTGGGAGGTCTACACCAATAATGGTAAGGGAGTCATTAAGGA
+CTGCTAAGGTTTTGACCCTTACCCTCCAAGATCAGCCAGAGCCCATGAGACAGGATCAGTAGAATGTGTA
+GACATATTATTTTGGATTCTGTGCAGATGTGCATATTATTTCTAATGGAAAGGGACTCCTGTGATTTGCA
+TCCCATCATCTGCTGACTGCAAGAGACCCCACACAGCCCGTGGTGCAGTCAGCCCTGACTGGACAGGGAG
+CACAGGGAAGCCTCCAGGTAAAGCACGAGAAGAGCATGCATGTGACTAGGCCAGCAGATGATGGGTACCT
+GGGCATGACAGAATGAACTCACACATCATCTGGCTTTGGTTCCACCTCTGTGCTTGTGAATACTGCCTAA
+CCCCACTGGTGAGGCTGGTCTTCTATACTCAGTCTACTGAGACAAAGTACAAGTCTTCTGGAATTATCGC
+CCTAGCCACACCAACATACAACATTTTATAAGCTACCTGGGTAGCTCTCAGTGCAATCCATTTGACACTT
+CTCTGGAGAGCAGATGAAGTCTGGGTGAATGGATATTGTCAATAGGAACATGCCTAGATCAAGAACCTCA
+ATGCCTCAGACCCTTGGGATGACAAGGCCCAAGGACAGATATGGACACTGTGTGCAGAAAGTACTGACCA
+CAGCTTCTCCCTCCAGGATGACTGAACCCAAGACTGGTCCCCTATAGATCCCTACTGAGATTAGCTGGCA
+CACCTCCTTGTCCACAGGTATATAATAAATCTGCTTTCCTGTTCAGCTGTACCCACTGACTTTCTGGGCA
+ACTCCTGCATCTCCTTCAAAGTACTTGCATGGCTCACAGATCCCTGTTTCTGTACTTGTTTCAGTGTTTC
+TTCATGTCCAGTTACCCTAGTCAGGTTAGTCCCAAAGCCGTGCAGGTCTTGGCAATCAACCATTCCGCTG
+TATTCAAACCCTGAATACAATAGCAGCTAATTACATCATGTCTGCAGAAAGCTAGGGTTTGATAGTCACA
+GACTCATCTTTGAACAGCTCTCCGAGAACAGGGCAGGGTGAAGAGACCATGAGGGGCTCCTCCTGGGTGC
+TCTCACCATCAGCTTCACGACTCAGTGAGACCCACAACTCTTCACAGTTCCAAGTCCCAGAATGCTGCAG
+TGAGTGAGATTCCCTCTCTATCCTCTAGAGAGCTTTATCTCTGCTGTGAGGATCTCAGACTAGAGCCAAG
+ATATGTCAAAGCTCTTCTGGTTCCATTAGATGTTTCAGAGCCCGGCACTACAGGCTCAGAGCTCAGACTC
+CTGGGAAGCAGCTGGAATGCTTAGCAAGGCTAAAGAGCAAAAGAAAAAGACACAACTCACAGCAAGTGTT
+CAGTGTTTATTGTGTTTCAAGTGTGTTTCAGGCTTTATTGTACATGAGACAGGCCTGGGGCAGCTCCTTA
+AACAGGAACAGGCTGGCACTCGAGTATTGACCTTCAGGGTAACTCTAGGGAGGGCCGTCACTAAGATAGT
+ATTGACCATCAGGGTATCTCTAGGGAGGGCTGGCACTAAGATAGTGTTGACCATTGGGGTAACTCTAGGG
+AGCCTGAGTTCAGGACAGACCTCAGATTGCAGAAAAGGGCTGTGAGGTTGGGTCTGGGCACCAATGAAGA
+AGGGGCCATGTGAAAAAAAAATGGGGAGAATAAAGGCTGCATGGGCTTGACTGGGGTGAGGATGAGGAGA
+TGGGGCTGACCAAGCTGCAGAAGGGAGCGGGAAGGAGAGAGAACCGGGAGCCACAGTGCAGAAGGCCAGG
+GTGCTGTCCCCACCCAGAGCCTGCAGGATCCGCAGTATGGACAGGTCCCCCACTCTGAGTGACAGCATGG
+CAGGGGCCTAATCAGTGCCCCACTGGGCTATCTTAGTAAACACAGGTCTTGGGAGGAAGCGGCTACTCTT
+CATGTAGGCAGAGATCTTCTTGAGGCCCTGCACAGTGGGAAGGACACAGCAGAAGCTCAGGTCTGCAGCC
+CACACAGAGGAGCAGCCCCAGGTCAGGAACTGCATTTCTCAAAGGCTCAAGCTCTGGACCACAGCACAGC
+CTGCTAAGCTCCCAACCCTGGCAAGAGAACTCTTCAGTTTCCTATAGTGAGAGACTAGACCCTGTGAAAG
+TACTTTATACCCACTAAAGGTCACAATACAGGCTCAGAAAATACTATAACCAGTTATTAGAAAGCAACCC
+GAGGTCCAGAGACCTCAACTGCCCCTTCTTACAGGGGTAGTGATTGAGACAGAGGTGAAGTCAGCTTGTC
+TCTGAAGCTGCCATTCTCCCCCTCCCCCTTCCACCCTCCTCTTTCTCCCTCTTCCTTGTCCTCTCTCTAT
+TCCCCTCTCCCTCTCCTTCTCCCTTGTTCTCTCTCTATCCCCCTCTCCTTCTCCCTCTCCCTCCCTCCTC
+CCCTCCCCTCCCCCTGTCCCCTATGAGACAGTCTACTGAGAAACTGTATTTACCAACAGAGAAGGGAGAA
+AAAGAAACCAGGAAGACAGGCAGAGCCCAGTGACTCAGCAAATCCTCAATGAAAGCCATCAAGAGAACAG
+GCCACAAAGAGAAGCTGAACAGGATTCTAATCAATTTCCTGAGAGCAGAGTTAACAATTCTTCCCGCGGT
+TCCCAAGTATAATCTCACCACACCCTGGCAGTACCTCTCTCCACCCCTAACACAGGGACCACTCTGTGTG
+ACTACATGTCATGTTATAAGCCATACATCAAGGTGGAGAATACGCAGACGATTCCAGAGGGTGCAAAGCG
+ACCATACACATGCTGGGCTAAGGTGCCACTCTCTTTCTCCAGCCACCTACCCTGGAGCCATGCCAGCTCC
+TGCACAGTCCCCAAGGGTACCCAGCCTGGACCAGAGCCAGAGCCAATGGGAACTAATAGTGTATCTTTGA
+TGATTAGTAAGCAAGGTCAACAGATGGAAGGTAGAAAAGCCTTTGGACCTACACTAATCTGATTCCCAGA
+TTTGTGTAGATGACCCAAGATTGTTGTGTACTGCGGTACAGGCTGGTTTCAGCAACCAAGAGGAATACCA
+CATGCATCTCACAGACTGGGTTTCTACGGCCTCTGCTGACTTAGTCCCAGGACTGCGGCATTATGGATGC
+CCCTCACATACACACACACATTTTGAGATTTTGAAGTGATAAATATTGCTGGATACATTTACAGAAATCT
+CAAAAAAACAAAAAACCAACAACAACAACAACAAACAAACAAACAAACAAACAAAAAACAGAACAAAACA
+AAAAAAAATCCAAAAAACAAAAACAAAAAAACTAAGTTGTTTTTCCTTTGGTCACAGAACTAGAAAGGCA
+CTGTGAAAGGTGTTGGGATTGAGACCTTGGAAAAACAGCTAAAGTGCATATTTTACATATCAAAGCAGAC
+CTGGCCTCCAGGTTCTCCCAGCATCCTTCAGTCCCTACCTGGCATACCCTGACCTCTACCCTGAACTTCC
+CAGCCTAGGGGTTGGGCTGACCTTCTCTCAGAGTCTCTTCCCATGCATTTTCTCTCTTTCTCTTCCCCTT
+CTGCTCTATCTCCCTTTCCATGGTGACATCACTGGCCTCATTCCTTGGGATCAGGGAACTCACCAGCCTG
+AGAGTGTTTTCCCGATAAACCTGCTTTTATATGCTCTAATCTGGTTTGAATTGGCTCATTTCACTGGCAG
+AAAGATAACCTATCAGAAGAAAGTGAGAAAGAAGACAACCCAAGGGGAAGCAGGATCAGAGCATCACCTC
+GAAGCGGGCCAGGAAGTCCCTCAGGTTTGGGAAGGCGTCCAGGCACTTGGGCTCAAACATACGATACTGG
+TCAAGAATGTCATAAGCAAGGAAATCCACATAGGTGACCTGCAGGAAGCCAAGGGTGAGCGCTCTGGAAT
+CTGACACCAGGGAAGGATGGCAACCCCTCCTCCCCACCCCTCCCTTTACCTTGTCCCCTGCAAACCATGG
+CCTCTTGCCCAGGAACTCAGAGTAGAGCTTCATTTTCTCAGGGATGGCCTTCAAGAACTCTGGCTTCTGC
+TTCTCCTGAGGCAGAGAACAGCTGGCTGTCACCACCTTTAGTCATAGGCTCCCTGGCAGAGAGGCTGCTG
+GCCTCGAAGGACTAGGCTACTGAGTCTCATCACCATTTTCTTAGTAGCAGGCCTCATCTATCAGAGGTGA
+TTGGAAGACAGACAGGAGTGCTACACTGTCCTTAGATCGGTCACCACTGGTACTTAGCTGCTATGAGTCA
+CACCCAGGTCTTCCTAAGTATCATTGTGCATGTTATGGTCCCAGAGATCCAATGGAACCACAGAAACAAA
+GGACCTAAAGACCCCAGGAGAAGACAAAGTTATACTGGCTTTGGGGCGGGGGCAGGGAAAATGGATGGAT
+AAGAACAATTTAGATATTTGAACGGCATGCAACTAATATTTGGAGAGCAGTCACTAATATTGAAAAGAAG
+AAAGGTCAAAGATCTTCTGGCTGACCCTGAGCTGCCTACTAGGAAGGTGTCCTGTCTTCTACTGGAGCCA
+AAGAGGGCATGCTTGGGAAGTGCTGGGTACATGTGTGAGTCCGTTGTGCACGTATGTATGTAGGCGTGCT
+TACCTGTGTGTGCATTTGTGGAAGCTAAAGCCTGCTACCCAGTGTCTTTCTCAATCACTCTCTACCTTAC
+ATTTTTGTGACAGTGTCTGTCACTGAACCTGGAGTAGAGAGACTGGCTAGATTCATTGGCTAGCAAGCAC
+AGAAATCCTCCTGTTTCTTTTCCCATGGTGCTCCAAGGTACTCCCTGGTTGGTTTGTTTTTTTAATGTGG
+GATACCGAGGATCCACAATCAAGTCCCCATGCTTGCACATCAAAGACTTCTACCCTCTGAACCATCTCCC
+CAGCCTCTACTTCATCTTTTGAGTCTGCGTTTTAAGAAGAAAATTGTTAGTGTGATCACTAGCTTAGATG
+ACACATGGAAGTGGTTACAGGACATACACATCAGGGATTTTAACTCTGCATTAAATGTCTAGGTCAGACT
+GTTCACTCTAGAATCTCAACTGTATCTAGGATGCTTAGACAAGTAATAATAGTTTCAGTGTTGATGTGAG
+AGATCACTATGGTAACGAAGGTCGTTCTGTTGGTAAGATCCGAAAGCTTGGAGAATCAGTAACATGGCTC
+CTTACTGAAGGAGATGTTACAGATCTGAATGCAAGACTTTAACTCCTGCTTCAAACCCCGTCTTCTCCAC
+CTACTGGCCACTCACAAAATCGGGACTGCAGCAGACTATCATGAGCTGTATGCGGGTGTCCATAACTTGG
+TTCTCCAAAGTATCCACACGGATCCTCTCCTCCTCTGTCTCTCCACCTGCAGCACACACAGAACACTCAG
+CATCCCAACCCAGCCAAGCCAAGGAGATTGCCTGATATGCCCTACCCTGCAGGCAACCCCACTCACACAG
+GTTGTGCTTCCGGCCAAGGTAGCGCAGGATGGCATTGCTCTGGGTGACCTTGTGTGACCCATCAATTAAG
+TAGGGCAGCTGGATGGACAAGCAGGGAAGGTCAGATGGGATATGAGGTCCCTCCATCCCTTCCCTGGGTG
+AGAGCAGACACAGGGCCTGGCACTCAGTGTGCCTGCCATGGGAAGCCCTCCATGGGCACAGTGTGCATTG
+AATGAGAGAGCTGGGACACAGCCCATCACAGAAACACTGTGCAGCCAACAGGGATGGAGCAGGAGAGCAG
+AAACACTGATCCTAAACCCTCAGCCAGGCCAGGAGATGAGGTGAGATCACCACGTCTTCCCCAAACCTCC
+CTTTCCCTGCACTTACATTGGGAAAATCCAGGCCAAGATTGAATTTCTCACTCAGCCACTGGCTTCGGTC
+AAAGTTGGGGGCTGTGGTGGACAAGAGGATGTGAAAGGATATCAGTGTGGCTGCGTCTGACACTCCAATC
+TCACTGACCCACCTAAGAAGTCACGAGCAAGATACCCTGGGGAAGGAGAACTGTAACATCTTGCACTAAG
+TCATTCTCAAGATAAAGAAGAGGGGTGATAAAGAAGGGGACCCTAAGCCCAATGGGACCCCAAAAATGAA
+CCCAGCTCAATTATAAATACAGGTGGGCCCGGAACTCCAGTCTCATTATTGTTGTTTGAGGCAAGATCTC
+TCAATTATGTAACCAAGGCTTTCCAGAAACTATATGTATAGACCTGACTGCCCTCAAACTCAGAGATCCA
+CCTGTCTCTCTGACTTTCTAGAGCTGGATTTAAAAGCAAGCTCCCACACATTGCCATTCCCAGTGCATAG
+AAAACCAAACTGCACCCTCAAGAGGTTCCTGGAAGGAGCAGGCTCCTTCCTCCTCCAGTTGTAGGAGGTC
+TGCAGAGCTGAGGAATTAGGGGCTGGGTACGGGCATCCCTGGGTAAGAGGAAAGCAGGTGTCTGACCAGG
+AAGGTGTCATTACCATCCCCCATGACATATCTCTTCTCCTCATAGCTTGAATCTGTGTATTCCAGGAGCA
+AGCGGATGGAGTGAGTCAGCTGGAAGAGAAGGTCACACAAGTTAGAAATGGTAGGACCTTTACTGGATGA
+CTTCATCTTCGCAGATGAATCCTCTCACCCTCCACAAACACAGCCACCCTGCGGTGTGTACACATGCACT
+CATAGGTGCCCGCCCCTTCACACCTGCAGCCTCCCAAGTTTGTGAGCTCCAGGGTGCTAAGAAAAGTATA
+CATCTGAGAGCCCGGATCACTTTGAACTTCCCGCCTGAGTCCTAGTCCCACCCTCCAGGCAGATCCCCCT
+CTCACTCCGCGGGTGTTCCAATAGCCCAGTGTCATAGGCATGGCGCTGATGCTGTGGTCTTCTCAGACTG
+GCTTCAGGTAGGTTGAGCTGGAGAATTTTAAGGAGGTGCAAAGGGAAAGGTGGGTGGGGCCTTGTAAACG
+GTCCCGCCCTCAATGCCCTAACCCCACTCAGGCACTGCCTTTCCCTTCAATTGTCCTCAGAGACTTTGAA
+CCCGAAGCCCTTTGGCTTCACCCACAGCCCGGGAGAGGGAGAAGTAGGATCCTATGCACCTTCAGGCCTT
+GGATTCTGAGCAGGTTCACATGAAGGACCTCCTCCCCCCTCAGCCTCTTTAGTGCTTCCTGTCACCGCCC
+CCTAATGGGTGCCCCCAAACTTGAAAATGCAGCTTTCCATTCCTAGCAGAACTTAGGTTTTCACCCCTAT
+ACTGTGGAAATCTCTAGGAGAGAGTCCCAGATCTCAACAGAAGATAAAGTAGGGTGCCCCTCGGGCAAAA
+GTTTAATTTAGCATTGACAGCTGCCAAGCTCTATTACCCACCACAGCCTTATTATTCACCTAGAATCATA
+GGACAGTCCATGAGATAGGAAGGACTCCAGAAGAGGAACAATCAGAAAACAAGTACTCCCAGTGTCTTCC
+TTTAGGAAATTTGGAAACTTCATCATTTGCTCATCAAGCTAGTGACCATATAAGCCAGTCACTATTTTTG
+GCCATGTTTGTAAGCTAAATCCAAGAACCAGATACCTAGGGCACAAGAGATCTAAGACACCATCACAGTG
+CAGGGTACCTGGCTGGCAGAACTTCCCATGGGTTGCACTGTGCAAGGGAGCAAAGAATGCACAGCTGGTG
+TGTGCCATAGAGGGGGAGAGCATGTGCCCTAAGAGGCTTGCTTTCAAGGGACAACAGGATGCTGTACACA
+CGGGATCCTGTGTACACACTTTGCACGGGACACAACGAGCCCAAAGCTAGACCCTCAGCTGTCCACATCC
+CTAGATGCTAGACCTGGCCCAAACCCCGGACATATGTTTGTATATGACAGGGTTGCTCAAATGATCTTTC
+AGTTACCCTGAAATATATGCCCTGTAGTCCAACCTTCACAAGGTAAGTGTCCTACCAAGAACACTGTCCT
+ACCTGTCTGCTTTCTTCCTTGGAGGCAACAGACTCTTTCAAAACCAAGAGGATAACATCACCTTGTTTAT
+AAAAACTTATCAGATATAATGAGAAACAAATGACAGAAGAGGAGACTAAAATGAGTTTACAAGTCAAAGG
+CCAAGGCTGTTGCTCTGCAGGACGTACATTTTGGGATTAAATCATCCTTTACTGGGACTACGGGGTATGA
+AGGCAACATATCATGTCACCCACATAGGAGCTGTGCAGTGAGGGTGCAGGGGACAAGCAGAGGCCATCTC
+ATCCTAGGACATTAAAGAAACCACATGTTCTGCAAGTCTCTGTGAGGATGCACACACACAGAGCATGTCA
+CTGGGGCTCACAGTACCTGTGGCAAGTGCTGCCTTAGCATTTTACATGTCCACCTGCCCACTTCAGGTTC
+TCTGAATGGCCACGCTCAGAAAGTACATAAGACCTTAAGGACAATGCCCAGAGTTAAGCTTGCCCACAGA
+AAGACCATACTATTTACACATGCACAAAATTTCCTGGTTCCATGAGCTCCTAGGATTTAGATCTTTAATC
+GTTTGCTATCCTAAGGATCTCAGAAAAATGAAATCTTTGGATTTACCTGAAAACTGACTTTTAGGTGTCA
+AGATGAATTAAAACATTAAGTTTGAAAGACAGGAAAAGAGCAATGAAACAATCTTTACATGGAACAGAAA
+TTGTACCATAGTGTTCATCCAATTTGGTTCTTCCTTTATTAAACAGTCAGTTCCTAGCAAGGAGATCCCA
+TGGATAAGAAGGCCCTTCAGGTTGACTCTGAACTGACTGGTAAAGACCACAAGTTTGGTGGGAATTGTAG
+AGAGAAATAAATTTGCAAGTATGAATGAGGCACATCAGCAGATTGTCATGCGCTGTGGAAAGTAGAGAAG
+GAGGAGTGGAAGAAAGCAGAAGGAATCCATACTGTCTAGGACAAGGACTCAGAGGGCCTGGGAAGGGAGC
+CAGAGCCCTGGAGCCCTTCAGAGGACAGAATCGCTCTGCAGACAGGGATAAGGGCTAGGACAGCAAGTTC
+TCAATGCAGGGTCCCCAGAGTCCGAGATCATCATCTCTTTGAATGAGAAAGGTCCCTGAAAGGCAGCAGG
+GAACCCTGGGGACTGGGGACTTGGAGAGATCTGAAGCTAAGAGTCACAGCCAGTGCTGTCCTGGAGAATT
+AATACCAGTGTTGATCACTGATCCTCTGGTTGTCTCTGTCTGTGCACTGTCACAAGTTCACAGTGCAGCC
+AGCCCTAGTGTCTTAAAAGGACACTGTACAGGAAAAAAATCTAAATGTGTTGGAGGAAATTTGGCTTATA
+ACCTATAGAGTGACATCTGAGATGGATTCCATAATTCCTTGCCAATGGGTGACATTCAACATGGTCCATG
+CAAAAACACAGAATTCTATGAATAATGGCTTCTGGGGTTTTTTTTCTTTTTAAAAACATCACAGAAGTAA
+GTGCCAACCAAGTGTTCTCTGCCTCATGGAGACTCTCTGAAAGTCTCTCACATGCTGACTGTCTAGAGAA
+ACTCAGGAAATAGAGATGATACATAGTTGAAGAGGAATTTTAATCCAGCAACATGGTTACTCCTGCTGGA
+ATATATACACACCCTTAGCACACCCCTTTAATCCCAGATAATGCTGTCTAATTGATGGACAGACAAAGTA
+CAACCAGAATAAGGGAGTCTGTGCTGCCTGTCACCAGTGAGCAGAAATGGGAACTGAGTCTTCAGGTTGT
+GCTTTCATTGGAAAACAGGCAATGTGCAATGATGATGGGTTGATTCCCTAGGGAACAAAAACTGTTTTCA
+TATCTTGTCTGCACCCAGTGGGTTTTTATAGAGTTAAATAGTAGCTTTGGGGAAGCAACCATGTGGCATG
+GAAGAGACCCCAAGTAAAAGCCTCATGCCTTGTATTTTCATTCGTTCTTCGGACATTGGATTTCAAGCTC
+TTTTTTTTTTTTTAATGCCAAGTCTAATAAGCATCTGCTTGCATCAGCCCTGCACCCTCTAGATATCACT
+TGGGCTTACAAATTAACTCAGAGCAAACAAGCCATGAACCAGGCATATCTGCAGATTCCAGTTTACAGAA
+GTACAGCGTGCCCCTATTCCTCAACTATCCAGGTATCCCCACTACTAGTTATCAGACTTATATCTATAAA
+AATCTGTCTAATATAAAAGACATCCTGCCACTGTGCATGCAGCTGGCCATCAACACAGCTTCTTTGAGGG
+AAGAAAATTTTCCAGAACAGTTATGCACATCAGAGTACATCATACGCAAAAGTAACTTATGAGTATACTC
+ATAAATTAATGCTGAAACGAATAAAGAGCAGGCAGGAAAGCTAGGCAGGCTTCTCTGAGGCCTTTGAGGT
+GAGGGAGCCAGGAGGTGAGAAAAGACAGATCAAAACTGAAGAAATGTCCTTGCTTCATCATATCAGAGGC
+AATTTCAACAACTGCCGTAACCTTCTATGAGGTCTGAACACCTTCCAGCACACTGTGAGAACAGCACATT
+CTCTGGCACATGAGACATGAGTGATAGTGCCTCACCTGCCACGAGGATGCAGATGTAGTTAAGCATCTGT
+GGAGAGTGGATGACACCCTATGTGCAAGTTGTACATACATCAAGACAGAAAATAAGCTAGAGAGACACAC
+AGTAAGAGACAGGAACCAAGGGAGGAAAGAAAAGAGAAGGGAGGAAGGAGGGAGTAACTGAGAAAAGAGA
+ATGAGGGAGGGAGAAAGAGGGAGGGAAATAGAAAAAACTTTATTTAAACACATTAGCATTTCAAGATGGG
+AGAAAAACAGCCAAGTTTAATGTAAAACGCTTTGTCTAAAAAAAAAAAACACACACACATACAAACATAC
+TAAGAAGAAAAATATTCAAACTTTCTCTAAGACTCACGAAAACCTGTAACTATCACAAAGCATCTACAAC
+ATTGGTAGAGCCTTACTCCAACCAGACAAATACAAAACCAAACACAGAGGTAAGAGAGATGGGACTCAGT
+TGGGAGAGTGCTGATCTGGCATGCACAAACCTCTGGGTTCCATCCCAGAACCCCATAAACCAAGCATAAA
+GGAGCTCACCTAATGTCAGCTCTCAGAAGGCAGAGGCTGGAAGATCAATAGTCCAAGGCCATCCTCAGTT
+ATACGGTGAGTTTAAGGCTAGCTACCTGAAACGTGAGGCTCTGCTTCACCTGCAAGGAAGTGGATATTAT
+AAACTCATAGATATTAGAAAAAGATAAGAGACTTAAAAATCTGCACAAATGATTTCAAAACTAGGAATGC
+ACTTCTTCATGTACACAATGCTACAATGAACTGTATATAATGTAAATACAACAAAGTGTAGAATGAACTA
+TTAAAACCCAAATGAAGCAATAATGCGTAATAAGTAGGGGAACAAAAGATATTTGGTTTTCATGAACTAG
+CTGGATGGTATTTCAAAGCCTCCCAAAATTAGTAAGCTTGACTACCAATTGTCATGAGAGTACCAATTGT
+ACTCTCAGGTCTGGTGTAAATGTCCACCTGGGAGGCAGCATGATGATGGAGGCTGTAAGTCTCTGTTCCT
+TCCAGGTCTGACTTCCCATCTAGGCATTGCAGAGAGAACATGGTGTCTTGGTTGTAACTGATTCTGTGCT
+CAGACATGGCAATGCAAACATAGAGAAAGATGAAGTAGAGAGAGAAGAGGAGAGCATGTAGACACGACAG
+CTGAGGAGCACAGGAGAGACTGGAGCCAGTGCAAGCTGTTTTGTATATTCTGGGTGTCACAAAACAGACT
+TGGACACTCAGCGATCCTGCACAATTTGTCCAATCATGACTCTTGTCAGGAGGAAATCTGAGATCCAGAA
+AGCTTAAGTAATTTCCTTCTAACATAGCTAGTCAGTAGGACAGGGCCCAAGTCAGACTGGCTTTGTGAGG
+GCCATTCTGTGAGGTTCCTAAGCCCATACCTCCTGTCTTGCAGGAGACACTATTCTTCTCTGTTTCATCA
+ACATGTTTCTGGCTCAAAGCTGAGCTCCTTCTAGCCTAGGTTTGCACACAACCTAAACTTCTGGCAAGAC
+AAAGGAGTCCAACAGGTCAGCACAACATAGAAGAAATCTTACCCACAGACCTACCAAAGACAGGATGAAG
+TAGGACACTGGGGGAGGGTTCAGGAAGAAAGCCTCATCAGGCTTGTAATCAAGAGAGTCCAAGCTCCCCA
+ATGTGGTGTCTGAGGTCATTCTCACTGAGCCAAAGCTGACCTCATCTCCAGGCCTGATAAGGGACCATAC
+TGGATGATACCCACATATGCCATTGACCTTCTACTAGAATGGTAAGTGTCACAGAGTTTAAAGAGTTCAA
+AGATACCCATGACTTTCTGTCCTCCAAAATCTAGGTGCACGCAGCCTCAAGTGCAATCCTGAATGGTCCC
+TCCTGACCCTCTCCTCCCTCTTTCCTCACAGCCATGCACAATCCCTAAGGCCATTAGCCTAGACCAGAAC
+CAGCACTGATGAGAACTAGCTGAGAAAGTGTGTACCTTGATGGGTGTAAGGCAGAAAAAATAGATTCCAG
+ACTGAATATTTCCATAACATCTCCAACTCCACGATATCTAATAAAGTAGGTTGTGTGTTAAAAATTAGTT
+AGTGACCCATGGTTATCTATAACAGGAGGGTCTGACTTCCCAGAACCCAGAAGGTACCACGTACACATCG
+GAGTCTTTCTATGTCTGTCTTGTCTCTGTCTGTCTCCATCTCCCTGTCTCTCTGTCTCTCTCTTTCCTAC
+CCTCAGGATGAGGACATGTTTCTCTATCTCAATGTACCTGACACCATGCCTCATCAATAGTGGTATTGTA
+CTGGCTGGTTTTGTGTGTTAACTTGACACAAGCTGGAGTTATAATAGAAAAAGATCCTCCCTTGAGGAAA
+TGCCTCCATGAGATCTAGCAGTAAGGCATTTTCTCAATTATCAAACAATGGGGGAGAGCCCAGCCCATTA
+CGGGTGGTGCCATCCCTGGGCTGGTAGTCTTGGGTTCTATAAGAAAGCAAGCTGAGCAAGCCAGGGGAAG
+CAAGCCACAGCCTCTGCATCAGCTCCTGCTTCCTGACCTGCTTGAGTTCCTGTCCTGGCTTCCTTTGGTG
+ATGAACAGCAATGTGGAAGTGTAAGCTGAATAAACCCTTTCCTCCCCAACTTGCTTCTTGGTCATGATGT
+TTGTGCAGGAATAGAAACCTTGACTAAGACAGGTGTGAAAATGAGAATCTGTCCCATTTTACAGAGAATC
+CATGTGCTACAGGGGAAAGTACATGATAGCCAGGTGTCTGTTACTTAGACAAAGCAGGAGAGGAAAGCGG
+TCCAAATGAGGACATATTTATTTTTCCCCATGGCCTCAGAGCTTTCGGTCCATGATCTCCTGGCTCCACG
+ATGGGAAGAACATTTACTGCAAGGATGAGTGCTGGAGAAAAGCTGCTCACCACAGGGTTTGAGGAAGCCA
+AGAGATGAAAGGGACAGGTTGTGATGTACCCAGCCATGCCCACCCCAACATCTACTTCTTCCTAGTAGCC
+CACCCCTGGGTTTCCACCACCTCTCAATAGCCTTTTATTAATCCACACCTACATCACAGCATTGATGGTC
+TTAGATTCATGTTCCCATCACCCCAACTCTGAACACTATTACTCTGGGACCATGCCTGTATTGAAAGCCA
+TTGAAGGAGAGCTCAGATCCATCTCTAACAGGAAATGATGGGTAAAATGAAACTGCCACACCACAAGATG
+TTCAAGGTGTAGTCCTTGAGAAAGGACACCCAGCACAACCACAACTTTGGAAGTCAGTGCTTCTTAGTGG
+GGGCTCTGATTAGATAGTGTTCTCTGATAATCATCTGCACCTGTGAACATTTATTAGTGACATGATTGTG
+AGAAAAATACCTGATATAGACAACATGTGAAAAGGGAGGGTTTTAAAAGATATGTTGCATGACTGTGCCT
+GTTTTGTTGTTGTTGTTGTTTGTTTGTTTGTTTGTTTGTTTGTAGGGGGACTAGGAGAATACTTGTGACA
+ATCAGAGAGTAACTTGTAGGAGTCCCAGGTACTGACCTTAAGTCATCAGGCTTGGAAGCAAGTGCCTTTA
+TCCGCTGAACCACATTACTGAGCACATGAGAGCAATTTTGGTTCATGGATCCAGAGGGGTTTCAGCTCAT
+CTTTTCAGGGAAGATAGGGTAGCAAGAGCAGTTCTCTTCCTTATACCCCGTGCCTGGAGAGCAATGGCCA
+CCCCTCAGTATCTGTAACATCGTCAGTGTTCAGAGCGGTCCTCCACACACTTCTCTTTCTCCTGAAGCCA
+GAACCAGCTGCCATCCCCCTCGTGCACAAGCTTTCTGGCAAAGAGGGTGTCCTGCTGAGGCTCTGAAGGC
+CCCTGGCAGAGAGGGTGTCCTGGCGAGGCTCTGCAGGCCCCTGGCAGAGAGGGTGTCCTGGCGAGGCTCT
+GCAGGTCCCAGCTGGGCAGGATGAACAGCCACCTTCCTGAGCCCCGGCTTGGGTTATGATATAAGCTTCA
+AGTTAATGAGTTGGCATTGGTAGAAGCTTTAACACATCAGCATTGGCTGGACTTCAGGTGAAGCTGATGT
+CTACAGATATGATGGGAAACCAGATAGGAACACGGTACTGTCGGCCTGCCGTGGTGGTGCACATCTGTAA
+TGCCAGAACTCATTTGCTATTTTTCAGAACGTGTAGTTTATCTGTCTAGAAACACCCCCCCTCCCCCGCA
+ACACACACACACACACACTCTGAGTTTTAAAGTCTCAACTGAAGAATCTGTGGCTACTGTGATGTGACTT
+GCTGCTTTTCTCTGATCTCCTTCAACAAGATCATGGGAGTTCATAGTGATCCTGAGGCCCGGTCACAAAA
+ATACGGCAGGAAAGCTGGCCCAGCAGTAAAGACACTTGCTGCTCTTGCAGAGGACCAGAGTTTGGTTCTC
+AACACCCACCCACATGAGATGGCTCTTAACCACCTGTAACCCTTGTTCCAGGGGATCTGGCCACTGAGAA
+CACCGGAAAGGGCGTGTGCATGCCCTCACCCAGAAGAACACAGACACATACATTTAGAAGTCTTTAAAAA
+TTAATTTCATGTCATTTTAAACTCTGGAGAAAGAACAAGAACAGCATCAACAACCCACTGGCCCTGTGTG
+TACACTGACTGTATCCCACAACGGACGGACTGAAGAGGCAAGACACGAAATTGCTCCTAGATTAAGAGCT
+TTGCCCAATAGAACTACAGAAGAATGAAACAAGCATCTGAGGAGATTTCTATAAAGCCAAGGGCTAACTG
+AGACACAGAGGAAGGAAGGAGGCGGGGCAGTGGTGGCTCACGGCTTTAATCCCTGCACCTGGGAGGCAGA
+GGCAAATGGATCTCTGAGTTTGAGGCCAGCCTGCAAAGACTGCAAGGTTAATGATGTTTCAGAAAACTGG
+TTGACTTGATTGAACCAAAAAACACACAGAGTTCAGAAACGCTATGTAGAATTCAGAATCCAGGGTTCCA
+TGACAGTCCATCTTGGAGATCTGGCTCTCGTTGATTGAGTCTGTTTAACCCAGAAATGCACAGGCAGCAT
+GAATCAGCTAGACCGGAGCCCTTGGCTGGGGCTGAGGCCAGAGGTAGAGAGAGTTCAGGACAGGACTTTA
+CTCTCTGGGAACTGGAGGTCCACTCTGGGAAGGACAGGCACTGTGTTGAGATAGGAGGGCCCTGATAACA
+GTGCTCACTTGATACAGGGGTTTGCAAGGTCAGTGATACCAACACGAGGAGTGGCCTGGATTGTAAGTGG
+CTGCAAGGAGTTCAAACAGGAAACAGGACCTGCTGGTTCATCCCAGTTCAAAGCACTGGATGATTCAGAA
+AGGCTGGAGAAGAGATTCTGGTCACCTGGGTGCAGGTGGCTCTAGTTGGCTTCAGAATGTCCAAGATGTG
+TCTTCTCTTCTGCTCATGTACACCACACAGAGTGACACATAGTACTTCGTCAAAGCAAGGTCATGAGGGT
+GGACACTATCCCCCAACTTTTATCCTAGGCTGCTTTCACACAGAAACACAGAGACATGATGTTTGTGGGT
+GATATAACACAGGATGGCATTCATCTAGGTGAGCGTGTGCCTAACTATGCCACACACTGAATTCTGCACA
+CGATGAATGGTCCCTGCTTGAGCAGCATAATGGGAACTTGCTTCCTTAGCTTGGGCTCTCCTGCATGTCT
+GTCTGTCTGTCTGTCTGTCTGTCTGTCTGCCTCTTTGTGTGTCTGTGTGTCTGTCTCTCTGTCTATATGT
+GTGTGTGTATCTGTGTCTGTCTCTGTCTCTCTGTCTTCAGTTCTCAGTGCTTATGACACAGCATCCTATG
+AGGCGGCAGGGCCACAGTCAGAAAGAACTTCTATACCCAAATGATGCCACGGAACAAGACCACTCAGCCT
+CCTGGATGCTCCCCATGGAAGTTCTGTAAGGAAAGTGAACTGTTTTGCAGATTGGGGTCTGTTCCTTCTC
+CTCTAACACACTAGGAGTGAGGAGCACTCAAGAAGAAGGAAGACCACAGCGTGGATACTTCGGTCATTCT
+TAGAAGGGGGAACAAAATACCCATGGGAGGAAATACAAAGACAAAGTGTGGAGCAGAAACTGAAGGAATG
+ACCATCCAGAGACTGCCCACCTGGGGATCCATCCAACCCCAGACACTACTGTGGATGCCAACAAGTGCTT
+GCTAACAGAAGCCTGAAATAGCTTTCTCCTGAAAGCCTCTACCAGTGCCTGACAAATACAGAAGTGGATG
+CTCTCAGCCAACCATTGGATGGAGTACAGGGTCCCTAATGAAGGCCCCAAGGAGCTGAAGGGGTTTGCAG
+CCCCATAGGAGGAACAACGATATGAACTAACCAGTAACTCCAGAGCTCCCAAGGACTAAACCACCAACCA
+AAGAGTACACATGGAGGGACCCATGGCTCCAGCTGCATATGTAGCAGAGGATGGCCTTGTCAGTCATTAA
+AGGGAGGAGAGGCCCTTGGTCTTGTGAGGGCGAGATACCCCAGTGTAGGGGAATGCCGAGACCAGGAAGA
+GGGAGTGGATGGATTGGTGAGCAGGGGAGGGAGGAGGAAAAAGGGGGTTTTCAGAGGGGAAACCAGGAAA
+GGGGATAATATTTGAAATGTGAATAGAGAAAAGGTTTGGGGAGTGTTAGCCATGGCTGTGTAAAACCAGC
+AGGTGGCTGCAGCCAAACTCAGGGGGATCCAGTGCCAATGCTACAAACATTCTAAAGTGTCAGGACTTCT
+AGTGGATTCCAGCCACATCAAGTCATTGATCTTTTGGGCTAAGTTCGAAGGCCTTGCCTTACAACTCTGT
+TACCTGTAGCTTTATTTTCCTAGTTTTGCCACAATGCTTTAAAACTTGTTTTCAGGCATTCGGAGTGAGA
+CATGGAGCTGAGCGATCGCAGCGGAGAACGAGATGCCTGTGGCCGTGGGTCCCTACAGGCAGTCCCAGCC
+CAGCTGCTTCGACTGCGTGAAGATGGGCTTCGTCATGGGTTGCGCCGTGGGTATGGCGGCCGGGGTGCTG
+TTCCGCACCTTCTCCTGTCTCAGGATCGGAATGCGGGGTTGGGAGCTAATGGGCAGCATTGGGAAAACCA
+TGGTGCAGAGTAGCAGCACGTTTGGCACTTTCATGGCCATTGGAATGGGCATACGATGCTAATTAGGGCT
+AGGATGCCCTGCAATATCTAAACTTCCCCATCCATTTCGACCCTTGTACAATAATAAAGTTGTTTTCTTC
+TTGTTTAAAAAAAAAAAAACAAAAACAAAAAAACCAAACTTGTTTTCATTCCAAATTCTCTTCGTTTGCC
+ACACCTTCCCCTTTACATAGAGAATGATTCAGATTTGGTTTGCATAGAGATCATAACCCAGAGTTCCATT
+TGCCAAAAGTGGAGTGAATATGGCCCACAAAGCCTAAATACTGGGCCCTTATGTGGGCCGTTTGCCAACC
+TTGGTTCTAGGTTCATGTCTACTTTGGTGTGGTTTGGTAATTTTATTCTCTCTACTCAACGTTATCATAG
+TGCCACTCCTGACTTCACTGCATCCTTTCCAGCTTTTTGTCTTTGTCTTCTAGTTAAAATTTAACACACA
+TTGACTGAAATAATGTGAACTTAACCAGTTTCACGAACATTTCACTTCTATGAATTTTTTATCTTATGAC
+CTTTAACTTTTAGTTCCTTATATATTTTAGCAAGCCTGCCTGTTCCTTAGCTGTAGTAGGCAACAGCAGG
+CAGGGGGCATGAAGACAACACCTGTCAGGGCTCTCATCCAGATGTGCTACCTGGTACAGGTAAGTTGCAG
+GTGCTGCCTGACAAAGGTCAGGCACTATGATCCAGCCCTGGTGTAGGAGCATGCTCAGTAGGTACTTCTG
+CAATTTGTCCAAGTCCTTCATGGACAACTTGATGCAAAATGTCCACAGCGTCTGTGACCATACACTGTGG
+CACTGGGGCCCAGCAAGCCAGGCCTGCCCATCCCTGATCATGCTCTTCCAAGGCAGCTGCAGCTTCTTAA
+TGAATCTGCTTAGCTCAGAGACTGCTGGTTGTATGGAGGGACCAAAATATCTCTGGAGATAATGCACTCA
+AATTCAAAATTTTCCAATCATAGAGCATTTGCCAAGCCCTGCCATGCATATTCAAGATGAATAATCCAAA
+GCTGGCACTGTGGTACACACCTGTAATGCCAGTAGTTGGGAGGTAGAAGAAGGAGGATTGTGGGATCAAG
+GGCCATCTCAGGCCCATAGTGAGTGAGGACCCAGCTAGTATTGGAGACATGAGATCCTGGATCAAAAACA
+CCAGCAACCAGAGGCTGGACATGGGTCAGAGGTGAAGAGTACACTGCTCTTAGATCCGTTCTGTTCCCAG
+CACCCACAGGGCCATCTGTAATTCTAGCTCCAGGAATTTGATGCTCTCTTCTGGCATCTGCAGGCACCAG
+TAACCCCTGTGGTGGTCATACACAAATACACATAAAGTGATAATAAATATTTTAAAAAGAACAATGAAAA
+AAGTGGAGAGAGAGAGAGGGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAA
+GAGATTCCCTCAATTAAGTTCTGCTTGAGTTGCAGACAGTAGAGTCTGAGAACACTGATGTGGGATGCTC
+CACTCAGGAAGGAGAATGACGAGGAGTGCAGAACATGCTCACCCAGGGCCTGCATGCTCCGCCCCTTCCC
+ATCAGCACAAGATCACTCTGAAGCATAACGCAAGCTGGCTACACATGGACAACTGTGGTACTGCACTCCT
+CAGAACAAATTAGAAATGAAGGAGATCTCTCTGATGGGAATCCCAGATTAAGGAACAAGGGGACATGGAG
+CAGGGACATCAAGGACTTTAAATAGCCCAGAGTCCCTACATCTCTACCTCTAGCTCCTCCTGCAAGCAAC
+AAAATAGACACCACCCAACCAAGTCACCACAGCCAGCCCTGAGTGTCACAGAAAAATAATACACTGAACA
+AATGTGATTCACACTTTATTGAGAACCAAATGGGGCTTAGCTGCTTACTCTGAGGACCAAGGCAGCACAC
+AGACAGATCTAATCTCTCAGGCCAGGCCGGGGATTCTCGGGTCTAGTACATTCCTCCAACCATCACTGAG
+GGAAGGGCTGGATGCAGGGCAGAGACAAGGGATAGGAGGAGACTAATGAAGAAATGGAGAGCCCAAGGAC
+CCAGAGGCTTGGCTGAGGGAAGGGCTGGGAAGAGGAAATGGAGGAATAGGGAATGGAAAGGAGAAGAAAG
+CTGCACGTGGTAAGAGCTAGGGCCAGCAGAGCACTCATGAGTATTCCCCCAGGTCTGGGCTTTGTAATCC
+TACTTTGGGTTCCAAAAGGCCATCTTTGCAAAGATTGGCTTGGAGAGGAAGCGGCTGCTCTTCATGTAGT
+CAGATATCTTCTTCAGGCCCTGTGGGGAGGAGGAGAAAGCAAGAATTGAGGAACTGGGGCTCCAGTGGGT
+GCCCAGGTTTCACTCTAAGAACCCCAACACAGCCCAGGCTGCTCAGGTGCCCAGCTCTGGTAAGGGAATG
+CTTTAATTTCCTGTGACGATGGAAAACTGGCACTGCGTTCAGCTGTGTGTCCGTCTGCAGGTGTGAGGGA
+GAGAGAGCACTCCAGAGAATCCTGCAGAACAGCGCTGTGCTCTCTGCATTCATCAGGAGGAAGCCTGACA
+CACAGGCTCAGCCAGCGGGAACTGCAAGAGTCTGCATCCCTGCCCACTCCCTTGGTTCACTCACCAAGCA
+AGAAATTCCTTGTTGCTATCTGTTTATACAAACTCCCGATTATTTATTTATTTATTTATTTATTTTTACA
+AGCAAGCTCCTTCTTGCCTGTGTTTCCACATTGGGGAGGGACCAAGCAGATCAAAGGGACCAGTAACATG
+CCAGTGTACTTTGGAAGACAGCAAAGATCTCTCCCAGTACCTCAGTAAATCCTGACTGAATGAAGCTAGA
+GATAGGTGAGCCCGCACGGAGACAATGACATGGCCTTGGCCATTCTCCTGATGCAGAAAAACCTAAACTC
+GGAACACTGGCATCTGAATGCATACTTAGTGTGTCCATCCCATTTCTCTCGGCCCTGCTAAAGAACATTC
+TACAGAATGGCATTGTGCTGTGTGCTATCTTTCCAGTACAGTGCCCAGATAAGCTGTTATGAATTCAAAG
+TTCCAGGGCGGCCTGTACCCAGGAGACGAGGCTTGGCTGGCCTTGGACTGCAGCCTCCCACACTCTCCCA
+CGGCACTGTCCAACTGTTCTCACACAATCTCCAGAGCCGAGAAGCCTGGAGCAGAATGAAAACTAAGAAA
+AACACATCACCTGTGCAGCAAGGACAGAAAAGAAGATCCCATGCTAAGATTCCCATTGGACCATGGCCTT
+GATCCCCCAAATCTCTAATAAAGTAGGGTTTGTGCCTATCACACGTGCACTACAGTCAGGCATGCCTGCC
+ACTCTGCATGCCTGGACAGGGATGCATGCTTCCATGCACTACTGAGCTCAGAACTGTGCCTCATAGAAGC
+AACACTAGAGAAGATGGATGAAGATGGCAGTGAAATCCCACTATAATGATTTTACAGACAAACATCAAGA
+TGTTGCTGAGGAGGAAAATAAAAGGAAAGGAACAGGATGCCTAAATATTGAACAATTTAAGATAAAATCC
+TGCAGCAGCAGTGAACTGTCTACGCAGCACTAGCTGTAGTCCTGAGTCCCAGTGCTTCCCCAGTCAGGGA
+CCCGACTAAGCAAAGAATTCTACAGGCCATATGGATATAGCATCTTGGTACTGTACATCAATCCTGAATA
+GTACACCCTACACACTCTGGCCATGTCCTGGGTTTCCATAGGCTTCAGGGCTGGAAAATCCAGGCTCCAT
+GACAACTTATGACTTCTCTCCTTTGGCATCAGGTGGATCTACTGCCTTTGTTTTTTGGTCACGAGAAGCA
+CACAGGAGAAAACCACAACCCCTTCCAGTGGTACTGGTTCCCTGAAAAGCTGACAGTCTTCACAATCAGC
+TCCCAACACGATCCAATGGAGCTGATAAGTTCACACAAAACGCAGGGTGTGATGGGAAATTCCATGCAAG
+AGATGAAAGAACTGAACACAGTTCCATGCTTAACAAAAGCTAATCATTTTCGTTTCCATTTGAAGGAGTA
+GAAAGGCAGTCTGGAAACAGGGGAGGGAAGTAGGGCATTCAAGAGGAAGTGAGATCAGGGCATCACCTCA
+AAGCGACCCATGAAGTCCTTCAGGTTTGGGAAGGCATCCAGGCACTTGGGCTCAAATATTCGGTGTTGAT
+CAAGGACATCATAAACAAGAAAATCCACATAGGTGACCTGCAGGAAGCCAAGGGTGAGTGCTCTGGAATT
+TGATGCTTGGAAAGGATGGCAACCCCTCCTTCCCACCCCCTCCCTTTACCTTGTTCCCTGCAAACCATGG
+CTGCTTGCCCAGAAACTCAGAGTAGAGCTTCATCTTCTCAGGGAGACCCTCTAAGTACTCTGGCTTCTTT
+TTCTCCTGAAGCAGAAAACAACTGTCACCACTTTCAGACACAGACCCCCTGGCAGAGGCTGGCCTCACAG
+GGCTGTGCAGGCCCCAGCAAGCAGAGGAGAAGTCACCTACATGAGATTCAGTCAGGGATGATATCCAAGC
+ATTTAGTTAGAATGATTAGACAGCCCAACAGGATACAGCCCTGTCTCCTGGTCTCTACCACCATTGTCAG
+GACCACTGTGCATGAGACGGCTTTGAGCACATGGCTCTGAACTCTGACCTTACATGAGGCAGAAGCAAAC
+AACTGAAATCGTCCAGGAAACAGGCCGTGCTCTACCTACTAGTGCGGTGAGGACTTCAGTACAGGTTCCA
+AAGAGATGGGGTGAATTTGAGAGGACAAAAGGAAACAGGCGCAGGATCAAAGGCTTTTGGCTGACTGAGT
+TGCCTATTGGGGACATCTGCTTATCTCTGAGTGTGTCAAGGACACAGCTCTGGGAAGCCATTTAGGAGGA
+GAGTGTCCCCTATCCTCAGTGTCTTCTTCCCATGAGTGTGCAATCTGCAGACATGGCAAGACTCCAGGAC
+AACAGAAAGACATGACTCAGGACAGAACCAGCACTTAGACCTGGAGGGATCGAACTCTGTCTGTCCTATC
+AGGGACTAAAGTGTCACTGTCGAGAGTCTGGACTGGACCTTAGGCCAATGACTGGAAGTTTTCATTTGGC
+AAACTAAGCTTTGAGGTTACTCTGTTGGCAATTTCAAACACTGTGCCATGAGCAGCACTGAGAGGCTCTG
+GTGGCAAACAGGGCCAGAGCAGATGCAAGGGCAAGAGTGTGGGGCCAGCTCCAACTCACAAAGTCAGGGC
+TGTAGCAAACCATGGCCAACTGTATGCGGGTGTCCATAGCCTGGTTCTCCAAAATGTCCACACGAATCCT
+CTCCTCCTCTGTCTCTCCACCTGCAGCACACACAGGCCAGACTCACCCTCAGCATCCCACTCCAGATAAG
+CCTAGAAAATGACTACTATCCTCCACCCTGCAGCCAGCCCCACTCACACAGGTTGTGCTTTCGGGCAAGG
+TAGCGCAGGATGGCATTGCTCTGGGTGATCTTGTGTGATCCATCAATCAAGTAGGGCAGCTGGATGGACA
+AGCAGGGAGGGTCAGGTGGGATATGGGGCACCTGAATCCTTTCTCTGGGTGAGAGCAGACACAGGGCCTG
+ACACTCAGCCTGCCTGCCATGGGAAGCCCTCCATGGGCACAGTGTGCACTGAATGGGAGAGCTGGGACAC
+AGCCCATCACAGAAACACTGTGCAGCCAACAGGGATGGAGCAGGAGAGCAGAAACACTGACCCCCAACCC
+TCAGCCAGGCCAGGAGATGAGGTGAGAACAGTGTGTCTTCCCCAAACCTCCTTTCCTCCTCCACCTACAT
+TGGGAAAGTCCAGGCCCAGCTTGAACTTCTCACTCAGCCACTGGCTTCGGTCATAGTCAGGAGCTGTGGT
+GGACAAGAGGATGTGAAAAGAAATCATGCCAGTATCCCAGCTTCCTTCTCTAGGGAAGGAGTCAGGTATG
+AGAACACCCCCAAGAAGATCTGCAGCCTCACCCAAGAGTAGACAATGCTTCTGAGGACTGTGGACTGTAT
+ATCACTAAGATTTTCAAAAGGTTTGAATAAGCCAATTAGTTGAAAAGAGAACCCAGCCCCCCAAATTCAA
+AACAGTATCTGGAAGAGACAGTTTCTAAATCCAACTCCAGGAGCTCTGTAGAGCTGAGCAACAGGGTGTC
+AGAGCAGTGTCTCTAGGCACCAGTTAGCAGGGGAGCAGGGGCCTGAGAAACAGGGTGTAATCACCGTCCC
+CCATGGTGTATTTCTTGTCCTCATAGCTTGTGTCTGTGTATTCCAGGAGCAGGCGGATGGCGTGAGCCAG
+CTGGAAAGATGACATATGACAAGTCACAGATTGCAAAACCCTGACAGTCTGACTCTGCTGATTCTTGGTG
+AGCCTACATCTTACACGTAACTGTCTCCCCACAAGCTCACACACACATACCACATAAGCACAAGCTCTCG
+TGTTTGAACTTCCAACAACTTCTGAACTCAGTCTATCTCCTGGAAATCAGTTCCCTAGCTACTCCCCACC
+CCACCCCTGTCCCACCCAGCAGAAGCGTTCTCACTCACCCCACGGATGTCCCAGTAACCTAGTGTCATAG
+GCATAGTGCTGGTGTCCAGAGCAGACCAACCCGGGCCTTGCTAGTCTGAGCTTCCTGATAATGTCTTAGG
+CAAGGAAGAGGCGGGACCTGGATATGGCTTTCTATTGGACCATTGGCCTACAGTGTCTCAAATTTCAGAC
+ATTCCAGTTGAAGGAAGGAGCCAGGGATCCAGGGGCAGGGCAAGATATAGCTAATCTCAGGAACCAGTCT
+AAGCAGTCAGAAATAAAAGATGGGAATGGGCTAGCACCTGTGTCAGCAACTCCCTCTTCTTGCTCTAGCG
+TGCTGTTTCCATGGAGAACAGTTGTGCAGCTAGGGACATTCTTGATGGTCAAAGTGGAACCCTGTTATTA
+TCTTCCTGACATTGTCCCTTCTTCATTATCAACTCCCTTCCTGGGTTCTGGGTGGCTCCTGGTGGCTGAG
+CCAGGACAAAACAGGACACCTCAAGCCTGAGAGATTAATGTCTCTGTGGTCTATAGCTTATCATAGACAG
+CAGAGGGCATGCAGGACTCTGCTAAGCTGGGCAACAGAGCCACATACAGGTACGACCAGTGGATGCAGTA
+CAGATATTCAGGATCACTGGGCACTGGAAAGATGTAAAGCAAACCACAGTGTGATACACCTTCCCCAGAA
+AGAACTGCCATTCGCACATGGGGGCTGGTGAGACTGTGGGCAGGAGAGAACATTCGACACTGCAGGTAAG
+AGTATCAACTGTCACAACCGTTAGGGAAAACAACCTGGTGTTCCTCGAAAAGACAAAACTAGAGCTACCA
+CGTGATCCAGCAATACTCGATATACATCCACAGGAAATGAAATTAGAATGACAGGGGAACACAATTTACC
+TGAGCTAAAAAATAGATGGCATCTAAGAACCCATTCTCTGACAAATGTACGATGAAAGTGCAGTCCACGT
+GCACCATGGATTACTAGTCACTCTATAAAAAAGGGGAAAAAATAAAACTAGATATCCACACACAGAGAAA
+CTAGAAGTGAGGGACTGGAGAGATGGCTCAACAGTTAAGAGCACTAATTGCTCTTCCAGAGGTCCTGAGT
+TCAAGTCCCAGCAACCATGTGGTGGCTCACAACCATCTGTAATGGGATCCAATGCCCTCTTCTGGTATGT
+CTGAAGACAGCTACAATGTACTCAAAAAAACAAAAATAATAATAGAAGTGAGATTCCTATTTCTCATCCC
+GGGGTGGGGGGCAGACAGGAAGGGATACAAATAAATCAATGGCCTTAATGTAAGACTTGAAGTTTAGGCC
+AGCAGAGTCTTGCCCAACACCCGCAAGGGCCCACACGGGACTCCCCACGGGATCCTAAGACCTCTGGTGA
+GTGGAACACAACTTCTGCCAGGAGTCTGGTTCGAACACCAGATATCTGGGTACCTGCCCTGCAAGAAGAG
+AGCTTGCCTGCAGAGAATACTCTGCCCACTGAAACTAAGGAGAGTGCTACCCTCCAGGTCTGCTTATAGA
+GGCTAACAGAGTCACCTGAAGAACAAGCTCTTAACAGTGACAACTAAAACAGCTAGCTTCAGAGATTACC
+AGATGGCGAAAGGCAAACATAAGAATCCTACTAACAGAAATCAAGACCACTCACCATCATCAGAACGCAG
+CACTCCCACCCCACCTAGTCCTGGGCACCCCAACACAACCGAAAATCTAGACCCAGATTTTAAAACATTT
+CTCATGATGATGATAGAGGACATCAAGAAGGACTTTCATAAGTCACTTAAAGAATTACAGGAGAGCACTG
+CTAAAGAGTTACAGGCCCTTAAAGAAAAGCAGGAAAACACAGCCAAACAGGTAGAAATCATTAAAGAAAA
+ACAGGAAAACACATCCAAACAGGTGATGGAAATGAACAAAACCATACTAGAACTAAAAGGGGAAGTAGAC
+ACAATAAAGAAAACCCAAAGCGAGGCAACACTGGAGATAGAAACCCTAGGAAAGAGATCTGGAACCATAG
+ATGTGAGCATCAGCAACAGAATACAAGAAATGGAAGAGAGAATCTCAGGTGCAGAAGATTCCGTAGAGAA
+CATCGACACAACAGTCAAAGAAAATACAAAATGCAAAAGGATCCTAACTCAAAACATCCAGGTAATGCAG
+GACACAATGAGAAGACCAAACCTACGGATAATAGGAATTGATGAGAATGAAGATTTTCAACTTAAAGGGC
+CAGCTAATATCTTCAACAAAATAATAGAAGAAAACTTCCCAAACATAAAAAAAGAGATGCCCATGATCAT
+ACAAGAAGCCTACAGAACTCCAAATAGACTGGACCAGAAAAGAAATTCCTCCCAACACATAATAATCAGA
+ACAACAAATGCACTAAATAAAGATAGAATATTAAAAGCAGTAAGGGAGAAAGGTCAAGTAACATATAAAG
+GAAGGCCTATCAGAATTACACCAGACTTTTCACCAGAGATTATGAAAGCCAGAAGAGCCTGGACAGATGT
+TATACAGACACTAAGAGAACACAAATGCCAGCCCAGGCTACTATACCCGTCCAAACTCTCAATTACCATA
+GTTGGAGAAACCAAAGTATTCCACGACAAAAACAAATTCACACAATATCTTTCCACAAATCCAGCCCTTC
+AAAGGATAATAACAGAAAAGAAGCAATACAAGGACGGAAATCACGCCCTAGAACAACCAAGAAAGTAATC
+ATTCAACAAACCAAAAAGAAGACAGCCACAAGAACAGAATGCCAACTCTAACAACAAAAATAAAAGGAAG
+CAACAATTACTTTTCCTTAATATCTCTTAATATCAATGGACTCAATTCCCCAATAAAAAGACATAGACTA
+ACAGACTGGCTACACAAACAGGACCCAACATTCTGCTGCTTACAGGAAACCCATCTCAGGGAAAACGACA
+GACACTACCTCAGAGTGAAAGGCTGGAAAACAATTTTCCAAGCAAATGGACTGAAGAAACAAGCTGGAGT
+AGCCATTTTAATATCGGATAAAATCGACTTCCAACCCAAAGTTATCAAAAAAGACAAGGAGGGACACTTC
+ATACTCATCAAAGGTAAAATCCTCCAAGAGGAACTCTCAATTCTGAATATCTACGCACCAAATGCAAGGG
+CAGCCACATTCATTAGAGACACTTTAGTAAAGCTCAAAGCATACATTGCACCACACACAATAATAGTGGG
+AGACTTCAACACACCACTTTCTTCAAAGGACAGATCGTGGAAACAGAAACTAAACAGGGACACAGTGAAA
+CTAACAGAAGTTATGAAACAAATGGACCTGACAGATATCTACAGAACATTTTATCCTAAAACAAAAGGAT
+ATACCTTCTTCTCAGCACCTCACGGGACCTTCTCCAAAATTGACCATATAATTGGTCACAAAACAGGCCT
+CAATAGATACAAAAATATTGAAATTGTCCCATGTATCCTATCAGACCACCAAGGCCTAAGACTGATCTTC
+AATAACAACATAAATAATGGAAAGCCAACATTCACGTGGAAACTGAGTAACACTCTTCTCAATGATACCT
+TGGTCAAGGAAGGAATAAAGAAAGAAATTAAAGACTTTTTAGAGTTTAATGAAAATGAAGCCACAACGTA
+CCCAAACCTATGGGACACAATGAAAGCATTTCTAAGAGGGAAACTCATAGCTCTGAGTGCCTCCAAGAAG
+AAACGGGAGACAGCACATACTAGCAGCTTGACAACACATCTAAAAGCCCTAGAAAAAAAGGAAGCAAATT
+CACCCAAGAGGAGTAGAAGGCAGGAAATAATCAAACTCAGGGGTGAAATCAACCAAGTGGAAACAAGAAG
+AACTATTCAAAGAATTAACCAAACGAGGAGTTGGTTCTTTGAGAAAATCAACAAGATAGATAAACCCTTA
+GCTAGACTCACTAAAGGGCACAGGGACAAAATCCTAATTAACAAAATCAGAAATGAAAAGGGAGACATAA
+CAACAGATCCTGAAGAAATCCAAAACACCATCAGATCCTTCTACAAAAGGCTATACTCAACAAAACTGGA
+AAACCTGGACGAAATGGACAAATTTCTGGACAGATACCAGGTACCAAAGTTGAATCAGGATCAAGTTGAT
+CATCTAAACAGTCCCATATCACCTAAAGAAATAGAAGCAGTTATTAATAGTCTCCCAACCAAAAAAAGCC
+CAGGACCAGATGGGTTTAGTGCAGAGTTCTATCAGACCTTCAAAGAAGATCTAATTCCAGTTCTGCACAA
+ACTATTTCACAAAATAGAAGTAGAAGGTACTCTACCCAACTCATTTTATGAAGCCACTATTACTCTGATA
+CCTAAACCACAGAAAGACCCAACAAAGATAGAGAACTTCAGACCAATTTCTCTTATGAATATCGATGCAA
+AAATCCTCAATAAAATTCTCGCTAACCGAATCCAAGAACACATTAAAGCAATCATCCATCCTGACCAAGT
+AGGTTTTATTCCAGGGATGCAGGGATGGTTTAATATACGAAAATCCATCAATGTAATCCATTATATAAAC
+AAACTCAAAGACAAAAACCACATGATCATCTCGTTAGATGCAGAAAAAGCATTTGACAAGATCCAACACC
+CATTCATGATAAAAGTTTTGGAAAGATCAGGAATTCAAGGTCCATACCTAAACATGATAAAAGCAATCTA
+CAGCAAACCAGTAGCCAACATCAAAGTAAATGGAGAGAAGCTGGAAGCAATCCCACTAAAATCAGGGACT
+AGACAAGGCTGCCCACTTTCTCCCTACCTTTTCAACATAGTACTTGAAGTATTAGCCAGAGCAATTCGAC
+AACAAAAGGAGATCAAGGGGATACAAATTGGAAAAGAGGAAGTCAAAATATCACTTTTTGCAGATGATAT
+GATAGTATATATAAGTGACCCTAAAAATTCTACCAGAGAACTCCTAAACCTGATAAACAGCTTCGGTGAA
+GTAGCTGGATATAAAATAAACTCAAACAAGTCAATGGCCTTTCTGTATACAAAAAATAAACAGGCTGAGA
+AAGAAATTAGGGAAACAACACCCTTCTCAATAGTCACAAATAATATAAAATATCTTGGCGTGACTCTAAC
+TAAGGAAGTGAAAGATCTGTATGATAAAAACTTCAAATCTCTGAAGAAAGAAATTAAGGAAGATCTCAGA
+AGATGGAAAGATCTCCCATGCTCATGGATTGGCAGGATCAACATTGTAAAAATGGCTATCTTGCCAAAAG
+CAATCTACAGATTCAATGCAATCCCCATCAAAATTCCAACTCAATTCTTCAACGAATTGGAAGGAGCAAT
+TTGCAAATTTGTCTGAAATAACAAAAAACCTAGGATAGCAAAAAGTCTTCTCAAGGATAAAAGAACTTCT
+GGCGGAATCACCATGCCAGACCTAAAGCTTTACTACAGAGCAATTGTGATAAAAACTGCATGGTACTGGT
+ATAGCGACAGACAAGTAGACCAATGGAATAGAATTGAAGACCCAGAAATGAACCCACACACCTATGGTCA
+CTTGATCTTCGACAAGGGAGCTAAAACCATCCAGTGGAAGAAAGACAGCATTTTCAACAATTGGTGCTGG
+CACAACTGGTTGTTATCGTGTAGAAGAATGCGAATCAATCCATACTTATCTCCTTGTACTAAGGTCAAAG
+CTAAGTGGATCAAGGAACTTCACATAAAACCAGAGACACTGAGACTTATAGAGGAGAAAGTGGGGAAAAG
+CCTTGAAGATATGGGCACAGGGGAAAAATTCCTGAACAGAACAGCAATGGCTTGTGCTGTAAGATCGAGA
+ATTGACAAATGGGACCTAATGAAACTCCAAAGTTTCTGCAAGGCAAAAGACACCGTCAATAAGACAAAAA
+GACCACCAACAGATTGGGAAAGGATCTTTACCTATCCTAAATCAGATAGGGGACTAATATCCAACATATA
+TAAAGAACTCAAGAAGGTGGACTTCAGAAAATCAAATAACCCCATTAAAAAATGGGGCTCAGAACTGAAC
+AAAGAATTCTCACCTGAAGAATACCGAATGGCAGAGAAACACCTGAAAAAATGTTCAACATCCTTAATCA
+TCAGGGAAATGCAAATCAAAACAACCCTGAGATTCCACCTCACACCAGTCAGAATGGCTAAGATCAAAAA
+TTCAGGTGACAGCAGATGCTGGCGTGGATGTGGAGAAAGAGGAACACTCCTCCATTGTTGGTGGGAGTGC
+AGGCTTGTACAACCACTCTGGAAATCAGTCTGGCGGTTCCTCAGAAAACTGGACATAGTACTACCGGAGG
+ATCCAGCAATACCTCTCCTGGGCATATATCCAGAAGATGCCCCAACTGGTAAGAAGGACACATGCTCCAC
+TATGTTCATAGCAGCCTTATTTATAATATCCAGAAGCTGGAAAGAACCTAGATGCCCCTCAACAGAGGAA
+TGGATACAGAAAATGTGGTACATCTACACAATGGAGTACTACTCAGCTATTAAAAAGAATGAATTTATGA
+AATTCCTAGCCAAATGGATGGACCTGGAGGGCATCATCCTGAGTGAGGTAACACATTCACAAAGAAACTC
+ACACAATATGTACTCACTGATAAGTGGATATTAGCCCCAAACCTAGGATACCCAAGATATAAGATATAAT
+TTGCTAAACACATGAAACTCAAGAAGAATGAAGACTGAAGTGTGGACACTATGCCCCTCCTTAGATTTGG
+GAACAAAACACCCATGGAAGGAGTTACAGAGACAAAGTTTGGAGCTGAGATGAAAGGATGGACCATGTAG
+AGACTGCCATATCCAGGGATCCACCCCATAATCAGCATCCAAACGCTGACACCATTGCATACACTAGCAA
+GATTTTATTGAAAGGACCCAGATGTAGCTGTCTCTTGTGAGACTATGCCGGGGCCTAGCAAACACAGAAG
+TGGATGCTCACAGTCAGCTAATGGATGGATCATAGGGCTCCCAATGGAGGAGCTAGAGAAAGTAGCCAAG
+GAGCTAAAGGGATCTGCAACCCTATAGGTGGAACAACATTATGAACTAACCAGTACCCCGGAGCTCTTGA
+CTCTAGCTGCATATATATCAAAAGATGGCCTAGTCAGACATCACTGGAAAGAGAGGCCCATTGGACTTGC
+AAACTTTATATGCCCCAGTACAGGGTAACACCAGGGCCAAAAAGGGGGAGTGGGTGGGCAGGGGAGTGGG
+GGTGGGTGGATATGGGGGACTTTTGGTATAGCATTGGAAATGTAAATGAGCTAAATACCTAATAAAAAAT
+GGAAAAAAAAATTAAAAAAAAAAAAAGACTTGAAGTTTAGAAGTTGCTTGAAGGACACACGTGGAAAAAG
+TTCTAAGATGTTGGCTAGAATTTTTTTTCTTTCAGAATAACTGTTTAACAGTTCAGAAAATAACAAGATT
+TGACTAGCAGGTGTGCATCAAATCAAAGGGCTTCTACACAATACAGAGGCTCCCCTGCAGAACTTGAGAA
+AATCTTTACCCGTGATTCCTTTGACAGAGGATTACTATACAAATATGTAAAGAACACAACATCTTAGCCA
+AATGTAATGACACTGTAATCTGAGCACTGGGTGCTGGAATCAGGAGAATCATTAGGACTTTGAGGCCAGC
+CTGGGCTGCTCTGTTGTGGTAAATCTCCAACCCAAAAAAGCCCAGGAAAAGAAATCACAACTCAATTAAT
+ATGAATACAAGCTGTGTGCCTAGATTGGGCAGATCTACCGCTACACTACCATCTTCCTCTGTTGGGAGCT
+ATTAAAACAACACAATTGTCCTCATCTCTGAATTGGGCCTCTCCCCCGAGAAGAAAAGGGGGTCAAAAGC
+GGGCCACCAACGCACTGCTCCGAGAACAACCACAGATTGTTCCAGCCCTAAGTCAGCACCAGATGTCCTA
+ACCACAAGATGTACCCTGATACCACCAAGTTCCTGCTTCCCCATTTAGTAGCCAAAAGGAAAATTTCTGC
+TGCCCCATTCCTCCTGCTGAGAAGTACTTCCTTCCCCCCCGCCCCCCTGCAGCCTCATCCCTTCTGCTGA
+GAAGTACTTCCTCCTTTGCTTGTGCATTTCCGTTGCCCTATTCCTCCTGCTGAGAAGTACTTCCTCTTTT
+GCTTGTGCATTTAAGCCTTGAGCCTTGCTGAATACAGTGAGACCTTGATGCTAATCAGACTGCTTCCATG
+TGTCATTCATTGCACTTGGTTCTCATCTCTCCCTCCCCCCATTTGGTTCTTAGGAGAAGGTCCCCTCGAG
+ACCCAATAACTGGACCTGCTGGACGGGTCATTCCCCTCTATTAGATCCCTCATAACTTGCAGTTTCTCCA
+GGCCATGTATTTCTGCTCCATTGTCCTTCCAGCTCCTCCTCCATCATCATCTCTCCCCTCCTCTCTCCCT
+TCTCCCCCCAAACTTTTCAGCTTCACCTTCCCTTCCACTGCCCAATCATGGGCTCTAGCCTTTTTACAAG
+TTAAGGTGGGGAGAAGCTTCACAAGAAGTCACCTGTAGGGCTGGTGAGGTGGCTCAGTGGGTTAGAGCAC
+CCGACTGCTCTTCTGAAGGTCCAGAGTTCAAATCCCAGCAACCACATGGTGGCTCACAACCATCCGTAAC
+GAGATCTGACTCCCTCTTCTGGAGTGTCTGAAGACAGCTACAATGTACTTACATATAATAAATAAATAAA
+TCTTTAAAAAAAAAAAAAAAAAAAAGATGTGTTTCAAAAAAAAAAAAAAGAAACTCTTTAAAAAAAAAAA
+AAAAAAAAAAAGAAGTCACCTGTATAAGTGATTCACTCCTCATCTGCCACCCCTCCCAGGAAAGCAGAAT
+TAGCACCAAAATACAAGACCAGGGCTATTCACAACATTTCTCAAAGATATAGGAGGGGAAGGGGAAGGCG
+GGGAGGAGGTGTTGGTAGCTCAGTGGAAATGCTGAGTACGCAAAGCTAAGGAGTGCAAGAGAGATGAAAT
+AGAGCCCAACAAGTCCTAAGCCGCACTTAGGAGGAGGTTCTGATGTGCCTTTGCACATACCCAGTATGAC
+ACCATAGGATTTCTAGCCAAACAACTGGAAGAAAGGACTGCCTTCCTGTGATTCCACCAGAGGAAAGTGA
+CAAGTGTTTAAGGGATGGATGTTTAGCTTGATTTCAACAGAATTTCCATGTGTATCCAAAAGTAATTCAC
+ATGGTACAGCCCAATAATGTGAGTTTTTACATTTTTCTGTATGTTAGAAACAAATTTATAAATCATTTTT
+AAAGGGTGGCTGGAAAGATGACTCAATATTAAGAAGACTTGCTACTCTTACAGAGGACCTGAGTTCATAT
+TGTTATGATCTTGAATGTCGAGCAGCTTCCAAAGGACCACACCAGGCCAAACAGTTCCACGTGAGGTTTA
+TTGGGAGAGAGAGGCAAGGTGGCAGCAGAAAGAGAAGGGGAAGTAGTGCCACGGACAGGGCTCAGTGGGC
+CCTTCTCAATCTGTGAGAATCAGAGTCTTGGACAGATGGGCATAGAGTCGGCGGAAATGGGGGTGACAAA
+CAGACACGACCCGAGAGAGTGTGTTGAATCTGAGTGTAACGTCCAAACAAAAAACAGACTTTTTTTATAC
+AGAAGAAAAAACAAGAAAAGCCAGGCAGGACACATCCTCCCTAGTTACAGTGACACAAAACAAAAGGAAT
+GTATACATCAAAAGATGGCGGGAGACCAGGCCACAGTTTATAGCTTAGAATGGAGCCAGGTGTAATGCTA
+GTCTATTGTTAAGCCCACCACCAGGGATTCTTAGTAAATACCTGATTATGTTGTTCCTTTGGGCCTAGAG
+AAGAAACCTGTCCCAGGGGGGACTCCCTAACTCTTTCATGGTAAAACCACCTATTTGCTAGGCCATTGTA
+TATTTCCTTGTTTGGGTGAGACTCGGCTATTGTCCTAAATAATCACTTTGCAGACTAGCCCTGAGCTATT
+TTCTAGCTCCATTCTTTGTAATGCCTAATTAGTTTCACTGTCTCTACTAGAAGTAAATTTGAATGTTACT
+GAATAGGTAACCTTCTCACTGAATTCCTACTGAATTCCAAGCTCATCTGCTTCAAGGATTTTCTAGGACA
+TTGGAACACTGATGGAGGCTCACCTATGTTAAAATTCAATCTTTAAAGGCACTTATAATAAAACAATACT
+GAAAGAGAGCATACACCATACTGACTCGGGGACAGGGTTTTAATATATGGGCTATGAGAATGCCAAGGTT
+CCAGGAGGCTGAGTTTCCTTGAAACTCTTTGCCTTGTGACTGCCTCCAGGCCTTTCGACCCCAACAGGCA
+GACTTCACTGGAGTGGGCATAGCAGGTGGGGAGAGGGAGGGAGAGGGAGGGAGAGAGAAAGAGAGAGAGA
+GAGAGAGAGATGGGGGGGGCTTTCTTTATATATGGGTGATGATGTAATCACAGGTAAAGGTGGGAGACGG
+GCCAAGTGGATTCTGGGAATATGGTGGCTGTTGCTTTGGCAACAGGTCTGCAGGTCCCGCCTATGTGATG
+TCACGGGTTTCAGAAGTCCTGATACCAACACATATACCAGTACCCAGCCTGTAACTTGAGCTCCAAGAGA
+CCAGATACCCACTCTGGTCTTTGTGTACACCTTCATTTATGTGGACATACTCACAACAGACACATAGACA
+CACAGACACAGACCTCTCTCTCTCTCTCTCTCTCACACACACACACACACACACAATTAAAAAATAAAAA
+TGAATCTTTAAGAAAATTTAATAGTTTAAAAGTTTACAAGATTCTTAGACAGTATAAATGGAGGGAAATT
+CCTACAAAACTTGAAGGGATACAAGTTCTGAGATGTTAAAAGCAGTTCTAAGAGGGAAATTTATAGCTCT
+GGTGTGGGAACTAAAGGGGAGCAGGGGAAGGAGGGGGGAAGGGAGGATAGCCCACATCTGGCCAGAGTTC
+CTCCTATGCTCTGGGCAGGAGGATGCAGGAGAGCTGCCAGACACTTTCCACTAGGTCCCAGGTGGGCATC
+TAAGCCACTGACCCCACTCGACGGGGGGGGGGGAGCCCCCAAAGCCAGGGGGCTCTGGGGTGACACCCTG
+TAGCCCCAGGTTTATGGGAGAGAGGGCTGAGGGAGAGAGGTTCCCACACAGGCGAGAGTCCTTAGTCTGG
+GCCTTGACTGGAACACAGGAAGGCCTTCCATTGGGAGATTAGAAACAGCTCATTAGGAGAAAGCCTATCC
+CATCTTCCGAGTGCAGCAGGCATTGATGTACAGAGACAGTCTATGGTTTTAGAGCTTTATCATAGAAAGG
+CAGGGAGAAAGGAGAGAAGGTAAGGGGGGGGGCATGGCCAAGAGGAGACAAAGAGGAAAGGGAGAGAGAG
+AGAGAGAGAGAAGAAAGTCTAGAGAGAAAGAGAGAGTAAGGGGGGGGGGGAGAGAGAGGAAGAGAGTGAC
+AGATAGAGGAGAGTAAGGGAAGTAAGAGCAAGAGATTGAGAGATCGAGGAGGGGCCAAACAGCCCCTCTT
+AAAGTATACTGCTATCTTTTCTGTTGCTAGGTAACTTGGGGAGGAGTTTAGCCTGAAGGTCAGAAGTTTG
+GAAGATTGTCTACATGCTTCTCTTGGGGGGGCTGAGGGGGGGTAACTTCAACAGGATCCAGGGTTCCAGG
+GGACACGAGAGAACTTCTTCTGTCCTATGTAGGTGAATTATCACCACCAGGTCCCGGGGTTCCACATCTC
+AGCTCTACTGGAGACCAGACTGTCTGTGTATAGCCCAAATACCCAACACTCTGCAGGCCTACATTTAAAA
+ATCAGAAAGAAGACAAATAAATTACTTAATGGCACAACCCAAGGCTTGGAAAAGCAAAACAAATTCAACC
+CTAAACCCTGTAGATAGGGGATAAATAATAAAAACCAGAGCAGAAACCAATGAAACCGAAACAAAGAAAA
+CAATACAAAGACTCCACCAACCTAAGAGCTGGGTCTTTGAAAACATAGGCAAGATTGATAGACTCTTAGC
+CCTGTAAACAAAAAGAAAGAGGAACCCCCAAATTAACAAAGATAAACAGAAGATGTTGCAACAAACACCA
+AAGAATTTTAGAACATTATAAGATATTTTACAACCTATACTCCACTGAGTTAGAAAACCTAAAAGAAATG
+CACAAATGTCTAGTTTGATCCAAAGCACCCAAGTTAAACCAAGAAGAGAATACCTTAAACAGATCTGTAA
+CAGATGAGTCTCAAAGAAACCCAAGACAAGTCTCACTCAGTACCCAGTACTTTACATCCCAGCCCCTAAC
+CTAAACCTCTCCAGCCCAAGGACTGGGGTCCCCTTCCCCCTTCTTCTGATTCCCGTATAACCCAGCCATT
+TTGGGCGTGTATTCTCTTGGGCACCCACTGCTCTCTTGGCTCCTGGTTTCTCTCTTGGCCTTGGGAGTCA
+CTAAAAAAAATCATAGCCTTGGGAATAAAACTTTTCAGTGCCCTGGGCAGCATAATGGGCTCCTCAGAGG
+GAGCCACTGACAATGGGTCTTCAGTCAGTGATGACCCTGAACTAGGGAGGATGGCTCCAGAGCTGAGACA
+GTCTCCTGGACACCCGGCCAGAAGAGATAACAAGGCCCAGAACAGGGCTGCTCAGTGAAACTCTTCAAGG
+TCTCACCAAATATAGTGAAATTAACATAATAGCCAATCAAAGCTGTACTAATGTCATTGCTTTGCCCATA
+CCAATCCTATTAGAAGTTACCTAATCCTTCTGGAAATTCCCCTGACCCTGCATAAACAGGGGCCTGCAAG
+GCCCTAGAGTTGTCAGCATTTTGATAAATGATTGACCCTGCATGCTGGTAATTTCTGCAGAGTTAATGTT
+CTTTGCATACTATTTGAGTCTTGGGTCTTCCTTCAGTGATCATTGGACCCTCTTGGCCTGTGGCTCCTCT
+CTTGGGCGCCTTGCCTTCTCTCCTCTTTTCCCTCCCTCTCTCATTCTCCCTCCCCACTTCTCTCATGGCC
+CAGCTTAGTCTGGAACCTTCCAGAGGCCTCTGGTTATTCTCTTCCTTTTCATCTACAATAAAATCCTTCT
+CCTCAATCATACCTAGGAGAGGTTATGTTCTCATTCTCTCTCTTTCTTTCTTTAACTAGAAACTGGCATT
+TACTATATGTGGAGGAAGGAGCCCAGGGACCCATACGTAATACAGGCACATGGCGTCCTAAGCTCGGCCA
+CTATGTGACTATCCTTGTAGCCACACTTCTGATACGGGGTGGTCTTTGGTCCTCATCATCATAGGTTCCT
+GGGACATGTCTGCAATGATCTGGTTCAGCCTGTCTACTTGTGAGTGATTCTGTTGATGTAGATGCAGCTG
+TTTTCTGCTTCCTGCTGGTAGTTTACAATTCTGGCACGCTTATCGCAGAATGTGGTTCCTTGTCCTCCTT
+GGGGTACAACATGTTATTACTGTGGAGAATTGGAGGTTGGAGGTCAGCCTGGAAACCAATCCTCACTGCC
+TTTCCATGCCAGCTGCCTGGACTGCACCATTCATTCCTGGAAGAATAGAATACCCACAAAGCAGGGCTCA
+TAGGTCTGGTCTGATTCCCCAGCAATGGGGCTGCCCATGTTCTCATTTTCATTCAACAAAGAGAGAAACA
+GTGATAAACAGCCTTCTGATGTAAAATTAATAATAATAATAATACTTTTCATAGGCTCAAGGCCACCTGA
+TGGACTCACAGAAGAATTCTTCCAGACCTCCAAAGAAGAATGATAACCAAGACTCCTTAGATAATCCAAA
+TAAAATAGAAATAAAAGGAGCAGTTTTAAATTCTTACAAAGTCAGTATTAGTACCAAGCCAGATAAAAAC
+ACAACAACAAAGAAAGCTACAGGCCAATATCCCTGATAAACATAGATGCAAAATTTCTTAGTAAGATGCT
+TATGAAAGATCCTGACAGAATGTAAAAGGTCACAGAAGCCCTGAAATTGGCAAGATAGATGTCACTGTTA
+GCAGAACTAGCTTCACTGATTTAGAAAAATAGAGGTGCACAATGCTCTGGTCACTCCTCGAACCTGTGTG
+TCTGCCAATGTTCTGACCAGGTGTGTGCCCATTGCTGCACCTTCATTAGACTCTTTCCTTGTACCCCTCC
+CATACCCATTTCTTGAGAATAGACATTGTTTAGATCTGGAAATCCCCTACTCTCCCCCTTCTCCTTTCCC
+CCCTGAGGGCCTATAAAAACTGGGACCTCTTTCCCCTCGAGATTGACTCCTCTACCCCTGCGTGGGATAT
+GAGTCATCCCCAGAGCTCTGGCTTTCCCCGAATAAAGCCTCATGTGGTTTGCAACAAGCTCGGTCTGTCG
+TGAGTTCTTGGGTGTCCACTATTGTCCTGAGGCCTGAGCGAGGGGCTCCTCTCGGAGTCTTTCACTTAAA
+AACTGAATATAGTTATATATCAAAAAGATCATCCATTACAATCAAGTTGGCTTTCTCCTGGAGCTCAGGA
+TATATAAAAATCAATATATATAAATCAATAAATATAGCAAATAATATAAATGAACTAAAGACAAAAATCA
+CATGATCATCTCAATACATGCAACATACCATCATGATAAAATTTCTAAGTGCACAGGACTGGAGGGAACA
+TACCTCAACATAAGGGCTATATATGGCAACACCACAGCTAGCCTCATATTAAATACAGAAAATATCAGAG
+CAATTCCATTAAAATCAGGAACACACGGGGTGATCCACAAGTCACATAATGTGTGACTCAAACCAGAGAA
+GGAAATTAAAGGGATCCAAGCAAAAAAAAAAGAAGTAAAATTATTCACATTTGCTGATAATATTTATATT
+ATATATAAATGATCACAAAAATTTCAGCCAGAAAACTTCTAGAAATGATCAATACTTCAAGCAAAGTGGC
+AAGACGCAAAATCGACTTACAAAGTTTAGTAGCCTTTCTATATATTAATAGCTAGCACGCTGAGAATGAT
+CATGGAAAGAACCCCACAGACTCAAAAAAATAATAAAATATATAGGAATAAAATTAACCAAGGAGATGAA
+AGAGCTCTATAATGAGCCGGGCGTGGTGACGCTTAATACCTCACCCCTGCAGAAATCCAGGTAGTATGCT
+GACTGCGATGAGCAAGCTTAGCACAGGCTAGCTTTCCTGCCACTTAACGATATCTAAGCACACTCAACCC
+TAGCTCCCTGCCTCTGTTTCTTTTTCCCCTGAAAAAAAAAAAAATTCTAAAAATGATTCTAACAATCTCA
+AATTCCGCCGAGCGGTGGTGGCACATGCCTTTAATCCCAGCACTCGGGATTATATATATATATATTATAT
+ATATTTATTCCTATATATTTTATTATTTTTTTTGAGTCTGTGGGGTTCTTTCCATGATCATTCTCAGCGT
+GCTAGCTATTAATATATAGAAAGGCAGGGGAGTCTCTGAGTTCGAGGCCAGCCTGGTCTACAAAGTGAGT
+TCCAGGACAGCCAGGGCTACACAGAGAAACCCTGTCTTGGAAAAAAAAAAAAAAAAAAGCTCTATAATAA
+AAACTTTAAATCACTGAAGAAAAAGATTGAGGACGACACTAGAAGAGAGGTCTCCCACGCTCACAGATTG
+CTAGAATTAATACTGTGAAAATGACCATTTTACCAAAGGCAAATTACAGATGTCATTCTTCAGAGGAAAA
+AAGAAAACCCCTCAAATTCCTCTGGAGGCACAAAAGTCACCAGGTAGTCAAAGGATTCTTCATCAGAAAG
+AACAAAGCTAGAAGTACAGTACAGATGTCAAGATGTAGTACAGAGAGCTATGGTGGGAAAAACAGGATAG
+TACTGGTACAAAAGCAATAGGCCAATGGAATCAGACAGAAGACTCAAACACGAGTGTGTTTAACATTTGG
+CAAACATGCCAAAAATGCATACCAGAACAGAGACAGCAACAGTGCTCGGAAACCGGACGTCCTCAGGAAG
+AGGAGTGACATTAGATCCATATCGATCACTGTGCTCAAAAATAAACTCAGAGTGTGTCAAAGCTCTCACT
+GAGTGGGACCTAAATCACTGAGGCTGCTGGAGGAAAATGTAGACCGTTCCTTACAAGGTGCACGTGGAAG
+GACGGGCCCTTGGAACAGGATTTGCTTCTCTCAAAAATTAAGGCCAACCAGTGAAAAAATGGGACTCAGT
+AGACCAAAGAGAAGGCCTGCAGACTGGGAGGAAACTCTGTAACCTAAACATCTGATAGAGGGTTAATATA
+CATAATAGACAAAGATCTCAAAATTCAAAGAAACGAGGAGTGGGAAAATGGAGAACTCTTCCTCAGGAAG
+GTAGAGGGAGATAAAGACAGAAGGAGGGAGGGATAAAATAAGAGTAAGGATGTAAGATTACTAATAAATA
+CTATTAATAGTATAATAGGCATACTATTAATTTTCTACCTAAAAATACCTTTAGCATATGTAAATTGGTC
+CATAATATAAATTTATACTTCAAATAAAATTTTGCAACTAGTCTTGTGATGCTTCCCCCAAGAGCTATAG
+GCAAACAATACCCCAGCACCAGTCATGAGCAACCCTCTTTTGATCTGTTAATCAGGGTTGTCCAAGGATT
+TCTCAAAACAAAATGTTATAGCTATTGATATAGCCCTTGGTTCCCACCTAGTGGTGGAAGGTAAGTCTCT
+ATTTCAGAGGCCCCCAAGTTGGAACAGACCTGAAAGCTGCCTCCCTGAGAACTACATATCATGGTACCAG
+AAGGGACTAAGCAAGCTCCCAAAGGAAGGAGGCAACCAGCAGTCTTAAACAACTCTGACAACTATGAACC
+ACGACAATGACTGGCATGGCATGATATCCCTGAGGGCACGCATACCTTGGCGGTAACCAATGGCTTTCTT
+ATCGGATTTAAGGCCTGCTAACCAAGACAGAAATTAGGCCTGGTACTGCAAACCTAGCCAACTGTCCAGG
+GCTAGTGGATTTATGGACCTTGGAGGAGAACTTAAAACTATCCCTTCATTAAACCGGCATAATAACCCCT
+ACTATATGCTTGTCCTTATACTCACAGGTAAGTGAAGTCCTCACCTCTCATCAAGGAAACTTCTTAAGAA
+AAGTAACAGAGACTCTTTCAGAAAACCACAACCAATCAAGTCAAGTACTAACTGATATGTCTACAACATA
+ATTCCTACACCCAAGGCTCGGAGATCACTGTAGAAGAGGGTGTGCAAAGACTTTAAGAGCTAGAAAAACA
+GGGAGTTTGCTGTGAGACCGTGTCTTTAGGAATGTCAGAAGCTATACCCAGGAAGCCTCACCAGCATGGT
+GGCCTAAACATGTGCGGAACAAGGATGTCACCAAAAGACAGGCCAAAGTGGGCAGTGGAAAAACTCATGA
+GGTCTCAAACCTAGACAAATAACTACAGGCAACGAAGGGATGCTGAGAGTGGGAAAACAGCCTAATCCCA
+GGAGGAAATAACTATCTAATACCAAATGATCAGCCCGAAAACATGTACATATGTAACATTATATGAACTG
+AGTGGGTTATATTTATATATTAGTAAATAAAGACACACACCAACAATTAAAGAAAAGGAGGCCATGGTGG
+TACATGCCTTAAATCCCAGAACTCAGGAGGCAGACTTAGGTTATATAGGTTATTCTCTGTGAGCTCAAGG
+CCAGCCTGGTCTACAGAGTGAGTTCCAGGACAGCCAGGGCTACACAGAGAACCCCTGCCTTCAAAAACCA
+ACAGAATAAAAAAGAAAAAGAGGCCATGAATTTAAAAGATAGCAAGATGGGGCTTCGTGAGAGAGTTTGG
+AGGGAGGAACACACACCTTTAGTGCTAAGTGAAATAAAATTTCTTCAAACAAGAGCCATAACAGCTGCAT
+GCGTTGCATAATTTTATTCTTAATTTTTCCTTTACACTTTTTTTTTTAAAGTGAGTCTCCGTCCAGTAAT
+CTCCGACCTGTAAGAGTAGGTAATTTGCAGCTTACAGCATCCTTGACAATGGTCAGAGACTTTCTGAACG
+GTCACCACTGGGAAGCACTGCAGCCTGTAGTGGGTGAGGGATGGGAAGCTGATGAATGTCCCACTATGCT
+GCCTGTAGATCTCCCCTTCTCCTTCTGTTCCTAGGAGCTCTGATCCAACTCCTCAGTCTCGTTCCCAGCA
+CTATAGAACACCATCTGGGGCAACCTCTCCTTATTGTCTCCCCCATCCCAAAAAGACAAGATAAGTAAAT
+CCTGGCTGAACACTCTGCTCCTTCCTGAAGCTTGTTCAGTCCTTAGCCCATCCCCATCCCTAAGTGTCAG
+GTCCCAATCCCCAGAAATACATTTTGTCAGGAATACCTAGTAGACACAACTATCAGGTACAAAGAGACTG
+TTTTGCCAAGCAACATTTGTCCATCATACTCTCCAGAGCTCCAGTAACAAAACATAAACCAAGGAACAAA
+ACATCCACACAATAAAGACAAATCCAGAAATCACCATCTAGACCTATAATCATCCCAACCTCAGAGGCCT
+GGATGCCAGCATAAAAATACAAATAACATCCAAGGCAGTATGTCTCCACTACATTCCAGCTAGCTTACCA
+CAGCAGGCCCCTCGCTAACTGCAGCACTTAGGAGAGTGAGCCTTGCGCCTCACCTACACAACTCAGGCTG
+GCCCTGAAAGCAGAGTACCAGTTAGCCCTCGAACTTGTGAGCCAGAGAGAGCTGGCCCTGCTTCTTGCTT
+GCTGTGGCATTGGGTGAGCTAGCTGGGGCAGTACTGAAGAGCTCCTCCTGGTAGAGTGGGTATGGGAGAG
+CTAGTGGGCTGGCCAACTTAACTACCACCCAGGCCCAGACAGAGTGATTTGAGTTGGTTCACCCTAACAT
+CTACCCCATCTATGAACTGCTGGAGTGCATGAAAGGGCGGCCCTGCAGATCCAAAGCTGCAGGGTCTCCA
+AGACACAGGGGAACAACAGGGTATCTGAGAGGCATCCTGGTGAGGATCCAGTATTGTAGCAAAAGCCAGA
+CGCCTCAAACCAAAACAATGATTCATTGCAATGTTCATTTGCAAGTAAAGATGTGGTGACAAAGATGGGA
+CACATTGTCATAAGCTACAGCTACCACAGTGAGATGCTTTGTTCTGTTTTCTTTTTCTTTTTTCTTTTCT
+TGTATGAAGGGAGGTTGCGAGAGCAGAGGGAGGATATGAGGATGAGTGGGGTTGCAGTGTATGATATGAA
+GTTCACAAAGAAATAATTAAAAGGTTTTAAAATTATTATATATATAAGAAATGGGACCTCATGAAATTGA
+AAAGCTCCTGTAAGGGAAAGGGCACCGTCATTCAGACGGAGCATCAGTCTATAGAATGGAAAAAGATTTT
+CACCAACTCCACATCCAATAGAGAGCTAATATCTGAAAATATACAAAGAACTCCAAAAACTAGATATCAA
+AAACCTAAATAATCACATTTAAAAATAGGGCATAGAGCTACTCAAACAAAGATTTCTCAATAGAGGAAAC
+TCAAATGGCTGGGAAACTCTTAAAGAACTACTCAACATCCTTAGCTATCAGGGAACTCCAAGTCACAACT
+GCTTTGAGATTCTATCTTACACCTTTCAGAATGGCTAAGATCAATAGCACAAGTGACAGCTCATGCTGGC
+AAAGATGTACAATAAGAGAAACACTTAGCCATTGCTGATGGGAGTGCAAACTTGTACAGCCACTATGGAA
+ATCTATATGGTAGATCCTTAGAAAGCTGGAAATCAATCTACCTCAAGATCCAGATATACTACTCCTGGCA
+TATACCCAGAGAAAACTTTGTCCTACCACAAGGATACTTGCTCAGCTATGTTCATTGCAGCTTTATTTAC
+AATAGCCAGAAACTAGAAACAACCTAAATGTTCCTCAATACAAGAATGGACAAAGAAAAGGTGGTACGTT
+TACACAATAGACTATTACTCAGCAGTTTAAAAAGTGACATCATGAAATTTACAAGTAAGTGGGTGGAACT
+AGAATGATAAAAACATTCTGGGTGAGTTAACCCAATCCTAGAAAGGTAAGTATATGTAGTCACTTATGAG
+TGGATAGTAACCATTAAATAAATGAGAACTAACTTACAATCTGTAGACCCAGAGAAGTTAGGTGTAAGGG
+AAGAAACTAAGGGGGATACATGGGGTCTCCCTTGGAGAGGGAAATAGAATAGATTGTATGGGTGGACTGG
+AGAAAGGTGGGGACAGAAACAAAAGGATCAGGTGGGGAGGGGGAGGGAGATAAGATTGAGGAAATGAATA
+AGAGGAAAGACAGCAAGAATTAAGGAGGATTTGAGAGTTGATAGGGAAACCCAATGTAATAGAAACTTCC
+TTAAACATATGAAGGCAATTCAAATGAGGTCTCCTAATAGTGGAGGTAGAGAGTCCCAGTTAGCCATCTC
+TTGTCACTAAAAGAGCTCCAATACCAGTACTGGGTTGCATTCAATTGAGTTGTTGGCCAAGGGAGAACCA
+TGAAAATCCCCAAGCAAGCCAGGCTGTTGCTAAGACAATGAGTTGCTCTCTGCAAACCAACAACAGGGCC
+CCATTGCTGAGAACAATACCCACACAACTCATTGAACATGAAGAAGCAGAGGTGGTGTCTATATAGAGAC
+TTTACCCCTACATTCCAGTGTCTTTGGTGTGGGAAGATACTCTGCAGGCTACCAAAGGAGAAATGTAGAC
+ACTAATCCTTCCACAAAACCTTTGACCTACAATCTGTCCTTTTCTACAAGATATGCTAGAACAACAGTGG
+CACAGAACTGGTGGGAGTAGCCAGTCCATGCCTGATCTGACTTAAGGCCCACTCCATAAGACTGAACCCA
+TGCTCAACACTGCTTGGGTGACCAAGAACCAGAGACCAGATAGCCCAGAGACCTAGGGTAAAACCAAACG
+CTATTGGTCATATGTATATATGTGTATGTACACACATACACACACTATATATATGTATATATATATATAT
+ATAGTGATGGTTTGTATATCCTTGGACCAGGGAGTGGCACCATCTGAAGGTGTGGCCTTGTTGGAATAGA
+TGTGACCTGGTTGGAATGGGTGTGTCACTGTGGGTATGGGTATAAGATCCTCACCCTAGTTGCCTGGAAG
+TCAGTCTTCCACTAGCAGCCTTTGGATGAAGACATAGAACTCTCAGCTCCTCCTGCGCCATGCCTGCCTG
+GATACTGCCATGCTCCCACCTTGATGATAATGGACTGAACCTCTGAACCTGTAAGCCAGCCCCAATTAAA
+TATTGTTTTTTATAAGACTTGCCTTGGTCATGGTGTCTGTTCACAGCAGTAAAACCCTAACTAATACATA
+TATATACACACACACTATATGATATTTTGCTATACCCTAGATTGGTTCCTTATTCAGTCATTATCAGAGA
+TGCTTCCTCTTGCAGCAGAGGGGAAAAGATGCAGAGACCTACAGGTAGACATTACACAGAGAGAGACTCC
+TTGGAACACACAGCTCTATATGGGATATCTCCATCAAATCTCTCTCCTCAGAGCTCAGGGAAACATACAA
+AAAAAAAAAAAAAAAAAAAAAAGAGGCAGAAAAAGTATAAGGGCCAGAAAAAGTGAAGGTCACCAAGAGA
+ACAAAGCCATCTGAATGGACTAAGCAAGGCTCATATGAACTCACAGAGACTGACACTGCAAGCAGGGGAC
+CGACATGGTGTGCACTACATCCTCTGCATACCTGTTATAGCTTTCAGTTTAGTATTTCATGGGACTCCGA
+GTATGTGAGTGAGTCTCTGATTCTTGGGCCTGCTCTTGCACTTGTTTCTTTCTGTTGGGTTTGCCTTGTC
+CAGTCTCAATGTGATGGGGTTTGCTTCATTTTATATTTTATTTTATCATTTTGGGTTGTGTGCTTCTTGG
+AAGCCTATTCTCTTCTAATGAGAGACAGAAAGAGGGTGGATCCAGAGGGGAGGGGAAGGGGGAAAGAACT
+GGGAGGGGTAGAGGGAGAGGAAAGAGAGGAAACTGTAATCAGGATATATTTTATTTTATTTTATTTTATT
+TTATTTTATTTATTTATTTATTTATTTATTTATTTATTTATTTTGGTTTTTCGAGACAGGGTTTCTCTGT
+ATAGCCCTGGCTGTCCTGGAACTCACTTTGTAGACCAGGCTGGCCTCGAACTCAGAAATCCGCCTGCCTC
+TGCCTCCCGAGTGCTGGGATTAAAGGCGTGCGCCACCACGCCCGGCAGGATATATTTTATGAGAAAAGAA
+TCTGTTTACAATAAATTTTTAAAATGACAAAAACAGATTATTAACTGGGTTTGCCTGTCCAAGATTAAAT
+CTTATTTTCCAAAAGCACTGGAAGGAAAAGCAGGAGAGCCCTCTTGAAACGGTAAACCAACTGAGGACTT
+ACACTTGCTCTTCAGAAGGGACCCCAAAGGTCACCTGTGAACTGGAGATCCAGAGCTCTTACTATTATAA
+ACAGCATTCTCGTGAAAGATGTTTGGCAGCATTGTGACCATTTTCACCTGCGCCTATCTGGTAAAGCCTT
+TTTGACAACAGCCTGACTCCTGAATGACTAACCAGATGTGTCCCTTAAAAAATCCAACAAATCACTCCTG
+GGATATCATACTCTACACAGCCAACGCGGCTGTGAGACTTTCCAGTCCTTCTTATGGAGAGGCTCTGGGG
+CCCTTTTCTAACTGTTCTAAATTTGTGTGTGTGTTAAGTCCTTGATCATTTACAACTAATGTCTTCCAGA
+GTCCAGGCCCTAAGACTTCAAACAAAGCTTCACCACCATCACCACATCCTTCCACCAAGGGCCGTCGGAT
+CCCCTGAAGCAAGGCCATCTTTTGCTAGCACCGTTTGCTAGTAAGACACTCAGGAGGGGGTCAAGCCGAA
+CCCTTTTACCAGTGTCCCTGTCTGGATCTAAAGCAGTGAGGAACAGCCATCACCTGATACTCCCCAGAAC
+CGAGTCCCAAAATCCTAATGAGAAAAACATTGAAAAAAAATCCCAAACGTTTACATTAAATATGTTAAAA
+TGTTAGCGTTAAAAGGAGATAAATCATCTTACCGCTGGGAATAAGAGAGACAAAGTTAACCAGATTTTTA
+AAAAATACTCAAAAGACAAAAACAAACAAAGAACCCATACACAACAGAATCTTTACAGTTAACCTTTTGC
+TTTATTACATTTGACCCAAACTATAGTAATTCAATACATAAAAAGATCCTACCACACATGGCAATTTTCC
+AGGCTGGATTTATTGTACGTTGTGTTGATAAAGTGTTGCGTGCTGTTTGTGCTTTCTGAGCAGCTGGATC
+TAGATAAGGGCTGCATCCTGCCCCCCACACCCATTTCAGAACCAAGCAACTTTGTCTCAAAGTTTGCTCC
+TGCCAGGCCCTTTGTCTGGGTTTGTTATGGCACCCTCCCCAGCTCTGTGACAGAGAGTCTCAGAGACTGG
+GAGTCAAAATCATCCACCTGGAGAACAGTAAAACCAGACACAGACGGTTGTCAAACCCATGTCATAAAAG
+TGCCTACAGTGGCTGGGTGGCAGTGGCACATGCCTGTAATCCAAGCACTCACTTGTGGTGCAAGCCTTTA
+ATCTGGCGCATACAGAAGCAGGCGGATCTCTGTGAGTTCGGGGCCAGCCTCGTCTACAAAGCAAGTTCCA
+GGACAGCCAGGGCTACACAGAGAAACTGTCTCAAAAAAGGAGAAGGGGGGGGGGGGGAGAAGAAAGGAAG
+GAAGGGAGGAGGGAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAACCAG
+AAATTCATTGGTGGAAAGCTAAGCCATGGAGTGAAACCTCAATCCAACCCTACTTAGTTCCCCAGCCATT
+ATTGAACACCTAATTTGATGATGCTTTCCTTACTATTTTTGTCTGTCTATCTGTCTGTCTGTCTGTCTGT
+CTGTCTGTCTATCTACCCATCTATCTATCAGTCATCTCTTACTATTAATTCTTTTATTTTGCCATATACT
+TAAGAAAGCTTTCAAATTTAAAACCTAAAGTAGGGCTTCATAAATCATTCTCTGAGTGATACAGAACACA
+AGAGCAGGCAACTAAACCCACTCACACATCATAATTCCTCCAGAGCTGCAGGGTAGGGCCACGCAGGATC
+CACTGAGCTTTCCAAAGAAGTACACCAGGATGAGAAGAATGTGTGGTCATCTTCCACAAGCCATTGCCAG
+TGGCTTCACTAGGACCCTGCCTTGGGAACGCCAAGAGTGACTGCATAAGTGCCTGGTGATTGTCATGGGG
+CCAAGGCATCAGTGACAGAGGTAACCATAGCAGCAGTTCCTGGCAGCTGCTCATACACTCTTTCAAGCTT
+CCTCATGACCATGAGCTGGATAAAATGTGCAAGCAGACAGAGACCCATAACACACTGAGATTGTTAGGAC
+AAAGAAAGATATCTTCCCTACAAAAGATTTTTTTTCCTGTATATCTCCTTGCAGCTGTATTTTAAGGCAA
+TTCTTCTAGCTAGTCCAAGTAAGAAAAAAAGGCCTTAAATTAATGCACTTACTGTAGTATTATGCATTGA
+GTGTCTGAACGAACTTACTCATTGAACAGTTTTTGCTGTGTGTAGTGGTGAGGGTTCTCCAAGAAACAGA
+ATCAATAGAAAATAGAGTCTATATTATAGAGACAATTAATTTGGGGGTTATGTGGGGATGCATGTGCTGT
+TTGTCATGTGATTTGTTGCCCCATGGCCTGTCCATGGAAAGAGGAAAAGCCAGTACGCCATTGATATATT
+CAGTCCAGACCTGACTATATAGGAGACACTGGGACAGTCCCAGATCATGATAGCAGATGGCCAGGCCAGG
+AGATGATAGACATCTGGGAATTATAGAAGGAATTCACTCATTATCTGCCACATTTTTATTTTATTTTGTG
+TGTATGAGTGATTTTTATTTTTTTATCTTCTGCATATAGATATTTTCCCTGCATATATATCTATGCACCA
+CATGCATGCAGTACACGCAGAGGCCAGAGGAGAGAGGAAGGGCCCCTAACTTGAGACAGAAAGCACTGTT
+AGCAACCATGTGGGTACTGGAACTAAACGCGGGTCTTTAGAAGAAAAGCCAGTGCTCCGAACCTCTGAGC
+CACTCTTCAGCCCCCAAACCAGAGATAATTAAAGCAGACTCATCTGTAATCAGTTCTCCAAGAACTGAAT
+GAATTTCACAATATTAAACAGCTTCCCTGAACGGGGGGTGGGTGGGGGGTGGTGGTGGTGGTGCATGGAC
+ATTCGAGGAGGCACAAACAATAAGTAACTGAGGGACGGTATATTCAATTCAGCTTCCCCAGAGAGAAAAG
+AATTAACCAAGAAAGAAATTCTCCATGCCAGGAGAACTCTAAGCCAAAAGAGACTCAGGAAAGACAACAG
+CCTAGAGGACTAAAATGACACAGCCGTGGCATGTGCAAGGTGGAGAGACCACCCAGGGCCCCTTCTGGGC
+ACTCTCACCAGCCTCATCATTCCTCAAGGAGACACACAGCTCTTCACAGTTCCAAGTCCCAGCATGCTTC
+AGTGATTGAGATTCCCAGCATGCTTCAGTGAATGAGATTCCCAACACGGTGCAGTGAATGAGATTCCCAG
+CATGCTTCAGTGAATGAGATTCCCAGCATGGTGCAGTGAATGAGATTACTGGCAGGCACACTTTATCCTC
+TCACATGCTGACTGTCGATCAGTGGTTTTCAACCTTCCTCATGCTATGACCCTTAATACAGTTCCTCATG
+ATGTGGTGACCCCCAACCATAAAATTATTCCATTGCTACTTCATAACTGTATTTTTGCTACTGTTATAAA
+TCATGATATTCATGATAATTGATATGTGACCCTCCCACAAAAGGGATCATGACCCACAGAGAAACACTGC
+CTTTATTAAAGAGCTTTATCCCTGCTGTGAGGATCTCAGACTAGAACCAAGATATGTCAAAGCTCTTCTG
+GTTCCATTAGATGTTTCAGAGCCCGGCACTACAGGCTCAGACTTCTGGGAAGCAGCTGTGATGCTTGGTG
+AGGCTAAAGAGCAAAAGAAAAAGACACAACTCATGGCAAGTGTGTTTCAGACTTTATTGTAGACGAGACA
+GACCTGGGGAGGCTACTCCACCAGGAACAGGCTGGCACTCAAGTATTGACCTTCGGGGTAACTCTAGGGA
+GGGCTAGCACTAAGATAGTGTTGACCATCGGGGTAATTCTAGGAAGCGTGAGTTCAGGACAGACCTCAGT
+TCGCAGAAACGGGCTGTGAGGTTGGGTCAGGGAGCCAATGAAGAAGGGGCCATGTGAAAGAAACTGGGGA
+GAATGAAGGCTGTGTGGACTTGACTGGGAAGAGGGTGAGGAGATGGGGCTGACCAAGCTGCAGAAGGGAG
+CGGGAAGGAGAGAGAACCAGGAGCCACAGTGCAGAAGGCCAGGGTGCTGTCCCCACCCAGGGCCTGCAGG
+ATCCCCAGTGTGGACAGGTCCTCCTAGTGAGTGCCCGTGTAGCAAGGGCCTACTTGTTACTCCAGTGGGC
+CATCTTTGAAAATATAGGTGTTGCGATGTAGCGGCTACTCTTCATGTAGGCAGAGATCTTCTTGAGGCCC
+TGCACATTGGGAAGGACACAGCAGGAGCTCAGGTCTGGAGCACAGAGCCGCATGCATGCACGCATGCACG
+CACGGGTCTCCTGGCTCACTTGTCCTGAACTCATGCTGGAGCAGCGCTGCCTGCTTAGCTCCAGCCCAGG
+CAAAAGAACTCTCCCATTCCCCATATTCAAGGGGAGACTGGATCCTGCATAAGTTGTTTTATACTCGCTA
+AAAACTACAACAGAGGTTCGGAAAAAAAACCTAGACATCAATCTTCACCCCCCACACACACACATACACA
+CCCCCACCACGCATACCCACCCACCTCTACACACATACCCCCCCACACACATATACCCTACACCCACACA
+TACCCACCCTCCCCACATACACATACCCCCACACACAAATATATCCCCCCACATACACATACCCACCCCC
+ACACCCCCACACACACACATGCATCACGTGCTTCCTACCCCTCTTCAGCTCTCTGAAGGAGCCTCGATAA
+CTGTTTCTCTGCATGGACATGATCCTGGACCTGGGGAAGCTCTTTCTAGCTTGTATTTACTCACAGAGGA
+GGAATCTAACAGGACAAATGAGCCAAAACTTGACTATGTGTGCTTAGGAAGACAGCCAAGCACCCAGTGA
+CTCAGCAAATCATGAAGAAAGGGAACAACAAGCGAATCGGGCCAGGAAGAAAAGCTGAACAGGATTCTAA
+TCAGTCTCCTGACAGAGGAGTTAATTCTCTACCTGCTGCTCAGTGCATGCTCACTGCACCCTGGCTGAAT
+ACAGGGACCACTCCATGTGACTGGAGGTTGTACCATAAGCCATACAGCAAGCTGGAGAATACAGACAATT
+CCAGAGGGTGCGCAGAGGTCCACCACAGACAACACACACAAGCTAAGCTAAGGCACTGCTCTTTCTCCAT
+CCACCCACCCCACAGTGGTGCCAACTCTTGCCGTTTGCCAGGACACCCAACCTGAACCAGGACCAGACTG
+ATTAGACCAAGCTGAGAAAGGCTAGGGGTTTCAGTGGTGAGTGCTTGCCCAGTAAGCACAGGACCCTAGG
+GTTAGCCCTCAGCTCTGGGGAGAGAGGGGGAGAAGGAGGGAGGGGGAAAGAGAGAGAAAGAGAGGGAAGG
+AGGACTCAGCAGCCCTGGGGAATTTCACAAGTATCTCACAGACTGGCTTTCCACAGCTCTCTGAAGACTC
+AGTCCCAAGATGGCAGTGTTGTGCATGCCCCCTGAGGCCCAACATGAACCTCATCTAGAGGAGAATGCCA
+GAGAATAATAGTGAAGAAAAAACTTTGACCCCATTTCACAGACAGGCCCTGGGACATGAAAGAGGAGAAG
+AGTGTAGAGGAGGAAACAACTCACCAGCACACACTCACACCAGCACACACTCACACCAGCACACACTCGC
+ACCAGCACACACTCAGGCTAAATTTCCCCTAACATGGAAGGCACTCACAAGTCTACTCAGTGAGGTGCCA
+CCTTCAGCCCTGGGTTCTCAGTGCTCTACAGATCAGTTCTAGAGCCCTGGGCTCTCAGCACTCAAGTCTG
+GACTTTCTTCCATCTGGCTATGTTACATTAGCTCTGCCTCTTTCACTCCCCTAATGCCAAGACCCACACA
+CTATCAACATGCACAGCCTTGCCAGGGATCCCAAACAGGAGAATGAGGTTGTGCAAGGGCCTGAAGCTTT
+TTCTTTTGACCCAAGGCAGGGGCAGTCACTGAATGCTGCCCCAGATCCCAGTGCAGCTGCAGGTGCAGAG
+ACCTTGGGCACATGAGTACCCTCTGTAGTACCAGGCTCCTTATCTGTCCAATGGGACTCAGAATCGCATC
+AATCTTTTGGAATTTTGAAGGGATAAATATTGCCCAATATGCTTCTTTACAGAAGCTCAGAAAATCCTGG
+TTATTTTCCTAGTCTTGGGACTAGAAAGCATCATGGACCCGAGAAAGAAGACAACCCAAGGGGAAGCAGG
+ATCAGGGCATCACCTCGAAGCGGGCCAGGAAGTCCCTCAGGTTTGGGAAGGCGTCCAGGCACTTGGGCTC
+AAACATACGGTACTGGTCAAGAATGTCATAAGCAAGGAAATCCACATAGGTGACCTGCAGGAAGCCAAGG
+GTGAGCGCTCTGGAATCTGACACCAGGGAAGGATGGCAACCCCTCCTCCCCACCCCTCCCTTTACCTTGT
+CCCCTGCAAACCATGGCCTCTTGCCCAGGAACTCAGAGTAGAGCTTCATTTTCTCAGGGATGGTCTTCAA
+GAACTCTGGCTTCTGCTTCTCCTGAGGCAGAGAACAGCTGGCTGTCACCACCTTTAGACTCAGGCTCCCC
+AGCAAAGGGTTTGGCAATGGAACAGCCACCTTTCCAAGAAAGGGCCAGGTCCTAGTCTAGGAGTCCATTT
+AACAGGTCACGATTTTACTAATGACAGAATTCATGCCAGAGGTGATGGACAGCCTGGCAGAAATTTCACA
+CTGTCTCTGAAATCTGTCACCACTGAAGTCCAAACCAGGTGCTGTGAGTCACACCCAGAACCTCCTGAGT
+ATCTAGCATGCTTTGGCCTCAGAGACCCAGCGGAACTACAGAAACAAAGGGCACAGAGACACCAGGAGAA
+GACAGTTATACTGGATCCACCGGGGAGCAGGAGAACAGATCAGATATTTGAAGGATGTGCTTCAGTTTTA
+ATCAGTGATTAATACCGAAAGGAAAAGGACTCAAAGTTCCTCTGGCTGACCCTGAGCTGGCTACTAGGAA
+GATGTCCTGCTTTTCTATTGGTACCTAAGACAGACCTGTGGGGAAAGAACTGGGTAGATGTGTGAGACCA
+GTGTAGACATAACTTATTTACTTACTTGAGTCTGGAATACACAGACTCAAGCTATGAGGAGAAGAGATAC
+ACCATGGGCGATGGTAATGCCACCTTCCTGGTCAGACATCTGCTTCCTTATCACTTAGGGAGTATACTGG
+TGCCTGTTGGTGCATGGGGGTCCAATGTTGATGCCAAGTGTCTTTCTCAATCACTGTGCACCTTATTTCT
+GAGACAGGGTTTCTTAGTAAATCTAGAGCTCAAAGAACAACTAGCCTCAGTGGTCAGCAAAACCCAAGAA
+GCCCTCCTCTCCTCCCAGAGCACTATATTTATAAGACCAGGCTGCCACACTGGGTGTGTGTGTGTGTGGG
+GGGGTGTTATAGGGTGCTAAGAGTTCATATTGAAGCCTTCATGTTATGTAGCAAGCACTTTTACTCCACA
+TCCCCTAATTTCATCCTGGGGGCTTTTGGTATAAACACTGGGTAAGACGGCACATTGGAAGAGACCAATG
+GACAGATTTAGCAGAGATTTACAGCTGTACTGAATAACTTAGATCAACTGTCCACCAGCAGAACCTTAAC
+CATATCCTAAGACCAGCAGGGATAGTTTCACTGTGTTGATGTGATAGATCTCAGGGGTGACAAGGGTCAT
+CCTGCTGGTAAGATCAGAAAGCTTAGAGAATCATTACCAAAGGAGGATGCTACAGATCTGAACGTGAGAC
+TTTAACTCCTGCTGTGAACCCTGTCTGCCGCACCCACTGGCCACTCACAAAGTCAGGGTTGTAACAGAGC
+ATGATGAGCTGCATGCGGGTGTCCATGACCTGGTTCTCCACAATGTCTGCACGGATCCTCTCCTCCTCTG
+TCTCTCCATCTGCAGCACACACAGTACACTAGGCATCCCAACCTTGCCAAGCCGAGGAGATTATCCCCCA
+TCTCCCACCCTGTGGCCAGCCCCACTCACCCAGGTGGTGCTTTCGGGCAAGGTAGCGCAGGATGGCATTG
+CTCTGGGTGATCTTGTGTGATCCATCGATCAAGTAAGGCAGCTGGATGGACAAGCAGGGAGGGTCAGATG
+GGATATGGGGTACCTGAATCCTTTCTCTGGGTGAGAGAAGACACAGGGCCTGACAGTGTGCCTGCCATGG
+GAAGCCCTCCATGGGCACAGTGTGCACTGAATGAGAGAGCTGGGGCACAGCCCATCACAGAACACTGTGC
+AGCCAGCAGGGATGGAGCAGGAGAGCAGAAACACTGATCCTAAACCCTCAGCCAGGCCAGGAGATGAGGT
+GAGATCACCGTGTCTTCCCCAAACCTCTCCTCCCCTGCACCTACATTGGGAAAGTCCAGGCCCAGCTTGA
+ACTTCTCATTCAGCCACTGGCTTCTGTCAAAGTCGGGAGCTGTAAACAAGAAGATGTGAAAAGGCATTAC
+CCCACTGTCCCACCCTTCTACAGATTATCGACCAGTAGCAAGAACCCCTAAGGCAGGGCCTCTGCAACCT
+GTTACAGAGAGTTACTTTCCAATGTGGAGAAGGGTCTGAATAACAACTGATAATAAAGGCAGCCTGGACC
+CAATAAGACCCCAGTAGTGAATTCTGTCGAATTATAAATACAAATGGACCAAGGAACTCCATTCTCCATG
+ATCTATATTGGAAAATCAAACTGCACTCTCAAAGAGGTTTCTGGAAGCAGCAGCCTCTATCTTTAGCTAG
+AGGGAGACCAGCAGGGCTAAATAACGGGAAATCTAGGCATAGTTTGATAGGTAAGAGGGAAGCAGGTGTC
+TGACCAGGAAGGTGGCATTACCGTCACCCATGGTGTATCTCTTCTCATCATAGCTTGAGTCTGTGTATTC
+CAGGAGCATGCGGATCGGGTGTGTCAGCTGGGAGAGATGACCAGAAGAGACAGAGGTCAGACATTGTGGG
+GACTTCTTTGTATGACTTCATCTTCACCCGTTGAGACCTCCCACACTGCACAAACGCCGTCACCCCGCAC
+GTGCATGTGCACACCACACACATACGCGCGCGCGCACGCACACACACACAGCCACCCAAGTGCGTAAACT
+GAGCGCTGAGAGCTACAGGCAGGAGATTCCAGTGAGAGCTGAAGAGTACCCGCTGTAGGAAGGGCTCCTG
+CTCTTAAGAACTTTCTGCTTCACATCCCAGTCCCACCCTCCAAGTGGACCCCTCTCTCACTCCGCGGACG
+TTCCAGTATCCCAGTATCATAGGCATGGTGCTGGTGCTGTGGTCTTCTCAAACTGGCTTCAGCAGGACTG
+ACCTGTGTGTTTGGAGTTGTGACTTTATACAGACTACTAGAGAAACGTGGGCAGGACAAAACAGCGGGTC
+CCTCCCAACCCTGCACCAATCCCAATTAGGCACTCCCCTTCCTAGGTCTGTAACCAGAGCAGCAGACCCT
+AAAGCGGAAGGCAGAGGCCACACCCTCTCCGGGAGAGGCGGGGTTAAGATCCTTAGCGGCCTCTGTCCCG
+GGATTCTGTTTTGCACACGCTGAAACAGAATCTGCTCTCCTGCTTCCCCCGAAACCCTCTTAGCGACTGC
+CGCCATCGCCGCCTGCTGGATGCTCCCTAGACTAGCTAGCACTTTGCCTGGAGTCTGGCCAGACAGCGAA
+AGAACATTGGGTGGGTGTTCTCCCGCCTTCCTTTCTTTTGCAGAGCTCGGTTTTCACACCCAGACTGTGG
+GAATCTGTCATTGGGGAGGCTCAACTCTCAGCTTAAAGACAAAAAGGCGAGAGCCCCCGACCTGAGAGTT
+TAATGTAGCACTTATCGTCATGGAAAGAACTCTGTCAGTCTGACAGGTACCGGATTTTTGCGCACACAGC
+CTCAAGTTACAGGAACAGTCGGTGACATGGAAACAACCCCAAGGCAGAGATAAACAGGGCTCCGTATTTT
+CCTTTAGGGGGTTTGAGAACTGTTATCAGTTATTAAAAAACTAATCTGTATTTTGGCCGTGTTTATGAAT
+CTAAATTCAAGTACCAACCACAGAGGGGCCGGAAGTTTGAGGGAAGCCACAGCTGTGGACAACAGTCAGC
+TGACAAAACCCAGCAGAGCACAAGGGAGAAAAAGAGCCCACGCTGGGTTGTGCCATAGAGAGGGAGAGCA
+TGTGACCCAAGAAGTTTGCTTTCAAGGGATACCAGGATGCTGTACACATGGGAGCCCAAAGCAAGACCCT
+CAGCTGTCCCTAAATGCAGGACCTGGCCCAGGCTCCGGACATGCGTTTGTATATGACAGGGTGGCTCAAA
+TGACCTTTCAGGTTACCCTGAGATATATAGTAGTATATCCTTCACAAGTAGTATATGCCCTGTAGTATTA
+CCTTCACAAGGTCAATGTCCTCCCAAGAACACTGTCCTACCTGTTTACTTTCTTTTTTGCAGACAACCAA
+GAGGCTTGATGGAAGGGGAGACTAAAATGAGTTTACAAGTCAAAGGCCAAGGCTGTTGCTCTGCAGGACG
+TACATTTTGGGATTAAATCATCCTTTACTGGGACTACGGGGTATGAAGGCAACATATCATGTCACCCACA
+TAGGAGCTGTGCAGTGAGGTTGCAGGGGGCAAGCAGAGGCCATCTCATCCTAGGACATTAAAGAAACCAC
+ATGTTCTGCAGGTCTTGGTGAGGATGCACGAAAAGTGACCATGTCACTGGGGCTTTAGCAGCAGTTTGGT
+GGGAACTACAGGGACATATTTTGCAAGTATGAATGAGGCACTTCAGCAGATTGTCACACACTGTGGGAAG
+TAGAGAAGGAGGAGTGGAAGAAAGCAGAAGGAATCCATACTGTCTAGGACAAGGACTCAGAGGGCCTGGG
+AAGGGAGCCAGAGCCCTGGAGCCCTTCAGAGGACAGAATCTATCTGCAGACAGGGATAAGGGCTAGGACA
+GCAAGTTCTCAATGCAGGGTCCCCAGAGTCCGAGATCATTGTCCCTTTGAATGAGAAAGGTCCCTGAAAG
+GCAGCAGGGAACTGTGGGGACTGGAGACGTGGAGAGATCTGAAGCTAGGAGTCACAGCCAGTGCTGTCTT
+GGAGAATTTAATCCCAGCGTTGGTCACTGATCCTCTGGGTGTCTCTGTGCACTGTCACAAGTTCACAGTG
+CAGCCAGCCCTAGTGTCTTAAAAGGACACTGTACAGGAAAAAAATCTAAATGTGCTGGAGGAAATTTGGC
+TTATAACCTATAGAGTGACATCTGAGATGGATTCCGTAATTTCTTCCAGGCCAATGGGTGACATTCAACA
+TGGTCCATGCAAAACCACAGAATTCTATAAATAATGGCTTTAAAAATTTTAAAACTCGGGGTGGGGGTGA
+AGAAGGGGCCGGCCTTCGAGTGACAGCGGCGCAAGATGTCAGCCACCACAGGCTCGGGAGTAAAAGTCCC
+TCAAAATTTCTGACTGTTAGAAGAATTGGAAGAAGGACAGAAAGGAGTAGGCGATGGCACAGTTAGCTGG
+GGTCTGGAGGACGACGAGGGCATGACACTTACAAGATGGACAGGCATGATAATTGGGCCTCCACGAACAA
+TCTATGAAAACCGAATATACAGCCTTAAGATAGAGTGTGGGCCTAAGTACCCAGAGGCACCCCTGTCTGT
+AAAATTTGTAACAAGAGTCAATATGAGCGGCGTGAGCAGTTCGAATGGAGTGGTGGATCCGAGAGCCACG
+GCAGTGCTGGCAAAGTGGCAGAACTCCCACAGCATCAAAGTCATCCTGCAGGAACTGCGGCGCCTGATGA
+TGCCAAAAGAGAACATGAAGCTGCCACAGCCACCGGAAGGACAGCGTTACAGCAATTAGTCACCAAGGCC
+CCGGCCTCCCCTCCCCATCCAACCCAAGTCTTCATTTTCCACAGTAGTGAATTTTCTAGATACATCTGTA
+GACCTCAAAGTACTGGAGAGGAAGCTCCACTCAGCCTTCATCCTGAGGCACTGTTCTGATACTAACTTCT
+CGTCCATTTGAAATACCTAAGTTGTGCTGTGTAACATCAACTGTCAAGTGTAACCGCTGTCTGCCTGGTT
+GAACGTCTGGGATCAAGAAGGTGTTGAAATCGGTTTCCTGTGGGAGCGGTGGGCACAGCTAACACAACTG
+TGAACAGACACGTCACACAATCACCTGCTGCTGGCCCTTGGCCTGAGTCTGCCTTTGCCCTCCCCGCCCT
+CTGCCACGGCTGTGTGGTGGCCCTTAGAATAGATGGGAAGGCTTCAAGTAGCAGTTGTGGGACTGACTAC
+TGCTGGGCTTGGGGCGCTTTGGCTGCACCCCTGCTTTCTTCAGTCTTAAGTGATGCCCCATCCAAGCCAT
+GGTCCCCACTCTTCCACTCCCACCCTTGGCCAAAGCTTAGATTGTAACCCTCCCCTCCCTCTGAAATTGG
+CCATGGGTGAGGAATTCGGGGCTTCCCGTGTCCCCACCTTTATCAAGGGGTACTGCTTTCCCCTCCTCAA
+CTCCCCTGTTGCCCATCACCACCCAACACTTGCTGTGGCCAGAAGCCATCAGATGAGGTTGGAAGAGCCT
+GGCCTACCTCACTTAGCTCCGGACCACACTCACCTGCCACCAGCCTGGGAAGGGAATGCCAGGTCCTTAG
+CCCTGCTGCCACCATCTTTGCACTCAGGTCTAGAGGTAAACAGAGCAGTCACAGGGCGACTCGGACCGGC
+CAGCGATCAGGGTGGTGGTACACACCTTTAATCCCAGCACTCGGGAGGCAGAGGCAAGCGGATTTCTGAG
+TTCGAGGCCAGCCTGGTCTACAAAGTGAGTTCCAGGACAGCCAGGGCTACACAGAGAAACCTTGTCTCGA
+AAAACCAAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAAAATTC
+CCGAGACTGGAGAAATGGCCCAAGTGGCAGAGAGCTTGCTAGGCATTCATAAAGCCTTGCAGTCAATCCC
+AAGAACATCATTAACAAGGCTGGTGGCGCTCACCTGTAACCTTAACACTCAGGAGGCTGGGGCTGGAACA
+GAAGAACCATCAAGGTCATCTTCAGCCTCACAGCAACTTTAAGTCCAGCCTGGGACAGAAGACCTTATTC
+AACATGGGAAGATTCCAAGTTAGTACACATCAGGAGACAGAATAAAGCAACTATAAATCTGAACAAATGA
+TTTCAAAATTGAGGATACATTTTTTCCATAGATGCATAGTTTTAACCGACTGCATCTAATAAATACAACA
+AGATATGGTTTTTAAAAATTAAGAAATGAGTGAAGTAATAATGCATACTAGCGATGTTTTCCCAAGCACT
+GTTTTCCAGAAACTGGCTGTACAAAATTACAAACCTGCCTAAAATGTACACGTTACACCAGTTGTCAGCA
+ACTACACAGATCAGACATGGGCGGCGGCACAGAGAGCATGTCTGTAACTTACTTATTCTGAGCCTAAATC
+CCCACTACACACTACACATGTGTAACTACATATACACATAGTATACTGTAACTTGGTAAAAATAAATAAA
+TAAATAATAAAAATCATTCATCCAGAGCATGGTAGAGTCACCCATGAGGAAGAAAGGAGAGGGGTACGTG
+ACCAAATGGGACAGCTGGAATCCAAGGCAGGAACTAGACCCTCTGCAGGGTGCTTTGAACACACTAGATA
+TCACAGTGCAGACATGGACACCCACTCAAGAACCCTTCAAAATCTGTCCGACCATGTTCCCTGGTCAGGA
+GGGAACCTGAGGCGCAGAAGGCTTAAGCAACTTCTTTCTTTTCACACTGCTAGTTAGTGGAATAGGGGCC
+CAGTCAGACTGGCTCTGTGGGTGTCGTTCAGTGACATTCCTGGGCTCATGTCTCCCATCTCCTTCGAAGG
+GCTTCATCCCTCTCTCTTCATATGCATAGGCATGTTCCCGGTTCACAGCTAAGCTCTTTTTAAAGCTTGT
+TTGCACACAAGTCGAACCTCCAACAGGACAAAGAGGGTCAACAGGTCAGTGCAGACTAGAAGAGATCTTA
+TCCTGTGACCGAGCATGAGAAAGGTCTGGCCAGCAGATTCAGGAAGAAAACCTCATCAGGGCTGCCATCA
+AGTCCCAAGACCCACTTGGGGAGCCTGAGATCAGTCTCGCTGAGCCGAAACTAACCTGCTCCCCAGTGGA
+CCAGTCAGAACAACAGCTGTGTATATTCCAGGCCATCTATGTCAGGACAGCGAGTGGCAAATTGTTTCTA
+AAGAGTTCAAAGATGACCAGGGATTCCTGTACCCAGCAAGACAGGTGCAGCCAGCCTCAAGCACAGATCT
+TGGCTCTCCTTCCCTGTCCCATCCTTCTTCTCCCCCCACAGCCGTGCCAGCTCTTGCACAATGAGAACTA
+GCTGAGAGAGTGTGTAACTTGATGGGTGTGGTCCAGAAAACAGACTCCAGGCTGAAATAACCGTTTCCCT
+AACATTTCCGACTCCGAAATCAATAACAAAGTAGGTTGTGTGGGTGTCGGAAAAAAAGTTAGTGCTCCAT
+GATTATCTGAGAGATGAGCATCAGACTGGGCTCCCCAACACCCAGTAGGCACCATATACACCTCAGAATC
+TGGGGTCTTCCTCCCTCCCCCTCCCCCTCCCCCTGCCCCTCCTCCTCCCCCTCTTCTCCCCCTGTCCCCC
+TCCTCCTCCCTCTCTCTTTCTCTCTCCTACCCTCAGTCTCAGGATGAAGACATTTCTATCTGCCCTACTG
+TACCTGACACCAGAGGCCCCTGGTGTGAAAATGAGAATCTGTCCCATTTTACAGAGAATCCATGTGCTAC
+AGGGGAAAGTACATGATAGCCAGGTGTCTGTTACTTAGACAAAGCAGGAGAGGAAATCGGTCCAAATGAG
+GACATATTTATTCTTCCCCATGGCCTCAGAGCTTTCAGTCCATGATCTCCTGGCTCCACGATGGGAAGAA
+CATTTACTGCAAAGAGTGAGTGCTGGAGAAAAGCTGCTCACCACATGGGGTGAGGAAGCCAAAAGATGAA
+AGAGACAGGTTGTGATATACCCAGCCACACCTACCCCAACATCTACTTCCTCCTAGTAGCCCACCCCTGG
+GTTTCCAACACCTGTCAAAAACCTTTTATTAATCCACAACTACACCACAGCATTGGTGGGTCATGGCCTC
+ATGATGTAGTCACGTCTTAAAGATCCACCTTTGAATACCATTACACTGAGAACAATGCCTTCATTCACTA
+TATAAGCCTTTGAGGGATGTCTTAGTCAAGGTTTCTTTACTTTGTTTAATTTTAATTTTATGTGCATTTG
+GTGTTTTGCCTGTATGTAGGACTTCATGAGGGTGCCAGATCTCCGAGTTACTGACAGTTGTAAAGCTGCC
+ATGTGGGTACTGGGAATTGAACCCTGGACCCTCTGGAAGAGCAGAGTCATCTCTCTTGGCCCCATCACTA
+GTCAGGGTTTCTATTGCTGTAATAAAGTGTTCTGACCAAAAGCAACTTGGGACCTCCCCCAACTAATTAA
+GAAAGTGCCATACAGGTTTGCCTATGACACAATCTTAAAGAGGCATTTTCTAAATTGTGGCTTCCTCCTC
+TCTGATAATTAGCCTGTGTTGAATTGACATAAAACTAGCCAGAACAAGGACAATTCAGATACACACTTTA
+ACATGCAGAAGGAGATGAAAAGAAACTACCACACCACAAGATGTTCCAGATATAGTCCTTGAGAAAGAAC
+ACTGGATACTTACTCAGACCCAGCCCCAACCACAGCCTGGGAAGTCAGAGAGATTTCCAGTCCATCATGG
+CAGCAGGAGCATCTCCCTTCCTTCCACCTTGTGCCTGGGAAGCACTGGCCACTCCCCAGAATCTGTAACA
+TCTTTCATGACCCAGAGAGAGTCTCCAGATACTCTGGGTTCTGGTTCTGAGGACAAAAGCAGCTGTCATT
+ATCCTTATGCACAACCTCCCTGGATGTCCTTGCAGGGCCGTGCAGGCCCCAGCTGGGCAGGCTGAGCAGC
+CATCTTCCTGAGCCTGGGCTCGGGTTATGGTAGAAGCTTCAACACATCAGCATTGGCTGGACTTCAGGTG
+AAGCTGATGTCTACAGATGTGATGGGAAACCAGATCGGAACACAGTACTGTTGGCCCGCCATGGAGGCGC
+ACATCTGTAACGCCACTCAGGTGGGTGAGCTGGGAAGATTGCAGTGAGTTTGAAGCCTATATAGTGGTCT
+CAAGACCAGTATGGGATGGAGTGAATCTCATCAACAACATGTCTAAAGAAATGGCCCAGCCATTAAGACA
+CTTGCTGCTCTTGAAGGTGACCAGAATTTGGTTCCCAGTACTCACATGGGATGACTCACAACCACCTCTA
+ACACCAGCTCCAGAAAACCTGACCACTTCTTCTTGCCTCTTAGGGTATCCATAAATATGTGCATCACACA
+CACACACACACACAAACACACACACACACACACACACACAAACACAAACCTTTAGAAACAATATTTTAAT
+CTGAAAAAGAAAAAACAGCACAATAAAAACAAAACAAAACAAAAAAGGCAAAAACAAAATCAAGACCCTG
+TCCCTGTGTGTGTTGCCACTGAAGCAAACTCTCTCCTGCTGGCTAAGCAAGCCCAAGAGGCTGTGAATAC
+CTGATTATATTCCACAATGGATAGGCTGAGAAGGCAAGGCACTGAAACTGCTCCCCATAGGTCGAGAGCT
+TTACCCAGTAGATGTCGGAAGAATGAATAAAGCATCTGGGGAGCTTTCTACAAAGCCGGGGAGTAACTGA
+GGCCAAAAGGAGGGAAGGAGAGGAATGGAGGTTCTTTTGACCCTGAAAACAGCACTAAAGCCAGTGCAGC
+AGACTGAGTACATGAGAGGGGCAGTACCGGGAGAGACTTGCAGCGTTCATAAATCAGAGTTCTGTGTCAA
+CCTCACAACCCGGAGTTCTGGTTCTCACTGATTGAGCCTGTCTAACCCAGAAATGCACAGGCAGCATGAA
+TCAGCTAGACCGGAGCCCTTGGCTGGGGGCTGAGGCCAGAGGTAAAGAGAGTTCAGGACAGGACTTTACT
+CTCTGGGAACTGGAGGTCCACTCTGGGAAGGACAAACACTGTGTTGAGATAGGAGGGCCCTGATAACAGT
+GCTCACTTGATACAGGGGTTTGCAAGGTCAGTGGCACTAATATGGGGAGAGACCTGGGTTGTAAGTGGCT
+TCAAGGGGTTCAAACAGGAAACAGGACCTGGGTTCACAAGAACCCAAGAGTCCTGACCCTGATCCAGTCT
+GGTTCATCCCAGTTCAAGGCACTGGACGATTCAGAAAGGTCTCAAAGAAAGATGGACAGCACTGAGAGAG
+GGCCCCAACTTTGGAGTCACTGACCCTTATTTATATACATGAGTGCATGCACACACACACATATGCACAC
+ACATACATGTGCATACACACATAAGATAAGCTGAGTGTTCTTAGTTAATATTTTGTCAATCTGACACAAA
+CTAGAGTCATTTGAGAAAGGGATTCTCAATTCAAAAAATGCCCTCATAAGATTTGCCTGTAGGTAAGTCT
+GCAGTGCATTTTGTGATTATTGATTGATTGGTTGATTGATGTGAGTAGGGGTGGTGTCACCCACTGGGCA
+GGTGGTCCTAGATGATGTAAAAAAGCAAGGTGAGCAAGCCAGTGAGCAGGACTCCTCCAAGCTTCTGCTG
+CAGTTCCTATTCCCAGGCTCCTTCCTTATTGCCCTTATCTGATAGGCTGGATTGTGGAAATGTACGCCAC
+GTAAGACCCACCCCCAGCCACGTTACTCTTATCTGGTTGGTGGTTTATCATGACCATAGAAGCTAAGTAA
+GACAGCGAGTGAGCTGAGAAATCCCTGTACTTCTGATTCACAAGTCTCTAGCAACTGGTGATGCTGAACT
+ATTTTTATATGCTCATTGGTCGGGAAAATGTCTACTCAAGTCTATTGTCCACATTTGAATCCAATGATTT
+GCTAATGCTGTGGTTACTTGCTTGTGAATAATAATCTCTTGGGACACACTAATTATCCTGTGCTGTACTG
+TTTCTGTTGGGGGGAGAGTGTCGTTTGGTTGGTTGTTTTATTTGAGACAGGATCTGTCAACATAGCAATG
+CTATTCAGGAACTCACTAAGTAGACCAGTTTGGTCTCGAATGCACAGAGATCCTCCTAACTCTGCCTCTG
+GAGTGCTAGGACTAAAGATGTGTACCACCAGCACCACTATGGCCAGCATTGTATTGTTGCTTCTCCATTC
+TAATTGCAGGGTGGAGCTACAGAGGAAAGAGTATGGCCCACAAAGCCTAAATACTGGGCCCTTATGGGGA
+ATGTTTGCCAACCTTGGTTCTAGGTCCATGTCAACATTATCAGAGTGCCACTCCTGACTTCACTGCATCC
+TTTCCAGCTTATCTTCTTTGTCTTCTAGTTAAAATTTAACAAACACTGACTAAAATAATGTGAACTTACC
+AGTTTCACAAACATTTCACTTCTATGAATTTTTTATCTTATGACCTTTAACTTTTAGTTCCTCATATATT
+TTAGCAAGCCTGCCTGTTCCTTAGCTGCAGTAGGCAACAGCAGGCAGGGGGCATGAACACAACACCTGTC
+AGGGCTCTCATCCAGATGTGCTACCTGATACAGGTAGGTAGCATCCAGATGTGCTACCTGATACAGGTAG
+GTGCCATCCAGATGTGCTACCTGATACAGGTAGGTGCCATCCAGATGTGCTACCTGATACAGGTAGGTAG
+CATCCAGATGTGCTACCTGGTACAGGTAGGTAGCAGGCACCGCCTGACAAAGGTCAGGCACTATGATCCA
+GACCCTGGCGTAGGAGCATGCTCAGTAGATACTTCTGCAGTGTGTCCAAGTCCTTCATGGACAACTTGAT
+ACAAAATGTCCACAGCATCTTCAACCATAGGCTGAGACACTGGGGCCCAGCAAGCCAGGCCTGCCCATTC
+CTGATCATGTTCTTCCAAGGCAGCTGCAGCTTTTTAATGAGCCTGCTTAGCTGAGAGAGCCTCTGACTCA
+GACTCCAGCACCTCTGTGTGGACACATTGCTGTAATGTCTCCGGATAAGATTTTTCTTTCCAGAGCTCTG
+AGCCCTTACAGTATAGTGTCTTAGGGCTTTACTGATGTGAACAGACACCATGACCAAGGTAACTCTTATA
+AGGACAACATTTAATTGGAGCTGGCTTACAGGTTTGGAGATTCAGTTCATTATCATCAAGGTGTGAGAGC
+ATGACAGTGTCCAGACAAGCATGATGCAGGAGGGGCTGAGATTCTACATCTTCATCTGAAGGCTGCTAGC
+AGAATACTCGCTTCCAGGCAGCTAGGATGAGGGTCTTATAGCCCACACCCACAGTGACACACCTATTCCA
+ATAGGGCCACACCTTCTAATAGTGCCACTCCCTGAGCCAAGCATATACAAACCATCACATATGAATAAGT
+CAAAACTACTCTGAAGATAATTCATTCAGATTCCAAATTTCCTAAACACCAACCAAATGCCAAGCTCTGC
+CATCAACAAAGATGGAGCAGTCCAAAGAAAGGGACCTCTTCTGTAAGAATCCCAGGTTGAGGACCAAAAA
+CCATGGACTAGGGACACCAAGGATCTTAATGACTTACAGTCCCCGCATTCCCATCACCCAGGTCCTGCTG
+TGGAGAACAAAACAGAGACAACCTAACCAAGTCAACATAGTCCTGTGAGCCACAGACAGTTAGGCAAGAC
+ACAGAACAAACGTGGTTCAGACTTTACCAGGGATCAGGCAAGACTTGGCTGCTCACTCCAATGACACGAG
+AAGCACATGGGTGGGTCTGGTCCTTCCGTCCATACTGGGGATTCTGGGGTCAGATAATTTCCTCTAAGCA
+GAGAGATGTCTGAATGCAGAACATAGACAGGGAATAGGAGGAAACTGATGAAGAAATAGGGAGCCCCAAT
+GACCTGGGAATTGGGAACTTAGGCTGAGGGAAGGGCTAGGAAGAAGAAACGGGGGAAAGATAAATGGAAA
+GAAGAGGAAACTGGACTGGGGGAAAGGACACAGTTGTGCCCAAGGGCCAGAGCCAGCAAGGGTATTCATG
+AAAAGGCCCCCAGGTCTAGGCTTTGTAGTCTTGTTTTGGGTTCCAAAGGCCATGTTTGCAAAAACTGGTC
+TGCGAAGGAAGCAGATGCCCTTTCTGTGTGTTAGCAACACACATGGAGGCTAAGGGCTTCTCTCCTGCTC
+ACACAGCTAGAAAGGCAGCAGTCTGTCCCCTGTCTACACAGGGATGGACTCTGCGGCTTCCATGTTGCCC
+TCCTAGTTCACTCTCAAAGCAAGAACTCCTTGTCACTCTTGTTCTTTCACCAGCAGGCTCCTCCTTGCTG
+GGGCAGTGAGGGTCAGGGGACTGCAAGCAGGGCACAGGGACCAGTAACATTCCAGTGTACTTTGGAAGAG
+AGCAAAGATCTCACCCAGTACCTCAGTAAATCCTGCCTGAATGACCATAAGGATGGGAGACCAGCAAACA
+GATAACTGAGTGGCCTTGGCCAGTCTCCTGAAGCAGAAAATCTAAACTCAGAAATCTAGTAGCTGAATGC
+AGACTTAATGCATCCACCCTATTCTTCCCAGCCCTGCTAGCGAAAGACATTACGCTCTGGGCCATCTTTT
+CAGTTGAGTGCCCATGAATAAGTCAAAGTTCCAAGGCTGCCTGTACCCAAGAGGCCAGGCCTGGCTGGCC
+TTGGACTGTGGCCTCCCACTCTCTCCCACCACACTGTACAGCCTGGGAATACACAGTAGAATAGAGATTG
+CTCACACGGTCTCCAGGGTGAGCAGCCTGGATCAGAAGCAGTGTGGTGGGAACTAGCTGAGAAATTACTT
+TCTCTCCCCAGATGCCTGTCCCTTACACACTCCTGACTACAGCTTTTCTCTTTGTCATAAATCAGGGACA
+AAGAAGAAGACTGTAGTGACTGCCCTCTGCCACAGGAACCAACACCACGGGAGGATGTATCCATGCTCAC
+TATCACTCGGGAGGATGCGTTATTCATCATCACTACCATCTGTCCAAGAGCTCTCGATAATTCTAGCCAC
+CCCGTAGGTTGCTGTAATGACAAACTTCATTCAGTAGATGTAAGACCTTATTGAACATAGGCCCAGTCTC
+CAATAAATAACAATTATTCTTCATTCCCATTCCAAGGGAGGCATTGTGGAAAGGGGGAAAGGAAGAAATT
+CGTCCAAAAGGAGGGGGATCCAGGAAAGATCGCCTAAAATGGAGCTATGAAATCCTTCCGGTCTGGGAAC
+GGGTTCCGACACTCAAATGCACAGTGCTGGTGAAAACCATTACAAGTGAGGAAATCCACAGAGATAGGTG
+CTCTACAGAAAGGTGAGGGCGTTGAGACTTGGTAAACCTGGAAGAACAGCTACTCCTCAGCCTCATCTCC
+CCCCCCCAACTTTACCTTATCTCCTGCAAACCATGGGTGAACCCAAAGAACTCACAGCAGAGCTTGATCT
+TCCCAAAGATGCTCAGGGATGCCCTCTAAGTCCTCTGGCTTCCATCTCTCCTGAAGCACAGTGTCCATAT
+CCTCAGACATGGTGCATCTCAGGGTGATGTTCAGGCACTAAGTTAGAGCATTACCACGATTAGACTGCTC
+TTAGGGAACAGCCCAACAGGATACAGCCCTGTCTCCTGGTCTCTGCCACTATTGTCAGGACCACTGAGCA
+TGAGACAGCTGTGAGCACATGGCTCTGAACCTCACATGAGGACTGAAATTGTCCAGGAAACAGGCCGTGC
+TCTACCTACTGGTGCGGTGAGGACCTCAATACAGATTCCAAAGAGATGAGGTGAAAAGAGGACAAAAGGA
+AACAGGGGCAGGATCAAAGGCCTGACTGACTGAGTTGCCTATGGGAACGTCTCTGAGTGTGTCAAGGACA
+CAGCTCTGGGAAGCCCTTTGGAAGGAGAGTGTCCCCTATCCCCAGTGTCTTCTTCCCATGAGTGTGCAAT
+CTGCAGACATGGCAAGACTCCAGGACAACAGAAAGACATGACTCAGGACAGAACCAGCGCTTAGACCTGG
+AGGGATCGAACTCTGTCTGTCCTATCAGGGACTAAAGTGTCACTGTGGAGAGTCTAAACTGGACCTTAGG
+CCGATGACTGGAAGTTTTCATTTGGCAAACTAAGCTTTGAGGTTACTCTGTTGGCAATTTCAAACACTGT
+GCCATGAGCAGCACTGAGAGGCTCTGGTGGCAAACAGGGCCAGAGCAGATGCAAGGGCAAGAGTGCGGGG
+CCAGCTCCAACTCACAAAGTCAGGGCTGTAGCAAACCATGGCCAACTGTATGCGGGTGTCCATAGCCTGG
+TTCTCCAAAATGTCCACACGAATCCTCTCCTCCTCTGTCTCTCCACCTGCAGCACACACAGGCCAGACTC
+ACCCTCAGCATCCCACTCCAGATAAGCCAAGAGGAATATATGGTTCTGTCCCTAACCCTGCAGCCAGCCC
+CAACCACTCTGTTAAACCTTATAGGCAGTGAGATACAGGATGGCATTGCTCTGGGTGACCTTATGTGACC
+CATCAATTAAGTAAGGCAGCTGAACAGACAAGCAGGGAGGGTCAGATGAGATAATGGGTCCCTCCATCCC
+TTCCCTGGATGAGAGCAGACACGGGGCCTGACCCTCAGTGTGCCTGCCATGGGAAGCCCTCCATGGGCAC
+AGTGTGCACTGAATGAGAGAGCTGGGACACAGCCCATCACGAAACATTGTGCAGCCAGCAGGGATGGAGC
+AGGAGAGCAGAAACACTGATCCTAAACCCTCATCCAGGCCAAGAGATGAGGTGAGATCACCGTGTCTTCC
+CCAAACCTCCCCTCCCCTGCACCTATGTTGAGAATCCAAGGCCCAACTTGAATTTTCAACCCAGCCACTG
+GCTTCAGTCATAGTCAGGAGCTGTGGACAAAATGACAAAGATTGCTCTCTGTTCACTCCCCCTTATTGAG
+GACCTTCCCAAGAAATCAAGGACAAGACTCCCTGAAGCAATAAACCCGCAGCCTCATGCACTGCTTCCCA
+TTGGAAGGCTGCAGAGATGAAGCTCACGTCACCCCTGTGGTAGGACTGTGGGCCCTACTTAACTAAGCTT
+ATACAAGTCTGCATACCCTCCAAAGTGAAATGTGGCCCAATTACAACAAAAGGAAACTCAGCCCTTCATT
+TCCATTACCCTGTCTTAGAAGACCAACTTCAAGGAGGTTTCTGGAAGGGCACATTGTAGTTCCAGCAGGA
+GCAACCTAATAACAACAGGAGGACGAGGCGTGGTCTGTTGGACATTGATGGTTAGAAAATCAACAAAAGT
+CTGACAAATAAATGTTGTAATCACCATTGCACTCGGTGTGTTATCTCTTCTTCTTGTGGCTTGGATCTGT
+GGACGAAGCAGGAGGATGGTGTGAACCACCCATGAGACGATGAAACCCTGGCAGGGTGACTGCCTCACTG
+CCTGAAGAGTCTCCACCTTACTACTAACAACCAGCCCTCTACACATGCACACACCACACACACACACACA
+CACAAACACATGACACTCTTGTGTTTGAACTCCTGATAGGCAAAGCTTTCCAGCAACTTCTGAACTAAAC
+CCATATTTTAAAATCTATTCTTAGGTACTCCCCAGTCCTGTGAAGCCCCTCATTCATCCCACATATGTTC
+CAGTGTCATGGGCCTGATATGAGTCTCCAGCAGGTGGAATAGGGCCTTTGTTTGTACAGCCTTCTTATGG
+AGGCCTGGGTAGGAGAGAGGCAGGACTTGGCTGCAGTCTCCACTTGAGCCACTATTCTACCTTTCTCCAA
+CTCCAGACATTCCACTAGCAGGCAGAAGAGAAGTGCTGTCCAATACCATTGTGGTTTATCAGAAACAGGT
+CAGGGCACTCAGGAGCCAAAGGAAGGGCTGGACTGGCTGGCTGCCCACACTGGTCTGCCAGTGGCTTCCT
+GGTTGGTTATGTCCACAGGGAAATGGCAGTTTAGTTGTAGACACTCTTGATGGCCAAAGTGGAACACTGT
+TATTATGACCCTGGCATTGTCATTTCTTCATTCATTATTAACTCCCTGCCTGGGCTCTGGGTGGCTCCTG
+GTGGCTGAGCCAGGACAAAGTAGGATACCTCAAGCCTGAGAGATTAGTGTCTCTGTGGTCTATAACCTTT
+CATAGACAGCAGAGGGCATGCAGGACTCTGCTAAGCTGGGCAACAGAGCCACATACAGGTACGACTAGTG
+GATGCAGTACAGATATTCAGGATCACTGGGCACTGGGCAGATGTAAAGCAAACCACAGTGAGATACACCT
+TCCCCAGAAAGAACTGCCATTCTCACATGGGGGCTGGTGAGACTGTGGGCAGGAGAGAACATTCGACACT
+GCAGGTAAGAGTGTCAACTGTCACAACCATTAGGGAAAACAACCTGGTGTTCCTCGAAAAGATAAAACTA
+GAGCTACCACGTGATCCATCAGTACTGGATATACATCCACAGGGAGTGAAATTACAGGAGAACACACTTT
+ATCATAGCCAAGAAATGGATGCCATCTAAGAGTTCATCAACTGATAAACAGATGAGGAAATGTGTTACTG
+TGCACCATGGAATACTATCTAGCCATAAAGAAGGAAAATCTCTAACCAAGAATGCTCGGAAAACTCAATA
+GTCACATGCAAAAGATAGAAATAAGATTCCCCCAACTCTCATCCTGTACAAAAATAAATTCCAAACGAAC
+CAAAGGCCTTAAGTGTAAGAGTTCCATAGACCTCAAATCTAAGAGCAAGACTGGGCAAGTGAGAGGGCAT
+CAAAAAGAAAGTTGCTACACAGCGAGGGAAACTCTCAGGAGAGGAAGAAGACACATTCCGAGTGGGGGGA
+AGTCTCAGCCAGCTGCTCATCGGACCAGGGATTAATATCCCAAACACGAAAAGAATACAAAAGTTCAGCC
+AAGTATGACAGCACATGCCGGCAATCCAAATACCAAGAGGCAAAGACATGAGGACCCCAAGTTTGACCCC
+AGCCTGGGCTATTATATAGTGGAACTTTGTTAACAGACAAAAAGATAAAAACCATAACAACAAAATACTA
+AACAAATAATGAATGGGAAAATGCTCTGAAAGTTGCTGCAGATAACAGCAGCTGTGTTTTTCAGATCTTT
+GGCATTTTTAGTCATCAAGGGAAAGCAAATGAAAGCTAGGCTGAGACTCCACCTCAGCTCTGTCAGAATG
+CCACTGAAAAATACAAAACAAACTGTCAAGGAAACAGTAGTTATATACTGTCTGTAACCGCACAAAATAG
+AGCAGCTCTATAAAAACTGATCCTCAGAAAACAAAACAAACAAACAAACAAAAAACAACAACAAAATGTC
+TGCCATACAACCCTGCTTCAACACGGCACACCCATGTCTACTGCAACTAGCTAGAGTAGCCAAGTTATGG
+GACCAGCCTAGGTGCAGCGCAACAGATGTGTGGCTAAATAAGATACCATGGAGTGTTATCCATAAAGGAT
+GGAGCTGGCACTATCTAAAGGAGTTGGGGAGCATCTCACTAAGTGGAGTAAGCCAGATGCAGGTGACAAA
+TGTTGCATGCTTTCTCTCATGTAAGGAAACAGATAGCCCTGTGTTGGAAAGGGATTGTGGGCTTCCCAAA
+CCCTAGGCACGGATACTGCAGGGAGAGGATGCTAAAAGCCATCCTCATGTTTGGAAGGTCCACTCCTATC
+CTTACCAGATCCTGGTGCTTTTGAACACATCTTCCTTCTCGCAGCCTGTTGGAACAGCCTTTCCTGCCTT
+CCCTAGCTGCTGAGAAGAAAAGCCTCCCGAAGCCGGTGTGTTGATAGCTCCCTCTTCCCTGTCCCTTGCC
+TTCCAGGTGTTTGTGGACCCCAGAGGGCCAGGGAGGGGACAGCAGGGCCACCATACAAAGTGGAACTGGA
+GACACTGCCCACTCAAAACATCATGAAATGACTTTTTACACTGTTAGAAACTGGATGGGAGACAGCAGAC
+ACCATCAGTTGCCTCAGAGGGTTATGTCCTGTCCCTGTGTTGAGTGTTTCTATTGAATTTGTTGCTAGTT
+TTTCTGCTACCTTCCCAACAACTATGTAATAAACCTTATAGATTCTTAAAAAAACAAAAACAAAATGAAG
+AATTACTAGAAATGAGAGAGGGACAAGTAGGGACTGGCAAGGGAATTGAGGGAGGACTGTGAGGCTATGG
+AAGGGAAACAGTAAGCTATAAGGGAAGTAAATATGAACAAAGTACAATATATGCCTATGTTAAAAGACCA
+CAGTGAAATCTACTAGTTCATATAATTAACATGAATTAATCGTTATAATAAAAAACGTTTCTAAAGAATA
+AAATCTTGAAATCTTGCAATATGCATGGATGCAATTAGAGACCATTAAGCTAAATGGCACCTGCCAGGCA
+TGGAGAGATTAAGTCCCACCTGATTTCAGTCAGAGCTAAAGCAGTTCATCTCATCCTTGGTGAGACTCCA
+GCCAGCCAGTATACTGCATGCCTACAATCCCAGCATGCAGAAGCAGAGGGAGGAGGGTCAACATACTGTG
+AGGGTAATGTGGCCCATACAAGGCTATGCCTCAGAGGAGGAAGAGAGGGGATGAGGGCAGGGAGAGGAAG
+GGGGGGTTCTGTGGGAATGGGGGTCACAAAAGCTGAGGAGAGATGCTGTTCAGTGGGTCCTAAGCTGCAG
+TTGGGAGGAAGATACTCTGGTGTGCTGCTGCACAGTAGACTATACACAATGGCCATGTGTGTATTAATCC
+CAGTAACCAGAGGAAAGGACTTCCCGTGTTTCCACTGCGAAGAACTGATATATGATATATGATATATGAT
+ATATGTTTATGTTTAGATATGCTTGGCCTGGTTTCAACATTACATAATATTTAAGTGTATCAAAAGCATT
+ACATGGCATCCTGTTCATACATATGGTTTTTATGATTTTGTGTAGGTTAAAAAAATAAACTTGTGAATCT
+GTTCTTTAAAGGGAGGAAATAGTTGGAGAGATGGCTTAGTGTTTAAGAACACATGACAGCATCCACAACT
+GGTAGCTCACAGCTGCCCGAATGGCAGTTCCAGAGGATCTCATGCCCTCTTCTGGCTTCTGTAAATACCT
+ACAGTCATATAGTCATACACACAAACACATAACCATAATTTACAAATCAGGTCATTTTTCAATGAAAAAT
+AAGCAAGGGAATGTGCTCTTCACATGATAAGGAATACACTCACATTGCTAAATTAAAGTTTCATCAAACA
+AGGCCCGTAATAGTTGTATGAGTTGTATTATTATTATTATTTACTTTTTAAAAGTGGGTATCTCCATCCA
+CTATATTACCAGTGTGTACAGGAAGCAATTTACAGCTCTTGACACATTTAATAAAGCTCAGAAACTTTCT
+GAATGGTCTCCTCTGGGGAGTGCTGCAGCATCTAGATCGGGAAGCTATGAATGCCCCACAATACACAGCA
+GAGCCCCACAGCAAAGAACCAGGCAGTCCAAATGCACTCACAACCAAGTGAGGGGACCTAGGGTTACCTG
+GCCTGTGAACCAGGAATGACTCAGAAGCCACAGCTGGAGAAGCTCAGTCTTAGAAAAGGGTGAGAGGCAG
+AGCAGAGCAGCTGCAGGCCTGGAAACAGCTGAGGGAAGGGGTAGTAGGGAGGGGCTAGCAGCAGCAGTCT
+TGCCCTGGGGAGGCCCTGGGGGCATTGGGCTGCACGTTTGCAAAGGGAGTGAGAGACTTTCTCTATTCCT
+AAGGAATAAACATTTCTTTCTTTCTTTTCTTTTTTAATTTTATGTACATTGGTGTTTTACCTCCATGTAT
+GTCTGTGAGGTGTCAGATCCCCTGGAACGGGGGTTACAGACAGTTGTGAACTGCCCTTGGTGCTGGAAAT
+TGAACCCAGATCTTCTAGAAGAGCAGCCAGTGTGTATGATTGCTGAGCCATCTCTCCAGTCCCATAACAA
+ACATTTCTTTTACAACTCCTAAGAAATCCTAGGTTGGAGAGATGGCTCAGCAGTTAAGATCACCTGCTGC
+TCTCACAGAGGACCTGGGTTTAGTTCCCAGCACTCAACATGGCACCTTGCAATCAGCCATAATTCCAGTT
+CCAGAGGATCTGATGCCCACTTCTGGCCTCTGTGGGCTTCTGCACTCATGTGGGGCACATAGTACATGCA
+GGCAAAACATTCATATTCATAAAATACAAATAAATAAACCTTGAAAAAAAAAGAAAGAACATAAACATAT
+GGCACACAGGGAAGGACAAGAATTAAATATAAAAGCACATAATGGCTACCAAGGGCCAACGACAGCAACT
+ATGGGGTCACCATAGCAGTAACTCCATTCTAATGCCTGCCCAGCCTTGCAAGCCTCCTGGATAGCCTCTT
+ACCAAAGAACTGAGCAAAATGAGCAGGCTGATTTTCCCTTTGGACTCCCTCCAAATCACTCATAATGAAC
+TGAAAGATTTACTAGAACAGAAAAAGATATCTTACCTCTAAACCTTCATCCAATCTGTTCCAAATAGAAA
+AGCTGCCCCCGCCCCGGGCCCCGCCACTGCCCCCTCCCCCACCAAATGGAAGTGCTGTAATGTTTTATGT
+TTTATTTCTTTGTAGATTTATTCTTTCGTTGTTGTAATTAATTTTTGTGTTTGTTTGCTATATATTATCA
+AGGTTCTCCACACAGAGCAAATAGATAGCAATAGATTAGATATGAATAACTTATGATTGGACATGGGTGA
+AAAGGTTATGGCGCAGGGAAGTTTCATCATCTCTGCAATTCAAAGCATGGGAATTGGTGGCATAATTCCT
+TCAAGTTGAAGGGGCAGATAGATGGAGGGCAAGGCTGTAAAGCTCTGCCAAGTTTTAGAGTAAAGGACCT
+ATAGTCCATGACAGGAGAAGATGAAATGTCCTGGTAAACCGAGAGAGAAAATTCACCCTTCTGTGTTAGA
+ATTGCCCAGGCTCTGTGAGTAGCTGGTGCCCCTCCTACTGGTGAGCTTTACCCAGGCTTTGTGAGCCCTA
+CCTACCTCCTATACTCAAATACTAAGTAAGCTTTGCTGGAAGTAAGAGAGATGACATATTGGAGCCTATG
+TGATATTTACATAAGACTAGCTCTCACCTTATGTGCTAGGTAACCTAAAACAGCACAGTTATAACATCCC
+TAAGGAAACCAAGGCATGGACTGCATCAGAGTCTTTCAACTTTCTCCAGGAAATGACTGGGCAATGGGTA
+GGAAATGAATAAGGAGCTTTAATCAGGGAGATTACCAGACATTAATTCCTCAGGTGAAAACATGCCTGAG
+CAATGATCTACAAAGTGGCCATGCAAACGTTACAACGCATTATGAAATGATGAAATTGAGCTAGGTGTGG
+TGACACACACCTTTAGTCCCCGCAGTCGGGGGGGGGGTGCACAACGGGGGTGGGGGGAGATGGGGGAAGG
+GGTCTGTGAGTCTGAGACCAGCCTGGCCTACTAGGTGAGTTCCAGGATAGCCAGGGCTTGAAAAAAAACA
+AAAAAAGAACGAAAGGAAGGAAAGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAA
+GGAAGGAAATGACTGAAATTGTTGGTGCCTATGTATATGATTTACTGTTGCCAGCAAAATTTAAATTATT
+ATATCTGAAGAGCACAGATAATCCCATATAAAGTACTTTTATTTAGTTTTACCTTGAAGTATTTACTCAG
+TGATTTATTCAAATAAAAGGACTAAAGCCAGGGGAAACGGTTGAGAGATTCTGGTTAGAAGCTTTTAAAT
+TAATGAAATCCACACATCTAATAACACTGAGATGACGGACAGAGTGTACGGAGACTACACTCACCTGCAC
+AAAGAAGTAGGTATCCACAAAAATTAGAGGGGAGACTGGAGTGAGTAAAACACCTGCCTACAAACTAAAC
+TGTTCATTTTGGTATGTCGAGAGGGCTCACGATGCTGTTAGAAACCTTGGAATCTCCTAATGTTCACTCT
+AGGTGTAATTTTAAATCACTTCAGGAGAATATCATGCAGATGACAGACTGGATCTGACAAAATTCACCAA
+CTCCTGTCCCCAATTGGATGTGAGCCATTCTGCAGGGAATAATCTAGGATTATGCAGGATTTAGACACTG
+TTCTGCCAACTCATAGTGTTTGTCATGATTCCATGGGCTTAGTGACTGGCTGTTCACTTTGGGGAAATGC
+CTATGCAAGTCAAGTCTTAAGTCATTTTTGGAGATTGATTGATTGATTGATTGATTTATGTATTTGGTTT
+TCCAAGACAAAGTTTCTCTGTATAGCCCTGACTGTCTGGAATTCACTTTGTAGACCAGGCTAGCCTCGAA
+CTCAGAAATCCGCCTGCCTCTGCCTCCTGAGTTCTGGGATTAAGTGTGTGCCACCATGCCCGGCTGGAGA
+TGTATTTCTGGCTATAGTTTTTGAACAAAATCTTTACTAAATTAGCAACCGAACACACAGAGACACACCT
+TCACACAATGTACCCCAACAGGTAAGACATAAAAATATTGAAATTAGCCTTATTTTAATTTTTAATTATT
+TTTTATTTTTCAGATTATAATATACTTGCATCATTTTCTCCTTCCCTTTCCTCCCTCCAGACCCTTCCAT
+ATACCTTTTTTCATTAATTGTTGCACATATAAATTCTATGTTGCATATATATGTGTATATATATATATAT
+ATGTATATATATTATATATTCCAAATACACAAATACAACTTAGTCTACATGTCAGTTGTATGTATGTTTT
+CAAGGCTGATCATTGGTATTGGGTAACTAGAGAGTTCTTCCCTGGAGAAGACTATGTCTGCCGCTCTCAG
+CATTCCTTAGTCACCTGTAGTTCTTTGTCTAGAGTTGGGGCCCCATGAGCTTTCCCACCGCCCACTTTAT
+GTCTCTGTTGGTGTTGTCCTTGTTCAGTTCATGTTTAGGCAGCCAGGATGGTGAGACTTCATGAGACTTC
+TGACATTCCTACAGACACAGTCTCACAGCAAACTCCTTGGTCCTTTGGCCCTTAGCAGTCTTTCACCCAC
+TCTTCTGGAGTGATCGCTGAGCCTTAGGTAGGAGTTGAGCTGAAGATGCTATCAGCTGGGACTAGGCTCC
+ATAGCTATGCCATTTGGTTGGTTATGGTTTTCTGTAATGCTCTCCATCTACACTGATCTGTAGCTACAAG
+GACAAGTTGTTGGAATATATTTCAGGATTAGGCTGGTTTAGCAAAGTGGTAGCTGTATGCTCTCCTCTAA
+GATCTATGGCTTCACTAGCCCTGGGCAGTTGGCTAGGTGCCCTATTCCTGGCAGGTTTTTCCTCTTGTTC
+AGCAAGCCTTAAGTCCAACTCAGGAGCTTGGTCACCATCAAAGTAAGGGTTATTGTGACATGCTGGTCAT
+TGCTGTGGTTCATAGGCATGATAGTTGAGAAGGGTTATAGGTTGCCTCCCTCCTTTGGCACCATGAATGT
+TAGTCCCCCAGGAGAAGGGTTTCAAGTCAGATCCAGCTTAGGAGCCTCTGGGCCCTGTCTGAAGCACACA
+GTGTCTTCAGCAATACAGGCTTGACTCCCACTTCTCGGAGGTAGCAATAGCCTGGAACGTTCTGGGAGTC
+TTTTGGACAACCCTGACCAACAACTCAAAAGAGGATTGTTCATGATTGGTGTGGTTTTTGTTCAATGGCC
+TTTGACTCTTGGTGGCAGCATCGTTTACCCAAAGCAACTGTATCTTAAATCTGAGTGGGAGAGGCTCAGA
+GAGGGTGTGGAGAAGCACCAGGCCCAGTACAGATCCTGACAAAGGCTTTGATTTGGGGATCTCAAGACAA
+AGGTGGCCATCTGAAGTATCAAAGTGGAACACATACAAAAGAATCCTAGTCAGAGTGACTGGATACTTCT
+CAGCAGAAACACTGCATTTCTGAAAGGGGCGAGGCCATCTGAAATGGAAAGTTCACTCTACAGTTCTATA
+CCTATCCAAAGTATTGCTGACATCAATGGTAGGAAAAGACATCTGAAATTACGAAAGCATGAAGAAATTT
+ATTTCACGGGGGCTCACAAAGAGGAGGGATTAACCAAGATGGCCCCCATCCCTGGTGACCAAGACGGGAT
+ACTCTGAAGACATCAGAGACAGGATCCTGGAGACATATAGATATAAGGCAGCCATAGCACAGTGGGCAGA
+GAGGATCCTGGCTTTCTGCTGGGGACTCCTATCAGCCTGGATCCTCACCACTGAGACCTGCATGACAAAT
+CTCAGTGCTGCTAGCATAAAGGAGATCTAAGCTCCCTTTGAGCTCCACAGAGCCTTCTGCAGCTGCCTCC
+TGAGAAACAAACAGGTGGGGCTAGGAAGGGCCAAACCCCTGCCTATCCAACTAAAATCCCTGTGAACTCA
+GCCTCTGCCCGGGTCACTAGGCAGCCACTGGAGATAGTAGTAAAAGCTGGTAGTGCGGCAGCCAGAGCCA
+GTGTGGTTCAGGCTTTACAGGAGGGACCCAAAAGAGCTGTGCAGAGACCAAATTCACACACCAGCCGTCT
+CTGGTCTCCAGACAAGCATGGGGAGCCCCAGGCTGAGTGCTGATGGAGAGTCCCAACTCTTCACAACAGC
+ACAGCTGCTGGCAGGCAAGACCATCAAGTGGGCGCGGGACACAGTCATGCTAGAAGGGCATTTTGCTAAA
+GGAAAACTAGGAGCCTAGGCTTTAGAGGGTTTGAAAGGGCAGGGACTGGATGAAAGAAAAGAGGACCTTT
+GACACTCTGGGGTCAGGAAGGGGATGGACAGAAAGCAGCACTAGGGCTACAGGAAGTTCAACTGTTCCCA
+GAACCCCCGGTTCCCACTTCCTGCCCAGTCAAGGCTCTACTTATTGCCCCAAGTGGCCACCTTTGTATAT
+AGGGGTGTTCGGAGGAAGCGGCTGGTCTTCATGTAAGCAGAGATCCTCTTCAGTACCTGAGAGTTAAAGA
+AAAACAGAAAACCAAACACTAAGATTTGCTCAGGGCCAGAGAGGTTAAACAGCTTCCTCCTGCCCACACT
+GCTATTACATGGCACCCATCTGGGATTAGCCTGGCCTCTGTCTCTCAGATTTTGTACTCTCTTGGCCCAC
+TCCCTTCCTGACCTCCCCATGCTCCTCTTGCACACATCTTGCCTGGGTTTGCACACTGACTATGAATCTG
+TCTAGCAGGACAAAGAGGCATGTTTTGAAGATAACCCCTATGTCATCCAGTGACTCTGCAAAACCAGGGT
+CAAAGTTTATAGATACAGATAGATCAGAAAAGAAAAGCCAGGCTCACTGTAATCATTCCCTAAGGAGAAA
+GCCACACTTCTGAACCTGGTACTGAGCTTCATCCAATCCCGTCTCCAGCCTCGATACAGGCATGGACAAG
+GTGCACTGGCATTTACATACGCTGTTGGCCTTCTAGCAGGAGCGTGAGCCCCTAGAATATCTCTAAAGAC
+ATCAATGAGGTCCAGAGCTGCCTGTATCCAAGGGAAGTCAAGGGAGACCCTGGCACAGCTCTGCTAGTGC
+TGAAGCCCTGAGCAAGTATAACTCCACAGTTTCACATTTGCTTAGCCAAGCAATGGGACTGATAGTCCCC
+TCCACCACGGAGGAACGCTGATCACTATAAGCCTGATCAACAGATGTAGGGTGCTCGGCACTGACACTGG
+CAGAAAGCAATTGCTCAATTCCTGATTCTTATCCTCAGCCCAAGAAAACATCAAGGCAAAGTTGACAGCC
+AAGGGGAAGAAGGACCAGGGCATCACCTCAAAGCGGGCCACAAAGTCCTTCAGGTTTGGGAAGGCGTCCA
+GGCACGTGGGTTCGAATATAAGGTGCAGGTCCAGGATATCGTAAGCCAGGAAATCTACAAAAGTAATCTG
+CAAAGGGCCACGAATGAGAGATGGCATGGAATTCTGTCCCCGAGAAGAATGACAATGCTCAACTTTTCCC
+CCTTTACCTTTTCACCAACAAACCATGTCCGCTGGCCCAGGAACTGTGAGAAGAGCTTCACCATTCCAGG
+GAGCTGCTCCAAGTATTCCACCTTCAGTTTCTCCTGAGGCAGAAAACAGCTGTCACCATGTAAGTTCCTG
+AAGAAGGACTGGCCCTTCAGTATCATGCACGGCCCTAAATAGCCAGAGGTGGGCAGCCGTCTCCACTAAG
+ATGGAGACTCAGTCACAGTCCAGGTCTCGATTTAACATATGAGCACTTACTTACTAGACGCCTCGGTAGC
+CCACCCCACCCTTCAAATGCAGTGGAAGACCAGGGAAGAATAAGACTGCCCACAGTGGGGCATCATCATT
+CTTGATGCCTACATGGCTGGGTACCCTGCGCACCTGGCAGGTACCAATGCCAGGCAGCATCAAAGAATCA
+AGAACTAAAATACTCAGAGGAGATGAGAGTCCACACAATGGGTCCAGAAGAATGAAATATACTTGAGTTT
+TCCCCATAAATTGAGGAGAATCTGAAGCAAAAAAGGAGAGAAAATCCTTCAGGCTGACCCTGTGCTACCT
+AAGGAATATGCTGACTCCAGGGGGTGTGAACAGAAGCACTCTGAGAGGCTGGAAAGGGTTCTGTGACAGT
+CCCTCATCTTGGACTTCATTTTCTAATCCATGGGATATTCCTGTATTTCGTGACCAGCAAAGATATGAAC
+TAGCTGGCAGAGGACACAGGTAGAAAACACTTGGAAAACACAACTCCACTATCTAGGAACTTAATGTCCA
+CTCTGGGAAGTCTGGACTGCACCCTAAAATAGGTGGGGCCTAAGTAAGAGCATTCATTTGATAGTTCAGT
+GTCATGGGGACAGCTGATGACAACACATGAAACATCCTGGAGGGTAAGAGGCTAAGAAATTCAAAAGGTA
+AATAAGATCTGTGCAGATTGGGAACCTGAGGGAGCTGACTCCTGACCCAAACGCTGCCTGGTCCAGCCCA
+GGGAGGGAGTGAGATGAGACTCACAAAGTCTGGGCTGTAACAGACTCGAGCCAGCTGATTGGAGACATCC
+ATAGCCTGGTTCTCCAAAATGTCCACGCGAATCTTCTCTTCCTCTGTCTCCCCACCTGCAGCACACACAG
+CACAGACGCATCTTCAACACCTCACACCAGCCAAGCCAAGGGGGTTGTCCCCCACCCCACCCTGCTGACG
+CCCCACTCACACAGGTTGTGCTTGCGGGCAATGTAGCGCAGGATGGCATTGCTCTGCGTGATCTTGTGTG
+ACCCATCAATCAAGTAAGGCAACTGCTTGGACAATAAAGGATGGTCAGACAGGATGTGAGGTCCCAGCCT
+TCCTTCTCTGGATTAGGACTGACATGTGGCCTGCCGTTCATCATGGCTAGCATGGGAAGCCCTCTACGAA
+CACACTGCACACTGAATGGAAAAGCTAGGACACAGCCCATCTCAGAAACGCCACCAGGAACGGAAGCAAG
+TGAACAGAAGGAAGCATTGACTCCAAGCCCCTCTTCCAGGTGAGGAGGTGAGATGACTTCCCCAAACCTC
+CCCGTCCCCAAACCTCCCCGTCCCCCTCTACCTACATTGGGAAAGTCCAGGCCCAATTTGAACTTCTCAC
+TCAGCCACTGGCTTCGGTCATAGTCAGGAGCTGTGATTGATGAGATTGCATAAAGAGAAATCAGCCCAGT
+GTCCAACACACCTATTTCAGGGACCCAACCCCCTAAGACAAGGATCTGTAACCTCACACAGTGAGCCTTT
+ATCCCTGAAAGTTTCAGGAATAAAGAGTGCTTGGAACTGAGGGGCCTACAAAGTTTAGGCAAATCTTGAT
+GAGGAATGTGATCTAATTTCAACAGGAGGGGTATCCACCCCTTCACTTCCAGGAGGAAGGAAGGTCAAGC
+CCAGCTCTCTGGGGCTTACTGAAAGAGGCAGATTCTAAGTCCATCTGGGGGAGCCCCGTAGTGCGGGAGG
+CGTGAAATGTAATGTAGGCACCAATTGCTAGGCGCCAATGGACAGGAGACGTCTGACCGACAAATAGCAT
+TACCGTCTCCCATGGTGTATCTCTTCTCTTCATAGCTTGAGCCTGTGTATTCTAGGAGCAGCCGAATAGC
+GTGAGCTAGCTGAAAAGATAGGAATGGGGGCGGGGGACTTCTGTGTGACTTCACCTCCTCCAAGTGCATA
+TACACGGGCGCACAAACCAGCAAGGACATGTGTGCAAGACCCTGAGCCAGACAAGCAAGGGATTCCTCTC
+GCCCCACCCCATCCTCACCCCCCAGCTCGCCTTCTCCTAACCCTGCAATCCTTCCCACTCATCAGGTGGT
+TGCTCACCCCACGGATGTCCCAGTAACCCAGTGTCATAGGCATGATGCTGGTGCCTGAGACTGAGTCCAC
+CGGCCTCCGGTGTGAGCAACTTTGACTTTATCCTAGTGCCCTCGGGTCCCTGGGCGGGGCTGGTGCCCGG
+CGTGATCCTGCCCCTCCAGACTCTGCCCCGCAACACACACCCTGCTCTACAGCTTTTCCCCTGACAGTCA
+CCTACAGAAGCCAAATTACTGAGCTCCGCCCCCAACAGACCGGAAAGCGTAATAAGCGATCCCGCCTGCA
+GTTCCGCCCAATCGTCCAGCCCGGTGCGCAGATGCAGAGCTCTAGTAACTCCCTGCCTTTGCCGTACCGC
+TCTGCCTTCTTACCCTGTTGCCTTCGGTCTGTGGGAGGGTCAGGCCACTAGGAGAGGCCCCAACCCGGAA
+GTGCTGATTAAGCACTTCAACCCTGCTGTCAAACAAGACAGACTTCTGCGGGGCTCTGCTGATGACAGTG
+GTCCCCAGGTGATGTTTTTCCGTCTGGCAGAGAACATGTTGTCTTTCTGTTCAGAAGTTCCTCATCTGAA
+TTCAAAGAGATTATTTCTATCACAGATAAACCCCTGACTCAAAAAAAGGGTGGAAAATGGAAAATTTGAA
+GTTTCATACATGGTCAGGATGTCTGTATTACCCAAGGACAGAGGCAGCTTTCTCTATTGAGTGACCGTCT
+TAGTGTGTACTTACACAGACCTAAGACTTCTGAAATGTCCGAGGGTACACAGGGTCAGCATGTGACAACT
+CAGGATTAGTTTATAGCAAATGCAATCAAGAGAGAGATGTCTGAGTTCTCAATAGCTGCTGTTCATAGCA
+AGGCACACTGTTTCTTCACACAACATTGGTTGTCGTATGTGTCCAAAGACTGTAGTCTGCGGAGGGGGGC
+TTAGGGAAGTGGGCTCTCATTTTGTACAACAGACATCACACACTCACTGCATTTTGTTATTACATTAAGA
+CTTCTACAAGATCACCAAATAAGAATTTTAAGCTCCATTATAAGTCTTCCTGAGTATATAGGGTCCATCT
+TTAACTAAAATGTGCTAAAGTGGCAGGTGACCATACTTACAATTCTTACTAATGTTAAGAATGTACTGTG
+GCAATGCATTGTGACAAAAATAATTCTCACATTCTAAGAGACTGGGACTTTCACTGGACCTCACAGCAAG
+AGTTAATAATACATCTTGAAGTCCCTTAAGCTTGCTATTCTACTATGTACTCTGTCTGTCTGTTTAGCCT
+TCCTTTAAGAGAATGATGTAAGAAGAAATACTTGTCTTGGATTTTCCATGTAATAAACTTCTATGACCTA
+AGATAACACGCAGCTGTAATCTGTTTTTTGGGTTTTTTGGTTGATGATGGTGGTGTTTTGAAGGTTTGGG
+GGGGTGGGGGGGGCAAGTTTTTGTTTGGTTTTTATTGTTGTTTCTCAGAGACAAGGATGTGTCACTCTCT
+GCATATCCCCAGATGTCCTAGAACTCACTGTATAGACCAGGCTGGCCTTGAATTCACGGAGATCCACCTG
+TCTCTGCCTCTGTGTGCTGAAATTAAAGGCGTGTGTCATCACACATGGACCTGTAATCCTTAATTGTGTA
+CTTGTCTTAGTGACTTTTCTACTGCTGTGGCAAAGCATGTGTTTTGCATGTTGGTTTATTGTGTTGCTTA
+ACTTGCCATTTCCCTTGCTTGTCTTCTGACTTCCCTACTCATTATCAAGCATAGAACTATTTCTACATAT
+CTCTCAATGTGTTTGTCTCTTTGATTTTTGGTGTTTGAGATTAGGGGTCTCACTATGTAGCCCAGGCTGC
+CCTGTAACTCACAGAGTTCCTTCTGCCAATTTTTGCTGTATGTATTTTTGTCCCAGTCCCACTATGGAAG
+TGATCATTTATAAAAAGGAATCCCGTAACCCTCCTGGGAGTTCTCCCAAATTATCTGTGCTTGCCTCGTT
+TTGAAAGGGTTATCTAGTTATCCTTGAATAAGCGTGATTTCTTGGCAGATAGTTTTAAATATCTGGAACA
+CTCTAAGGACTAGTTGGAATCAGAGTAAGTTAGTGTACAGGAGAAGGTGTCAGCCTCATGTCACTAGAGA
+CATGGTGGAACTCATATGGCTCCAGACGTGTGAGCATCCTTTGTGTCTCAAGATGAACAAAGTTCTGGGC
+GTTGGGTTCTTGTTTTTTGTTTCATCTTGTTGCTTAAAACAACAACAACTTATTTTTGACTATTTAATAC
+CTCCGTACAAGATAGACTTATTGAATAGAATTTAAACAGTTCCTGCTAGACAGGTTTAGATAGCCCTGAA
+GAAAGCAAGAAGCTGTGGCAGCCTGAGTAACCCCAAAATAAATTATAGTTGATTGGCCTTGCTGATAGGC
+TTGTAGTGACCTCAGGAGGACCCAAAAGTTTTGCAAGACTGGAGACTGGAATCCAGACCATGAGCAAAGG
+TAACTACCAGAGGGGCCTCTTAAAGGGAAGTTCCCTTCTGACTTTTTTTTTTTTAAAGCAGCTGTGAAAT
+TTTTGTTGTCAGAGTCATGTGAGGGTCATAAGTTAGCAACCTTCAGATCCAGAGCTGCAGCGGTGATCTG
+GGAGAAGGGCAGGGAAGTATTATAAACTCTTAAGTGACAGGTAGAGAGTACCTGGCATCAGACATAGAAG
+GATCAAGATGGGAGAAATAACGCTTTTGACTTGAGAGCTTCAAAGTGGAAAGAAGAGAAGGACTAATGGG
+CAGGAAAGGGCTCTGGATCTGAATGTTAATCTTTACCAGTGTCTGACATGGAAGCTGTCTACCTGTGACT
+CAGCATCTTCTAGAGGACTCCCAGGGATCTCATTTTAAGATTTCTATGGTTGGCTGGCAAGATGACTCAG
+CAATTAAGAACACTTAGTGCTCTTGCAGAGGACCTGGGTTTGGTTTCCATCACCTATATCAGGTAGCTCA
+CTACCTCCTGACCCACACCCCTTCTGGATTCCATGAGCACTTGCACACATGCAATACATACATACATACA
+TACATACATACATGGAGAATAATTAATAATTTTTTAAAATTTTCTACTGAGTGGTGGAGGCATGCATGTT
+TAATCCCAGAACTCAGAAGGCAGAGGCAATCAGATTTGTATGAGTTCGAGGCCAGCCTACCTACAGAATG
+AGTTCCAAGACAGCCAGAGCTACACAAAGAAACCCTGTCTCAAAAGAAACAAAAACAAACAAAACCATTT
+CCGATAGAATGGGTGCTCAGTGCTTTGGAGTGAATATGTACTCTTCAAGCTGATCAATACTAACCTTTGG
+ATAAATATTTTAGAGTTTCACAAAACCCTTCTTATGTTGACTAAAGGAAAATTCTTCTCAGGCCTTTAAA
+AATAGTCTCATGCTATAGACAATAGCTAGGGAGTTTGACATTGAAGCTAGTACCTCTTGATAGCATTGTC
+GGTGCTAAGTGAATCTCTTACACTCCTTGATTGAAGTTCATCACAATTACATCGCGCCTAAGATTGGAGG
+TGAAACTTTCAAACACATGGGCCAGCATCTTATTAACTCCTCAGGAGGTCATCCCCAGGCCTGGAAGGGA
+CTGATCTGGTATTCTTTAAAGCCACTGGTAAAGTGTTAAGCTAAGGGAGAAAGGGGGCTTGGTTTGTTTA
+GTTTTACAATTTTAGTAGCTCTAGTTTTCCCCTGAAAATGAATATATACATATATATATATAGCACAGTT
+AAAGCAATACAAGAAAGGAAAAGCTTCAATTTGCCCTGCGTGGAGAAGGGAGCTGGGAAGTCATCAGAGC
+ATATTCCACTGTGGGAGGCAGTCCATCCATTTAGAGGTTGGCAAGGTCTCCAAGATCTTGGCTCTGTTCC
+ACATCGCACACTACACTGATTAGCAGCACACAGCTGGCAGGAAGACATAGTTTTCAAACCTCTTTGGTTA
+CATCTGTGAAGTCACAGCACCGATGTTGGTTTATAATTGCTGCTGATTTATGTGCCATGAGACACTTTTG
+CTCAGCTACCTTTGAAATTACCTTTATAATATGAGCAGACATCCTGACCAGATAATGAGCACTCAAAGCT
+TCCTGGTTTTCTGCCTTTTTTTTTTTTTTTGAGTCACTTATTTGTAATAATTAGAAGCAATCTCTTCAAG
+TTCCTCCAAAGCCTGTCTTTTCCAGGTGATTAGAGTCTTGGTTACTGTTGCTGTGTTGAGCACGCTGCTT
+GCCTAAAGCTCTTCTCTGAAGTTCTAGATCCCATGCTCAACAGATATTTATTTAAATTGTATTAAAGCTA
+AAAGTTTAATTATTGTTTTATAGGTTTGGGGTTTCTTTTTGGAAGGGGTTGTTTGGTTTGGGGGGGTGTT
+TGTTATTGTTTTGACTGTGAGTTTTGTTTTAGTCTTGGGTTTGTTTTGTTGAAACAGCATCTCACACTAC
+AGCCCAGACCGTGCTGGAACTCACTATTATGCCCAAGCTGATTTCACACCCCCCCCCCGGGCAATATTTC
+TGCCTCATGCTCCTGAGAGCTGGGGCACAAGGTGCAAGCACACCACAGGGGGCTTCAGTGTTTCATTTGG
+ATCTGTTACTGCAGTTTTATAATTCTTGCCAAAGCCAGAAACCTCTACATTTCCAATGCATTCAATGTTG
+ACAGAACCTCACAATGATTTTGGATACCAGAAGTGATTTCATTCCAGAGTGGTCTATATTTTAAGGGTGA
+ATTTAGGTCTACGCTAACTTATTTAAAGATTTTTAGAAATATGTGTCTGTGAGCACAACTGCAAGTGGCC
+CAGGAGGCCAGAAGCATCAGATCTTCTCTGTAGCTGGAGTTACAGGAGGTTGGGAGCCACCCAGTGTGGG
+TGCTGGGACCCAAACTCAGGTCCTCTGCAAGAGCAGTGCATGCTCTTAACCCCTAAGCCATCTCTCCAGC
+CTCCGTCATCACATTTTTTAGAAAGATTTATCAGAAGAGGGCATCAGGTCCCATTACAGATGGTCGTGAG
+TCACCATGCAGTTGGTGGGAATTGAACTCAGGACCTCTGAAAAGAGCAGTCAGTGCTCTTAACCACAGAG
+CCATCTCTCCAGCCTCTGTCACCACATTTAAAAACATGAGAAAATAAAACCAGGTCAGGACTCTCCAAGC
+GACTTTATAACAAAAACAGTTAAGAATAAAGGAAGAGCATGGAGAGTTAACAAAACAATGCCGGGAGAAA
+AAGCTCTAGGAGCCTGGGTGCACAATAACTAAGTCACTTAGCATGCAAAATCCGCATCCTAAATCCACCC
+CTGTAGCATGCTATGTGCACTGGCTGGTTCTGTGTATCCGCTTGACACAAGCTGGAGCTATCACAGAGAA
+AGGAGCCTCCCTTGAAGAAATGCCTCCATGAGATCCAGCTGTAAGCGTTTTCTCAATTAGTGATCAAGGG
+TGGAAGGGCCCAGCCCATTGTGGGTGGTGCCATCCCTAGGCTAAGCAAGCTGAGCAAGCCAAGGGAAGCA
+AGCCAGTAAGTAGCATCCCTCCATGGCCTCTGCATCAGCTCCTGCTTCCTGACCTGCTTGAGTTCCAGTC
+CTGACTTCCTTTGGTGATGAACAGCAATATGAAAGTGAAAGCTGAATAAACCCTTTCCTCCCCAACTTGC
+TTCTTGGTCATGATGTTTGTGCAGGGATAGAAACCCTGACCATGACACTATATTTGTGACAGATTGCAGT
+CAATTTTCACACATAAACTAAAAACATACATGATATATTAAAGCTTACTTTCTTTGTATTTGAGACAGGG
+TCTCTGTAGGTAGCCCAGGCTGCCTCACAGTCTCCCTCTTTGTGCCTCTGCCTCTCAAGTGCTAGAATTA
+TACAATGTGCCACCATGCCCTACTAAAAGCTTTGGATAAGTTTTTAAGTACACAGTGTAGTTCTGCAAGC
+ATATTATCCTATAGCGGACACATAGAACATATTTCCCTTGAGTCCCTGAGGTGATGAACCCATTCTTTGC
+AGTTGGCATTTTCTGCTCCAGTAGCCGCAGGCAGACTCCATTCTCATACTACTCTGACTGTACAAGTTCC
+CGTTTGTTAAACATCTCCTCTACGCAATTATGCAATATATGTTTCCCTATGAATAGTTCATTTCACTTCC
+CTGGTTTATCCATGTTGTTGCATGTTGCAGAATTTCTCTCTTCATCAAGGCTACGTAATAGACCACATTT
+TCTCCATCCATTAACCCACTGATGAGCATTTAAGTGGTTTGCATGCCTTGACCATGGTTTTGTGCAGCAA
+TAAAAAGAAGAATAGCAGTATCTCTTTGAGATCTTGATTTCAATTGGGAGCAGCTTATATCACTGGACAT
+GGAAATATTGGAGTCATGTGATAGTTTCTTTTTTTTTTTTTAAACTACTGCCATACTGTTTTTCATAGGG
+TTTTTGACACTTTCTATTCTGCTAACAGTATATGAGCATTCTAGTTTCTGCACTTGTTATGGGGAACTGG
+GGCAGGTAGCTACTCTGGTCAGGTGTAAAATGATAGTTAATTGTGGGTTTGATGTTGAGAGTCTTTTAAC
+ACATCTGTTAAACGTTTTCATGTCTTCTTTGGAGAAATGTCTAATTCTGGTCCTTTGCAAAGATGGTTCA
+ACAAATGCAAATCAATCCATTGGACTTGCCATGTTAACAGAAGGAAAGACAAAAACCACATATCATTTAA
+ATAGATACAGAAATGTCATCTTGTGGAATTCAACCACTCTCATGATTAAATAACTCAAAAAATAAATACA
+GAAGTTTTTGATTATGTTTATACCCACATGTCAACTAGACACGTCCACATAATAAAAAGCTTACATCATA
+ATTGGCTGTAGAAACTGGGAATTTTTCTTCTCAAAGAGCATGCATATCTGTGTTCCATGTGCTTGATGCC
+CAAGAAGGTCAAGAGGGGGCACTGGATGCCCTGGAGTTACAGGCAGTTCTAAGGTGCCATGTGGGTGCTA
+GAAATCCAACCTGGCTCCCCTGCAAGAGCAGCCACTGCTCTTAACCACTAACCCATTCCTGCAGCCCCTT
+ACTCAAAGACCACAAACAAAACAAACAATGTTAGTTAGCTCTCACAGTTCCTGCTCAACGCAGCACCAGA
+GCTGTGGCACAGCAATCAGGCAAGCAAAAGAGATAATGACACCCACAGGGCAAAGAAAAGAAAATGTACC
+TTGATTTACAAATTTACTCATAGAAAACCTCGGTGTATGCTTCAGTGGTAGAGCACCTTCCTAGCACCTG
+CAAAGCCCAGAGTTCAATCCCTAGTACCAATAAAAACAAATAATTAAGGACGGGAGAAGTTATTCACTGG
+TTAAGAGCACATACTAGTCTTACATATAACCCAAGTTTGGTTGCCAGCACCTATACCGGATGGCTCACAA
+GTGCTTACAATCCCAGCCCTACAAGATCCAATACCCTCTCCTGGCTTCCTGCACTCACATACATATAACC
+ACAGACAAGTGCACATAATTTTTAAATTAGATACTATTTTGAACCAGTTAAAAGCACTGCCCTTCCCAAG
+AACCCGAGTTCAATTCCCAGCACCCACATCAAGTGACTCACAACCATTTTCTTCTTTCTTTTCAATGTGG
+CTGCCTTTAAAGTCTGGCCCCAGAGGGATCTGACACCTCTGGCCTCTTAGATACCTGCACTCATGTGCAG
+ACACCCACACAGAGACACAAAGGCATATAATTAACTGTAAGTACCATTTGTGAAAGAGCTAACCTTCCTC
+ATTCCCCGTAAAAAAACGAAGCTGAAACCTTCGACCATCAATAAAAGTCAAGTCACACCGGGTTTGAGGT
+TGAAATTTGAGACCTGATTCTATAAAACCCTTAGAAGAAAACAATGGTCTAAGCAAGACTTCTTGGAGAT
+GGTACCAGACACACAGACAGCAAAAGTAAAAATAGAGGCCAAGAGGGTGACACCAAGCTACAAAACTTCT
+ACGAGGTACAGGAAGCGACCAACAGAGTAGGGAAAACATCTGTGAATCTCATACCCAATGAGAGGTTGAT
+ACCGCAAATAAACCAAGAACACCCACCACTCAGCAGAAAAACATGCTTTAAAGCCTTGAACTATATACAT
+TTAAAATATAATTGATTTATTGGCCATGCTAAATAAAGTAAGTTTTAAAAATCATCAGTTAGTGATAAGT
+TTTCTAAACAAGCATAAAACCATTCCCCATTCTGCACACATAACTGGGTCTCTCTGTAATTACTGCAATC
+AGTGCAATTATAACTGTCCAGCCTATTTATTTTCTTTAATTAGAGTAACCAATCTTTTTTTCATTTTCTC
+TAGACACAAAATAATCCAAGAGGAAAAAAAAAGTTTTCTAAACTTTTGAGCTATTTTTAAATTTTTTAGC
+TTATTTTGGAGACAGTGTCTTAGTTATTCCAAGCTGCCCTCAAACTTCTTATGTAGCCTGGATGACCTTG
+AACTCCTGACCCTTCAGACCGAATCTCCTGAATGCTGGGCTCACAGTACTGCACCACAACACTTAGTTTA
+TGTGGTAGTGGGAATCAAACCCAGGGCATGCAGGGCAAACACTCTACCAACTGAACTACATGTCCAACCC
+CTTTAAATTTATTTTTAACTAATACATAAAAATTACACATATAATAGGTATCACATTATTTTTGATAAGC
+TTATACATCATGTACTGTTTAAATCATGTTAAATATATTTCACACTTAATTATTTCTTTATAGTTAAAAA
+CATCCGGACCCTTTCCTTCCAGCTTTCTGAAGAACACATGGCATTATTTGTAGTCATCTTCCTGTCCCAT
+AACACACCAAGGGTTTATTTTAACCATAACTTGGTACTTGAGCGATCTTTCTACAAGCATGTGTCTCACG
+TTACTCAAACCTTCTGTAAAGCTAAAACAAACACACATGTGTTAGCATCTCTAAAAGTTACTGCATTTTA
+AGAAATAAGGACAGAGTAAAATTTATTAAAATTTTAAAAACAAACTGACAACAACAACAAACAATGGGTT
+TGGCTTTCCCCACCTCCCCAATTCCTAAAACCTAGTTTCATTTGTCTGGAATCAGGAGGCCCTGTCCAGT
+TCTGAATGGTGATGGCTTTGCCTTTGACTGACTCACACACAGGCAAGTCACAGACAGCCATTCATTTGAC
+TTGCCCCTCTAGCTCTTCCCACTGAAACTGCCCCTTCTGACACATGTTTCTCCCCAGAGACAGCAAATTC
+TTGTGTCTACCTAATAACTTGTAATCCTAAAAGGCAGGCTCTTAAATGCTGTAATCTTACCTGTTGAAAT
+CCCAAGAGATTAAAATTTCTAACCTGAAAAAAAAATCACAACCTCAAAAGATTAAAATCCCAGATGCTGA
+AATCCCGAAAGCTTAAGACAGGCAGCCAGCCAGCCTGAGCGGGGAGAATCAGCGCGGTCTGGGGTTTGTA
+TACTGTTCACATCTTGCTAGGTGGAAAAAAAACCTAAGAATGTGGTGAAGCACTTTCAGGGCTTGCCTAG
+AAGGTGTTTCCCGAGAAGATTAGTGTGGGAGTTTGAGTGGACTATTCAGGGAAGATAACGCCTCAGTGTG
+GTACTGTTACCGGATCTTAGGGTCAGAGCCTGGGATCTGGCAGTCCCAACAAGGTAGGCTGAGCCATCTG
+TTCCCAACACACCAATTAAAGAGACAAGTGATATTGAAAAGCGTAGAAAGAAGAGTTATTCAACACCATC
+ACGTTGGAAAGAAGAACAAATAATGAGGCCAGGTGACACTCGAGTTGGTCTTTGGGGTACCAATGTAGGA
+AGTTGATCCCACCCAGCACTCACAATTTCTTGGCTCTGATCTGTCCTGCCTGGAGGTCTGACAAGTGTCA
+GACTGTGTGAAATCTCATCATGGTAAGCAAGTTCCTTCTTTGGTTGGCACAAAGACTTCAGCCTTTCCCT
+TCCTCCATCATGGGATTAATGGGAAATTTTTAACTTCTTGAAGCTCATTATTTCAATGGTGTGGTAGTAC
+AGACGTGTCTCTCTTTTGAATAGCCTTTCCCACCAGCACCATCCAGTATGCCCAGCTAAAAAGAACAAAC
+ACAGAAAGCTAACTTATCTCTGAAAGTGGAGACCAAGCCTGTTGCCACTGCCTTGGATACCAGCCCTCCA
+GAATCCCTGGCCTTTGGAGTCCAGCACCTGTGCCAGTGAAAGACCCACAACGTGAGGACTCCCCCACGTT
+CTGACTCAGGAGAGGCGACACCCCAAAATCACCCACAAGAAACGATCTTGCTGCAAACTGCAAGAGGATT
+TTTATTCAAGAGCACTCTCGAGCCCACGGTCATACACCACGCAGGGGTAGAGGACCGCAGTGCCCCGAGT
+AGCTGGATAAGGGAGTATTTAAAGGAAGAAACCACAACTCAAGGAGGTGGGAAGGGCGTTGTTGGAAAAT
+ACCAAAAATACCAGTTAAGAGTCACAAGGAAGAGCAAAGTCACAAGTGTCACAAGGAATACCTGGTAATT
+GTTAACTCTCAAGACAGTTTCTAAGAGCCCCTAACAGTAGCACATTTGCATTGCAGGTTCCGGTAATGGT
+CAGGTGACTTTCTTTGAATGAATACTCCTTGAACCCAGGAAGTGGGTGGGTGGAGGGATGTCGATATCTG
+TTTTATGACCAGCACACCTAGGAGCACTGAGTCACAAAGGCCACATTCCCAAGCCTGGGCCTAAAGGCCT
+AGAATTTTGTTTTTACGTTTACTCTTTCACCAGCATTTTTCAGATCCTTGTGCTTGCTTGCTTGCTTGCT
+TGCTTGCTTGCTTGCTTGCTTCCTTCCTTCCTTCCTTCTTCCCTCCCTCCCTCACTCACTCCCTCTCTTC
+CTCCCTTTCTTCCTTCCTTTCTTTCTCTCTCTTTCTTTCTTTCTCTTCCTTCCTTCCTTCCTTCCTTCCT
+TTCTGTCTTCCTTCCTTTCTTTCTCAGCTTTCTTCTTTCTTTCTCTCACTCTCTCTTTCCTTCTTTCTTT
+CGTTTTTTCTTTTTTAGCTAGGGTCTCTTCACCTACTTGTCAGCACTAGTTTTTACAATTTGCTAAGCCA
+TATAGTTCAGGTTTGCATTCTATTGAATCCTGGAAATATAGATTCTACAAAGACATCCAGAGTTCATTCA
+CGGCAAGCTTTCCCATTTTGTTTGCAAATTTTACTCTATGGAAATTCAGTTCCACCACAGTGACTTACTG
+AGCACTGCCCATGCATGTGAAAATGCTGAAACTTCCCGGTAAGTGAGGGCATGTTCTTTTTCACACATCT
+GCATTTGTGGAAGATGAAGTTTTTAAGGATTCCACCCCTTTGATCGAGTGCCTGTGCTGTGGGCTGTGGT
+GACTTGCAGAGGCTTTTTGTCTTGTTACAGATGGGCAGTGGCAGAGAGAAACGCACTGCCGCACATGGTT
+GAAGAAGCTGTACCACTAAGCCGCGGCCCAGAACCCAAGCTCTGGTTTCTCTTTCTTTTTTGTTTGTTTA
+TTCTTTGTTACTTTTTTTTTTTCAAGACAGAGTTTCTCTATAGCCCTGGCTGTCCTAGAACTCACTTTGT
+AGACCAGGCTGGCCTCGAACTCAGAAATCCACCTGCTTCTGCCTCCTGAGTGCTGGGATTAAAGGCGTGC
+GCCACCATGTGTTTGTTTATTCTTAGTTGAGGTTTTGTTTGGTTTTTGTTGTTGTTGTTTTCTTTTCTTT
+AGTCCTGGCTGTCCTGGAACTCACTGGGTAGACCAGACTGGCCTCAAACTAAGAGATCTGTTGCTGCCTC
+TGCCTCATGAATGTGGGACTAAAGGCGTGCACTGTCTTTTTCCTTTTCTATTTTATTATCCAAAGAGGAA
+TATGTATGAAGGAATCTTTACAGGGGTATGGGCAACTAACACCACTGAAGAAAATATCTCAAAATATCTC
+TTCCTCCCGCCAGCAACCATTAACTGTACTGAAAGCCTCAAGGTGGGAGGGAAAATTCTAATTAAAGTCA
+AAGACTACCATTGACCTGATAGCCTTCTGAGGCTCAGTGTGTAATGGGGGACTCCCTTTATTAACAGAGT
+TTTTCCTTCTCTGACCTTGTCTGGGGAATTCTACGCCCCAGACTCTCCTCTACCTGAGGAATGTCATGGA
+GCCTCAATTACTTTATAGGATCAATTTCCCTTCTCCTGACAAAAGGTGTTTAGCCAGCATCTGAGATTCC
+TGAAATGCTTCCCCACGCTAAAGAGGCATTCCAGGTCCCCTAAGCCCCCAGTGACCTTTGCCCATCCTGG
+ATGTCCCTACTATCAGTCCCCAAAAACTTCATAAGCCTTGAATCACCCTAAGTAAAGTTGATCTGCCTAC
+CGTCGAGGGGTCCCGGTGTGGTCGTTTACCATCTGTATTCACAGGGCTTCCGGACCCTCTGTTCAGCTTT
+TCCACTCCCTCTCCCTTGTGGTCCCAAATGGCCAATGATCCACAAGGGTAGGGAGTCCCACAAAGGTGGT
+TCGCTCCTGTGTGTTTTGAGACAGGGTCTTATTATGCAGCCACAGGCTGGCCTCAAACTTCAGAGCACCC
+TGCCTCTTCCCCCACTCCCGGTGTTGGGATTGTTGGGACGGCAGGTATGCAGGGCCATCATTGTTTTAGA
+TCAAGCAACAAAAGGCTTGGGTTAGAACTTCCAAGACAACTCACCTGGTAAACCCTATAGCATCATTGAA
+TATGGAGAGCTGTTCTTTTCTGCATAACAGAAAAGTTGGTACAACTCCAAAGCCAAAATTATCTTGAGTT
+ACTCTAGCCTTATTAGTACTCAGCCTGTTGTTGCTTACCTCTGTGTATGGTGTAATATTCCTATTATGGG
+AACTCCAAACTGCTTAATGAAATAAACTTTAAGGACAAACCAATTCTAGTAACGCCTGTCTGTTATGGCT
+TGGATGAGAAACGTCTCCCAACGTGCGTAAGCACTTGGTTCTGGTTTGCGAAGGTGAAGTATTGTAGAGG
+AAGCACATCACCAGGGATGAGTTTTATGGCTTTGTTTTTTATTTGTTTGTTTTGTTTAGGTTTTTAAGAC
+TTTTATTTTTGTTTGTTTTGATTTTTTGAGATAGGGTTTCTCTGTATAGCCCTGACTATCCTGGAACTCA
+CTCTAGAGACCAGGCTGTCCTCAAGTTCATAAGAGATGCATGTGCCTCTGCCTCCAGAGTGATGGAATTA
+AAGTCATGTACCACCACTGCCCAACAGCTTTGAGGTTTTATAACCCAGCCCTACCTCCTGTGTTGTAGTT
+CCCCCCACCCCCATTGTCTCCCACCACCCTAATTCCAGACTGCCTTTCAGGGCATATTGGTTCTACCCAC
+TCAAGAATTGTAAGTAAAAAATAAAAGCTTTCTTCCTAAAGCTACTCCGATCGGGGTGTTTTATCACAGC
+AATAAAAAAGTGAAGCATCCTCTTGGCCCACTGATGCTCTCCCCATCCCCTCCGCTAACCTACACAGGTT
+AACCTCACCTTAAACCCCAACATGTGGGAAGAGAGGCAAACAGAACTCTGTGGGTTCCAGACCAGCCAGG
+GCCACACGGTGAGGATGCTCCTCAGAAACAAAAGCAAAGTGATCACGTTGTCATTAACACTTCTATTTAC
+CTCATCCCTCCTTCTTAACAAATTCAATCCAAAAATAATATCAAAGCTTGTCATAGAATCCCTGATCTAC
+ACTTGTGGTTCTTCCTGGAATTACTTTTTGAGTCTTCCACCTGTGAGCCCTGCTCTATTCTGCTTGCTGA
+TTCGATTTACCCAGGTTGCCATTCAGCATTTATATGGACAATATTTCCCCACTTCATAAATCCTGGTACC
+AACCTCTGCCTCAAAGACCACGGAAAAGAGGGGAGGTGTGGTGGCTAGATTGAAAATGGTCCCGGTAGGC
+TCATGTATTTGAATACTTGTCTCCTGCACTCCGTGGCAGTATCTGGGAAGGTTACAGTCTGGGTAGGAGG
+TGGAGCCTTGCTGGAAGGAATCAGTCAGTCATAGGAGCAGGTTTTGTGTTTAGAGCCTTGCCTGACTTAA
+CAGCTACAAAAGAAGCACCCCCCCCCCACGACCCTACTACACCTCAATAGTCAACTTGAGGAAGC
+
diff --git a/seq/mgtt2_x.seq b/seq/mgtt2_x.seq
new file mode 100644
index 0000000..236d72b
--- /dev/null
+++ b/seq/mgtt2_x.seq
@@ -0,0 +1,12 @@
+>>mgtt2_x Length: 1089 January 26, 2000 04:00 Type: N Check: 1394 ..
+CTGAGTTGGG TCCACGAAAG CCCAGCTAGG CCATTACCGC GTCCGGGTGA GACTAAGGTC CTGGGCTGGA TTCCTGGCTC CACGGTCCGC TGGAGCAAAT
+CGCATAAGTC AGTCTGAGTG CGCGCGCCCT CAGCCCTGCT TTTGGTATAA AGTCCTCCAA AGCGTCTCCC TCCCCAANNN NGATCagCAg GtGTCAGCTA
+TCCAGAGGAG GAAATCGTTT GGCTTGGcCA ACTGAGGcTG TGCTGGACCC CAGCTTGCTG TTATCGAACG CAGTCGGCAC ACCATCTTGT GTCGCTACCG
+GCAATGGGCT TGGAGCTCTA CCTGGACCTG CTGTCACAAC CCAGCCGCGC TGTCTACATC tTCNGCCAaG AAGAATGGCA TCCCCTTCCA GACGCGTACC
+GTGGATATAC TCAAAGGGCA GCACATGAGC GAGCAATTCT CCCAGGTGAA CTGCTTAAAC AAAGTTCCTG TACTCAAAGA CGGAAGCTTC GTGTTGACCG
+AAAGCACAGC CATCTtGATT TACCTGAGTT CCAAGTACCA GGTGGCAGAC CACTGGTACC CGGCCGACCT ACAGGCCCGT GCCCAAGTCC ACGAATACCT
+GGGCTGGCAT GcCGACAACA TCCgtGGTAC TTtcgGAGTG CTCCTATGGA CCNAAgGTGT TgGGGCCACT CATTGgGGTc CAgGTTCCCC agGAGAAGGT
+GGAACgGAAC agAGATAGAA TGGTCCTGGt TCTGCaACAG CTGGAgGACA AGTTCTCAGG GACAGGsCTC CTGTTGGCAG CAGTGAGCTA GCGATCTCAT
+TCTCTGGAGA GTGATGCAGC GTGCTCTTGC TATACCTGTT GAGGACGGCT CAGCTGACAG CATGCGAGAA AGGTGGAGGC GTCTTGGTGC TGAGCTGTGT
+AGAGCTCATA GACATCTGGC ATCTGGACAA GCAGCAGAAA TGTACAGTAC CCCTTCGAGT CATGCACATG CACTCAATTG TAGATCCTGA TGGTTGACCA
+CATAAGACTA TTTGTGTTAA AAAAGGGGGC CGTCCCATTC CCTTATGATC GATACATACT GGCTCCTTTA CACATNGATG GAAAACTGC
diff --git a/seq/ms1.aa b/seq/ms1.aa
new file mode 100644
index 0000000..46f476a
--- /dev/null
+++ b/seq/ms1.aa
@@ -0,0 +1,6 @@
+>test m1
+MPMIL,
+MLLEY,
+MGDAP,
+MDTRX,
+MLCYN
diff --git a/seq/mu.lib b/seq/mu.lib
new file mode 100644
index 0000000..e7aec3e
--- /dev/null
+++ b/seq/mu.lib
@@ -0,0 +1,50 @@
+>GTM1_MOUSE GLUTATHIONE S-TRANSFERASE GT8.7 (EC 2.5.1.18) (GST 1-1) (CLASS-MU
+PMILGYWNVRGLTHPIRMLLEYTDSSYDEKRYTMGDAPDFDRSQWLNEKFKLGLDFPNLPYLIDGSHKIT
+QSNAILRY
+LARKHHLDGETEEERIRADIVENQVMDTRMQLIMLCYNPDFEKQKPEFLKTIPEKMKLYSEFLGKRPWFA
+GDKVTYVD
+FLAYDILDQYRMFEPKCLDAFPNLRDFLARFEGLKKISAYMKSSRYIATPIFSKMAHWSNK
+>GTM1_HUMAN GLUTATHIONE S-TRANSFERASE MU 1 (EC 2.5.1.18) (GSTM1-1) (HB SUBUNI
+PMILGYWDIRGLAHAIRLLLEYTDSSYEEKKYTMGDAPDYDRSQWLNEKFKLGLDFPNLPYLIDGAHKIT
+QSNAI
+LCYIARKHNLCGETEEEKIRVDILENQTMDNHMQLGMICYNPEFEKLKPKYLEELPEKLKLYSEFLGKRP
+WFAGN
+KITFVDFLVYDVLDLHRIFEPKCLDAFPNLKDFISRFEGLEKISAYMKSSRFLPRPVFSKMAVWGNK
+>GTMU_CRILO GLUTATHIONE S-TRANSFERASE Y1 (EC 2.5.1.18) (CHAIN 3) (CLASS-MU).
+PMILGYWNVRGLTNPIRLLLEYTDSSYEEKKYTMGDAPDSDRSQWLNEKFKLGLDFPNLPYLIDGSHKIT
+QSNAI
+LRYIARKHNLCGETEEERIRVDIVENQAMDTRMQLIMLCYNPDFEKQKPEFLKTIPEKMKMYSEFLGKRP
+WFAGD
+KVTLCGFLAYDVLDQYQMFEPKCLDPFPNLKDFLARFEGLKKISAYMKTSRFLRRPIFSKMAQWSNK
+>GTM1_RAT GLUTATHIONE S-TRANSFERASE YB1 (EC 2.5.1.18) (CHAIN 3) (CLASS-MU).
+PMILGYWNVRGLTHPIRLLLEYTDSSYEEKRYAMGDAPDYDRSQWLNEKFKLGLDFPNLPYLIDGSRKIT
+QSNAI
+MRYLARKHHLCGETEEERIRADIVENQVMDNRMQLIMLCYNPDFEKQKPEFLKTIPEKMKLYSEFLGKRP
+WFAGD
+KVTYVDFLAYDILDQYHIFEPKCLDAFPNLKDFLARFEGLKKISAYMKSSRYLSTPIFSKLAQWSNK
+>GTMU_RABIT GLUTATHIONE S-TRANSFERASE MU 1 (EC 2.5.1.18) (GST MU I) (CLASS-MU
+PMTLGYWDVRGLALPIRMLLEYTDTSYEEKKYTMGDAPNYDQSKWLSEKFTLGLDFPNLPYLIDGTHKLT
+QSNAI
+LRYLARKHGLCGETEEERIRVDILENQLMDNRFQLVNVCYSPDFEKLKPEYLKGLPEKLQLYSQFLGSLP
+WFAGD
+KITFADFLVYDVLDQNRIFVPGCLDAFPNLKDFHVRFEGLPKISAYMKSSRFIRVPVFLKKATWTGI
+>GTM4_HUMAN GLUTATHIONE S-TRANSFERASE MU 4 (EC 2.5.1.18) (GSTM4-4) (GTS-MU2)
+MSMTLGYWDIRGLAHAIRLLLEYTDSSYEEKKYTMGDAPDYDRSQWLNEKFKLGLDFPNLPYLIDGAHKI
+TQSNAILC
+YIARKHNLCGETEEEKIRVDILENQAMDVSNQLARVCYSPDFEKLKPEYLEELPTMMQHFSQFLGKRPWF
+VGDKITFV
+DFLAYDVLDLHRIFEPNCLDAFPNLKDFISRFEGLEKISAYMKSSRFLPKPLYTRVAVWGNK
+>GLNA_ANASP GLUTAMINE SYNTHETASE (EC 6.3.1.2) (GLUTAMATE--AMMONIA LIGASE).
+TTPQEVLKRIQDEKIELIDLKFIDTVGTWQHLTLYQNQIDESSFSDGVPFDGSSIRGWKAINESDMTMVL
+DPNTA
+WIDPFMEVPTLSIVCSIKEPRTGEWYNRCPRVIAQKAIDYLVSTGIGDTAFFGPEAEFFIFDSARFAQNA
+NEGYY
+FLDSVEGAWNSGKEGTADKPNLAYKPRFKEGYFPVSPTDSFQDIRTEMLLTMAKLGVPIEKHHHEVATGG
+QCELG
+FRFGKLIEAADWLMIYKYVIKNVAKKYGKTVTFMPKPIFGDNGSGMHCHQSIWKDGKPLFAGDQYAGLSE
+MGLYY
+IGGLLKHAPALLAITNPSTNSYKRLVPGYEAPVNLAYSQGNRSASIRIPLSGTNPKAKRLEFRCPDATSN
+PYLAF
+AAMLCAGIDGIKNKIHPGEPLDKNIYELSPEELAKVPSTPGSLELALEALENDHAFLTDTGVFTEDFIQN
+WIDYK
+LANEVKQMQLRPHPYEFSIYYDV
diff --git a/seq/musplfm.aa b/seq/musplfm.aa
new file mode 100644
index 0000000..f6bf80b
--- /dev/null
+++ b/seq/musplfm.aa
@@ -0,0 +1,9 @@
+>musplfm transl. of musplfm.seq, 2 to 676
+ MLPSLIQPCSWILLLLLVNSSLLWKNVASFP
+MCAMRNGRCFMSFEDTFELAGSLSHNISIEVS
+ELFTEFEKHYSNVSGLRDKSPMRCNTSFLPTP
+ENKEQARLTHYSALLKSGAMILDAWESPLDDL
+VSELSTIKNVPDIIISKATDIKKKINAVRNGV
+NALMSTMLQNGDEEKKNPAWFLQSDNEDARIH
+SLYGMISCLDNDFKKVDIYLNVLKCYMLKIDN
+C
diff --git a/seq/mwkw.aa b/seq/mwkw.aa
new file mode 100644
index 0000000..f992fdd
--- /dev/null
+++ b/seq/mwkw.aa
@@ -0,0 +1,31 @@
+>MWKW Myosin heavy chain - Caenorhabditis elegans
+MEHEKDPGWQYLRRTREQVLEDQSKPYDSKKNVWIPDPEEGYLAGEITATKGDQVTIVTAREMSVIQVTL
+KKELVQEMNPPKFEKTEDMSNLSFLNDASVLHNLRSRYAAMLIYTYSGLFCVVINPYKRLPIYTDSCARM
+FMGKRKTEMPPHLFAVSDEAYRNMLQDHENQSMLITGESGAGKTENTKKVICYFAAVGASQQEGGAEVDP
+NKKKVTLEDQIVQTNPVLEAFGNAKTVRNNNSSRFGKFIRIHFNKHGRLASCDIEHYLLEKSRVIRQAPG
+ERCYHIFYQIYSDFRPELKKELLLDLPIKDYWFVAQAELIIDGIDDVEEFQLTDEAFDILNFSAVEKQDC
+YRLMSAHMHMGNMKFKQRPREEQAEPDGTVEAEKASNMYGIGCE
+EFLKALTKPRVKVGTEWVSKGQNCEQVNWAVGAMAKGLYSRVFNWLVKKCNLTLDQKGIDRDYFIGVLDI
+AGFEIFDFNSFEQLWINFVNEKLQQFFNHHMFVLEQEEYAREGIQWVFIDFGLDLQACIELIEKPLGIIS
+MLDEECIVPKATDLTLASKLVDQHLGKHPNFEKPKPPKGKQGEAHFAMRHYAGTVRYNCLNWLEKNKDPL
+NDTVVSAMKQSKGNDLLVEIWQDYTTQEEAAAKAKEGGGGGKKKGKSGSFMTVSMLYRESLNNLMTMLNK
+THPHFIRCIIPNEKKQSGMIDAALVLNQLTCNGVLEGIRICRKGFPNRTLHPDFVQRYAILAAKEAKSDD
+DKKKCAEAIMSKLVNDGSLSEEMFRIGLTKVFFKAGVLAHLEDI
+RDEKLATILTGFQSQIRWHLGLKDRKRRMEQRAGLLIVQRNVRSWCTLRTWEWFKLYGKVKPMLKAGKEA
+EELEKINDKVKALEDSLAKEEKLRKELEESSAKLVEEKTSLFTNLESTKTQLSDAEERLAKLEAQQKDAS
+KQLSELNDQLADNEDRTADVQRAKKKIEAEVEALKKQIQDLEMSLRKAESEKQSKDHQIRSLQDEMQQQD
+EAIAKLNKEKKHQEEINRKLMEDLQSEEDKGNHQNKVKAKLEQTLDDLEDSLEREKRARADLDKQKRKVE
+GELKIAQENIDESGRQRHDLENNLKKKESELHSVSSRLEDEQALVSKLQRQIKDGQSRISELEEELENER
+QSRSKADRAKSDLQRELEELGEKLDEQGGATAAQVEVNKKREAE
+LAKLRRDLEEANMNHENQLGGLRKKHTDAVAELTDQLDQLNKAKAKVEKDKAQAVRDAEDLAAQLDQETS
+GKLNNEKLAKQFELQLTELQSKADEQSRQLQDFTSLKGRLHSENGDLVRQLEDAESQVNQLTRLKSQLTS
+QLEEARRTADEEARERQTVAAQAKNYQHEAEQLQESLEEEIEGKNEILRQLSKANADIQQWKARFEGEGL
+LKADELEDAKRRQAQKINELQEALDAANSKNASLEKTKSRLVGDLDDAQVDVERANGVASALEKKQKGFD
+KIIDEWRKKTDDLAAELDGAQRDLRNTSTDLFKAKNAQEELAEVVEGLRRENKSLSQEIKDLTDQLGEGG
+RSVHEMQKIIRRLEIEKEELQHALDEAEAALEAEESKVLRAQVE
+VSQIRSEIEKRIQEKEEEFENTRKNHARALESMQASLETEAKGKAELLRIKKKLEGDINELEIALDHANK
+ANADAQKNLKRYQEQVRELQLQVEEEQRNGADTREQFFNAEKRATLLQSEKEELLVANEAAERARKQAEY
+EAADARDQANEANAQVSSLTSAKRKLEGEIQAIHADLDETLNEYKAAEERSKKAIADATRLAEELRQEQE
+HSQHVDRLRKGLEQQLKEIQVRLDEAEAAALKGGKKVIAKLEQRVRELESELDGEQRRFQDANKNLGRAD
+RRVRELQFQVDEDKKNFERLQDLIDKLQQKLKTQKKQVEEAEELANLNLQKYKQLTHQLEDAEERADQAE
+NSLSKMRSKSRASASVAPGLQSSASAAVIRSPSRARASDF
diff --git a/seq/mwrtc1.aa b/seq/mwrtc1.aa
new file mode 100644
index 0000000..8eadfb3
--- /dev/null
+++ b/seq/mwrtc1.aa
@@ -0,0 +1,8 @@
+>MWRTC1 - Myosin heavy chain 1, cardiac muscle - Rat (fragment)
+/DLTEQLGEGGKNVHELEKIRKQLEVEKLELQSALEEAEASLEHEEGKILRAQLEFNQIKAEIE
+SKLAEKDEEMEQAKRNHLRVVDSLQTSLDAETRSRNEALRVKKKMEGDLNEMEIQLSQANRIAS
+EAQKHLKNAQAHLKDTQLQLDDAVRANDDLKENIAIVERRNTLLQAELEELRAVVEQTERSRKL
+AEQELIETSERVQLLHSQNNSLINQKKKMDADLSQLQTEVEEAVQECRNAEEKAKKAITDAAMM
+AEELKKEQDTSAHLERMKKNMEQTIKDLQHRLDEAEQIALKGGKKQLQKLEARVRELENELEAE
+QKRNAESVKGMRKSERRIKELNYQTEEDKKNLVRLQDLVNKLQLKVKAYKRQAEEAEEQANTNL
+SKFRKVQHELDEAEERADIAESQVNKLRAKSRDIGAKQKIHDEE*
diff --git a/seq/myosin_bp.aa b/seq/myosin_bp.aa
new file mode 100644
index 0000000..c3a4957
--- /dev/null
+++ b/seq/myosin_bp.aa
@@ -0,0 +1,20 @@
+>gi|46049110|ref|NP_996557| myosin binding protein C, slow type isoform 4; myosin-binding protein C, slow-type; skeletal muscle C-protein [Homo sapiens]
+MPEPTkkeenevpapapppeepskekeaGTTPAKDWTLVETPPGEEQAKQNANSQLSILF
+IEKPQGGTVKVGEDITFIAKVKAEDLLRKPTIKWFKGKWMDLASKAGKHLQLKETFERHS
+RVYTFEMQIIKAKDNFAGNYRCEVTYKDKFDSCSFDLEVHESTGTTPNIDIRSAFKRSGE
+GQEDAGELDFSGLLKRREVKQQEEEPQVDVWELLKNAKPSEYEKIAFQYGITDLRGmlkr
+lkrmrreekkSAAFAKILDPAYQVDKGGRVRFVVELADPKLEVKWYKNGQEIRPSTKYIF
+EHKGCQRILFINNCQMTDDSEYYVTAGDEKCSTELFVREPPIMVTKQLEDTTAYCGERVE
+LECEVSEDDANVKWFKNGEEIIPGPKSRYRIRVEGKKHILIIEGATKADAAEYSVMTTGG
+QSSAKLSVDLKPLKILTPLTDQTVNLGKEICLKCEISENIPGKWTKNGLPVQESDRLKVV
+HKGRIHKLVIANALTEDEGDYVFAPDAYNVTLPAKVHVIDPPKIILDGLDADNTVTVIAG
+NKLRLEIPISGEPPPKAMWSRGDKAIMEGSGRIRTESYPDSSTLVIDIAERDDSGVYHIN
+LKNEAGEAHASIkvkvvdfpdppvaptvtEVGDDWCIMNWEPPAYDGGSPILGYFIERKK
+KQSSRWMRLNFDLCKETTFEPKKMIEGVAYEVRIFAVNAIGISKPSMPSRPFVPLAVTSP
+PtlltvdsvtdttvtMRWRPPDHIGAAGLDGYVLEYCFEGTEDWIVANKDLIDKTKFTIT
+GLPTDAKIFVRVKAVNAAGASEPKYYSQPILVkeiieppkiriprHLKQTYIRRVGEAVN
+LVIPFQGKPRPELTWKKDGAEIDKNQINIRNSETDTIIFIRKAERSHSGKYDLQVKVDKF
+VETASIDIQIIDRPGPPQIVKIEDVWGENVALTWTPPKDDGNAAITGYTIQKADKKSMEW
+FTVIEHYHRTSATITELVIGNEYYFRVFSENMCGLSEDATMTKESAVIARDGKIYKNPVY
+EDFDFSEAPMFTQPLVNTYAIAGYNATLNCSVRGNPKPKITWMKNKVAIVDDPRYRMFSN
+QGVCTLEIRKPSPYDGGTYCCKAVNDLGTVEIECKLEVKVIAQ
\ No newline at end of file
diff --git a/seq/n0.aa b/seq/n0.aa
new file mode 100644
index 0000000..d34f73f
--- /dev/null
+++ b/seq/n0.aa
@@ -0,0 +1,4 @@
+>mgstm1
+MLLEYTD,
+MGDAPDFD
+
diff --git a/seq/n1.aa b/seq/n1.aa
new file mode 100644
index 0000000..ab84beb
--- /dev/null
+++ b/seq/n1.aa
@@ -0,0 +1,5 @@
+>tests from mgstm1
+MILGYW,
+MLLE,
+MGDAP,
+MLCYNP
diff --git a/seq/n2.aa b/seq/n2.aa
new file mode 100644
index 0000000..2d43da7
--- /dev/null
+++ b/seq/n2.aa
@@ -0,0 +1,28 @@
+>gi|345664 gi|345664|pir||A45388 protein-tyrosine kinase (EC 2.7.1.112) - chicken [MASS=116670](65:883)
+GSIEREDGGLQGPAGNQHIYQPVGKPDHAAPPK,
+LIGVITENPVWIIMELCTLGELRSFLQVR,
+KPPRPGAPHLGSLASLNSPVDSYNEGVK,
+EDGGLQGPAGNQHIYQPVGKPDHAAPPK,
+QVTVSWDSGGSDEAPPKPSRPGYPSPR,
+GANPTHLADFNQVQTIQYSNSEDKDR,
+LPMPPNCPPTLYSLMTKCWAYDPSR,
+PGAPHLGSLASLNSPVDSYNEGVK,
+GANPTHLADFNQVQTIQYSNSEDK,
+LSHLQSEEVHWLHLDMGVSNVR,
+QVTVSWDSGGSDEAPPKPSR,
+VFHYFENSSEPTTWASIIR,
+TLLATVDESLPVLPASTHR,
+RQVTVSWDSGGSDEAPPK,
+AQLSTILEEEKLQQEER,
+EKFELAHPPEEWKYELR,
+LAQQYVMTSLQQEYKK,
+FELAHPPEEWKYELR,
+LVNGATQSFIIRPQK,
+KQMLTAAHALAVDAK,
+SNDKVYENVTGLVK,
+QMLTAAHALAVDAK,
+GMGQVLPTHLMEER,
+PQEISPPPTANLDR,
+IQPAPPEEYVPMVK,
+GMGQVLPTHLMEER,
+QFANLNREESILK,
diff --git a/seq/n2_fs.lib b/seq/n2_fs.lib
new file mode 100644
index 0000000..de927e1
--- /dev/null
+++ b/seq/n2_fs.lib
@@ -0,0 +1,84 @@
+>GT8.7 | 40001 90043 | transl. of pa875.con, 19 to 675
+ILGYWN,
+DQYRMFEP,
+SRYIATP,
+KCLDAFP,
+EYTDS,
+SYDEKR
+>GT8.7 | 40001 90043 | transl. of pa875.con, 19 to 675
+ILGYWN,
+DQYRMFEP,
+SRYIATP,
+KCLDAFP,
+EYTDS,
+SYDEKR,
+YTMGD,
+EKQKPEFL,
+VRGLTHP,
+TRMQLI,
+FKLGLDFP,
+NLPYLI,
+DGSHKIT,
+LRYLAR,
+KTIPEK,
+KRPWFA,
+ETEEERIR,
+GDKVTYVD,
+HWSNK
+>tests from mgstm1
+MLLE,
+MILGYW,
+MGADP,
+MLCYNP
+>gi|345664 gi|345664|pir||A45388 protein-tyrosine kinase (EC 2.7.1.112) - chicken [MASS=116670](65:883)
+GANPTHLADF,
+QVTVSWDSGG,
+EDGGLQGPA,
+TLLATVDE,
+LSHLQSEE,
+PGAPHLGS,
+GANPTHLA
+>gi|345664 gi|345664|pir||A45388 protein-tyrosine kinase (EC 2.7.1.112) - chicken [MASS=116670](65:883)
+GSIEREDGGLQGPAGNQHIYQPVGKPDHAAPPK,
+LIGVITENPVWIIMELCTLGELRSFLQVR,
+KPPRPGAPHLGSLASLNSPVDSYNEGVK,
+EDGGLQGPAGNQHIYQPVGKPDHAAPPK,
+QVTVSWDSGGSDEAPPKPSRPGYPSPR,
+GANPTHLADFNQVQTIQYSNSEDKDR,
+LPMPPNCPPTLYSLMTKCWAYDPSR,
+PGAPHLGSLASLNSPVDSYNEGVK,
+GANPTHLADFNQVQTIQYSNSEDK,
+LSHLQSEEVHWLHLDMGVSNVR,
+QVTVSWDSGGSDEAPPKPSR,
+VFHYFENSSEPTTWASIIR,
+TLLATVDESLPVLPASTHR,
+RQVTVSWDSGGSDEAPPK,
+AQLSTILEEEKLQQEER,
+EKFELAHPPEEWKYELR,
+LAQQYVMTSLQQEYKK,
+FELAHPPEEWKYELR,
+LVNGATQSFIIRPQK,
+KQMLTAAHALAVDAK,
+SNDKVYENVTGLVK,
+QMLTAAHALAVDAK,
+GMGQVLPTHLMEER,
+PQEISPPPTANLDR,
+IQPAPPEEYVPMVK,
+GMGQVLPTHLMEER,
+QFANLNREESILK,
+>gi|345664 gi|345664|pir||A45388 protein-tyrosine kinase (EC 2.7.1.112) - chicken [MASS=116670](65:883)
+GANPTHLADF,
+QVTVSWDSGG,
+EDGGLQGPA,
+TLLATVDE,
+LSHLQSEE,
+PGAPHLGS,
+GANPTHLA,
+AQLSTILE,
+KPPRPGA,
+GSIERED,
+VFHYFEN,
+LIGVIT,
+LPMPP,
+RQVTV,
+QVTV
diff --git a/seq/n2s.aa b/seq/n2s.aa
new file mode 100644
index 0000000..4f1875c
--- /dev/null
+++ b/seq/n2s.aa
@@ -0,0 +1,8 @@
+>gi|345664 gi|345664|pir||A45388 protein-tyrosine kinase (EC 2.7.1.112) - chicken [MASS=116670](65:883)
+GANPTHLADF,
+QVTVSWDSGG,
+EDGGLQGPA,
+TLLATVDE,
+LSHLQSEE,
+PGAPHLGS,
+GANPTHLA
diff --git a/seq/n2t.aa b/seq/n2t.aa
new file mode 100644
index 0000000..f141ca3
--- /dev/null
+++ b/seq/n2t.aa
@@ -0,0 +1,16 @@
+>gi|345664 gi|345664|pir||A45388 protein-tyrosine kinase (EC 2.7.1.112) - chicken [MASS=116670](65:883)
+GANPTHLADF,
+QVTVSWDSGG,
+EDGGLQGPA,
+TLLATVDE,
+LSHLQSEE,
+PGAPHLGS,
+GANPTHLA,
+AQLSTILE,
+KPPRPGA,
+GSIERED,
+VFHYFEN,
+LIGVIT,
+LPMPP,
+RQVTV,
+QVTV
diff --git a/seq/n_fs.lib b/seq/n_fs.lib
new file mode 100644
index 0000000..9be7ba2
--- /dev/null
+++ b/seq/n_fs.lib
@@ -0,0 +1,20 @@
+>tests from mgstm1
+MLLE,
+MILGYW,
+MGADP,
+MLCYNP
+>GT8.7 | 40001 90043 | transl. of pa875.con, 19 to 675
+ILGYWN,
+DQYRMFEP,
+SRYIATP,
+KCLDAFP,
+EYTDS,
+SYDEKR
+>gi|345664 gi|345664|pir||A45388 protein-tyrosine kinase (EC 2.7.1.112) - chicken [MASS=116670](65:883)
+GANPTHLADF,
+QVTVSWDSGG,
+EDGGLQGPA,
+TLLATVDE,
+LSHLQSEE,
+PGAPHLGS,
+GANPTHLA
diff --git a/seq/ngt.aa b/seq/ngt.aa
new file mode 100644
index 0000000..c09be5a
--- /dev/null
+++ b/seq/ngt.aa
@@ -0,0 +1,20 @@
+>GT8.7 | 40001 90043 | transl. of pa875.con, 19 to 675
+ILGYWN,
+DQYRMFEP,
+SRYIATP,
+KCLDAFP,
+EYTDS,
+SYDEKR,
+YTMGD,
+EKQKPEFL,
+VRGLTHP,
+TRMQLI,
+FKLGLDFP,
+NLPYLI,
+DGSHKIT,
+LRYLAR,
+KTIPEK,
+KRPWFA,
+ETEEERIR,
+GDKVTYVD,
+HWSNK
diff --git a/seq/ngts.aa b/seq/ngts.aa
new file mode 100644
index 0000000..bec7ea8
--- /dev/null
+++ b/seq/ngts.aa
@@ -0,0 +1,7 @@
+>GT8.7 | 40001 90043 | transl. of pa875.con, 19 to 675
+ILGY*WN,
+EYTDS?,
+S?YDEKR,
+DQY*RMFEP,
+KCLDAFP,
+S*RY*IATP
diff --git a/seq/oohu.aa b/seq/oohu.aa
new file mode 100644
index 0000000..092d62a
--- /dev/null
+++ b/seq/oohu.aa
@@ -0,0 +1,6 @@
+>OOHU | 1358 rhodopsin - human
+MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRT
+PLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVC
+KPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVV
+HFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQG
+SNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA
diff --git a/seq/oohu.raa b/seq/oohu.raa
new file mode 100644
index 0000000..8c11539
--- /dev/null
+++ b/seq/oohu.raa
@@ -0,0 +1,7 @@
+>oohu.aa shuffled
+KLILINAIFT GLQNSGCTAQ PTPEFFVMAQ YLFAMVSNMG GVFFQTALLN SAGQGFYSWC
+IFIFMTYPGF MLFIQLTGAD FVTVNEGANL CMTFCTQVTA VEYAKPTPVN AAPSSYRILR
+VIGGPYQAIF HSIATVFINS PTTEELQFLR IVVIHIAFIV VAVPLTDPRA VKFNAGELTF
+GCIFYMQYYM VISLFAANPF YYAFIRVPFE VYCETELIMG PLCAKRYVLA AASNGAYLGW
+LKLLEVYSAF PSVKCLNMLR GHVFTTIPET QNAVMYKDVI SSTLFVLSEQ LSAWITSEYP
+VPGKCYWMPF GANTHKNINP DPFAEHEKEY ILVWMVCKFG LGMTVMAG
diff --git a/seq/prio_atepa.aa b/seq/prio_atepa.aa
new file mode 100644
index 0000000..9e31979
--- /dev/null
+++ b/seq/prio_atepa.aa
@@ -0,0 +1,5 @@
+>PRIO_ATEPA | 90377 | MAJOR PRION PROTEIN PRECURSOR (PRP) (PRP27-30) (PRP33-35C).
+MANLGYWMLVLFVATWSDLGLCKKRPKPGGWNTGGSRYPGQGSPGGNRYPPQGGGWGQPHGGGWGQPHGGGWGQP
+HGGGWGQPHGGGWGQAGGTHNQWNKPSKPKTNMKHMAGAAAAGAVVGGLGGYMLGSAMSRPLIHFGNDYEDRYYR
+ENMYRYPNQVYYRPVDQYNNQNNFVHDCVNITIKQHTVTTTTKGENLTETDVKMMERVVEQMCITQYERESQAYY
+QRGSSMVLFSSPPVILLISFLIFLIVG
diff --git a/seq/prot_test.lib b/seq/prot_test.lib
new file mode 100644
index 0000000..603a00b
--- /dev/null
+++ b/seq/prot_test.lib
@@ -0,0 +1,51 @@
+>HAHU | 1114 | Hemoglobin alpha chain - Human, chimpanzee, and pygmy chimpanzee
+VLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHGKKVADALTNAV
+AHVDDMPNALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHLPAEFTPAVHASLDKFLASVSTVLTSKY
+R
+>K1HUAG | 1091 | Ig kappa chain V-I region (Ag) - Human
+DIQMTQSPSSLSASVGDRVTITCQASQDINHYLNWYQQGPKKAPKILIYDASNLETGVPSRFSGSGFGTD
+FTFTISGLQPEDIATYYCQQYDTLPRTFGQGTKLEIKR/
+>CCHU | 1 | Cytochrome c - Human
+MGDVEKGKKIFIMKCSQCHTVEKGGKHKTGPNLHGLFGRKTGQAPGYSYTAANKNKGIIWGEDTLMEYLE
+NPKKYIPGTKMIFVGIKKKEERADLIAYLKKATNE
+>N2KF1U | 1021 | Long neurotoxin 1 - Many-banded krait
+IVCHTTATIPSSAVTCPPGENLCYRKMWCDAFCSSRGKVVELGCAATCPSKKPYEEVTCCSTDKCNHPPK
+RQPG
+>TPHUCS | 1322 | Troponin C, skeletal muscle - Human
+DTQQAEARSYLSEEMIAEFKAAFDMFDADGGGDISVKELGTVMRMLGQTPTKEELDAIIEEVDEDGSGTI
+DFEEFLVMMVRQMKEDAKGKSEEELAECFRIFDRNADGYIDPEELAEIFRASGEHVTDEEIESLMKDGDK
+NNDGRIDFDEFLKMMEGVQ
+>FEPE | 25 | Ferredoxin - Peptostreptococcus asaccharolyticus
+AYVINDSCIACGACKPECPVNIQQGSIYAIDADSCIDCGSCASVCPVGAPNPED
+>RKMDS | 677 | Ribulose-bisphosphate carboxylase (EC 4.1.1.39) small chain - Cry
+MRLTQGAFSFLPDLTDEQIVKQIQYAISKNWALNVEWTDDPHPRNAYWDLWGLPLFGIKDPAAVMFEINA
+CRKAKPACYVKVNAFDNSRGVESCCLSFIVQRPTSNEPGFQLIRSEVDSRNIRYTIQSYASTRPEGERY*
+
+>K3HU | 1099 | Ig kappa chain C region - Human
+/TVAAPSVFIFPPSDEQLKSGTASVVCLLNNFYPREAKVQWKVDNALQSGNSQESVTEQDSKDSTYSLSS
+TLTLSKADYEKHKVYACEVTHQGLSSPVTKSFNRGEC
+>HMIVV | 2581 | Hemagglutinin precursor - Influenza A virus (2 strains)
+MKTIIALSYIFCLVFAQDLPGNDNNSTATLCLGHHAVPNGTLVKTITNDQIEVTNATELVQSSSTGKICN
+NPHRILDGINCTLIDALLGDPHCDGFQNEKWDLFVERSKAFSNCYPYDVPDYASLRSLVASSGTLEFINE
+GFNWTGVTQNGGSSACKRGPDSGFFSRLNWLYKSGSTYPVQNVTMPNNDNSDKLYIWGVHHPSTDKEQTN
+LYVQASGKVTVSTKRSQQTIIPNVGSRPWVRGLSSRISIYWTIVKPGDILVINSNGNLIAPRGYFKMRTG
+KSSI
+MRSDAPIGTCSSECITPNGSIPNDKPFQNVNKITYGACPKYVKQNTLKLATGMRNVPEKQTRGIFGAIAG
+FIENGWEGMIDGWYGFRHQNSEGTGQAADLKSTQAAIDQINGKLNRVIEKTNEKFHQIEKEFSEVEGRIQ
+DLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTRRQLRENAEDMGNGCFKIYHKCDNAC
+IGSIRNGTYDHDVYRDEALNNRFQIKGVELKSGYKDWILWISFAISCFLLCVVLLGFIMWACQKGNIRCN
+ICI
+>OKBO2C | 296 | Protein kinase (EC 2.7.1.37), cAMP-dependent, catalytic chain - B
+GNAAAAKKGSEQESVKEFLAKAKEDFLKKWENPAQNTAHLDQFERIKTLGTGSFGRVMLVKHMETGNHYA
+MKILDKQKVVKLKQIEHTLNEKRILQAVNFPFLVKLEFSFKDNSNLYMVMEYVPGGEMFSHLRRIGRFSE
+PHARFYAAQIVLTFEYLHSLDLIYRDLKPENLLIDQQGYIQVTDFGFAKRVKGRTWTLCGTPEYLAPEII
+LSKGYNKAVDWWALGVLIYEMAAGYPPFFADQPIQIYEKIVSGKVRFPSHFSSDLKDLLRNLLQVDLTKR
+FGNLKDGVNDIKNHKWFATTDWIAIYQRKVEAPFIPKFKGPGDTSNFDDYEEEEIRVSINEKCGKEFSEF
+>GT8.7 | 266 | transl. of pa875.con, 19 to 675
+MPMILGYWNVRGLTHPIRMLLEYTDSSYDEKR
+YTMGDAPDFDRSQWLNEKFKLGLDFPNLPYLI
+DGSHKITQSNAILRYLARKHHLDGETEEERIR
+ADIVENQVMDTRMQLIMLCYNPDFEKQKPEFL
+KTIPEKMKLYSEFLGKRPWFAGDKVTYVDFLA
+YDILDQYRMFEPKCLDAFPNLRDFLARFEGLK
+KISAYMKSSRYIATPIFSKMAHWSNK
diff --git a/seq/prot_test.lseg b/seq/prot_test.lseg
new file mode 100644
index 0000000..1c63e8e
--- /dev/null
+++ b/seq/prot_test.lseg
@@ -0,0 +1,66 @@
+>sp|P69905|HBA_HUMAN Hemoglobin subunit alpha OS=Homo sapiens GN=HBA1 PE=1 SV=2
+MVLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHG
+KKVADALTNAVAHVDDMPNALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHLPAEFTP
+AVHASLDKFLASVSTVLTSKYR
+
+>sp|P00502|GSTA1_RAT Glutathione S-transferase alpha-1 OS=Rattus norvegicus GN=Gsta1 PE=1 SV=3
+MSGKPVLHYFNARGRMECIRWLLAAAGVEFDEKFIQSPEDLEKLKKDGNLMFDQVPMVEI
+DGMKLAQTRAILNYIATKYDLYGKDMKERALIDMYTEGILDLTEMIMQLVICPPDQKEAK
+TALAKDRTKNRYLPAFEKVLKSHGQDYLVGNRLTRVDIHLLELLLYVEEFDASLLTSFPL
+LKAFKSRISSLPNVKKFLQPGSQRKLPVDAKQIEEARKIFKF
+
+>sp|P01593|KV101_HUMAN Ig kappa chain V-I region AG OS=Homo sapiens PE=1 SV=1
+DIQMTQSPSSLSASVGDRVTITCQASQDINHYLNWYQQGPKKAPKILIYDASNLETGVPs
+rfsgsgfgtdftftisgLQPEDIATYYCQQYDTLPRTFGQGTKLEIKR
+
+>sp|P99998|CYC_PANTR Cytochrome c OS=Pan troglodytes GN=CYCS PE=1 SV=2
+MGDVEKGKKIFIMKCSQCHTVEKGGKHKTGPNLHGLFGRKTGQAPGYSYTAANKNKGIIW
+GEDTLMEYLENPKKYIPGTKMIFVGIKKKEERADLIAYLKKATNE
+
+>sp|P60615|NXL1A_BUNMU Alpha-bungarotoxin isoform A31 OS=Bungarus multicinctus PE=1 SV=1
+MKtllltlvvvtIVCLDLGYTIVCHTTATSPISAVTCPPGENLCYRKMWCDAFCSSRGKV
+VELGCAATCPSKKPYEEVTCCSTDKCNPHPKQRPG
+
+>sp|P02585|TNNC2_HUMAN Troponin C, skeletal muscle OS=Homo sapiens GN=TNNC2 PE=1 SV=2
+MTDQQAEARSYLSEEMIAEfkaafdmfdadgggdISVKELGTVMRMLGQTPTKEELDAII
+EEVDEDGSGTIDFEEFLVMMVRQMKEDAKGKSEEELAECFRIFDRNADGYIDPEELAEIF
+RASGEHVTDEEIESLMKDGDKNNDGRIDFDEFLKMMEGVQ
+
+>sp|P00193|FER_PEPAS Ferredoxin OS=Peptostreptococcus asaccharolyticus PE=1 SV=1
+AYVINDSCIACGACKPECPVNIQQGSIYAIDADSCIDCGSCASVCPVGAPNPED
+
+>sp|P14960|RBS_GUITH Ribulose bisphosphate carboxylase small chain OS=Guillardia theta GN=rbcS PE=3 SV=1
+MRLTQGAFSFLPDLTDEQIVKQIQYAISKNWALNVEWTDDPHPRNAYWDLWGLPLFGIKD
+PAAVMFEINACRKAKPACYVKVNAFDNSRGVESCCLSFIVQRPTSNEPGFQLIRSEVDSR
+NIRYTIQSYASTRPEGERY
+
+>sp|P01834|IGKC_HUMAN Ig kappa chain C region OS=Homo sapiens GN=IGKC PE=1 SV=1
+TVAAPSVFIFPPSDEQLKSGTASVVCLLNNFYPREAKVQWKVDNALQSGNSQESVTEQDS
+KDstyslsstltlsKADYEKHKVYACEVTHQGLSSPVTKSFNRGEC
+
+>sp|P03435|HEMA_I75A3 Hemagglutinin OS=Influenza A virus (strain A/Victoria/3/1975 H3N2) GN=HA PE=1 SV=1
+MKTIIALSYIFCLVFAQDLPGNDNNSTATLCLGHHAVPNGTLVKTITNDQIEVTNATELV
+QSSSTGKICNNPHRILDGINCTLIDALLGDPHCDGFQNEKWDLFVERSKAFSNCYPYDVP
+DYASLRSLVASSGTLEFINEGFNWTGVTQNGGSSACKRGPDSGFFSRLNWLYKSGSTYPV
+QNVTMPNNDNSDKLYIWGVHHPSTDKEQTNLYVQASGKVTVSTKRSQQTIIPNVGSRPWV
+RGLSSRISIYWTIVKPGDILVINSNGNLIAPRGYFKMRTGKSSIMRSDAPIGTCSSECIT
+PNGSIPNDKPFQNVNKITYGACPKYVKQNTLKLATGMRNVPEKQTRGIFGAIAGFIENGW
+EGMIDGWYGFRHQNSEGTGQAADLKSTQAAIDQINGKLNRVIEKTNEKFHQIEKEFSEVE
+GRIQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTRRQLRENAEDMG
+NGCFKIYHKCDNACIGSIRNGTYDHDVYRDEALNNRFQIKGVELKSGYKDWILWISFAIS
+CFLLCVVLLGFIMWACQKGNIRCNICI
+
+>sp|P00517|KAPCA_BOVIN cAMP-dependent protein kinase catalytic subunit alpha OS=Bos taurus GN=PRKACA PE=1 SV=3
+MGNAAAAKKGSEQESVKEFLAKAKEDFLKKWENPAQNTAHLDQFERIKTLGTGSFGRVML
+VKHMETGNHYAMKILDKQKVVKLKQIEHTLNEKRILQAVNFPFLVKLEFSFKDNSNLYMV
+MEYVPGGEMFSHLRRIGRFSEPHARFYAAQIVLTFEYLHSLDLIYRDLKPENLLIDQQGY
+IQVTDFGFAKRVKGRTWTLCGTPEYLAPEIILSKGYNKAVDWWALGVLIYEMAAGYPPFF
+ADQPIQIYEKIVSGKVRFPSHFSSDLKDLLRNLLQVDLTKRFGNLKNGVNDIKNHKWFAT
+TDWIAIYQRKVEAPFIPKFKGPGDTSNFDDYEEEEIRVSINEKCGKEFSEF
+
+>sp|P09488|GSTM1_HUMAN Glutathione S-transferase Mu 1 OS=Homo sapiens GN=GSTM1 PE=1 SV=3
+MPMILGYWDIRGLAHAIRLLLEYTDSSYEEKKYTMGDAPDYDRSQWLNEKFKLGLDFPNL
+PYLIDGAHKITQSNAILCYIARKHNLCGETEEEKIRVDILENQTMDNHMQLGMICYNpef
+eklkpkyleelpeklklYSEFLGKRPWFAGNKITFVDFLVYDVLDLHRIFEPKCLDAFPN
+LKDFISRFEGLEKISAYMKSSRFLPRPVFSKMAVWGNK
+
diff --git a/seq/prot_test_s.lseg b/seq/prot_test_s.lseg
new file mode 100644
index 0000000..c72b3ce
--- /dev/null
+++ b/seq/prot_test_s.lseg
@@ -0,0 +1,25 @@
+>sp|P09488|GSTM1_HUMAN GLUTATHIONE S-TRANSFERASE MU 1 (EC 2.5.1.18) (GSTM1-1) (HB SUBUNI
+MPMILGYWDIRGLAHAIRLLLEYTDSSYEEKKYTMGDAPDYDRSQWLNEKFKLGLDFPNLPYLIDGAHKITQSNAILCYIARKHNLCGETEEEKIRVDILENQTMDNHMQLGMICYNPEFEKLKPKYLEELPEKLKLYSEFLGKRPWFAGNKITFVDFLVYDVLDLHRIFEPNCLD^AFPNLKDFISRFEGLEKISAYMKSSRFLPRPVFTKMAVWGNK
+>XURTG | 266 | glutathione transferase (EC 2.5.1.18) Ya - rat
+MSGKPVLHYFNARGRMECIRWLLAAAGVEFDEKFIQSPEDLEKLKKDGNLMFDQVPMVEIDGMKLAQTRA
+ILNYIATKYDLYGKDMKERALIDMYTEGILDLTEMIMQLVICPPDQKEAKTALAKDRTKNRYLPAFEKVL
+KSHGQDYLVGNRLTRVDIHLLELLLYVEEFDASLLTSFPLLKAFKSRISSLPNVKKFLQPGSQRKLPMDA
+KQIEEARKIFKF
+>sp|P00517|KAPCA_BOVIN cAMP-dependent protein kinase catalytic subunit alpha; PKA
+GNAAAAKKGSEQESVKEFLAKAKEDFLKKWENPAQNTAHLDQFERIKTLGTGSFGRVMLV
+KHMETGNHYAMKILDKQKVVKLKQIEHTLNEKRILQAVNFPFLVKLEFSFKDNSNLYMVM
+EYVPGGEMFSHLRRIGRFSEPHARFYAAQIVLTFEYLHSLDLIYRDLKPENLLIDQQGYI
+QVTDFGFAKRVKGRTWTLCGTPEYLAPEIILSKGYNKAVDWWALGVLIYEMAAGYPPFFA
+DQPIQIYEKIVSGKVRFPSHFSSDLKDLLRNLLQVDLTKRFGNLKDGVNDIKNHKWFATT
+DWIAIYQRKVEAPFIPKFKGPGDTSNFDDYEEEEIRVSINEKCGKEFSEF
+>sp|P62161|CALM_RAT Calmodulin; CaM
+ADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMINEVDADGNGTID
+FPEFLTMMARKMKDTDSEEEIREAFRVFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIREA
+DIDGDGQVNYEEFVQMMTAK
+>sp|P10649|GSTM1_MOUSE Glutathione S-transferase Mu 1; GST 1-1; GST class-mu 1;
+MPMILGYWNVRGLTHPIRMLLEYTDSSYDEKRYTMGDAPDFDRSQWLNEKFKLGLDFPNL
+pylidgshkitqsnailrylarkhhldget
+EEERIRADIVENQVMDTRMQLIMLCYNPDF
+ekqkpeflktipekmklyseflgkrpwfag
+DKVTYVDFLAYDILDQYRMFEPKCLDAFPN
+LRDFLARFEGLKKISAYMKSSRYIATPIFSKMAHWSNK
diff --git a/seq/qrhuld.aa b/seq/qrhuld.aa
new file mode 100644
index 0000000..1ecbfd2
--- /dev/null
+++ b/seq/qrhuld.aa
@@ -0,0 +1,15 @@
+>QRHULD LDL receptor precursor - Human
+MGPWGWKLRWTVALLLAAAGTAVGDRCERNEFQCQDGKCISYKWVCDGSAECQDGSDESQETCLSVTCKS
+GDFSCGGRVNRCIPQFWRCDGQVDCDNGSDEQGCPPKTCSQDEFRCHDGKCISRQFVCDSDRDCLDGSDE
+ASCPVLTCGPASFQCNSSTCIPQLWACDNDPDCEDGSDEWPQRCRGLYVFQGDSSPCSAFEFHCLSGECI
+HSSWRCDGGPDCKDKSDEENCAVATCRPDEFQCSDGNCIHGSRQCDREYDCKDMSDEVGCVNVTLCEGPN
+KFKCHSGECITLDKVCNMARDCRDWSDEPIKECGTNECLDNNGGCSHVCNDLKIGYECLCPDGFQLVAQR
+RCEDIDECQDPDTCSQLCVNLEGGYKCQCEEGFQLDPHTKACKAVGSIAYLFFTNRHEVRKMTLDRSEYT
+SLIPNLRNVVA
+LDTEVASNRIYWSDLSQRMICSTQLDRAHGVSSYDTVISRDIQAPDGLAVDWIHSNIYWTDSVLGTVSVA
+DTKGVKRKTLFRENGSKPRAIVVDPVHGFMYWTDWGTPAKIKKGGLNGVDIYSLVTENIQWPNGITLDLL
+SGRLYWVDSKLHSISSIDVNGGNRKTILEDEKRLAHPFSLAVFEDKVFWTDIINEAIFSANRLTGSDVNL
+LAENLLSPEDMVLFHNLTQPRGVNWCERTTLSNGGCQYLCLPAPQINPHSPKFTCACPDGMLLARDMRSC
+LTEAEAAVATQETSTVRLKVSSTAVRTQHTTTRPVPDTSRLPGATPGLTTVEIVTMSHQALGDVAGRGNE
+KKPSSVRALSIVLPIVLLVFLCLGVFLLWKNWRLKNINSINFDNPVYQKTTEDEVHICHNQDGYSYPSRQ
+MVSLEDDVA
diff --git a/seq/titin_hum.aa b/seq/titin_hum.aa
new file mode 100644
index 0000000..8e2b064
--- /dev/null
+++ b/seq/titin_hum.aa
@@ -0,0 +1,431 @@
+>gi|108861911|sp|Q8WZ42|TITIN_HUMAN Titin (Connectin) (Rhabdomyosarcoma antigen MU-RMS-40.14)
+MTTQAPTFTQPLQSVVVLEGSTATFEAHISGFPVPEVSWFRDGQVISTSTLPGVQISFSDGRAKLTIPAVTKANSGRYSL
+KATNGSGQATSTAELLVKAETAPPNFVQRLQSMTVRQGSQVRLQVRVTGIPTPVVKFYRDGAEIQSSLDFQISQEGDLYS
+LLIAEAYPEDSGTYSVNATNSVGRATSTAELLVQGEEEVPAKKTKTIVSTAQISESRQTRIEKKIEAHFDARSIATVEMV
+IDGAAGQQLPHKTPHRIPPKPKSRSPTPPSIAAKAQLARQQSPSPIRHSPSPVRHVRAPTPSPVRSVSPAARISTSPIRS
+VRSPLLMRKTQASTVATGPEVPPPWKQEGYVASSSEAEMRETTLTTSTQIRTEERWEGRYGVQEQVTISGAAGAAASVSA
+SASYAAEAVATGAKEVKQDADKSAAVATVVAAVDMARVREPVISAVEQTAQRTTTTAVHIQPAQEQVRKEAEKTAVTKVV
+VAADKAKEQELKSRTKEVITTKQEQMHVTHEQIRKETEKTFVPKVVISAAKAKEQETRISEEITKKQKQVTQEAIRQETE
+ITAASMVVVATAKSTKLETVPGAQEETTTQQDQMHLSYEKIMKETRKTVVPKVIVATPKVKEQDLVSRGREGITTKREQV
+QITQEKMRKEAEKTALSTIAVATAKAKEQETILRTRETMATRQEQIQVTHGKVDVGKKAEAVATVVAAVDQARVREPREP
+GHLEESYAQQTTLEYGYKERISAAKVAEPPQRPASEPHVVPKAVKPRVIQAPSETHIKTTDQKGMHISSQIKKTTDLTTE
+RLVHVDKRPRTASPHFTVSKISVPKTEHGYEASIAGSAIATLQKELSATSSAQKITKSVKAPTVKPSETRVRAEPTPLPQ
+FPFADTPDTYKSEAGVEVKKEVGVSITGTTVREERFEVLHGREAKVTETARVPAPVEIPVTPPTLVSGLKNVTVIEGESV
+TLECHISGYPSPTVTWYREDYQIESSIDFQITFQSGIARLMIREAFAEDSGRFTCSAVNEAGTVSTSCYLAVQVSEEFEK
+ETTAVTEKFTTEEKRFVESRDVVMTDTSLTEEQAGPGEPAAPYFITKPVVQKLVEGGSVVFGCQVGGNPKPHVYWKKSGV
+PLTTGYRYKVSYNKQTGECKLVISMTFADDAGEYTIVVRNKHGETSASASLLEEADYELLMKSQQEMLYQTQVTAFVQEP
+KVGETAPGFVYSEYEKEYEKEQALIRKKMAKDTVVVRTYVEDQEFHISSFEERLIKEIEYRIIKTTLEELLEEDGEEKMA
+VDISESEAVESGFDLRIKNYRILEGMGVTFHCKMSGYPLPKIAWYKDGKRIKHGERYQMDFLQDGRASLRIPVVLPEDEG
+IYTAFASNIKGNAICSGKLYVEPAAPLGAPTYIPTLEPVSRIRSLSPRSVSRSPIRMSPARMSPARMSPARMSPARMSPG
+RRLEETDESQLERLYKPVFVLKPVSFKCLEGQTARFDLKVVGRPMPETFWFHDGQQIVNDYTHKVVIKEDGTQSLIIVPA
+TPSDSGEWTVVAQNRAGRSSISVILTVEAVEHQVKPMFVEKLKNVNIKEGSQLEMKVRATGNPNPDIVWLKNSDIIVPHK
+YPKIRIEGTKGEAALKIDSTVSQDSAWYTATAINKAGRDTTRCKVNVEVEFAEPEPERKLIIPRGTYRAKEIAAPELEPL
+HLRYGQEQWEEGDLYDKEKQQKPFFKKKLTSLRLKRFGPAHFECRLTPIGDPTMVVEWLHDGKPLEAANRLRMINEFGYC
+SLDYGVAYSRDSGIITCRATNKYGTDHTSATLIVKDEKSLVEESQLPEGRKGLQRIEELERMAHEGALTGVTTDQKEKQK
+PDIVLYPEPVRVLEGETARFRCRVTGYPQPKVNWYLNGQLIRKSKRFRVRYDGIHYLDIVDCKSYDTGEVKVTAENPEGV
+IEHKVKLEIQQREDFRSVLRRAPEPRPEFHVHEPGKLQFEVQKVDRPVDTTETKEVVKLKRAERITHEKVPEESEELRSK
+FKRRTEEGYYEAITAVELKSRKKDESYEELLRKTKDELLHWTKELTEEEKKALAEEGKITIPTFKPDKIELSPSMEAPKI
+FERIQSQTVGQGSDAHFRVRVVGKPDPECEWYKNGVKIERSDRIYWYWPEDNVCELVIRDVTAEDSASIMVKAINIAGET
+SSHAFLLVQAKQLITFTQELQDVVAKEKDTMATFECETSEPFVKVKWYKDGMEVHEGDKYRMHSDRKVHFLSILTIDTSD
+AEDYSCVLVEDENVKTTAKLIVEGAVVEFVKELQDIEVPESYSGELECIVSPENIEGKWYHNDVELKSNGKYTITSRRGR
+QNLTVKDVTKEDQGEYSFVIDGKKTTCKLKMKPRPIAILQGLSDQKVCEGDIVQLEVKVSLESVEGVWMKDGQEVQPSDR
+VHIVIDKQSHMLLIEDMTKEDAGNYSFTIPALGLSTSGRVSVYSVDVITPLKDVNVIEGTKAVLECKVSVPDVTSVKWYL
+NDEQIKPDDRVQAIVKGTKQRLVINRTHASDEGPYKLIVGRVETNCNLSVEKIKIIRGLRDLTCTETQNVVFEVELSHSG
+IDVLWNFKDKEIKPSSKYKIEAHGKIYKLTVLNMMKDDEGKYTFYAGENITSGKLTVAGGAISKPLTDQTVAESQEAVFE
+CEVANPDSKGEWLRDGKHLPLTNNIRSESDGHKRRLIIAATKLDDIGEYTYKVATSKTSAKLKVEAVKIKKTLKNLTVTE
+TQDAVFTVELTHPNVKGVQWIKNGVVLESNEKYAISVKGTIYSLRIKNCAIVDESVYGFRLGRLGASARLHVETVKIIKK
+PKDVTALENATVAFEVSVSHDTVPVKWFHKNVEIKPSDKHRLVSERKVHKLMLQNISPSDAGEYTAVVGQLECKAKLFVE
+TLHITKTMKNIEVPETKTASFECEVSHFNVPSMWLKNGVEIEMSEKFKIVVQGKLHQLIIMNTSTEDSAEYTFVCGNDQV
+SATLTVTPIMITSMLKDINAEEKDTITFEVTVNYEGISYKWLKNGVEIKSTDKCQMRTKKLTHSLNIRNVHFGDAADYTF
+VAGKATSTATLYVEARHIEFRKHIKDIKVLEKKRAMFECEVSEPDITVQWMKDDQELQITDRIKIQKEKYVHRLLIPSTR
+MSDAGKYTVVAGGNVSTAKLFVEGRDVRIRSIKKEVQVIEKQRAVVEFEVNEDDVDAHWYKDGIEINFQVQERHKYVVER
+RIHRMFISETRQSDAGEYTFVAGRNRSSVTLYVNAPEPPQVLQELQPVTVQSGKPARFCAVISGRPQPKISWYKEEQLLS
+TGFKCKFLHDGQEYTLLLIEAFPEDAAVYTCEAKNDYGVATTSASLSVEVPEVVSPDQEMPVYPPAIITPLQDTVTSEGQ
+PARFQCRVSGTDLKVSWYSKDKKIKPSRFFRMTQFEDTYQLEIAEAYPEDEGTYTFVASNAVGQVSSTANLSLEAPESIL
+HERIEQEIEMEMKEFSSSFLSAEEEGLHSAELQLSKINETLELLSESPVYSTKFDSEKEGTGPIFIKEVSNADISMGDVA
+TLSVTVIGIPKPKIQWFFNGVLLTPSADYKFVFDGDDHSLIILFTKLEDEGEYTCMASNDYGKTICSAYLKINSKGEGHK
+DTETESAVAKSLEKLGGPCPPHFLKELKPIRCAQGLPAIFEYTVVGEPAPTVTWFKENKQLCTSVYYTIIHNPNGSGTFI
+VNDPQREDSGLYICKAENMLGESTCAAELLVLLEDTDMTDTPCKAKSTPEAPEDFPQTPLKGPAVEALDSEQEIATFVKD
+TILKAALITEENQQLSYEHIAKANELSSQLPLGAQELQSILEQDKLTPESTREFLCINGSIHFQPLKEPSPNLQLQIVQS
+QKTFSKEGILMPEEPETQAVLSDTEKIFPSAMSIEQINSLTVEPLKTLLAEPEGNYPQSSIEPPMHSYLTSVAEEVLSPK
+EKTVSDTNREQRVTLQKQEAQSALILSQSLAEGHVESLQSPDVMISQVNYEPLVPSEHSCTEGGKILIESANPLENAGQD
+SAVRIEEGKSLRFPLALEEKQVLLKEEHSDNVVMPPDQIIESKREPVAIKKVQEVQGRDLLSKESLLSGIPEEQRLNLKI
+QICRALQAAVASEQPGLFSEWLRNIEKVEVEAVNITQEPRHIMCMYLVTSAKSVTEEVTIIIEDVDPQMANLKMELRDAL
+CAIIYEEIDILTAEGPRIQQGAKTSLQEEMDSFSGSQKVEPITEPEVESKYLISTEEVSYFNVQSRVKYLDATPVTKGVA
+SAVVSDEKQDESLKPSEEKEESSSESGTEEVATVKIQEAEGGLIKEDGPMIHTPLVDTVSEEGDIVHLTTSITNAKEVNW
+YFENKLVPSDEKFKCLQDQNTYTLVIDKVNTEDHQGEYVCEALNDSGKTATSAKLTVVKRAAPVIKRKIEPLEVALGHLA
+KFTCEIQSAPNVRFQWFKAGREIYESDKCSIRSSKYISSLEILRTQVVDCGEYTCKASNEYGSVSCTATLTVTEAYPPTF
+LSRPKSLTTFVGKAAKFICTVTGTPVIETIWQKDGAALSPSPNWKISDAENKHILELSNLTIQDRGVYSCKASNKFGADI
+CQAELIIIDKPHFIKELEPVQSAINKKVHLECQVDEDRKVTVTWSKDGQKLPPGKDYKICFEDKIATLEIPLAKLKDSGT
+YVCTASNEAGSSSCSATVTVREPPSFVKKVDPSYLMLPGESARLHCKLKGSPVIQVTWFKNNKELSESNTVRMYFVNSEA
+ILDITDVKVEDSGSYSCEAVNDVGSDSCSTEIVIKEPPSFIKTLEPADIVRGTNALLQCEVSGTGPFEISWFKDKKQIRS
+SKKYRLFSQKSLVCLEIFSFNSADVGEYECVVANEVGKCGCMATHLLKEPPTFVKKVDDLIALGGQTVTLQAAVRGSEPI
+SVTWMKGQEVIREDGKIKMSFSNGVAVLIIPDVQISFGGKYTCLAENEAGSQTSVGELIVKEPAKIIERAELIQVTAGDP
+ATLEYTVAGTPELKPKWYKDGRPLVASKKYRISFKNNVAQLKFYSAELHDSGQYTFEISNEVGSSSCETTFTVLDRDIAP
+FFTKPLRNVDSVVNGTCRLDCKIAGSLPMRVSWFKDGKEIAASDRYRIAFVEGTASLEIIRVDMNDAGNFTCRATNSVGS
+KDSSGALIVQEPPSFVTKPGSKDVLPGSAVCLKSTFQGSTPLTIRWFKGNKELVSGGSCYITKEALESSLELYLVKTSDS
+GTYTCKVSNVAGGVECSANLFVKEPATFVEKLEPSQLLKKGDATQLACKVTGTPPIKITWFANDREIKESSKHRMSFVES
+TAVLRLTDVGIEDSGEYMCEAQNEAGSDHCSSIVIVKESPYFTKEFKPIEVLKEYDVMLLAEVAGTPPFEITWFKDNTIL
+RSGRKYKTFIQDHLVSLQILKFVAADAGEYQCRVTNEVGSSICSARVTLREPPSFIKKIESTSSLRGGTAAFQATLKGSL
+PITVTWLKDSDEITEDDNIRMTFENNVASLYLSGIEVKHDGKYVCQAKNDAGIQRCSALLSVKEPATITEEAVSIDVTQG
+DPATLQVKFSGTKEITAKWFKDGQELTLGSKYKISVTDTVSILKIISTEKKDSGEYTFEVQNDVGRSSCKARINVLDLII
+PPSFTKKLKKMDSIKGSFIDLECIVAGSHPISIQWFKDDQEISASEKYKFSFHDNTAFLEISQLEGTDSGTYTCSATNKA
+GHNQCSGHLTVKEPPYFVEKPQSQDVNPNTRVQLKALVGGTAPMTIKWFKDNKELHSGAARSVWKDDTSTSLELFAAKAT
+DSGTYICQLSNDVGTATSKATLFVKEPPQFIKKPSPVLVLRNGQSTTFECQITGTPKIRVSWYLDGNEITAIQKHGISFI
+DGLATFQISGARVENSGTYVCEARNDAGTASCSIELKVKEPPTFIRELKPVEVVKYSDVELECEVTGTPPFEVTWLKNNR
+EIRSSKKYTLTDRVSVFNLHITKCDPSDTGEYQCIVSNEGGSCSCSTRVALKEPPSFIKKIENTTTVLKSSATFQSTVAG
+SPPISITWLKDDQILDEDDNVYISFVDSVATLQIRSVDNGHSGRYTCQAKNESGVERCYAFLLVQEPAQIVEKAKSVDVT
+EKDPMTLECVVAGTPELKVKWLKDGKQIVPSRYFSMSFENNVASFRIQSVMKQDSGQYTFKVENDFGSSSCDAYLRVLDQ
+NIPPSFTKKLTKMDKVLGSSIHMECKVSGSLPISAQWFKDGKEISTSAKYRLVCHERSVSLEVNNLELEDTANYTCKVSN
+VAGDDACSGILTVKEPPSFLVKPGRQQAIPDSTVEFKAILKGTPPFKIKWFKDDVELVSGPKCFIGLEGSTSFLNLYSVD
+ASKTGQYTCHVTNDVGSDSCTTMLLVTEPPKFVKKLEASKIVKAGDSSRLECKIAGSPEIRVVWFRNEHELPASDKYRMT
+FIDSVAVIQMNNLSTEDSGDFICEAQNPAGSTSCSTKVIVKEPPVFSSFPPIVETLKNAEVSLECELSGTPPFEVVWYKD
+KRQLRSSKKYKIASKNFHTSIHILNVDTSDIGEYHCKAQNEVGSDTCVCTVKLKEPPRFVSKLNSLTVVAGEPAELQASI
+EGAQPIFVQWLKEKEEVIRESENIRITFVENVATLQFAKAEPANAGKYICQIKNDGGMEENMATLMVLEPAVIVEKAGPM
+TVTVGETCTLECKVAGTPELSVEWYKDGKLLTSSQKHKFSFYNKISSLRILSVERQDAGTYTFQVQNNVGKSSCTAVVDV
+SDRAVPPSFTRRLKNTGGVLGASCILECKVAGSSPISVAWFHEKTKIVSGAKYQTTFSDNVCTLQLNSLDSSDMGNYTCV
+AANVAGSDECRAVLTVQEPPSFVKEPEPLEVLPGKNVTFTSVIRGTPPFKVNWFRGARELVKGDRCNIYFEDTVAELELF
+NIDISQSGEYTCVVSNNAGQASCTTRLFVKEPAAFLKRLSDHSVEPGKSIILESTYTGTLPISVTWKKDGFNITTSEKCN
+IVTTEKTCILEILNSTKRDAGQYSCEIENEAGRDVCGALVSTLEPPYFVTELEPLEAAVGDSVSLQCQVAGTPEITVSWY
+KGDTKLRPTPEYRTYFTNNVATLVFNKVNINDSGEYTCKAENSIGTASSKTVFRIQERQLPPSFARQLKDIEQTVGLPVT
+LTCRLNGSAPIQVCWYRDGVLLRDDENLQTSFVDNVATLKILQTDLSHSGQYSCSASNPLGTASSSARLTAREPKKSPFF
+DIKPVSIDVIAGESADFECHVTGAQPMRITWSKDNKEIRPGGNYTITCVGNTPHLRILKVGKGDSGQYTCQATNDVGKDM
+CSAQLSVKEPPKFVKKLEASKVAKQGESIQLECKISGSPEIKVSWFRNDSELHESWKYNMSFINSVALLTINEASAEDSG
+DYICEAHNGVGDASCSTALTVKAPPVFTQKPSPVGALKGSDVILQCEISGTPPFEVVWVKDRKQVRNSKKFKITSKHFDT
+SLHILNLEASDVGEYHCKATNEVGSDTCSCSVKFKEPPRFVKKLSDTSTLIGDAVELRAIVEGFQPISVVWLKDRGEVIR
+ESENTRISFIDNIATLQLGSPEASNSGKYICQIKNDAGMRECSAVLTVLEPARIIEKPEPMTVTTGNPFALECVVTGTPE
+LSAKWFKDGRELSADSKHHITFINKVASLKIPCAEMSDKGLYSFEVKNSVGKSNCTVSVHVSDRIVPPSFIRKLKDVNAI
+LGASVVLECRVSGSAPISVGWFQDGNEIVSGPKCQSSFSENVCTLNLSLLEPSDTGIYTCVAANVAGSDECSAVLTVQEP
+PSFEQTPDSVEVLPGMSLTFTSVIRGTPPFKVKWFKGSRELVPGESCNISLEDFVTELELFEVQPLESGDYSCLVTNDAG
+SASCTTHLFVKEPATFVKRLADFSVETGSPIVLEATYTGTPPISVSWIKDEYLISQSERCSITMTEKSTILEILESTIED
+YAQYSCLIENEAGQDICEALVSVLEPPYFIEPLEHVEAVIGEPATLQCKVDGTPEIRISWYKEHTKLRSAPAYKMQFKNN
+VASLVINKVDHSDVGEYSCKADNSVGAVASSAVLVIKERKLPPFFARKLKDVHETLGFPVAFECRINGSEPLQVSWYKDG
+VLLKDDANLQTSFVHNVATLQILQTDQSHIGQYNCSASNPLGTASSSAKLILSEHEVPPFFDLKPVSVDLALGESGTFKC
+HVTGTAPIKITWAKDNREIRPGGNYKMTLVENTATLTVLKVGKGDAGQYTCYASNIAGKDSCSAHLGVQEPPRFIKKLEP
+SRIVKQDEFTRYECKIGGSPEIKVLWYKDETEIQESSKFRMSFVDSVAVLEMHNLSVEDSGDYTCEAHNAAGSASSSTSL
+KVKEPPIFRKKPHPIETLKGADVHLECELQGTPPFHVSWYKDKRELRSGKKYKIMSENFLTSIHILNVDAADIGEYQCKA
+TNDVGSDTCVGSIALKAPPRFVKKLSDISTVVGKEVQLQTTIEGAEPISVVWFKDKGEIVRESDNIWISYSENIATLQFS
+RVEPANAGKYTCQIKNDAGMQECFATLSVLEPATIVEKPESIKVTTGDTCTLECTVAGTPELSTKWFKDGKELTSDNKYK
+ISFFNKVSGLKIINVAPSDSGVYSFEVQNPVGKDSCTASLQVSDRTVPPSFTRKLKETNGLSGSSVVMECKVYGSPPISV
+SWFHEGNEISSGRKYQTTLTDNTCALTVNMLEESDSGDYTCIATNMAGSDECSAPLTVREPPSFVQKPDPMDVLTGTNVT
+FTSIVKGTPPFSVSWFKGSSELVPGDRCNVSLEDSVAELELFDVDTSQSGEYTCIVSNEAGKASCTTHLYIKAPAKFVKR
+LNDYSIEKGKPLILEGTFTGTPPISVTWKKNGINVTPSQRCNITTTEKSAILEIPSSTVEDAGQYNCYIENASGKDSCSA
+QILILEPPYFVKQLEPVKVSVGDSASLQCQLAGTPEIGVSWYKGDTKLRPTTTYKMHFRNNVATLVFNQVDINDSGEYIC
+KAENSVGEVSASTFLTVQEQKLPPSFSRQLRDVQETVGLPVVFDCAISGSEPISVSWYKDGKPLKDSPNVQTSFLDNTAT
+LNIFKTDRSLAGQYSCTATNPIGSASSSARLILTEGKNPPFFDIRLAPVDAVVGESADFECHVTGTQPIKVSWAKDSREI
+RSGGKYQISYLENSAHLTVLKVDKGDSGQYTCYAVNEVGKDSCTAQLNIKERLIPPSFTKRLSETVEETEGNSFKLEGRV
+AGSQPITVAWYKNNIEIQPTSNCEITFKNNTLVLQVRKAGMNDAGLYTCKVSNDAGSALCTSSIVIKEPKKPPVFDQHLT
+PVTVSEGEYVQLSCHVQGSEPIRIQWLKAGREIKPSDRCSFSFASGTAVLELRDVAKADSGDYVCKASNVAGSDTTKSKV
+TIKDKPAVAPATKKAAVDGRLFFVSEPQSIRVVEKTTATFIAKVGGDPIPNVKWTKGKWRQLNQGGRVFIHQKGDEAKLE
+IRDTTKTDSGLYRCVAFNEHGEIESNVNLQVDERKKQEKIEGDLRAMLKKTPILKKGAGEEEEIDIMELLKNVDPKEYEK
+YARMYGITDFRGLLQAFELLKQSQEEETHRLEIEEIERSERDEKEFEELVSFIQQRLSQTEPVTLIKDIENQTVLKDNDA
+VFEIDIKINYPEIKLSWYKGTEKLEPSDKFEISIDGDRHTLRVKNCQLKDQGNYRLVCGPHIASAKLTVIEPAWERHLQD
+VTLKEGQTCTMTCQFSVPNVKSEWFRNGRILKPQGRHKTEVEHKVHKLTIADVRAEDQGQYTCKYEDLETSAELRIEAEP
+IQFTKRIQNIVVSEHQSATFECEVSFDDAIVTWYKGPTELTESQKYNFRNDGRCHYMTIHNVTPDDEGVYSVIARLEPRG
+EARSTAELYLTTKEIKLELKPPDIPDSRVPIPTMPIRAVPPEEIPPVVAPPIPLLLPTPEEKKPPPKRIEVTKKAVKKDA
+KKVVAKPKEMTPREEIVKKPPPPTTLIPAKAPEIIDVSSKAEEVKIMTITRKKEVQKEKEAVYEKKQAVHKEKRVFIESF
+EEPYDELEVEPYTEPFEQPYYEEPDEDYEEIKVEAKKEVHEEWEEDFEEGQEYYEREEGYDEGEEEWEEAYQEREVIQVQ
+KEVYEESHERKVPAKVPEKKAPPPPKVIKKPVIEKIEKTSRRMEEEKVQVTKVPEVSKKIVPQKPSRTPVQEEVIEVKVP
+AVHTKKMVISEEKMFFASHTEEEVSVTVPEVQKEIVTEEKIHVAVSKRVEPPPKVPELPEKPAPEEVAPVPIPKKVEPPA
+PKVPEVPKKPVPEEKKPVPVPKKEPAAPPKVPEVPKKPVPEEKIPVPVAKKKEAPPAKVPEVQKRVVTEEKITIVTQREE
+SPPPAVPEIPKKKVPEERKPVPRKEEEVPPPPKVPALPKKPVPEEKVAVPVPVAKKAPPPRAEVSKKTVVEEKRFVAEEK
+LSFAVPQRVEVTRHEVSAEEEWSYSEEEEGVSISVYREEEREEEEEAEVTEYEVMEEPEEYVVEEKLHIISKRVEAEPAE
+VTERQEKKIVLKPKIPAKIEEPPPAKVPEAPKKIVPEKKVPAPVPKKEKVPPPKVPEEPKKPVPEKKVPPKVIKMEEPLP
+AKVTERHMQITQEEKVLVAVTKKEAPPKARVPEEPKRAVPEEKVLKLKPKREEEPPAKVTEFRKRVVKEEKVSIEAPKRE
+PQPIKEVTIMEEKERAYTLEEEAVSVQREEEYEEYEEYDYKEFEEYEPTEEYDQYEEYEEREYERYEEHEEYITEPEKPI
+PVKPVPEEPVPTKPKAPPAKVLKKAVPEEKVPVPIPKKLKPPPPKVPEEPKKVFEEKIRISITKREKEQVTEPAAKVPMK
+PKRVVAEEKVPVPRKEVAPPVRVPEVPKELEPEEVAFEEEVVTHVEEYLVEEEEEYIHEEEEFITEEEVVPVIPVKVPEV
+PRKPVPEEKKPVPVPKKKEAPPAKVPEVPKKPEEKVPVLIPKKEKPPPAKVPEVPKKPVPEEKVPVPVPKKVEAPPAKVP
+EVPKKPVPEKKVPVPAPKKVEAPPAKVPEVPKKLIPEEKKPTPVPKKVEAPPPKVPKKREPVPVPVALPQEEEVLFEEEI
+VPEEEVLPEEEEVLPEEEEVLPEEEEVLPEEEEIPPEEEEVPPEEEYVPEEEEFVPEEEVLPEVKPKVPVPAPVPEIKKK
+VTEKKVVIPKKEEAPPAKVPEVPKKVEEKRIILPKEEEVLPVEVTEEPEEEPISEEEIPEEPPSIEEVEEVAPPRVPEVI
+KKAVPEAPTPVPKKVEAPPAKVSKKIPEEKVPVPVQKKEAPPAKVPEVPKKVPEKKVLVPKKEAVPPAKGRTVLEEKVSV
+AFRQEVVVKERLELEVVEAEVEEIPEEEEFHEVEEYFEEGEFHEVEEFIKLEQHRVEEEHRVEKVHRVIEVFEAEEVEVF
+EKPKAPPKGPEISEKIIPPKKPPTKVVPRKEPPAKVPEVPKKIVVEEKVRVPEEPRVPPTKVPDVLPPKEVVPEKKVPVP
+PAKKPEAPPPKVPEAPKEVVPEKKVPVPPPKKPEVPPTKVPEVPKAAVPEKKVPEAIPPKPESPPPEVPEAPKEVVPEKK
+VPAAPPKKPEVTPVKVPEAPKEVVPEKKVPVPPPKKPEVPPTKVPEVPKVAVPEKKVPEAIPPKPESPPPEVFEEPEEVA
+LEEPPAEVVEEPEPAAPPQVTVPPKKPVPEKKAPAVVAKKPELPPVKVPEVPKEVVPEKKVPLVVPKKPEAPPAKVPEVP
+KEVVPEKKVAVPKKPEVPPAKVPEVPKKPVLEEKPAVPVPERAESPPPEVYEEPEEIAPEEEIAPEEEKPVPVAEEEEPE
+VPPPAVPEEPKKIIPEKKVPVIKKPEAPPPKEPEPEKVIEKPKLKPRPPPPPPAPPKEDVKEKIFQLKAIPKKKVPEKPQ
+VPEKVELTPLKVPGGEKKVRKLLPERKPEPKEEVVLKSVLRKRPEEEEPKVEPKKLEKVKKPAVPEPPPPKPVEEVEVPT
+VTKRERKIPEPTKVPEIKPAIPLPAPEPKPKPEAEVKTIKPPPVEPEPTPIAAPVTVPVVGKKAEAKAPKEEAAKPKGPI
+KGVPKKTPSPIEAERRKLRPGSGGEKPPDEAPFTYQLKAVPLKFVKEIKDIILTESEFVGSSAIFECLVSPSTAITTWMK
+DGSNIRESPKHRFIADGKDRKLHIIDVQLSDAGEYTCVLRLGNKEKTSTAKLVVEELPVRFVKTLEEEVTVVKGQPLYLS
+CELNKERDVVWRKDGKIVVEKPGRIVPGVIGLMRALTINDADDTDAGTYTVTVENANNLECSSCVKVVEVIRDWLVKPIR
+DQHVKPKGTAIFACDIAKDTPNIKWFKGYDEIPAEPNDKTEILRDGNHLYLKIKNAMPEDIAEYAVEIEGKRYPAKLTLG
+EREVELLKPIEDVTIYEKESASFDAEISEADIPGQWKLKGELLRPSPTCEIKAEGGKRFLTLRKVKLDQAGEVLYQALNA
+ITTAILTVKEIELDFAVPLKDVTVPERRQARFECVLTREANVIWSKGPDIIKSSDKFDIIADGKKHILVINDSQFDDEGV
+YTAEVEGKKTSARLFVTGIRLKFMSPLEDQTVKEGETATFVCELSHEKMHVVWFKNDAKLHTSRTVLISSEGKTHKLEMK
+EVTLDDISQIKAQVKELSSTAQLKVLEADPYFTVKLHDKTAVEKDEITLKCEVSKDVPVKWFKDGEEIVPSPKYSIKADG
+LRRILKIKKADLKDKGEYVCDCGTDKTKANVTVEARLIKVEKPLYGVEVFVGETAHFEIELSEPDVHGQWKLKGQPLTAS
+PDCEIIEDGKKHILILHNCQLGMTGEVSFQAANAKSAANLKVKELPLIFITPLSDVKVFEKDEAKFECEVSREPKTFRWL
+KGTQEITGDDRFELIKDGTKHSMVIKSAAFEDEAKYMFEAEDKHTSGKLIIEGIRLKFLTPLKDVTAKEKESAVFTVELS
+HDNIRVKWFKNDQRLHTTRSVSMQDEGKTHSITFKDLSIDDTSQIRVEAMGMSSEAKLTVLEGDPYFTGKLQDYTGVEKD
+EVILQCEISKADAPVKWFKDGKEIKPSKNAVIKADGKKRMLILKKALKSDIGQYTCDCGTDKTSGKLDIEDREIKLVRPL
+HSVEVMETETARFETEISEDDIHANWKLKGEALLQTPDCEIKEEGKIHSLVLHNCRLDQTGGVDFQAANVKSSAHLRVKP
+RVIGLLRPLKDVTVTAGETATFDCELSYEDIPVEWYLKGKKLEPSDKVVPRSEGKVHTLTLRDVKLEDAGEVQLTAKDFK
+THANLFVKEPPVEFTKPLEDQTVEEGATAVLECEVSRENAKVKWFKNGTEILKSKKYEIVADGRVRKLVIHDCTPEDIKT
+YTCDAKDFKTSCNLNVVPPHVEFLRPLTDLQVREKEMARFECELSRENAKVKWFKDGAEIKKGKKYDIISKGAVRILVIN
+KCLLDDEAEYSCEVRTARTSGMLTVLEEEAVFTKNLANIEVSETDTIKLVCEVSKPGAEVIWYKGDEEIIETGRYEILTE
+GRKRILVIQNAHLEDAGNYNCRLPSSRTDGKVKVHELAAEFISKPQNLEILEGEKAEFVCSISKESFPVQWKRDDKTLES
+GDKYDVIADGKKRVLVVKDATLQDMGTYVVMVGAARAAAHLTVIEKLRIVVPLKDTRVKEQQEVVFNCEVNTEGAKAKWF
+RNEEAIFDSSKYIILQKDLVYTLRIRDAHLDDQANYNVSLTNHRGENVKSAANLIVEEEDLRIVEPLKDIETMEKKSVTF
+WCKVNRLNVTLKWTKNGEEVPFDNRVSYRVDKYKHMLTIKDCGFPDEGEYIVTAGQDKSVAELLIIEAPTEFVEHLEDQT
+VTEFDDAVFSCQLSREKANVKWYRNGREIKEGKKYKFEKDGSIHRLIIKDCRLDDECEYACGVEDRKSRARLFVEEIPVE
+IIRPPQDILEAPGADVVFLAELNKDKVEVQWLRNNMVVVQGDKHQMMSEGKIHRLQICDIKPRDQGEYRFIAKDKEARAK
+LELAAAPKIKTADQDLVVDVGKPLTMVVPYDAYPKAEAEWFKENEPLSTKTIDTTAEQTSFRILEAKKGDKGRYKIVLQN
+KHGKAEGFINLKVIDVPGPVRNLEVTETFDGEVSLAWEEPLTDGGSKIIGYVVERRDIKRKTWVLATDRAESCEFTVTGL
+QKGGVEYLFRVSARNRVGTGEPVETDNPVEARSKYDVPGPPLNVTITDVNRFGVSLTWEPPEYDGGAEITNYVIELRDKT
+SIRWDTAMTVRAEDLSATVTDVVEGQEYSFRVRAQNRIGVGKPSAATPFVKVADPIERPSPPVNLTSSDQTQSSVQLKWE
+PPLKDGGSPILGYIIERCEEGKDNWIRCNMKLVPELTYKVTGLEKGNKYLYRVSAENKAGVSDPSEILGPLTADDAFVEP
+TMDLSAFKDGLEVIVPNPITILVPSTGYPRPTATWCFGDKVLETGDRVKMKTLSAYAELVISPSERSDKGIYTLKLENRV
+KTISGEIDVNVIARPSAPKELKFGDITKDSVHLTWEPPDDDGGSPLTGYVVEKREVSRKTWTKVMDFVTDLEFTVPDLVQ
+GKEYLFKVCARNKCGPGEPAYVDEPVNMSTPATVPDPPENVKWRDRTANSIFLTWDPPKNDGGSRIKGYIVERCPRGSDK
+WVACGEPVAETKMEVTGLEEGKWYAYRVKALNRQGASKPSRPTEEIQAVDTQEAPEIFLDVKLLAGLTVKAGTKIELPAT
+VTGKPEPKITWTKADMILKQDKRITIENVPKKSTVTIVDSKRSDTGTYIIEAVNVCGRATAVVEVNVLDKPGPPAAFDIT
+DVTNESCLLTWNPPRDDGGSKITNYVVERRATDSEVWHKLSSTVKDTNFKATKLIPNKEYIFRVAAENMYGVGEPVQASP
+ITAKYQFDPPGPPTRLEPSDITKDAVTLTWCEPDDDGGSPITGYWVERLDPDTDKWVRCNKMPVKDTTYRVKGLTNKKKY
+RFRVLAENLAGPGKPSKSTEPILIKDPIDPPWPPGKPTVKDVGKTSVRLNWTKPEHDGGAKIESYVIEMLKTGTDEWVRV
+AEGVPTTQHLLPGLMEGQEYSFRVRAVNKAGESEPSEPSDPVLCREKLYPPSPPRWLEVINITKNTADLKWTVPEKDGGS
+PITNYIVEKRDVRRKGWQTVDTTVKDTKCTVTPLTEGSLYVFRVAAENAIGQSDYTEIEDSVLAKDTFTTPGPPYALAVV
+DVTKRHVDLKWEPPKNDGGRPIQRYVIEKKERLGTRWVKAGKTAGPDCNFRVTDVIEGTEVQFQVRAENEAGVGHPSEPT
+EILSIEDPTSPPSPPLDLHVTDAGRKHIAIAWKPPEKNGGSPIIGYHVEMCPVGTEKWMRVNSRPIKDLKFKVEEGVVPD
+KEYVLRVRAVNAIGVSEPSEISENVVAKDPDCKPTIDLETHDIIVIEGEKLSIPVPFRAVPVPTVSWHKDGKEVKASDRL
+TMKNDHISAHLEVPKSVRADAGIYTITLENKLGSATASINVKVIGLPGPCKDIKASDITKSSCKLTWEPPEFDGGTPILH
+YVLERREAGRRTYIPVMSGENKLSWTVKDLIPNGEYFFRVKAVNKVGGGEYIELKNPVIAQDPKQPPDPPVDVEVHNPTA
+EAMTITWKPPLYDGGSKIMGYIIEKIAKGEERWKRCNEHLVPILTYTAKGLEEGKEYQFRVRAENAAGISEPSRATPPTK
+AVDPIDAPKVILRTSLEVKRGDEIALDASISGSPYPTITWIKDENVIVPEEIKKRAAPLVRRRKGEVQEEEPFVLPLTQR
+LSIDNSKKGESQLRVRDSLRPDHGLYMIKVENDHGIAKAPCTVSVLDTPGPPINFVFEDIRKTSVLCKWEPPLDDGGSEI
+INYTLEKKDKTKPDSEWIVVTSTLRHCKYSVTKLIEGKEYLFRVRAENRFGPGPPCVSKPLVAKDPFGPPDAPDKPIVED
+VTSNSMLVKWNEPKDNGSPILGYWLEKREVNSTHWSRVNKSLLNALKANVDGLLEGLTYVFRVCAENAAGPGKFSPPSDP
+KTAHDPISPPGPPIPRVTDTSSTTIELEWEPPAFNGGGEIVGYFVDKQLVGTNEWSRCTEKMIKVRQYTVKEIREGADYK
+LRVSAVNAAGEGPPGETQPVTVAEPQEPPAVELDVSVKGGIQIMAGKTLRIPAVVTGRPVPTKVWTKEEGELDKDRVVID
+NVGTKSELIIKDALRKDHGRYVITATNSCGSKFAAARVEVFDVPGPVLDLKPVVTNRKMCLLNWSDPEDDGGSEITGFII
+ERKDAKMHTWRQPIETERSKCDITGLLEGQEYKFRVIAKNKFGCGPPVEIGPILAVDPLGPPTSPERLTYTERTKSTITL
+DWKEPRSNGGSPIQGYIIEKRRHDKPDFERVNKRLCPTTSFLVENLDEHQMYEFRVKAVNEIGESEPSLPLNVVIQDDEV
+PPTIKLRLSVRGDTIKVKAGEPVHIPADVTGLPMPKIEWSKNETVIEKPTDALQITKEEVSRSEAKTELSIPKAVREDKG
+TYTVTASNRLGSVFRNVHVEVYDRPSPPRNLAVTDIKAESCYLTWDAPLDNGGSEITHYVIDKRDASRKKAEWEEVTNTA
+VEKRYGIWKLIPNGQYEFRVRAVNKYGISDECKSDKVVIQDPYRLPGPPGKPKVLARTKGSMLVSWTPPLDNGGSPITGY
+WLEKREEGSPYWSRVSRAPITKVGLKGVEFNVPRLLEGVKYQFRAMAINAAGIGPPSEPSDPEVAGDPIFPPGPPSCPEV
+KDKTKSSISLGWKPPAKDGGSPIKGYIVEMQEEGTTDWKRVNEPDKLITTCECVVPNLKELRKYRFRVKAVNEAGESEPS
+DTTGEIPATDIQEEPEVFIDIGAQDCLVCKAGSQIRIPAVIKGRPTPKSSWEFDGKAKKAMKDGVHDIPEDAQLETAENS
+SVIIIPECKRSHTGKYSITAKNKAGQKTANCRVKVMDVPGPPKDLKVSDITRGSCRLSWKMPDDDGGDRIKGYVIEKRTI
+DGKAWTKVNPDCGSTTFVVPDLLSEQQYFFRVRAENRFGIGPPVETIQRTTARDPIYPPDPPIKLKIGLITKNTVHLSWK
+PPKNDGGSPVTHYIVECLAWDPTGTKKEAWRQCNKRDVEELQFTVEDLVEGGEYEFRVKAVNAAGVSKPSATVGPCDCQR
+PDMPPSIDLKEFMEVEEGTNVNIVAKIKGVPFPTLTWFKAPPKKPDNKEPVLYDTHVNKLVVDDTCTLVIPQSRRSDTGL
+YTITAVNNLGTASKEMRLNVLGRPGPPVGPIKFESVSADQMTLSWFPPKDDGGSKITNYVIEKREANRKTWVHVSSEPKE
+CTYTIPKLLEGHEYVFRIMAQNKYGIGEPLDSEPETARNLFSVPGAPDKPTVSSVTRNSMTVNWEEPEYDGGSPVTGYWL
+EMKDTTSKRWKRVNRDPIKAMTLGVSYKVTGLIEGSDYQFRVYAINAAGVGPASLPSDPATARDPIAPPGPPFPKVTDWT
+KSSADLEWSPPLKDGGSKVTGYIVEYKEEGKEEWEKGKDKEVRGTKLVVTGLKEGAFYKFRVSAVNIAGIGEPGEVTDVI
+EMKDRLVSPDLQLDASVRDRIVVHAGGVIRIIAYVSGKPPPTVTWNMNERTLPQEATIETTAISSSMVIKNCQRSHQGVY
+SLLAKNEAGERKKTIIVDVLDVPGPVGTPFLAHNLTNESCKLTWFSPEDDGGSPITNYVIEKRESDRRAWTPVTYTVTRQ
+NATVQGLIQGKAYFFRIAAENSIGMGPFVETSEALVIREPITVPERPEDLEVKEVTKNTVTLTWNPPKYDGGSEIINYVL
+ESRLIGTEKFHKVTNDNLLSRKYTVKGLKEGDTYEYRVSAVNIVGQGKPSFCTKPITCKDELAPPTLHLDFRDKLTIRVG
+EAFALTGRYSGKPKPKVSWFKDEADVLEDDRTHIKTTPATLALEKIKAKRSDSGKYCVVVENSTGSRKGFCQVNVVDRPG
+PPVGPVSFDEVTKDYMVISWKPPLDDGGSKITNYIIEKKEVGKDVWMPVTSASAKTTCKVSKLLEGKDYIFRIHAENLYG
+ISDPLVSDSMKAKDRFRVPDAPDQPIVTEVTKDSALVTWNKPHDGGKPITNYILEKRETMSKRWARVTKDPIHPYTKFRV
+PDLLEGCQYEFRVSAENEIGIGDPSPPSKPVFAKDPIAKPSPPVNPEAIDTTCNSVDLTWQPPRHDGGSKILGYIVEYQK
+VGDEEWRRANHTPESCPETKYKVTGLRDGQTYKFRVLAVNAAGESDPAHVPEPVLVKDRLEPPELILDANMAREQHIKVG
+DTLRLSAIIKGVPFPKVTWKKEDRDAPTKARIDVTPVGSKLEIRNAAHEDGGIYSLTVENPAGSKTVSVKVLVLDKPGPP
+RDLEVSEIRKDSCYLTWKEPLDDGGSVITNYVVERRDVASAQWSPLSATSKKKSHFAKHLNEGNQYLFRVAAENQYGRGP
+FVETPKPIKALDPLHPPGPPKDLHHVDVDKTEVSLVWNKPDRDGGSPITGYLVEYQEEGTQDWIKFKTVTNLECVVTGLQ
+QGKTYRFRVKAENIVGLGLPDTTIPIECQEKLVPPSVELDVKLIEGLVVKAGTTVRFPAIIRGVPVPTAKWTTDGSEIKT
+DEHYTVETDNFSSVLTIKNCLRRDTGEYQITVSNAAGSKTVAVHLTVLDVPGPPTGPINILDVTPEHMTISWQPPKDDGG
+SPVINYIVEKQDTRKDTWGVVSSGSSKTKLKIPHLQKGCEYVFRVRAENKIGVGPPLDSTPTVAKHKFSPPSPPGKPVVT
+DITENAATVSWTLPKSDGGSPITGYYMERREVTGKWVRVNKTPIADLKFRVTGLYEGNTYEFRVFAENLAGLSKPSPSSD
+PIKACRPIKPPGPPINPKLKDKSRETADLVWTKPLSDGGSPILGYVVECQKPGTAQWNRINKDELIRQCAFRVPGLIEGN
+EYRFRIKAANIVGEGEPRELAESVIAKDILHPPEVELDVTCRDVITVRVGQTIRILARVKGRPEPDITWTKEGKVLVREK
+RVDLIQDLPRVELQIKEAVRADHGKYIISAKNSSGHAQGSAIVNVLDRPGPCQNLKVTNVTKENCTISWENPLDNGGSEI
+TNFIVEYRKPNQKGWSIVASDVTKRLIKANLLANNEYYFRVCAENKVGVGPTIETKTPILAINPIDRPGEPENLHIADKG
+KTFVYLKWRRPDYDGGSPNLSYHVERRLKGSDDWERVHKGSIKETHYMVDRCVENQIYEFRVQTKNEGGESDWVKTEEVV
+VKEDLQKPVLDLKLSGVLTVKAGDTIRLEAGVRGKPFPEVAWTKDKDATDLTRSPRVKIDTRADSSKFSLTKAKRSDGGK
+YVVTATNTAGSFVAYATVNVLDKPGPVRNLKIVDVSSDRCTVCWDPPEDDGGCEIQNYILEKCETKRMVWSTYSATVLTP
+GTTVTRLIEGNEYIFRVRAENKIGTGPPTESKPVIAKTKYDKPGRPDPPEVTKVSKEEMTVVWNPPEYDGGKSITGYFLE
+KKEKHSTRWVPVNKSAIPERRMKVQNLLPDHEYQFRVKAENEIGIGEPSLPSRPVVAKDPIEPPGPPTNFRVVDTTKHSI
+TLGWGKPVYDGGAPIIGYVVEMRPKIADASPDEGWKRCNAAAQLVRKEFTVTSLDENQEYEFRVCAQNQVGIGRPAELKE
+AIKPKEILEPPEIDLDASMRKLVIVRAGCPIRLFAIVRGRPAPKVTWRKVGIDNVVRKGQVDLVDTMAFLVIPNSTRDDS
+GKYSLTLVNPAGEKAVFVNVRVLDTPGPVSDLKVSDVTKTSCHVSWAPPENDGGSQVTHYIVEKREADRKTWSTVTPEVK
+KTSFHVTNLVPGNEYYFRVTAVNEYGPGVPTDVPKPVLASDPLSEPDPPRKLEVTEMTKNSATLAWLPPLRDGGAKIDGY
+ITSYREEEQPADRWTEYSVVKDLSLVVTGLKEGKKYKFRVAARNAVGVSLPREAEGVYEAKEQLLPPKILMPEQITIKAG
+KKLRIEAHVYGKPHPTCKWKKGEDEVVTSSHLAVHKADSSSILIIKDVTRKDSGYYSLTAENSSGTDTQKIKVVVMDAPG
+PPQPPFDISDIDADACSLSWHIPLEDGGSNITNYIVEKCDVSRGDWVTALASVTKTSCRVGKLIPGQEYIFRVRAENRFG
+ISEPLTSPKMVAQFPFGVPSEPKNARVTKVNKDCIFVAWDRPDSDGGSPIIGYLIERKERNSLLWVKANDTLVRSTEYPC
+AGLVEGLEYSFRIYALNKAGSSPPSKPTEYVTARMPVDPPGKPEVIDVTKSTVSLIWARPKHDGGSKIIGYFVEACKLPG
+DKWVRCNTAPHQIPQEEYTATGLEEKAQYQFRAIARTAVNISPPSEPSDPVTILAENVPPRIDLSVAMKSLLTVKAGTNV
+CLDATVFGKPMPTVSWKKDGTLLKPAEGIKMAMQRNLCTLELFSVNRKDSGDYTITAENSSGSKSATIKLKVLDKPGPPA
+SVKINKMYSDRAMLSWEPPLEDGGSEITNYIVDKRETSRPNWAQVSATVPITSCSVEKLIEGHEYQFRICAENKYGVGDP
+VFTEPAIAKNPYDPPGRCDPPVISNITKDHMTVSWKPPADDGGSPITGYLLEKRETQAVNWTKVNRKPIIERTLKATGLQ
+EGTEYEFRVTAINKAGPGKPSDASKAAYARDPQYPPAPPAFPKVYDTTRSSVSLSWGKPAYDGGSPIIGYLVEVKRADSD
+NWVRCNLPQNLQKTRFEVTGLMEDTQYQFRVYAVNKIGYSDPSDVPDKHYPKDILIPPEGELDADLRKTLILRAGVTMRL
+YVPVKGRPPPKITWSKPNVNLRDRIGLDIKSTDFDTFLRCENVNKYDAGKYILTLENSCGKKEYTIVVKVLDTPGPPVNV
+TVKEISKDSAYVTWEPPIIDGGSPIINYVVQKRDAERKSWSTVTTECSKTSFRVANLEEGKSYFFRVFAENEYGIGDPGE
+TRDAVKASQTPGPVVDLKVRSVSKSSCSIGWKKPHSDGGSRIIGYVVDFLTEENKWQRVMKSLSLQYSAKDLTEGKEYTF
+RVSAENENGEGTPSEITVVARDDVVAPDLDLKGLPDLCYLAKENSNFRLKIPIKGKPAPSVSWKKGEDPLATDTRVSVES
+SAVNTTLIVYDCQKSDAGKYTITLKNVAGTKEGTISIKVVGKPGIPTGPIKFDEVTAEAMTLKWAPPKDDGGSEITNYIL
+EKRDSVNNKWVTCASAVQKTTFRVTRLHEGMEYTFRVSAENKYGVGEGLKSEPIVARHPFDVPDAPPPPNIVDVRHDSVS
+LTWTDPKKTGGSPITGYHLEFKERNSLLWKRANKTPIRMRDFKVTGLTEGLEYEFRVMAINLAGVGKPSLPSEPVVALDP
+IDPPGKPEVINITRNSVTLIWTEPKYDGGHKLTGYIVEKRDLPSKSWMKANHVNVPECAFTVTDLVEGGKYEFRIRAKNT
+AGAISAPSESTETIICKDEYEAPTIVLDPTIKDGLTIKAGDTIVLNAISILGKPLPKSSWSKAGKDIRPSDITQITSTPT
+SSMLTIKYATRKDAGEYTITATNPFGTKVEHVKVTVLDVPGPPGPVEISNVSAEKATLTWTPPLEDGGSPIKSYILEKRE
+TSRLLWTVVSEDIQSCRHVATKLIQGNEYIFRVSAVNHYGKGEPVQSEPVKMVDRFGPPGPPEKPEVSNVTKNTATVSWK
+RPVDDGGSEITGYHVERREKKSLRWVRAIKTPVSDLRCKVTGLQEGSTYEFRVSAENRAGIGPPSEASDSVLMKDAAYPP
+GPPSNPHVTDTTKKSASLAWGKPHYDGGLEITGYVVEHQKVGDEAWIKDTTGTALRITQFVVPDLQTKEKYNFRISAIND
+AGVGEPAVIPDVEIVEREMAPDFELDAELRRTLVVRAGLSIRIFVPIKGRPAPEVTWTKDNINLKNRANIENTESFTLLI
+IPECNRYDTGKFVMTIENPAGKKSGFVNVRVLDTPGPVLNLRPTDITKDSVTLHWDLPLIDGGSRITNYIVEKREATRKS
+YSTATTKCHKCTYKVTGLSEGCEYFFRVMAENEYGIGEPTETTEPVKASEAPSPPDSLNIMDITKSTVSLAWPKPKHDGG
+SKITGYVIEAQRKGSDQWTHITTVKGLECVVRNLTEGEEYTFQVMAVNSAGRSAPRESRPVIVKEQTMLPELDLRGIYQK
+LVIAKAGDNIKVEIPVLGRPKPTVTWKKGDQILKQTQRVNFETTATSTILNINECVRSDSGPYPLTARNIVGEVGDVITI
+QVHDIPGPPTGPIKFDEVSSDFVTFSWDPPENDGGVPISNYVVEMRQTDSTTWVELATTVIRTTYKATRLTTGLEYQFRV
+KAQNRYGVGPGITSACIVANYPFKVPGPPGTPQVTAVTKDSMTISWHEPLSDGGSPILGYHVERKERNGILWQTVSKALV
+PGNIFKSSGLTDGIAYEFRVIAENMAGKSKPSKPSEPMLALDPIDPPGKPVPLNITRHTVTLKWAKPEYTGGFKITSYIV
+EKRDLPNGRWLKANFSNILENEFTVSGLTEDAAYEFRVIAKNAAGAISPPSEPSDAITCRDDVEAPKIKVDVKFKDTVIL
+KAGEAFRLEADVSGRPPPTMEWSKDGKELEGTAKLEIKIADFSTNLVNKDSTRRDSGAYTLTATNPGGFAKHIFNVKVLD
+RPGPPEGPLAVTEVTSEKCVLSWFPPLDDGGAKIDHYIVQKRETSRLAWTNVASEVQVTKLKVTKLLKGNEYIFRVMAVN
+KYGVGEPLESEPVLAVNPYGPPDPPKNPEVTTITKDSMVVCWGHPDSDGGSEIINYIVERRDKAGQRWIKCNKKTLTDLR
+YKVSGLTEGHEYEFRIMAENAAGISAPSPTSPFYKACDTVFKPGPPGNPRVLDTSRSSISIAWNKPIYDGGSEITGYMVE
+IALPEEDEWQIVTPPAGLKATSYTITGLTENQEYKIRIYAMNSEGLGEPALVPGTPKAEDRMLPPEIELDADLRKVVTIR
+ACCTLRLFVPIKGRPAPEVKWARDHGESLDKASIESTSSYTLLIVGNVNRFDSGKYILTVENSSGSKSAFVNVRVLDTPG
+PPQDLKVKEVTKTSVTLTWDPPLLDGGSKIKNYIVEKRESTRKAYSTVATNCHKTSWKVDQLQEGCSYYFRVLAENEYGI
+GLPAETAESVKASERPLPPGKITLMDVTRNSVSLSWEKPEHDGGSRILGYIVEMQTKGSDKWATCATVKVTEATITGLIQ
+GEEYSFRVSAQNEKGISDPRQLSVPVIAKDLVIPPAFKLLFNTFTVLAGEDLKVDVPFIGRPTPAVTWHKDNVPLKQTTR
+VNAESTENNSLLTIKDACREDVGHYVVKLTNSAGEAIETLNVIVLDKPGPPTGPVKMDEVTADSITLSWGPPKYDGGSSI
+NNYIVEKRDTSTTTWQIVSATVARTTIKACRLKTGCEYQFRIAAENRYGKSTYLNSEPTVAQYPFKVPGPPGTPVVTLSS
+RDSMEVQWNEPISDGGSRVIGYHLERKERNSILWVKLNKTPIPQTKFKTTGLEEGVEYEFRVSAENIVGIGKPSKVSECY
+VARDPCDPPGRPEAIIVTRNSVTLQWKKPTYDGGSKITGYIVEKKELPEGRWMKASFTNIIDTHFEVTGLVEDHRYEFRV
+IARNAAGVFSEPSESTGAITARDEVDPPRISMDPKYKDTIVVHAGESFKVDADIYGKPIPTIQWIKGDQELSNTARLEIK
+STDFATSLSVKDAVRVDSGNYILKAKNVAGERSVTVNVKVLDRPGPPEGPVVISGVTAEKCTLAWKPPLQDGGSDIINYI
+VERRETSRLVWTVVDANVQTLSCKVTKLLEGNEYTFRIMAVNKYGVGEPLESEPVVAKNPFVVPDAPKAPEVTTVTKDSM
+IVVWERPASDGGSEILGYVLEKRDKEGIRWTRCHKRLIGELRLRVTGLIENHDYEFRVSAENAAGLSEPSPPSAYQKACD
+PIYKPGPPNNPKVIDITRSSVFLSWSKPIYDGGCEIQGYIVEKCDVSVGEWTMCTPPTGINKTNIEVEKLLEKHEYNFRI
+CAINKAGVGEHADVPGPIIVEEKLEAPDIDLDLELRKIINIRAGGSLRLFVPIKGRPTPEVKWGKVDGEIRDAAIIDVTS
+SFTSLVLDNVNRYDSGKYTLTLENSSGTKSAFVTVRVLDTPSPPVNLKVTEITKDSVSITWEPPLLDGGSKIKNYIVEKR
+EATRKSYAAVVTNCHKNSWKIDQLQEGCSYYFRVTAENEYGIGLPAQTADPIKVAEVPQPPGKITVDDVTRNSVSLSWTK
+PEHDGGSKIIQYIVEMQAKHSEKWSECARVKSLQAVITNLTQGEEYLFRVVAVNEKGRSDPRSLAVPIVAKDLVIEPDVK
+PAFSSYSVQVGQDLKIEVPISGRPKPTITWTKDGLPLKQTTRINVTDSLDLTTLSIKETHKDDGGQYGITVANVVGQKTA
+SIEIVTLDKPDPPKGPVKFDDVSAESITLSWNPPLYTGGCQITNYIVQKRDTTTTVWDVVSATVARTTLKVTKLKTGTEY
+QFRIFAENRYGQSFALESDPIVAQYPYKEPGPPGTPFATAISKDSMVIQWHEPVNNGGSPVIGYHLERKERNSILWTKVN
+KTIIHDTQFKAQNLEEGIEYEFRVYAENIVGVGKASKNSECYVARDPCDPPGTPEPIMVKRNEITLQWTKPVYDGGSMIT
+GYIVEKRDLPDGRWMKASFTNVIETQFTVSGLTEDQRYEFRVIAKNAAGAISKPSDSTGPITAKDEVELPRISMDPKFRD
+TIVVNAGETFRLEADVHGKPLPTIEWLRGDKEIEESARCEIKNTDFKALLIVKDAIRIDGGQYILRASNVAGSKSFPVNV
+KVLDRPGPPEGPVQVTGVTSEKCSLTWSPPLQDGGSDISHYVVEKRETSRLAWTVVASEVVTNSLKVTKLLEGNEYVFRI
+MAVNKYGVGEPLESAPVLMKNPFVLPGPPKSLEVTNIAKDSMTVCWNRPDSDGGSEIIGYIVEKRDRSGIRWIKCNKRRI
+TDLRLRVTGLTEDHEYEFRVSAENAAGVGEPSPATVYYKACDPVFKPGPPTNAHIVDTTKNSITLAWGKPIYDGGSEILG
+YVVEICKADEEEWQIVTPQTGLRVTRFEISKLTEHQEYKIRVCALNKVGLGEATSVPGTVKPEDKLEAPELDLDSELRKG
+IVVRAGGSARIHIPFKGRPTPEITWSREEGEFTDKVQIEKGVNYTQLSIDNCDRNDAGKYILKLENSSGSKSAFVTVKVL
+DTPGPPQNLAVKEVRKDSAFLVWEPPIIDGGAKVKNYVIDKRESTRKAYANVSSKCSKTSFKVENLTEGAIYYFRVMAEN
+EFGVGVPVETVDAVKAAEPPSPPGKVTLTDVSQTSASLMWEKPEHDGGSRVLGYVVEMQPKGTEKWSIVAESKVCNAVVT
+GLSSGQEYQFRVKAYNEKGKSDPRVLGVPVIAKDLTIQPSLKLPFNTYSIQAGEDLKIEIPVIGRPRPNISWVKDGEPLK
+QTTRVNVEETATSTVLHIKEGNKDDFGKYTVTATNSAGTATENLSVIVLEKPGPPVGPVRFDEVSADFVVISWEPPAYTG
+GCQISNYIVEKRDTTTTTWHMVSATVARTTIKITKLKTGTEYQFRIFAENRYGKSAPLDSKAVIVQYPFKEPGPPGTPFV
+TSISKDQMLVQWHEPVNDGGTKIIGYHLEQKEKNSILWVKLNKTPIQDTKFKTTGLDEGLEYEFKVSAENIVGIGKPSKV
+SECFVARDPCDPPGRPEAIVITRNNVTLKWKKPAYDGGSKITGYIVEKKDLPDGRWMKASFTNVLETEFTVSGLVEDQRY
+EFRVIARNAAGNFSEPSDSSGAITARDEIDAPNASLDPKYKDVIVVHAGETFVLEADIRGKPIPDVVWSKDGKELEETAA
+RMEIKSTIQKTTLVVKDCIRTDGGQYILKLSNVGGTKSIPITVKVLDRPGPPEGPLKVTGVTAEKCYLAWNPPLQDGGAN
+ISHYIIEKRETSRLSWTQVSTEVQALNYKVTKLLPGNEYIFRVMAVNKYGIGEPLESGPVTACNPYKPPGPPSTPEVSAI
+TKDSMVVTWARPVDDGGTEIEGYILEKRDKEGVRWTKCNKKTLTDLRLRVTGLTEGHSYEFRVAAENAAGVGEPSEPSVF
+YRACDALYPPGPPSNPKVTDTSRSSVSLAWSKPIYDGGAPVKGYVVEVKEAAADEWTTCTPPTGLQGKQFTVTKLKENTE
+YNFRICAINSEGVGEPATLPGSVVAQERIEPPEIELDADLRKVVVLRASATLRLFVTIKGRPEPEVKWEKAEGILTDRAQ
+IEVTSSFTMLVIDNVTRFDSGRYNLTLENNSGSKTAFVNVRVLDSPSAPVNLTIREVKKDSVTLSWEPPLIDGGAKITNY
+IVEKRETTRKAYATITNNCTKTTFRIENLQEGCSYYFRVLASNEYGIGLPAETTEPVKVSEPPLPPGRVTLVDVTRNTAT
+IKWEKPESDGGSKITGYVVEMQTKGSEKWSTCTQVKTLEATISGLTAGEEYVFRVAAVNEKGRSDPRQLGVPVIARDIEI
+KPSVELPFHTFNVKAREQLKIDVPFKGRPQATVNWRKDGQTLKETTRVNVSSSKTVTSLSIKEASKEDVGTYELCVSNSA
+GSITVPITIIVLDRPGPPGPIRIDEVSCDSITISWNPPEYDGGCQISNYIVEKKETTSTTWHIVSQAVARTSIKIVRLTT
+GSEYQFRVCAENRYGKSSYSESSAVVAEYPFSPPGPPGTPKVVHATKSTMLVTWQVPVNDGGSRVIGYHLEYKERSSILW
+SKANKILIADTQMKVSGLDEGLMYEYRVYAENIAGIGKCSKSCEPVPARDPCDPPGQPEVTNITRKSVSLKWSKPHYDGG
+AKITGYIVERRELPDGRWLKCNYTNIQETYFEVTELTEDQRYEFRVFARNAADSVSEPSESTGPIIVKDDVEPPRVMMDV
+KFRDVIVVKAGEVLKINADIAGRPLPVISWAKDGIEIEERARTEIISTDNHTLLTVKDCIRRDTGQYVLTLKNVAGTRSV
+AVNCKVLDKPGPPAGPLEINGLTAEKCSLSWGRPQEDGGADIDYYIVEKRETSHLAWTICEGELQMTSCKVTKLLKGNEY
+IFRVTGVNKYGVGEPLESVAIKALDPFTVPSPPTSLEITSVTKESMTLCWSRPESDGGSEISGYIIERREKNSLRWVRVN
+KKPVYDLRVKSTGLREGCEYEYRVYAENAAGLSLPSETSPLIRAEDPVFLPSPPSKPKIVDSGKTTITIAWVKPLFDGGA
+PITGYTVEYKKSDDTDWKTSIQSLRGTEYTISGLTTGAEYVFRVKSVNKVGASDPSDSSDPQIAKEREEEPLFDIDSEMR
+KTLIVKAGASFTMTVPFRGRPVPNVLWSKPDTDLRTRAYVDTTDSRTSLTIENANRNDSGKYTLTIQNVLSAASLTLVVK
+VLDTPGPPTNITVQDVTKESAVLSWDVPENDGGAPVKNYHIEKREASKKAWVSVTNNCNRLSYKVTNLQEGAIYYFRVSG
+ENEFGVGIPAETKEGVKITEKPSPPEKLGVTSISKDSVSLTWLKPEHDGGSRIVHYVVEALEKGQKNWVKCAVAKSTHHV
+VSGLRENSEYFFRVFAENQAGLSDPRELLLPVLIKEQLEPPEIDMKNFPSHTVYVRAGSNLKVDIPISGKPLPKVTLSRD
+GVPLKATMRFNTEITAENLTINLKESVTADAGRYEITAANSSGTTKAFINIVVLDRPGPPTGPVVISDITEESVTLKWEP
+PKYDGGSQVTNYILLKRETSTAVWTEVSATVARTMMKVMKLTTGEEYQFRIKAENRFGISDHIDSACVTVKLPYTTPGPP
+STPWVTNVTRESITVGWHEPVSNGGSAVVGYHLEMKDRNSILWQKANKLVIRTTHFKVTTISAGLIYEFRVYAENAAGVG
+KPSHPSEPVLAIDACEPPRNVRITDISKNSVSLSWQQPAFDGGSKITGYIVERRDLPDGRWTKASFTNVTETQFIISGLT
+QNSQYEFRVFARNAVGSISNPSEVVGPITCIDSYGGPVIDLPLEYTEVVKYRAGTSVKLRAGISGKPAPTIEWYKDDKEL
+QTNALVCVENTTDLASILIKDADRLNSGCYELKLRNAMGSASATIRVQILDKPGPPGGPIEFKTVTAEKITLLWRPPADD
+GGAKITHYIVEKRETSRVVWSMVSEHLEECIITTTKIIKGNEYIFRVRAVNKYGIGEPLESDSVVAKNAFVTPGPPGIPE
+VTKITKNSMTVVWSRPIADGGSDISGYFLEKRDKKSLGWFKVLKETIRDTRQKVTGLTENSDYQYRVCAVNAAGQGPFSE
+PSEFYKAADPIDPPGPPAKIRIADSTKSSITLGWSKPVYDGGSAVTGYVVEIRQGEEEEWTTVSTKGEVRTTEYVVSNLK
+PGVNYYFRVSAVNCAGQGEPIEMNEPVQAKDILEAPEIDLDVALRTSVIAKAGEDVQVLIPFKGRPPPTVTWRKDEKNLG
+SDARYSIENTDSSSLLTIPQVTRNDTGKYILTIENGVGEPKSSTVSVKVLDTPAACQKLQVKHVSRGTVTLLWDPPLIDG
+GSPIINYVIEKRDATKRTWSVVSHKCSSTSFKLIDLSEKTPFFFRVLAENEIGIGEPCETTEPVKAAEVPAPIRDLSMKD
+STKTSVILSWTKPDFDGGSVITEYVVERKGKGEQTWSHAGISKTCEIEVSQLKEQSVLEFRVFAKNEKGLSDPVTIGPIT
+VKELIITPEVDLSDIPGAQVTVRIGHNVHLELPYKGKPKPSISWLKDGLPLKESEFVRFSKTENKITLSIKNAKKEHGGK
+YTVILDNAVCRIAVPITVITLGPPSKPKGPIRFDEIKADSVILSWDVPEDNGGGEITCYSIEKRETSQTNWKMVCSSVAR
+TTFKVPNLVKDAEYQFRVRAENRYGVSQPLVSSIIVAKHQFRIPGPPGKPVIYNVTSDGMSLTWDAPVYDGGSEVTGFHV
+EKKERNSILWQKVNTSPISGREYRATGLVEGLDYQFRVYAENSAGLSSPSDPSKFTLAVSPVDPPGTPDYIDVTRETITL
+KWNPPLRDGGSKIVGYSIEKRQGNERWVRCNFTDVSECQYTVTGLSPGDRYEFRIIARNAVGTISPPSQSSGIIMTRDEN
+VPPIVEFGPEYFDGLIIKSGESLRIKALVQGRPVPRVTWFKDGVEIEKRMNMEITDVLGSTSLFVRDATRDHRGVYTVEA
+KNASGSAKAEIKVKVQDTPGKVVGPIRFTNITGEKMTLWWDAPLNDGCAPITHYIIEKRETSRLAWALIEDKCEAQSYTA
+IKLINGNEYQFRVSAVNKFGVGRPLDSDPVVAQIQYTVPDAPGIPEPSNITGNSITLTWARPESDGGSEIQQYILERREK
+KSTRWVKVISKRPISETRFKVTGLTEGNEYEFHVMAENAAGVGPASGISRLIKCREPVNPPGPPTVVKVTDTSKTTVSLE
+WSKPVFDGGMEIIGYIIEMCKADLGDWHKVNAEACVKTRYTVTDLQAGEEYKFRVSAINGAGKGDSCEVTGTIKAVDRLT
+APELDIDANFKQTHVVRAGASIRLFIAYQGRPTPTAVWSKPDSNLSLRADIHTTDSFSTLTVENCNRNDAGKYTLTVENN
+SGSKSITFTVKVLDTPGPPGPITFKDVTRGSATLMWDAPLLDGGARIHHYVVEKREASRRSWQVISEKCTRQIFKVNDLA
+EGVPYYFRVSAVNEYGVGEPYEMPEPIVATEQPAPPRRLDVVDTSKSSAVLAWLKPDHDGGSRITGYLLEMRQKGSDFWV
+EAGHTKQLTFTVERLVEKTEYEFRVKAKNDAGYSEPREAFSSVIIKEPQIEPTADLTGITNQLITCKAGSPFTIDVPISG
+RPAPKVTWKLEEMRLKETDRVSITTTKDRTTLTVKDSMRGDSGRYFLTLENTAGVKTFSVTVVVIGRPGPVTGPIEVSSV
+SAESCVLSWGEPKDGGGTEITNYIVEKRESGTTAWQLVNSSVKRTQIKVTHLTKYMEYSFRVSSENRFGVSKPLESAPII
+AEHPFVPPSAPTRPEVYHVSANAMSIRWEEPYHDGGSKIIGYWVEKKERNTILWVKENKVPCLECNYKVTGLVEGLEYQF
+RTYALNAAGVSKASEASRPIMAQNPVDAPGRPEVTDVTRSTVSLIWSAPAYDGGSKVVGYIIERKPVSEVGDGRWLKCNY
+TIVSDNFFTVTALSEGDTYEFRVLAKNAAGVISKGSESTGPVTCRDEYAPPKAELDARLHGDLVTIRAGSDLVLDAAVGG
+KPEPKIIWTKGDKELDLCEKVSLQYTGKRATAVIKFCDRSDSGKYTLTVKNASGTKAVSVMVKVLDSPGPCGKLTVSRVT
+QEKCTLAWSLPQEDGGAEITHYIVERRETSRLNWVIVEGECPTLSYVVTRLIKNNEYIFRVRAVNKYGPGVPVESEPIVA
+RNSFTIPSPPGIPEEVGTGKEHIIIQWTKPESDGGNEISNYLVDKREKKSLRWTRVNKDYVVYDTRLKVTSLMEGCDYQF
+RVTAVNAAGNSEPSEASNFISCREPSYTPGPPSAPRVVDTTKHSISLAWTKPMYDGGTDIVGYVLEMQEKDTDQWYRVHT
+NATIRNTEFTVPDLKMGQKYSFRVAAVNVKGMSEYSESIAEIEPVERIEIPDLELADDLKKTVTIRAGASLRLMVSVSGR
+PPPVITWSKQGIDLASRAIIDTTESYSLLIVDKVNRYDAGKYTIEAENQSGKKSATVLVKVYDTPGPCPSVKVKEVSRDS
+VTITWEIPTIDGGAPVNNYIVEKREAAMRAFKTVTTKCSKTLYRISGLVEGTMYYFRVLPENIYGIGEPCETSDAVLVSE
+VPLVPAKLEVVDVTKSTVTLAWEKPLYDGGSRLTGYVLEACKAGTERWMKVVTLKPTVLEHTVTSLNEGEQYLFRIRAQN
+EKGVSEPRETVTAVTVQDLRVLPTIDLSTMPQKTIHVPAGRPVELVIPIAGRPPPAASWFFAGSKLRESERVTVETHTKV
+AKLTIRETTIRDTGEYTLELKNVTGTTSETIKVIILDKPGPPTGPIKIDEIDATSITISWEPPELDGGAPLSGYVVEQRD
+AHRPGWLPVSESVTRSTFKFTRLTEGNEYVFRVAATNRFGIGSYLQSEVIECRSSIRIPGPPETLQIFDVSRDGMTLTWY
+PPEDDGGSQVTGYIVERKEVRADRWVRVNKVPVTMTRYRSTGLTEGLEYEHRVTAINARGSGKPSRPSKPIVAMDPIAPP
+GKPQNPRVTDTTRTSVSLAWSVPEDEGGSKVTGYLIEMQKVDQHEWTKCNTTPTKIREYTLTHLPQGAEYRFRVLACNAG
+GPGEPAEVPGTVKVTEMLEYPDYELDERYQEGIFVRQGGVIRLTIPIKGKPFPICKWTKEGQDISKRAMIATSETHTELV
+IKEADRGDSGTYDLVLENKCGKKAVYIKVRVIGSPNSPEGPLEYDDIQVRSVRVSWRPPADDGGADILGYILERREVPKA
+AWYTIDSRVRGTSLVVKGLKENVEYHFRVSAENQFGISKPLKSEEPVTPKTPLNPPEPPSNPPEVLDVTKSSVSLSWSRP
+KDDGGSRVTGYYIERKETSTDKWVRHNKTQITTTMYTVTGLVPDAEYQFRIIAQNDVGLSETSPASEPVVCKDPFDKPSQ
+PGELEILSISKDSVTLQWEKPECDGGKEILGYWVEYRQSGDSAWKKSNKERIKDKQFTIGGLLEATEYEFRVFAENETGL
+SRPRRTAMSIKTKLTSGEAPGIRKEMKDVTTKLGEAAQLSCQIVGRPLPDIKWYRFGKELIQSRKYKMSSDGRTHTLTVM
+TEEQEDEGVYTCIATNEVGEVETSSKLLLQATPQFHPGYPLKEKYYGAVGSTLRLHVMYIGRPVPAMTWFHGQKLLQNSE
+NITIENTEHYTHLVMKNVQRKTHAGKYKVQLSNVFGTVDAILDVEIQDKPDKPTGPIVIEALLKNSAVISWKPPADDGGS
+WITNYVVEKCEAKEGAEWQLVSSAISVTTCRIVNLTENAGYYFRVSAQNTFGISDPLEVSSVVIIKSPFEKPGAPGKPTI
+TAVTKDSCVVAWKPPASDGGAKIRNYYLEKREKKQNKWISVTTEEIRETVFSVKNLIEGLEYEFRVKCENLGGESEWSEI
+SEPITPKSDVPIQAPHFKEELRNLNVRYQSNATLVCKVTGHPKPIVKWYRQGKEIIADGLKYRIQEFKGGYHQLIIASVT
+DDDATVYQVRATNQGGSVSGTASLEVEVPAKIHLPKTLEGMGAVHALRGEVVSIKIPFSGKPDPVITWQKGQDLIDNNGH
+YQVIVTRSFTSLVFPNGVERKDAGFYVVCAKNRFGIDQKTVELDVADVPDPPRGVKVSDVSRDSVNLTWTEPASDGGSKI
+TNYIVEKCATTAERWLRVGQARETRYTVINLFGKTSYQFRVIAENKFGLSKPSEPSEPTITKEDKTRAMNYDEEVDETRE
+VSMTKASHSSTKELYEKYMIAEDLGRGEFGIVHRCVETSSKKTYMAKFVKVKGTDQVLVKKEISILNIARHRNILHLHES
+FESMEELVMIFEFISGLDIFERINTSAFELNEREIVSYVHQVCEALQFLHSHNIGHFDIRPENIIYQTRRSSTIKIIEFG
+QARQLKPGDNFRLLFTAPEYYAPEVHQHDVVSTATDMWSLGTLVYVLLSGINPFLAETNQQIIENIMNAEYTFDEEAFKE
+ISIEAMDFVDRLLVKERKSRMTASEALQHPWLKQKIERVSTKVIRTLKHRRYYHTLIKKDLNMVVSAARISCGGAIRSQK
+GVSVAKVKVASIEIGPVSGQIMHAVGEEGGHVKYVCKIENYDQSTQVTWYFGVRQLENSEKYEITYEDGVAILYVKDITK
+LDDGTYRCKVVNDYGEDSSYAELFVKGVREVYDYYCRRTMKKIKRRTDTMRLLERPPEFTLPLYNKTAYVGENVRFGVTI
+TVHPEPHVTWYKSGQKIKPGDNDKKYTFESDKGLYQLTINSVTTDDDAEYTVVARNKYGEDSCKAKLTVTLHPPPTDSTL
+RPMFKRLLANAECQEGQSVCFEIRVSGIPPPTLKWEKDGQPLSLGPNIEIIHEGLDYYALHIRDTLPEDTGYYRVTATNT
+AGSTSCQAHLQVERLRYKKQEFKSKEEHERHVQKQIDKTLRMAEILSGTESVPLTQVAKEALREAAVLYKPAVSTKTVKG
+EFRLEIEEKKEERKLRMPYDVPEPRKYKQTTIEEDQRIKQFVPMSDMKWYKKIRDQYEMPGKLDRVVQKRPKRIRLSRWE
+QFYVMPLPRITDQYRPKWRIPKLSQDDLEIVRPARRRTPSPDYDFYYRPRRRSLGDISDEELLLPIDDYLAMKRTEEERL
+RLEEELELGFSASPPSRSPPHFELSSLRYSSPQAHVKVEETRKDFRYSTYHIPTKAEASTSYAELRERHAQAAYRQPKQR
+QRIMAEREDEELLRPVTTTQHLSEYKSELDFMSKEEKSRKKSRRQREVTEITEIEEEYEISKHAQRESSSSASRLLRRRR
+SLSPTYIELMRPVSELIRSRPQPAEEYEDDTERRSPTPERTRPRSPSPVSSERSLSRFERSARFDIFSRYESMKAALKTQ
+KTSERKYEVLSQQPFTLDHAPRITLRMRSHRVPCGQNTRFILNVQSKPTAEVKWYHNGVELQESSKIHYTNTSGVLTLEI
+LDCHTDDSGTYRAVCTNYKGEASDYATLDVTGGDYTTYASQRRDEEVPRSVFPELTRTEAYAVSSFKKTSEMEASSSVRE
+VKSQMTETRESLSSYEHSASAEMKSAALEEKSLEEKSTTRKIKTTLAARILTKPRSMTVYEGESARFSCDTDGEPVPTVT
+WLRKGQVLSTSARHQVTTTKYKSTFEISSVQASDEGNYSVVVENSEGKQEAEFTLTIQKARVTEKAVTSPPRVKSPEPRV
+KSPEAVKSPKRVKSPEPSHPKAVSPTETKPTPTEKVQHLPVSAPPKITQFLKAEASKEIAKLTCVVESSVLRAKEVTWYK
+DGKKLKENGHFQFHYSADGTYELKINNLTESDQGEYVCEISGEGGTSKTNLQFMGQAFKSIHEKVSKISETKKSDQKTTE
+STVTRKTEPKAPEPISSKPVIVTGLQDTTVSSDSVAKFAVKATGEPRPTAIWTKDGKAITQGGKYKLSEDKGGFFLEIHK
+TDTSDSGLYTCTVKNSAGSVSSSCKLTIKAIKDTEAQKVSTQKTSEITPQKKAVVQEEISQKALRSEEIKMSEAKSQEKL
+ALKEEASKVLISEEVKKSAATSLEKSIVHEEITKTSQASEEVRTHAEIKAFSTQMSINEGQRLVLKANIAGATDVKWVLN
+GVELTNSEEYRYGVSGSDQTLTIKQASHRDEGILTCISKTKEGIVKCQYDLTLSKELSDAPAFISQPRSQNINEGQNVLF
+TCEISGEPSPEIEWFKNNLPISISSNVSISRSRNVYSLEIRNASVSDSGKYTIKAKNFRGQCSATASLMVLPLVEEPSRE
+VVLRTSGDTSLQGSFSSQSVQMSASKQEASFSSFSSSSASSMTEMKFASMSAQSMSSMQESFVEMSSSSFMGISNMTQLE
+SSTSKMLKAGIRGIPPKIEALPSDISIDEGKVLTVACAFTGEPTPEVTWSCGGRKIHSQEQGRFHIENTDDLTTLIIMDV
+QKQDGGLYTLSLGNEFGSDSATVNIHIRSI
diff --git a/seq/titin_hum.seq b/seq/titin_hum.seq
new file mode 100644
index 0000000..a135656
--- /dev/null
+++ b/seq/titin_hum.seq
@@ -0,0 +1,1174 @@
+>gi|20143913|ref|NM_003319.2| Homo sapiens titin (TTN), transcript variant N2-B, mRNA
+AGCAGTCGTGCATTCCCAGCCTCGCCTCGGGTGTAGGGATTGCATAGAAAAGCAAAACTACACAGTCTTG
+ACTGTGTAGTTTTGTTTTTAGGATTAGAGGCTCACCGATTCATGTCGGAGATGGTCAGAAAAACCAACTC
+TCCATAGGACGTCGTTTCAGAAGCAACCTTGGGCTTAGTCCCACCCTTTTTAGGCACTCTTGAGAAATCA
+AGTGCCTAGAAAGATGACAACTCAAGCACCGACGTTTACGCAGCCGTTACAAAGCGTTGTGGTACTGGAG
+GGTAGTACCGCAACCTTTGAGGCTCACATTAGTGGTTTTCCAGTTCCTGAGGTGAGCTGGTTTAGGGATG
+GCCAGGTGATTTCCACTTCCACTCTGCCCGGCGTGCAGATCTCCTTTAGCGATGGCCGCGCTAAACTGAC
+GATCCCCGCCGTGACTAAAGCCAACAGTGGACGATATTCCCTGAAAGCCACCAATGGATCTGGACAAGCG
+ACTAGTACTGCTGAGCTTCTCGTGAAAGCTGAGACAGCACCACCCAACTTCGTTCAACGACTGCAGAGCA
+TGACCGTGAGACAAGGAAGCCAAGTGAGACTCCAAGTGAGAGTGACTGGAATCCCTACACCTGTGGTGAA
+GTTCTACCGGGATGGAGCCGAAATCCAGAGCTCCCTTGATTTCCAAATTTCACAAGAAGGCGACCTCTAC
+AGCTTACTGATTGCAGAAGCATACCCTGAGGACTCAGGGACCTATTCAGTAAATGCCACCAATAGCGTTG
+GAAGAGCTACTTCGACTGCTGAATTACTGGTTCAAGGTGAAGAAGAAGTACCTGCTAAAAAGACAAAGAC
+AATTGTTTCGACTGCTCAGATCTCAGAATCAAGACAAACCCGAATTGAAAAGAAGATTGAAGCCCACTTT
+GATGCCAGATCAATTGCAACAGTTGAGATGGTCATAGATGGTGCCGCTGGGCAACAGCTGCCACATAAAA
+CACCTCCCAGGATTCCTCCGAAGCCAAAGTCAAGATCCCCAACACCACCGTCTATTGCTGCCAAAGCACA
+GCTGGCTCGGCAGCAGTCCCCATCGCCCATAAGACACTCCCCTTCCCCGGTCAGACACGTGCGGGCACCG
+ACCCCATCTCCGGTCAGGTCCGTGTCTCCAGCAGCAAGAATCTCCACATCCCCCATCAGGTCTGTTAGGT
+CTCCATTGCTCATGCGTAAGACTCAGGCATCCACCGTGGCCACAGGTCCTGAAGTGCCTCCCCCTTGGAA
+GCAAGAGGGCTACGTGGCCTCCTCATCTGAGGCTGAGATGAGAGAGACAACGCTGACAACCTCTACTCAG
+ATCAGGACAGAAGAGAGATGGGAAGGGAGATACGGTGTCCAGGAGCAAGTGACCATCAGTGGTGCTGCGG
+GTGCTGCCGCCAGTGTGTCGGCCAGTGCTAGCTACGCAGCAGAGGCTGTTGCCACTGGTGCTAAAGAGGT
+GAAACAAGATGCTGACAAAAGTGCAGCTGTTGCGACTGTTGTTGCTGCCGTTGATATGGCCAGAGTGAGA
+GAACCAGTGATCAGCGCTGTAGAGCAGACTGCTCAGAGGACAACCACGACTGCTGTGCACATCCAACCTG
+CTCAAGAACAGGTAAGAAAGGAAGCGGAGAAGACTGCTGTAACTAAGGTAGTAGTGGCCGCCGATAAAGC
+CAAGGAACAAGAATTAAAATCAAGAACCAAAGAAGTAATTACCACAAAGCAAGAGCAGATGCACGTAACT
+CATGAGCAGATAAGAAAAGAAACTGAAAAAACATTTGTACCAAAGGTAGTAATTTCCGCAGCTAAAGCCA
+AAGAACAAGAAACTAGAATTTCTGAAGAAATTACTAAGAAACAGAAACAAGTAACTCAAGAAGCAATAAT
+GAAGGAAACTAGGAAAACAGTTGTACCTAAAGTCATAGTTGCCACACCCAAAGTCAAAGAACAAGATTTA
+GTATCAAGAGGTAGAGAAGGCATTACTACCAAAAGAGAACAAGTGCAAATAACTCAGGAGAAGATGAGAA
+AGGAAGCCGAGAAAACTGCCTTGTCTACAATAGCAGTTGCTACTGCTAAAGCCAAAGAACAAGAAACAAT
+ACTGAGAACTAGAGAAACTATGGCTACTAGACAAGAACAAATCCAAGTTACCCATGGAAAGGTGGACGTT
+GGAAAAAAGGCTGAAGCTGTAGCAACAGTTGTTGCTGCAGTAGACCAGGCCCGAGTCAGAGAGCCCAGAG
+AGCCTGGGCATCTTGAAGAATCCTATGCTCAGCAGACCACTTTGGAGTACGGATATAAGGAACGCATTTC
+CGCCGCAAAGGTAGCTGAGCCTCCCCAACGTCCAGCCTCAGAACCCCACGTTGTCCCTAAAGCAGTCAAG
+CCTAGAGTAATCCAGGCTCCTTCTGAGACTCATATCAAAACTACTGATCAAAAGGGAATGCACATATCAT
+CACAGATCAAGAAAACTACAGATCTAACAACGGAAAGATTAGTCCATGTGGATAAACGCCCCCGCACAGC
+TAGCCCTCACTTTACTGTTTCAAAAATTTCTGTTCCTAAGACAGAACATGGATATGAGGCATCAATAGCC
+GGTAGTGCTATTGCCACATTACAAAAAGAGTTGTCAGCCACATCTTCTGCTCAGAAGATCACCAAATCGG
+TGAAGGCTCCTACTGTGAAGCCCAGTGAGACTAGAGTAAGGGCAGAGCCCACACCCTTGCCACAGTTCCC
+CTTCGCTGACACACCAGATACTTACAAGAGTGAAGCTGGCGTTGAGGTGAAAAAGGAAGTAGGGGTGAGC
+ATCACTGGCACCACCGTCCGTGAAGAGCGCTTTGAAGTACTGCACGGACGCGAAGCCAAGGTAACAGAAA
+CAGCAAGAGTACCAGCACCTGTTGAAATTCCTGTTACTCCACCAACTTTGGTCTCGGGCTTAAAAAATGT
+GACTGTCATAGAAGGTGAATCTGTCACCTTGGAGTGCCACATCTCTGGATACCCATCCCCGACAGTGACA
+TGGTACAGGGAAGACTACCAAATCGAAAGTTCCATTGACTTCCAGATAACCTTCCAGAGTGGAATTGCTC
+GTCTTATGATTCGCGAAGCATTTGCGGAAGACAGCGGGCGATTTACTTGCAGTGCTGTAAATGAGGCTGG
+AACCGTCAGCACATCCTGCTATCTGGCTGTGCAGGTGTCAGAAGAATTTGAAAAGGAAACCACAGCCGTG
+ACTGAGAAATTTACTACAGAAGAGAAACGCTTTGTTGAGTCAAGAGATGTGGTTATGACTGATACTAGCC
+TCACAGAGGAACAAGCAGGGCCTGGAGAACCTGCCGCGCCTTACTTTATTACAAAACCAGTGGTCCAGAA
+ACTGGTGGAAGGTGGGAGCGTGGTGTTTGGATGCCAAGTTGGCGGCAACCCAAAGCCCCATGTATACTGG
+AAAAAATCTGGTGTTCCTCTAACCACTGGATACAGATACAAAGTGAGTTACAACAAACAAACCGGTGAAT
+GCAAGCTGGTGATTTCTATGACTTTTGCTGATGATGCTGGAGAATACACTATTGTTGTTCGCAATAAGCA
+TGGAGAAACTTCTGCATCTGCTTCCTTGCTTGAAGAAGCTGATTATGAGTTACTGATGAAGTCCCAGCAA
+GAAATGCTTTATCAGACACAAGTGACTGCATTTGTTCAAGAACCTAAAGTTGGAGAAACAGCACCTGGAT
+TTGTATACTCTGAGTATGAAAAAGAGTATGAAAAAGAACAAGCCTTAATTAGGAAGAAAATGGCCAAAGA
+TACTGTAGTGGTCAGAACTTATGTAGAAGATCAGGAATTCCATATTTCTTCCTTTGAAGAGAGACTTATT
+AAAGAAATTGAATATAGAATAATAAAGACTACATTAGAAGAACTTCTTGAAGAAGATGGAGAAGAAAAGA
+TGGCAGTTGACATTTCTGAATCTGAAGCTGTTGAATCAGGATTTGATTTAAGAATCAAGAATTATAGAAT
+TCTTGAGGGGATGGGTGTCACTTTTCATTGCAAGATGTCTGGATATCCATTACCAAAGATTGCTTGGTAC
+AAAGATGGCAAGCGCATCAAACATGGAGAAAGATACCAAATGGACTTTCTACAAGATGGCAGAGCTAGTC
+TGCGTATACCTGTTGTTCTTCCAGAAGATGAAGGAATCTACACTGCATTTGCCAGCAATATTAAAGGAAA
+TGCAATTTGCTCAGGGAAATTGTATGTGGAGCCTGCTGCACCACTTGGAGCTCCGACTTACATTCCCACA
+CTAGAGCCAGTGAGCAGAATCAGATCTCTCTCTCCACGTTCAGTGAGCAGGTCTCCTATACGCATGTCTC
+CTGCACGGATGTCACCTGCAAGGATGTCTCCTGCACGGATGTCCCCTGCAAGAATGTCCCCTGGACGTAG
+GCTGGAGGAGACAGATGAGTCACAACTTGAGAGACTATATAAACCAGTCTTTGTGTTAAAACCTGTTTCT
+TTCAAATGTTTAGAAGGGCAAACTGCCAGATTTGACTTAAAGGTTGTTGGTAGACCTATGCCAGAGACGT
+TCTGGTTTCATGATGGCCAGCAAATTGTCAATGACTATACCCATAAAGTAGTCATTAAAGAAGATGGTAC
+TCAATCACTAATTATTGTCCCTGCCACACCCAGTGATTCTGGGGAATGGACTGTGGTTGCCCAAAACAGG
+GCAGGCAGATCTTCAATTTCAGTGATTTTAACTGTGGAAGCTGTGGAACATCAGGTAAAACCGATGTTTG
+TAGAAAAACTGAAAAATGTCAATATAAAGGAAGGTTCCCGACTTGAAATGAAAGTCAGAGCTACGGGTAA
+CCCCAACCCTGACATTGTATGGTTGAAAAACAGTGACATCATTGTGCCTCATAAATATCCCAAAATCAGA
+ATTGAAGGAACCAAGGGAGAAGCTGCCCTTAAAATCGATTCCACTGTCAGCCAAGATTCTGCCTGGTATA
+CTGCGACTGCTATTAATAAAGCTGGCAGAGACACTACAAGATGCAAAGTAAATGTTGAAGTTGAGTTTGC
+AGAGCCTGAGCCAGAGAGAAAGTTAATCATCCCACGGGGGACATATAGAGCAAAGGAGATTGCAGCCCCA
+GAACTGGAGCCCCTCCATTTGCGATATGGCCAAGAGCAATGGGAAGAAGGTGATCTCTATGACAAAGAGA
+AACAACAGAAACCATTTTTCAAGAAAAAACTCACTTCCTTAAGACTTAAGCGCTTTGGGCCTGCCCACTT
+TGAATGCAGGCTAACACCCATTGGTGACCCAACGATGGTGGTGGAGTGGCTCCATGATGGAAAGCCACTT
+GAAGCAGCCAACAGGCTCCGTATGATCAATGAATTTGGGTACTGCAGCCTTGATTATGGCGTTGCATATT
+CTAGAGACAGTGGTATCATTACTTGCAGAGCCACTAACAAATATGGAACAGATCACACATCTGCTACCCT
+TATTGTTAAAGATGAGAAAAGTCTTGTGGAAGAATCCCAATTGCCTGAGGGGAGGAAAGGCTTACAGAGA
+ATTGAAGAATTAGAGAGAATGGCTCATGAAGGTGCACTTACAGGTGTAACAACAGATCAGAAAGAAAAGC
+AAAAGCCAGACATTGTCTTGTACCCAGAGCCAGTTAGAGTACTTGAAGGGGAGACTGCAAGGTTCCGCTG
+CAGGGTAACAGGCTACCCTCAGCCCAAAGTCAACTGGTACCTCAATGGACAGCTCATCCGCAAAAGCAAA
+AGGTTCAGAGTTCGCTATGATGGTATCCATTACCTGGACATCGTGGACTGCAAATCATATGACACAGGTG
+AAGTGAAGGTCACCGCGGAAAATCCTGAAGGTGTGATAGAGCATAAAGTGAAGCTTGAGATTCAACAGAG
+GGAAGATTTTAGGTCTGTCCTTAGGAGAGCTCCTGAACCAAGGCCTGAGTTTCACGTACATGAACCAGGA
+AAGCTTCAGTTTGAAGTACAAAAAGTGGATAGACCTGTTGACACCACTGAAACCAAAGAAGTTGTGAAGT
+TGAAAAGGGCTGAAAGAATTACCCATGAAAAAGTGCCTGAAGAGTCGGAAGAGCTGCGCAGTAAATTCAA
+GCGCAGAACAGAAGAGGGCTATTATGAAGCCATTACCGCTGTGGAGCTCAAGTCTCGAAAGAAGGATGAA
+TCCTATGAGGAACTCCTCAGGAAGACAAAAGATGAACTTCTCCACTGGACCAAAGAGTTAACTGAAGAGG
+AAAAGAAAGCTCTTGCCGAAGAAGGCAAAATCACGATTCCAACTTTTAAACCTGACAAGATTGAACTAAG
+TCCTAGTATGGAGGCTCCAAAAATCTTCGAAAGAATCCAGAGCCAAACAGTGGGCCAAGGATCTGATGCA
+CACTTCCGGGTCAGAGTCGTGGGGAAACCAGACCCCGAATGTGAATGGTACAAAAATGGTGTCAAAATTG
+AACGGTCTGACCGGATCTACTGGTACTGGCCCGAAGACAATGTTTGTGAATTGGTCATAAGAGATGTGAC
+TGCTGAGGACTCTGCCAGCATCATGGTAAAAGCCATCAACATAGCTGGAGAAACCTCCAGTCACGCATTC
+TTACTTGTCCAAGCCAAGCAATTGATCACTTTCACACAGGAATTACAAGATGTTGTTGCTAAGGAAAAAG
+ACACTATGGCAACCTTTGAATGTGAAACTTCAGAACCATTTGTCAAAGTGAAATGGTATAAAGATGGTAT
+GGAGGTTCATGAGGGAGATAAATACAGGATGCACTCTGACAGAAAGGTTCACTTCCTCTCCATACTGACC
+ATTGATACGTCTGATGCTGAAGATTACAGCTGTGTACTTGTGGAAGATGAAAATGTCAAAACGACTGCTA
+AACTTATTGTTGAAGGTGCAGTTGTTGAGTTTGTGAAAGAACTTCAGGACATAGAAGTTCCAGAATCATA
+TTCAGGAGAATTAGAGTGCATTGTATCCCCAGAAAATATAGAAGGAAAATGGTATCATAATGATGTGGAG
+CTTAAATCCAATGGCAAATATACAATTACATCTCGTCGTGGACGTCAGAACCTCACGGTCAAGGATGTAA
+CCAAGGAGGACCAGGGAGAATACAGCTTTGTCATCGACGGGAAAAAGACAACCTGTAAATTAAAGATGAA
+ACCCCGCCCCATTGCTATCCTACAAGGACTTAGTGACCAAAAAGTCTGTGAGGGTGACATTGTTCAGCTT
+GAAGTTAAAGTCTCCTTGGAAAGTGTGGAAGGCGTCTGGATGAAAGACGGCCAAGAAGTGCAGCCCAGTG
+ACAGGGTTCACATTGTGATAGACAAACAATCTCATATGCTGCTCATTGAAGACATGACTAAGGAAGATGC
+TGGAAATTACTCTTTCACCATTCCAGCCCTTGGCCTCTCCACCAGTGGGCGTGTCTCTGTCTATAGTGTG
+GACGTGATAACACCTCTAAAAGATGTTAATGTGATTGAAGGCACCAAGGCTGTGCTTGAATGTAAGGTGT
+CAGTCCCTGATGTGACTTCTGTTAAGTGGTACTTAAATGATGAACAAATCAAGCCTGATGACCGTGTACA
+GGCCATTGTGAAAGGTACTAAACAGCGACTAGTCATTAACCGAACTCATGCTTCAGACGAAGGACCTTAC
+AAGCTGATAGTTGGCAGAGTTGAAACCAACTGTAATCTCTCTGTAGAAAAAATTAAAATTATCAGAGGTC
+TTCGTGACCTTACCTGTACAGAAACTCAAAATGTGGTGTTTGAGGTTGAGCTGTCCCACTCTGGAATTGA
+TGTCCTGTGGAATTTTAAGGACAAGGAAATCAAGCCCAGTTCTAAATATAAAATTGAAGCACATGGAAAA
+ATATATAAATTGACAGTTCTAAATATGATGAAAGATGATGAAGGAAAATACACATTTTACGCGGGAGAAA
+ATATCACATCTGGAAAACTTACTGTGGCAGGTGGGGCCATCTCCAAGCCACTCACAGATCAGACCGTAGC
+TGAATCCCAGGAAGCTGTGTTTGAATGTGAAGTTGCCAACCCAGATTCCAAAGGCGAATGGTTGAGGGAT
+GGCAAACACCTACCACTGACTAACAACATCAGAAGTGAGTCTGATGGCCACAAAAGGAGACTTATCATTG
+CTGCCACCAAATTAGATGACATTGGAGAATATACCTACAAGGTGGCCACCTCCAAAACATCTGCCAAACT
+CAAAGTTGAAGCTGTCAAAATTAAGAAGACTCTGAAGAACCTCACAGTGACAGAAACACAGGATGCTGTT
+TTCACTGTCGAGCTTACACACCCTAATGTCAAAGGTGTCCAGTGGATCAAAAATGGAGTTGTGCTGGAAT
+CCAATGAAAAGTATGCTATCTCTGTCAAAGGAACAATTTACTCTCTGAGGATTAAAAACTGTGCCATCGT
+GGATGAGTCTGTTTATGGCTTCAGGCTTGGAAGGCTTGGAGCCAGTGCCAGACTGCACGTGGAGACTGTC
+AAGATCATTAAAAAGCCAAAGGATGTGACAGCCTTGGAAAATGCCACTGTTGCCTTTGAAGTTAGTGTTT
+CCCATGACACTGTTCCAGTAAAATGGTTCCATAAGAGTGTGGAAATTAAGCCAAGTGACAAACACAGACT
+GGTCTCAGAAAGGAAAGTCCACAAGCTGATGCTGCAGAACATCTCCCCCTCAGATGCTGGGGAATACACA
+GCTGTGGTCGGGCAATTGGAATGCAAAGCAAAACTGTTTGTGGAGACATTACATATTACAAAAACCATGA
+AAAATATCGAGGTGCCTGAGACCAAAACTGCCTCTTTTGAGTGTGAGGTGTCCCACTTCAATGTCCCTTC
+CATGTGGCTGAAGAATGGTGTGGAAATTGAGATGAGTGAAAAGTTCAAGATAGTTGTGCAGGGAAAACTC
+CATCAGCTGATCATCATGAACACCAGCACAGAGGACTCGGCAGAATACACATTTGTCTGTGGCAATGACC
+AAGTCAGTGCCACCCTGACAGTCACCCCAATCATGATTACTTCCATGCTGAAAGACATCAACGCTGAAGA
+AAAAGACACTATTACTTTTGAGGTGACAGTGAACTATGAAGGCATCTCTTACAAATGGTTAAAGAATGGT
+GTGGAAATCAAATCAACTGACAAGTGCCAGATGAGAACCAAAAAGCTCACACACTCACTGAACATCAGGA
+ATGTTCACTTTGGGGATGCTGCTGACTACACCTTTGTGGCTGGAAAAGCAACATCAACAGCCACACTTTA
+TGTGGAAGCTCGTCATATAGAATTTAGGAAACACATTAAGGACATTAAGGTACTGGAGAAGAAGCGAGCC
+ATGTTTGAATGTGAAGTTTCTGAACCTGACATCACTGTACAGTGGATGAAAGATGACCAGGAACTGCAGA
+TCACAGACAGAATAAAGATTCAGAAGGAGAAATATGTCCACCGCCTTCTGATCCCATCCACCCGGATGTC
+TGATGCTGGGAAGTACACAGTGGTGGCAGGAGGCAACGTGTCAACTGCAAAACTCTTTGTAGAAGGCAGA
+GATGTTCGCATCCGAAGTATTAAAAAGGAGGTTCAGGTCATTGAGAAACAGCGTGCTGTTGTTGAATTTG
+AGGTCAATGAAGACGATGTTGATGCCCACTGGTATAAAGATGGCATTGAAATCAATTTCCAAGTTCAAGA
+ACGACACAAATATGTAGTGGAAAGAAGAATCCACCGAATGTTTATCTCTGAGACCAGACAGAGCGATGCA
+GGAGAATACACCTTTGTGGCAGGAAGGAACAGGAGTTCTGTCACTCTCTATGTCAATGCTCCTGAACCGC
+CCCAAGTTCTGCAGGAGCTCCAGCCTGTCACTGTGCAGTCTGGCAAGCCTGCCCGCTTCTGTGCCGTGAT
+ATCCGGAAGACCACAGCCCAAAATTTCCTGGTACAAGGAAGAGCAGCTGCTTTCCACTGGCTTCAAGTGC
+AAATTTCTTCATGATGGGCAAGAGTACACGCTTTTGCTAATTGAAGCCTTCCCAGAGGATGCGGCAGTCT
+ATACCTGTGAAGCCAAGAATGACTATGGTGTTGCCACAACATCAGCTTCACTCTCAGTGGAAGTTCCAGA
+AGTTGTGTCTCCTGATCAGGAAATGCCTGTTTATCCACCTGCCATCATCACCCCGCTTCAGGACACTGTC
+ACTTCTGAAGGGCAGCCAGCCCGTTTTCAATGCCGGGTTTCTGGAACAGATCTAAAAGTGTCGTGGTACA
+GCAAAGACAAGAAAATCAAGCCATCTCGGTTCTTTAGAATGACTCAATTTGAAGACACTTATCAACTGGA
+AATTGCCGAAGCTTATCCAGAAGATGAAGGAACTTACACGTTTGTTGCTAGTAATGCTGTAGGCCAAGTA
+TCAAGCACAGCCAACCTGAGTCTGGAAGCTCCTGAATCAATTTTGCATGAGAGGATTGAACAAGAGATTG
+AGATGGAAATGAAAGAGTTTTCTAGTTCTTTTCTGTCTGCCGAGGAAGAAGGACTTCATAGCGCCGAACT
+TCAATTATCTAAAATAAATGAAACACTTGAACTTTTGTCTGAATCTCCAGTTTACTCAACTAAATTTGAT
+TCCGAAAAGGAAGGCACTGGCCCAATTTTCATCAAAGAAGTGTCAAATGCTGATATAAGCATGGGGGATG
+TGGCTACACTGTCTGTAACTGTCATTGGCATCCCCAAACCTAAAATTCAGTGGTTCTTTAATGGAGTGCT
+ATTAACCCCTTCTGCTGACTACAAATTTGTTTTTGACGGTGATGATCATAGCCTGATCATTCTGTTCACC
+AAATTGGAGGATGAGGGAGAGTATACATGTATGGCCAGTAATGACTATGGAAAGACAATATGTAGTGCCT
+ATCTAAAAATTAATTCCAAAGGAGAGGGTCACAAAGACACTGAAACAGAATCAGCAGTGGCAAAATCTCT
+GGAAAAGCTGGGAGGTCCTTGTCCTCCTCACTTCCTTAAGGAGTTAAAACCAATTCGCTGTGCTCAAGGG
+CTTCCTGCCATCTTTGAGTACACAGTGGTTGGAGAGCCTGCCCCTACTGTTACATGGTTCAAAGAAAACA
+AGCAGCTTTGCACCAGTGTTTATTACACTATCATTCATAACCCTAATGGCTCTGGAACTTTCATTGTCAA
+TGACCCTCAGAGGGAAGACAGTGGCCTCTATATCTGTAAAGCAGAGAATATGTTGGGTGAGTCCACCTGT
+GCAGCAGAGCTGCTTGTGCTTCTGGAAGACACAGACATGACTGATACCCCCTGCAAAGCAAAGTCCACAC
+CAGAGGCTCCTGAGGATTTTCCACAGACACCCTTAAAGGGTCCCGCAGTTGAAGCACTTGACTCAGAGCA
+GGAAATTGCAACGTTTGTAAAAGACACCATTTTGAAAGCTGCTTTAATTACAGAAGAAAACCAGCAACTA
+TCTTATGAGCATATTGCTAAAGCCAATGAATTGAGCAGTCAGCTTCCTTTGGGAGCTCAGGAATTGCAAT
+CCATTTTGGAGCAAGACAAGCTCACTCCTGAAAGCACCAGGGAATTTCTTTGCATCAATGGCAGTATTCA
+CTTTCAGCCTCTCAAGGAACCATCTCCCAACCTACAGCTGCAGATTGTACAGTCCCAGAAAACCTTCTCC
+AAAGAAGGTATTCTAATGCCTGAAGAGCCTGAGACACAGGCAGTTCTATCAGATACCGAGAAAATCTTCC
+CAAGTGCCATGTCCATAGAACAAATTAATTCATTAACAGTTGAGCCTCTGAAAACTTTATTAGCTGAACC
+TGAAGGGAATTATCCACAGTCTTCAATAGAACCTCCAATGCATTCTTATCTAACCTCTGTGGCTGAGGAA
+GTACTTTCACCAAAAGAAAAGACAGTATCTGACACCAACAGAGAGCAAAGAGTGACTCTTCAAAAGCAAG
+AGGCACAAAGTGCGCTCATCTTGAGTCAGAGCTTAGCTGAGGGACACGTGGAGAGTCTCCAGAGTCCTGA
+TGTCATGATCTCTCAGGTAAACTATGAGCCCCTAGTCCCTTCAGAACACTCATGCACAGAAGGAGGTAAA
+ATTTTGATAGAAAGTGCAAATCCACTGGAAAATGCAGGGCAAGATTCTGCGGTCAGAATTGAGGAAGGCA
+AGTCCTTAAGATTTCCACTAGCACTTGAAGAAAAGCAGGTACTGCTCAAAGAAGAGCATTCTGACAACGT
+GGTGATGCCCCCAGACCAAATCATTGAGTCTAAAAGAGAGCCCGTGGCAATAAAGAAAGTGCAGGAGGTA
+CAGGGAAGGGACCTTCTTTCTAAGGAAAGCTTGCTTTCTGGTATTCCAGAAGAGCAGAGATTAAACCTGA
+AAATTCAAATCTGCCGGGCTTTGCAAGCAGCCGTGGCCAGCGAGCAGCCAGGTCTTTTCTCTGAGTGGCT
+AAGAAATATTGAAAAGGTGGAGGTCGAGGCTGTAAACATCACCCAAGAGCCCAGACACATCATGTGCATG
+TACCTTGTTACTTCGGCAAAGTCTGTAACAGAAGAAGTAACCATCATTATTGAAGATGTTGATCCTCAAA
+TGGCTAACCTGAAAATGGAACTTAGGGATGCTTTGTGTGCTATTATATATGAGGAAATAGACATCCTAAC
+AGCTGAGGGTCCTAGAATTCAGCAAGGAGCCAAAACAAGTTTGCAAGAAGAAATGGATTCTTTTTCAGGT
+TCACAGAAGGTTGAACCCATTACTGAACCAGAAGTTGAATCTAAATATCTGATCTCAACTGAAGAGGTCA
+GTTATTTTAACGTGCAAAGTAGGGTTAAATATTTGGATGCCACACCTGTCACTAAAGGGGTTGCTTCAGC
+TGTTGTCTCTGACGAAAAACAAGATGAGAGTCTGAAACCATCAGAGGAAAAAGAGGAGTCTTCCTCTGAA
+AGTGGTACTGAGGAGGTTGCTACAGTAAAGATACAGGAAGCTGAGGGTGGCTTAATCAAAGAGGATGGCC
+CCATGATACATACACCTTTAGTGGACACTGTTTCTGAGGAAGGTGATATTGTACACCTCACAACATCCAT
+AACAAATGCTAAAGAGGTGAATTGGTATTTTGAGAATAAACTGGTGCCTTCAGATGAAAAGTTCAAGTGT
+TTACAAGATCAAAATACATATACGCTAGTCATCGACAAAGTAAATACCGAAGACCATCAAGGAGAGTATG
+TCTGTGAGGCCTTGAATGACAGCGGAAAAACAGCAACTTCAGCCAAACTCACTGTAGTAAAAAGAGCTGC
+CCCAGTGATCAAGAGGAAAATCGAACCCCTGGAAGTAGCACTGGGCCACCTAGCCAAATTCACCTGTGAG
+ATCCAAAGTGCTCCCAATGTCCGGTTCCAGTGGTTTAAAGCTGGCCGAGAAATTTATGAGAGTGACAAGT
+GTTCTATTCGATCTTCAAAGTATATCTCCAGCCTTGAAATCCTGAGAACCCAGGTGGTTGACTGCGGCGA
+GTATACATGCAAAGCTTCCAATGAGTATGGCAGTGTCAGCTGTACAGCCACACTAACTGTGACAGTGCCT
+GGAGGTGAAAAGAAAGTTCGCAAATTACTTCCGGAACGTAAACCTGAACCAAAGGAAGAAGTTGTTCTGA
+AAAGCGTTCTAAGAAAAAGACCTGAAGAAGAAGAACCTAAAGTAGAACCTAAAAAACTAGAAAAAGTTAA
+AAAACCTGCAGTACCAGAACCACCACCTCCAAAACCTGTTGAAGAGGTTGAAGTACCTACTGTTACAAAA
+AGGGAAAGGAAGATTCCTGAACCAACAAAAGTGCCTGAAATCAAGCCAGCAATACCTCTCCCTGCACCTG
+AACCGAAACCAAAGCCCGAAGCAGAAGTGAAAACAATCAAACCACCTCCTGTGGAACCTGAACCAACCCC
+CATCGCTGCCCCAGTAACAGTGCCAGTGGTTGGAAAGAAAGCAGAAGCCAAAGCACCTAAGGAAGAGGCT
+GCCAAGCCAAAAGGTCCTATCAAAGGTGTACCCAAAAAGACTCCTTCACCAATAGAAGCCGAAAGGAGAA
+AGTTAAGGCCAGGAAGTGGTGGAGAGAAACCTCCTGATGAAGCCCCGTTCACCTACCAGCTAAAGGCTGT
+GCCACTGAAGTTTGTGAAAGAAATCAAAGACATCATCTTGACAGAATCAGAGTTCGTTGGCTCTTCAGCA
+ATCTTTGAATGTTTGGTCTCCCCTTCCACTGCAATTACAACCTGGATGAAAGACGGTAGCAATATCCGTG
+AGAGTCCCAAGCACAGGTTTATTGCAGATGGTAAAGACAGAAAGCTGCACATCATTGATGTTCAACTTTC
+CGATGCTGGTGAATACACCTGTGTTTTACGTTTGGGAAACAAAGAAAAGACCTCCACGGCTAAACTTGTT
+GTAGAAGAACTTCCTGTGCGTTTTGTAAAAACACTGGAAGAGGAAGTCACAGTGGTCAAAGGACAGCCAT
+TGTACTTGAGCTGCGAGTTAAACAAAGAGCGTGACGTGGTCTGGAGGAAGGATGGCAAGATTGTGGTGGA
+GAAACCTGGCCGAATTGTGCCAGGCGTCATTGGCTTGATGCGGGCTCTGACCATCAACGATGCAGATGAC
+ACAGATGCTGGAACATACACAGTTACTGTGGAAAACGCCAACAACCTGGAGTGTTCATCTTGCGTAAAAG
+TAGTAGAAGTCATTAGAGATTGGCTGGTGAAACCTATACGAGACCAGCATGTGAAACCCAAGGGGACAGC
+TATTTTTGCCTGTGATATAGCAAAAGATACTCCAAACATTAAGTGGTTCAAAGGATATGATGAAATCCCT
+GCGGAACCAAATGATAAGACTGAAATACTGAGAGATGGAAATCATCTGTACCTCAAAATTAAGAATGCTA
+TGCCAGAAGATATTGCTGAGTATGCAGTGGAAATTGAAGGAAAAAGATACCCTGCAAAGCTGACACTTGG
+AGAGCGTGAAGTTGAACTGCTTAAACCAATAGAGGACGTTACCATTTATGAGAAAGAAAGTGCAAGCTTT
+GATGCAGAAATCTCAGAGGCAGACATTCCTGGACAATGGAAACTGAAAGGAGAACTTCTAAGGCCCTCAC
+CTACTTGTGAAATCAAAGCAGAAGGTGGAAAACGCTTCTTAACTTTGCACAAAGTCAAACTGGACCAAGC
+TGGTGAAGTCCTCTACCAGGCCCTTAATGCAATTACAACTGCCATTTTGACAGTAAAAGAAATCGAACTT
+GACTTTGCTGTGCCCCTGAAGGATGTCACTGTTCCAGAAAGGCGACAGGCTCGATTCGAATGTGTCCTCA
+CCCGAGAGGCAAATGTTATATGGTCCAAAGGACCTGATATAATTAAGTCATCTGACAAATTTGATATCAT
+CGCTGATGGAAAGAAACATATTCTTGTTATTAATGATTCTCAATTTGATGATGAAGGGGTCTATACTGCT
+GAGGTGGAGGGCAAGAAGACCTCAGCTCGGTTGTTTGTCACAGGTATAAGACTGAAATTCATGTCACCTC
+TTGAAGATCAAACAGTAAAAGAAGGTGAAACAGCAACTTTTGTTTGTGAACTTTCTCATGAAAAAATGCA
+TGTAGTCTGGTTCAAAAATGATGCCAAACTCCATACAAGCAGAACAGTACTCATCTCTTCTGAGGGCAAG
+ACTCACAAATTGGAAATGAAAGAAGTGACATTGGATGATATATCTCAGATAAAAGCTCAAGTCAAGGAGC
+TGAGCTCCACAGCACAGCTGAAGGTCTTAGAGGCCGATCCCTACTTCACTGTGAAATTACATGACAAAAC
+TGCAGTGGAGAAGGATGAGATTACTTTGAAGTGTGAAGTGAGCAAAGATGTACCAGTGAAATGGTTCAAA
+GATGGTGAAGAGATTGTCCCTTCACCCAAATATTCTATCAAGGCAGATGGCCTGCGCCGCATCTTAAAAA
+TCAAAAAGGCGGACCTTAAAGATAAAGGCGAATATGTGTGTGACTGTGGCACAGACAAGACCAAGGCAAA
+TGTTACTGTTGAGGCTCGACTAATAAAAGTGGAAAAGCCTCTGTACGGAGTAGAGGTGTTTGTTGGTGAA
+ACAGCCCACTTTGAAATTGAACTTTCTGAACCTGATGTTCACGGCCAGTGGAAGCTGAAAGGACAGCCTT
+TGACAGCTTCCCCTGACTGTGAAATCATTGAGGATGGAAAGAAGCATATTCTGATCCTTCATAACTGTCA
+GCTGGGTATGACAGGAGAGGTTTCCTTCCAGGCTGCTAATGCCAAATCTGCAGCCAATCTGAAAGTGAAA
+GAATTGCCTCTTATCTTCATCACACCTCTCAGTGATGTTAAAGTCTTCGAGAAAGATGAGGCTAAGTTTG
+AGTGTGAAGTATCCAGGGAGCCCAAAACATTCCGTTGGCTAAAAGGAACCCAGGAAATCACAGGTGATGA
+CAGATTTGAGCTTATAAAGGATGGCACTAAGCATTCAATGGTGATCAAGTCAGCTGCTTTTGAAGATGAA
+GCAAAATACATGTTTGAAGCTGAAGATAAGCACACAAGTGGCAAACTGATCATTGAAGGAATCCGGCTCA
+AATTCCTCACCCCTCTCAAAGATGTAACTGCCAAAGAGAAGGAAAGTGCTGTATTTACTGTGGAGTTATC
+TCATGATAACATCCGAGTTAAATGGTTCAAGAATGACCAGCGCCTACACACCACCAGGTCGGTCTCAATG
+CAAGACGAAGGGAAAACTCATTCGATCACATTCAAAGACCTGTCTATTGATGACACCTCCCAAATTAGAG
+TAGAAGCTATGGGGATGAGTTCAGAAGCTAAACTCACTGTGCTTGAGGGAGACCCATATTTTACAGGAAA
+ACTTCAAGATTATACTGGTGTAGAGAAAGATGAAGTTATTCTACAGTGTGAAATTAGCAAAGCAGATGCA
+CCAGTGAAATGGTTTAAGGATGGGAAGGAAATAAAGCCATCCAAAAATGCTGTTATTAAGGCAGATGGCA
+AGAAACGCATGCTAATCCTAAAGAAAGCCTTGAAATCAGATATTGGACAGTACACCTGTGACTGTGGGAC
+AGATAAGACCTCAGGAAAACTTGACATTGAGGATCGGGAAATTAAACTGGTGCGACCCCTGCACAGTGTG
+GAGGTGATGGAGACTGAGACAGCACGCTTTGAAACCGAAATCTCTGAAGATGATATCCACGCCAACTGGA
+AACTCAAGGGAGAGGCCCTACTCCAAACACCTGATTGTGAAATTAAGGAAGAAGGCAAAATACACTCCCT
+TGTTTTGCACAACTGTCGCCTGGACCAGACGGGTGGGGTGGATTTCCAAGCTGCCAATGTTAAATCTAGT
+GCCCACCTCCGAGTTAAGCCACGAGTAATTGGTCTTCTGAGGCCTTTAAAGGATGTCACCGTGACTGCAG
+GGGAAACAGCCACCTTCGACTGCGAGCTCTCCTACGAAGATATCCCAGTGGAATGGTATCTCAAAGGGAA
+GAAACTAGAGCCCAGCGATAAGGTGGTCCCACGTTCAGAAGGAAAAGTTCATACACTTACTCTGAGGGAT
+GTAAAGTTAGAAGATGCTGGGGAAGTCCAACTAACAGCAAAAGATTTCAAAACTCACGCCAACCTCTTTG
+TGAAAGAACCCCCAGTTGAATTCACTAAGCCTCTTGAGGACCAGACGGTCGAAGAGGGAGCCACTGCAGT
+GCTGGAGTGTGAAGTCTCCAGAGAAAATGCTAAGGTGAAATGGTTCAAAAATGGGACAGAAATCCTCAAA
+AGCAAGAAGTATGAAATTGTTGCTGATGGCAGGGTCAGAAAACTTGTTATACATGACTGTACCCCAGAGG
+ATATTAAAACATACACTTGTGATGCTAAGGATTTTAAGACTTCCTGTAACCTGAATGTCGTGCCTCCTCA
+TGTGGAATTCTTAAGACCACTCACCGACCTTCAAGTTAGAGAAAAAGAAATGGCTCGATTTGAGTGTGAA
+CTTTCCCGAGAAAATGCTAAGGTTAAGTGGTTTAAAGATGGTGCTGAAATTAAAAAGGGCAAAAAATATG
+ACATCATATCCAAGGGAGCAGTGCGCATTCTTGTCATCAACAAATGTCTACTGGATGATGAAGCTGAATA
+TTCCTGTGAAGTAAGGACAGCGAGAACTTCTGGCATGCTGACAGTTCTGGAAGAAGAAGCTGTCTTTACC
+AAAAATCTTGCCAACATTGAAGTTAGTGAAACAGACACTATAAAACTGGTTTGTGAAGTCTCCAAACCTG
+GCGCAGAAGTGATTTGGTATAAAGGGGATGAGGAGATCATTGAAACAGGAAGATATGAAATACTGACTGA
+AGGACGGAAGAGAATCCTGGTCATTCAGAACGCTCACCTTGAGGATGCTGGCAACTACAACTGTCGACTC
+CCAAGCTCTCGAACCGATGGCAAAGTCAAAGTACATGAACTGGCTGCTGAATTTATCTCAAAGCCTCAAA
+ACCTTGAAATACTTGAAGGAGAAAAGGCTGAATTTGTCTGCTCTATATCAAAAGAAAGCTTTCCAGTCCA
+GTGGAAGAGGGATGATAAGACACTTGAATCTGGAGATAAATATGACGTTATTGCTGATGGTAAAAAGAGG
+GTCCTAGTTGTGAAAGATGCCACATTACAAGATATGGGCACTTACGTTGTCATGGTAGGGGCCGCCAGAG
+CAGCAGCTCACTTGACAGTCATTGAAAAACTCAGGATCGTAGTTCCTCTTAAGGACACCCGGGTGAAGGA
+ACAACAGGAAGTTGTCTTCAACTGTGAAGTCAATACTGAAGGTGCCAAAGCCAAATGGTTCAGAAATGAA
+GAAGCTATATTTGATAGTTCAAAATACATCATTCTCCAAAAAGACCTAGTCTACACCCTCAGAATTAGAG
+ATGCACACTTAGATGACCAAGCCAACTATAATGTGTCTTTGACCAATCACAGAGGTGAAAATGTTAAAAG
+TGCAGCCAATCTAATAGTAGAAGAGGAAGACCTTAGGATTGTTGAGCCTCTTAAAGATATTGAAACAATG
+GAGAAGAAATCTGTCACATTCTGGTGCAAGGTGAATCGTCTCAATGTAACACTGAAGTGGACCAAAAATG
+GTGAAGAAGTGCCTTTTGACAACCGTGTCTCATACAGAGTTGATAAGTACAAGCACATGTTAACCATTAA
+AGACTGTGGCTTCCCAGATGAAGGTGAATACATTGTCACTGCTGGACAAGATAAATCTGTTGCTGAGCTT
+CTCATCATAGAAGCCCCGACAGAATTTGTGGAACACTTGGAAGATCAGACAGTCACTGAGTTCGATGACG
+CTGTCTTCTCCTGCCAGCTCTCCAGAGAGAAAGCCAATGTAAAATGGTACAGAAATGGGAGAGAAATCAA
+AGAAGGCAAAAAATACAAATTTGAAAAAGATGGAAGTATACACAGACTCATTATAAAAGATTGCAGGCTG
+GATGATGAGTGTGAATATGCTTGCGGGGTAGAAGACAGGAAGTCTCGTGCTAGACTTTTTGTGGAAGAAA
+TTCCTGTTGAGATCATCAGGCCTCCACAAGATATTCTTGAAGCCCCTGGTGCTGATGTTGTCTTTTTAGC
+AGAACTCAATAAAGATAAGGTGGAAGTCCAATGGCTAAGAAATAACATGGTTGTTGTCCAGGGTGATAAA
+CACCAGATGATGAGTGAAGGAAAGATACATCGACTACAGATTTGTGATATTAAGCCCCGTGACCAGGGTG
+AATACAGATTTATTGCCAAAGACAAAGAAGCCAGAGCTAAGCTTGAACTGGCAGCTGCACCAAAAATCAA
+GACAGCTGACCAAGACCTTGTGGTTGATGTTGGCAAGCCTCTGACAATGGTGGTGCCATATGATGCCTAC
+CCCAAAGCAGAAGCTGAATGGTTTAAAGAAAATGAACCTTTATCTACAAAAACCATTGATACTACGGCTG
+AACAAACTTCTTTCAGAATTTTAGAAGCCAAGAAAGGAGACAAAGGGAGGTATAAAATTGTGCTTCAGAA
+CAAACATGGAAAAGCAGAAGGATTCATCAATTTAAAAGTTATTGATGTTCCTGGGCCAGTACGTAACTTA
+GAAGTGACAGAAACATTTGATGGTGAAGTGAGCCTTGCTTGGGAAGAACCTTTAACTGATGGTGGAAGCA
+AAATCATAGGTTACGTTGTTGAAAGACGTGACATTAAGAGAAAGACCTGGGTTCTGGCCACAGACCGTGC
+AGAGAGTTGTGAGTTTACTGTCACTGGTCTACAGAAAGGAGGAGTTGAGTACCTATTCCGTGTGAGTGCA
+AGAAACAGAGTTGGCACTGGTGAGCCAGTAGAAACTGACAATCCTGTAGAAGCAAGGAGTAAATATGATG
+TTCCAGGCCCTCCTTTGAATGTAACCATCACTGATGTGAATCGATTTGGTGTCTCACTGACATGGGAACC
+ACCAGAGTATGATGGAGGTGCTGAGATCACAAACTACGTCATTGAATTAAGAGACAAGACTTCTATCAGG
+TGGGATACTGCCATGACTGTGAGAGCTGAAGACCTGTCTGCAACTGTTACTGATGTGGTAGAAGGACAGG
+AGTACAGTTTCCGAGTGAGAGCCCAAAATCGAATTGGAGTTGGAAAACCAAGTGCAGCCACACCCTTCGT
+CAAAGTTGCTGATCCAATTGAGAGACCAAGTCCTCCTGTAAACCTAACTTCCTCAGATCAGACTCAGTCA
+TCAGTTCAGCTCAAATGGGAACCTCCTCTGAAAGATGGAGGAAGCCCAATATTAGGCTATATAATTGAGC
+GATGCGAAGAAGGAAAAGATAATTGGATTCGTTGCAATATGAAACTTGTCCCTGAACTGACTTACAAGGT
+TACCGGATTGGAAAAAGGAAATAAATATTTATATAGAGTATCTGCAGAAAATAAAGCTGGTGTTTCAGAT
+CCATCTGAAATTCTTGGTCCTCTCACCGCTGACGATGCATTTGTTGAACCAACAATGGATTTAAGTGCAT
+TTAAAGATGGTCTGGAAGTTATTGTCCCAAATCCTATCACGATCCTGGTTCCAAGTACAGGCTATCCAAG
+GCCAACTGCAACCTGGTGTTTTGGAGATAAAGTACTAGAAACAGGGGACCGGGTGAAAATGAAGACCTTG
+TCTGCCTATGCCGAACTTGTCATTTCTCCAAGTGAACGTTCAGACAAGGGCATTTATACACTGAAATTAG
+AAAACCGTGTGAAAACAATTTCTGGGGAAATTGATGTCAATGTAATTGCTCGCCCAAGTGCACCCAAAGA
+ATTGAAATTTGGTGATATAACCAAGGACTCAGTACATTTGACTTGGGAACCACCTGATGATGATGGAGGA
+AGTCCGTTAACTGGATACGTTGTTGAAAAACGAGAAGTCAGCCGGAAAACATGGACTAAAGTTATGGACT
+TTGTGACTGATCTAGAATTCACAGTTCCTGATCTTGTTCAAGGAAAAGAGTACTTATTTAAAGTTTGTGC
+TCGTAACAAATGTGGCCCTGGAGAACCTGCATATGTTGATGAACCTGTAAATATGTCAACTCCTGCAACG
+GTACCTGACCCACCAGAGAATGTTAAATGGAGAGATCGAACAGCCAATAGCATCTTCTTAACATGGGATC
+CACCTAAAAATGATGGTGGTTCACGCATCAAAGGATATATAGTTGAAAGATGTCCACGTGGTTCTGATAA
+ATGGGTTGCCTGTGGAGAACCTGTTGCAGAAACAAAAATGGAAGTGACAGGTCTTGAGGAAGGCAAATGG
+TATGCCTACCGCGTGAAGGCCTTAAACAGGCAGGGTGCTAGCAAACCAAGCAGACCCACAGAGGAAATCC
+AGGCTGTGGACACACAAGAGGCCCCAGAAATCTTCCTCGATGTGAAGCTCCTTGCTGGTCTCACTGTAAA
+AGCTGGGACCAAGATTGAACTTCCTGCCACCGTAACCGGAAAACCTGAACCTAAAATAACTTGGACAAAG
+GCTGATATGATTCTGAAGCAGGACAAAAGAATTACCATTGAAAATGTCCCTAAGAAATCCACAGTGACTA
+TTGTTGATAGTAAGAGAAGTGACACTGGCACATATATCATTGAGGCTGTGAATGTGTGTGGCCGGGCCAC
+TGCTGTGGTGGAAGTGAACGTCTTAGATAAACCCGGACCACCAGCTGCCTTTGACATCACAGATGTAACC
+AATGAGTCATGTCTTCTAACATGGAACCCACCACGCGATGATGGTGGATCTAAGATCACAAACTATGTTG
+TGGAGAGACGAGCAACTGATAGTGAAGTGTGGCACAAGCTCTCATCCACCGTCAAGGATACAAACTTCAA
+GGCCACCAAATTAATCCCCAATAAAGAGTACATCTTCAGAGTTGCTGCAGAAAACATGTATGGTGTTGGT
+GAACCAGTTCAGGCCTCTCCAATAACAGCCAAATATCAGTTTGATCCACCTGGTCCTCCAACTCGCCTAG
+AACCTTCTGATATCACTAAAGACGCAGTGACTCTCACATGGTGTGAGCCAGATGATGATGGTGGCAGCCC
+AATCACAGGATACTGGGTTGAAAGACTGGATCCTGATACAGATAAATGGGTTAGATGCAATAAGATGCCA
+GTAAAGGACACAACATACAGAGTGAAAGGTCTCACTAATAAGAAAAAATACAGATTCCGTGTGTTGGCTG
+AAAATCTTGCTGGACCTGGAAAACCAAGCAAATCAACTGAACCAATCTTAATAAAGGATCCCATAGATCC
+TCCATGGCCCCCTGGAAAACCAACTGTAAAAGATGTAGGCAAAACATCAGTAAGGTTGAATTGGACAAAA
+CCAGAACATGATGGAGGTGCAAAGATTGAGTCTTATGTCATTGAAATGCTGAAGACTGGAACAGATGAGT
+GGGTCAGAGTGGCGGAAGGGGTTCCCACCACTCAGCACTTGCTCCCAGGGCTCATGGAAGGACAGGAATA
+CTCATTCCGAGTTAGAGCTGTGAATAAGGCTGGGGAAAGTGAACCCAGTGAACCCAGTGACCCTGTGCTT
+TGCCGGGAGAAGCTATATCCTCCATCACCACCACGCTGGCTTGAAGTTATTAATATCACAAAAAATACAG
+CAGACCTAAAATGGACAGTTCCTGAGAAAGATGGAGGGTCCCCCATCACCAACTACATTGTGGAAAAGAG
+AGACGTCAGGCGAAAAGGCTGGCAAACAGTGGATACCACTGTCAAGGACACCAAGTGCACAGTCACCCCA
+CTGACTGAGGGCTCTTTATATGTGTTCCGAGTTGCTGCAGAAAATGCTATAGGACAAAGCGACTACACCG
+AAATTGAGGACTCTGTGCTGGCCAAAGACACCTTTACCACTCCTGGACCACCCTACGCCCTGGCAGTGGT
+TGATGTGACAAAACGACATGTTGACCTAAAGTGGGAGCCACCTAAAAATGATGGTGGAAGACCAATACAG
+AGATATGTCATTGAGAAGAAAGAAAGGTTAGGTACCCGTTGGGTGAAAGCTGGAAAGACTGCAGGACCTG
+ACTGTAACTTCAGAGTAACTGATGTCATCGAAGGAACAGAGGTCCAGTTTCAGGTTCGGGCTGAAAATGA
+AGCTGGAGTTGGCCACCCAAGTGAACCCACAGAAATCCTATCCATTGAAGATCCAACAAGTCCTCCCTCA
+CCACCCCTTGACCTACATGTGACTGATGCTGGGAGAAAACACATTGCCATTGCTTGGAAGCCTCCAGAGA
+AAAATGGTGGAAGTCCTATCATAGGATACCATGTTGAAATGTGTCCAGTAGGCACTGAGAAATGGATGAG
+AGTTAATTCTCGCCCAATAAAGGACTTGAAATTCAAGGTTGAAGAAGGTGTTGTTCCTGACAAAGAATAT
+GTCCTGAGAGTGAGAGCAGTCAATGCTATTGGTGTCAGCGAGCCATCTGAAATCTCTGAAAATGTGGTTG
+CCAAAGACCCAGACTGCAAGCCAACAATTGACCTGGAGACTCATGACATTATTGTTATTGAAGGTGAAAA
+GTTAAGCATTCCTGTTCCCTTCAGAGCTGTCCCAGTTCCAACTGTTAGTTGGCATAAAGATGGCAAAGAA
+GTTAAAGCAAGTGATAGATTAACAATGAAGAATGATCACATCTCTGCACACCTTGAAGTTCCCAAGAGTG
+TCCGTGCAGATGCCGGAATTTATACCATTACACTGGAGAATAAGCTCGGCTCAGCAACAGCCTCAATCAA
+TGTCAAAGTCATAGGCCTACCTGGACCATGCAAAGATATTAAAGCAAGTGACATTACCAAGAGTTCTTGT
+AAGTTAACTTGGGAACCTCCAGAATTTGATGGTGGAACCCCAATTCTTCATTATGTCCTGGAGCGCAGAG
+AAGCTGGGAGGAGAACATATATACCAGTCATGTCTGGTGAGAACAAACTGTCATGGACTGTGAAGGATCT
+CATACCAAATGGTGAATACTTCTTCCGTGTTAAAGCAGTCAACAAGGTTGGTGGAGGAGAATATATTGAA
+CTGAAAAATCCAGTCATTGCTCAAGATCCAAAGCAACCCCCTGATCCACCTGTAGATGTAGAGGTTCATA
+ATCCTACAGCGGAGGCAATGACTATTACATGGAAGCCACCTTTGTATGATGGAGGGAGCAAGATAATGGG
+CTACATCATAGAGAAGATTGCTAAGGGTGAAGAAAGGTGGAAGAGATGCAATGAACACCTGGTACCAATC
+CTGACCTATACAGCAAAAGGACTTGAAGAGGGGAAAGAGTACCAATTCCGTGTGCGAGCAGAGAACGCCG
+CGGGTATTAGTGAACCTTCTCGGGCTACTCCTCCAACCAAAGCTGTAGATCCCATTGATGCCCCCAAAGT
+CATTCTGAGAACAAGCCTAGAAGTGAAACGAGGTGATGAAATAGCACTTGATGCAAGTATTTCTGGATCA
+CCTTACCCAACTATTACATGGATAAAGGATGAAAATGTTATTGTACCAGAGGAAATTAAGAAGCGTGCAG
+CACCCTTGGTTAGGAGAAGGAAGGGTGAAGTTCAAGAAGAAGAACCATTTGTCCTGCCTCTGACACAGCG
+TTTGAGTATTGACAACAGCAAAAAGGGAGAATCTCAGCTACGCGTCCGAGATTCTCTCCGACCTGACCAT
+GGTCTGTATATGATCAAAGTTGAAAATGACCACGGTATTGCAAAAGCTCCTTGTACTGTCAGTGTGTTAG
+ATACACCGGGACCACCAATCAACTTTGTATTTGAAGATATCAGAAAGACCTCAGTCCTTTGTAAATGGGA
+ACCACCCCTTGATGATGGTGGCAGTGAAATCATAAACTACACTTTGGAAAAGAAAGACAAGACAAAACCC
+GACTCAGAATGGATTGTTGTCACTTCAACACTTAGACATTGCAAATATTCAGTAACAAAACTGATTGAAG
+GAAAAGAGTACCTCTTCCGTGTAAGAGCTGAAAACAGATTTGGGCCAGGTCCACCATGTGTTTCAAAGCC
+ACTTGTGGCTAAAGATCCATTTGGACCACCTGATGCACCAGATAAGCCCATTGTGGAAGATGTTACCAGC
+AACAGTATGCTAGTGAAATGGAATGAACCAAAAGATAATGGAAGCCCCATTTTGGGTTACTGGCTTGAAA
+AACGTGAAGTTAACAGTACACATTGGTCTCGTGTCAACAAAAGCCTTCTGAATGCCTTGAAAGCCAATGT
+AGATGGCTTATTAGAAGGACTCACCTATGTCTTCAGAGTATGTGCTGAAAATGCAGCTGGACCTGGAAAG
+TTCAGTCCACCTTCAGATCCCAAAACAGCACATGATCCAATCTCTCCTCCTGGGCCACCTATCCCAAGAG
+TCACTGACACAAGCTCTACAACTATTGAACTAGAATGGGAACCCCCAGCTTTCAATGGTGGTGGGGAAAT
+TGTTGGCTATTTTGTTGATAAGCAGTTGGTTGGCACAAATGAATGGTCACGCTGCACAGAGAAGATGATC
+AAGGTCCGTCAGTACACCGTCAAAGAAATCCGAGAGGGTGCTGATTACAAACTTCGGGTGAGTGCTGTCA
+ATGCCGCAGGGGAAGGACCGCCTGGAGAAACACAACCTGTTACTGTGGCTGAACCACAAGAGCCTCCAGC
+TGTGGAACTGGATGTTTCTGTCAAGGGTGGAATACAAATAATGGCTGGGAAGACTCTTAGAATTCCAGCT
+GTGGTGACTGGTCGCCCTGTACCTACAAAAGTATGGACCAAAGAAGAAGGGGAGCTGGATAAAGACCGTG
+TTGTAATAGACAACGTTGGAACCAAATCTGAACTAATTATCAAGGATGCACTGCGAAAAGACCATGGCAG
+ATATGTGATTACAGCTACAAATAGCTGTGGTTCCAAATTTGCAGCAGCCAGGGTAGAAGTTTTTGATGTC
+CCTGGTCCAGTTCTTGACTTAAAACCTGTTGTAACAAACAGAAAAATGTGTCTACTTAACTGGTCTGATC
+CAGAAGATGATGGAGGAAGTGAAATAACAGGCTTTATCATTGAAAGAAAAGATGCCAAGATGCATACTTG
+GAGACAACCAATAGAGACTGAGAGATCTAAATGTGACATCACAGGTCTGCTTGAGGGACAAGAATATAAG
+TTCCGTGTTATTGCCAAGAACAAGTTTGGCTGTGGCCCTCCTGTTGAAATAGGACCAATTCTTGCAGTTG
+ATCCACTAGGTCCTCCAACATCTCCAGAGAGGCTCACATACACTGAAAGGACAAAGTCCACTATCACACT
+TGACTGGAAAGAGCCCCGCAGTAATGGTGGCAGTCCCATCCAAGGATATATCATTGAAAAACGGCGTCAT
+GACAAACCTGACTTTGAAAGAGTTAACAAGCGACTCTGCCCAACCACATCTTTTCTGGTTGAAAATCTTG
+ATGAACACCAAATGTATGAGTTCCGTGTCAAAGCTGTCAATGAAATTGGTGAAAGTGAACCATCCCTACC
+TCTTAATGTAGTCATACAAGATGATGAAGTGCCTCCAACTATTAAGTTGCGTCTGAGTGTTCGAGGAGAC
+ACTATCAAAGTTAAGGCAGGAGAGCCTGTCCACATCCCTGCAGATGTGACAGGCCTTCCAATGCCTAAGA
+TTGAATGGTCCAAAAATGAAACTGTAATTGAAAAACCCACTGATGCACTTCAGATAACCAAGGAAGAGGT
+ATCCCGAAGTGAGGCAAAAACTGAGCTTAGCATTCCCAAAGCGGTCCGGGAGGACAAAGGCACTTACACA
+GTTACTGCTTCCAATCGCCTTGGCTCAGTGTTCCGAAATGTTCACGTTGAAGTATATGACCGCCCATCCC
+CACCAAGAAATCTTGCTGTTACTGACATTAAAGCTGAATCTTGCTACTTGACATGGGATGCCCCTCTTGA
+TAATGGTGGCAGTGAAATCACCCATTATGTTATTGACAAACGTGATGCAAGTAGGAAGAAAGCAGAATGG
+GAGGAAGTCACCAACACTGCTGTAGAGAAAAGATATGGGATCTGGAAACTTATCCCCAATGGTCAGTATG
+AGTTCCGAGTCAGGGCAGTGAATAAATATGGAATCAGTGATGAGTGCAAATCAGATAAAGTAGTCATTCA
+AGATCCTTATCGCCTTCCTGGACCTCCAGGAAAACCAAAAGTTTTGGCACGCACCAAAGGATCAATGCTA
+GTGAGCTGGACTCCTCCTTTGGACAATGGTGGCTCTCCAATTACTGGCTACTGGCTGGAGAAAAGAGAAG
+AGGGAAGTCCTTATTGGTCACGTGTTAGCCGAGCACCAATAACCAAAGTGGGATTGAAAGGCGTGGAATT
+TAATGTTCCTCGTTTGCTTGAAGGCGTTAAATACCAGTTCAGAGCCATGGCAATAAATGCTGCAGGAATT
+GGTCCTCCCAGTGAACCATCAGATCCAGAGGTTGCAGGAGATCCCATATTTCCACCGGGGCCACCTTCTT
+GCCCAGAAGTTAAAGATAAAACGAAGTCAAGCATCTCACTAGGATGGAAACCTCCAGCCAAAGATGGTGG
+CAGCCCAATCAAAGGATACATTGTAGAAATGCAAGAAGAAGGTACTACTGACTGGAAAAGAGTAAATGAA
+CCAGACAAACTTATAACTACCTGTGAATGTGTGGTGCCTAATCTGAAAGAGCTCAGGAAGTACAGATTCA
+GAGTGAAAGCTGTCAATGAAGCTGGTGAATCTGAACCAAGTGATACAACTGGGGAGATCCCTGCCACTGA
+TATTCAAGAGGAACCAGAAGTTTTCATTGACATTGGAGCACAGGACTGTCTGGTTTGTAAAGCTGGCTCA
+CAGATTAGGATTCCTGCTGTCATCAAGGGACGCCCAACACCAAAATCATCTTGGGAATTTGATGGAAAGG
+CAAAGAAAGCAATGAAGGATGGAGTTCATGACATACCCGAAGATGCACAGCTGGAGACTGCTGAAAACTC
+CTCAGTAATTATTATTCCGGAGTGTAAACGATCTCATACAGGCAAATACAGCATCACAGCCAAGAATAAA
+GCAGGACAAAAGACTGCAAATTGCAGAGTTAAAGTCATGGATGTACCAGGCCCACCCAAAGATCTGAAAG
+TCAGTGATATCACAAGGGGTAGTTGCAGACTTTCATGGAAGATGCCAGACGACGATGGAGGAGACAGGAT
+CAAAGGCTATGTTATTGAGAAGAGGACTATTGATGGAAAAGCCTGGACCAAAGTCAATCCAGACTGTGGA
+AGCACCACATTTGTAGTGCCTGATCTCCTCTCTGAACAGCAATATTTCTTCCGTGTGCGAGCAGAAAACC
+GTTTTGGTATTGGCCCACCTGTGGAAACCATTCAGAGGACCACTGCCAGAGATCCGATATATCCTCCTGA
+TCCTCCTATTAAACTCAAGATTGGCCTCATCACAAAGAACACAGTGCATCTGTCATGGAAACCCCCGAAG
+AATGATGGGGGCTCCCCTGTTACCCACTATATTGTTGAGTGCCTTGCATGGGACCCTACTGGGACAAAGA
+AAGAAGCCTGGAGGCAGTGCAATAAGCGTGATGTGGAAGAACTGCAATTTACTGTTGAAGACCTAGTAGA
+AGGTGGGGAATATGAATTCCGAGTCAAAGCTGTCAATGCTGCAGGAGTCAGCAAGCCTTCAGCCACTGTT
+GGGCCCTGTGACTGTCAAAGACCAGACATGCCACCATCAATTGATCTAAAAGAATTCATGGAGGTTGAAG
+AAGGAACCAATGTTAACATTGTGGCCAAAATTAAAGGTGTGCCATTCCCGACACTAACCTGGTTTAAAGC
+TCCTCCAAAGAAGCCTGATAACAAAGAACCTGTTCTCTATGACACCCATGTCAACAAACTGGTGGTAGAT
+GATACTTGCACTTTAGTTATTCCGCAGTCTCGCAGGAGTGACACTGGCTTATATACCATCACAGCTGTAA
+ATAATCTGGGAACAGCATCAAAGGAGATGAGACTGAATGTCCTGGGTCGTCCTGGCCCTCCAGTGGGACC
+CATAAAATTTGAATCTGTTTCAGCAGATCAAATGACACTATCTTGGTTTCCACCTAAAGATGATGGTGGG
+TCTAAGATTACAAACTATGTAATTGAGAAAAGAGAAGCTAACAGGAAGACATGGGTCCATGTCTCCAGTG
+AACCTAAGGAGTGCACGTACACGATTCCCAAATTGCTAGAAGGCCATGAATATGTATTCCGAATCATGGC
+CCAGAATAAATATGGCATTGGAGAACCTCTTGACAGTGAACCTGAAACAGCAAGAAACCTCTTCTCTGTC
+CCTGGAGCACCAGATAAACCAACAGTTAGCAGCGTGACTCGTAACTCCATGACTGTCAACTGGGAAGAGC
+CAGAATATGATGGAGGCTCTCCTGTGACAGGGTACTGGCTGGAAATGAAAGACACCACTTCAAAGAGATG
+GAAGAGAGTTAACCGAGATCCTATCAAAGCCATGACTTTGGGTGTTTCTTATAAAGTGACTGGTCTTATT
+GAAGGTTCCGACTATCAATTCCGGGTATATGCAATCAATGCTGCTGGCGTGGGTCCAGCAAGTCTGCCAT
+CAGACCCAGCGACTGCTAGAGATCCAATTGCCCCTCCTGGTCCTCCATTTCCCAAAGTGACAGATTGGAC
+TAAATCATCTGCAGATCTGGAGTGGTCTCCCCCACTAAAAGATGGTGGATCCAAAGTAACTGGATACATC
+GTTGAATATAAAGAAGAAGGAAAAGAAGAATGGGAAAAGGGTAAAGATAAAGAAGTGAGAGGAACAAAGC
+TCGTTGTGACAGGATTAAAGGAAGGAGCATTCTACAAATTTAGAGTTAGTGCAGTCAACATTGCTGGCAT
+TGGAGAACCTGGAGAGGTCACAGATGTCATTGAAATGAAGGACAGACTTGTTTCACCTGACCTTCAGCTA
+GATGCCAGTGTCAGAGATAGAATTGTTGTCCATGCTGGAGGGGTGATCCGAATCATTGCCTATGTGTCTG
+GAAAGCCTCCTCCAACCGTCACCTGGAACATGAATGAAAGAACCTTACCTCAAGAAGCCACCATTGAGAC
+CACAGCCATTAGCTCATCCATGGTCATCAAGAACTGCCAGAGGAGCCATCAAGGCGTCTATTCTCTTCTT
+GCCAAAAATGAAGCCGGAGAAAGAAAGAAGACAATTATTGTTGATGTATTAGATGTTCCAGGTCCCGTTG
+GAACACCATTCCTAGCTCACAACCTAACCAATGAGTCCTGCAAACTGACATGGTTTTCTCCAGAAGATGA
+TGGAGGCTCTCCAATCACCAATTATGTCATTGAAAAGCGTGAATCTGACCGCAGAGCATGGACCCCAGTG
+ACATATACAGTTACCCGACAAAATGCTACTGTCCAGGGTCTCATTCAAGGAAAAGCCTACTTTTTCCGAA
+TTGCGGCTGAAAATAGTATTGGCATGGGTCCATTTGTTGAGACATCAGAGGCACTTGTTATCAGAGAGCC
+AATAACTGTACCAGAGCGTCCTGAAGACCTGGAAGTCAAAGAAGTTACTAAAAATACTGTAACTTTGACT
+TGGAATCCTCCTAAGTATGATGGTGGGTCAGAAATTATTAACTATGTCCTAGAAAGTCGGCTCATTGGGA
+CTGAGAAGTTCCACAAAGTTACAAATGACAACTTGCTTAGCAGAAAATACACTGTTAAAGGCTTAAAAGA
+AGGTGATACCTATGAGTACCGTGTCAGTGCTGTCAACATTGTTGGACAAGGCAAACCATCATTTTGCACC
+AAACCAATTACTTGCAAGGATGAGCTGGCACCCCCAACGCTTCACCTCGACTTCAGAGATAAGCTCACGA
+TTCGAGTTGGTGAAGCTTTTGCCCTCACTGGCCGTTACTCAGGCAAACCAAAGCCTAAGGTTTCCTGGTT
+CAAAGATGAAGCTGATGTGCTGGAAGATGATCGCACTCATATAAAGACTACACCAGCAACACTTGCTTTA
+GAGAAGATCAAGGCCAAACGTTCAGATTCCGGCAAATACTGTGTGGTTGTGGAGAACAGTACAGGCTCTA
+GGAAAGGTTTCTGTCAAGTTAATGTTGTTGACCGTCCTGGACCACCAGTAGGACCAGTTAGTTTTGATGA
+GGTGACCAAAGATTACATGGTTATCTCTTGGAAGCCTCCTTTAGATGATGGAGGCAGTAAAATCACCAAT
+TATATTATTGAGAAGAAGGAAGTGGGTAAAGACGTCTGGATGCCAGTGACATCTGCAAGTGCTAAAACAA
+CATGCAAAGTTTCTAAACTACTTGAAGGAAAAGATTATATTTTCCGGATACATGCTGAAAATCTGTATGG
+AATAAGTGATCCTCTGGTGTCTGATTCAATGAAAGCCAAAGATCGTTTCAGGGTTCCTGATGCACCTGAT
+CAGCCAATTGTTACAGAAGTTACCAAAGACTCTGCATTAGTAACCTGGAATAAGCCACATGATGGAGGAA
+AACCCATCACAAACTACATCCTGGAAAAGAGAGAAACTATGTCTAAACGATGGGCTAGAGTTACCAAAGA
+TCCTATTCATCCATACACTAAATTTAGGGTTCCTGATCTTCTAGAAGGATGTCAGTATGAATTCCGGGTT
+TCTGCAGAAAATGAAATTGGTATTGGAGATCCAAGCCCACCATCCAAACCAGTCTTTGCTAAAGATCCAA
+TTGCTAAACCAAGTCCACCTGTTAATCCTGAAGCAATAGATACAACATGCAATTCAGTCGATCTAACTTG
+GCAGCCACCACGTCATGATGGTGGGAGCAAGATTCTGGGTTATATTGTTGAGTACCAGAAAGTTGGAGAT
+GAAGAGTGGAGAAGAGCCAATCACACCCCTGAGTCATGTCCTGAAACTAAATATAAAGTCACCGGTCTTC
+GGGACGGTCAAACCTATAAGTTTAGAGTGTTAGCAGTCAATGCAGCTGGTGAATCAGATCCAGCTCATGT
+TCCGGAGCCAGTCCTAGTAAAAGACAGGCTTGAACCCCCTGAGTTGATTCTTGATGCCAACATGGCAAGA
+GAACAACACATTAAAGTTGGTGATACTCTAAGACTTAGTGCCATCATCAAAGGAGTGCCATTCCCAAAAG
+TAACTTGGAAAAAAGAAGACAGAGATGCTCCAACTAAAGCAAGAATTGATGTGACTCCAGTTGGTAGCAA
+GCTTGAAATTCGTAATGCTGCCCATGAAGATGGTGGAATTTATTCTTTAACAGTGGAGAATCCAGCTGGT
+TCAAAAACTGTCTCAGTAAAAGTACTTGTATTAGATAAACCTGGGCCACCTAGAGATCTGGAAGTCAGTG
+AAATTAGGAAAGATTCATGTTACCTTACTTGGAAAGAACCACTGGATGATGGTGGTTCTGTTATTACCAA
+TTATGTGGTTGAGAGGAGAGATGTTGCCAGCGCCCAGTGGTCACCTCTCTCAGCTACATCAAAGAAAAAG
+AGTCACTTCGCTAAGCATCTGAATGAAGGCAACCAGTACCTCTTCCGAGTAGCTGCGGAGAACCAGTATG
+GACGTGGTCCTTTTGTTGAAACACCAAAACCAATCAAGGCTTTGGATCCTCTCCATCCCCCAGGGCCACC
+CAAGGACCTGCACCATGTAGATGTTGACAAGACTGAAGTCTCCCTAGTCTGGAATAAGCCGGATCGTGAT
+GGTGGTTCTCCAATCACTGGATATTTGGTAGAATATCAAGAAGAAGGCACCCAGGACTGGATTAAATTTA
+AGACTGTGACAAACTTAGAGTGTGTGGTTACTGGACTACAACAAGGAAAGACCTATAGATTCCGTGTAAA
+AGCTGAAAACATTGTGGGTCTTGGTCTCCCTGACACAACTATCCCGATAGAATGTCAAGAAAAACTAGTG
+CCTCCATCCGTGGAGCTAGATGTGAAATTAATTGAAGGTCTTGTGGTAAAGGCTGGAACCACAGTCAGAT
+TCCCTGCTATTATAAGAGGTGTGCCTGTTCCTACTGCAAAGTGGACAACCGATGGGAGTGAGATTAAAAC
+CGATGAGCACTACACAGTTGAAACAGACAACTTCTCATCAGTACTTACCATTAAGAACTGCTTAAGGAGA
+GACACTGGGGAATATCAAATCACAGTTTCCAATGCAGCCGGTAGCAAAACAGTAGCCGTACATCTTACTG
+TTCTTGATGTTCCTGGGCCACCAACAGGTCCTATTAATATTCTGGATGTTACTCCTGAACACATGACTAT
+CTCATGGCAGCCACCTAAGGATGATGGAGGAAGCCCTGTGATAAATTATATTGTTGAGAAACAAGATACA
+AGGAAAGACACGTGGGGTGTTGTCTCTTCCGGAAGCAGTAAGACAAAGCTGAAAATCCCACATCTGCAGA
+AGGGCTGTGAATATGTTTTCCGAGTTAGAGCAGAGAATAAGATAGGTGTTGGTCCTCCCCTTGACTCCAC
+ACCTACTGTTGCTAAGCATAAATTTAGTCCTCCGTCTCCTCCTGGTAAACCAGTGGTTACTGACATTACT
+GAAAATGCAGCAACAGTGTCTTGGACCCTGCCAAAATCTGATGGTGGCAGTCCAATAACTGGCTACTATA
+TGGAACGTCGAGAAGTAACTGGCAAATGGGTGAGGGTCAACAAAACACCTATCGCTGACCTGAAGTTCAG
+AGTGACTGGACTCTATGAAGGAAATACATATGAGTTTAGAGTTTTTGCTGAAAATCTTGCAGGACTAAGC
+AAACCATCCCCAAGTTCTGATCCAATAAAAGCTTGCCGGCCCATCAAACCACCTGGACCACCTATTAATC
+CTAAACTGAAAGACAAGAGCAGAGAAACAGCTGATTTGGTGTGGACAAAGCCTCTCAGTGATGGTGGTAG
+CCCCATTCTAGGATATGTAGTGGAATGTCAGAAACCTGGCACGGCACAATGGAACAGGATTAATAAAGAT
+GAACTCATTAGGCAATGTGCCTTTAGGGTACCTGGACTAATTGAAGGAAATGAGTACAGATTCCGTATAA
+AGGCAGCTAATATTGTAGGAGAGGGTGAGCCAAGAGAACTAGCAGAATCTGTGATTGCAAAAGATATCCT
+TCATCCTCCAGAAGTAGAACTTGATGTTACTTGTCGTGATGTTATTACCGTGAGAGTAGGCCAAACTATC
+CGCATTCTAGCTCGAGTCAAAGGCAGACCTGAACCAGACATAACTTGGACTAAGGAAGGCAAAGTATTGG
+TCCGAGAAAAGAGGGTGGACCTTATTCAGGATCTACCTCGTGTTGAGTTACAAATTAAAGAAGCTGTTAG
+AGCTGATCATGGCAAGTATATCATCTCAGCTAAGAACAGCAGTGGACATGCCCAAGGTTCAGCCATCGTT
+AACGTCCTTGACAGACCTGGGCCTTGCCAGAATTTGAAGGTTACCAATGTAACCAAAGAGAACTGTACAA
+TTTCTTGGGAAAACCCACTAGATAATGGTGGCTCAGAAATAACAAACTTCATAGTAGAATATCGCAAACC
+AAACCAGAAAGGCTGGTCAATTGTTGCATCAGATGTCACTAAACGATTAATCAAGGCCAACCTTTTAGCC
+AACAATGAATACTATTTCCGAGTTTGTGCAGAGAATAAAGTAGGTGTTGGGCCAACCATCGAAACAAAAA
+CTCCCATTCTGGCTATTAACCCTATTGACAGACCAGGTGAGCCTGAAAACCTTCACATTGCAGATAAAGG
+AAAGACATTTGTCTATCTAAAGTGGCGGAGGCCTGACTATGATGGTGGCAGTCCAAATCTGTCATATCAT
+GTTGAGAGAAGGCTTAAGGGCTCCGATGACTGGGAAAGAGTGCATAAAGGAAGCATTAAAGAAACTCACT
+ACATGGTTGACAGATGTGTTGAAAACCAGATTTATGAGTTCAGAGTGCAAACAAAGAATGAAGGTGGGGA
+AAGTGACTGGGTGAAGACAGAGGAAGTTGTTGTGAAAGAAGACTTACAAAAACCAGTACTTGATCTGAAA
+TTAAGTGGGGTCCTAACTGTCAAAGCAGGGGACACCATTAGGCTTGAGGCAGGGGTTAGAGGCAAACCAT
+TCCCAGAAGTTGCATGGACCAAGGACAAAGACGCTACAGACTTAACAAGATCACCAAGGGTCAAGATTGA
+TACCCGTGCTGATTCATCTAAATTTTCTCTTACTAAAGCAAAGCGAAGTGATGGGGGTAAATATGTAGTT
+ACGGCAACTAACACGGCTGGCAGTTTTGTGGCCTATGCCACTGTCAATGTTTTAGATAAGCCTGGTCCTG
+TGAGAAATCTGAAAATTGTTGATGTGTCCAGTGATAGGTGTACTGTTTGCTGGGATCCACCAGAAGATGA
+TGGTGGCTGTGAAATCCAAAATTATATTCTAGAAAAATGTGAGACAAAGCGAATGGTTTGGTCTACCTAT
+TCTGCTACTGTCTTGACACCTGGTACTACAGTAACACGTCTCATAGAAGGAAATGAATATATTTTCAGAG
+TCCGTGCAGAAAATAAAATAGGCACAGGGCCTCCAACAGAAAGTAAACCAGTCATAGCCAAAACCAAGTA
+TGATAAACCTGGTCGCCCTGATCCCCCAGAAGTCACTAAAGTAAGCAAAGAAGAGATGACTGTGGTTTGG
+AATCCACCTGAATATGATGGTGGAAAGTCTATAACTGGATACTTTTTGGAGAAAAAGGAAAAGCATTCAA
+CACGATGGGTCCCTGTCAACAAGAGTGCAATCCCTGAGAGACGTATGAAAGTACAGAATCTCCTCCCAGA
+CCATGAATATCAGTTCCGTGTCAAGGCAGAAAATGAAATTGGAATTGGAGAACCAAGCTTGCCTTCAAGA
+CCGGTGGTGGCAAAAGACCCCATAGAGCCACCTGGTCCACCAACCAATTTCAGAGTGGTTGATACAACCA
+AACATTCCATAACTCTTGGGTGGGGAAAACCAGTCTATGATGGTGGTGCACCGATCATTGGATATGTTGT
+GGAAATGAGACCAAAAATAGCAGATGCGTCTCCTGATGAAGGCTGGAAACGGTGTAATGCTGCAGCACAG
+CTTGTACGCAAGGAATTCACTGTTACCAGCTTGGATGAAAACCAGGAATATGAGTTCAGGGTGTGTGCCC
+AAAACCAAGTTGGTATTGGGCGCCCTGCAGAGCTAAAGGAAGCTATCAAACCTAAAGAAATACTAGAACC
+TCCGGAGATTGATTTGGATGCCAGCATGAGGAAACTGGTCATAGTGAGAGCAGGATGCCCTATTCGTCTC
+TTTGCTATAGTGAGAGGACGACCAGCCCCTAAAGTCACTTGGCGAAAAGTTGGCATTGATAATGTGGTCA
+GAAAAGGACAAGTTGATCTGGTTGACACTATGGCCTTCCTTGTCATCCCCAATTCTACCCGTGATGACTC
+AGGAAAATATTCCTTAACACTTGTGAACCCAGCAGGAGAAAAGGCTGTATTCGTAAATGTCAGAGTATTA
+GACACTCCTGGGCCTGTGTCTGATTTAAAAGTTTCAGATGTCACTAAAACATCATGCCATGTGTCCTGGG
+CCCCTCCTGAAAACGACGGTGGGAGCCAAGTGACACATTATATCGTGGAGAAACGTGAGGCAGACAGAAA
+GACATGGTCGACCGTTACCCCAGAAGTTAAGAAAACAAGCTTCCATGTAACCAATCTTGTCCCTGGGAAT
+GAGTATTACTTCAGAGTAACTGCTGTCAACGAATATGGCCCTGGCGTCCCAACAGATGTCCCAAAACCAG
+TGCTTGCATCAGATCCTCTAAGTGAGCCGGATCCCCCAAGGAAATTAGAAGTGACTGAAATGACCAAGAA
+CAGTGCCACCTTAGCCTGGTTACCTCCCCTACGTGATGGAGGTGCTAAAATCGATGGCTACATCACTAGT
+TACAGAGAAGAAGAGCAGCCTGCAGATCGCTGGACAGAGTACTCAGTGGTAAAAGATCTGAGCCTTGTTG
+TCACTGGCCTAAAGGAAGGAAAGAAATACAAATTTAGAGTAGCGGCCAGAAATGCTGTTGGAGTCAGTTT
+GCCAAGAGAAGCTGAAGGAGTGTATGAAGCCAAAGAACAACTGTTGCCACCAAAGATCCTTATGCCAGAG
+CAAATAACTATCAAAGCTGGGAAAAAACTCCGAATTGAAGCCCATGTGTATGGAAAGCCTCATCCCACCT
+GTAAATGGAAAAAAGGAGAAGATGAAGTTGTCACATCCAGCCACCTGGCAGTGCATAAAGCAGACAGCTC
+TTCAATTCTGATCATAAAAGATGTGACTAGGAAAGACAGTGGTTACTACAGCCTCACAGCAGAGAACAGT
+TCTGGGACAGACACTCAGAAAATCAAAGTTGTAGTCATGGATGCCCCCGGCCCCCCTCAGCCTCCATTTG
+ACATTTCTGATATAGACGCTGATGCTTGCTCCCTGTCATGGCACATCCCTCTGGAGGACGGAGGCAGTAA
+CATCACCAATTATATAGTGGAGAAGTGTGATGTAAGCCGAGGTGACTGGGTCACGGCTCTAGCTTCAGTC
+ACAAAAACTTCCTGCAGGGTTGGAAAGCTGATCCCAGGCCAGGAGTACATCTTCCGGGTCCGTGCTGAAA
+ACCGATTTGGCATTTCAGAGCCTCTCACATCTCCAAAGATGGTTGCGCAGTTCCCATTTGGTGTTCCTAG
+TGAACCAAAGAATGCACGAGTCACCAAAGTCAACAAGGACTGTATTTTTGTTGCTTGGGACAGACCAGAT
+AGTGATGGAGGGAGCCCCATTATTGGTTATCTGATTGAACGCAAGGAAAGAAACAGTTTGCTGTGGGTGA
+AAGCCAATGATACTCTTGTCCGGTCAACTGAATATCCTTGTGCTGGCCTTGTAGAAGGTCTTGAGTATTC
+ATTCAGAATCTATGCCCTAAACAAAGCTGGATCCAGCCCACCCAGCAAACCCACAGAATATGTAACTGCA
+AGAATGCCAGTTGATCCTCCTGGGAAACCTGAGGTTATTGATGTCACCAAGAGTACTGTATCTCTGATCT
+GGGCTCGTCCAAAGCATGATGGAGGCAGTAAAATTATTGGCTATTTCGTAGAAGCTTGCAAACTTCCTGG
+TGATAAATGGGTACGGTGCAATACTGCACCTCACCAGATTCCCCAGGAAGAGTACACAGCTACTGGCCTA
+GAAGAGAAAGCTCAGTATCAATTTAGAGCTATTGCCAGGACCGCGGTAAACATTAGCCCACCTTCTGAAC
+CTTCTGATCCAGTGACTATCCTCGCAGAAAATGTCCCTCCCAGGATAGACCTGAGTGTGGCTATGAAATC
+TTTGCTTACTGTGAAAGCTGGAACTAATGTCTGCTTGGATGCTACTGTTTTTGGTAAACCGATGCCAACA
+GTTTCTTGGAAAAAAGATGGCACACTGCTAAAACCAGCAGAAGGCATAAAGATGGCCATGCAGCGGAATC
+TGTGCACCTTGGAGCTATTCAGCGTGAACCGGAAGGACTCAGGAGACTATACCATTACTGCTGAAAATTC
+AAGTGGTTCTAAATCAGCCACCATTAAGCTTAAAGTGTTAGATAAACCGGGTCCTCCAGCATCTGTTAAA
+ATCAACAAAATGTATTCAGATCGTGCTATGCTTTCTTGGGAACCGCCTCTTGAAGATGGAGGCTCAGAAA
+TCACCAACTATATTGTTGACAAACGTGAAACAAGCAGGCCCAACTGGGCTCAAGTCTCTGCAACTGTGCC
+TATCACCAGCTGCAGCGTGGAGAAACTTATAGAGGGCCATGAGTATCAGTTCCGTATTTGTGCTGAAAAT
+AAATATGGAGTAGGCGATCCAGTCTTCACTGAACCAGCAATTGCCAAAAACCCATATGACCCACCAGGAC
+GCTGTGATCCTCCTGTTATTAGCAACATAACCAAAGATCACATGACAGTCAGCTGGAAGCCACCAGCAGA
+TGATGGGGGCTCACCCATCACTGGCTATTTGCTTGAAAAGCGGGAAACCCAGGCTGTTAACTGGACTAAG
+GTCAACAGAAAACCTATTATAGAAAGAACATTAAAAGCAACAGGTCTTCAAGAAGGTACCGAATATGAGT
+TCCGTGTTACAGCTATAAATAAAGCTGGACCAGGCAAACCCAGTGACGCATCCAAGGCCGCTTATGCTCG
+GGACCCTCAGTATCCTCCTGCGCCACCGGCTTTCCCTAAAGTATATGATACAACTCGCAGCTCTGTGAGT
+CTATCTTGGGGCAAGCCAGCCTATGACGGCGGCAGCCCTATCATTGGTTATCTCGTTGAAGTAAAACGGG
+CTGACTCCGATAACTGGGTGAGGTGCAACTTACCACAGAATCTACAGAAAACCCGCTTTGAGGTTACTGG
+CCTGATGGAAGACACACAATATCAATTCCGTGTGTATGCCGTTAATAAGATTGGATACAGTGACCCCAGT
+GATGTGCCAGATAAACACTATCCCAAGGACATCTTAATTCCACCTGAGGGAGAACTTGATGCGGACTTAA
+GGAAGACACTCATATTACGTGCTGGAGTTACTATGAGACTATATGTACCAGTAAAAGGACGCCCACCTCC
+AAAGATTACTTGGTCTAAACCAAATGTCAATCTAAGAGACAGGATTGGACTGGACATAAAGTCAACTGAC
+TTTGACACTTTCTTGCGCTGTGAAAATGTGAACAAATATGATGCAGGAAAATATATCTTAACCCTGGAGA
+ACAGCTGTGGTAAAAAGGAATATACCATTGTTGTGAAAGTGCTTGATACTCCTGGGCCACCTGTCAATGT
+GACTGTTAAGGAAATATCCAAAGACTCTGCTTATGTTACCTGGGAGCCTCCCATTATTGATGGCGGAAGC
+CCCATCATAAACTATGTGGTACAAAAACGTGATGCAGAGAGGAAATCCTGGTCTACAGTGACAACTGAGT
+GCTCCAAAACAAGCTTCAGAGTAGCTAATTTGGAGGAGGGAAAATCCTACTTCTTCCGAGTGTTTGCTGA
+AAATGAGTATGGCATTGGTGATCCCGGTGAAACTCGTGATGCTGTCAAAGCTTCCCAAACTCCTGGACCA
+GTTGTGGACCTGAAAGTGAGGTCTGTATCTAAGTCATCCTGTAGCATTGGCTGGAAAAAGCCTCACAGTG
+ATGGTGGAAGTCGGATTATTGGATATGTAGTTGATTTCCTGACTGAAGAAAATAAGTGGCAACGAGTTAT
+GAAATCCTTAAGCCTACAGTACTCTGCAAAAGATTTGACTGAAGGGAAGGAATATACCTTCAGAGTGAGT
+GCTGAGAATGAAAATGGAGAAGGAACCCCAAGCGAAATCACTGTTGTGGCAAGGGATGATGTTGTGGCTC
+CTGATCTTGACTTAAAGGGTCTACCTGATTTGTGCTACTTGGCTAAAGAAAACAGCAACTTCCGGCTTAA
+GATCCCCATAAAAGGCAAGCCAGCTCCATCAGTCTCCTGGAAGAAAGGGGAAGATCCTCTAGCAACTGAC
+ACTAGAGTCAGTGTTGAGTCATCTGCGGTTAACACAACTCTTATAGTGTACGATTGCCAAAAATCTGATG
+CTGGAAAATACACAATCACACTTAAGAATGTTGCTGGCACCAAGGAAGGAACTATCTCCATAAAGGTTGT
+TGGCAAGCCTGGCATCCCCACTGGACCAATCAAATTTGATGAAGTCACAGCAGAAGCCATGACCTTAAAG
+TGGGCTCCTCCAAAGGATGATGGAGGTTCTGAAATCACCAACTATATCCTAGAGAAGAGGGATTCTGTGA
+ACAACAAGTGGGTGACGTGCGCCTCAGCTGTCCAGAAAACCACCTTTAGAGTAACCAGACTTCATGAGGG
+CATGGAATATACCTTCAGGGTCAGTGCCGAAAATAAATATGGTGTAGGGGAAGGCCTGAAATCGGAGCCA
+ATTGTTGCGAGACATCCATTTGATGTGCCTGATGCTCCCCCACCTCCCAATATTGTGGATGTCAGACACG
+ATTCAGTATCTCTAACTTGGACTGACCCCAAGAAAACTGGTGGTTCTCCAATTACAGGGTATCATCTCGA
+GTTCAAGGAAAGAAACAGCCTTTTGTGGAAGAGAGCTAACAAGACTCCGATAAGGATGAGAGACTTTAAA
+GTGACAGGATTAACTGAAGGTCTTGAATATGAATTCCGAGTTATGGCAATCAATTTAGCAGGTGTGGGCA
+AGCCAAGCCTACCATCAGAGCCTGTTGTGGCACTGGACCCAATTGATCCTCCTGGAAAACCTGAGGTTAT
+TAACATAACAAGGAATTCAGTGACTCTCATTTGGACTGAACCTAAATATGACGGTGGTCATAAGTTAACT
+GGATATATAGTGGAGAAGCGAGATCTACCTTCGAAGTCTTGGATGAAAGCCAACCATGTTAATGTCCCAG
+AATGTGCCTTTACTGTAACTGACCTTGTTGAGGGTGGAAAATATGAATTCAGAATTAGAGCAAAGAATAC
+AGCAGGTGCTATCAGTGCTCCATCAGAAAGTACAGAAACCATTATTTGCAAGGATGAATACGAGGCACCA
+ACAATTGTCCTTGATCCCACAATAAAAGATGGGCTAACAATTAAAGCAGGGGATACCATTGTTTTGAATG
+CCATTAGCATTCTTGGCAAACCCCTTCCAAAATCAAGTTGGTCCAAGGCAGGAAAAGACATTAGACCATC
+AGATATCACTCAGATAACTTCAACCCCAACATCTTCCATGCTTACTATCAAGTATGCCACTAGAAAAGAT
+GCGGGTGAATATACCATCACTGCTACCAATCCTTTTGGCACGAAGGTGGAACATGTGAAGGTAACAGTCC
+TTGATGTACCTGGTCCCCCAGGTCCTGTTGAAATCAGTAATGTTTCTGCTGAAAAAGCAACACTTACATG
+GACACCTCCCTTGGAAGATGGCGGCTCACCAATTAAGTCCTATATACTTGAAAAGAGAGAAACCAGCCGA
+CTTTTGTGGACAGTGGTTTCTGAAGATATTCAGTCTTGCAGGCATGTGGCAACCAAACTTATCCAAGGAA
+ATGAGTACATCTTCCGGGTCTCAGCTGTAAACCACTATGGCAAAGGAGAACCTGTACAGTCTGAACCTGT
+CAAAATGGTAGACAGATTTGGTCCCCCTGGCCCTCCTGAAAAACCAGAGGTATCAAATGTCACTAAGAAC
+ACTGCCACTGTCAGCTGGAAAAGGCCAGTGGATGATGGTGGCAGCGAAATTACAGGATATCATGTAGAAA
+GGAGAGAAAAGAAAAGCCTGCGATGGGTGAGAGCAATAAAAACACCAGTTTCCGATCTCAGGTGCAAAGT
+AACAGGACTGCAAGAAGGAAGCACCTACGAATTCCGTGTCAGTGCAGAAAACAGAGCAGGAATTGGTCCA
+CCCAGTGAGGCTTCAGATTCTGTTCTGATGAAAGATGCAGCATATCCTCCAGGACCACCTTCAAATCCGC
+ATGTCACTGATACTACCAAGAAATCTGCTTCTTTGGCATGGGGCAAGCCTCATTATGATGGTGGACTTGA
+AATCACTGGCTATGTCGTGGAGCATCAAAAAGTAGGAGACGAGGCCTGGATAAAAGATACCACAGGAACC
+GCCCTCAGAATCACTCAGTTCGTTGTTCCTGATCTTCAGACTAAAGAAAAATACAACTTCAGAATCAGTG
+CCATCAACGATGCAGGTGTTGGGGAGCCAGCGGTGATTCCAGATGTTGAAATCGTAGAACGGGAGATGGC
+TCCTGATTTTGAACTAGATGCCGAGCTTCGAAGAACACTTGTTGTTAGAGCAGGACTCAGTATTAGGATA
+TTTGTGCCAATTAAAGGTCGTCCTGCTCCTGAAGTGACATGGACCAAAGATAACATCAACCTGAAAAACC
+GAGCCAACATTGAAAATACGGAATCATTTACTCTTCTGATTATCCCAGAATGTAACAGATATGATACCGG
+TAAATTTGTCATGACCATTGAAAACCCGGCTGGGAAGAAAAGTGGCTTTGTGAACGTCAGAGTCTTGGAC
+ACGCCAGGCCCAGTCCTCAACCTGCGGCCTACAGACATCACAAAGGACAGTGTCACCCTGCACTGGGACC
+TCCCTCTGATAGATGGAGGCTCACGTATAACAAACTACATTGTAGAGAAACGTGAAGCAACACGGAAATC
+TTATTCCACAGCCACCACTAAGTGCCATAAATGCACATATAAAGTTACCGGCTTGTCTGAAGGGTGTGAA
+TATTTCTTCAGAGTGATGGCAGAGAATGAATATGGAATTGGTGAGCCAACAGAAACTACAGAGCCCGTAA
+AAGCCTCTGAAGCACCATCTCCACCAGACAGCCTTAACATCATGGACATAACTAAGAGCACCGTCAGCCT
+GGCATGGCCTAAGCCCAAACACGATGGTGGCAGCAAGATCACTGGCTATGTGATTGAAGCCCAAAGAAAA
+GGCTCTGACCAGTGGACCCACATCACAACCGTGAAAGGGTTAGAATGTGTTGTGAGGAATCTAACTGAAG
+GAGAGGAATATACCTTCCAAGTGATGGCAGTGAACAGCGCGGGGAGAAGTGCCCCTAGAGAAAGCAGACC
+CGTCATTGTCAAGGAGCAGACAATGCTTCCAGAGCTGGATCTCCGTGGCATCTATCAGAAACTGGTCATT
+GCCAAAGCTGGTGACAACATCAAAGTTGAAATTCCAGTGCTCGGTCGACCGAAGCCCACAGTGACATGGA
+AAAAAGGAGACCAAATTCTTAAACAGACACAGAGAGTTAATTTTGAAACCACAGCGACTTCAACCATTTT
+AAATATCAATGAGTGTGTCAGAAGTGATAGTGGGCCCTATCCATTAACAGCAAGGAACATTGTAGGAGAG
+GTTGGTGATGTCATCACCATTCAAGTCCATGATATCCCAGGGCCACCTACTGGACCAATCAAATTTGATG
+AAGTTTCATCTGATTTTGTAACCTTCTCTTGGGACCCACCTGAGAACGATGGTGGTGTACCAATAAGCAA
+CTATGTAGTGGAAATGCGGCAGACTGACAGTACTACCTGGGTTGAGTTAGCAACCACCGTTATACGTACT
+ACCTATAAAGCCACCCGCCTTACTACTGGATTAGAGTATCAGTTCCGTGTAAAAGCTCAGAATAGATATG
+GAGTTGGACCAGGCATCACATCAGCATGCATAGTTGCCAACTATCCATTTAAGGTTCCTGGACCTCCTGG
+TACCCCTCAGGTAACTGCAGTTACCAAGGATTCAATGACAATTAGCTGGCATGAGCCACTTTCTGATGGT
+GGAAGCCCCATTTTAGGATATCATGTTGAAAGAAAAGAACGAAATGGTATTCTCTGGCAGACTGTGAGCA
+AAGCTTTAGTACCAGGCAACATTTTCAAATCAAGTGGACTTACAGATGGTATTGCTTATGAGTTCCGGGT
+GATTGCAGAAAACATGGCAGGCAAAAGTAAGCCAAGCAAGCCATCAGAACCTATGTTGGCTCTGGATCCC
+ATTGACCCACCTGGAAAACCAGTACCTCTAAATATTACAAGACACACAGTAACACTTAAATGGGCTAAGC
+CTGAATATACTGGGGGCTTTAAAATTACCAGTTATATCGTTGAAAAGAGAGACCTTCCTAATGGACGGTG
+GCTGAAGGCCAACTTCAGCAACATTTTGGAGAATGAATTTACAGTCAGTGGCCTAACAGAAGATGCTGCA
+TATGAATTCCGTGTGATCGCCAAAAATGCTGCAGGTGCCATCAGTCCACCATCTGAGCCATCTGATGCTA
+TCACTTGCAGGGATGATGTTGAGGCACCAAAGATAAAGGTGGATGTTAAATTTAAGGACACGGTTATATT
+AAAAGCAGGTGAAGCATTCAGACTGGAAGCTGATGTTTCAGGCCGCCCACCTCCAACAATGGAATGGAGC
+AAAGATGGAAAAGAGCTGGAAGGCACAGCAAAGTTAGAAATAAAAATTGCAGATTTCTCTACTAATCTGG
+TAAACAAAGATTCAACAAGAAGGGATAGTGGTGCCTATACCCTTACAGCGACTAATCCTGGTGGCTTTGC
+TAAACACATTTTCAATGTCAAAGTTCTTGACAGACCAGGCCCACCTGAAGGACCTTTGGCTGTAACTGAA
+GTGACATCAGAAAAGTGTGTACTATCATGGTTCCCTCCACTGGATGATGGAGGTGCCAAAATTGATCATT
+ACATAGTACAGAAACGTGAAACCAGCAGATTGGCATGGACAAATGTAGCCTCAGAAGTCCAAGTAACAAA
+GCTAAAGGTCACTAAACTCTTGAAAGGCAATGAATACATATTCCGTGTCATGGCTGTAAATAAATATGGA
+GTGGGAGAGCCACTGGAATCAGAGCCTGTGCTTGCAGTGAATCCTTATGGACCCCCTGATCCGCCCAAAA
+ACCCTGAAGTGACAACTATTACTAAAGATTCGATGGTTGTCTGCTGGGGACATCCTGATTCTGATGGTGG
+AAGTGAAATCATCAATTATATTGTGGAACGGCGTGATAAAGCTGGCCAACGCTGGATTAAATGCAACAAA
+AAAACTCTTACTGATTTAAGATATAAAGTGTCTGGACTGACAGAAGGACATGAATATGAGTTCAGGATTA
+TGGCTGAAAATGCTGCTGGAATTAGTGCACCAAGTCCTACCAGTCCATTTTACAAGGCTTGTGACACTGT
+GTTTAAACCTGGACCACCAGGTAACCCACGTGTTCTGGATACAAGCAGATCATCCATTTCAATCGCTTGG
+AATAAACCTATCTATGATGGTGGTTCAGAAATCACTGGGTATATGGTTGAGATTGCCCTGCCAGAGGAAG
+ATGAATGGCAGATTGTCACTCCACCAGCAGGACTCAAGGCAACTTCGTATACTATCACTGGCCTCACAGA
+GAATCAGGAATATAAGATCCGCATCTATGCCATGAATTCCGAAGGACTTGGGGAACCTGCCCTTGTTCCT
+GGAACTCCAAAGGCTGAAGACAGAATGCTGCCTCCAGAAATTGAACTGGATGCTGACCTGCGCAAAGTTG
+TTACTATAAGGGCCTGCTGCACCCTGAGACTTTTTGTTCCCATCAAAGGAAGGCCTGCACCTGAGGTGAA
+GTGGGCCCGGGACCATGGAGAATCTTTAGATAAAGCTAGCATCGAATCCACAAGCTCTTACACCCTGCTT
+ATTGTTGGAAATGTAAACAGATTTGACAGTGGCAAATATATACTAACTGTAGAAAATAGTTCAGGCAGCA
+AGTCTGCATTTGTCAATGTTAGAGTTCTCGATACACCAGGCCCCCCACAGGATCTGAAGGTAAAAGAGGT
+CACTAAGACATCTGTCACACTCACATGGGACCCACCTCTCCTTGATGGAGGTTCAAAAATCAAGAACTAT
+ATTGTTGAAAAGCGGGAATCAACAAGAAAAGCATATTCAACTGTTGCAACAAACTGCCACAAGACTTCCT
+GGAAGGTAGACCAGCTTCAAGAAGGCTGTAGCTACTATTTCAGGGTTCTCGCAGAAAATGAATATGGCAT
+TGGGCTGCCTGCTGAAACCGCAGAATCTGTGAAAGCATCAGAACGACCTCTTCCTCCAGGAAAAATAACT
+TTGATGGATGTCACAAGAAATAGTGTGTCACTCTCTTGGGAGAAACCAGAGCATGATGGAGGCAGCCGAA
+TTCTAGGCTACATTGTGGAGATGCAGACCAAAGGCAGTGACAAATGGGCCACGTGTGCCACAGTCAAGGT
+CACTGAAGCCACTATCACTGGATTAATTCAGGGTGAAGAATACTCTTTCCGTGTTTCAGCTCAGAATGAA
+AAGGGCATCAGTGATCCTAGACAACTGAGTGTGCCAGTGATCGCCAAAGATCTTGTCATTCCACCAGCCT
+TCAAACTCCTGTTCAATACTTTCACTGTACTGGCAGGTGAAGACCTAAAAGTTGATGTTCCATTCATTGG
+CCGCCCTACCCCAGCTGTAACCTGGCATAAAGATAATGTACCACTGAAGCAGACAACTAGAGTAAATGCA
+GAGAGCACAGAAAATAATTCACTACTGACAATAAAGGACGCCTGCCGAGAAGATGTTGGCCATTATGTGG
+TTAAACTGACTAACTCAGCTGGTGAAGCTATTGAAACCCTTAATGTTATCGTTCTTGACAAACCAGGGCC
+TCCAACTGGACCAGTTAAAATGGATGAAGTGACAGCTGATAGTATTACTCTTTCCTGGGGCCCACCCAAG
+TATGATGGTGGAAGTTCTATCAATAATTACATTGTTGAGAAACGGGACACTTCCACAACCACCTGGCAAA
+TTGTATCAGCTACAGTTGCAAGGACAACAATAAAGGCTTGCAGACTGAAGACTGGATGTGAATATCAGTT
+TAGAATTGCAGCTGAAAACAGATATGGGAAGAGTACCTACCTCAATTCAGAGCCTACTGTAGCCCAATAT
+CCATTCAAAGTTCCTGGTCCTCCTGGCACTCCAGTTGTCACACTGTCCTCCAGGGACAGCATGGAAGTAC
+AATGGAATGAGCCAATCAGTGATGGAGGAAGTAGAGTCATTGGCTATCATCTAGAACGCAAGGAAAGAAA
+TAGCATCCTCTGGGTTAAGTTGAATAAAACACCTATTCCTCAAACCAAGTTTAAGACAACTGGCCTTGAA
+GAAGGTGTTGAATATGAATTTAGAGTCTCTGCAGAGAACATCGTGGGCATTGGCAAGCCGAGTAAAGTAT
+CAGAATGTTATGTGGCTCGTGACCCATGTGATCCACCAGGACGGCCAGAGGCAATCATTGTCACAAGGAA
+TTCTGTGACTCTTCAGTGGAAGAAACCCACCTATGACGGTGGAAGCAAGATCACTGGTTATATTGTTGAG
+AAGAAAGAATTACCTGAGGGCCGTTGGATGAAAGCCAGTTTTACAAATATTATTGACACTCATTTTGAAG
+TAACTGGCCTAGTTGAAGATCACAGATATGAGTTCCGGGTTATAGCCCGAAATGCCGCAGGAGTGTTTAG
+TGAGCCTTCAGAAAGCACAGGAGCAATAACAGCTAGAGATGAGGTAGATCCACCACGAATAAGTATGGAT
+CCAAAATACAAAGACACAATCGTGGTTCATGCTGGTGAATCATTCAAGGTTGATGCAGATATTTATGGCA
+AACCAATACCAACCATTCAGTGGATAAAAGGTGATCAGGAGCTTTCAAACACAGCTCGATTAGAAATAAA
+GAGCACCGACTTTGCCACCAGTCTCAGTGTAAAAGATGCAGTACGTGTCGACAGTGGAAATTACATACTG
+AAGGCCAAAAATGTTGCAGGAGAAAGATCAGTTACTGTGAATGTCAAGGTTCTTGACAGACCAGGGCCAC
+CTGAAGGACCTGTTGTTATCTCAGGAGTTACAGCAGAAAAATGCACACTAGCTTGGAAACCCCCACTTCA
+GGATGGTGGGAGTGACATCATAAATTATATTGTGGAAAGGAGAGAAACCAGCCGCTTAGTTTGGACTGTG
+GTTGATGCCAATGTGCAGACTCTCAGCTGCAAGGTTACTAAGCTTCTTGAAGGCAATGAATATACTTTCC
+GTATAATGGCAGTAAACAAATATGGTGTTGGTGAACCTCTTGAATCTGAGCCAGTAGTTGCCAAGAATCC
+ATTTGTAGTACCAGATGCACCAAAAGCTCCAGAAGTCACAACAGTGACCAAGGACTCAATGATTGTTGTA
+TGGGAAAGACCAGCATCTGATGGTGGTAGTGAAATTCTTGGATATGTTCTTGAGAAACGGGATAAAGAAG
+GCATTAGATGGACAAGATGCCATAAGCGTCTGATTGGAGAGTTGCGCCTGAGAGTAACTGGACTCATAGA
+AAATCACGATTATGAGTTCAGAGTTTCTGCTGAGAATGCTGCTGGACTTAGTGAACCAAGCCCTCCTTCT
+GCTTACCAAAAGGCTTGTGATCCTATTTATAAACCAGGACCCCCAAACAACCCCAAAGTCATAGACATAA
+CCAGATCTTCAGTATTCCTTTCTTGGAGCAAACCAATATATGATGGTGGCTGTGAAATTCAAGGATACAT
+TGTTGAAAAATGTGATGTGAGTGTTGGTGAATGGACAATGTGCACTCCACCAACAGGAATTAATAAAACA
+AACATAGAAGTAGAGAAGCTGTTGGAAAAGCATGAATACAACTTCCGTATCTGTGCTATTAATAAAGCTG
+GAGTTGGAGAACATGCTGACGTCCCTGGACCTATTATAGTTGAAGAAAAATTAGAAGCACCAGACATTGA
+TCTTGACCTAGAACTAAGGAAAATCATAAATATAAGGGCAGGTGGCTCCTTAAGGTTATTTGTTCCTATA
+AAAGGTCGTCCTACACCAGAAGTTAAATGGGGAAAGGTGGATGGTGAAATCCGAGATGCAGCTATAATTG
+ATGTCACTAGCAGTTTCACCTCTCTTGTTCTTGACAATGTCAACCGATATGATAGTGGAAAATATACGCT
+TACATTAGAAAACAGCAGTGGAACAAAGTCTGCCTTTGTTACTGTGAGAGTTCTGGACACGCCAAGTCCA
+CCTGTTAACCTGAAAGTCACAGAAATCACCAAAGACTCAGTATCAATTACATGGGAACCTCCTTTGTTGG
+ATGGGGGATCCAAAATAAAAAATTACATTGTTGAGAAACGTGAAGCCACAAGAAAATCATATGCTGCTGT
+TGTAACTAACTGCCATAAGAATTCTTGGAAAATCGATCAGCTCCAAGAAGGTTGCAGTTATTACTTTAGA
+GTCACAGCTGAGAATGAGTATGGTATTGGCCTTCCTGCCCAGACTGCTGATCCAATTAAGGTTGCAGAAG
+TGCCACAACCTCCTGGAAAAATAACTGTGGATGATGTCACCAGAAACAGTGTCTCTCTGAGTTGGACAAA
+ACCTGAACATGATGGTGGCAGTAAAATCATTCAGTATATTGTGGAAATGCAAGCTAAACACAGTGAGAAA
+TGGTCAGAGTGTGCTCGAGTAAAGTCTCTTCAGGCAGTAATTACCAACCTAACTCAAGGGGAAGAATATC
+TTTTTAGAGTTGTTGCTGTAAATGAAAAGGGGAGAAGTGATCCTCGGTCCCTTGCAGTTCCAATAGTTGC
+CAAAGATCTGGTAATTGAGCCAGATGTAAAACCTGCATTCAGTAGTTACAGTGTACAGGTTGGCCAAGAT
+TTGAAAATAGAAGTGCCAATTTCTGGACGTCCTAAGCCAACCATTACCTGGACTAAAGATGGTCTCCCAC
+TGAAGCAGACCACAAGAATCAATGTTACCGATTCACTGGATCTCACCACACTCAGTATTAAAGAAACTCA
+TAAGGATGATGGTGGACAATATGGAATCACAGTTGCCAATGTTGTTGGTCAGAAGACAGCATCCATCGAA
+ATTGTAACTCTAGATAAACCTGATCCTCCAAAAGGACCTGTTAAATTTGATGACGTCAGTGCTGAAAGTA
+TTACATTATCTTGGAACCCTCCATTATATACAGGGGGCTGCCAAATCACCAACTACATTGTTCAGAAAAG
+AGATACAACCACCACAGTATGGGATGTTGTTTCTGCTACTGTTGCTAGAACTACACTCAAAGTGACCAAA
+CTGAAAACTGGTACAGAATACCAATTTAGAATATTTGCCGAAAACAGATATGGACAAAGCTTTGCCTTAG
+AGTCTGATCCAATTGTAGCTCAATATCCCTACAAAGAACCAGGCCCTCCAGGTACACCATTTGCCACAGC
+CATTTCCAAAGACTCCATGGTCATACAGTGGCATGAACCAGTCAACAATGGTGGAAGCCCCGTCATAGGT
+TACCACCTGGAGAGAAAAGAAAGAAACAGTATTTTGTGGACAAAGGTCAACAAAACTATTATTCATGACA
+CCCAATTCAAAGCACAGAATCTTGAAGAAGGCATTGAATATGAATTCAGAGTGTATGCTGAAAATATTGT
+TGGTGTAGGCAAAGCAAGCAAGAATTCTGAATGCTATGTAGCCAGAGATCCCTGTGACCCACCAGGAACC
+CCAGAACCAATAATGGTTAAAAGAAATGAAATCACTTTACAGTGGACCAAACCTGTGTATGATGGTGGAA
+GTATGATTACAGGCTACATTGTAGAGAAACGTGATTTGCCTGATGGTCGTTGGATGAAAGCTAGCTTTAC
+AAATGTCATTGAAACTCAATTTACTGTGTCAGGTCTTACTGAAGATCAAAGATATGAATTCAGAGTCATT
+GCAAAGAATGCAGCTGGTGCAATAAGTAAACCCTCTGACAGTACTGGACCAATAACTGCCAAGGATGAGG
+TTGAACTCCCAAGAATTTCAATGGATCCAAAATTCAGAGACACAATTGTGGTAAATGCTGGAGAAACATT
+CAGACTTGAGGCTGATGTCCATGGAAAGCCCCTACCTACCATTGAGTGGTTAAGAGGAGATAAGGAAATT
+GAAGAATCTGCTAGATGTGAAATAAAGAACACAGATTTCAAGGCTTTACTTATTGTAAAAGATGCAATTA
+GAATTGATGGTGGGCAGTATATTTTAAGAGCTTCCAATGTTGCAGGTTCTAAGTCATTCCCAGTAAATGT
+AAAAGTATTAGATAGACCAGGACCTCCAGAAGGGCCAGTCCAGGTTACTGGAGTCACTTCTGAAAAATGC
+TCTTTAACATGGTCTCCACCACTTCAAGATGGTGGCAGTGACATTTCTCACTATGTTGTTGAAAAGCGAG
+AAACCAGTCGACTTGCCTGGACTGTTGTTGCTTCAGAAGTTGTGACCAATTCTCTGAAAGTTACCAAACT
+CTTAGAAGGTAATGAATATGTTTTCCGTATAATGGCTGTCAACAAATATGGTGTTGGAGAGCCTTTGGAA
+TCTGCACCAGTACTAATGAAAAATCCATTTGTGCTTCCTGGACCACCAAAAAGCTTGGAAGTCACAAATA
+TTGCCAAAGACTCCATGACCGTCTGTTGGAACCGTCCAGATAGTGATGGTGGAAGTGAGATTATTGGTTA
+CATTGTAGAGAAAAGAGACAGAAGTGGCATTCGATGGATAAAATGTAATAAACGCCGCATTACAGATTTG
+CGTCTAAGAGTGACAGGATTAACAGAAGATCATGAGTATGAATTCAGGGTCTCTGCAGAAAATGCTGCTG
+GAGTTGGGGAACCAAGTCCAGCTACAGTTTATTATAAAGCCTGTGATCCTGTGTTCAAACCTGGCCCACC
+TACCAATGCACACATTGTAGACACCACTAAAAATTCAATCACACTTGCCTGGGGTAAACCCATCTATGAT
+GGCGGCAGTGAGATCTTGGGATATGTAGTAGAAATCTGTAAAGCAGATGAAGAAGAATGGCAAATAGTTA
+CTCCACAGACTGGCCTGAGAGTCACTCGATTTGAAATTTCAAAACTCACTGAACACCAAGAGTATAAAAT
+ACGAGTCTGTGCCCTCAACAAAGTTGGTTTAGGTGAGGCTACATCAGTTCCTGGTACTGTGAAACCAGAA
+GATAAACTTGAAGCACCTGAACTTGACCTTGACTCCGAATTAAGAAAAGGAATTGTTGTAAGAGCTGGTG
+GATCTGCCAGAATTCACATTCCATTCAAAGGTCGTCCAACGCCTGAGATCACTTGGTCTCGAGAGGAAGG
+TGAATTCACAGATAAGGTCCAAATTGAAAAGGGAGTAAACTATACCCAACTATCAATAGATAACTGTGAT
+AGAAATGATGCTGGAAAATACATTCTTAAGTTGGAAAACAGCAGTGGATCAAAGTCTGCTTTTGTAACTG
+TGAAAGTTCTTGACACTCCAGGACCACCACAGAATTTGGCAGTCAAAGAAGTGAGAAAAGATTCTGCCTT
+CCTGGTATGGGAGCCACCCATCATTGATGGAGGGGCAAAGGTCAAGAACTATGTGATTGACAAACGTGAG
+TCAACCAGAAAAGCGTATGCTAATGTGAGTAGTAAATGCAGCAAAACAAGTTTTAAAGTGGAAAACCTTA
+CAGAAGGAGCCATTTATTACTTCAGAGTCATGGCTGAAAATGAATTTGGAGTTGGTGTTCCAGTGGAAAC
+TGTTGATGCCGTGAAAGCTGCTGAACCTCCTTCCCCACCAGGAAAGGTTACACTCACTGATGTGTCCCAG
+ACCAGTGCATCACTTATGTGGGAGAAACCTGAACATGATGGCGGTAGCAGAGTCCTGGGGTACGTTGTTG
+AAATGCAGCCCAAAGGAACTGAAAAATGGAGCATTGTGGCTGAATCCAAAGTCTGTAATGCAGTTGTTAC
+TGGTTTGAGTTCTGGACAAGAATATCAGTTCCGTGTCAAGGCTTATAATGAGAAAGGAAAAAGCGATCCA
+AGAGTGTTGGGTGTTCCTGTCATAGCCAAGGACTTGACTATACAGCCTAGTTTAAAGTTACCATTTAACA
+CATATAGTATCCAAGCTGGAGAAGATCTTAAAATAGAAATTCCAGTTATAGGCCGACCAAGACCTAACAT
+TTCTTGGGTCAAAGATGGTGAGCCTCTTAAACAGACAACAAGAGTAAACGTTGAAGAAACAGCTACCTCA
+ACTGTTTTGCACATTAAAGAAGGTAACAAAGATGACTTTGGAAAATACACCGTAACGGCAACAAATAGTG
+CAGGCACAGCAACAGAAAATCTCAGTGTTATCGTTTTAGAAAAGCCTGGACCTCCAGTTGGCCCAGTTCG
+GTTTGATGAAGTTAGTGCAGACTTTGTAGTCATATCTTGGGAACCTCCAGCCTATACTGGTGGCTGCCAA
+ATAAGCAACTACATTGTAGAGAAGCGAGATACAACCACCACCACTTGGCACATGGTATCAGCAACAGTTG
+CAAGAACAACAATTAAAATAACCAAACTGAAAACAGGCACGGAGTACCAGTTTAGAATTTTTGCTGAAAA
+CAGGTATGGAAAAAGTGCCCCACTGGATTCTAAGGCAGTTATTGTACAATATCCATTTAAAGAACCTGGA
+CCACCTGGAACTCCTTTTGTGACATCAATCTCAAAAGATCAGATGCTTGTGCAATGGCATGAGCCAGTGA
+ATGATGGAGGCACCAAAATTATTGGCTACCATCTTGAACAGAAAGAAAAGAACAGTATTTTATGGGTCAA
+GTTAAATAAGACCCCCATTCAGGACACCAAATTCAAAACAACTGGGCTTGATGAGGGCCTTGAGTATGAG
+TTCAAAGTTTCTGCTGAAAATATTGTTGGCATTGGCAAGCCTAGCAAAGTGTCAGAATGCTTTGTTGCTC
+GTGATCCATGTGACCCACCTGGTCGCCCTGAAGCCATTGTTATTACAAGAAACAATGTCACACTGAAATG
+GAAGAAACCTGCCTATGATGGTGGTAGCAAAATAACAGGTTATATTGTAGAAAAGAAAGATCTACCTGAT
+GGCCGCTGGATGAAAGCCAGCTTTACCAACGTATTAGAAACTGAATTTACAGTGAGTGGACTTGTAGAAG
+ACCAAAGATATGAATTTAGAGTAATTGCAAGAAATGCAGCTGGAAACTTTAGTGAACCATCTGATAGTAG
+TGGTGCCATTACTGCAAGAGATGAAATTGATGCACCAAATGCCTCTCTGGATCCAAAATATAAAGATGTC
+ATCGTTGTTCATGCAGGAGAGACTTTTGTTCTTGAAGCCGACATCCGTGGCAAACCTATACCTGATGTTG
+TTTGGTCAAAAGATGGAAAAGAACTTGAAGAAACAGCTGCTAGAATGGAAATTAAATCTACTATTCAGAA
+AACAACTCTTGTTGTCAAAGACTGTATACGGACTGATGGAGGACAATATATTCTGAAACTCAGCAATGTT
+GGTGGTACAAAGTCTATACCCATCACTGTAAAGGTACTTGACAGGCCAGGGCCTCCTGAAGGGCCTCTGA
+AAGTTACTGGAGTTACTGCGGAAAAATGTTACCTGGCATGGAACCCACCTTTGCAAGATGGTGGTGCTAA
+TATTTCACATTACATCATTGAAAAGAGGGAGACAAGCCGACTCTCTTGGACCCAGGTTTCAACTGAGGTA
+CAGGCCCTTAACTACAAAGTTACTAAACTTCTTCCTGGTAATGAGTACATTTTCCGTGTCATGGCTGTGA
+ATAAATATGGAATTGGAGAGCCCTTGGAATCTGGGCCTGTTACGGCCTGTAATCCTTATAAGCCACCAGG
+TCCTCCCTCAACACCTGAAGTCTCAGCAATCACCAAAGATTCTATGGTAGTAACATGGGCACGCCCAGTA
+GACGACGGAGGTACCGAAATTGAGGGCTACATTCTTGAAAAACGAGATAAGGAAGGCGTTAGATGGACCA
+AGTGCAACAAGAAAACATTAACGGATCTGCGGCTCAGGGTAACTGGTCTTACCGAAGGCCATTCCTATGA
+ATTCAGAGTTGCTGCTGAAAATGCAGCTGGTGTGGGAGAACCTAGTGAGCCATCTGTTTTCTACCGTGCG
+TGTGATGCCTTGTATCCACCAGGTCCCCCAAGCAATCCAAAAGTGACGGACACTTCCAGATCTTCTGTCT
+CCCTGGCATGGAGTAAGCCAATTTATGATGGTGGCGCACCTGTTAAAGGCTATGTTGTAGAGGTCAAAGA
+AGCTGCTGCGGATGAATGGACAACCTGCACTCCACCAACAGGATTACAAGGAAAGCAGTTCACAGTGACC
+AAGCTTAAAGAAAACACTGAATATAACTTCCGTATTTGTGCCATCAATTCTGAAGGTGTAGGTGAACCTG
+CAACTCTACCTGGCTCAGTGGTTGCTCAGGAGAGGATAGAGCCACCAGAAATAGAACTCGATGCTGATCT
+CAGAAAGGTGGTCGTTCTGCGTGCAAGTGCTACTTTACGCTTATTTGTCACTATCAAAGGTCGACCAGAA
+CCCGAAGTTAAATGGGAAAAGGCAGAAGGCATTCTCACTGACAGGGCTCAGATAGAGGTGACCAGCTCAT
+TTACAATGTTGGTGATTGATAATGTTACCAGATTTGACAGTGGTCGGTATAATCTGACATTAGAAAATAA
+TAGTGGCTCCAAAACAGCTTTTGTTAACGTCAGAGTTCTTGACTCACCAAGTGCCCCTGTGAATTTGACC
+ATAAGAGAAGTGAAGAAAGACTCAGTGACGTTGTCCTGGGAACCACCACTTATTGATGGTGGAGCTAAGA
+TTACAAACTACATTGTCGAAAAACGAGAAACTACAAGAAAAGCCTATGCTACCATTACAAATAATTGCAC
+TAAAACTACTTTCAGAATTGAAAATCTACAAGAAGGATGTTCTTACTACTTCCGAGTCTTGGCTTCCAAT
+GAATATGGGATTGGTTTGCCAGCTGAAACAACAGAACCCGTTAAAGTGTCTGAACCACCCCTCCCACCTG
+GAAGAGTAACTCTTGTTGATGTGACCCGTAATACAGCTACAATTAAGTGGGAGAAACCAGAAAGTGATGG
+TGGCAGCAAAATTACTGGTTATGTGGTTGAAATGCAGACTAAAGGGAGTGAAAAGTGGAGCACCTGCACA
+CAAGTTAAGACTCTAGAAGCAACTATATCTGGCTTAACTGCAGGAGAAGAGTATGTCTTCAGGGTAGCTG
+CAGTTAACGAAAAGGGAAGAAGTGATCCAAGACAACTTGGAGTGCCAGTAATTGCAAGGGATATTGAAAT
+AAAGCCTTCAGTTGAGCTTCCTTTCCATACTTTCAATGTAAAGGCTAGAGAACAACTTAAGATTGATGTG
+CCATTCAAAGGAAGACCTCAAGCTACTGTGAACTGGAGAAAAGATGGTCAGACTCTTAAAGAGACAACTA
+GAGTCAATGTTTCTTCTTCAAAGACTGTAACATCACTATCTATTAAGGAAGCTTCAAAGGAAGATGTTGG
+AACTTATGAATTATGTGTTTCAAACAGTGCTGGATCCATAACAGTTCCTATTACTATAATTGTCCTTGAC
+AGACCAGGACCTCCAGGTCCTATACGTATTGATGAGGTTAGTTGTGACAGCATAACCATTTCTTGGAATC
+CTCCAGAATATGATGGTGGCTGCCAAATTAGCAATTACATTGTTGAAAAGAAAGAAACCACCTCTACAAC
+ATGGCACATAGTTTCACAAGCAGTTGCAAGAACATCCATTAAAATAGTTCGCCTGACAACAGGAAGTGAG
+TATCAGTTCCGTGTTTGTGCAGAAAACCGCTATGGAAAGAGCTCCTACAGTGAATCTTCAGCTGTTGTTG
+CAGAGTATCCATTCAGTCCCCCAGGTCCTCCTGGTACTCCTAAAGTTGTGCATGCCACAAAATCTACCAT
+GCTTGTAACCTGGCAAGTGCCAGTTAATGATGGAGGAAGTCGAGTAATTGGCTATCATCTTGAGTATAAA
+GAAAGAAGCAGCATTCTTTGGTCAAAAGCAAATAAAATCCTCATTGCTGATACTCAAATGAAAGTCTCCG
+GCCTTGATGAAGGACTGATGTATGAGTATCGTGTATATGCTGAAAATATTGCTGGAATTGGTAAATGCAG
+TAAATCTTGTGAACCAGTCCCTGCAAGAGATCCTTGTGACCCTCCTGGACAACCTGAAGTCACAAATATC
+ACAAGAAAATCAGTGTCACTTAAATGGTCTAAACCACATTATGATGGTGGAGCTAAGATCACAGGATACA
+TTGTTGAACGCAGAGAACTACCAGATGGCCGGTGGCTGAAGTGCAATTATACTAATATACAAGAAACATA
+CTTTGAAGTAACTGAACTTACTGAAGATCAGCGTTATGAATTCCGGGTTTTTGCAAGGAATGCTGCTGAC
+TCAGTTAGTGAGCCATCTGAATCCACTGGGCCTATTATAGTTAAAGATGATGTTGAGCCTCCAAGAGTTA
+TGATGGATGTCAAGTTCCGAGACGTTATTGTTGTCAAAGCTGGAGAGGTCCTTAAGATAAATGCAGACAT
+TGCAGGGCGACCTCTGCCAGTAATTTCCTGGGCCAAGGATGGTATAGAAATTGAAGAAAGAGCAAGAACA
+GAAATCATCTCAACAGACAATCATACTTTGTTAACAGTTAAAGACTGTATAAGACGAGACACTGGGCAAT
+ATGTACTAACACTGAAGAATGTTGCCGGCACTCGGTCTGTGGCCGTTAATTGCAAAGTACTTGATAAGCC
+TGGTCCACCAGCAGGACCACTTGAAATAAATGGCCTCACTGCTGAGAAATGCTCTCTTTCCTGGGGACGT
+CCCCAAGAAGATGGTGGTGCAGATATCGACTATTACATCGTAGAAAAACGTGAAACAAGCCACCTTGCAT
+GGACAATATGTGAAGGAGAGTTACAGATGACATCCTGTAAAGTAACCAAGTTACTCAAAGGCAATGAATA
+TATATTTAGAGTAACTGGTGTTAATAAATATGGTGTTGGTGAGCCCCTAGAGAGTGTAGCTATAAAGGCA
+CTAGATCCATTTACAGTTCCAAGTCCACCCACGTCTTTGGAAATTACTTCTGTGACCAAAGAATCTATGA
+CACTTTGCTGGTCAAGACCCGAGAGTGATGGAGGTAGTGAAATATCTGGATATATAATTGAAAGGCGAGA
+GAAAAATAGCCTAAGATGGGTGCGTGTAAACAAAAAACCAGTTTATGATCTAAGAGTGAAATCAACAGGA
+CTTCGGGAAGGATGTGAATATGAATATCGTGTTTATGCAGAAAATGCTGCTGGCCTAAGTCTTCCAAGTG
+AAACCTCTCCCTTAATTAGGGCAGAAGATCCAGTGTTCCTACCATCTCCTCCATCCAAACCCAAAATTGT
+GGACTCAGGCAAGACAACTATAACTATTGCCTGGGTTAAGCCGCTGTTTGATGGTGGGGCCCCGATAACT
+GGATATACTGTAGAATACAAAAAATCTGATGACACTGACTGGAAAACTTCCATTCAGAGCTTACGAGGGA
+CAGAATATACAATAAGCGGACTAACAACAGGAGCTGAATATGTTTTCAGAGTAAAATCTGTCAATAAGGT
+TGGTGCTAGTGACCCCAGTGATAGCTCTGACCCTCAGATAGCAAAGGAAAGAGAAGAAGAACCTTTATTT
+GATATTGACAGTGAAATGAGGAAGACCTTGATTGTCAAGGCTGGTGCCTCATTTACCATGACTGTGCCTT
+TCCGAGGAAGACCAGTACCCAATGTCTTGTGGAGTAAGCCAGACACTGACCTCCGTACTAGAGCTTATGT
+TGATACCACAGACTCCCGTACATCACTGACCATTGAAAATGCCAACAGAAATGACTCTGGAAAGTACACA
+TTAACAATTCAGAATGTTTTGAGTGCTGCTTCACTGACCTTAGTTGTCAAAGTTTTAGATACCCCAGGTC
+CTCCAACCAACATTACTGTGCAAGATGTAACCAAAGAGTCTGCAGTGTTATCCTGGGATGTTCCTGAAAA
+CGATGGTGGAGCACCAGTGAAGAATTACCACATAGAAAAACGTGAGGCCAGCAAGAAAGCATGGGTCTCT
+GTGACCAACAACTGTAACCGCCTCTCCTACAAAGTTACCAATTTACAAGAAGGAGCTATCTATTACTTCA
+GAGTCTCTGGAGAAAATGAGTTTGGTGTTGGTATACCAGCTGAAACAAAGGAAGGAGTAAAAATAACAGA
+AAAACCAAGCCCACCTGAAAAACTTGGAGTAACAAGTATATCCAAAGACAGTGTTTCCCTGACCTGGCTG
+AAGCCTGAACATGATGGCGGAAGCAGAATTGTACACTATGTCGTTGAAGCACTAGAAAAAGGACAGAAAA
+ACTGGGTTAAATGTGCAGTGGCAAAGTCAACCCATCACGTTGTTTCCGGTCTGAGAGAGAATTCTGAATA
+CTTTTTCCGAGTGTTTGCTGAAAATCAAGCTGGCCTGAGTGACCCGAGAGAGCTTCTGCTTCCTGTTCTT
+ATTAAGGAGCAACTAGAACCACCTGAAATTGATATGAAGAATTTCCCAAGTCACACTGTATATGTTAGAG
+CTGGTTCAAACCTTAAAGTTGACATTCCAATCTCTGGAAAACCACTTCCCAAAGTGACCTTATCAAGAGA
+TGGTGTCCCCCTTAAGGCAACCATGAGATTTAATACCGAAATTACTGCTGAGAACCTGACCATCAATCTC
+AAAGAAAGTGTTACAGCTGACGCTGGGAGATATGAAATCACTGCTGCCAACTCCAGTGGTACAACCAAAG
+CTTTCATTAACATTGTTGTGCTAGACAGGCCTGGTCCTCCAACTGGCCCTGTTGTTATTAGTGATATAAC
+TGAAGAAAGTGTGACTCTCAAATGGGAGCCACCTAAGTATGACGGTGGAAGTCAAGTTACCAACTACATT
+CTACTCAAAAGAGAAACAAGTACTGCAGTGTGGACTGAAGTGTCTGCAACAGTTGCAAGAACCATGATGA
+AAGTCATGAAACTGACCACAGGAGAAGAATACCAATTCCGCATCAAGGCAGAAAACCGCTTTGGCATCAG
+TGATCATATAGATTCAGCTTGTGTGACTGTCAAACTACCATACACAACACCTGGACCACCATCTACACCA
+TGGGTCACTAATGTTACTCGAGAAAGCATCACTGTGGGCTGGCATGAACCAGTGTCAAATGGAGGCAGTG
+CAGTCGTAGGCTATCACCTGGAAATGAAAGACAGAAACAGTATTTTATGGCAAAAAGCCAACAAACTGGT
+CATCCGCACAACTCACTTCAAAGTCACAACAATCAGTGCTGGACTTATTTATGAATTCAGGGTGTATGCA
+GAAAATGCTGCTGGAGTTGGAAAACCTAGCCATCCTTCTGAACCAGTCTTGGCAATTGATGCTTGTGAAC
+CCCCAAGAAATGTTCGTATCACTGATATTTCAAAGAACTCTGTCAGCCTTTCATGGCAACAACCAGCTTT
+CGATGGAGGTAGCAAGATTACAGGCTACATTGTTGAGAGACGTGACCTTCCAGATGGCAGATGGACCAAG
+GCCAGCTTCACCAATGTTACTGAAACTCAATTCATCATCTCTGGCTTGACTCAGAATTCCCAGTATGAAT
+TCCGTGTCTTTGCTAGGAATGCTGTTGGTTCCATTAGCAATCCATCTGAGGTTGTAGGGCCCATTACTTG
+CATCGATTCTTATGGTGGTCCTGTAATTGATTTGCCTCTAGAATATACAGAAGTTGTCAAATACAGAGCA
+GGTACATCTGTGAAGCTCAGAGCTGGCATTTCTGGCAAACCTGCGCCTACTATTGAGTGGTATAAAGATG
+ATAAAGAATTACAAACCAATGCACTGGTGTGTGTTGAAAATACCACGGACCTCGCATCTATACTCATCAA
+AGATGCCGATCGCCTTAATAGTGGATGCTATGAATTAAAACTAAGGAATGCCATGGGCTCAGCCTCAGCC
+ACCATCAGAGTACAGATCCTTGACAAACCAGGCCCACCTGGTGGACCAATTGAATTTAAGACTGTAACTG
+CTGAGAAGATCACCCTTCTCTGGCGGCCTCCAGCTGATGATGGTGGTGCAAAAATCACTCACTACATTGT
+GGAAAAGCGTGAGACAAGCCGCGTTGTGTGGTCTATGGTGTCTGAACATTTGGAAGAGTGCATCATTACA
+ACCACCAAAATTATCAAAGGAAATGAATACATCTTCCGGGTCCGAGCCGTGAACAAATATGGAATTGGCG
+AGCCACTGGAATCTGATTCCGTTGTAGCCAAGAACGCATTTGTTACACCTGGGCCACCAGGCATACCAGA
+AGTGACAAAGATTACCAAGAATTCGATGACTGTTGTATGGAGCAGGCCAATTGCAGATGGCGGTAGTGAT
+ATAAGTGGCTATTTCCTTGAAAAACGAGACAAGAAGAGCCTAGGATGGTTTAAAGTACTAAAAGAGACTA
+TCCGTGACACCAGACAAAAAGTAACAGGACTCACAGAAAACAGTGACTATCAATACAGAGTTTGTGCTGT
+AAACGCTGCTGGACAGGGTCCATTTTCTGAACCATCTGAATTCTACAAAGCTGCTGATCCTATTGATCCT
+CCAGGTCCACCTGCTAAGATAAGAATCGCAGATTCAACCAAGTCATCCATCACCCTTGGCTGGAGTAAGC
+CTGTCTATGATGGGGGCAGTGCTGTTACTGGGTATGTTGTCGAGATAAGACAAGGAGAGGAAGAGGAATG
+GACTACTGTCTCTACCAAAGGAGAGGTCAGAACTACAGAATATGTGGTATCCAACCTGAAACCTGGAGTC
+AATTACTACTTCCGGGTATCTGCTGTAAACTGTGCTGGACAAGGAGAACCTATAGAAATGAATGAACCTG
+TACAAGCTAAAGATATACTTGAGGCACCAGAGATTGACCTGGATGTGGCTCTCAGAACTTCTGTTATTGC
+CAAAGCTGGTGAAGATGTACAAGTGTTGATTCCCTTTAAAGGCAGACCTCCACCTACTGTCACATGGAGA
+AAAGATGAGAAGAATCTTGGCAGTGATGCCAGATACAGCATTGAAAACACTGATTCATCCTCATTACTCA
+CCATTCCTCAAGTTACTCGCAATGATACAGGAAAATATATTCTCACAATAGAAAATGGAGTTGGTGAACC
+TAAGTCTTCAACTGTGAGTGTTAAAGTGCTTGACACACCAGCTGCCTGCCAGAAACTACAGGTTAAACAT
+GTTTCTCGAGGCACAGTCACTTTGCTCTGGGATCCTCCTCTCATTGATGGAGGATCTCCAATAATTAATT
+ATGTCATTGAAAAGAGAGATGCCACCAAGAGAACATGGTCTGTCGTGTCACACAAATGTTCTAGCACATC
+CTTCAAGCTAATAGATTTGTCGGAGAAGACTCCATTCTTCTTCAGAGTTCTTGCAGAAAATGAAATTGGA
+ATTGGGGAACCCTGTGAAACTACAGAGCCAGTGAAGGCTGCTGAAGTACCAGCTCCTATACGTGATCTCT
+CAATGAAAGACTCAACAAAGACATCTGTCATCCTCAGCTGGACCAAACCTGACTTTGATGGTGGTAGCGT
+CATCACAGAATATGTTGTAGAAAGGAAAGGTAAAGGTGAACAGACGTGGTCCCACGCTGGCATAAGTAAG
+ACATGTGAAATTGAGGTTAGCCAACTTAAGGAGCAGTCAGTCCTGGAGTTCAGAGTGTTTGCCAAAAATG
+AGAAAGGACTGAGTGATCCTGTCACTATTGGGCCAATTACAGTGAAAGAACTTATTATTACACCTGAAGT
+TGACCTGTCAGATATCCCTGGGGCACAAGTCACTGTGAGAATTGGGCACAATGTGCACCTTGAATTACCT
+TATAAGGGAAAACCCAAACCATCCATCAGTTGGCTGAAAGATGGCTTGCCACTGAAAGAAAGTGAATTTG
+TTCGCTTCAGTAAAACTGAAAACAAAATTACTTTGAGTATTAAGAATGCCAAGAAGGAGCATGGAGGAAA
+ATACACTGTTATTCTTGATAATGCAGTGTGTAGAATTGCAGTCCCCATTACAGTCATCACCCTTGGCCCA
+CCATCAAAGCCCAAAGGACCCATTCGATTTGATGAAATCAAGGCTGATAGTGTCATCCTGTCATGGGATG
+TACCTGAAGATAATGGAGGAGGAGAAATTACTTGTTACAGCATCGAGAAGCGGGAAACTTCACAAACTAA
+CTGGAGGATGGTGTGTTCAAGTGTTGCCAGAACGACTTTCAAAGTTCCTAATCTAGTCAAAGATGCTGAG
+TACCAGTTTAGAGTGAGAGCAGAAAACAGATACGGAGTCAGCCAACCACTTGTCTCAAGCATTATTGTGG
+CAAAACACCAGTTCAGGATTCCTGGTCCCCCAGGAAAGCCAGTTATATACAATGTGACTTCTGATGGCAT
+GTCACTAACTTGGGATGCTCCAGTTTATGATGGTGGTTCAGAAGTTACTGGATTCCATGTTGAAAAGAAA
+GAAAGAAATAGCATCCTCTGGCAAAAAGTTAATACATCACCAATCTCTGGAAGAGAATATAGAGCCACTG
+GACTGGTAGAAGGTCTGGATTACCAATTCCGTGTATATGCTGAAAATTCTGCTGGCCTAAGCTCACCTAG
+TGACCCAAGCAAATTTACCTTAGCTGTTTCTCCAGTAGACCCACCTGGCACTCCTGACTACATTGATGTC
+ACCCGGGAAACCATCACACTTAAATGGAACCCACCATTGCGTGATGGAGGCAGTAAGATTGTGGGCTATA
+GCATTGAGAAACGGCAAGGAAATGAACGCTGGGTGAGATGCAACTTTACTGACGTCAGTGAATGTCAGTA
+CACAGTTACAGGACTCAGTCCTGGGGATCGCTATGAGTTCAGAATAATTGCAAGAAATGCTGTTGGAACT
+ATAAGCCCGCCCTCACAGTCTTCTGGCATTATTATGACAAGAGATGAAAATGTTCCACCAATAGTAGAGT
+TTGGCCCTGAATACTTTGATGGTCTCATTATTAAGTCCGGAGAGAGCCTTAGAATTAAAGCTTTGGTACA
+AGGAAGACCAGTGCCTCGAGTAACTTGGTTCAAAGATGGAGTGGAAATCGAAAAGAGGATGAATATGGAA
+ATAACCGACGTACTTGGATCCACCAGCCTATTTGTTAGAGATGCTACTCGGGACCATCGTGGTGTATACA
+CAGTGGAAGCCAAAAATGCATCTGGTTCTGCAAAAGCAGAAATTAAAGTGAAAGTACAAGATACACCAGG
+AAAAGTAGTTGGGCCAATAAGATTCACCAATATTACTGGGGAGAAGATGACTCTGTGGTGGGATGCCCCA
+CTCAATGACGGTTGTGCTCCCATAACCCACTACATCATTGAAAAACGGGAAACCAGCAGACTTGCCTGGG
+CACTAATTGAGGATAAATGTGAAGCCCAAAGTTACACTGCCATTAAACTAATAAACGGCAATGAATACCA
+ATTCCGTGTTTCTGCAGTTAACAAGTTTGGTGTTGGCAGGCCACTTGATTCTGATCCAGTGGTTGCTCAA
+ATACAATATACTGTTCCTGATGCCCCTGGCATTCCAGAACCTAGCAACATAACAGGCAACAGCATTACCC
+TGACATGGGCAAGGCCAGAATCAGATGGTGGCAGTGAAATTCAACAGTATATCCTTGAAAGAAGAGAAAA
+GAAAAGCACAAGATGGGTAAAAGTGATCAGCAAACGACCAATCTCTGAAACAAGATTCAAAGTCACTGGT
+CTGACAGAAGGCAATGAGTATGAATTCCATGTCATGGCTGAAAATGCTGCAGGAGTTGGACCTGCAAGTG
+GCATCTCAAGACTCATTAAATGTAGAGAGCCCGTCAACCCACCAGGTCCTCCCACAGTGGTCAAAGTAAC
+AGACACATCAAAGACAACTGTGAGCTTAGAATGGTCCAAACCAGTGTTTGATGGTGGCATGGAAATAATT
+GGGTATATTATTGAAATGTGTAAGGCCGACTTAGGAGACTGGCACAAGGTGAATGCAGAGGCATGTGTGA
+AAACAAGATATACAGTCACTGATCTACAAGCAGGTGAAGAATACAAATTCCGAGTTAGTGCTATCAATGG
+TGCTGGAAAAGGCGACAGCTGTGAAGTGACTGGCACAATTAAAGCAGTTGACCGGTTAACAGCTCCTGAG
+TTAGACATAGATGCAAACTTCAAACAGACTCATGTTGTTAGAGCTGGGGCCAGTATTCGCCTCTTCATTG
+CCTACCAAGGTAGACCTACTCCTACAGCTGTGTGGAGCAAACCAGACTCTAACCTTAGCCTTCGGGCTGA
+TATCCATACAACAGATTCCTTCAGCACCCTCACTGTGGAAAACTGCAACAGAAATGATGCAGGGAAATAT
+ACCCTTACTGTGGAAAACAACAGTGGTAGTAAGTCAATCACATTCACCGTGAAAGTGCTAGACACTCCAG
+GCCCACCTGGCCCAATTACCTTCAAAGATGTGACCCGGGGATCTGCTACATTGATGTGGGATGCCCCTCT
+TCTTGACGGTGGTGCCCGAATCCATCATTATGTGGTAGAGAAACGAGAGGCAAGTCGCCGTAGTTGGCAG
+GTTATCAGTGAAAAATGCACTCGTCAGATCTTCAAGGTCAATGACCTGGCCGAAGGTGTTCCGTACTATT
+TCCGTGTTTCTGCAGTAAATGAGTATGGTGTTGGTGAGCCCTATGAAATGCCAGAACCAATTGTAGCCAC
+AGAACAGCCTGCTCCACCTAGGAGACTTGATGTTGTTGATACTAGCAAATCCTCCGCAGTCTTAGCTTGG
+CTTAAACCTGACCACGATGGAGGCAGCCGGATCACTGGCTACCTGCTTGAAATGAGACAAAAGGGATCTG
+ACTTCTGGGTTGAAGCTGGTCACACCAAACAGCTAACTTTCACAGTAGAGCGTCTTGTTGAGAAAACTGA
+ATATGAATTCCGTGTGAAGGCCAAGAATGATGCTGGCTATAGTGAACCCAGAGAAGCCTTCTCTTCTGTC
+ATCATTAAGGAGCCTCAAATCGAGCCCACTGCTGACCTCACTGGAATTACCAATCAGCTTATAACTTGCA
+AAGCAGGAAGCCCATTTACCATTGACGTACCAATCAGTGGTCGTCCTGCCCCCAAAGTAACATGGAAACT
+GGAAGAAATGAGACTTAAAGAGACAGATCGAGTGAGCATTACAACAACAAAAGACAGAACCACACTGACT
+GTAAAGGACAGCATGAGAGGTGACTCTGGAAGATACTTCTTGACCCTGGAAAATACAGCTGGTGTTAAAA
+CATTTAGCGTCACAGTTGTGGTCATTGGAAGGCCAGGTCCAGTAACCGGCCCCATTGAGGTCTCATCTGT
+CTCAGCTGAATCGTGTGTCCTGTCATGGGGAGAACCTAAAGATGGAGGAGGCACTGAAATTACTAATTAC
+ATAGTTGAAAAGCGTGAATCGGGTACAACAGCTTGGCAGCTTGTCAATTCCAGTGTCAAGCGCACTCAAA
+TTAAAGTCACTCATCTCACAAAATACATGGAATATTCTTTCCGTGTCAGTTCAGAGAACAGATTTGGTGT
+CAGCAAACCTCTAGAATCAGCACCAATAATTGCTGAACATCCATTTGTCCCACCAAGCGCTCCTACCAGA
+CCTGAGGTCTACCATGTGTCTGCCAATGCCATGTCTATTCGTTGGGAAGAACCCTACCACGATGGTGGCA
+GTAAAATCATTGGCTACTGGGTTGAGAAGAAAGAACGTAATACAATTCTTTGGGTGAAAGAAAACAAAGT
+GCCATGCTTAGAGTGCAACTACAAAGTAACTGGTTTAGTAGAAGGACTGGAATATCAGTTCAGAACTTAT
+GCACTCAATGCTGCAGGTGTTAGCAAGGCCAGCGAAGCTTCAAGACCTATAATGGCTCAAAATCCAGTTG
+ATGCACCAGGCAGACCAGAGGTGACAGATGTCACAAGATCAACAGTATCACTGATTTGGTCTGCCCCAGC
+GTATGATGGAGGCAGCAAGGTTGTGGGCTACATCATAGAGCGTAAGCCAGTCAGTGAGGTAGGAGATGGT
+CGCTGGCTGAAGTGCAACTACACCATTGTATCTGACAATTTCTTCACCGTGACTGCTCTCAGTGAAGGAG
+ACACTTATGAGTTCCGTGTGTTAGCCAAGAATGCAGCAGGCGTAATTAGCAAAGGGTCTGAATCTACAGG
+CCCTGTCACTTGCCGAGATGAATACGCTCCACCCAAAGCCGAACTGGATGCCCGATTACACGGTGATCTG
+GTTACCATCAGAGCAGGTTCTGATCTTGTTCTGGATGCTGCAGTTGGTGGCAAACCTGAACCCAAAATTA
+TCTGGACCAAAGGAGACAAGGAGCTAGATCTCTGTGAAAAAGTCTCTTTGCAGTATACTGGCAAACGAGC
+AACTGCTGTGATCAAGTTCTGTGACAGAAGTGACAGTGGAAAATACACTTTAACAGTGAAAAATGCCAGC
+GGGACCAAGGCCGTGTCTGTCATGGTCAAAGTGCTTGATTCCCCTGGCCCATGTGGAAAGCTCACCGTCA
+GCAGAGTAACACAGGAGAAGTGCACTTTAGCCTGGAGCCTTCCGCAGGAAGACGGAGGAGCAGAAATCAC
+TCACTACATCGTGGAAAGACGCGAGACTAGCAGGCTCAACTGGGTGATTGTTGAAGGCGAATGCCCAACC
+CTATCCTATGTCGTTACCAGGCTCATCAAGAACAATGAGTACATATTCCGAGTGAGGGCAGTAAACAAAT
+ATGGCCCTGGTGTGCCTGTTGAATCAGAGCCAATTGTAGCCAGAAACTCATTCACTATTCCATCACCACC
+CGGCATACCTGAAGAAGTTGGGACTGGCAAAGAGCATATCATCATTCAGTGGACAAAACCTGAATCTGAT
+GGTGGCAATGAAATCAGCAACTACCTAGTAGACAAACGTGAGAAGAAGAGCCTGCGCTGGACACGTGTCA
+ACAAAGACTATGTGGTGTATGATACCAGGCTGAAGGTGACCAGCCTGATGGAGGGTTGTGATTACCAGTT
+CCGGGTGACCGCAGTGAATGCAGCTGGTAACAGTGAGCCCAGCGAAGCTTCCAACTTCATCTCATGCAGA
+GAACCATCATATACCCCTGGACCACCTTCTGCTCCAAGAGTTGTGGATACCACCAAACACAGCATTAGTT
+TGGCATGGACCAAACCCATGTACGATGGTGGTACTGACATTGTAGGATATGTTCTGGAAATGCAAGAGAA
+GGACACTGATCAGTGGTACCGAGTGCATACCAATGCCACAATAAGAAATACTGAATTCACTGTGCCAGAC
+CTTAAAATGGGCCAGAAATATTCCTTCAGAGTTGCTGCCGTGAACGTGAAGGGTATGAGCGAATACAGCG
+AATCAATTGCTGAAATTGAGCCCGTGGAAAGAATAGAAATACCAGATCTTGAGCTTGCAGATGATCTAAA
+GAAGACTGTGACCATCAGGGCTGGGGCCTCCTTGCGCTTGATGGTGTCTGTATCTGGAAGACCACCTCCT
+GTCATAACGTGGAGCAAGCAGGGCATTGACCTTGCAAGCCGGGCAATTATTGACACCACTGAGAGCTACT
+CATTGCTAATAGTGGACAAAGTTAATCGGTACGATGCTGGAAAATACACAATTGAAGCTGAAAACCAATC
+TGGCAAGAAATCAGCAACAGTCCTTGTTAAAGTCTATGATACTCCTGGTCCCTGTCCTTCAGTGAAAGTT
+AAGGAAGTATCAAGAGATTCTGTGACTATAACTTGGGAAATTCCCACGATTGATGGTGGAGCTCCAGTCA
+ACAATTACATCGTTGAGAAGCGTGAAGCTGCTATGAGAGCATTCAAAACAGTAACTACCAAATGCAGCAA
+GACACTTTACAGAATTTCTGGACTTGTAGAAGGAACCATGTACTATTTCAGAGTGCTGCCAGAAAATATT
+TATGGCATTGGAGAACCTTGTGAAACATCTGATGCAGTACTGGTCTCAGAAGTGCCTTTGGTGCCTGCAA
+AGCTAGAAGTGGTCGATGTCACCAAATCCACTGTTACCCTTGCCTGGGAAAAACCACTCTACGATGGTGG
+TAGCCGACTCACTGGATATGTTCTCGAGGCCTGCAAAGCTGGCACAGAGAGATGGATGAAGGTTGTCACC
+TTAAAACCCACAGTCCTAGAGCACACTGTTACTTCCTTAAATGAAGGTGAACAATACTTATTTAGAATAA
+GGGCACAAAATGAGAAAGGTGTGTCAGAACCAAGAGAGACTGTCACAGCCGTGACTGTACAAGACCTCAG
+AGTGTTGCCAACAATCGATCTTTCTACAATGCCTCAGAAGACCATCCATGTCCCAGCTGGCAGACCAGTA
+GAGCTGGTGATACCTATTGCTGGCCGTCCACCTCCTGCTGCTTCCTGGTTCTTTGCTGGTTCTAAACTGA
+GAGAATCAGAGCGTGTCACAGTTGAAACTCACACTAAAGTAGCTAAATTAACCATCCGTGAAACCACTAT
+CAGAGATACTGGAGAATACACACTTGAATTGAAGAATGTTACCGGAACTACTTCAGAAACCATTAAAGTT
+ATCATTCTTGACAAGCCTGGTCCACCAACAGGACCTATTAAGATTGATGAAATTGATGCTACATCAATTA
+CCATTTCCTGGGAACCACCTGAATTGGACGGTGGTGCTCCACTGAGTGGTTATGTGGTAGAACAACGTGA
+CGCTCATCGTCCAGGATGGCTGCCCGTTTCTGAATCAGTGACTAGGTCCACGTTTAAGTTTACCAGACTC
+ACCGAAGGAAATGAGTATGTGTTCCGTGTGGCTGCAACAAACCGCTTCGGGATTGGCTCTTACTTGCAGT
+CTGAGGTCATAGAGTGTCGCAGCAGCATCCGTATTCCTGGACCCCCAGAAACATTACAGATATTTGATGT
+TTCCCGTGATGGCATGACACTTACTTGGTACCCACCAGAGGATGACGGTGGCTCCCAAGTGACTGGATAT
+ATTGTGGAGCGCAAAGAAGTGAGAGCAGATCGATGGGTCCGTGTAAATAAAGTACCTGTGACAATGACAC
+GGTACCGCTCCACTGGCCTTACTGAAGGCTTAGAATATGAACACCGTGTCACAGCCATTAATGCAAGAGG
+GTCTGGGAAACCAAGTCGTCCTTCCAAACCCATCGTTGCCATGGATCCAATTGCTCCTCCAGGAAAGCCA
+CAAAACCCAAGAGTTACTGATACAACAAGGACATCAGTCTCCCTGGCCTGGAGTGTTCCAGAAGATGAAG
+GAGGATCTAAAGTCACAGGCTACTTGATTGAAATGCAAAAAGTAGATCAACATGAATGGACCAAGTGTAA
+CACCACTCCAACCAAGATTCGAGAGTATACTCTAACACACCTACCTCAGGGTGCAGAATACAGGTTCCGC
+GTCCTAGCTTGTAATGCTGGTGGACCTGGTGAGCCTGCTGAGGTACCAGGAACAGTCAAAGTCACTGAAA
+TGCTTGAATATCCTGATTATGAACTTGATGAAAGATACCAAGAAGGTATCTTTGTAAGGCAAGGTGGCGT
+CATCAGACTTACCATACCAATCAAAGGAAAACCATTCCCAATATGTAAATGGACCAAGGAAGGCCAGGAT
+ATTAGTAAGCGTGCCATGATTGCAACATCTGAAACACACACTGAGCTTGTGATCAAAGAAGCAGACAGGG
+GTGATTCTGGCACTTATGACCTGGTTCTGGAAAATAAATGTGGCAAGAAGGCTGTCTACATCAAGGTCAG
+GGTGATAGGAAGTCCCAACAGTCCAGAAGGGCCACTGGAATATGATGACATCCAAGTCCGCTCTGTGAGG
+GTCAGCTGGAGACCTCCTGCTGATGATGGTGGTGCTGACATCTTAGGCTACATCCTCGAGAGACGAGAAG
+TGCCTAAAGCCGCCTGGTATACCATTGATTCCAGAGTCCGAGGTACATCTCTGGTGGTAAAAGGCCTCAA
+AGAGAATGTAGAATACCATTTCCGTGTTTCAGCAGAAAACCAGTTTGGCATAAGCAAACCCTTGAAATCT
+GAGGAACCAGTCACACCAAAAACACCATTGAATCCTCCAGAACCTCCAAGCAATCCTCCAGAAGTACTCG
+ATGTAACCAAGAGTTCTGTTAGCTTGTCCTGGTCCCGGCCCAAAGATGATGGTGGTTCTAGAGTCACAGG
+CTACTACATCGAACGCAAAGAGACATCCACTGACAAGTGGGTCAGACACAACAAGACTCAGATCACCACC
+ACAATGTACACTGTCACAGGGCTTGTTCCCGATGCTGAGTATCAGTTCCGCATCATCGCACAGAATGATG
+TTGGCCTGAGTGAGACCAGCCCTGCTTCTGAACCAGTTGTTTGCAAAGATCCATTTGATAAACCAAGCCA
+ACCAGGAGAACTTGAGATTCTTTCAATATCCAAAGATAGTGTCACTCTACAGTGGGAGAAACCTGAATGT
+GATGGTGGTAAAGAAATTCTTGGATACTGGGTTGAATATAGACAGTCTGGAGACAGTGCCTGGAAGAAGA
+GCAATAAGGAACGTATTAAGGACAAGCAATTCACAATAGGAGGTTTGCTGGAAGCTACTGAGTATGAATT
+CAGGGTTTTTGCTGAGAATGAGACTGGGCTGAGCAGACCTCGCAGAACTGCTATGTCTATAAAGACTAAA
+CTCACATCTGGAGAGGCCCCAGGAATACGCAAAGAAATGAAGGATGTTACCACAAAATTGGGTGAAGCTG
+CTCAACTCTCATGCCAGATTGTTGGAAGGCCTCTTCCTGACATTAAATGGTACAGATTTGGTAAAGAGCT
+CATACAAAGCCGGAAATACAAAATGTCTTCAGATGGACGCACACACACTCTTACAGTAATGACAGAGGAA
+CAGGAAGATGAAGGTGTTTATACCTGCATAGCCACCAATGAGGTTGGAGAAGTAGAAACCAGTAGTAAGC
+TTCTCCTGCAAGCAACACCGCAGTTCCATCCTGGTTACCCACTGAAAGAGAAATATTATGGAGCTGTGGG
+TTCCACACTTCGGCTTCATGTTATGTACATTGGTCGTCCAGTACCTGCCATGACTTGGTTCCATGGTCAG
+AAACTTTTGCAAAACTCAGAAAACATTACTATTGAAAACACTGAGCACTATACTCATCTTGTCATGAAGA
+ATGTCCAACGTAAGACTCATGCTGGGAAATACAAAGTCCAGCTCAGCAATGTTTTTGGAACAGTTGATGC
+CATCCTTGATGTGGAAATACAAGATAAACCAGACAAACCTACAGGACCAATTGTGATCGAAGCTCTATTG
+AAGAACTCCGCAGTGATCAGCTGGAAACCACCCGCAGATGACGGAGGCTCCTGGATCACCAACTATGTGG
+TGGAAAAATGTGAGGCCAAGGAGGGGGCTGAATGGCAATTGGTGTCTTCAGCCATCTCAGTGACAACCTG
+TAGAATTGTGAACCTCACAGAAAATGCTGGCTATTACTTCCGGGTTTCAGCTCAGAACACTTTCGGCATC
+AGTGACCCTCTAGAAGTGTCCTCAGTTGTGATCATTAAGAGTCCATTTGAAAAGCCAGGTGCTCCTGGCA
+AACCAACTATTACTGCTGTCACAAAAGATTCTTGTGTTGTGGCCTGGAAGCCACCTGCCAGTGATGGAGG
+TGCAAAGATTAGAAATTACTACCTTGAGAAGCGTGAGAAGAAGCAGAATAAATGGATTTCTGTGACAACA
+GAAGAAATTCGAGAAACTGTCTTTTCAGTGAAAAACCTTATTGAAGGTCTTGAATACGAGTTTCGTGTGA
+AATGTGAAAATCTAGGTGGGGAAAGTGAATGGAGTGAAATATCAGAACCCATCACTCCCAAATCTGATGT
+CCCAATTCAGGCACCACACTTTAAAGAGGAACTGAGAAATCTAAATGTCAGATATCAGAGCAATGCTACC
+TTGGTCTGCAAAGTGACTGGTCATCCAAAACCTATCGTCAAATGGTACAGACAAGGCAAAGAAATCATTG
+CAGATGGATTAAAATATAGGATTCAAGAATTTAAGGGTGGCTACCACCAGCTCATCATTGCAAGTGTCAC
+AGATGATGATGCCACAGTTTACCAAGTCAGAGCTACCAACCAAGGGGGATCTGTGTCTGGCACTGCCTCC
+TTGGAAGTGGAAGTTCCAGCTAAGATACACTTACCTAAAACTCTTGAAGGCATGGGAGCAGTTCATGCTC
+TCCGAGGTGAAGTGGTCAGCATCAAGATTCCTTTCAGTGGCAAACCAGATCCTGTGATCACCTGGCAGAA
+AGGACAAGATCTCATTGACAATAATGGCCACTACCAAGTTATTGTCACAAGATCCTTCACATCACTTGTT
+TTCCCCAATGGGGTAGAGAGAAAAGATGCTGGTTTCTATGTGGTCTGTGCTAAAAACAGATTTGGAATTG
+ATCAGAAGACAGTTGAACTGGATGTGGCTGATGTTCCTGACCCACCCAGAGGAGTCAAAGTTAGTGATGT
+CTCACGAGATTCTGTCAACTTAACATGGACTGAGCCAGCCTCTGATGGTGGCAGCAAAATCACCAACTAC
+ATTGTTGAAAAATGTGCAACTACTGCAGAAAGATGGCTCCGTGTAGGACAGGCCCGAGAAACACGTTATA
+CCGTGATCAACTTATTTGGAAAAACAAGTTACCAGTTCCGGGTAATAGCTGAAAATAAATTTGGTCTGAG
+CAAGCCTTCAGAGCCTTCAGAACCAACCATAACCAAAGAAGATAAGACCAGAGCTATGAACTATGATGAA
+GAGGTAGATGAAACCAGGGAAGTCTCCATGACTAAAGCATCTCACTCTTCAACCAAGGAACTCTATGAGA
+AATATATGATTGCTGAAGATCTTGGGCGTGGTGAGTTTGGAATTGTCCATCGTTGTGTTGAAACATCCTC
+AAAGAAGACATACATGGCCAAATTTGTTAAAGTCAAAGGGACTGATCAGGTTTTGGTAAAGAAGGAAATT
+TCCATTCTGAATATTGCTAGGCATAGAAACATCTTACACCTCCATGAATCATTTGAAAGCATGGAAGAAT
+TAGTTATGATCTTTGAGTTTATATCAGGACTTGACATATTTGAGCGCATTAACACAAGTGCTTTTGAACT
+TAATGAAAGAGAAATTGTAAGTTATGTTCACCAGGTCTGTGAAGCACTTCAGTTTTTACACAGTCATAAT
+ATTGGACACTTTGACATTAGACCAGAAAATATCATTTACCAAACCAGAAGAAGCTCTACCATTAAAATCA
+TAGAATTTGGTCAAGCCCGTCAGCTGAAACCAGGGGACAACTTCAGGCTTCTATTCACTGCCCCAGAATA
+CTATGCACCTGAAGTCCACCAGCATGATGTTGTCAGCACAGCCACAGACATGTGGTCACTTGGAACACTG
+GTATATGTGCTATTGAGTGGTATCAACCCATTCCTGGCTGAAACTAACCAACAGATCATTGAGAATATCA
+TGAATGCTGAATATACTTTCGATGAGGAAGCATTCAAAGAGATTAGCATTGAAGCCATGGATTTTGTTGA
+CCGGTTGTTAGTGAAAGAGAGGAAATCTCGCATGACAGCATCGGAGGCTCTCCAGCACCCATGGTTGAAG
+CAGAAGATAGAAAGAGTCAGTACTAAAGTTATCAGAACATTAAAACACCGGCGTTATTACCACACCCTGA
+TCAAGAAAGACCTCAACATGGTTGTGTCAGCAGCCCGGATCTCCTGTGGTGGTGCAATTCGATCTCAGAA
+GGGAGTGAGTGTTGCTAAAGTTAAAGTGGCATCCATTGAAATTGGCCCAGTTTCTGGGCAGATAATGCAT
+GCAGTTGGTGAAGAAGGAGGACATGTCAAATATGTATGCAAAATTGAAAATTATGATCAGTCTACCCAAG
+TGACTTGGTACTTTGGCGTCCGACAGCTGGAGAACAGTGAGAAATACGAAATCACCTACGAAGATGGAGT
+GGCCATCCTCTATGTCAAAGACATTACCAAATTAGATGATGGTACCTACAGATGCAAAGTAGTCAATGAC
+TATGGTGAAGACAGTTCTTATGCAGAGCTATTTGTTAAAGGTGTGAGAGAAGTCTATGACTATTACTGCC
+GTAGAACCATGAAGAAAATTAAGCGCAGAACAGACACAATGAGACTCCTGGAAAGGCCACCAGAATTTAC
+CCTGCCTCTCTATAATAAGACAGCTTATGTAGGTGAAAATGTCCGGTTTGGAGTAACTATAACTGTCCAC
+CCAGAGCCTCATGTAACATGGTATAAATCAGGTCAGAAAATCAAACCAGGTGACAATGACAAGAAGTACA
+CATTTGAGTCAGACAAGGGTCTTTACCAATTAACAATCAACAGTGTCACTACAGATGATGACGCTGAATA
+TACTGTTGTGGCAAGGAACAAATATGGTGAAGACAGCTGTAAAGCAAAGCTGACAGTAACCCTACACCCA
+CCTCCAACAGATAGTACCTTAAGACCCATGTTCAAAAGGTTACTGGCAAATGCAGAATGCCAAGAAGGCC
+AAAGTGTCTGCTTTGAGATCAGAGTGTCTGGCATCCCCCCACCAACATTAAAATGGGAGAAAGATGGTCA
+GCCACTGTCCCTCGGGCCTAACATTGAAATTATCCATGAAGGCTTGGATTATTATGCTCTGCACATCAGG
+GACACTTTGCCTGAAGACACGGGTTATTATAGAGTCACAGCCACTAACACAGCTGGGTCCACCAGCTGCC
+AGGCTCACCTACAAGTGGAACGCCTGAGGTACAAGAAACAGGAATTCAAGAGTAAGGAGGAGCATGAGCG
+ACACGTACAAAAACAAATTGACAAAACCCTCAGAATGGCTGAAATTCTTTCTGGAACTGAAAGTGTACCA
+CTGACACAGGTAGCTAAAGAGGCTCTGAGAGAAGCTGCTGTCCTTTATAAACCGGCTGTAAGCACCAAGA
+CTGTAAAAGGGGAATTCAGACTTGAGATAGAAGAAAAGAAGGAGGAGAGAAAACTCCGGATGCCTTATGA
+TGTACCAGAGCCACGCAAGTATAAGCAGACTACCATAGAAGAAGACCAACGCATCAAGCAGTTCGTGCCC
+ATGTCTGACATGAAGTGGTATAAAAAGATACGTGATCAGTATGAAATGCCTGGGAAACTTGACAGAGTTG
+TACAGAAACGACCCAAGCGCATCCGCCTTTCAAGATGGGAACAGTTCTATGTGATGCCTCTTCCACGCAT
+TACAGATCAATACAGACCTAAATGGCGTATTCCTAAACTGTCCCAAGATGATCTTGAGATAGTGAGACCA
+GCCCGCCGGCGTACACCTTCTCCTGATTATGACTTTTACTACCGACCTAGAAGACGTTCTCTTGGGGACA
+TCTCTGATGAAGAATTACTCCTCCCCATTGATGACTACTTAGCAATGAAAAGAACAGAGGAAGAGAGGCT
+GCGTCTTGAAGAAGAGCTTGAGTTAGGTTTTTCAGCTTCACCCCCAAGTCGAAGCCCTCCACACTTTGAG
+CTTTCTAGCCTACGTTACTCTTCACCACAAGCTCATGTCAAGGTGGAGGAAACAAGAAAAGACTTCAGGT
+ATTCAACCTATCACATCCCAACGAAGGCTGAAGCTAGTACAAGTTATGCAGAACTGAGGGAACGGCATGC
+CCAGGCTGCGTACAGACAGCCAAAGCAACGGCAAAGAATCATGGCTGAGAGGGAGGATGAAGAGTTGCTT
+CGCCCAGTTACGACCACCCAGCATCTCTCAGAATACAAAAGCGAACTTGACTTCATGTCAAAGGAGGAAA
+AGTCTAGAAAGAAATCAAGGCGACAAAGAGAAGTGACAGAAATAACAGAAATTGAGGAAGAATACGAAAT
+CTCAAAACATGCTCAAAGAGAATCATCCTCATCTGCGTCTAGACTACTGAGACGACGGCGCTCCCTGTCT
+CCAACTTATATTGAGTTAATGAGGCCAGTGTCTGAGCTGATCCGGTCACGTCCACAACCGGCTGAGGAAT
+ACGAAGATGACACAGAAAGAAGGTCACCTACTCCAGAGAGAACTCGCCCACGATCCCCCAGCCCTGTGTC
+TAGTGAGAGATCACTCTCGAGATTTGAGAGGTCTGCAAGATTTGATATCTTTTCCAGGTATGAGTCCATG
+AAAGCTGCTTTAAAAACTCAGAAGACATCAGAAAGGAAGTATGAAGTTTTGAGTCAGCAGCCTTTCACAC
+TGGACCATGCCCCTCGAATCACACTGAGAATGCGCTCGCACAGGGTACCATGTGGCCAAAATACACGTTT
+TATTTTAAATGTTCAGTCTAAGCCAACTGCCGAGGTTAAATGGTACCACAATGGTGTGGAACTCCAAGAA
+AGCAGTAAGATTCATTACACCAACACGAGTGGAGTCCTCACCCTGGAAATTCTGGACTGTCATACTGATG
+ACAGTGGAACCTACCGTGCTGTGTGCACCAACTACAAGGGCGAAGCTTCTGACTATGCAACGTTGGACGT
+GACAGGAGGGGATTATACCACCTATGCTTCCCAACGCAGAGATGAAGAGGTCCCCAGATCTGTTTTCCCT
+GAGCTGACAAGAACAGAGGCGTATGCTGTTTCATCATTTAAGAAAACATCTGAGATGGAAGCTTCGTCTT
+CTGTCAGGGAAGTGAAATCACAGATGACGGAGACAAGGGAAAGTCTCTCCTCATATGAACACTCTGCATC
+TGCAGAAATGAAAAGTGCTGCATTAGAAGAAAAGTCACTGGAAGAAAAATCCACAACCAGAAAGATCAAG
+ACGACTTTGGCAGCAAGAATTCTAACAAAGCCACGGTCCATGACCGTCTACGAGGGCGAGTCTGCAAGGT
+TTTCTTGTGACACCGATGGTGAGCCGGTACCAACTGTGACCTGGCTGCGTAAAGGACAAGTGCTAAGTAC
+TTCTGCCCGCCACCAAGTGACCACCACAAAGTACAAATCAACCTTTGAGATCTCTTCAGTCCAGGCTTCC
+GATGAGGGCAATTACAGCGTGGTGGTAGAAAACAGTGAAGGGAAACAAGAAGCAGAGTTCACTCTGACTA
+TTCAAAAGGCCAGGGTAACTGAAAAGGCTGTGACATCACCACCAAGAGTCAAATCCCCAGAGCCTCGGGT
+GAAATCCCCAGAAGCAGTTAAGTCTCCAAAACGAGTGAAATCTCCAGAACCTTCTCACCCGAAAGCCGTA
+TCACCCACAGAGACAAAACCAACACCAACAGAGAAAGTTCAGCACCTCCCAGTCTCTGCCCCACCAAAGA
+TTACTCAGTTCCTGAAAGCAGAAGCTTCTAAAGAGATTGCAAAACTGACCTGTGTGGTTGAAAGCAGTGT
+ATTAAGGGCAAAAGAGGTCACCTGGTATAAAGATGGCAAGAAACTGAAGGAAAATGGGCATTTCCAGTTT
+CATTATTCAGCAGATGGTACCTATGAGCTCAAAATCAATAACCTCACTGAATCTGATCAAGGAGAATATG
+TTTGTGAGATTTCTGGTGAAGGTGGAACGTCTAAAACCAACTTACAATTTATGGGGCAAGCCTTTAAGAG
+TATCCATGAGAAGGTATCAAAAATATCAGAAACTAAGAAATCAGATCAGAAAACCACTGAGTCAACAGTA
+ACCAGAAAAACTGAACCAAAAGCTCCTGAACCAATTTCCTCAAAACCAGTAATTGTTACTGGGTTGCAGG
+ATACAACTGTTTCTTCAGACAGTGTTGCTAAATTTGCAGTTAAGGCTACTGGAGAACCCCGGCCAACTGC
+CATCTGGACAAAAGATGGAAAGGCCATTACACAAGGAGGTAAATATAAACTCTCTGAAGACAAGGGAGGG
+TTCTTCTTAGAAATTCATAAGACTGATACTTCTGACAGTGGACTTTATACTTGTACAGTAAAAAATTCAG
+CTGGATCTGTGTCCTCTAGCTGCAAATTAACAATAAAAGCTATAAAAGATACTGAGGCACAGAAAGTCTC
+TACACAAAAGACTTCTGAAATTACACCTCAGAAGAAAGCTGTTGTCCAAGAGGAAATTTCCCAAAAAGCC
+CTAAGGTCTGAAGAAATTAAGATGTCAGAGGCAAAATCTCAAGAAAAGTTAGCCCTCAAAGAGGAAGCTT
+CAAAGGTTCTGATTTCTGAAGAAGTCAAGAAATCAGCAGCAACCTCCCTGGAAAAATCCATTGTCCATGA
+GGAAATCACTAAAACATCACAGGCATCAGAAGAAGTCAGAACTCATGCTGAGATTAAAGCATTTTCTACT
+CAGATGAGCATAAACGAAGGTCAAAGACTGGTTTTAAAAGCCAACATTGCTGGTGCCACTGATGTGAAAT
+GGGTACTGAATGGCGTAGAGCTTACCAACTCTGAGGAGTACCGATATGGTGTCTCAGGCAGCGATCAGAC
+CCTAACCATCAAGCAAGCCAGTCACAGAGATGAAGGAATCCTCACCTGCATAAGCAAAACCAAGGAAGGA
+ATCGTCAAGTGTCAGTATGATTTGACACTGAGCAAAGAACTCTCAGATGCTCCAGCCTTCATCTCACAGC
+CTAGATCTCAAAATATTAATGAAGGACAAAATGTTCTCTTTACTTGTGAAATCAGTGGCGAGCCATCCCC
+TGAAATCGAATGGTTTAAAAACAACCTGCCAATTTCTATTTCTTCAAATGTCAGCATAAGCCGCTCCAGA
+AATGTATACTCCCTTGAAATCCGAAATGCATCAGTCAGCGACAGTGGAAAGTACACAATTAAGGCCAAAA
+ATTTCCGTGGCCAGTGTTCAGCTACAGCTTCCTTAATGGTCCTTCCTCTAGTTGAAGAACCTTCCAGAGA
+GGTAGTATTGAGAACAAGTGGTGACACAAGCTTGCAAGGAAGCTTCTCGTCTCAGTCAGTCCAAATGTCT
+GCCTCCAAGCAGGAGGCCTCCTTCAGCAGTTTCAGCAGCAGCAGTGCTAGCAGCATGACTGAGATGAAAT
+TTGCAAGCATGTCTGCCCAAAGCATGTCCTCCATGCAAGAGTCCTTTGTAGAAATGAGTTCCAGCAGCTT
+TATGGGAATATCTAATATGACACAACTGGAAAGCTCAACTAGTAAAATGCTTAAAGCAGGCATAAGAGGA
+ATTCCGCCTAAAATTGAAGCTCTTCCATCTGATATCAGCATTGATGAAGGCAAAGTTCTAACAGTAGCCT
+GTGCTTTCACGGGTGAGCCTACCCCAGAAGTAACATGGTCCTGTGGTGGAAGAAAAATCCACAGTCAAGA
+ACAGGGGAGGTTCCACATTGAAAACACAGATGACCTGACAACCCTGATCATCATGGACGTACAGAAACAA
+GATGGTGGACTTTATACCCTGAGTTTAGGGAATGAATTTGGATCTGACTCTGCCACTGTGAATATACATA
+TTCGATCCATTTAAGAGGGCCTGTGCCCTTATACTCTACACTCATTCTTAACTTTTCGCAAACGTTTCAC
+ACGGACTAATCTTTCTGAACTGTAAATATTTAAAGAAAAAAAAGTAGTTTTGTATCAACCTAAATGAGTC
+AAAGTTCAAAAATATTCATTTCAATCTTTTCATAATTGTTGACCTAAGAATATAATACATTTGCTAGTGA
+CATGTACATACTGTATATAGCCGGATTAACGGTTATAAAGTTTTGTACCATTTATTTTATGACATTTTAC
+AATGTAAGTTTTGAAACTAACTGTTGGTAGGAGAAAGTTTCTTATGGAACGAATACCCTGCTCAACATTT
+AATCAATCTTTGTGCCTCAACATACTGTTGATGTCTAAGTATGCCTCAGTGGGTTGAGAAAATCCCCATT
+GAAGATGTCCTGTCCACCTAAAAGAGAATGATGCTGTGCATATCACTTGATATGTGCACCAATACCTACT
+GAATCAGAAATGTAAGGCATTGGTGATGTTTGCATTTACCCTCCTGTAAGCAACACTTTAACGTCTTACA
+TTTTCTCTGATGATGTCACACAAAATTATCATGACAAATATTACCAGAGCAAAGTGTAACGGCCAACACT
+TTGTTCGCTCATTTTACGCTGTCTCTGACATAAGGAGTGCCTGAATAGCTTGGAAAAGTAACATCTCCTG
+GCCATCCCTTCATTTAACCAAGCTATTCAAGTATTCCTATGCCAGAGCAGTGCCAACTCTTGGAGGTCCC
+AGAGTGCAGCCAATGCCTTTGTGTGGTAGTTCTAAATTTTAATTGCACCTGAAAAACCTGGGCACCTAAG
+CAATGAGCCACAGCAAAAAGTAAAGAACAACAACAAAATAAAGCTGTTGTTAAATTTTAAACAATATTAC
+TAATTGCCCAAAATGTCAATTTGATGTAGTTCTTTTCATGCAAGTATAAATTCAATTGTTAGTTATAATT
+GTTGGACCTCCTTGAGATAGTAACAACAAAATAAAGCAAGCTATCTGCACCTCAAAA
+
diff --git a/seq/xurt8c.aa b/seq/xurt8c.aa
new file mode 100644
index 0000000..1cd68f0
--- /dev/null
+++ b/seq/xurt8c.aa
@@ -0,0 +1,5 @@
+>XURT8C | 40001 | glutathione transferase (EC 2.5.1.18) 8, cytosolic - rat
+MEVKPKLYYFQGRGRMEVIRWLLATAGVEFEEEFLETREQYEKLQKDDCLLFGQVPLVEIDGMLLTQTRA
+ILSYLAAKYNLYGKDLKERVRIDMYADGTQDLMMMIIGAPFKAPQEKEESLALAVKRAKNRYFPVFEKIL
+KDHGEAFLVGNQLSWADIQLLEAILMVEEVSAPVLSDFPLLQAFKTRISNIPTIKKFLQPGSQRKPPPDG
+HYVDVVRTVLKF
diff --git a/seq/xurt8c.lc b/seq/xurt8c.lc
new file mode 100644
index 0000000..5a161b3
--- /dev/null
+++ b/seq/xurt8c.lc
@@ -0,0 +1,5 @@
+>XURT8C | 40001 | glutathione transferase (EC 2.5.1.18) 8, cytosolic - rat
+MEVKPKLYYFQGRGRMEVIRWLLATAGVEFEEEFLETREQYEKLQKDDCLLFGQVPLVEIDGMLLTQTRA
+ilsylaakynlygkdlkervridmyadgtqdlmmmiigapfkapqekeeslalavkraknryfpvfekil
+KDHGEAFLVGNQLSWADIQLLEAILMVEEVSAPVLSDFPLLQAFKTRISNIPTIKKFLQPGSQRKPPPDG
+HYVDVVRTVLKF
diff --git a/seq/xurtg.aa b/seq/xurtg.aa
new file mode 100644
index 0000000..84c3414
--- /dev/null
+++ b/seq/xurtg.aa
@@ -0,0 +1,5 @@
+>XURTG glutathione transferase (EC 2.5.1.18) Ya - rat
+MSGKPVLHYFNARGRMECIRWLLAAAGVEFDEKFIQSPEDLEKLKKDGNLMFDQVPMVEIDGMKLAQTRA
+ILNYIATKYDLYGKDMKERALIDMYTEGILDLTEMIMQLVICPPDQKEAKTALAKDRTKNRYLPAFEKVL
+KSHGQDYLVGNRLTRVDIHLLELLLYVEEFDASLLTSFPLLKAFKSRISSLPNVKKFLQPGSQRKLPMDA
+KQIEEARKIFKF
diff --git a/sql/README b/sql/README
new file mode 100644
index 0000000..d0bede0
--- /dev/null
+++ b/sql/README
@@ -0,0 +1,26 @@
+
+22-Jan-2014
+
+fasta36/sql
+
+================
+Perl and SQL for creating and searching mySQL and pgSQL sequence
+databases.
+
+See A. J. Mackey and W. R. Pearson (2004) "Using SQL databases for
+sequence similarity searching and analysis" Current Protocols in
+Bioinformatics L. Stein, Ed. Wiley, New York, pp 9.4.1-9.4.25
+
+create_seq_demo.sql
+mysql_demo1.sql -- SQL file for fasta36 format 16 to search seqdb_demo database
+mysql_demo_pv.sql -- SQL file for searching 50000 Swissprot sequences using database format 16 (mysql)
+ fasta36 ../seq/mgstm1.aa "../sql/mysql_demo_pv.sql 16"
+nr_to_sql.pl -- perl script to create mysql seqdb_demo sequence database
+pirpsd.sql -- SQL file for fasta36 format 16 to search custom PIR database (obsolete)
+psql_demo1.sql -- pgSQL version of mysql_demo1.sql
+psql_demo_pv.sql -- pgSQL version of mysql_demo_pv.sql
+
+================
+alternate expansion strategy
+
+join_up50.pl -- more sophisticated way to generate expanded sequences using SQL
diff --git a/sql/create_seq_demo.sql b/sql/create_seq_demo.sql
new file mode 100644
index 0000000..35188a0
--- /dev/null
+++ b/sql/create_seq_demo.sql
@@ -0,0 +1,30 @@
+
+DROP DATABASE seq_demo;
+CREATE DATABASE seq_demo;
+
+USE seq_demo;
+
+CREATE TABLE prot (
+id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+seq TEXT NOT NULL,
+bin BLOB NOT NULL,
+len INT UNSIGNED NOT NULL
+);
+
+CREATE TABLE annot (
+prot_id INT UNSIGNED NOT NULL,
+gi INT UNSIGNED NOT NULL PRIMARY KEY,
+db ENUM("gb","emb","dbj","prf","ref","pdb","pir","sp") NOT NULL,
+descr TEXT NOT NULL,
+
+INDEX (prot_id),
+INDEX (db)
+);
+
+CREATE TABLE sp (
+ gi INT UNSIGNED NOT NULL,
+ acc VARCHAR(10),
+ name VARCHAR(10),
+
+ PRIMARY KEY (gi)
+);
diff --git a/sql/join_up50.pl b/sql/join_up50.pl
new file mode 100755
index 0000000..e0e5145
--- /dev/null
+++ b/sql/join_up50.pl
@@ -0,0 +1,99 @@
+#!/usr/bin/perl -w
+
+################################################################
+# copyright (c) 2004, 2014 by William R. Pearson and The Rector &
+# Visitors of the University of Virginia */
+################################################################
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under this License is distributed on an "AS
+# IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+################################################################
+
+# usage - join_up50.pl link_acc_XYZ12 > "link_acc_XYZ12.sql 16" file
+#
+# this program is designed to read a link_acc file from fasta36 -e "expand.sh" and
+# (1) extract the uniref accessions
+# (2) build a mySQL fasta_tmp table using those accessions
+# (3) write out a link_mysql.sql file that can be used to produce the
+# sequences using format 16
+#
+
+use strict;
+use DBI;
+
+my $in_file = $ARGV[0];
+open (FIN, $in_file) || die "cannot open $in_file\n";
+
+my $db = 'uniprot';
+my $db_tmp = 'fasta_tmp';
+
+my $host='host';
+my $user = 'user';
+my $password = 'password';
+
+my $dbh = DBI->connect("dbi:mysql:host=xdb2:$db",
+ $user, $password,
+ { RaiseError => 1, AutoCommit => 1}
+ ) or die $DBI::errstr;
+
+my $dbh_tmp = DBI->connect("dbi:mysql:host=xdb2:$db_tmp",
+ $user, $password,
+ { RaiseError => 1, AutoCommit => 1}
+ ) or die $DBI::errstr;
+
+my %up_sth = (
+ ur50_to_upacc => "SELECT uniprot_acc FROM uniref50link WHERE uniref50_acc=?",
+ upacc_to_seq => "SELECT * FROM trFull WHERE acc=?",
+ );
+
+
+for my $sth (keys(%up_sth)) {
+ $up_sth{$sth} = $dbh->prepare($up_sth{$sth});
+}
+
+my %q_acc_uniq = ();
+
+while (my $line = <FIN>) {
+ next if ($line =~ m/^UniRef50_UPI/);
+ chomp($line);
+ my ($descr, $score) = split(/\t/,$line);
+
+ my ($acc) = ($descr =~ m/UniRef\d+_(\w+)/i);
+ $q_acc_uniq{$acc} = 1;
+}
+close FIN;
+
+$dbh->disconnect();
+
+# now we have a hash of unique accessions
+# make a table and put them in
+
+$dbh_tmp->do(qq{DROP TABLE IF EXISTS $in_file;}) or die $DBI::errstr;
+$dbh_tmp->do(qq{ CREATE TABLE $in_file ( acc CHAR(10) PRIMARY KEY );})
+ or die $DBI::errstr;
+
+my @acc_list = sort keys(%q_acc_uniq);
+
+my $sql_insert = "INSERT INTO $in_file (acc) VALUES (\"" . join(q/"),("/,@acc_list) . "\");\n" ;
+$dbh_tmp->do($sql_insert);
+$dbh_tmp->disconnect();
+
+# now the $in_file table should be full; write out the SQL join to produce the sequences we need.
+
+print "$host $db $user $password;\n";
+print qq{SELECT up.acc, protein.seq
+ FROM $db_tmp.$in_file AS fa_acc JOIN uniref50link on(fa_acc.acc=uniref50_acc)
+ JOIN annot AS up ON(uniprot_acc = up.acc AND fa_acc.acc != up.acc) JOIN protein USING(prot_id);
+SELECT acc, concat('up|',acc,'|',name,' ',descr) FROM annot WHERE acc='#';
+SELECT acc,protein.seq FROM protein INNER JOIN annot USING(prot_id)
+ WHERE annot.acc='#';
+DROP TABLE $db_tmp.$in_file;
+};
diff --git a/sql/mysql_demo1.sql b/sql/mysql_demo1.sql
new file mode 100644
index 0000000..07fa46d
--- /dev/null
+++ b/sql/mysql_demo1.sql
@@ -0,0 +1,6 @@
+seqdb_host seqdb_demo seqdb_user password;
+SELECT acc, protein.seq, sp_name
+ FROM annot INNER JOIN protein USING(prot_id) WHERE annot.db='sp' LIMIT 50000;
+SELECT acc, concat('sp|',acc,'|',sp_name,' ',descr) FROM annot WHERE acc='#' AND db='sp';
+SELECT acc,protein.seq FROM protein INNER JOIN annot USING(prot_id)
+ WHERE annot.acc='#' AND db='sp';
diff --git a/sql/mysql_demo_pv.sql b/sql/mysql_demo_pv.sql
new file mode 100644
index 0000000..70dcff3
--- /dev/null
+++ b/sql/mysql_demo_pv.sql
@@ -0,0 +1,6 @@
+seqdb_host seqdb_demo seqdb_user password;
+SELECT acc, protein.seq, sp_name, concat('sp|',acc,'|',sp_name,' ',descr)
+ FROM annot INNER JOIN protein USING(prot_id) WHERE annot.db='sp' LIMIT 50000;
+SELECT acc, concat('sp|',acc,'|',sp_name,' ',descr) FROM annot WHERE acc='#' AND db='sp';
+SELECT acc,protein.seq FROM protein INNER JOIN annot USING(prot_id)
+ WHERE annot.acc='#' AND db='sp';
diff --git a/sql/nr_to_sql.pl b/sql/nr_to_sql.pl
new file mode 100755
index 0000000..d64d897
--- /dev/null
+++ b/sql/nr_to_sql.pl
@@ -0,0 +1,103 @@
+#!/usr/bin/perl -w
+
+################################################################
+# copyright (c) 2004, 2014 by William R. Pearson and The Rector &
+# Visitors of the University of Virginia */
+################################################################
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under this License is distributed on an "AS
+# IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+################################################################
+
+use DBI;
+
+$SIG{__WARN__} = sub { die @_ };
+
+my $mysql = DBI->connect("DBI:mysql:database=seq_demo;user=seq_demo;password=demo_pass");
+
+$mysql->do(q{LOCK TABLES prot WRITE,
+ annot WRITE,
+ sp WRITE });
+
+my $EL = 125;
+my $NA = 123;
+
+my @aatrans = ($EL,$NA,$NA,$NA,$NA,$NA,$NA,$NA,$NA,$NA,$EL,$NA,$NA,$EL,$NA,$NA,
+ $NA,$NA,$NA,$NA,$NA,$NA,$NA,$NA,$NA,$NA,$NA,$NA,$NA,$NA,$NA,$NA,
+ $NA,$NA,$NA,$NA,$NA,$NA,$NA,$NA,$NA,$NA, 24,$NA,$NA,$NA,$NA,$NA,
+ $NA,$NA,$NA,$NA,$NA,$NA,$NA,$NA,$NA,$NA,$NA,$NA,$NA,$NA,$NA,$NA,
+ $NA, 1, 21, 5, 4, 7, 14, 8, 9, 10,$NA, 12, 11, 13, 3,$NA,
+ 15, 6, 2, 16, 17,$NA, 20, 18, 23, 19, 22,$NA,$NA,$NA,$NA,$NA,
+ $NA, 1, 21, 5, 4, 7, 14, 8, 9, 10,$NA, 12, 11, 13, 3,$NA,
+ 15, 6, 2, 16, 17,$NA, 20, 18, 23, 19, 22,$NA,$NA,$NA,$NA,$NA
+ );
+
+my $ins_prot = $mysql->prepare(q{
+ INSERT INTO prot (seq,bin,len) VALUES (?, ?, ?)
+ });
+
+my $ins_annot = $mysql->prepare(q{
+ INSERT INTO annot (gi, prot_id, db, descr) VALUES (?, ?, ?, ?)
+ });
+
+my $ins_sp = $mysql->prepare(q{
+ INSERT INTO sp (gi, acc, name) VALUES (?, ?, ?)
+ });
+
+use vars qw( $seq $bin $tot_seq $tot_annot $tot_sp );
+use vars qw( $gi $prot_id $db $desc $sp_acc $sp_name );
+use vars qw( $header $seq @entries );
+use vars qw( $gi $db $db_acc $db_name $desc);
+
+$tot_seq = $tot_annot = $tot_sp = 0;
+
+for my $db_file ( @ARGV ) {
+ open(DATA, "<$db_file") or die $!;
+ local $/ = "\n>";
+ while (<DATA>) {
+ chomp; # remove trailing "\n>" record header
+ ($header, $seq) = $_ =~ m/^>? # record separator (first entry)
+ ( [^\n]* ) \n # header line
+ ( .* ) # the sequence
+ /osx; # optimize, multiline, commented
+
+ $seq =~ s/\W|\d//sg;
+ $bin = pack('C*', map { $aatrans[unpack('C', $_)] } split(//, $seq));
+ $ins_prot->execute($seq,$bin,length($seq));
+ $prot_id = $ins_prot->{mysql_insertid};
+
+ $tot_seq++;
+
+# print STDERR "Inserted $prot_id: ". length($seq)."\n";
+
+ @entries = split(/\001/, $header);
+
+ for ( @entries ) {
+ ($gi,$db,$db_acc,$db_name,$desc)=
+ $_ =~ /^gi\|(\d+)\|([a-z]+)\|(\S*)\|(\S*) (.*)$/o;
+# print "$prot_id: $gi\t$db\t$db_acc\t$desc\n";
+ $ins_annot->execute($gi,$prot_id,$db,$desc);
+
+ $tot_annot++;
+
+ if ($db eq "sp") {
+ $ins_sp->execute($gi,$db_acc,$db_name);
+ $tot_sp++;
+ }
+ }
+ }
+ close(DATA);
+}
+
+print "Inserted $tot_seq sequences; $tot_annot annotations; $tot_sp swissprot\n";
+
+
+
diff --git a/sql/pirpsd.sql b/sql/pirpsd.sql
new file mode 100644
index 0000000..8507733
--- /dev/null
+++ b/sql/pirpsd.sql
@@ -0,0 +1,8 @@
+xdb.wrplab PIRPSD seq_demo demo_pass;
+SELECT PIRID, SEQUENCES, PIRID
+ FROM c_psdsequence;
+SELECT PIRID, concat(PIRID," ",TITLE) FROM c_psdmain
+ WHERE PIRID='#';
+SELECT PIRID, SEQUENCES, PIRID
+ FROM c_psdsequence
+ WHERE PIRID='#';
diff --git a/sql/psql_demo.sql b/sql/psql_demo.sql
new file mode 100644
index 0000000..8f2ec58
--- /dev/null
+++ b/sql/psql_demo.sql
@@ -0,0 +1,7 @@
+@ seqdb_demo seqdb_demo @;
+SELECT acc, protein.seq, sp_name
+ FROM annot INNER JOIN protein USING(prot_id) WHERE annot.db='sp';
+SELECT acc, 'sp|'||acc||'|'||sp_name||' '||descr FROM annot WHERE acc='#' AND db='sp';
+SELECT acc,protein.seq FROM protein INNER JOIN annot USING(prot_id)
+ WHERE annot.acc='#' AND db='sp';
+
diff --git a/sql/psql_demo1.sql b/sql/psql_demo1.sql
new file mode 100644
index 0000000..a80e38a
--- /dev/null
+++ b/sql/psql_demo1.sql
@@ -0,0 +1,6 @@
+seqdb_host seqdb_demo seqdb_user password;
+SELECT acc, protein.seq, 'sp|'||acc||'|'||sp_name||' '||descr
+ FROM annot INNER JOIN protein USING(prot_id) WHERE annot.db='sp' LIMIT 50000;
+SELECT acc, 'sp|'||acc||'|'||sp_name||' '||descr FROM annot WHERE acc='#' AND db='sp';
+SELECT acc,protein.seq FROM protein INNER JOIN annot USING(prot_id)
+ WHERE annot.acc='#' AND db='sp';
diff --git a/sql/psql_demo_pv.sql b/sql/psql_demo_pv.sql
new file mode 100644
index 0000000..1b6a5ec
--- /dev/null
+++ b/sql/psql_demo_pv.sql
@@ -0,0 +1,7 @@
+seqdb_host seqdb_demo seqdb_user password;
+SELECT acc, protein.seq, 'sp|'||acc||'|'||sp_name||' '||descr
+ FROM annot INNER JOIN protein USING(prot_id) WHERE annot.db='sp' LIMIT 50000;
+SELECT acc, descr FROM annot WHERE acc='#' AND db='sp';
+SELECT acc,protein.seq FROM protein INNER JOIN annot USING(prot_id)
+ WHERE annot.acc='#' AND db='sp';
+
diff --git a/src/a_mark.h b/src/a_mark.h
new file mode 100644
index 0000000..9396f34
--- /dev/null
+++ b/src/a_mark.h
@@ -0,0 +1,51 @@
+/* a_mark.h - symbols used to indicate match/mismatch alignment code */
+
+/* $Id: a_mark.h 1024 2012-08-07 18:08:45Z wrp $ */
+
+/* copyright (c) 2003 by William R. Pearson and The Rector & Visitors
+ of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+/* character types in aln_code strings */
+#define M_BLANK 0
+#define M_NEG 1
+#define M_ZERO 2
+#define M_POS 3
+#define M_IDENT 4
+#define M_DEL 5
+
+#define MX_A0 0 /* MX_* values match the row numbers noted in aln_map_sym[] below */
+#define MX_A1 1
+#define MX_A2 2
+#ifdef M10_CONS_L
+#define MX_A10 3 /* row "10a" */
+#else
+#define MX_A10 4 /* row "10b" */
+#endif
+#define MX_ACC 5 /* row "calc_code" */
+#define MX_ABLAST 6 /* row "blast" */
+
+static char * /* one symbol string per MX_* value; presumably each string is indexed by the M_* codes above -- TODO confirm */
+aln_map_sym[] = {" ..: ", /* 0 */
+ " Xxx ", /* 1 */
+ " . ", /* 2 */
+ " mzp=-", /* 3: 10a */
+ " ..:-", /* 4: 10b */
+ " <z>=-", /* 5: calc_code */
+ " += " /* 6: MX_ABLAST blast */
+ };
+
+
+
diff --git a/src/aamap.h b/src/aamap.h
new file mode 100644
index 0000000..019d70f
--- /dev/null
+++ b/src/aamap.h
@@ -0,0 +1,17 @@
+
+/* aamap.gbl character and number translations */
+
+/* $Id: aamap.h 625 2011-03-23 17:21:38Z wrp $ */
+/* $Revision: 625 $ */
+
+char aacmap[64]={ /* amino-acid letter per codon; entries match the standard genetic code with bases ordered A,C,G,T (index presumably 16*b1+4*b2+b3 -- TODO confirm); stop codons are 'X' */
+ 'K','N','K','N','T','T','T','T','R','S','R','S','I','I','M','I', /* AAx ACx AGx ATx */
+ 'Q','H','Q','H','P','P','P','P','R','R','R','R','L','L','L','L', /* CAx CCx CGx CTx */
+ 'E','D','E','D','A','A','A','A','G','G','G','G','V','V','V','V', /* GAx GCx GGx GTx */
+ 'X','Y','X','Y','S','S','S','S','X','C','W','C','L','F','L','F' /* TAx TCx TGx TTx */
+ };
+
+int aamap[64]; /* integer aa values; not initialized here, filled in elsewhere */
+int aamapr[64]; /* reverse sequence map; not initialized here, filled in elsewhere */
+
+
diff --git a/src/ag_stats.c b/src/ag_stats.c
new file mode 100644
index 0000000..ff7d3cc
--- /dev/null
+++ b/src/ag_stats.c
@@ -0,0 +1,129 @@
+/* $Id: ag_stats.c $ */
+
+/* this procedure implements Altschul's pre-calculated values for lambda, K */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include "alt_parms.h"
+
+static double K, Lambda, H;
+extern int
+look_p(struct alt_p parm[], int gap, int ext,
+ double *K, double *Lambda, double *H);
+
+/* ag_parm - look up the pre-calculated parameters for a named scoring matrix
+   and gap penalties, storing them in this file's K, Lambda, H;
+   returns look_p()'s result, or 0 when the matrix name is not recognized */
+int
+ag_parm(char *pam_type, int gdelval, int ggapval)
+{
+  /* scoring-matrix names (with aliases) and their parameter tables */
+  static const struct { const char *name; struct alt_p *parm; } known[] = {
+    {"BL50", bl50_p}, {"BLOSUM50", bl50_p},
+    {"BL62", bl62_p}, {"BLOSUM62", bl62_p},
+    {"P250", p250_p}, {"P120", p120_p},
+    {"MD10", md10_p}, {"MD20", md20_p}, {"MD40", md40_p},
+    {"DNA", nt54_p}, {"+5/-4", nt54_p}
+  };
+  int idx, open_pen, ext_pen;
+
+  /* unless OLD_FASTA_GAP is defined, the gap value looked up is gdelval+ggapval */
+#ifdef OLD_FASTA_GAP
+  open_pen = gdelval;
+#else
+  open_pen = gdelval+ggapval;
+#endif
+  ext_pen = ggapval;
+
+  /* the first matching name wins */
+  for (idx = 0; idx < (int)(sizeof(known)/sizeof(known[0])); idx++) {
+    if (strcmp(pam_type, known[idx].name) == 0)
+      return look_p(known[idx].parm, open_pen, ext_pen, &K, &Lambda, &H);
+  }
+
+  /* unknown matrix: no pre-calculated parameters */
+  return 0;
+}
+
+/* look_p - find the row for (gap, ext) in a parameter table; both values are
+   negated before being compared with the table entries.  Returns 1 and
+   fills *K, *Lambda, *H when a row is chosen, 0 when none matches. */
+int
+look_p(struct alt_p parm[], int gap, int ext,
+       double *K, double *Lambda, double *H)
+{
+  struct alt_p *row = &parm[0];
+  int found;
+
+  gap = -gap;
+  ext = -ext;
+  /* larger than the first tabulated gap: take row 0 */
+  found = (gap > parm[1].gap);
+  if (!found) {
+    for (row = &parm[1]; row->gap > 0; row++) {
+      /* skip rows with a larger gap, or the same gap and a larger ext */
+      if (row->gap > gap || (row->gap == gap && row->ext > ext)) continue;
+      found = (row->gap == gap && row->ext == ext);
+      break;
+    }
+  }
+  if (!found) return 0;
+
+  *K = row->K;
+  *Lambda = row->Lambda;
+  *H = row->H;
+  return 1;
+}
+
+/* E1_to_s - solve e_val = K*mp*np*exp(-Lambda*score) for score, using K, Lambda, H
+   (file statics that ag_parm() fills via look_p()); n0, n1 are presumably sequence lengths */
+int E1_to_s(double e_val, int n0, int n1) {
+  double mp, np, a_n0, a_n0f, a_n1, a_n1f;   /* unused local 'u' removed */
+  int score;
+
+  a_n0 = (double)n0;
+  a_n0f = log(a_n0)/H;
+
+  a_n1 = (double)n1;
+  a_n1f = log(a_n1)/H;
+
+  mp = a_n0 - a_n0f - a_n1f;   /* each length is reduced by log(n0)/H + log(n1)/H */
+  np = a_n1 - a_n0f - a_n1f;
+
+  if (np < 1.0) np = 1.0;      /* corrected lengths are never allowed below 1 */
+  if (mp < 1.0) mp = 1.0;
+
+  /* e_val = K * np * mp * exp ( - Lambda * score);
+     log(e_val) = log(K np mp) - Lambda * score;
+     (log(K np mp)-log(e_val)) / Lambda = score;   */
+  score = (int)((log( K * mp * np) - log(e_val))/Lambda +0.5);
+  if (score < 0) score = 0;
+  return score;
+}
+
+/* s_to_E4 - evaluate K*np*mp*exp(-Lambda*score) with the file-level K, Lambda, H,
+   convert values above 0.01 with 1-exp(-x), and return the result times 10000 */
+double s_to_E4(int score, int n0, int n1)
+{
+  double p_val, mp, np, a_n0, a_n0f, a_n1, a_n1f;   /* unused local 'u' removed */
+
+  a_n0 = (double)n0;
+  a_n0f = log(a_n0)/H;
+
+  a_n1 = (double)n1;
+  a_n1f = log(a_n1)/H;
+
+  mp = a_n0 - a_n0f - a_n1f;   /* same length correction as in E1_to_s() */
+  np = a_n1 - a_n0f - a_n1f;
+
+  if (np < 1.0) np = 1.0;
+  if (mp < 1.0) mp = 1.0;
+
+  p_val = K * np * mp * exp ( - Lambda * score);
+  if (p_val > 0.01) p_val = 1.0 - exp(-p_val);
+
+  return p_val * 10000.0;
+}
+
diff --git a/src/aln_structs.h b/src/aln_structs.h
new file mode 100644
index 0000000..9c69711
--- /dev/null
+++ b/src/aln_structs.h
@@ -0,0 +1,61 @@
+/* $Id: aln_structs.h 1139 2013-04-16 01:00:09Z wrp $ */
+
+#ifndef A_STRUCT
+#define A_STRUCT
+
+struct a_struct { /* display/alignment coordinates and match counters for one alignment */
+ int smin0; /* coordinate of display start in seqc0 */
+ int smin1; /* coordinate of display start in seqc1 */
+ int amin0, amax0; /* alignment start (amin0) and, presumably, end (amax0) in seqc0 -- TODO confirm */
+ int amin1, amax1; /* alignment start (amin1) and, presumably, end (amax1) in seqc1 -- TODO confirm */
+ int calc_last_set; /* boolean that indicates structure was set by
+ calc_code, calc_cons, etc */
+
+ int llen;
+ int llcntx, llcntx_set, showall;
+
+ int qlrev, qlfact;
+ int llrev, llfact, llmult;
+ int frame;
+
+ int nident, nsim, npos, nmismatch, lc, ngap_q, ngap_l, nfs; /* counters: number of identities, ..., gaps in q (query) and l (library) */
+ long q_start_off, q_end_off; /* used in the -m 9 calculation of full segment offsets */
+ long l_start_off, l_end_off; /* presumably the library-sequence counterparts of q_start_off/q_end_off -- TODO confirm */
+ long q_offset, l_offset; /* offsets that include everything */
+ long d_start0,d_stop0;
+ long d_start1,d_stop1;
+};
+
+struct annot_var_str { /* one annotated residue variant -- field meanings below are inferred from the names, TODO confirm */
+ long v_pos; /* presumably the sequence position of the variant */
+ unsigned char o_res; /* presumably the original residue */
+ unsigned char v_res; /* presumably the variant residue */
+};
+
+struct a_res_str { /* one alignment result; results are chained through 'next' */
+ int sw_score; /* do_walign() score */
+ struct rstruct rst; /* struct rstruct is not declared in this header; its declaration must already be visible */
+ int score_delta; /* variant score change */
+ int min0, max0; /* boundaries of alignment in aa0 */
+ int min1, max1; /* boundaries of alignment in aa1 */
+ int v_start, v_len; /* virtual start, length */
+ int *res; /* encoded alignment */
+ int nres; /* length of decoded alignment */
+ int mres; /* length of encoding in res[] */
+ int n1; /* length of library sequence used for this (sub) alignment */
+ struct a_res_str *next; /* pointer to next alignment */
+
+ int index; /* position in a_res chain */
+ /* encoded alignment/annotation information */
+ char *aln_code;
+ int aln_code_n; /* presumably the length of aln_code -- TODO confirm */
+
+ char *annot_code; /* annotation info written by calc_code() */
+ int annot_code_n; /* presumably the length of annot_code -- TODO confirm */
+
+ char *annot_var_s; /* annotation info written by calc_cons_a() */
+ char *annot_var_id; /* annotation info written by calc_id() */
+ char *annot_var_idd; /* annotation info written by calc_idd() */
+ struct a_struct aln; /* coordinates and counters for this alignment (struct a_struct above) */
+};
+#endif
diff --git a/src/alt_parms.h b/src/alt_parms.h
new file mode 100644
index 0000000..055025a
--- /dev/null
+++ b/src/alt_parms.h
@@ -0,0 +1,399 @@
+/* tables of Altschul-Gish parameters */
+
+/* $Id: alt_parms.h 625 2011-03-23 17:21:38Z wrp $ */
+
+/* first entry must be for (inf,inf) penalty */
+
+struct alt_p { /* one table row; initializers are written in the order {gap, ext, Lambda, K, H} */
+ int gap; /* gap value, stored positive: look_p() negates its argument before comparing; a row with gap <= 0 after row 0 ends the table */
+ int ext; /* extension value, stored positive and compared the same way */
+ float Lambda; /* copied to look_p()'s *Lambda for the matching row */
+ float K; /* copied to look_p()'s *K */
+ float H; /* copied to look_p()'s *H */
+};
+
+/* BL80 1/2 bit; now ends with the same gap <= 0 sentinel row as the other tables */
+struct alt_p bl80_p[] = {
+ {0, 0, 0.343, 0.177, 0.66},
+ {14, 2, 0.336, 0.150, 0.62},
+ {12, 2, 0.328, 0.130, 0.54},
+ {12, 1, 0.314, 0.096, 0.41},
+ {11, 2, 0.320, 0.110, 0.51},
+ {11, 1, 0.296, 0.066, 0.36},
+ {10, 2, 0.311, 0.097, 0.46},
+ {10, 1, 0.282, 0.052, 0.29},
+ { 9, 2, 0.292, 0.069, 0.33},
+ { 9, 1, 0.248, 0.026, 0.18},
+ { 8, 2, 0.271, 0.050, 0.27},
+ { 8, 1, 0.189, 0.0071, 0.07},
+ {-1, -1, -1.0, -1.0, -1.0} }; /* sentinel: stops look_p()'s scan instead of reading past the array */
+
+/* BL62 1/2 bit */
+struct alt_p bl62_p[] = {
+ {0, 0, 0.318, 0.13, 0.40}, /* row 0: returned by look_p() when the requested gap exceeds the next row's gap */
+ {12, 3, 0.305, 0.10, 0.38},
+ {12, 2, 0.300, 0.09, 0.34},
+ {12, 1, 0.275, 0.05, 0.25},
+ {11, 3, 0.301, 0.09, 0.36},
+ {11, 2, 0.286, 0.07, 0.29},
+ {11, 1, 0.255, 0.035, 0.19},
+ {10, 4, 0.293, 0.08, 0.33},
+ {10, 3, 0.281, 0.06, 0.29},
+ {10, 2, 0.266, 0.04, 0.24},
+ {10, 1, 0.216, 0.014, 0.12},
+ {9, 5, 0.286, 0.08, 0.29},
+ {9, 4, 0.273, 0.06, 0.25},
+ {9, 4, 0.273, 0.06, 0.25}, /* NOTE(review): exact duplicate of the previous row, and there is no {9, 3, ...} row -- possibly a mistyped ext; confirm against the published table */
+ {9, 2, 0.244, 0.030, 0.18},
+ {9, 1, 0.176, 0.008, 0.06},
+ {8, 8, 0.270, 0.06, 0.25},
+ {8, 7, 0.270, 0.06, 0.25},
+ {8, 6, 0.262, 0.05, 0.23},
+ {8, 5, 0.262, 0.05, 0.23},
+ {8, 4, 0.262, 0.05, 0.23},
+ {8, 3, 0.243, 0.035, 0.18},
+ {8, 2, 0.215, 0.021, 0.12},
+ {7, 7, 0.247, 0.05, 0.18},
+ {7, 6, 0.247, 0.05, 0.18},
+ {7, 5, 0.230, 0.030, 0.15},
+ {7, 4, 0.230, 0.030, 0.15},
+ {7, 3, 0.208, 0.021, 0.11},
+ {7, 2, 0.164, 0.009, 0.06},
+ {6, 6, 0.200, 0.021, 0.10},
+ {6, 5, 0.200, 0.021, 0.10},
+ {6, 4, 0.179, 0.014, 0.08},
+ {6, 3, 0.153, 0.010, 0.05},
+ {5, 5, 0.131, 0.009, 0.04},
+ {-1, -1, -1.0, -1.0, -1.0}, /* sentinel: gap <= 0 ends look_p()'s scan */
+};
+
+/* BL50 1/3 bit */
+
+struct alt_p bl50_p[] = {
+ {0, 0, 0.232, 0.11, 0.34},
+ {16, 4, 0.222, 0.08, 0.31},
+ {16, 3, 0.213, 0.06, 0.27},
+ {16, 2, 0.207, 0.05, 0.24},
+ {16, 1, 0.180, 0.024, 0.15},
+ {15, 8, 0.222, 0.09, 0.31},
+ {15, 7, 0.219, 0.08, 0.29},
+ {15, 6, 0.219, 0.08, 0.29},
+ {15, 5, 0.216, 0.07, 0.28},
+ {15, 4, 0.216, 0.07, 0.28},
+ {15, 3, 0.210, 0.06, 0.25},
+ {15, 2, 0.202, 0.05, 0.22},
+ {15, 1, 0.166, 0.018, 0.11},
+ {14, 8, 0.218, 0.08, 0.29},
+ {14, 7, 0.214, 0.07, 0.27},
+ {14, 6, 0.214, 0.07, 0.27},
+ {14, 5, 0.214, 0.07, 0.27},
+ {14, 4, 0.205, 0.05, 0.24},
+ {14, 3, 0.201, 0.05, 0.22},
+ {14, 2, 0.188, 0.034, 0.17},
+ {14, 1, 0.140, 0.009, 0.07},
+ {13, 8, 0.211, 0.06, 0.27},
+ {13, 7, 0.205, 0.05, 0.24},
+ {13, 6, 0.205, 0.05, 0.24},
+ {13, 5, 0.205, 0.05, 0.24},
+ {13, 4, 0.202, 0.05, 0.22},
+ {13, 3, 0.188, 0.034, 0.18},
+ {13, 2, 0.174, 0.025, 0.13},
+ {13, 1, 0.114, 0.006, 0.04},
+ {12, 7, 0.205, 0.06, 0.24},
+ {12, 6, 0.197, 0.05, 0.21},
+ {12, 5, 0.197, 0.05, 0.21},
+ {12, 4, 0.192, 0.04, 0.18},
+ {12, 3, 0.178, 0.028, 0.15},
+ {12, 2, 0.158, 0.019, 0.10},
+ {11, 8, 0.197, 0.05, 0.21},
+ {11, 7, 0.190, 0.04, 0.19},
+ {11, 6, 0.190, 0.04, 0.19},
+ {11, 5, 0.184, 0.04, 0.17},
+ {11, 4, 0.177, 0.031, 0.15},
+ {11, 3, 0.167, 0.028, 0.11},
+ {11, 2, 0.130, 0.009, 0.06},
+ {10, 8, 0.183, 0.04, 0.17},
+ {10, 7, 0.178, 0.035, 0.16},
+ {10, 6, 0.178, 0.035, 0.16},
+ {10, 5, 0.168, 0.026, 0.13},
+ {10, 4, 0.156, 0.020, 0.10},
+ {10, 3, 0.139, 0.013, 0.07},
+ {10, 2, 0.099, 0.007, 0.03},
+ {9, 7, 0.164, 0.029, 0.13},
+ {9, 6, 0.152, 0.021, 0.10},
+ {9, 5, 0.152, 0.021, 0.10},
+ {9, 4, 0.134, 0.014, 0.07},
+ {9, 3, 0.107, 0.008, 0.04},
+ {8, 8, 0.139, 0.017, 0.08},
+ {8, 7, 0.134, 0.015, 0.07},
+ {8, 6, 0.127, 0.013, 0.06},
+ {8, 5, 0.117, 0.011, 0.05},
+ {8, 4, 0.101, 0.009, 0.03},
+ {7, 7, 0.100, 0.010, 0.04},
+ {7, 6, 0.094, 0.010, 0.03},
+ {-1, -1, -1.0, -1.0, -1.0},
+};
+
+struct alt_p p250_p[] = {
+ {0, 0, 0.229, 0.09, 0.23},
+ {16, 4, 0.217, 0.07, 0.21},
+ {16, 3, 0.208, 0.05, 0.18},
+ {16, 2, 0.200, 0.04, 0.16},
+ {16, 1, 0.172, 0.018, 0.09},
+ {15, 5, 0.215, 0.06, 0.20},
+ {15, 4, 0.208, 0.05, 0.18},
+ {15, 3, 0.203, 0.04, 0.16},
+ {15, 2, 0.193, 0.035, 0.14},
+ {15, 1, 0.154, 0.012, 0.07},
+ {14, 6, 0.212, 0.06, 0.19},
+ {14, 5, 0.204, 0.05, 0.17},
+ {14, 4, 0.204, 0.05, 0.17},
+ {14, 3, 0.194, 0.035, 0.14},
+ {14, 2, 0.180, 0.025, 0.11},
+ {14, 1, 0.131, 0.008, 0.04},
+ {13, 6, 0.206, 0.06, 0.17},
+ {13, 5, 0.196, 0.04, 0.14},
+ {13, 4, 0.196, 0.04, 0.14},
+ {13, 3, 0.184, 0.029, 0.12},
+ {13, 2, 0.163, 0.016, 0.08},
+ {13, 1, 0.110, 0.008, 0.03},
+ {12, 7, 0.199, 0.05, 0.15},
+ {12, 6, 0.191, 0.04, 0.13},
+ {12, 5, 0.191, 0.04, 0.13},
+ {12, 4, 0.181, 0.029, 0.12},
+ {12, 3, 0.170, 0.022, 0.10},
+ {12, 2, 0.145, 0.012, 0.06},
+ {11, 7, 0.186, 0.04, 0.13},
+ {11, 6, 0.180, 0.034, 0.11},
+ {11, 5, 0.180, 0.034, 0.11},
+ {11, 4, 0.165, 0.021, 0.09},
+ {11, 3, 0.153, 0.017, 0.07},
+ {11, 2, 0.122, 0.009, 0.04},
+ {10, 8, 0.175, 0.031, 0.11},
+ {10, 7, 0.171, 0.029, 0.10},
+ {10, 6, 0.165, 0.024, 0.09},
+ {10, 5, 0.158, 0.020, 0.08},
+ {10, 4, 0.148, 0.017, 0.07},
+ {10, 3, 0.129, 0.012, 0.05},
+ {9, 7, 0.151, 0.020, 0.07},
+ {9, 6, 0.146, 0.019, 0.06},
+ {9, 5, 0.137, 0.015, 0.05},
+ {9, 4, 0.121, 0.011, 0.04},
+ {9, 3, 0.102, 0.010, 0.03},
+ {8, 8, 0.123, 0.014, 0.05},
+ {8, 7, 0.123, 0.014, 0.05},
+ {8, 6, 0.115, 0.012, 0.04},
+ {8, 5, 0.107, 0.011, 0.03},
+ {7, 7, 0.090, 0.014, 0.02},
+ {-1, -1, -1.0, -1.0, -1.0},
+};
+
+struct alt_p p120_p[] = { /* selected by ag_parm() for the name "P120" */
+ {0, 0, 0.342, 0.19, 0.63},
+ {12, 4, 0.334, 0.14, 0.60},
+ {12, 3, 0.330, 0.13, 0.57},
+ {12, 2, 0.330, 0.13, 0.57},
+ {12, 1, 0.219, 0.11, 0.46}, /* NOTE(review): Lambda 0.219 is lower than the (11,1) and (10,1) rows below while K and H are higher -- possible typo; confirm against the published table */
+ {11, 3, 0.330, 0.13, 0.57},
+ {11, 2, 0.323, 0.12, 0.51},
+ {11, 1, 0.296, 0.06, 0.38},
+ {10, 5, 0.323, 0.12, 0.54},
+ {10, 4, 0.314, 0.09, 0.50},
+ {10, 3, 0.314, 0.09, 0.50},
+ {10, 2, 0.301, 0.07, 0.42},
+ {10, 1, 0.273, 0.04, 0.28},
+ {9, 5, 0.316, 0.11, 0.49},
+ {9, 4, 0.311, 0.10, 0.45},
+ {9, 3, 0.311, 0.10, 0.45},
+ {9, 2, 0.284, 0.05, 0.35},
+ {9, 1, 0.239, 0.023, 0.18},
+ {8, 6, 0.307, 0.10, 0.43},
+ {8, 5, 0.295, 0.08, 0.39},
+ {8, 4, 0.295, 0.08, 0.39},
+ {8, 3, 0.284, 0.06, 0.34},
+ {8, 2, 0.262, 0.04, 0.26},
+ {8, 1, 0.183, 0.009, 0.08},
+ {7, 7, 0.286, 0.08, 0.34},
+ {7, 6, 0.286, 0.08, 0.34},
+ {7, 5, 0.276, 0.06, 0.31},
+ {7, 4, 0.276, 0.06, 0.31},
+ {7, 3, 0.255, 0.04, 0.24},
+ {7, 2, 0.224, 0.023, 0.16},
+ {6, 6, 0.248, 0.04, 0.23},
+ {6, 5, 0.248, 0.04, 0.23},
+ {6, 4, 0.234, 0.033, 0.19},
+ {6, 3, 0.216, 0.025, 0.15},
+ {6, 2, 0.160, 0.009, 0.06},
+ {5, 5, 0.191, 0.019, 0.11},
+ {5, 4, 0.173, 0.013, 0.09},
+ {5, 3, 0.134, 0.006, 0.05},
+ {-1, -1, -1.0, -1.0, -1.0} /* sentinel: gap <= 0 ends look_p()'s scan */
+};
+
+struct alt_p bl55_p[] = {
+ {0, 0, 0.224, 0.12, 0.36},
+ {16, 4, 0.213, 0.08, 0.32},
+ {16, 3, 0.205, 0.07, 0.28},
+ {16, 2, 0.198, 0.06, 0.23},
+ {16, 1, 0.164, 0.020, 0.12},
+ {15, 8, 0.212, 0.09, 0.31},
+ {15, 7, 0.209, 0.08, 0.30},
+ {15, 6, 0.209, 0.08, 0.30},
+ {15, 5, 0.205, 0.07, 0.28},
+ {15, 4, 0.205, 0.07, 0.28},
+ {15, 3, 0.199, 0.06, 0.25},
+ {15, 2, 0.190, 0.05, 0.20},
+ {15, 1, 0.146, 0.013, 0.09},
+ {14, 7, 0.207, 0.08, 0.29},
+ {14, 6, 0.203, 0.07, 0.27},
+ {14, 5, 0.203, 0.07, 0.27},
+ {14, 4, 0.195, 0.05, 0.24},
+ {14, 3, 0.189, 0.04, 0.21},
+ {14, 2, 0.175, 0.030, 0.16},
+ {14, 1, 0.119, 0.006, 0.05},
+ {13, 8, 0.201, 0.07, 0.27},
+ {13, 7, 0.196, 0.06, 0.24},
+ {13, 6, 0.196, 0.06, 0.24},
+ {13, 5, 0.196, 0.06, 0.24},
+ {13, 4, 0.191, 0.05, 0.21},
+ {13, 3, 0.176, 0.032, 0.17},
+ {13, 2, 0.158, 0.020, 0.12},
+ {12, 8, 0.195, 0.06, 0.24},
+ {12, 7, 0.188, 0.05, 0.21},
+ {12, 6, 0.188, 0.05, 0.21},
+ {12, 5, 0.188, 0.05, 0.21},
+ {12, 4, 0.180, 0.04, 0.18},
+ {12, 3, 0.165, 0.026, 0.14},
+ {12, 2, 0.140, 0.014, 0.08},
+ {11, 8, 0.185, 0.05, 0.20},
+ {11, 7, 0.179, 0.04, 0.18},
+ {11, 6, 0.179, 0.04, 0.18},
+ {11, 5, 0.171, 0.033, 0.16},
+ {11, 4, 0.163, 0.027, 0.13},
+ {11, 3, 0.151, 0.022, 0.10},
+ {11, 2, 0.110, 0.008, 0.04},
+ {10, 10, 0.173, 0.04, 0.16},
+ {10, 9, 0.173, 0.04, 0.16},
+ {10, 8, 0.167, 0.035, 0.15},
+ {10, 7, 0.167, 0.035, 0.15},
+ {10, 6, 0.167, 0.035, 0.15},
+ {10, 5, 0.155, 0.025, 0.12},
+ {10, 4, 0.142, 0.017, 0.09},
+ {10, 3, 0.121, 0.011, 0.06},
+ {9, 9, 0.152, 0.026, 0.11},
+ {9, 8, 0.152, 0.026, 0.11},
+ {9, 7, 0.152, 0.026, 0.11},
+ {9, 6, 0.137, 0.018, 0.08},
+ {9, 5, 0.137, 0.018, 0.08},
+ {9, 4, 0.117, 0.011, 0.05},
+ {9, 3, 0.090, 0.007, 0.03},
+ {8, 8, 0.125, 0.014, 0.07},
+ {8, 7, 0.119, 0.013, 0.06},
+ {8, 6, 0.113, 0.012, 0.05},
+ {8, 5, 0.102, 0.010, 0.04},
+ {8, 4, 0.085, 0.009, 0.03},
+ {7, 7, 0.087, 0.010, 0.03},
+ {-1, -1, -1.0, -1.0, -1.0}
+};
+
+struct alt_p opt5_p[] = /* NOTE(review): unlike the other tables, rows are grouped by ext and are not in decreasing-gap order */
+{
+ { 0, 0, 0.1432, 0.111, 0.316},
+ { 22, 4, 0.128, 0.055, 0.16}, /* look_p() returns row 0 for any requested gap above this row's 22 */
+ { 21, 4, 0.124, 0.044, 0.14},
+ { 20, 4, 0.121, 0.039, 0.13},
+ { 19, 4, 0.117, 0.033, 0.11},
+ { 18, 4, 0.112, 0.026, 0.09},
+ { 25, 3, 0.128, 0.051, 0.16}, /* NOTE(review): look_p() stops at the first row with a smaller gap, so rows from here on look unreachable through it -- confirm how this table is used */
+ { 24, 3, 0.124, 0.041, 0.14},
+ { 23, 3, 0.122, 0.038, 0.13},
+ { 22, 3, 0.118, 0.032, 0.11},
+ { 21, 3, 0.113, 0.025, 0.09},
+ { 29, 2, 0.129, 0.051, 0.16},
+ { 28, 2, 0.127, 0.047, 0.14},
+ { 27, 2, 0.124, 0.041, 0.13},
+ { 26, 2, 0.121, 0.035, 0.11},
+ { 25, 2, 0.117, 0.029, 0.10},
+ { 24, 2, 0.111, 0.021, 0.08},
+ { -1,-2, -1.0, -1.0, -1.0} /* sentinel (gap <= 0); ext is -2 here where the other tables use -1 */
+};
+
+struct alt_p nt54_p[] =
+{
+ {0, 0, 0.192, 0.173, 0.36},
+ {16, 4, 0.192, 0.177, 0.36},
+ {-1, -1, -1.0, -1.0, -1.0}
+};
+
+struct alt_p rnt54_p[] =
+{
+ {0, 0, 0.192, 0.173, 0.36},
+ {16, 4, 0.192, 0.177, 0.36},
+ {-1, -1, -1.0, -1.0, -1.0}
+};
+
+struct alt_p nt32_p[] = {
+ {0, 0, 0.2712, 0.131, 0.22},
+ {18, 2, 0.2620, 0.100, 0.22},
+ {16, 4, 0.2600, 0.098, 0.22},
+ {16, 2, 0.2540, 0.081, 0.19},
+ {12, 4, 0.2340, 0.054, 0.15},
+ {-1, -1, -1.0, -1.0, -1.0}
+};
+
+struct alt_p nt13_p[] = {
+ {0, 0, 1.374, 0.711, 1.31},
+ {4, 1, 1.36, 0.67, 1.30},
+ {3, 1, 1.34, 0.58, 1.19},
+ {2, 1, 1.21, 0.34, 0.77},
+ {-1, -1, -1.0, -1.0, -1.0}
+};
+
+/* PAM-10 (1/10 Hartley ~ 1/3 bit scale) */
+
+struct alt_p md10_p[] = {
+ {0, 0, 0.2299, 0.309, 3.45},
+ {20, 4, 0.222, 0.21, 3.1},
+ {20, 2, 0.218, 0.18, 2.9},
+ {18, 4, 0.220, 0.20, 2.9},
+ {18, 2, 0.217, 0.18, 2.7},
+ {16, 4, 0.217, 0.19, 2.8},
+ {16, 2, 0.212, 0.17, 2.3},
+ {14, 4, 0.212, 0.17, 2.5},
+ {14, 2, 0.205, 0.15, 1.9},
+ {12, 4, 0.206, 0.16, 2.1},
+ {12, 2, 0.190, 0.11, 1.3},
+ {-1, -1, -1.0, -1.0, -1.0}
+};
+
+/* PAM-20 (1/10 Hartley ~ 1/3 bit scale) */
+struct alt_p md20_p[] = {
+ {0, 0, 0.230, 0.287, 2.94},
+ {20, 4, 0.221, 0.19, 2.6},
+ {20, 2, 0.219, 0.18, 2.5},
+ {18, 4, 0.220, 0.19, 2.5},
+ {18, 2, 0.218, 0.18, 2.3},
+ {16, 4, 0.218, 0.18, 2.4},
+ {16, 2, 0.213, 0.17, 2.0},
+ {14, 4, 0.213, 0.17, 2.1},
+ {14, 2, 0.204, 0.14, 1.6},
+ {12, 4, 0.207, 0.17, 1.8},
+ {12, 2, 0.187, 0.10, 1.1},
+ {-1, -1, -1.0, -1.0, -1.0}
+};
+
+/* PAM-40 (1/10 Hartley ~ 1/3 bit scale) */
+struct alt_p md40_p[] = {
+ {0, 0, 0.2293, 0.257, 2.22},
+ {20, 4, 0.225, 0.22, 2.1},
+ {20, 2, 0.222, 0.20, 1.9},
+ {18, 4, 0.224, 0.22, 2.0},
+ {18, 2, 0.220, 0.20, 1.8},
+ {16, 4, 0.219, 0.19, 1.8},
+ {16, 2, 0.212, 0.16, 1.5},
+ {14, 4, 0.211, 0.15, 1.6},
+ {14, 2, 0.199, 0.11, 1.2},
+ {12, 4, 0.203, 0.14, 1.3},
+ {12, 2, 0.177, 0.064, 0.7},
+ {-1, -1, -1.0, -1.0, -1.0}
+};
diff --git a/src/altlib.h b/src/altlib.h
new file mode 100644
index 0000000..b48e3d4
--- /dev/null
+++ b/src/altlib.h
@@ -0,0 +1,142 @@
+
+/* $Id: altlib.h 905 2012-01-30 17:33:06Z wrp $ */
+/* $Revision: 905 $ */
+
+/* #ifdef UNIX */
+/* ncbi blast 1.3 format */
+/*
+#define NCBIBL13 11
+extern int ncbl_getliba();
+extern void ncbl_ranlib();
+void ncbl_closelib();
+*/
+#define NCBIBL20 12
+/* #endif */
+
+#ifdef MYSQL_DB
+#define MYSQL_LIB 16
+#define LASTLIB (MYSQL_LIB+1)   /* parenthesized so the value is safe inside larger expressions */
+#endif
+
+#ifdef PGSQL_DB
+#define PGSQL_LIB 17
+#undef LASTLIB                  /* avoid a conflicting redefinition when MYSQL_DB is also set */
+#define LASTLIB (PGSQL_LIB+1)
+#endif
+
+#if !defined (LASTLIB) && defined(NCBIBL20)
+#define LASTLIB (NCBIBL20+1)
+#elif !defined (LASTLIB)
+#define LASTLIB 10
+#endif
+
+#define FASTA_F 0
+#define DEFAULT 0
+#define FULLGB 1
+#define UNIXPIR 2
+#define EMBLSWISS 3
+#define INTELLIG 4
+#define VMSPIR 5
+#define GCGBIN 6
+#define FASTQ 7
+#define LASTTXT 7
+#define ACC_LIST 10
+
+#include "mm_file.h"
+
+/* pearson fasta format */
+int agetlib(unsigned char *, int, char *, int, fseek_t *, int *,
+ struct lmf_str *, long *);
+void aranlib(char *, int, fseek_t, char *, struct lmf_str *);
+/* full uncompressed GB FULLGB*/
+extern int lgetlib(unsigned char *, int, char *, int, fseek_t *, int *,
+ struct lmf_str *, long *);
+extern void lranlib(char *, int, fseek_t, char *, struct lmf_str *);
+/* PIR UNIX protein UNIXPIR */
+extern int pgetlib(unsigned char *, int, char *, int, fseek_t *, int *,
+ struct lmf_str *, long *);
+extern void pranlib(char *, int, fseek_t, char *, struct lmf_str *);
+/* EMBL/SWISS-PROT EMBLSWISS */
+extern int egetlib(unsigned char *, int, char *, int, fseek_t *, int *,
+ struct lmf_str *, long *);
+extern void eranlib(char *, int, fseek_t, char *, struct lmf_str *);
+
+/* Intelligenetics INTELLIG */
+extern int igetlib(unsigned char *, int, char *, int, fseek_t *, int *,
+ struct lmf_str *, long *);
+extern void iranlib(char *, int, fseek_t, char *, struct lmf_str *);
+/* PIR VMS format */
+extern int vgetlib(unsigned char *, int, char *, int, fseek_t *, int *,
+ struct lmf_str *, long *);
+extern void vranlib(char *, int, fseek_t, char *, struct lmf_str *);
+/* GCG 2bit format */
+extern int gcg_getlib(unsigned char *, int, char *, int, fseek_t *, int *,
+ struct lmf_str *, long *);
+extern void gcg_ranlib(char *, int, fseek_t, char *, struct lmf_str *);
+
+/* FASTQ format (ignoring quality scores) */
+int qgetlib(unsigned char *, int, char *, int, fseek_t *, int *,
+ struct lmf_str *, long *);
+void qranlib(char *, int, fseek_t, char *, struct lmf_str *);
+
+#ifdef NCBIBL20
+/* ncbi blast 2.0 format */
+extern int ncbl2_getliba(unsigned char *, int, char *, int, fseek_t *, int *,
+ struct lmf_str *, long *);
+extern void ncbl2_ranlib(char *, int, fseek_t, char *, struct lmf_str *);
+void ncbl2_closelib();
+#endif
+
+#ifdef MYSQL_DB
+extern int mysql_getlib(unsigned char *, int, char *, int, fseek_t *, int *,
+ struct lmf_str *, long *);
+extern void mysql_ranlib(char *, int, fseek_t, char *, struct lmf_str *);
+int mysql_closelib();
+#endif
+
+int (*getliba[LASTLIB])(unsigned char *, int, char *, int, fseek_t *, int *, /* table of sequence-reading functions; presumably indexed by the library-format codes defined above -- TODO confirm */
+ struct lmf_str *, long *)={
+ agetlib,lgetlib,pgetlib,egetlib, /* slots 0-3: FASTA_F, FULLGB, UNIXPIR, EMBLSWISS */
+ igetlib,vgetlib,gcg_getlib,qgetlib, /* slots 4-7: INTELLIG, VMSPIR, GCGBIN, FASTQ */
+ agetlib,agetlib /* slots 8-9: no format code defined above; filled with agetlib */
+#ifdef UNIX
+ ,agetlib /* slot 10: ACC_LIST */
+#ifdef NCBIBL13 /* the NCBIBL13 #define and the ncbl_* declarations are commented out near the top of this header */
+ ,ncbl_getliba
+#else
+ ,ncbl2_getliba /* slot 11 */
+#endif
+#ifdef NCBIBL20
+ ,ncbl2_getliba /* slot 12: NCBIBL20 */
+#endif
+#ifdef MYSQL_DB
+ ,agetlib /* slots 13-15: fillers so that mysql_getlib lands in slot 16 */
+ ,agetlib
+ ,agetlib
+ ,mysql_getlib /* slot 16: MYSQL_LIB */
+#endif
+#endif
+}; /* NOTE(review): no entry is listed here for PGSQL_LIB (17) -- confirm where it is set */
+
+void (*ranliba[LASTLIB])(char *, int, fseek_t, char *, struct lmf_str *)={ /* *ranlib functions, laid out slot-for-slot like getliba[] */
+ aranlib,lranlib,pranlib,eranlib, /* slots 0-3: FASTA_F, FULLGB, UNIXPIR, EMBLSWISS */
+ iranlib,vranlib,gcg_ranlib,qranlib, /* slots 4-7: INTELLIG, VMSPIR, GCGBIN, FASTQ */
+ aranlib,aranlib /* slots 8-9: no format code defined above; filled with aranlib */
+#ifdef UNIX
+ ,aranlib /* slot 10: ACC_LIST */
+#ifdef NCBIBL13 /* the NCBIBL13 #define and the ncbl_* declarations are commented out near the top of this header */
+ ,ncbl_ranlib
+#else
+ ,ncbl2_ranlib /* slot 11 */
+#endif
+#ifdef NCBIBL20
+ ,ncbl2_ranlib /* slot 12: NCBIBL20 */
+#endif
+#ifdef MYSQL_DB
+ ,aranlib /* slots 13-15: fillers so that mysql_ranlib lands in slot 16 */
+ ,aranlib
+ ,aranlib
+ ,mysql_ranlib /* slot 16: MYSQL_LIB */
+#endif
+#endif
+}; /* NOTE(review): no entry is listed here for PGSQL_LIB (17) -- confirm where it is set */
diff --git a/src/apam.c b/src/apam.c
new file mode 100644
index 0000000..a4f51d6
--- /dev/null
+++ b/src/apam.c
@@ -0,0 +1,502 @@
+/* apam.c 19-June-86 */
+
+/* $Id: apam.c 1281 2014-08-21 17:32:06Z wrp $ */
+
+/* copyright (c) 1987, 2014 by William R. Pearson and The Rector &
+ Visitors of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+/*
+ read in the alphabet and pam matrix data
+ designed for universal matcher
+
+ This version reads BLAST format (square) PAM files
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+
+#include "defs.h"
+#include "param.h"
+
+#define XTERNAL
+#include "uascii.h"
+#include "upam.h"
+#undef XTERNAL
+
+extern void alloc_pam (int d1, int d2, struct pstruct *ppst);
+extern void init_altpam(struct pstruct *ppst);
+
+/* pam_opts -- modify PAM matrix (pamoff, -MS) if -MS or +off is part
+ of PAM matrix name, e.g. MD20-MS or BL50+2
+*/
+
+void
+pam_opts(char *smstr, struct pstruct *ppst) {
+  char *minus = strchr(smstr,'-');
+  char *suffix = (minus != NULL) ? minus : strchr(smstr,'+');   /* '-' takes precedence over '+' */
+
+  /* defaults when the name carries no modifier */
+  ppst->pam_ms = 0;
+  ppst->pamoff = 0;
+  if (suffix == NULL) return;
+
+  if (minus == NULL) {
+    ppst->pamoff = -atoi(suffix+1);   /* "+N": the offset is stored negated */
+  }
+  else if (strncmp(suffix+1,"MS",2)==0 || strncmp(suffix+1,"ms",2)==0) {
+    ppst->pam_ms = 1;
+  }
+  else {
+    ppst->pamoff = atoi(suffix+1);    /* "-N": the offset is stored as given */
+  }
+  *suffix = '\0';   /* cut the modifier off the caller's string */
+}
+
+/* modified 13-Oct-2005 to accommodate asymmetrical matrices */
+/* modified 15-Jul-2010 to ensure constant NCBIstdaa encoding */
+/* ensure that all entries in NCBIstdaa have values */
+
+/* initpam - read a BLAST-format (square) scoring matrix file into ppst->pam2[0].
+   pam_opts() first strips any "-MS"/offset suffix from mfname, in place.
+   Returns 1 on success, 0 if the file cannot be opened; exits on a bad row. */
+int
+initpam (char *mfname, struct pstruct *ppst)
+{
+  char line[512] = "", *lp;    /* zeroed so an empty file leaves no garbage to parse */
+  int i, j, iaa, pval, p_i, p_j;
+  int l_nsq;
+  unsigned char l_sq[MAXSQ+1];
+  int max_val, min_val;
+  FILE *fmat;
+
+  pam_opts(mfname, ppst);
+
+  if ((fmat = fopen (mfname, "r")) == NULL)
+    {
+      printf ("***WARNING*** cannot open scoring matrix file %s\n", mfname);
+      fprintf (stderr,"***WARNING*** cannot open scoring matrix file %s\n", mfname);
+      return 0;
+    }
+
+  /* ppst->pamfile is no longer re-copied from mfname here: it was redundant and
+     crashed under MacOSX, because the string was being copied on top of itself */
+  SAFE_STRNCPY(ppst->pam_name, ppst->pamfile, MAX_FN);
+
+  if (ppst->pam_ms) {
+    SAFE_STRNCAT(ppst->pam_name,"-MS",MAX_FN-strlen(ppst->pam_name));
+  }
+
+  /* the size of the alphabet is determined in advance */
+  ppst->nt_align = (ppst->dnaseq == SEQT_DNA || ppst->dnaseq == SEQT_RNA);
+
+  /* look for alphabet line, skipping the comments, alphabet ends up in line[] */
+  while (fgets (line, sizeof(line), fmat) != NULL && line[0]=='#');
+
+  /* transfer the residue line into l_sq[1..]; l_sq[0] is only a placeholder,
+     and residues that would not fit in l_sq[] are ignored */
+  l_nsq = 1;
+  l_sq[0] = '\0';
+  for (i=0; line[i] != '\0'; i++) {
+    if ((isalpha((unsigned char)line[i]) || line[i] == '*') && l_nsq <= MAXSQ) {
+      l_sq[l_nsq++] = line[i];
+    }
+  }
+
+  /* if we have a DNA matrix, various defaults must be updated,
+     particularly pascii, which is used to map the residue ordering
+     in the matrix file to the residue ordering used by the
+     program */
+
+  if (l_nsq < 20) {
+    if (ppst->dnaseq <= SEQT_PROT) {
+      ppst->dnaseq = SEQT_DNA;
+    }
+    ppst->nt_align=1;
+    pascii = nascii;   /* use correct DNA mapping, NCBIstdaa by default */
+  }
+
+  /* we no-longer re-initialize sascii[], we either use NCBIstdaa
+     mapping for protein, or nascii for DNA */
+
+  /* 11-July-2014 -- need to check that alphabet is consistent with pascii */
+
+  /* check for 2D pam - if not found, allocate it */
+  if (!ppst->have_pam2) {
+    alloc_pam (MAXSQ+1, MAXSQ+1, ppst);
+    ppst->have_pam2 = 1;
+  }
+
+  max_val = -1;
+  min_val = 1;
+  ppst->pam2[0][0][0] = -BIGNUM;
+  /* make certain the [0] boundaries are -BIGNUM */
+  for (j=1; j < l_nsq; j++) {
+    p_j = pascii[l_sq[j]];
+    if (p_j > MAXSQ) p_j = pascii['X'];   /* same clamp as for p_i below */
+    ppst->pam2[0][0][p_j] = ppst->pam2[0][p_j][0] = -BIGNUM;
+  }
+
+  /* read the scoring matrix values */
+  for (iaa = 1; iaa < l_nsq; iaa++) {   /* read pam value line */
+    p_i = pascii[l_sq[iaa]];
+    if (p_i > MAXSQ) {
+      fprintf(stderr,"*** error [%s:%d] - residue character %c out of range %d\n",
+              __FILE__, __LINE__, l_sq[iaa], p_i);
+      p_i = pascii['X'];
+    }
+    if (fgets(line,sizeof(line),fmat)==NULL) {
+      fprintf (stderr," error reading pam line: %s\n",line);
+      exit (1);
+    }
+    /* fprintf(stderr,"%d/%d %s",iaa,nsq,line); */
+    strtok(line," \t\n");   /* skip the letter (residue) */
+
+    for (j = 1; j < l_nsq; j++) {
+      p_j = pascii[l_sq[j]];
+      if (p_j > MAXSQ) p_j = pascii['X'];   /* keep the column index inside pam2 */
+      lp=strtok(NULL," \t\n");              /* get the number string */
+      if (lp == NULL) {                     /* row is shorter than the alphabet line */
+        fprintf (stderr," error reading pam line: too few values for %c in %s\n", l_sq[iaa], mfname);
+        exit (1);
+      }
+      pval=ppst->pam2[0][p_i][p_j]=atoi(lp);   /* convert to integer */
+      if (pval > max_val) max_val = pval;
+      if (pval < min_val) min_val = pval;
+    }
+  }
+  ppst->pam_h = max_val;
+  ppst->pam_l = min_val;
+
+  if (ppst->dnaseq==0) {
+    pam_sq = apam_sq;
+    pam_sq_n = apam_sq_n;
+    init_altpam(ppst);
+  }
+  else {
+    pam_sq = npam_sq;
+    pam_sq_n = npam_sq_n;
+  }
+
+  /* is protein but do not have '*' in alphabet; l_sq[0] is '\0' and l_sq[] is
+     not NUL-terminated, so search l_sq[1..l_nsq-1] by length, not with strchr() */
+  p_i = pascii['*'];
+  p_j = pascii['X'];
+  if (!ppst->nt_align && memchr(l_sq+1,'*',(size_t)(l_nsq-1))==NULL) {
+    /* add it */
+    for (i=0; i< l_nsq; i++) {
+      ppst->pam2[0][p_i][i] = ppst->pam2[0][p_j][i];
+      ppst->pam2[0][i][p_i] = ppst->pam2[0][i][p_j];
+    }
+  }
+
+  /* make sure that X:X is < 0 if -S */
+  if (ppst->ext_sq_set && ppst->pam2[0][p_j][p_j] >= 0) {
+    ppst->pam2[0][p_j][p_j] = -1;
+  }
+
+  fclose (fmat);
+  return 1;
+}
+
+/* make a DNA scoring from +match/-mismatch values */
+
+void mk_n_pam(int *arr,int siz, int mat, int mis)
+{
+  /* current default match/mismatch values */
+  int max_mat = +5;
+  int min_mis = -4;
+  int row, col;
+  int *cell = arr;   /* walks arr[] in order: row r contributes r+1 entries */
+  float f_val, f_scale;
+
+  f_scale = (float)(mat - mis)/(float)(max_mat - min_mis);
+
+  for (row = 0; row<nnt-1; row++) {
+    for (col = 0; col <= row; col++, cell++) {
+      if (*cell == -1) continue;                 /* -1 entries are left untouched */
+      if (*cell == max_mat) { *cell = mat; continue; }
+      if (*cell == min_mis) { *cell = mis; continue; }
+      /* any other value is rescaled from the default range onto mis..mat */
+      f_val = (*cell - min_mis)*f_scale + 0.5;
+      *cell = f_val + mis;
+    }
+  }
+}
+
+int
+standard_pam(char *smstr, struct pstruct *ppst, int del_set, int gap_set) {
+  struct std_pam_str *hit = std_pams;
+
+  pam_opts(smstr, ppst);   /* strips any -MS / offset suffix from smstr */
+
+  /* std_pams[] ends with an entry whose abbrev is empty */
+  while (hit->abbrev[0] && strcmp(smstr,hit->abbrev)!=0) hit++;
+  if (!hit->abbrev[0]) return 0;   /* smstr is not one of the built-in matrices */
+
+  pam = hit->pam;
+  strncpy(ppst->pam_name,hit->name,MAX_FN);
+  ppst->pam_name[MAX_FN-1]='\0';
+  if (ppst->pam_ms) {
+    strncat(ppst->pam_name,"-MS",MAX_FN-strlen(ppst->pam_name)-1);
+  }
+  ppst->pam_name[MAX_FN-1]='\0';
+
+  /* take the matrix's gap penalties unless del_set / gap_set say they are already set */
+#ifdef OLD_FASTA_GAP
+  if (!del_set) ppst->gdelval = hit->gdel+hit->ggap;
+#else
+  if (!del_set) ppst->gdelval = hit->gdel;
+#endif
+  if (!gap_set) ppst->ggapval = hit->ggap;
+  ppst->pamscale = hit->scale;
+  return 1;
+}
+
+/* scan through the sorted (by decreasing entropy) std_pams[] array
+ for a scoring matrix with enough entropy
+*/
+int
+min_pam_bits(int n0_eff, double bit_thresh, struct pstruct *ppst, int del_set, int gap_set) {
+  struct std_pam_str *std_pam_p;
+  int curr_pam_idx = 0;
+
+  pam_opts(ppst->pamfile, ppst);   /* strips any -MS / offset suffix from pamfile */
+
+  /* get the index for the current (standard) pam file */
+  for (curr_pam_idx = 0; std_pams[curr_pam_idx].abbrev[0]; curr_pam_idx++) {
+    if (strcmp(ppst->pamfile,std_pams[curr_pam_idx].abbrev)==0) break;
+  }
+
+  /* only use matrices from the VT series (or the current matrix itself) */
+  for ( ; curr_pam_idx > 0 ; curr_pam_idx-- ) {
+    if ((strncmp(std_pams[curr_pam_idx].name,"VT",2)!=0) &&
+        (strcmp(ppst->pamfile, std_pams[curr_pam_idx].abbrev) != 0)) continue;
+    if (n0_eff * std_pams[curr_pam_idx].entropy >= bit_thresh) break;
+  }
+  if (curr_pam_idx <= 0) return 0;   /* no candidate reached bit_thresh */
+
+  std_pam_p = &std_pams[curr_pam_idx];
+  pam = std_pam_p->pam;
+  /* terminate after strncpy() so strncat() and strlen() see a valid string,
+     and size the append from pam_name itself, as standard_pam() does */
+  strncpy(ppst->pam_name,std_pam_p->name,MAX_FN);
+  ppst->pam_name[MAX_FN-1]='\0';
+  if (ppst->pam_ms) {
+    strncat(ppst->pam_name,"-MS",MAX_FN-strlen(ppst->pam_name)-1);
+  }
+  ppst->pam_name[MAX_FN-1]='\0';
+#ifdef OLD_FASTA_GAP
+  if (!del_set) ppst->gdelval = std_pam_p->gdel+std_pam_p->ggap;
+#else
+  if (!del_set) ppst->gdelval = std_pam_p->gdel;
+#endif
+  if (!gap_set) ppst->ggapval = std_pam_p->ggap;
+  ppst->pamscale = std_pam_p->scale;
+  return 1;
+}
+
+/* build_xascii is only used for SEQT_UNK.  It rewrites the input
+   mapping qascii[] so that every letter found in NCBIstdaa_ext[] or
+   ntx[] maps to itself (its own character code), while everything
+   else in 1..127 maps to NA.  Letters that used to be added from a
+   separate othx[] alphabet are now part of NCBIstdaa.
+
+   The mappings for ',', '*' and for each character in save_str[] (at
+   most MAX_SSTR of them) are captured before the rewrite and put back
+   afterwards; '\n' and '\r' map to EL and qascii[0] to ES.
+
+   Because the resulting qascii[] holds raw character values, the
+   caller must later re-code the query with aascii[] or nascii[].
+ */
+void
+build_xascii(int *qascii, char *save_str) {
+  int saved_map[MAX_SSTR];
+  int n_saved = 0;
+  int idx;
+  const int saved_comma = qascii[','];
+  const int saved_term = qascii['*'];
+
+  /* remember the current mapping of the characters to be preserved */
+  while (n_saved < MAX_SSTR && save_str[n_saved]) {
+    saved_map[n_saved] = qascii[save_str[n_saved]];
+    n_saved++;
+  }
+
+  /* qascii[0] is deliberately not cleared here */
+  for (idx = 1; idx < 128; idx++) {
+    qascii[idx] = NA;
+  }
+
+  /* alphabet entries start at index 1; each letter maps to itself */
+  idx = 1;
+  while (idx < NCBIstdaa_ext_n) {
+    qascii[NCBIstdaa_ext[idx]] = NCBIstdaa_ext[idx];
+    idx++;
+  }
+
+  idx = 1;
+  while (idx < nntx) {
+    qascii[ntx[idx]] = ntx[idx];
+    idx++;
+  }
+
+  /* line ends, then the saved special characters */
+  qascii['\r'] = EL;
+  qascii['\n'] = EL;
+
+  qascii[','] = saved_comma;
+  qascii['*'] = saved_term;
+  qascii[0] = ES;
+
+  for (idx = 0; idx < n_saved; idx++) {
+    qascii[save_str[idx]] = saved_map[idx];
+  }
+}
+
+
+/* init_ascii0 -- builds the ascii -> residue-index mapping xascii[]
+   from the residue ordering sq_map[1..n_sq_map-1], and copies that
+   ordering into ppst->sq[] (ppst->nsq = n_sq_map, ppst->sq[0] = 0).
+
+   Characters not in sq_map[] map to NA.  When n_sq_map <= MAXUC the
+   lower-case form of each letter is given the same index.  Finally
+   xascii[0] is ES and line-feed / carriage-return map to EL.
+*/
+void
+init_ascii0(int *xascii, char *sq_map, int n_sq_map, struct pstruct *ppst) {
+  const int fold_case = (n_sq_map <= MAXUC);	/* only uppercase */
+  int k;
+
+  /* start with nothing recognised as sequence */
+  k = 0;
+  while (k < 128) {
+    xascii[k++] = NA;
+  }
+
+  /* each alphabet letter maps to its position in sq_map[] */
+  for (k = 1; k < n_sq_map; k++) {
+    xascii[sq_map[k]] = k;
+    if (fold_case) {
+      xascii[tolower(sq_map[k])] = k;
+    }
+  }
+
+  /* publish the alphabet in the parameter structure */
+  ppst->nsq = n_sq_map;
+  for (k = 1; k < n_sq_map; k++) {
+    ppst->sq[k] = sq_map[k];
+  }
+  ppst->sq[0] = 0;
+
+  /* end-of-sequence and end-of-line markers */
+  xascii[0] = ES;
+  xascii['\n'] = EL;	/* 10 */
+  xascii['\r'] = EL;	/* 13 */
+}
+
+/* init_ascii()
+
+   (re)initializes xascii[] from one of the built-in alphabets: nt[]
+   or ntx[] for SEQT_DNA/SEQT_RNA, NCBIstdaa[] or NCBIstdaa_ext[]
+   otherwise; is_ext selects the extended variant.  Nothing is done
+   for SEQT_UNK.
+
+   If the chosen alphabet has no lower-case letters, lower-case input
+   is mapped to the same index as upper-case (and, when is_dna==1, 'u'
+   is mapped like 't').  The mapping of '*' is preserved and
+   xascii[0] is set to ES.
+
+   Every character of the alphabet ends up represented in xascii[],
+   but other entries -- in particular xascii[O,U,J] -- are not
+   guaranteed to hold appropriate values.  p_nsq is not used here.
+*/
+void
+init_ascii(int is_ext, int *xascii, int p_nsq, int is_dna) {
+
+  char *alphabet, saved_term;
+  int alpha_n;
+  int pos;
+  int found_lower = 0;
+
+  if (is_dna==SEQT_UNK) return;
+
+  saved_term = xascii['*'];
+
+  /* pick the alphabet */
+  if (is_dna==SEQT_DNA || is_dna == SEQT_RNA) {
+    alphabet = is_ext ? &ntx[0] : &nt[0];
+    alpha_n = is_ext ? nntx : nnt;
+  }
+  else {
+    alphabet = is_ext ? NCBIstdaa_ext : NCBIstdaa;
+    alpha_n = is_ext ? NCBIstdaa_ext_n : NCBIstdaa_n;
+  }
+
+  /* map each letter to its position, noting any lower-case letter */
+  pos = 1;
+  while (pos <= alpha_n) {
+    xascii[alphabet[pos]] = pos;
+    if (alphabet[pos] >= 'a' && alphabet[pos] <= 'z') found_lower = 1;
+    pos++;
+  }
+
+  /* upper-case-only alphabet: let lower case share the indices */
+  if (!found_lower) {
+    pos = 1;
+    while (pos <= alpha_n) {
+      if (alphabet[pos] >= 'A' && alphabet[pos] <= 'Z') {
+        xascii[alphabet[pos]-'A'+'a'] = pos;
+      }
+      pos++;
+    }
+    if (is_dna==1) xascii['u'] = xascii['t'];
+  }
+
+  xascii['*'] = saved_term;
+  xascii[0] = ES;
+}
+
+/* validate_novel_aa() must run after xascii[] has been
+   re-initialized.
+
+   For non-DNA/RNA sequences, if the index of O, U or J (either case)
+   is larger than p_nsq, the letter is re-mapped to K, C or L
+   respectively.  Then every upper-case letter is checked: an index
+   that is below NA but above p_nsq is reported on stderr and the
+   program exits with status 1.
+*/
+void
+validate_novel_aa(int *xascii, int p_nsq, int dnaseq) {
+  /* { novel upper, novel lower, substitute upper, substitute lower } */
+  static const char remap[3][4] = {
+    {'O', 'o', 'K', 'k'},
+    {'U', 'u', 'C', 'c'},
+    {'J', 'j', 'L', 'l'}
+  };
+  int r, letter;
+  int out_of_range = 0;
+
+  if (!(dnaseq == SEQT_DNA || dnaseq == SEQT_RNA)) {
+    for (r = 0; r < 3; r++) {
+      if (xascii[remap[r][0]] > p_nsq || xascii[remap[r][1]] > p_nsq) {
+        xascii[remap[r][0]] = xascii[remap[r][2]];
+        xascii[remap[r][1]] = xascii[remap[r][3]];
+      }
+    }
+  }
+
+  /* final check for characters out of range (> p_nsq) */
+  letter = 'A';
+  while (letter <= 'Z') {
+    if (xascii[letter] < NA && xascii[letter] > p_nsq) {
+      fprintf(stderr, " *** ERROR *** xascii['%c']:%d > %d\n",letter, xascii[letter], p_nsq);
+      out_of_range = 1;
+    }
+    letter++;
+  }
+
+  if (out_of_range) {
+    exit(1);
+  }
+}
+
+
+/* print_pam() -- debugging dump to stderr: the ext_sq_set flag, the
+   alphabet(s) in ppst, and the diagonal (self-match) entries of the
+   scoring matrix.  When ext_sq_set is non-zero the extended alphabet
+   (sqx/nsqx) and pam2[1] are used for the diagonal, otherwise sq/nsq
+   and pam2[0].
+*/
+void
+print_pam(struct pstruct *ppst) {
+  unsigned char *alphabet = ppst->sq;
+  int alpha_n = ppst->nsq;
+  int pam_idx = 0;
+  int k;
+
+  fprintf(stderr," ext_sq_set: %d\n",ppst->ext_sq_set);
+
+  fprintf(stderr," sq[%d]: %s\n",alpha_n, alphabet);
+
+  if (ppst->ext_sq_set) {
+    alphabet = ppst->sqx;
+    alpha_n = ppst->nsqx;
+    pam_idx = 1;
+    fprintf(stderr," sq[%d]: %s\n",alpha_n, alphabet);
+  }
+
+  k = 1;
+  while (k <= alpha_n) {
+    fprintf(stderr," %c:%c - %3d\n",alphabet[k], alphabet[k], ppst->pam2[pam_idx][k][k]);
+    k++;
+  }
+}
diff --git a/src/best_stats.h b/src/best_stats.h
new file mode 100644
index 0000000..4fc29d4
--- /dev/null
+++ b/src/best_stats.h
@@ -0,0 +1,52 @@
+/* Concurrent read version */
+
+/* $Id: best_stats.h 808 2011-07-19 20:05:24Z wrp $ */
+/* $Revision: 808 $ */
+
+#include "param.h"
+
+/*
+#ifndef FSEEK_T_DEF
+#define FSEEK_T_DEF
+#ifndef USE_FSEEKO
+typedef long fseek_t;
+#else
+typedef off_t fseek_t;
+#endif
+#endif
+*/
+
+/* beststr -- one record per retained library-sequence hit: pointers
+   to the sequence and its meta-information, the scores (rst, z-scores,
+   bit-scores), and the alignment result chain once it has been built.
+*/
+struct beststr {
+ struct seq_record *seq; /* sequence info */
+ struct mseq_record *mseq; /* sequence meta-info */
+ struct beststr *bbp_link; /* link to a previous beststr entry with the same sequence */
+ struct rstruct rst; /* results info */
+
+ int n1; /* duplicate of seq.n1, used for error checking/debugging */
+#ifdef DEBUG
+ long adler32_crc; /* duplicate of seq.adler32_crc for error checking/debugging */
+#endif
+ int frame; /* in buf2_str */
+ int repeat_thresh; /* threshold for additional alignments */
+ double zscore; /* the z-score mostly exists for sorting best scores */
+ double zscore2; /* z-score - from high-scoring shuffles */
+ double bit_score; /* move to bit-scores for consistency */
+ double bit_score2; /* bit-score for second shuffle */
+
+ int a_res_cnt; /* NOTE(review): presumably the number of entries in the a_res chain -- confirm */
+ struct a_res_str *a_res; /* need only a_res, not a_res[2], because different frames
+ for the same sequence are stored separately */
+ int have_ares; /* NOTE(review): presumably the same flag build_ares_code() tests with (have_ares & 0x1) -- confirm */
+ float percent, gpercent; /* NOTE(review): presumably percent identity, ungapped/gapped -- confirm */
+};
+
+/* stat_str -- per-sequence values kept for statistics.
+   NOTE(review): field meanings below are inferred from the names
+   only; confirm against the code that fills this structure.
+*/
+struct stat_str {
+ int score; /* similarity score */
+ int n1; /* presumably library sequence length (cf. beststr.n1) */
+ double comp; /* presumably a composition measure -- confirm */
+ double H; /* presumably an entropy measure -- confirm */
+ double escore; /* presumably an expectation score -- confirm */
+ int segnum; /* presumably number of segments -- confirm */
+ int seglen; /* presumably segment length -- confirm */
+};
+
diff --git a/src/build_ares.c b/src/build_ares.c
new file mode 100644
index 0000000..ffe26ff
--- /dev/null
+++ b/src/build_ares.c
@@ -0,0 +1,248 @@
+/* $Id: build_ares.c $ */
+
+/* copyright (c) 2010, 2014 by William R. Pearson and The Rector &
+ Visitors of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+/* build_ares_code is called by showbest() (in threaded/serial code) or
+ p2_workcomp in PCOMPLIB code to produce the cur_ares-> chain that
+ is displayed in showbest().
+
+ For PCOMPLIB, the cur_ares->chain is passed to bbp->a_res by
+ do_stage2(), where it is available to showbest();
+
+ By using this code, the a_res chain used in either mode will be the
+ same, so the code required to display an a_res should be the same.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "defs.h"
+#include "structs.h"
+#include "param.h"
+
+/* #include "mm_file.h" */
+#include "best_stats.h"
+#include "drop_func.h"
+
+extern void calc_coord(int n0, int n1, long qoffset, long loffset,
+ struct a_struct *aln);
+
+extern void calc_astruct(struct a_struct *aln_p, struct a_res_str *a_res_p, void *f_str);
+
+/* in build_ares_code, *aa1 is separate from *seq because *seq has
+   permanent information about aa1, but aa1 may be temporary
+
+   build_ares_code() calculates various annotation strings, depending
+   on what kinds of annotations are requested with -m "F# file" and -m #
+
+   There are three fundamentally different annotation formats:
+   (1) annot_var_s -- the long version shown with alignments
+   (2) annot_code -- a very compact version, shown with -m 9[cC] and -m 8CC
+   (3) annot_id -- a different compact version
+
+   These need to be saved separately, and used at the right time.
+   They are NOT exclusive (which older code assumed).
+
+   Arguments:
+     aa0, n0        query residues and length (n0 is passed to
+                    do_walign(); the calc_*() calls use m_msp->n0)
+     aa1, seq       library residues and their permanent record
+     frame          frame passed to do_walign()/pre_cons()/aln_func_vals()
+     have_ares      in/out flag; if bit 0x1 is already set no alignment
+                    is produced here and NULL is returned
+     repeat_thresh  passed through to do_walign()
+     m_msp, ppst    message / parameter structures
+     f_str          opaque state for the alignment functions
+
+   Returns the head of the a_res chain produced by do_walign() (or
+   NULL).  For every element of the chain the encoded alignment
+   (aln_code), the annotation strings and score_delta are filled in
+   according to m_msp->tot_show_code.  A failed string allocation is
+   reported on stderr and leaves the corresponding pointer NULL.
+*/
+
+struct a_res_str *
+build_ares_code(unsigned char *aa0, int n0,
+                unsigned char *aa1, struct seq_record *seq,
+                int frame, int *have_ares, int repeat_thresh,
+                const struct mngmsg *m_msp, struct pstruct *ppst,
+                void *f_str
+                )
+{
+  unsigned char *aa1_ann;
+  struct a_res_str *my_ares_p, *cur_ares_p;
+  struct a_struct *aln_p;
+  struct dyn_string_str *annot_str_dyn, *align_code_dyn;
+  long loffset;		/* loffset is offset from beginning of real sequence */
+  long l_off;		/* l_off is the virtual coordinate of residue 1 */
+  int seqc_max;
+  char *seq_code;
+  int seq_code_len, annot_str_len;
+  int score_delta;
+  int variant_calc_done;
+
+  align_code_dyn = init_dyn_string(2048, 2048);
+  annot_str_dyn = init_dyn_string(2048, 2048);
+
+  if (seq->annot_p) {aa1_ann = seq->annot_p->aa1_ann;}
+  else aa1_ann = NULL;
+  loffset = seq->l_offset;
+  l_off = seq->l_off;
+
+  if (! (*have_ares & 0x1)) {	/* we don't have an a_res, and we need one */
+
+    my_ares_p = do_walign(aa0, n0, aa1, seq->n1,
+                          frame,
+                          repeat_thresh, ppst, f_str,
+                          have_ares);
+  }
+  else {	/* we already have the a_res */
+    pre_cons(aa1,seq->n1,frame,f_str);
+    my_ares_p = NULL;
+  }
+
+  /* here, we need to loop through all the alignments, and produce
+     the statistics/codes for each */
+
+  for (cur_ares_p = my_ares_p; cur_ares_p != NULL; cur_ares_p = cur_ares_p->next) {
+
+    /* per-alignment state: never carry a score_delta (or the "done"
+       flag) over from the previous alignment in the chain */
+    score_delta = 0;
+    variant_calc_done = 0;
+
+    /* size the scratch buffer for the current alignment, but never
+       smaller than the size derived from the head of the chain */
+    seqc_max = cur_ares_p->nres;
+    if (my_ares_p->nres > seqc_max) { seqc_max = my_ares_p->nres; }
+    seqc_max += 4*m_msp->aln.llen+4;
+
+    cur_ares_p->aln_code = seq_code = NULL;
+    cur_ares_p->aln_code_n = seq_code_len = 0;
+    cur_ares_p->annot_code = NULL;
+    cur_ares_p->annot_code_n = 0;
+    cur_ares_p->annot_var_s = NULL;
+    cur_ares_p->annot_var_id = NULL;
+    cur_ares_p->annot_var_idd = NULL;
+
+    aln_p = &cur_ares_p->aln;
+
+    /* this sets a number of constants, from the alignment function
+       and frame, and only needs to be called once */
+    aln_func_vals(frame, aln_p);
+
+    if (m_msp->tot_show_code & (SHOW_CODE_ALIGN+SHOW_CODE_CIGAR+SHOW_CODE_EXT)) {
+      cur_ares_p->aln_code = seq_code=(char *)calloc(seqc_max,sizeof(char));
+
+      if (seq_code != NULL) {
+
+        calc_astruct(aln_p, cur_ares_p, f_str);
+
+        /* we need this for offset information for calc_code, but it is
+           incomplete so we must do it again */
+
+        calc_coord(m_msp->n0,seq->n1,
+                   m_msp->q_offset + (m_msp->q_off-1) + (m_msp->sq0off-1),
+                   loffset + (l_off-1) + (m_msp->sq1off-1),
+                   aln_p);
+
+        aln_p->lc=calc_code(aa0, m_msp->n0,
+                            aa1,seq->n1,
+                            aln_p,cur_ares_p,
+                            ppst,
+                            align_code_dyn,
+                            m_msp->ann_arr,
+                            m_msp->aa0a, m_msp->annot_p,
+                            aa1_ann, seq->annot_p,
+                            annot_str_dyn,
+                            &score_delta,
+                            f_str, m_msp->pstat_void,
+                            m_msp->tot_show_code);
+
+        cur_ares_p->aln_code_n = seq_code_len = strlen(seq_code);
+        if (seq_code[1] == '0' && seq_code[0] == '=') {
+          fprintf(stderr," code begins with 0: %s\n", seq_code);
+        }
+
+        if (align_code_dyn != NULL) {
+          if (seq_code && cur_ares_p->aln_code == seq_code) {
+            free(seq_code);	/* free it since it is replaced below */
+          }
+          seq_code_len = strlen(align_code_dyn->string);
+          cur_ares_p->aln_code = (char *)calloc(seq_code_len+2,sizeof(char));
+          if (cur_ares_p->aln_code == NULL) {
+            fprintf(stderr,"*** ERROR *** [%s/%d] cannot allocate cur_ares_p->aln_code [%d]\n",
+                    __FILE__, __LINE__, seq_code_len+2);
+            cur_ares_p->aln_code_n = 0;
+          }
+          else {
+            cur_ares_p->aln_code_n = seq_code_len+2;
+            SAFE_STRNCPY(cur_ares_p->aln_code,align_code_dyn->string, seq_code_len+2);
+          }
+          reset_dyn_string(align_code_dyn);
+        }
+
+        if (annot_str_dyn != NULL) {
+          annot_str_len = strlen(annot_str_dyn->string);
+          cur_ares_p->annot_code = (char *)calloc(annot_str_len+2,sizeof(char));
+          if (cur_ares_p->annot_code == NULL) {
+            fprintf(stderr,"*** ERROR *** [%s/%d] cannot allocate cur_ares_p->annot_code [%d]\n",
+                    __FILE__, __LINE__, annot_str_len+2);
+            annot_str_len = 0;
+          }
+          else {
+            SAFE_STRNCPY(cur_ares_p->annot_code,annot_str_dyn->string, annot_str_len+2);
+          }
+          reset_dyn_string(annot_str_dyn);
+        }
+        else {annot_str_len = 0;}
+        cur_ares_p->annot_code_n = annot_str_len;
+        variant_calc_done = 1;
+      }
+    }
+
+    if ((m_msp->tot_show_code & SHOW_CODE_IDD) == SHOW_CODE_IDD) {
+      aln_p->lc=calc_idd(aa0,m_msp->n0,aa1,seq->n1,
+                         aln_p, cur_ares_p,
+                         ppst,
+                         m_msp->annot_p, seq->annot_p,
+                         &score_delta,
+                         annot_str_dyn, f_str);
+      variant_calc_done = 1;
+
+      if (annot_str_dyn != NULL) {
+        annot_str_len = strlen(annot_str_dyn->string);
+        cur_ares_p->annot_var_idd = (char *)calloc(annot_str_len+2,sizeof(char));
+        if (cur_ares_p->annot_var_idd == NULL) {
+          fprintf(stderr,"*** ERROR *** [%s/%d] cannot allocate cur_ares_p->annot_var_idd [%d]\n",
+                  __FILE__, __LINE__, annot_str_len+2);
+        }
+        else {
+          SAFE_STRNCPY(cur_ares_p->annot_var_idd,annot_str_dyn->string, annot_str_len+2);
+        }
+      }
+    }
+
+    /* ensure that calc_id (or something else) is ALWAYS done to set score_delta */
+    if (!variant_calc_done || (m_msp->tot_show_code & SHOW_CODE_ID) == SHOW_CODE_ID) {
+      aln_p->lc=calc_id(aa0,m_msp->n0,aa1,seq->n1,
+                        aln_p, cur_ares_p,
+                        ppst,
+                        m_msp->annot_p, seq->annot_p,
+                        &score_delta,
+                        annot_str_dyn, f_str);
+
+      if ((m_msp->tot_show_code & SHOW_CODE_ID)==SHOW_CODE_ID &&
+          annot_str_dyn != NULL && (annot_str_dyn->string[0] != '\0')) {
+        if ((cur_ares_p->annot_var_id = (char *)calloc(strlen(annot_str_dyn->string)+2, sizeof(char)))==NULL) {
+          fprintf(stderr,"*** ERROR *** [%s/%d] cannot allocate cur_ares_p->annot_var_id [%d]\n",
+                  __FILE__, __LINE__, (int)strlen(annot_str_dyn->string)+2);
+        }
+        else {
+          strncpy(cur_ares_p->annot_var_id,annot_str_dyn->string,strlen(annot_str_dyn->string)+2);
+        }
+      }
+    }
+
+    /* a positive score_delta is added to all the scores */
+    if (score_delta > 0) {
+      cur_ares_p->rst.score[0] += score_delta;
+      cur_ares_p->rst.score[1] += score_delta;
+      cur_ares_p->rst.score[2] += score_delta;
+      cur_ares_p->sw_score += score_delta;
+      cur_ares_p->score_delta = score_delta;
+    }
+    else {
+      cur_ares_p->score_delta = 0;
+    }
+
+    /*
+    if (annot_str_dyn->string[0] != '\0') {
+      if ((cur_ares_p->annot_var_s = (char *)calloc(strlen(annot_str_dyn->string)+2, sizeof(char)))==NULL) {
+        fprintf(stderr,"*** ERROR *** [%s/%d] cannot allocate cur_ares_p->annot_var_s [%d]\n",
+                __FILE__, __LINE__, (int)strlen(annot_str_dyn->string)+2);
+      }
+      else {
+        strncpy(cur_ares_p->annot_var_s,annot_str_dyn->string,strlen(annot_str_dyn->string)+2);
+      }
+    }
+    */
+
+    /* this should be all the information we need on the alignment */
+  } /* end for (cur_ares_p;) */
+  free_dyn_string(annot_str_dyn);
+  free_dyn_string(align_code_dyn);
+  return my_ares_p;
+}
diff --git a/src/c_dispn.c b/src/c_dispn.c
new file mode 100644
index 0000000..5c9741e
--- /dev/null
+++ b/src/c_dispn.c
@@ -0,0 +1,573 @@
+/* dispn.c associated subroutines for matching sequences */
+
+/* $Id: c_dispn.c 1124 2013-03-13 20:24:57Z wrp $ */
+/* $Revision: 1124 $ */
+
+/* copyright (c) 1988, 1995, 1996, 2008, 2013, 2014 by William R. Pearson and
+ The Rector and Visitors of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+
+#include "defs.h"
+#include "structs.h"
+#include "param.h"
+
+#define XTERNAL
+
+#define YES 1
+#define NO 0
+
+#define MAXOUT 201
+
+/* the seqca[] array has the following codes:
+ 0 - no alignment symbol
+ 1 - align; pam < 0
+ 2 - align; pam == 0
+ 3 - align; pam > 0
+ 4 - align; ident
+ 5 - align; del
+
+ the map_sym arrays determine the value to be displayed with each
+ type of aligned residue
+*/
+
+#include "a_mark.h"
+
+/* discons() -- write one alignment to fd in the format selected by
+   m_msp->markx.
+
+   seqc0/seqc1   the aligned (gapped) query and library strings;
+                 '-', '/', '\\' and ' ' are treated as non-residue
+                 columns
+   seqc0a/seqc1a optional annotation strings (may be NULL)
+   seqca         per-column alignment code, used to index the
+                 map_sym arrays
+   cumm_seq_score per-column scores (MX_RES_ALIGN_SCORE output only)
+   nc            number of alignment columns
+   n0, n1        sequence lengths (printed in MX_M10FORM output)
+   name0, name1  sequence names; nml is the name field width
+   aln           alignment boundaries, offsets and orientation
+
+   MX_ASEP, MX_M10FORM and MX_RES_ALIGN_SCORE produce their own
+   layouts and return early; otherwise the alignment is written in
+   blocks of l_llen columns with coordinate lines (or, for
+   MX_MBLAST, start/end coordinates on each sequence line).
+*/
+void
+discons(FILE *fd, const struct mngmsg *m_msp,
+        char *seqc0, char *seqc0a,
+        char *seqc1, char *seqc1a,
+        char *seqca, int *cumm_seq_score, int nc,
+        int n0, int n1, char *name0, char *name1, int nml,
+        struct a_struct *aln)
+{
+  char line[3][MAXOUT];	/* alignment lines [0,2], similarity code [1] */
+  char cline[2][MAXOUT+10], *clinep[2];	/* coordinate line */
+  int il, i, lend, loff, id, tot_score;
+  int del0, del1, ic, ll0, ll1, ll01, cl0, cl1, rl0, rl1;
+  int ic_save;
+  char *map_sym_p;
+  int l_llen;
+  int ioff0, ioff00, ioff1, ioff10;
+  long q_start, q_end, qf_flag, s_start, s_end, sf_flag;
+  long qqoff, lloff;
+  int llsgn, llfact, qlsgn, qlfact, qfx0, qfxn, lfx0, lfxn;
+  long s_digit_max, q_digit_max;
+  char digit_tmp[32];
+  int digit_len;
+  int have_res;
+  char *name01;
+  char blank[32], afmt[32], afmt0[32];
+  int disp_dna_align = ((m_msp->qdnaseq>0) && (m_msp->ldb_info.ldnaseq > 0));
+
+  memset(blank,' ',sizeof(blank)-1);
+  blank[sizeof(blank)-1]='\0';
+
+  /* clinep[] starts one character into cline[], so that index -1
+     (used when qfxn/lfxn == 1) is still inside the buffer */
+  clinep[0]=cline[0]+1;
+  clinep[1]=cline[1]+1;
+
+  if (aln->qlfact == 0) {qlfact = 1;}
+  else qlfact = aln->qlfact;
+  if (aln->qlrev == 1) {
+    qlsgn = -1;
+    qfx0 = 0;
+    qfxn = 1;
+  }
+  else {
+    qlsgn = 1;
+    qfx0 = 1;
+    qfxn = 0;
+  }
+
+  if (aln->llfact == 0) {llfact = 1;}
+  else llfact = aln->llfact;
+
+  if (aln->llrev == 1) {
+    llsgn = -1;
+    lfx0 = 0;
+    lfxn = 1;
+  }
+  else {
+    llsgn = 1;
+    lfx0 = 1;
+    lfxn = 0;
+  }
+
+  l_llen = aln->llen;
+  if ((m_msp->markx & MX_M9SUMM) && m_msp->show_code != 1) { l_llen += 40; }
+  /* line[][] holds MAXOUT characters including the terminator, and
+     l_llen is used as a divisor below */
+  if (l_llen > MAXOUT-1) { l_llen = MAXOUT-1; }
+  if (l_llen < 1) { l_llen = 1; }
+
+  if ((m_msp->markx & MX_ATYPE)==2) name01=name1;
+  else name01 = "\0";
+
+  ioff0=aln->smin0;
+  ioff00 = ioff0;
+  ioff1=aln->smin1;
+  ioff10 = ioff1;
+
+  if (m_msp->markx& MX_AMAP && (m_msp->markx & MX_ATYPE)==7) return;
+
+  /* set *map_sym_p to correct match symbol */
+  map_sym_p = aln_map_sym[MX_A0];
+  if ((m_msp->markx&MX_ATYPE)==1) {map_sym_p = aln_map_sym[MX_A1];}
+  else if ((m_msp->markx&MX_ATYPE)==2) {map_sym_p = aln_map_sym[MX_A2];}
+  else if (m_msp->markx&MX_M10FORM) { map_sym_p = aln_map_sym[MX_A10]; }
+  if (m_msp->markx & MX_MBLAST) { map_sym_p = aln_map_sym[MX_ABLAST];}
+
+  if (m_msp->markx & MX_ASEP) {
+    /* the two aligned sequences as separate records, 50 per line */
+    fprintf(fd,">%s ..\n",name0);
+    for (i=0; i<nc && seqc0[i]; i++) {
+      /* if (seqc0[i]=='-') fputc('.',fd); else */
+      fputc(seqc0[i],fd);
+      if (i%50 == 49) fputc('\n',fd);
+    }
+    if ((i-1)%50 != 49) fputc('\n',fd);
+    fprintf(fd,">%s ..\n",name1);
+    for (i=0; i<nc && seqc1[i]; i++) {
+      /* if (seqc1[i]=='-') fputc('.',fd); else */
+      fputc(seqc1[i],fd);
+      if (i%50 == 49) fputc('\n',fd);
+    }
+    if ((i-1)%50 != 49) fputc('\n',fd);
+    return;
+  }
+
+  if (m_msp->markx & MX_M10FORM) {
+    fprintf(fd,">%s ..\n",name0);
+    fprintf(fd,"; sq_len: %d\n",n0);
+    fprintf(fd,"; sq_offset: %ld\n",aln->q_offset+1);
+    fprintf(fd,"; sq_type: %c\n",m_msp->sqtype[0]);
+    fprintf(fd,"; al_start: %ld\n",aln->d_start0);
+    fprintf(fd,"; al_stop: %ld\n",aln->d_stop0);
+    /* in the past, this al_display_start does not include sq0off */
+    fprintf(fd,"; al_display_start: %ld\n",
+            aln->q_offset+qlsgn*ioff0*aln->llmult+qfx0);
+
+    /* leading blanks are shown as '-'; a blank after the first
+       residue ends the sequence */
+    have_res = 0;
+    for (i=0; i<nc && seqc0[i]; i++) {
+      if (!have_res && seqc0[i]==' ') fputc('-',fd);
+      else if (seqc0[i]==' ') break;
+      else {
+        have_res = 1;
+        fputc(seqc0[i],fd);
+      }
+      if (i%50 == 49) fputc('\n',fd);
+    }
+    if ((i-1)%50!=49 || seqc0[i-1]==' ') fputc('\n',fd);
+    fprintf(fd,">%s ..\n",name1);
+    fprintf(fd,"; sq_len: %d\n",n1);
+    fprintf(fd,"; sq_offset: %ld\n",aln->l_offset+1);
+    fprintf(fd,"; sq_type: %c\n",m_msp->sqtype[0]);
+    fprintf(fd,"; al_start: %ld\n",aln->d_start1);
+    fprintf(fd,"; al_stop: %ld\n",aln->d_stop1);
+    /* in the past, this al_display_start does not include sq1off */
+    fprintf(fd,"; al_display_start: %ld\n",aln->l_offset+llsgn*ioff1+lfx0);
+
+    have_res = 0;
+    for (i=0; i<nc && seqc1[i]; i++) {
+      if (!have_res && seqc1[i]==' ') fputc('-',fd);
+      else if (seqc1[i]==' ') break;
+      else {
+        have_res = 1;
+        fputc(seqc1[i],fd);
+      }
+      if (i%50 == 49) fputc('\n',fd);
+    }
+    if ((i-1)%50!=49 || seqc1[i-1]==' ') fputc('\n',fd);
+#ifdef M10_CONS
+    fprintf(fd,"; al_cons:\n");
+    for (i=0,del0=0,id=ioff0; id-del0<aln->amax0 && i < nc; i++,id++) {
+      if (seqc0[i] == '\0' || seqc1[i] == '\0') break;
+      if (seqc0[i]=='-' || seqc0[i]==' ' || seqc0[i]=='\\') del0++;
+      else if (seqc0[i]=='/') del0++;
+      if (id-del0<aln->amin0) fputc(' ',fd);
+      else if (seqc0[i]=='-'||seqc1[i]=='-') fputc('-',fd);
+      else fputc(map_sym_p[seqca[i]],fd);
+
+      if (i%50 == 49) fputc('\n',fd);
+    }
+    if ((i-1)%50!=49 || seqc1[i-1]==' ') fputc('\n',fd);
+#endif
+    return;
+  }
+  else if (m_msp->markx & MX_RES_ALIGN_SCORE) {
+    /* one tab-delimited line per alignment column */
+    have_res = 0;
+    tot_score = 0;
+    del0 = del1 = 0;
+    fprintf(fd,">%s\t%s\t%s0\t%s1\tscore\ttotal\n",
+            name0,name1,m_msp->sqnam,m_msp->sqnam);
+    for (ic=0; ic<nc; ic++, ioff0++, ioff1++) {
+      if (seqc0[ic] == ' ' || seqc0[ic] == '-' || seqc0[ic] == '/' || seqc0[ic] == '\\') {
+        del0++;
+      }
+      if (seqc1[ic] == ' ' || seqc1[ic] == '-' || seqc1[ic] == '/' || seqc1[ic] == '\\') {
+        del1++;
+      }
+
+      tot_score += cumm_seq_score[ic];
+      fprintf(fd,"%ld\t%ld\t%c\t%c\t%d\t%d\n",
+              aln->q_offset+qlsgn*(ioff0-del0)*aln->llmult+qfx0,
+              aln->l_offset+llsgn*(ioff1-del1)+lfx0,
+              seqc0[ic], seqc1[ic], cumm_seq_score[ic], tot_score);
+    }
+    return;
+  }
+
+  memset(line[0],' ',MAXOUT);
+  memset(line[1],' ',MAXOUT);
+  memset(line[2],' ',MAXOUT);
+
+  /* cl0 indicates whether a coordinate should be printed over the first
+     sequence; cl1 indicates a coordinate for the second;
+  */
+
+  ic = 0; del0=del1=0;
+
+  /* we set afmt/afmt0 here, rather than at the start of discons, so
+     we can have accurate values for the max and min query/subject
+     start/end using qlsgn and qlfact
+  */
+
+  if (!(m_msp->markx & MX_MBLAST)) {
+    if (nml > 6) {
+      /* shorten the indent only when the position is inside blank[] */
+      if (nml-6 < (int)sizeof(blank)) { blank[nml-6]='\0'; }
+      sprintf(afmt,"%%-%ds %%s\n",nml);
+    }
+    else {
+      blank[0]='\0';
+      SAFE_STRNCPY(afmt,"%-6s %s\n",sizeof(afmt));
+    }
+  }
+  else {
+    /* for MX_MBLAST format, the size of the numbers is a function of
+       the largest numbers in the first or last coordinate - which
+       could be either the query or the library sequence.
+    */
+    if (qlsgn > 0) {q_digit_max = aln->smin0 + (aln->amax0 - aln->amin0); }
+    else {q_digit_max = aln->smin0;}
+    q_digit_max = aln->q_offset + qlsgn*q_digit_max + 1l;
+
+    if (llsgn > 0) {s_digit_max = aln->smin1 + (aln->amax1 - aln->amin1); }
+    else {s_digit_max = aln->smin1;}
+    s_digit_max = aln->l_offset + aln->frame + llsgn*aln->llmult*s_digit_max + 1l;
+
+    sprintf(digit_tmp,"%ld",max(q_digit_max, s_digit_max));
+    digit_len = strlen(digit_tmp);
+    if (digit_len < 4) digit_len = 4;
+
+    sprintf(afmt,"%%-5s %%-%dld %%s %%-ld\n",digit_len);
+    blank[digit_len+6]='\0';
+    SAFE_STRNCPY(afmt0,"%-10s %s\n",sizeof(afmt0));
+  }
+
+  if (aln->d_start0 < aln->d_stop0) qf_flag = 1; else qf_flag = 0;
+  if (aln->d_start1 < aln->d_stop1) sf_flag = 1; else sf_flag = 0;
+
+  for (il=0; il<(nc+l_llen-1)/l_llen; il++) {
+    loff=il*l_llen;
+    lend=min(l_llen,nc-loff);
+
+    ll0 = NO; ll1 = NO;
+
+    memset(cline[0],' ',MAXOUT+1);
+    memset(cline[1],' ',MAXOUT+1);
+
+    ic_save = ic;
+
+    q_start = aln->q_offset + (long)qlsgn*ioff00 +
+      (long)qlsgn*qlfact*(ioff0-del0-ioff00) + qf_flag;
+    s_start = aln->l_offset + /* aln->frame + */
+      (long)llsgn*aln->llmult*ioff10 +
+      (long)llsgn*llfact*(ioff1-del1-ioff10) + sf_flag;
+
+    for (i=0; i<lend; i++, ic++,ioff0++,ioff1++) {
+      cl0 = cl1 = rl0 = rl1 = YES;
+      if ((line[0][i]=seqc0[ic])=='-' || seqc0[ic]=='\\') {
+        del0++; cl0=rl0=NO;
+      }
+      else if (seqc0[ic]=='/') {
+        del0++; cl0=rl0=NO;
+      }
+      if ((line[2][i]=seqc1[ic])=='-' || seqc1[ic]=='\\') {
+        del1++; cl1=rl1=NO;
+      }
+      else if (seqc1[ic]=='/') {
+        del1++; cl1=rl1=NO;
+      }
+
+      if (seqc0[ic]==' ') {del0++; cl0=rl0=NO;}
+      else ll0 = YES;
+      if (seqc1[ic]==' ') {del1++; cl1=rl1=NO;}
+      else ll1 = YES;
+
+      /* the old version used qoffset, this version uses q_offset+q_off */
+      qqoff = aln->q_offset + (long)qlsgn*ioff00 +
+        (long)qlsgn*qlfact*(ioff0-del0-ioff00);
+      if (cl0 && qqoff%10 == 9) {
+        sprintf(&clinep[0][i-qfxn],"%8ld",qqoff+1l);
+        clinep[0][i+8-qfxn]=' ';
+        rl0 = NO;
+      }
+      else if (cl0 && qqoff== -1) {
+        sprintf(&clinep[0][i-qfxn],"%8ld",0l);
+        clinep[0][i+8-qfxn]=' ';
+        rl0 = NO;
+      }
+      else if (rl0 && (qqoff+1)%10 == 0) {
+        sprintf(&clinep[0][i-qfxn],"%8ld",qqoff+1);
+        clinep[0][i+8-qfxn]=' ';
+      }
+
+      /* the lloff coordinate of a residue is the sum of:
+         m_msp->sq1off-1 - the user defined coordinate
+         aln->l_offset - the offset into the library sequence
+         llsgn*ioff10 - the offset into the beginning of the alignment
+                        (given in the "natural" coordinate system,
+                        except for tfasta3 which provides context)
+         llsgn*llfact*(ioff1-del1-ioff10)
+                      - the position in the consensus alignment, -gaps
+      */
+
+      /* it seems like this should be done in calc_coord() */
+
+      lloff = aln->l_offset + /* aln->frame + */
+        (long)llsgn*aln->llmult*ioff10 +
+        (long)llsgn*llfact*(ioff1-del1-ioff10);
+
+      if (cl1 && lloff%10 == 9) {
+        sprintf(&clinep[1][i-lfxn],"%8ld",lloff+1l);
+        clinep[1][i+8-lfxn]=' ';
+        rl1 = NO;
+      }
+      else if (cl1 && lloff== -1) {
+        /* write at i-lfxn, like every other coordinate, so that the
+           blank stored below replaces the terminator sprintf() wrote */
+        sprintf(&clinep[1][i-lfxn],"%8ld",0l);
+        clinep[1][i+8-lfxn]=' ';
+        rl1 = NO;
+      }
+      else if (rl1 && (lloff+1)%10 == 0) {
+        sprintf(&clinep[1][i-lfxn],"%8ld",lloff+1);
+        clinep[1][i+8-lfxn]=' ';
+      }
+
+      /* middle line: similarity symbol inside the aligned region */
+      line[1][i] = ' ';
+      if (ioff0-del0 >= aln->amin0 && ioff0-del0 <= aln->amax0) {
+        if (m_msp->markx & MX_MBLAST) {
+          if (disp_dna_align) {
+            if (seqca[ic]==4) {line[1][i] = '|';}
+            else {line[1][i] = ' ';}
+          }
+          else {
+            if (seqca[ic]==4) {line[1][i]=line[0][i];}
+            else {line[1][i] = map_sym_p[seqca[ic]];}
+          }
+        }
+        else {
+          if (seqca[ic]==4) {line[1][i]=map_sym_p[4];}
+          else if ((m_msp->markx&MX_ATYPE)==2) line[1][i]=line[2][i];
+          else line[1][i] = map_sym_p[seqca[ic]];
+        }
+      }
+      else if ((m_msp->markx&MX_ATYPE)==2) line[1][i]=line[2][i];
+    }
+
+    /* end coordinates of this block; the library end is adjusted in
+       the library's own direction (llrev) */
+    q_end = qqoff + qf_flag + (aln->qlfact-1)*(aln->qlrev > 0 ? -1 : 1);
+    s_end = lloff + sf_flag + (aln->llfact-1)*(aln->llrev > 0 ? -1 : 1);
+
+    if (m_msp->ann_flg) {
+      for (ic=ic_save,i=0; i<lend; ic++,i++) {
+        if (m_msp->markx&MX_ANNOT_MID) {
+          if (seqc0a && seqc0a[ic]!= ' ') {line[1][i] = seqc0a[ic];}
+          if (seqc1a && seqc1a[ic]!= ' ') {line[1][i] = seqc1a[ic];}
+        }
+        if (m_msp->markx&MX_ANNOT_COORD) {
+          if (seqc0a && seqc0a[ic]!= ' ') clinep[0][i+7-qfxn] = seqc0a[ic];
+          if (seqc1a && seqc1a[ic]!= ' ') clinep[1][i+7-lfxn] = seqc1a[ic];
+        }
+      }
+    }
+
+    line[0][lend]=line[1][lend]=line[2][lend]=0;
+    clinep[0][lend+7]=clinep[1][lend+7]=0;
+
+    ll01 = ll0&&ll1;
+    if ((m_msp->markx&MX_ATYPE)==2 && (!aln->showall || ll0)) ll1=0;
+    fprintf(fd,"\n");
+    if (!(m_msp->markx & MX_MBLAST)) {
+      if (ll0) fprintf(fd,"%s%s\n",blank,clinep[0]);
+      if (ll0) fprintf(fd,afmt,name0,line[0]);
+      if (ll01) fprintf(fd,afmt,name01,line[1]);
+      if (ll1) fprintf(fd,afmt,name1,line[2]);
+      if (ll1) fprintf(fd,"%s%s\n",blank,clinep[1]);
+    }
+    else {
+      /* this code emulates BLAST output, but currently only for
+         coordinates < 10000) coordinates > 10000 will require that
+         the offset start and end coordinates across the entire
+         alignment are checked, and then the format is modified to
+         ensure that they will fit.  A simple %-d does not work,
+         because the other sequence (query/library) may have the large
+         coordinate.
+
+         thus, afmt and afmt0 must be modified for each sequence,
+         based on the boundaries of the alignment
+      */
+
+      if (ll0) fprintf(fd,afmt,name0,q_start,line[0],q_end);
+      if (ll01) fprintf(fd,afmt0,blank,line[1]);
+      if (ll1) fprintf(fd,afmt,name1,s_start,line[2],s_end);
+    }
+  }
+}
+
+/* scale (graph columns per query residue) used by disgraph(); it is
+   computed on the first call and reused by later calls */
+static float gscale= -1.0;
+
+/* disgraph() -- print a one-line "map" of where an alignment falls
+   on the query: a row of '-' from min0 to max0 (scaled to the line
+   width) with a ':' marking the projected end of the library
+   sequence.
+
+   n0, n1       query and library sequence lengths
+   percent, score  printed in the (markx & MX_ATYPE)==7 layout
+   min0,max0    alignment boundaries in the query
+   min1         not used
+   max1         alignment end in the library sequence
+   sq0off       offset added to the printed query coordinates
+   name0, name1 sequence names; nml is the name field width
+   mlen         total line width; the graph uses mlen-10 columns
+   markx        output-format flags; nothing but the first-call
+                header is printed unless MX_AMAP is set
+
+   The graph width and the marker positions are clamped so that they
+   always stay inside line[].
+*/
+void
+disgraph(FILE *fd, int n0,int n1, float percent, int score,
+         int min0, int min1, int max0, int max1, long sq0off,
+         char *name0, char *name1, int nml,
+         int mlen, int markx)
+{
+  int i, gstart, gstop, gend;
+  int llen;
+  char line[MAXOUT+1];
+  char afmt[16], afmtf[64];
+
+  if (nml > 6) {
+    sprintf(afmt,"%%-%ds",nml);
+  }
+  else {
+    SAFE_STRNCPY(afmt,"%-6s",sizeof(afmt));
+  }
+  SAFE_STRNCPY(afmtf,afmt,sizeof(afmtf));
+  SAFE_STRNCAT(afmtf," %4ld-%4ld: %5.1f%%:%s:\n",sizeof(afmtf));
+
+  /* line[] has room for MAXOUT characters plus the terminator */
+  llen = mlen - 10;
+  if (llen > MAXOUT) llen = MAXOUT;
+  if (llen < 1) llen = 1;
+  memset(line,' ',llen);
+
+  line[llen-1]='\0';
+  if (gscale < 0.0) {
+    /* avoid dividing by zero for an empty query */
+    gscale = (float)llen/(float)((n0 > 0) ? n0 : 1);
+    if ((markx&MX_ATYPE) == 7 )
+      fprintf(fd,afmtf,name0,sq0off,sq0off+n0-1,100.0,line);
+  }
+
+  gstart = (int)(gscale*(float)min0+0.5);
+  gstop = (int)(gscale*(float)max0+0.5);
+  gend = gstop+(int)(gscale*(float)(n1-max1));
+
+  /* keep every position inside line[0..llen-1] */
+  if (gstart > llen) gstart = llen;
+  if (gstop >= llen) gstop = llen-1;
+  if (gend >= llen) gend = llen-1;
+  if (gend < 0) gend = 0;
+  for (i=0; i<gstart; i++) line[i]=' ';
+  for (; i<gstop; i++) line[i]='-';
+  for (; i<llen; i++) line[i]=' ';
+
+  line[gend]=':';
+  line[llen]='\0';
+
+  if (markx & MX_AMAP) {
+    if ((markx & MX_ATYPE)==7) {	/* markx==4 - no alignment */
+      SAFE_STRNCPY(afmtf,afmt,sizeof(afmtf));
+      SAFE_STRNCAT(afmtf," %4ld-%4ld:%4d %5.1f%%:%s\n",sizeof(afmtf));
+      fprintf(fd,afmtf,name1,min0+sq0off,max0+sq0off-1,score,percent,line);
+    }
+    else {
+      SAFE_STRNCPY(afmtf,">",sizeof(afmtf));
+      SAFE_STRNCAT(afmtf,afmt,sizeof(afmtf));
+      SAFE_STRNCAT(afmtf," %4ld-%4ld:%s\n",sizeof(afmtf));
+      fprintf(fd,afmtf, name1,min0+sq0off,max0+sq0off-1,line);
+    }
+  }
+}
+
+/* aancpy() -- copy at most count encoded residues from from[] to
+   to[], converting each residue code to its letter.
+
+   Codes <= ppst->nsqx are translated through the alphabet
+   (ppst->sqx when ppst->ext_sq_set, ppst->sq otherwise); any other
+   value is copied unchanged.  Copying stops at the first zero in
+   from[]; to[] is always '\0'-terminated, so it must have room for
+   count+1 characters.
+
+   Each code is read as an unsigned char, so that a value above 127
+   is copied through instead of becoming a negative array index where
+   plain char is signed.
+*/
+void
+aancpy(char *to, char *from, int count, struct pstruct *ppst)
+{
+  char *tp;
+  unsigned char *sq;
+  unsigned char code;
+  int nsq;
+
+  nsq = ppst->nsqx;
+  if (ppst->ext_sq_set) {
+    sq = ppst->sqx;
+  }
+  else {
+    sq = ppst->sq;
+  }
+
+  tp=to;
+  while (count-- && *from) {
+    code = (unsigned char)*from++;
+    if (code <= nsq) *tp++ = sq[code];
+    else *tp++ = (char)code;
+  }
+  *tp='\0';
+}
+
+/* calc_coord converts the alignment boundaries in aln (amin0, amax0,
+   amin1, amax1) into display coordinates (aln->d_start0, d_stop0,
+   d_start1, d_stop1), and records the sequence start/end offsets
+   (q_start_off, q_end_off, l_start_off, l_end_off) and the effective
+   q_offset/l_offset.
+
+   For a reversed sequence (qlrev/llrev == 1) the offset is moved to
+   the far end of the sequence (offset + length) and coordinates
+   count downwards.  Library coordinates are scaled by aln->llmult.
+
+   coordinates are indexed from 1 (rather than 0) because sq starts
+   with a 0
+*/
+
+void
+calc_coord(int n0, int n1,
+           long q_offset,
+           long l_offset,
+           struct a_struct *aln)
+{
+  /* direction (+1/-1) and the "+1" corrections for start and stop */
+  int q_dir, q_start_adj, q_stop_adj;
+  int l_dir, l_start_adj, l_stop_adj;
+
+  /* query */
+  if (aln->qlrev != 1) {
+    aln->q_start_off = q_offset + 1;
+    aln->q_end_off = q_offset+n0;
+    q_dir = 1;
+    q_start_adj = 1;
+    q_stop_adj = 0;
+  }
+  else {
+    aln->q_start_off = q_offset+n0;
+    aln->q_end_off = q_offset+1;
+    q_offset += n0;
+    q_dir = -1;
+    q_start_adj = 0;
+    q_stop_adj = 1;
+  }
+
+  /* library */
+  if (aln->llrev != 1) {
+    aln->l_start_off = l_offset + 1;
+    aln->l_end_off = l_offset+n1;
+    l_dir = 1;
+    l_start_adj = 1;
+    l_stop_adj = 0;
+  }
+  else {
+    aln->l_start_off = l_offset + n1;
+    aln->l_end_off = l_offset+1;
+    l_offset += n1;
+    l_dir = -1;
+    l_start_adj = 0;
+    l_stop_adj = 1;
+  }
+
+  aln->q_offset = q_offset;
+  aln->l_offset = l_offset;
+
+  aln->d_start0 = q_offset+q_dir*aln->amin0+q_start_adj;
+  aln->d_stop0 = q_offset+q_dir*aln->amax0+q_stop_adj;
+
+  aln->d_start1 = l_offset+l_dir*aln->amin1*aln->llmult+l_start_adj;
+  aln->d_stop1 = l_offset+l_dir*aln->amax1*aln->llmult+l_stop_adj;
+}
diff --git a/src/cal_cons.c b/src/cal_cons.c
new file mode 100644
index 0000000..e61fd60
--- /dev/null
+++ b/src/cal_cons.c
@@ -0,0 +1,1226 @@
+/* cal_cons.c - routines for printing translated alignments for
+ fasta, ssearch, ggsearch, glsearch */
+
+/* $Id: cal_cons.c 1280 2014-08-21 00:47:55Z wrp $ */
+/* $Revision: 1280 $ */
+
+/* copyright (c) 1998, 1999, 2007, 2014 by William R. Pearson and The
+ Rector & Visitors of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+
+/* removed from dropgsw2.c, dropnfa.c April, 2007 */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+
+#include "defs.h"
+#include "param.h"
+#include "dyn_string.h"
+
+#if defined(FASTA) || defined(TFASTA)
+#include "dropnfa.h"
+#endif
+
+#if defined(SSEARCH) || defined(OSEARCH)
+#include "dropgsw2.h"
+#endif
+
+#ifdef LALIGN
+#include "dropgsw2.h"
+#endif
+
+#include "a_mark.h"
+
+struct update_code_str { /* run-length state kept while an alignment encoding string is built */
+ int p_op_idx; /* operation index of the pending run; init_update_data() sets it to -1 */
+ int p_op_cnt; /* length of the pending run; 0 means no run has been recorded yet */
+ int show_code; /* copy of the show_code value passed to init_update_data() */
+ int cigar_order; /* 1: count is printed before the symbol (CIGAR); 0: symbol before count */
+ int show_ext; /* 1 when SHOW_CODE_EXT is set in show_code, otherwise 0 */
+ char *op_map; /* symbol table indexed by operation index: cigar_code or ori_code */
+};
+
+static char *ori_code = "=-+*x"; /* legacy symbols, selected as op_map when show_code is not SHOW_CODE_CIGAR */
+static char *cigar_code = "MDIMX"; /* CIGAR symbols, selected as op_map for SHOW_CODE_CIGAR; same index order as ori_code */
+
+static struct update_code_str *
+init_update_data(int show_code);
+
+static void
+sprintf_code(char *tmp_str, struct update_code_str *, int op_idx, int op_cnt);
+
+static void
+update_code(char *al_str, int al_str_max,
+ struct update_code_str *update_data, int op,
+ int sim_code, unsigned char sp0, unsigned char sp1);
+
+static void
+close_update_data(char *al_str, int al_str_max,
+ struct update_code_str *update_data);
+
+extern void aancpy(char *to, char *from, int count, struct pstruct *ppst);
+extern void *init_stack(int, int);
+extern void push_stack(void *, void *);
+extern void *pop_stack(void *);
+extern void *free_stack(void *);
+
+/* returns M_NEG, M_ZERO, M_POS, M_IDENT, M_DEL (a_mark.h)
+ updates *aln->nsim, npos, nident, nmismatch */
+extern int
+align_type(int score, char sp0, char sp1, int nt_align, struct a_struct *aln, int pam_x_id_sim);
+
+extern void
+process_annot_match(int *itmp, int *pam2aa0v,
+ long ip, long ia, char *sp1, char *sp1a, const unsigned char *sq,
+ struct annot_entry *annot_arr_p, char **ann_comment,
+ void *annot_stack, int *have_push_features, int *v_delta,
+ int *d_score_p, int *d_ident_p, int *d_alen_p, struct domfeat_link **left_domain_p,
+ long *left_end_p, int init_score);
+
+extern int
+next_annot_match(int *itmp, int *pam2aa0v,
+ long ip, long ia, char *sp1, char *sp1a, const unsigned char *sq,
+ int i_annot, int n_annot, struct annot_entry **annot_arr, char **ann_comment,
+ void *annot_stack, int *have_push_features, int *v_delta,
+ int *d_score_p, int *d_ident_p, int *d_alen_p, struct domfeat_link **left_domain,
+ long *left_domain_end, int init_score);
+
+extern void
+close_annot_match (int ia, void *annot_stack, int *have_push_features,
+ int *d_score_p, int *d_ident_p, int *d_alen_p,
+ struct domfeat_link **left_domain_p,
+ long *left_end_p, int init_score);
+
+extern void
+comment_var(long i0, char sp0, long i1, char sp1, char o_sp1, char sim_char,
+ const char *ann_comment, struct dyn_string_str *annot_var_dyn,
+ int target, int d_type);
+
+void
+display_push_features(void *annot_stack, struct dyn_string_str *annot_var_dyn,
+ long i0_pos, char sp0, long i1_pos, char sp1, char sym,
+ int score, double comp, int n0, int n1,
+ void *pstat_void, int d_type);
+
+#define DP_FULL_FMT 1 /* Region: score: bits: id: ... */
+
+extern int seq_pos(int pos, int rev, int off);
+
+int calc_cons_a(const unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1,
+ int *nc,
+ struct a_struct *aln,
+ struct a_res_str *a_res,
+ struct pstruct *ppst,
+ char *seqc0, char *seqc1, char *seqca, int *cumm_seq_score,
+ const unsigned char *ann_arr,
+ const unsigned char *aa0a, const struct annot_str *annot0_p, char *seqc0a,
+ const unsigned char *aa1a, const struct annot_str *annot1_p, char *seqc1a,
+ int *score_delta,
+ struct dyn_string_str *annot_var_dyn,
+ struct f_struct *f_str,
+ void *pstat_void)
+{ /* calc_cons_a(): expand the alignment described by a_res into display strings.
+     Walks the edit script a_res->res from (a_res->min0, a_res->min1) to
+     (a_res->max0, a_res->max1) and writes, column by column:
+       seqc0, seqc1    - residue characters (through sq[]) or '-' for a gap
+       seqca           - align_type() code for aligned pairs, M_DEL for gaps
+       cumm_seq_score  - per-column score, only when it is non-NULL
+       seqc0a, seqc1a  - annotation characters, only when seqc0a is non-NULL
+     Unless LCAL_CONS is defined, flanking sequence before and after the
+     aligned region is copied into seqc0/seqc1 as well.
+     Side effects: sets aln->amin0/amax0/amin1/amax1, aln->smin0/smin1 and
+     aln->calc_last_set, zeroes the aln->n* tallies before counting, resets
+     annot_var_dyn, stores the number of aligned columns in *nc and v_delta
+     in *score_delta.
+     Returns mins + (aligned columns) + nd: leading context, alignment and
+     trailing context together. */
+ int i0, i1, nn1;
+ int op, lenc, nd, ns, itmp, p_match;
+ const unsigned char *aa1p;
+ char *sp0, *sp0a, *sp1, *sp1a, *spa, t_spa;
+ int *i_spa;
+ const unsigned char *sq;
+ int *rp;
+ int smins, mins, ntmp;
+ int have_ann = 0;
+ void *annot_stack;
+
+ /* variables for variant changes */
+ int *aa0_pam2_p;
+ char *sim_sym = aln_map_sym[5];
+ struct annot_entry **s_annot0_arr_p, **s_annot1_arr_p;
+
+ char *ann_comment;
+ int i0_annot, i1_annot; /* i0_annot, i1_annot, count through
+ the list of annotations */
+ long i0_left_end, i1_left_end; /* left-most coordinate of domain end */
+
+ int v_delta, v_tmp;
+ int d1_score, d1_ident, d1_alen;
+ int d0_score, d0_ident, d0_alen;
+ int have_push_features;
+ struct domfeat_link *left_domain_list1, *left_domain_list0;
+
+ /* variables for handling coordinate offsets */
+ long q_offset, l_offset;
+
+ *score_delta = 0;
+ i0_left_end = i1_left_end = -1;
+ left_domain_list0 = left_domain_list1 = NULL;
+ d1_score = d1_ident = d1_alen = 0;
+ d0_score = d0_ident = d0_alen = 0;
+
+ NULL_dyn_string(annot_var_dyn);
+ have_ann = (seqc0a != NULL); /* annotation output is switched on solely by a non-NULL seqc0a */
+
+ if (ppst->ext_sq_set) {
+ sq = ppst->sqx;
+ }
+ else {
+ sq = ppst->sq;
+ }
+
+#ifndef TFASTA
+ aa1p = aa1;
+ nn1 = n1;
+#else
+ aa1p = f_str->aa1x; /* TFASTA builds take the second sequence and its length from f_str, not from aa1/n1 */
+ nn1 = f_str->n10;
+#endif
+
+ aln->amin0 = a_res->min0;
+ aln->amax0 = a_res->max0;
+ aln->amin1 = a_res->min1;
+ aln->amax1 = a_res->max1;
+ aln->calc_last_set = 1;
+
+ q_offset = aln->q_offset;
+ l_offset = aln->l_offset;
+
+#ifndef LCAL_CONS
+ /* will we show all the start ?*/
+ if (min(a_res->min0,a_res->min1)<aln->llen || aln->showall==1)
+ if (a_res->min0 >= a_res->min1) { /* aa0 extends more to left */
+ smins=0;
+ if (aln->showall==1) mins = a_res->min0;
+ else mins = min(a_res->min0,aln->llcntx);
+ aancpy(seqc0,(char *)aa0+a_res->min0-mins,mins,ppst);
+ aln->smin0 = a_res->min0-mins;
+ if ((mins-a_res->min1)>0) {
+ memset(seqc1,' ',mins-a_res->min1); /* shorter left flank is padded with blanks so both rows stay column-aligned */
+ aancpy(seqc1+mins-a_res->min1,(char *)aa1p,a_res->min1,ppst);
+ aln->smin1 = 0;
+ }
+ else {
+ aancpy(seqc1,(char *)aa1p+a_res->min1-mins,mins,ppst);
+ aln->smin1 = a_res->min1-mins;
+ }
+ }
+ else {
+ smins=0;
+ if (aln->showall == 1) mins=a_res->min1;
+ else mins = min(a_res->min1,aln->llcntx);
+ aancpy(seqc1,(char *)(aa1p+a_res->min1-mins),mins,ppst);
+ aln->smin1 = a_res->min1-mins;
+ if ((mins-a_res->min0)>0) {
+ memset(seqc0,' ',mins-a_res->min0);
+ aancpy(seqc0+mins-a_res->min0,(char *)aa0,a_res->min0,ppst);
+ aln->smin0 = 0;
+ }
+ else {
+ aancpy(seqc0,(char *)aa0+a_res->min0-mins,mins,ppst);
+ aln->smin0 = a_res->min0-mins;
+ }
+ }
+ else {
+ mins= min(aln->llcntx,min(a_res->min0,a_res->min1));
+ smins=mins;
+ aln->smin0=a_res->min0 - smins;
+ aln->smin1=a_res->min1 - smins;
+ aancpy(seqc0,(char *)aa0+a_res->min0-mins,mins,ppst);
+ aancpy(seqc1,(char *)aa1p+a_res->min1-mins,mins,ppst);
+ }
+ /* set the alignment code to zero for context */
+ memset(seqca,0,mins);
+ if (have_ann) {
+ memset(seqc0a,' ',mins);
+ memset(seqc1a,' ',mins);
+ }
+#else /* no flanking context */
+ smins = mins = 0;
+ aln->smin0=a_res->min0;
+ aln->smin1=a_res->min1;
+#endif
+
+ /* now get the middle */
+
+ spa = seqca+mins; /* all output cursors start just past the mins columns of leading context */
+ if (cumm_seq_score) i_spa = cumm_seq_score+mins;
+ sp0 = seqc0+mins;
+ sp1 = seqc1+mins;
+
+ if (have_ann) {
+ sp0a = seqc0a+mins;
+ sp1a = seqc1a+mins;
+ }
+
+ rp = a_res->res; /* edit script as consumed below: 0 = aligned pair, >0 = that many columns with a gap in aa0, <0 = gap in aa1 */
+ lenc = aln->nident = aln->nmismatch =
+ aln->npos = aln->nsim = aln->ngap_q = aln->ngap_l = aln->nfs = op = 0;
+ p_match = 1;
+ i0 = a_res->min0;
+ i1 = a_res->min1;
+
+ v_delta = 0;
+ i0_annot = i1_annot = 0;
+ annot_stack = NULL;
+ s_annot0_arr_p = s_annot1_arr_p = NULL;
+ have_push_features=0;
+ if (have_ann) {
+ if ((annot1_p && annot1_p->n_annot>0) || (annot0_p && annot0_p->n_annot > 0)) {annot_stack = init_stack(64,64);}
+ if (annot1_p && annot1_p->n_annot > 0) {
+ s_annot1_arr_p = annot1_p->s_annot_arr_p;
+
+ while (i1_annot < annot1_p->n_annot) { /* pre-scan: handle annotations positioned before the first aligned residue */
+ if (s_annot1_arr_p[i1_annot]->pos >= i1 + l_offset) {break;}
+ if (s_annot1_arr_p[i1_annot]->end < i1 + l_offset) {i1_annot++; continue;}
+
+ if (s_annot1_arr_p[i1_annot]->label == '-') {
+ process_annot_match(&itmp, aa0_pam2_p, l_offset+seq_pos(i1,aln->llrev,0), q_offset + seq_pos(i0,aln->qlrev,0), /* NOTE(review): aa0_pam2_p has not been assigned yet when this runs - confirm process_annot_match() ignores it for '-' labels */
+ sp1, sp1a, sq, s_annot1_arr_p[i1_annot], &ann_comment,
+ annot_stack, &have_push_features, &v_delta,
+ &d1_score, &d1_ident, &d1_alen, &left_domain_list1, &i1_left_end, 0);
+ }
+ i1_annot++;
+ }
+ }
+
+ if (annot0_p && annot0_p->n_annot>0) {
+ s_annot0_arr_p = annot0_p->s_annot_arr_p;
+
+ while (i0_annot < annot0_p->n_annot && s_annot0_arr_p[i0_annot]->pos < i0 + q_offset) {
+ if (s_annot0_arr_p[i0_annot]->pos >= i0 + q_offset) {break;} /* cannot fire: the loop condition already requires pos < i0 + q_offset */
+ if (s_annot0_arr_p[i0_annot]->end < i0 + q_offset) {i0_annot++; continue;}
+
+ if (s_annot0_arr_p[i0_annot]->label == '-') {
+ process_annot_match(&itmp, aa0_pam2_p, q_offset+seq_pos(i0,aln->qlrev,0), l_offset + seq_pos(i1,aln->llrev,0),
+ sp0, sp0a, sq, s_annot0_arr_p[i0_annot], &ann_comment,
+ annot_stack, &have_push_features, &v_delta,
+ &d0_score, &d0_ident, &d0_alen, &left_domain_list0, &i0_left_end, 0);
+ }
+ i0_annot++;
+ }
+ }
+ }
+
+ while (i0 < a_res->max0 || i1 < a_res->max1) {
+ /* match/mismatch (aligned residues) */
+ /* here, op is the "current" encoding, and *rp is the next one */
+ if (op == 0 && *rp == 0) {
+ op = *rp++;
+ lenc++;
+
+ if (ppst->pam_pssm) {aa0_pam2_p = ppst->pam2p[0][i0];} /* scoring row: position-specific when pam_pssm is set, otherwise selected by the residue aa0[i0] */
+ else {aa0_pam2_p = ppst->pam2[0][aa0[i0]];}
+
+ itmp=aa0_pam2_p[aa1p[i1]];
+
+ *sp0 = sq[aa0[i0]];
+ *sp1 = sq[aa1p[i1]];
+
+ if (have_ann) {
+ have_push_features = 0;
+ *sp0a = *sp1a = ' ';
+ if (aa0a) {*sp0a = ann_arr[aa0a[i0]];}
+ if (aa1a) {*sp1a = ann_arr[aa1a[i1]];}
+ if (s_annot1_arr_p) {
+ if (i1+l_offset == s_annot1_arr_p[i1_annot]->pos || i1+l_offset == i1_left_end) { /* NOTE(review): i1_annot is not range-checked against annot1_p->n_annot here - confirm next_annot_match() never returns n_annot */
+
+ i1_annot = next_annot_match(&itmp, aa0_pam2_p, l_offset+seq_pos(i1,aln->llrev,0),
+ q_offset + seq_pos(i0,aln->qlrev,0), sp1, sp1a, sq,
+ i1_annot, annot1_p->n_annot, s_annot1_arr_p,
+ &ann_comment, annot_stack, &have_push_features, &v_delta,
+ &d1_score, &d1_ident, &d1_alen, &left_domain_list1, &i1_left_end, 0);
+
+ /* must be out of the loop to capture the last value */
+ if (sq[aa1p[i1]] != *sp1) { /* the displayed residue no longer matches the sequence, i.e. the annotation call replaced it */
+ t_spa = align_type(itmp, *sp0, *sp1, ppst->nt_align, NULL, ppst->pam_x_id_sim);
+
+ comment_var(q_offset+seq_pos(i0,aln->qlrev,0), *sp0,
+ l_offset+seq_pos(i1,aln->llrev,0), *sp1,
+ sq[aa1p[i1]], sim_sym[t_spa], ann_comment,
+ annot_var_dyn, 1, 1);
+ }
+ }
+ d1_score += itmp;
+ }
+
+ if (s_annot0_arr_p) {
+ if (i0 + q_offset == s_annot0_arr_p[i0_annot]->pos || i0 + q_offset == i0_left_end) {
+
+ i0_annot = next_annot_match(&itmp, aa0_pam2_p, q_offset+seq_pos(i0,aln->qlrev,0),
+ l_offset + seq_pos(i1,aln->llrev,0), sp0, sp0a, sq,
+ i0_annot, annot0_p->n_annot, s_annot0_arr_p,
+ &ann_comment, annot_stack, &have_push_features, &v_delta,
+ &d0_score, &d0_ident, &d0_alen, &left_domain_list0, &i0_left_end, 0);
+
+ /* must be out of the loop to capture the last value */
+ if (sq[aa0[i0]] != *sp0) {
+ t_spa = align_type(itmp, *sp0, *sp1, ppst->nt_align, NULL, ppst->pam_x_id_sim);
+
+ comment_var(q_offset+seq_pos(i0,aln->qlrev,0), *sp0,
+ l_offset+seq_pos(i1,aln->llrev,0), *sp1,
+ sq[aa0[i0]], sim_sym[t_spa], ann_comment,
+ annot_var_dyn, 0, 1);
+ }
+ }
+ d0_score += itmp;
+ }
+ sp0a++; sp1a++;
+ }
+
+ *spa = align_type(itmp, *sp0, *sp1, ppst->nt_align, aln, ppst->pam_x_id_sim); /* passing aln (not NULL) lets align_type() update the aln tallies */
+
+ d1_alen++;
+ d0_alen++;
+ if (*spa == M_IDENT) {
+ d1_ident++;
+ d0_ident++;
+ }
+
+ /* now we have done all the ?modified identity checks, display
+ potential site annotations */
+ if (have_ann && have_push_features) {
+ display_push_features(annot_stack, annot_var_dyn,
+ q_offset+seq_pos(i0,aln->qlrev,0), *sp0,
+ l_offset+seq_pos(i1,aln->llrev,0), *sp1,
+ sim_sym[*spa],
+ a_res->rst.score[ppst->score_ix], a_res->rst.comp, n0, n1,
+ pstat_void, DP_FULL_FMT);
+ have_push_features = 0;
+ }
+
+ if (cumm_seq_score) *i_spa++ = itmp;
+ i0++; i1++;
+ sp0++; sp1++; spa++;
+ }
+ else { /* indel */
+ if (op==0) { /* first column of a gap run: load the run length and charge the gap-open penalty once */
+ op = *rp++;
+ if (cumm_seq_score) *i_spa = ppst->gdelval;
+ d1_score += ppst->gdelval;
+ d0_score += ppst->gdelval;
+ }
+ if (cumm_seq_score) *i_spa++ += ppst->ggapval; /* NOTE(review): on later columns of a run *i_spa was not set by this function before the += - confirm the caller zeroes cumm_seq_score */
+ d1_score += ppst->ggapval; d1_alen++;
+ d0_score += ppst->ggapval; d0_alen++;
+
+ if (op>0) { /* insertion in aa0 */
+ *sp1 = sq[aa1p[i1]];
+ *sp0 = '-';
+ *spa++ = M_DEL; /* NOTE(review): spa is advanced here, so sim_sym[*spa] below reads the next, not-yet-written column - confirm intended */
+ if (have_ann) {
+ have_push_features = 0;
+ *sp0a++ = ' ';
+ if (aa1a) *sp1a = ann_arr[aa1a[i1]];
+ else *sp1a = ' ';
+ if (s_annot1_arr_p) {
+ if (i1+l_offset == s_annot1_arr_p[i1_annot]->pos || i1+l_offset == i1_left_end) {
+ i1_annot = next_annot_match(&itmp, aa0_pam2_p, l_offset+seq_pos(i1,aln->llrev,0),
+ q_offset+seq_pos(i0,aln->qlrev,0), sp1, sp1a, sq,
+ i1_annot, annot1_p->n_annot, s_annot1_arr_p,
+ &ann_comment, annot_stack, &have_push_features, &v_delta,
+ &d1_score, &d1_ident, &d1_alen, &left_domain_list1, &i1_left_end,
+ ppst->ggapval+ppst->gdelval);
+ }
+ }
+
+ if (have_push_features) {
+ display_push_features(annot_stack, annot_var_dyn,
+ q_offset+seq_pos(i0,aln->qlrev,0), *sp0,
+ l_offset+seq_pos(i1,aln->llrev,0), *sp1,
+ sim_sym[*spa],
+ a_res->rst.score[ppst->score_ix], a_res->rst.comp, n0, n1,
+ pstat_void, DP_FULL_FMT);
+ have_push_features = 0;
+ }
+ sp1a++;
+ }
+
+ sp0++;
+ sp1++;
+ i1++;
+ op--;
+ lenc++;
+ aln->ngap_q++;
+ }
+ else { /* insertion in aa1 */
+ *sp1 = '-';
+ *spa++ = M_DEL;
+ *sp0 = sq[aa0[i0]];
+ if (have_ann) {
+ have_push_features = 0;
+ *sp1a++ = ' ';
+ if (aa0a) *sp0a = ann_arr[aa0a[i0]];
+ else *sp0a = ' ';
+ if (s_annot0_arr_p) {
+ if (i0+q_offset == s_annot0_arr_p[i0_annot]->pos || i0+q_offset == i0_left_end) {
+ i0_annot = next_annot_match(&itmp, ppst->pam2[0][aa1[i1]], q_offset+seq_pos(i0,aln->qlrev,0), /* NOTE(review): indexes aa1 here while every other access in this function uses aa1p (they differ under TFASTA) - confirm */
+ l_offset+seq_pos(i1,aln->llrev,0), sp0, sp0a, sq,
+ i0_annot, annot0_p->n_annot, s_annot0_arr_p,
+ &ann_comment, annot_stack, &have_push_features, &v_delta,
+ &d0_score, &d0_ident, &d0_alen, &left_domain_list0, &i0_left_end,
+ ppst->ggapval+ppst->gdelval);
+
+ }
+ }
+
+ if (have_push_features) {
+ display_push_features(annot_stack, annot_var_dyn,
+ q_offset+seq_pos(i0,aln->qlrev,0), *sp0,
+ l_offset+seq_pos(i1,aln->llrev,0), *sp1,
+ sim_sym[*spa],
+ a_res->rst.score[ppst->score_ix], a_res->rst.comp, n0, n1, pstat_void, DP_FULL_FMT);
+ have_push_features = 0;
+ }
+ sp0a++;
+ }
+
+ i0++;
+ sp0++;
+ sp1++;
+ op++;
+ lenc++;
+ aln->ngap_l++;
+ }
+ }
+ }
+
+ *score_delta = v_delta;
+
+ *nc = lenc; /* aligned columns only; the return value below also counts the context */
+ if (have_ann) {
+ *sp0a = *sp1a = '\0';
+ have_push_features = 0;
+ /* check for left ends after alignment */
+ if (annot1_p && i1_left_end > 0) {
+ close_annot_match(-1, annot_stack, &have_push_features,
+ &d1_score, &d1_ident, &d1_alen, &left_domain_list1, &i1_left_end,
+ 0);
+ }
+
+ if (annot0_p && i0_left_end > 0) {
+ close_annot_match(-1, annot_stack, &have_push_features,
+ &d0_score, &d0_ident, &d0_alen, &left_domain_list0, &i0_left_end,
+ 0);
+ }
+
+ if (have_push_features) {
+ display_push_features(annot_stack, annot_var_dyn,
+ a_res->max0-1 + q_offset, *sp0, /* NOTE(review): sp0, sp1 and spa point one past the last written column at this point - confirm the values read are meaningful */
+ a_res->max1-1 + l_offset, *sp1,
+ sim_sym[*spa],
+ a_res->rst.score[ppst->score_ix], a_res->rst.comp, n0, n1, pstat_void, DP_FULL_FMT);
+ }
+ }
+
+ *spa = '\0';
+
+#ifndef LCAL_CONS /* have context around alignment */
+ /* now we have the middle, get the right end */
+ if (!aln->llcntx_set) {
+ ns = mins + lenc + aln->llen; /* show an extra line? */
+ ns -= (itmp = ns %aln->llen); /* itmp = left over on last line */
+ if (itmp>aln->llen/2) ns += aln->llen; /* more than 1/2 , use another*/
+ nd = ns - (mins+lenc); /* this much extra */
+ }
+ else nd = aln->llcntx;
+
+ if (nd > max(n0-a_res->max0,nn1-a_res->max1)) /* never show more trailing context than the longer remaining tail */
+ nd = max(n0-a_res->max0,nn1-a_res->max1);
+
+ if (aln->showall==1) {
+ nd = max(n0-a_res->max0,nn1-a_res->max1); /* reset for showall=1 */
+ /* get right end */
+ aancpy(seqc0+mins+lenc,(char *)aa0+a_res->max0,n0-a_res->max0,ppst);
+ aancpy(seqc1+mins+lenc,(char *)aa1p+a_res->max1,nn1-a_res->max1,ppst);
+ /* fill with blanks - this is required to use one 'nc' */
+ memset(seqc0+mins+lenc+n0-a_res->max0,' ',nd-(n0-a_res->max0));
+ memset(seqc1+mins+lenc+nn1-a_res->max1,' ',nd-(nn1-a_res->max1));
+ }
+ else {
+ if ((nd-(n0-a_res->max0))>0) {
+ aancpy(seqc0+mins+lenc,(char *)aa0+a_res->max0,(n0-a_res->max0),ppst);
+ memset(seqc0+mins+lenc+n0-a_res->max0,' ',nd-(n0-a_res->max0));
+ }
+ else {
+ aancpy(seqc0+mins+lenc,(char *)aa0+a_res->max0,nd,ppst);
+ }
+
+ if ((nd-(nn1-a_res->max1))>0) {
+ aancpy(seqc1+mins+lenc,(char *)aa1p+a_res->max1,nn1-a_res->max1,ppst);
+ memset(seqc1+mins+lenc+nn1-a_res->max1,' ',nd-(nn1-a_res->max1));
+ }
+ else {
+ aancpy(seqc1+mins+lenc,(char *)aa1p+a_res->max1,nd,ppst);
+ }
+ }
+ if (have_ann) {
+ memset(seqc0a+mins+lenc,' ',nd);
+ memset(seqc1a+mins+lenc,' ',nd);
+ /*
+ ntmp = nd-(n0-a_res->max0);
+ if (ntmp > 0) memset(seqc0a+mins+lenc+n0-a_res->max0,' ',ntmp);
+ ntmp = nd-(nn1-a_res->max1);
+ if (ntmp > 0) memset(seqc1a+mins+lenc+nn1-a_res->max1,' ',ntmp);
+ */
+ }
+#else
+ nd = 0;
+#endif
+
+ /* fprintf(stderr,"%d\n",mins+lenc+nd); */
+
+ lenc = mins + lenc + nd; /* total displayed length: leading context + alignment + trailing context */
+
+ if (annot0_p || annot1_p) free_stack(annot_stack); /* NOTE(review): annot_stack is still NULL when have_ann is 0 or both n_annot are 0 - confirm free_stack() accepts NULL */
+
+ return lenc;
+}
+
+/* calc_astruct - copy the alignment boundaries stored in *a_res_p
+   (min0/max0 and min1/max1) into the amin0/amax0 and amin1/amax1
+   fields of *aln_p.  f_str is accepted but not used.
+*/
+void
+calc_astruct(struct a_struct *aln_p, struct a_res_str *a_res_p, struct f_struct *f_str)
+{
+  /* aln_p->calc_last_set is intentionally neither consulted nor
+     changed: calc_astruct, calc_cons_a and calc_code all perform
+     exactly this assignment */
+
+  /* sequence 1 boundaries */
+  aln_p->amin1 = a_res_p->min1;
+  aln_p->amax1 = a_res_p->max1;
+
+  /* sequence 0 boundaries */
+  aln_p->amin0 = a_res_p->min0;
+  aln_p->amax0 = a_res_p->max0;
+}
+
+
+
+/* init_update_data - allocate and initialize the run-length state
+   used by update_code() / close_update_data().
+
+   show_code selects the output style:
+     (show_code & SHOW_CODE_MASK) == SHOW_CODE_CIGAR
+         -> cigar_code symbols, count printed before the symbol
+     otherwise
+         -> ori_code symbols, symbol printed before the count
+     SHOW_CODE_EXT set -> show_ext = 1
+
+   Returns the new structure, or NULL (after a message on stderr) when
+   the allocation fails.  The structure is released by
+   close_update_data().
+*/
+static struct update_code_str *
+init_update_data(int show_code) {   /* explicit int: the implicit-int K&R form is not valid C99 and did not match the prototype */
+
+  struct update_code_str *update_data_p;
+
+  if ((update_data_p = (struct update_code_str *)calloc(1,sizeof(struct update_code_str)))==NULL) {
+    fprintf(stderr,"*** error [%s:%d] - init_update_data(): cannot allocate update_code_str\n",
+            __FILE__, __LINE__);
+    return NULL;
+  }
+
+  /* no run is pending yet */
+  update_data_p->p_op_idx = -1;
+  update_data_p->p_op_cnt = 0;
+  update_data_p->show_code = show_code;
+
+  if ((show_code & SHOW_CODE_MASK) == SHOW_CODE_CIGAR) {
+    update_data_p->op_map = cigar_code;
+    update_data_p->cigar_order = 1;
+  }
+  else {
+    update_data_p->op_map = ori_code;
+    update_data_p->cigar_order = 0;
+  }
+
+  if ((show_code & SHOW_CODE_EXT) == SHOW_CODE_EXT) {
+    update_data_p->show_ext = 1;
+  }
+  else {
+    update_data_p->show_ext = 0;
+  }
+
+  return update_data_p;
+}
+
+/* close_update_data - append the last pending run to al_str and free
+   the run-length state.
+
+   al_str      - NUL-terminated encoding string being built
+   al_str_max  - bytes still available in al_str, counting the byte
+                 currently holding its terminating NUL (callers pass
+                 al_str_n - strlen(al_str))
+   up_dp       - state from init_update_data(); NULL is accepted and
+                 makes this a no-op
+*/
+static void
+close_update_data(char *al_str, int al_str_max,
+                  struct update_code_str *up_dp) {
+  char tmp_cnt[MAX_SSTR];
+
+  if (!up_dp) return;
+
+  /* with no recorded operation p_op_idx is still -1, which must not
+     be used to index op_map[] */
+  if (up_dp->p_op_cnt > 0) {
+    sprintf_code(tmp_cnt,up_dp, up_dp->p_op_idx, up_dp->p_op_cnt);
+    /* strncat() writes up to n characters PLUS a terminating NUL, so
+       only al_str_max-1 characters fit in the remaining space */
+    if (al_str_max > 1) strncat(al_str,tmp_cnt,(size_t)(al_str_max-1));
+  }
+
+  free(up_dp);
+}
+
+/* update_code() has been modified to work more correctly with
+ ggsearch/glsearch, which, because alignments can start with either
+ insertions or deletions, can produce an initial code of "0=". When
+ that happens, it is ignored and no code is added.
+
+ *al_str - alignment string [al_str_max] - not dynamic
+ op -- encoded operation, currently 0=match, 1-delete, 2-insert, 3-term-match, 4-mismatch
+ op_cnt -- length of run
+ show_code -- SHOW_CODE_CIGAR uses cigar_code, otherwise legacy
+*/
+
+/* sprintf_code - format one run (operation op_idx repeated op_cnt
+   times) into tmp_str.  The symbol comes from up_dp->op_map[op_idx];
+   it is written after the count when up_dp->cigar_order is set and
+   before the count otherwise.
+*/
+static void
+sprintf_code(char *tmp_str, struct update_code_str *up_dp, int op_idx, int op_cnt) {
+  char op_sym = up_dp->op_map[op_idx];
+
+  if (!up_dp->cigar_order) {
+    /* legacy order: symbol, then run length */
+    sprintf(tmp_str,"%c%d",op_sym,op_cnt);
+    return;
+  }
+
+  /* CIGAR order: run length, then symbol */
+  sprintf(tmp_str,"%d%c",op_cnt,op_sym);
+}
+
+/* update_code - record one alignment column in the run-length state,
+   appending the previous run to al_str whenever the operation changes.
+
+   al_str      - NUL-terminated encoding string being built
+   al_str_max  - bytes still available in al_str, counting the byte
+                 currently holding its terminating NUL
+   up_dp       - state from init_update_data(); NULL makes this a no-op
+   op          - 0 match, 1 deletion, 2 insertion, 3 termination (*:*);
+                 any other value is ignored
+   sim_code    - align_type() result for the column; only consulted
+                 for op 0/3 when up_dp->show_ext is set
+   sp0, sp1    - the two aligned residue characters
+
+   Internally an aligned column can be re-labelled:
+     op 4 (mismatch)     when show_ext is set and the column is not an
+                         identity
+     op 3 (termination)  when either residue is '*' and the legacy
+                         (non-CIGAR) order is in use
+*/
+static void
+update_code(char *al_str, int al_str_max,
+            struct update_code_str *up_dp, int op,
+            int sim_code, unsigned char sp0, unsigned char sp1)
+{
+  char tmp_cnt[MAX_SSTR];
+
+  /* init_update_data() returns NULL when allocation fails */
+  if (!up_dp) return;
+
+  /* the very first column only starts a run; its op is stored as
+     given, without the re-labelling done below */
+  if (up_dp->p_op_cnt == 0) {
+    up_dp->p_op_idx = op;
+    up_dp->p_op_cnt = 1;
+    return;
+  }
+
+  if (op == 0 || op == 3) {
+    if (sp0 != '*' && sp1 != '*') { /* default case, not termination */
+      if (up_dp->show_ext && sim_code != M_IDENT) { op = 4;}
+    }
+    else { /* at least one residue is a termination codon */
+      if (!up_dp->cigar_order) { op = 3;}
+      else if (up_dp->show_ext && (sp0 != sp1)) { op = 4;}
+    }
+  }
+  else if (op != 1 && op != 2) {
+    return; /* unknown operation: ignored */
+  }
+
+  /* same operation as the pending run: just extend it */
+  if (op == up_dp->p_op_idx) {
+    up_dp->p_op_cnt++;
+    return;
+  }
+
+  /* operation changed: flush the pending run and start a new one */
+  sprintf_code(tmp_cnt,up_dp, up_dp->p_op_idx,up_dp->p_op_cnt);
+  /* strncat() writes up to n characters PLUS a terminating NUL, so
+     only al_str_max-1 characters fit in the remaining space */
+  if (al_str_max > 1) strncat(al_str,tmp_cnt,(size_t)(al_str_max-1));
+  up_dp->p_op_idx = op;
+  up_dp->p_op_cnt = 1;
+}
+
+/* build an array of match/ins/del - length strings */
+/* 5-June-2014 - modified to split "match" encoding into identical (=)
+ and mismatch (X)
+
+ To support domain-based scoring, this function iterates through
+ every aligned position, including insertions and deletions (which
+ are encoded as runs).
+ */
+
+/* calc_code - encode the alignment in a_res as a run-length string.
+
+   Walks the edit script a_res->res from (a_res->min0, a_res->min1) to
+   (a_res->max0, a_res->max1).  Every column is passed to
+   update_code(), which appends finished runs to al_str; annotation
+   and variant text is appended to ann_code_dyn.
+
+   aa0, n0 / aa1, n1 - the two encoded sequences and their lengths
+                       (TFASTA builds read the second sequence from
+                       f_str->aa1x instead of aa1)
+   aln               - nident, nmismatch, nsim, npos, ngap_q, ngap_l
+                       and nfs are zeroed here; ngap_q/ngap_l are
+                       counted here, aln is also handed to align_type()
+   al_str, al_str_n  - NUL-terminated output buffer and its capacity
+   ann_arr           - maps annotation codes in aa0a/aa1a to characters
+   aa0a, aa1a        - per-residue annotation codes, either may be NULL
+   annot0_p/annot1_p - annotation lists, either may be NULL
+   score_delta       - receives the accumulated v_delta
+   display_code      - SHOW_CODE_MASK/SHOW_CODE_EXT bits choose the
+                       encoding; SHOW_ANNOT_FULL chooses annotation
+                       format 1 instead of 2
+
+   Returns the number of alignment columns.
+*/
+int calc_code(const unsigned char *aa0, int n0,
+              const unsigned char *aa1, int n1,
+              struct a_struct *aln,
+              struct a_res_str *a_res,
+              struct pstruct *ppst,
+              char *al_str, int al_str_n,
+              const unsigned char *ann_arr,
+              const unsigned char *aa0a,
+              const struct annot_str *annot0_p,
+              const unsigned char *aa1a,
+              const struct annot_str *annot1_p,
+              struct dyn_string_str *ann_code_dyn,
+              int *score_delta,
+              struct f_struct *f_str,
+              void *pstat_void,
+              int display_code)
+{
+  int i0, i1;
+  int op, lenc;
+  const unsigned char *aa1p;
+  char sp0, sp1;
+  struct update_code_str *update_data_p;
+  unsigned char *sq;
+  int *rp;
+  int have_ann=0;
+  char ann_ch0, ann_ch1;
+  char tmp_astr[MAX_STR];
+  int sim_code, t_spa;
+  char *sim_sym = aln_map_sym[MX_ACC];
+  int aa0c, aa1c, itmp;
+  int show_code, annot_fmt;
+
+  /* variables for variant changes, regions */
+  int *aa0_pam2_p;
+  void *annot_stack;
+  struct annot_entry **s_annot0_arr_p;
+  struct annot_entry **s_annot1_arr_p;
+  int i0_annot, i1_annot, v_delta;
+  long i0_left_end, i1_left_end;
+  int d1_score, d1_ident, d1_alen;
+  int d0_score, d0_ident, d0_alen;
+  struct domfeat_link *left_domain_list1, *left_domain_list0;
+  int have_push_features;
+  long q_offset, l_offset;
+
+  *score_delta = 0;
+  i0_left_end = i1_left_end = -1;
+  left_domain_list0 = left_domain_list1 = NULL;
+  d1_score = d1_ident = d1_alen = 0;
+  d0_score = d0_ident = d0_alen = 0;
+
+  /* until the first aligned pair has been classified, report a gap;
+     previously sim_code was read without ever being assigned */
+  sim_code = M_DEL;
+  /* NOTE(review): the annotation pre-scan below passes aa0_pam2_p
+     before the main loop assigns it; NULL at least makes that value
+     defined - confirm process_annot_match() ignores it for '-' labels */
+  aa0_pam2_p = NULL;
+
+  show_code = (display_code & (SHOW_CODE_MASK+SHOW_CODE_EXT)); /* see defs.h; SHOW_CODE_ALIGN=2,_CIGAR=3,_CIGAR_EXT=4 */
+  annot_fmt = 2;
+  if (display_code & SHOW_ANNOT_FULL) {
+    annot_fmt = 1;
+  }
+
+  /* have_ann: 2 = both sequences carry annotation codes, 1 = one, 0 = none */
+  if (aa0a != NULL && aa1a != NULL) { have_ann = 2;}
+  else if (aa0a != NULL || aa1a != NULL) { have_ann = 1;}
+  else {have_ann = 0;}
+
+  if (ppst->ext_sq_set) { sq = ppst->sqx;}
+  else {sq = ppst->sq;}
+
+#ifndef TFASTA
+  aa1p = aa1;
+#else
+  aa1p = f_str->aa1x;
+#endif
+
+  rp = a_res->res;
+  lenc = aln->nident = aln->nmismatch = aln->nsim = aln->npos = aln->ngap_q = aln->ngap_l = aln->nfs = 0;
+
+  update_data_p = init_update_data(show_code);
+
+  i0 = a_res->min0;
+  i1 = a_res->min1;
+
+  q_offset = aln->q_offset;
+  l_offset = aln->l_offset;
+
+  op = 0;
+
+  v_delta = 0;
+  i0_annot = i1_annot = 0;
+  annot_stack = NULL;
+  s_annot0_arr_p = s_annot1_arr_p = NULL;
+  if (have_ann) {
+    have_push_features = 0;
+    if (annot0_p || annot1_p) annot_stack = init_stack(64,64);
+
+    /* pre-scan: handle annotations positioned before the first
+       aligned residue of each sequence */
+    if (annot1_p && annot1_p->n_annot > 0) {
+      s_annot1_arr_p = annot1_p->s_annot_arr_p;
+      while (i1_annot < annot1_p->n_annot) {
+        if (s_annot1_arr_p[i1_annot]->pos >= i1 + l_offset) {break;}
+        if (s_annot1_arr_p[i1_annot]->end < i1 + l_offset) {i1_annot++; continue;}
+
+        if (s_annot1_arr_p[i1_annot]->label == '-') {
+          process_annot_match(&itmp, aa0_pam2_p, l_offset+seq_pos(i1,aln->llrev,0), q_offset + seq_pos(i0,aln->qlrev,0),
+                              &sp1, NULL, sq, s_annot1_arr_p[i1_annot], NULL,
+                              annot_stack, &have_push_features, &v_delta,
+                              &d1_score, &d1_ident, &d1_alen, &left_domain_list1, &i1_left_end, 0);
+        }
+        i1_annot++;
+      }
+    }
+
+    if (annot0_p && annot0_p->n_annot > 0) {
+      s_annot0_arr_p = annot0_p->s_annot_arr_p;
+      while (i0_annot < annot0_p->n_annot && s_annot0_arr_p[i0_annot]->pos < i0+q_offset) {
+        if (s_annot0_arr_p[i0_annot]->pos >= i0 + q_offset) {break;}
+        if (s_annot0_arr_p[i0_annot]->end < i0 + q_offset) {i0_annot++; continue;}
+
+        if (s_annot0_arr_p[i0_annot]->label == '-') {
+          process_annot_match(&itmp, aa0_pam2_p, q_offset+seq_pos(i0,aln->qlrev,0), l_offset + seq_pos(i1,aln->llrev,0),
+                              &sp0, NULL, sq, s_annot0_arr_p[i0_annot], NULL,
+                              annot_stack, &have_push_features, &v_delta,
+                              &d0_score, &d0_ident, &d0_alen, &left_domain_list0, &i0_left_end, 0);
+        }
+        i0_annot++;
+      }
+    }
+  }
+
+  while (i0 < a_res->max0 || i1 < a_res->max1) {
+    /* match/mismatch (aligned residues) */
+    /* here, op is the "current" encoding, and *rp is the next one */
+
+    /* there is an op==0 for every aligned/matched residue.
+       insertions in aa0 > 0,
+       deletions < 0
+
+       for enhanced CIGAR, need run lengths for identities, mismatches
+    */
+
+    if (ppst->pam_pssm) {aa0_pam2_p = ppst->pam2p[0][i0];}
+    else {aa0_pam2_p = ppst->pam2[0][aa0[i0]];}
+
+    if (op == 0 && *rp == 0) {
+      aa0c = aa0[i0];
+      aa1c = aa1p[i1];
+      itmp = aa0_pam2_p[aa1c];
+      sp0 = sq[aa0c];
+      sp1 = sq[aa1c];
+
+      /* variant annot1_p annotations can cause substitution */
+      if (s_annot1_arr_p) {
+        if (i1+l_offset == s_annot1_arr_p[i1_annot]->pos || i1+l_offset == i1_left_end) {
+
+          i1_annot = next_annot_match(&itmp, aa0_pam2_p, l_offset+seq_pos(i1,aln->llrev,0),
+                                      q_offset+seq_pos(i0,aln->qlrev,0), &sp1, NULL, sq,
+                                      i1_annot, annot1_p->n_annot, s_annot1_arr_p,
+                                      NULL, annot_stack, &have_push_features, &v_delta,
+                                      &d1_score, &d1_ident, &d1_alen, &left_domain_list1, &i1_left_end, 0);
+
+          if (sq[aa1c] != sp1) {
+            t_spa = align_type(itmp, sp0, sp1, ppst->nt_align, NULL, ppst->pam_x_id_sim);
+            comment_var(q_offset+seq_pos(i0,aln->qlrev,0), sp0,
+                        l_offset+seq_pos(i1,aln->llrev,0), sp1,
+                        sq[aa1c], sim_sym[t_spa], NULL,
+                        ann_code_dyn, 1, annot_fmt);
+          }
+        }
+        d1_score += itmp;
+      }
+
+      if (s_annot0_arr_p) {
+        if (i0+q_offset == s_annot0_arr_p[i0_annot]->pos || i0+q_offset == i0_left_end) {
+
+          i0_annot = next_annot_match(&itmp, aa0_pam2_p, q_offset+seq_pos(i0,aln->qlrev,0),
+                                      l_offset+seq_pos(i1,aln->llrev,0), &sp0, NULL, sq,
+                                      i0_annot, annot0_p->n_annot, s_annot0_arr_p,
+                                      NULL, annot_stack, &have_push_features, &v_delta,
+                                      &d0_score, &d0_ident, &d0_alen, &left_domain_list0, &i0_left_end, 0);
+
+          /* check for sequence change from variant */
+          if (sq[aa0c] != sp0) {
+            t_spa = align_type(itmp, sp0, sp1, ppst->nt_align, NULL, ppst->pam_x_id_sim);
+            comment_var(q_offset+seq_pos(i0,aln->qlrev,0), sp0,
+                        l_offset+seq_pos(i1,aln->llrev,0), sp1,
+                        sq[aa0c], sim_sym[t_spa], NULL,
+                        ann_code_dyn, 0, annot_fmt);
+          }
+        }
+        d0_score += itmp;
+      }
+
+      d0_alen++;
+      d1_alen++;
+      /* ASSIGN the column classification (this was "==", which left
+         sim_code unset and made the identity test always false) */
+      if ((sim_code = align_type(itmp, sp0, sp1, ppst->nt_align, aln, ppst->pam_x_id_sim)) == M_IDENT) {
+        d0_ident++;
+        d1_ident++;
+      }
+
+      update_code(al_str, al_str_n-strlen(al_str), update_data_p, op, sim_code, sp0, sp1);
+
+      /* update op to the next encoded position */
+      op = *rp++;
+      lenc++;
+
+      /* check for an annotation */
+      if (have_ann) {
+        ann_ch0 = ann_ch1 = '\0';
+        /* conventional annotations */
+        if (have_ann == 2 && (ann_arr[aa0a[i0]] != ' ' || ann_arr[aa1a[i1]] != ' ')) {
+          ann_ch0 = ann_arr[aa0a[i0]];
+          if (ann_ch0 == ' ') ann_ch0 = 'X';
+          ann_ch1 = ann_arr[aa1a[i1]];
+          if (ann_ch1 == ' ') ann_ch1 = 'X';
+        }
+        else if (aa0a != NULL && ann_arr[aa0a[i0]]!=' ') {
+          ann_ch0 = ann_arr[aa0a[i0]];
+          ann_ch1 = 'X';
+        }
+        else if (aa1a != NULL && ann_arr[aa1a[i1]]!=' ') {
+          ann_ch0 = 'X';
+          ann_ch1 = ann_arr[aa1a[i1]];
+        }
+
+        /* ann_ch0 only works below because ann_ch0=='X' if ann_ch1 */
+        if ( ann_ch0 && !(ann_ch1 == '[' || ann_ch1 == ']' || ann_ch0 == '[' || ann_ch0 == ']')) {
+          sprintf(tmp_astr, "|%c%c:%ld%c%c%ld%c",
+                  ann_ch0,ann_ch1, q_offset+seq_pos(i0,aln->qlrev,0)+1,sp0,
+                  sim_sym[sim_code], l_offset+seq_pos(i1,aln->llrev,0)+1,sp1);
+          /* SAFE_STRNCAT(ann_code_s, tmp_astr, n_ann_code_s); */
+          dyn_strcat(ann_code_dyn, tmp_astr);
+        }
+
+        if ((s_annot1_arr_p || s_annot0_arr_p) && have_push_features) {
+          display_push_features(annot_stack, ann_code_dyn,
+                                q_offset+seq_pos(i0,aln->qlrev,0), sp0,
+                                l_offset+seq_pos(i1,aln->llrev,0), sp1,
+                                sim_sym[sim_code],
+                                a_res->rst.score[ppst->score_ix], a_res->rst.comp, n0, n1, pstat_void, annot_fmt);
+          have_push_features = 0;
+        }
+      }
+      i0++; i1++;
+    }
+    else { /* not in match run, in a gap */
+      if (op == 0) {
+        /* at a transition from match (previous) to indel (current) */
+        d1_score += ppst->gdelval;
+        d0_score += ppst->gdelval;
+        op = *rp++;
+      }
+      d1_score += ppst->ggapval; d1_alen++;
+      d0_score += ppst->ggapval; d0_alen++;
+
+      if (op > 0) {
+        update_code(al_str, al_str_n-strlen(al_str), update_data_p, 2, sim_code,'-','-');
+
+        if (s_annot1_arr_p) {
+          if (i1+l_offset == s_annot1_arr_p[i1_annot]->pos || i1+l_offset == i1_left_end) {
+            i1_annot = next_annot_match(&itmp, aa0_pam2_p, l_offset+seq_pos(i1,aln->llrev,0),
+                                        q_offset+seq_pos(i0,aln->qlrev,0), &sp1, NULL, sq,
+                                        i1_annot, annot1_p->n_annot, s_annot1_arr_p,
+                                        NULL, annot_stack, &have_push_features, &v_delta,
+                                        &d1_score, &d1_ident, &d1_alen, &left_domain_list1, &i1_left_end, 0);
+
+            if (have_push_features) {
+              display_push_features(annot_stack, ann_code_dyn,
+                                    q_offset+seq_pos(i0,aln->qlrev,0), sp0,
+                                    l_offset+seq_pos(i1,aln->llrev,0), sp1,
+                                    sim_sym[sim_code],
+                                    a_res->rst.score[ppst->score_ix], a_res->rst.comp, n0, n1, pstat_void, annot_fmt);
+              have_push_features = 0;
+            }
+          }
+        }
+        op--; lenc++; i1++; aln->ngap_q++;
+      }
+      else { /* (op < 0) */
+
+        update_code(al_str, al_str_n-strlen(al_str), update_data_p, 1, sim_code,'-','-');
+
+        if (s_annot0_arr_p) {
+          if (i0+q_offset == s_annot0_arr_p[i0_annot]->pos || i0+q_offset == i0_left_end) {
+
+            i0_annot = next_annot_match(&itmp, aa0_pam2_p, q_offset+seq_pos(i0,aln->qlrev,0),
+                                        l_offset+seq_pos(i1,aln->llrev,0), &sp0, NULL, sq,
+                                        i0_annot, annot0_p->n_annot, s_annot0_arr_p,
+                                        NULL, annot_stack, &have_push_features, &v_delta,
+                                        &d0_score, &d0_ident, &d0_alen, &left_domain_list0, &i0_left_end, 0);
+
+            if (have_push_features) {
+              display_push_features(annot_stack, ann_code_dyn,
+                                    q_offset+seq_pos(i0,aln->qlrev,0), sp0,
+                                    l_offset+seq_pos(i1,aln->llrev,0), sp1,
+                                    sim_sym[sim_code],
+                                    a_res->rst.score[ppst->score_ix], a_res->rst.comp, n0, n1, pstat_void, annot_fmt);
+              have_push_features = 0;
+            }
+          }
+        }
+        op++; lenc++; i0++; aln->ngap_l++;
+      }
+    }
+  }
+
+  /* all done, clean things up */
+
+  close_update_data(al_str, al_str_n-strlen(al_str), update_data_p);
+
+  if (have_ann) {
+    have_push_features = 0;
+    /* also check for regions after alignment */
+
+    if (s_annot1_arr_p && i1_left_end > 0) {
+      close_annot_match(-1, annot_stack, &have_push_features,
+                        &d1_score, &d1_ident, &d1_alen, &left_domain_list1, &i1_left_end,
+                        0);
+    }
+    if (s_annot0_arr_p && i0_left_end > 0) {
+      close_annot_match(-1, annot_stack, &have_push_features,
+                        &d0_score, &d0_ident, &d0_alen, &left_domain_list0, &i0_left_end,
+                        0);
+    }
+
+    if (have_push_features) {
+      display_push_features(annot_stack, ann_code_dyn,
+                            q_offset+a_res->max0-1, sp0,
+                            l_offset+a_res->max1-1, sp1,
+                            sim_sym[sim_code],
+                            a_res->rst.score[ppst->score_ix], a_res->rst.comp, n0, n1, pstat_void, annot_fmt);
+    }
+  }
+
+  /* the stack exists only when annotation codes were supplied, so
+     release it only if it was actually created */
+  if (annot_stack) free_stack(annot_stack);
+
+  *score_delta = v_delta;
+  return lenc;
+}
+
+int calc_id(const unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1,
+ struct a_struct *aln,
+ struct a_res_str *a_res,
+ struct pstruct *ppst,
+ const struct annot_str *annot0_p,
+ const struct annot_str *annot1_p,
+ int *score_delta,
+ struct dyn_string_str *annot_var_dyn,
+ struct f_struct *f_str)
+{ /* calc_id: walk the alignment encoding in a_res->res from
+ (a_res->min0, a_res->min1) to (a_res->max0, a_res->max1) without
+ building any display strings.
+
+ Effects visible in this function:
+ - aln->amin0/amax0/amin1/amax1 are copied from a_res and
+ aln->calc_last_set is set to 1;
+ - aln->nident, nmismatch, nsim, ngap_q, ngap_l and nfs are reset to 0;
+ the gap counters are incremented here, the others are passed to
+ align_type() through aln;
+ - annot_var_dyn is cleared and then receives one "<old><pos><new>;"
+ entry (prefixed with 'q' for aa0) whenever next_annot_match()
+ changes the residue symbol at an annotated position;
+ - *score_delta receives v_delta as updated by next_annot_match().
+
+ Returns lenc, the number of alignment columns visited. */
+ int i0, i1, nn1; /* i0/i1: current offsets in aa0 / aa1p; nn1 is assigned but not read here */
+ int op, lenc; /* op: remaining gap run (0 = not in a gap); lenc: columns so far */
+ char sp0, sp1; /* residue symbols for the current aligned pair */
+ char tmp_str[MAX_SSTR]; /* scratch for one variant entry */
+ const unsigned char *aa1p; /* aa1, or f_str->aa1x when TFASTA is defined */
+ int *rp; /* read cursor into a_res->res */
+ unsigned char *sq; /* residue -> character table (ppst->sq or ppst->sqx) */
+
+ /* variables for variant changes */
+ int *aa0_pam2_p; /* scoring row for the current aa0 position */
+ struct annot_entry **s_annot0_arr_p;
+ struct annot_entry **s_annot1_arr_p;
+ int itmp, i0_annot, i1_annot, v_delta, v_tmp; /* v_tmp is not used in this function */
+ long q_offset, l_offset;
+ long i0_left_end, i1_left_end;
+ int d1_score, d1_ident, d1_alen;
+ int d0_score, d0_ident, d0_alen;
+ struct domfeat_link *left_domain_list1, *left_domain_list0;
+ struct domfeat_link *this_dom, *next_dom; /* not used in this function */
+
+ left_domain_list1 = left_domain_list0 = NULL;
+
+ *score_delta = 0;
+ i0_left_end = i1_left_end = -1;
+ left_domain_list0 = left_domain_list1 = NULL; /* repeats the assignment made a few lines above */
+
+ NULL_dyn_string(annot_var_dyn);
+
+ if (ppst->ext_sq_set) { sq = ppst->sqx; }
+ else { sq = ppst->sq; }
+
+#ifndef TFASTA
+ aa1p = aa1;
+ nn1 = n1;
+#else
+ aa1p = f_str->aa1x;
+ nn1 = f_str->n10;
+#endif
+
+ aln->amin0 = a_res->min0;
+ aln->amax0 = a_res->max0;
+ aln->amin1 = a_res->min1;
+ aln->amax1 = a_res->max1;
+ aln->calc_last_set = 1;
+
+ q_offset = aln->q_offset;
+ l_offset = aln->l_offset;
+
+ rp = a_res->res;
+ lenc = aln->nident = aln->nmismatch = aln->nsim = aln->ngap_q = aln->ngap_l = aln->nfs = op = 0;
+ i0 = a_res->min0;
+ i1 = a_res->min1;
+
+ v_delta = 0;
+ i0_annot = i1_annot = 0;
+
+ d1_score = d1_ident = d1_alen = 0;
+ d0_score = d0_ident = d0_alen = 0;
+
+ /* annotation arrays are only consulted when the sequence has annotations */
+ if (annot1_p && annot1_p->n_annot > 0) s_annot1_arr_p = annot1_p->s_annot_arr_p;
+ else s_annot1_arr_p = NULL;
+ if (annot0_p && annot0_p->n_annot > 0) s_annot0_arr_p = annot0_p->s_annot_arr_p;
+ else s_annot0_arr_p = NULL;
+
+ while (i0 < a_res->max0 || i1 < a_res->max1) {
+
+ /* scoring row is refreshed on every iteration, including gap columns */
+ if (ppst->pam_pssm) {
+ aa0_pam2_p = ppst->pam2p[0][i0];
+ }
+ else {
+ aa0_pam2_p = ppst->pam2[0][aa0[i0]];
+ }
+
+ if (op == 0 && *rp == 0) {
+ /* op==0 -> we are in a match run, and current code is a match */
+ op = *rp++;
+ lenc++;
+
+ itmp = ppst->pam2[0][aa0[i0]][aa1p[i1]]; /* NOTE(review): indexes pam2 directly even when ppst->pam_pssm is set -- confirm intended */
+ sp0 = sq[aa0[i0]];
+ sp1 = sq[aa1p[i1]];
+
+ if (s_annot1_arr_p && (i1 + l_offset == s_annot1_arr_p[i1_annot]->pos || i1+l_offset == i1_left_end)) {
+ i1_annot = next_annot_match(&itmp, aa0_pam2_p, l_offset+seq_pos(i1,aln->llrev,0),
+ q_offset+seq_pos(i0,aln->qlrev,0), &sp1, NULL, sq,
+ i1_annot, annot1_p->n_annot, s_annot1_arr_p,
+ NULL, NULL, NULL, &v_delta,
+ &d1_score, &d1_ident, &d1_alen, &left_domain_list1, &i1_left_end, itmp);
+
+ /* must be out of the loop to capture the last value */
+ if (sq[aa1p[i1]] != sp1) { /* symbol was changed by the annotation: record it */
+ sprintf(tmp_str,"%c%d%c;",sq[aa1p[i1]],i1+1,sp1);
+ /* SAFE_STRNCAT(annot_var_s,tmp_str,n_annot_var_s); */
+ dyn_strcat(annot_var_dyn, tmp_str);
+ }
+ d1_score += itmp;
+ }
+
+ if (s_annot0_arr_p && (i0 + q_offset == s_annot0_arr_p[i0_annot]->pos || i0+q_offset == i0_left_end)) {
+ i0_annot = next_annot_match(&itmp, aa0_pam2_p, q_offset+seq_pos(i0,aln->qlrev,0),
+ l_offset+seq_pos(i1,aln->llrev,0), &sp0, NULL, sq,
+ i0_annot, annot0_p->n_annot, s_annot0_arr_p,
+ NULL, NULL, NULL, &v_delta,
+ &d0_score, &d0_ident, &d0_alen, &left_domain_list0, &i0_left_end, itmp);
+
+
+
+ if (sq[aa0[i0]] != sp0) { /* same as above, entry is prefixed with 'q' for aa0 */
+ sprintf(tmp_str,"q%c%d%c;",sq[aa0[i0]],i0+1,sp0);
+ /* SAFE_STRNCAT(annot_var_s,tmp_str,n_annot_var_s); */
+ dyn_strcat(annot_var_dyn, tmp_str);
+ }
+ d0_score += itmp;
+ }
+
+ /* updates nident, nsim, npos */
+ d0_alen++;
+ d1_alen++;
+ if (align_type(itmp, sp0, sp1, ppst->nt_align, aln, ppst->pam_x_id_sim) == M_IDENT) {
+ d0_ident++;
+ d1_ident++;
+ }
+
+ i0++; i1++;
+ }
+ else {
+ /* gap column: a positive op consumes aa1p residues, a negative op consumes aa0 residues */
+ if (op==0) op = *rp++;
+ if (op>0) { /* inserts in seq0 */
+ op--; lenc++; i1++; aln->ngap_q++;
+ }
+ else { /* inserts in seq 1 */
+ op++; lenc++; i0++; aln->ngap_l++;
+ }
+ }
+ }
+ *score_delta = v_delta;
+
+ return lenc;
+}
diff --git a/src/cal_cons2.c b/src/cal_cons2.c
new file mode 100644
index 0000000..ba86330
--- /dev/null
+++ b/src/cal_cons2.c
@@ -0,0 +1,1164 @@
+/* cal_cons.c - routines for printing translated alignments for
+ fasta, ssearch, ggsearch, glsearch */
+
+/* $Id: cal_cons.c 1280 2014-08-21 00:47:55Z wrp $ */
+
+/* copyright (c) 1998, 1999, 2007, 2014 by William R. Pearson and The
+ Rector & Visitors of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+/* removed from dropgsw2.c, dropnfa.c April, 2007 */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+
+#include "defs.h"
+#include "param.h"
+#include "dyn_string.h"
+
+#if defined(FASTA) || defined(TFASTA)
+#include "dropnfa.h"
+#endif
+
+#if defined(SSEARCH) || defined(OSEARCH)
+#include "dropgsw2.h"
+#endif
+
+#ifdef LALIGN
+#include "dropgsw2.h"
+#endif
+
+#include "a_mark.h"
+
+struct update_code_str { /* state carried between update_code() calls; allocated by init_update_data(), freed by close_update_data() */
+ int p_op_idx; /* set to -1 by init_update_data(); passed as op_idx to sprintf_code() when the last run is flushed */
+ int p_op_cnt; /* starts at 0; a non-zero value is flushed by close_update_data() */
+ int btop_enc; /* 1 only when the SHOW_CODE_BTOP encoding was selected */
+ int show_code; /* copy of the show_code argument given to init_update_data() */
+ int cigar_order; /* 1 for the CIGAR encoding, 0 otherwise */
+ int show_ext; /* 1 when all SHOW_CODE_EXT bits are set in show_code */
+ char *op_map; /* points at cigar_code (CIGAR) or ori_code (all other encodings) */
+};
+ /* per-operation symbol tables selected through op_map; presumably indexed by the op codes used by update_code() -- confirm there */
+static char *ori_code = "=-+*x";
+static char *cigar_code = "MDIMX";
+
+static struct update_code_str *
+init_update_data(int show_code);
+
+static void
+sprintf_code(char *tmp_str, struct update_code_str *, int op_idx, int op_cnt);
+
+static void
+update_code(struct dyn_string_str *align_code_dyn,
+ struct update_code_str *update_data, int op,
+ int sim_code, unsigned char sp0, unsigned char sp1);
+
+static void
+close_update_data(struct dyn_string_str *align_code_dyn,
+ struct update_code_str *update_data);
+
+extern void aancpy(char *to, char *from, int count, struct pstruct *ppst);
+extern void *init_stack(int, int);
+extern void push_stack(void *, void *);
+extern void *pop_stack(void *);
+extern void *free_stack(void *);
+extern struct domfeat_data * init_domfeat_data(const struct annot_str *annot_p);
+
+/* returns M_NEG, M_ZERO, M_POS, M_IDENT, M_DEL (a_mark.h)
+ updates *aln->nsim, npos, nident, nmismatch */
+extern int
+align_type(int score, char sp0, char sp1, int nt_align, struct a_struct *aln, int pam_x_id_sim);
+
+extern void /* in compacc2e.c */
+process_annot_match(int *itmp, int *pam2aa0v,
+ long ip, long ia, char *sp1, char *sp1a, const unsigned char *sq,
+ struct annot_entry *annot_arr_p, int n_domains, char **ann_comment,
+ void *annot_stack, int *have_push_features_p, int *v_delta,
+ int *d_score_p, int *d_ident_p, int *d_alen_p, int *d_gaplen_p,
+ struct domfeat_data **left_domain_head_p,
+ struct domfeat_data *left_domain_p,
+ long *left_end_p, int init_score);
+
+extern int /* in compacc2e.c */
+next_annot_match(int *itmp, int *pam2aa0v,
+ long ip, long ia, char *sp1, char *sp1a, const unsigned char *sq,
+ int i_annot, int n_annot, struct annot_entry **annot_arr, char **ann_comment,
+ void *annot_stack, int *have_push_features_p, int *v_delta,
+ int *d_score_p, int *d_ident_p, int *d_alen_p, int *d_gaplen_p,
+ struct domfeat_data **left_domain_head_p, struct domfeat_data *left_domain_p,
+ long *left_domain_end, int init_score);
+
+extern void /* in compacc2e.c */
+close_annot_match (int ia, void *annot_stack, int *have_push_features_p,
+ int *d_score_p, int *d_ident_p, int *d_alen_p, int *d_gaplen_p,
+ struct domfeat_data **left_domain_p,
+ long *left_end_p, int init_score);
+
+extern void /* in compacc2e.c */
+comment_var(long i0, char sp0, long i1, char sp1, char o_sp1, char sim_char,
+ const char *ann_comment, struct dyn_string_str *annot_var_dyn,
+ int target, int d_type);
+
+extern void /* in compacc2e.c */
+display_push_features(void *annot_stack, struct dyn_string_str *annot_var_dyn,
+ long i0_pos, char sp0, long i1_pos, char sp1, char sym,
+ int score, double comp, int sw_score, int n0, int n1,
+ void *pstat_void, int d_type);
+
+#define DP_FULL_FMT 1 /* Region: score: bits: id: ... */
+
+extern int seq_pos(int pos, int rev, int off);
+
+/* values of calc_func_mode */
+#define CALC_CONS 1
+#define CALC_CODE 2
+#define CALC_ID 3
+#define CALC_ID_DOM 4
+
+int
+pre_fill_cons(const unsigned char *aa0, const unsigned char *aa1p,
+	      struct a_res_str *a_res,
+	      struct pstruct *ppst,
+	      struct a_struct *aln,
+	      char *seqc0, char *seqc1, char *seqca, int *smins,
+	      char *seqc0a, char *seqc1a) {
+  /* pre_fill_cons: copy the residues that precede the alignment start
+     (a_res->min0, a_res->min1) into seqc0/seqc1 as left-hand context.
+
+     Sets aln->smin0/aln->smin1 to the first displayed offset in each
+     sequence and *smins to the context length (0 whenever the start
+     of the sequences is being shown).  The first 'mins' entries of
+     seqca are zeroed and, when seqc0a is non-NULL, the first 'mins'
+     entries of seqc0a and seqc1a are blanked.
+
+     Returns mins, the number of context columns written. */
+  const int beg0 = a_res->min0;
+  const int beg1 = a_res->min1;
+  const int show_all = (aln->showall == 1);
+  int mins, pad;
+
+  if (!show_all && min(beg0, beg1) >= aln->llen) {
+    /* both starts are at least one line in: equal context on each side */
+    mins = min(aln->llcntx, min(beg0, beg1));
+    *smins = mins;
+    aln->smin0 = beg0 - mins;
+    aln->smin1 = beg1 - mins;
+    aancpy(seqc0, (char *)aa0 + beg0 - mins, mins, ppst);
+    aancpy(seqc1, (char *)aa1p + beg1 - mins, mins, ppst);
+  }
+  else if (beg0 >= beg1) {
+    /* aa0 extends more to the left; aa1p may need blank padding */
+    *smins = 0;
+    mins = show_all ? beg0 : min(beg0, aln->llcntx);
+    aancpy(seqc0, (char *)aa0 + beg0 - mins, mins, ppst);
+    aln->smin0 = beg0 - mins;
+
+    pad = mins - beg1;
+    if (pad > 0) {
+      memset(seqc1, ' ', pad);
+      aancpy(seqc1 + pad, (char *)aa1p, beg1, ppst);
+      aln->smin1 = 0;
+    }
+    else {
+      aancpy(seqc1, (char *)aa1p + beg1 - mins, mins, ppst);
+      aln->smin1 = beg1 - mins;
+    }
+  }
+  else {
+    /* aa1p extends more to the left; aa0 may need blank padding */
+    *smins = 0;
+    mins = show_all ? beg1 : min(beg1, aln->llcntx);
+    aancpy(seqc1, (char *)(aa1p + beg1 - mins), mins, ppst);
+    aln->smin1 = beg1 - mins;
+
+    pad = mins - beg0;
+    if (pad > 0) {
+      memset(seqc0, ' ', pad);
+      aancpy(seqc0 + pad, (char *)aa0, beg0, ppst);
+      aln->smin0 = 0;
+    }
+    else {
+      aancpy(seqc0, (char *)aa0 + beg0 - mins, mins, ppst);
+      aln->smin0 = beg0 - mins;
+    }
+  }
+
+  /* context columns carry a zero alignment code and blank annotations */
+  memset(seqca, 0, mins);
+  if (seqc0a) {
+    memset(seqc0a, ' ', mins);
+    memset(seqc1a, ' ', mins);
+  }
+  return mins;
+}
+
+int
+post_fill_cons(const unsigned char *aa0, int n0,
+	       const unsigned char *aa1p, int nn1,
+	       struct a_res_str *a_res,
+	       struct pstruct *ppst,
+	       int mins, int lenc,
+	       struct a_struct *aln,
+	       char *seqc0, char *seqc1,
+	       char *seqc0a, char *seqc1a) {
+  /* post_fill_cons: append right-hand context after the aligned
+     region, which occupies columns [mins, mins+lenc) of seqc0/seqc1.
+
+     The context length nd is aln->llcntx when aln->llcntx_set is
+     non-zero, otherwise enough to finish the current output line
+     (plus one more line when more than half a line is left over).
+     nd is capped at the longer of the two remaining tails, and equals
+     that length when aln->showall == 1.  The shorter sequence is
+     padded with blanks; seqc0a/seqc1a (when seqc0a is non-NULL) are
+     blanked over the same nd columns.
+
+     Returns nd. */
+  const int rest0 = n0 - a_res->max0;	/* residues left in aa0 */
+  const int rest1 = nn1 - a_res->max1;	/* residues left in aa1p */
+  const int rest_max = max(rest0, rest1);
+  char *tail0 = seqc0 + mins + lenc;
+  char *tail1 = seqc1 + mins + lenc;
+  int nd;
+
+  /* now we have the middle, get the right end */
+  if (aln->llcntx_set) {
+    nd = aln->llcntx;
+  }
+  else {
+    int line_end = mins + lenc + aln->llen;	/* show an extra line? */
+    int partial = line_end % aln->llen;		/* left over on last line */
+
+    line_end -= partial;
+    if (partial > aln->llen/2) line_end += aln->llen;	/* more than 1/2, use another */
+    nd = line_end - (mins + lenc);		/* this much extra */
+  }
+
+  if (nd > rest_max) nd = rest_max;
+
+  if (aln->showall == 1) {
+    nd = rest_max;	/* reset for showall=1 */
+    /* copy both complete tails ... */
+    aancpy(tail0, (char *)aa0 + a_res->max0, rest0, ppst);
+    aancpy(tail1, (char *)aa1p + a_res->max1, rest1, ppst);
+    /* ... and blank-fill to a common length, so one 'nc' serves both */
+    memset(tail0 + rest0, ' ', nd - rest0);
+    memset(tail1 + rest1, ' ', nd - rest1);
+  }
+  else {
+    if (nd - rest0 > 0) {
+      aancpy(tail0, (char *)aa0 + a_res->max0, rest0, ppst);
+      memset(tail0 + rest0, ' ', nd - rest0);
+    }
+    else {
+      aancpy(tail0, (char *)aa0 + a_res->max0, nd, ppst);
+    }
+
+    if (nd - rest1 > 0) {
+      aancpy(tail1, (char *)aa1p + a_res->max1, rest1, ppst);
+      memset(tail1 + rest1, ' ', nd - rest1);
+    }
+    else {
+      aancpy(tail1, (char *)aa1p + a_res->max1, nd, ppst);
+    }
+  }
+
+  if (seqc0a) {
+    memset(seqc0a + mins + lenc, ' ', nd);
+    memset(seqc1a + mins + lenc, ' ', nd);
+  }
+  return nd;
+}
+
+/* add_annot_code: append one site-annotation entry for the current
+   alignment column to ann_code_dyn.
+
+   have_ann is tested as a bit mask (the values 3, &2 and &1 are
+   checked below); ann_aa0_i0 / ann_aa1_i1 are the annotation
+   characters for the two residues, ' ' meaning "none".  The entry
+   has the form "|<a0><a1>:<qpos><sp0><sym><lpos><sp1>", with both
+   positions printed 1-based and a missing annotation character
+   replaced by 'X'.  Nothing is appended when no annotation character
+   was selected or when either selected character is '[' or ']'. */
+void
+add_annot_code(int have_ann, char sp0, char sp1,
+	       char ann_aa0_i0, char ann_aa1_i1,
+	       long q_off_pos, long l_off_pos, char sim_sym_code,
+	       struct dyn_string_str *ann_code_dyn)
+{
+  char mark0 = '\0', mark1 = '\0';
+  char entry[MAX_STR];
+  int is_bracket;
+
+  /* conventional annotations */
+  if (have_ann == 3 && (ann_aa0_i0 != ' ' || ann_aa1_i1 != ' ')) {
+    mark0 = (ann_aa0_i0 == ' ') ? 'X' : ann_aa0_i0;
+    mark1 = (ann_aa1_i1 == ' ') ? 'X' : ann_aa1_i1;
+  }
+  else if ((have_ann & 2) == 0 && ann_aa0_i0 != ' ') {
+    mark0 = ann_aa0_i0;
+    mark1 = 'X';
+  }
+  else if ((have_ann & 1) == 0 && ann_aa1_i1 != ' ') {
+    mark0 = 'X';
+    mark1 = ann_aa1_i1;
+  }
+
+  /* testing mark0 alone is enough: it is 'X' whenever only mark1
+     carries a real annotation */
+  if (!mark0) return;
+
+  is_bracket = (mark1 == '[' || mark1 == ']' || mark0 == '[' || mark0 == ']');
+  if (is_bracket) return;
+
+  sprintf(entry, "|%c%c:%ld%c%c%ld%c",
+	  mark0, mark1, q_off_pos+1, sp0,
+	  sim_sym_code, l_off_pos+1, sp1);
+  dyn_strcat(ann_code_dyn, entry);
+}
+
+/* calc_cons_u - combines calc_cons_a/calc_code/ calc_id
+
+ Walks the alignment encoding in a_res->res once, from
+ (a_res->min0, a_res->min1) to (a_res->max0, a_res->max1);
+ calc_func_mode selects what is produced along the way:
+
+ CALC_CONS - aligned residues go to seqc0/seqc1, alignment symbol
+ codes to seqca, optional per-column scores to cumm_seq_score and
+ annotation characters to seqc0a/seqc1a. Unless LCAL_CONS is
+ defined, flanking context is added with pre_fill_cons() and
+ post_fill_cons().
+ CALC_CODE - an encoded alignment is appended to align_code_dyn
+ through update_code()/close_update_data(); display_code selects
+ the encoding.
+ CALC_ID, CALC_ID_DOM - only counts are accumulated; the residue and
+ symbol pointers refer to single-character scratch cells.
+
+ Any other calc_func_mode prints an error and calls exit(1).
+
+ In every mode the counters in aln are reset and updated, *nc is
+ set to the number of alignment columns, and *score_delta to the
+ v_delta accumulated by the annotation helpers. Returns the number
+ of alignment columns; for CALC_CONS the left (mins) and right (nd)
+ context lengths are added to the return value. */
+int
+calc_cons_u( /* inputs */
+ const unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1,
+ struct a_res_str *a_res, /* alignment encoding */
+ struct pstruct *ppst,
+ struct f_struct *f_str,
+ void *pstat_void,
+ /* annotation stuff */
+ const unsigned char *ann_arr,
+ const unsigned char *aa0a, const struct annot_str *annot0_p,
+ const unsigned char *aa1a, const struct annot_str *annot1_p,
+ int calc_func_mode, /* CALC_CONS, CALC_CODE, CALC_ID */
+ int display_code, /* used only by CALC_CODE */
+ /* outputs */
+ int *nc,
+ char *seqc0, char *seqc1, char *seqca, int *cumm_seq_score,
+ char *seqc0a, char *seqc1a,
+ struct a_struct *aln,
+ int *score_delta,
+ struct dyn_string_str *annot_var_dyn,
+ struct dyn_string_str *align_code_dyn)
+{
+ int i0, i1, nn1;
+ int op, lenc, nd, ns, itmp;
+ const unsigned char *aa1p;
+ char *sp0_p, *sp0a_p, *sp1_p, *sp1a_p, *spa_p, t_spa;
+ char sp0_c, sp1_c, spa_c; /* used for CALC_ID, CALC_CODE */
+ char sp0a_c, sp1a_c; /* used for CALC_CODE */
+ char tmp_str[MAX_SSTR];
+ int *i_spa; /* write cursor into cumm_seq_score; only set when cumm_seq_score is non-NULL in CALC_CONS mode */
+ const unsigned char *sq;
+ int *rp;
+ int smins, mins, ntmp;
+ int have_ann;
+ void *annot_stack = NULL;
+ struct update_code_str *update_data_p; /* only assigned in CALC_CODE mode */
+
+ /* variables for variant changes */
+ int *aa0_pam2_p; /* NOTE(review): assigned only in the match branch of the main loop, but also read by the pre-loop process_annot_match() call and by the op>0 gap branch -- confirm it cannot be read before it is set */
+ char *sim_sym = aln_map_sym[5];
+ struct annot_entry **s_annot0_arr_p, **s_annot1_arr_p;
+
+ char *ann_comment;
+ int i0_annot, i1_annot; /* i0_annot, i1_annot, count through
+ the list of annotations */
+ long i0_left_end, i1_left_end; /* left-most coordinate of domain end */
+
+ int show_code, annot_fmt, start_flag;
+
+ int v_delta, v_tmp;
+ int d1_score, d1_ident, d1_alen, d1_gaplen;
+ int d0_score, d0_ident, d0_alen, d0_gaplen;
+ int have_push_features;
+ int *have_push_features_p;
+
+ /* struct domfeat_data is used to capture score, coordinate, and
+ identity information for possibly overlapping sub-alignment
+ scores -- each domfeat_data entry is associated with an
+ annot_p->annot_arr_p entry */
+ struct domfeat_data *left_domain_head1, *left_domain_head0;
+ struct domfeat_data *left_domain_list1, *left_domain_list0;
+
+ /* variables for handling coordinate offsets */
+ long q_offset, l_offset;
+ long i0_off, i1_off;
+ /* residue -> character table: extended table when ppst->ext_sq_set */
+ if (ppst->ext_sq_set) { sq = ppst->sqx;}
+ else {sq = ppst->sq;}
+
+#ifndef TFASTA
+ aa1p = aa1;
+ nn1 = n1;
+#else
+ aa1p = f_str->aa1x;
+ nn1 = f_str->n10;
+#endif
+
+ aln->amin0 = a_res->min0;
+ aln->amax0 = a_res->max0;
+ aln->amin1 = a_res->min1;
+ aln->amax1 = a_res->max1;
+ aln->calc_last_set = 1;
+
+ q_offset = aln->q_offset;
+ l_offset = aln->l_offset;
+
+#ifndef LCAL_CONS /* use for local context */
+ if (calc_func_mode == CALC_CONS) { /* mins/smins are left unset in the other modes, which do not read them */
+ mins = pre_fill_cons(aa0, aa1p, a_res,ppst, aln, seqc0, seqc1, seqca, &smins,
+ seqc0a, seqc1a);
+ }
+#else /* no flanking context */
+ smins = mins = 0;
+ aln->smin0=a_res->min0;
+ aln->smin1=a_res->min1;
+#endif
+
+ /* now get the middle */
+ have_ann = 0; /* default no annotation */
+ left_domain_head0 = left_domain_head1 = NULL;
+ left_domain_list0 = left_domain_list1 = NULL;
+
+ /* have_ann encodes which sequences are annotated: bit 1 for aa0, bit 2 for aa1 */
+ if ((annot0_p && annot0_p->n_annot > 0) || (aa0a != NULL)) { have_ann |= 1;}
+ if ((annot1_p && annot1_p->n_annot > 0) || (aa1a != NULL)) { have_ann |= 2;}
+
+ if (calc_func_mode == CALC_CONS) {
+ spa_p = seqca+mins; /* pointer to alignment symbol */
+ if (cumm_seq_score) i_spa = cumm_seq_score+mins; /* set index for cumm_seq_score */
+ sp0_p = seqc0+mins;
+ sp1_p = seqc1+mins;
+ /* have_ann = (seqc0a != NULL); */
+ annot_fmt = DP_FULL_FMT;
+ }
+ else if (calc_func_mode == CALC_ID || calc_func_mode == CALC_ID_DOM) {
+ spa_p = &spa_c;
+ sp0_p = &sp0_c;
+ sp1_p = &sp1_c;
+ /* does not require aa0a/aa1a, only for variants */
+ /* have_ann = ((annot1_p && annot1_p->n_annot > 0) || (annot0_p && annot0_p->n_annot > 0)); */
+ annot_fmt = 3;
+ }
+ else if (calc_func_mode == CALC_CODE) {
+ spa_p = &spa_c;
+ sp0_p = &sp0_c;
+ sp1_p = &sp1_c;
+
+ show_code = (display_code & (SHOW_CODE_MASK+SHOW_CODE_EXT)); /* see defs.h; SHOW_CODE_ALIGN=4,_CIGAR=8,_CIGAR_EXT=24, _BTOP_EXT=16 */
+ annot_fmt = 2;
+ if (display_code & SHOW_ANNOT_FULL) {
+ annot_fmt = 1;
+ }
+
+ update_data_p = init_update_data(show_code); /* may be NULL if allocation failed; close_update_data() tolerates NULL */
+ }
+ else {
+ fprintf(stderr,"*** error [%s:%d] --- cal_cons_u() invalid calc_func_mode: %d\n",
+ __FILE__, __LINE__, calc_func_mode);
+ exit(1);
+ }
+
+ have_push_features=0;
+
+ if (have_ann) { /* initialize annotation variables */
+ if (calc_func_mode == CALC_CONS) {
+ sp0a_p = seqc0a+mins;
+ sp1a_p = seqc1a+mins;
+ annot_stack = init_stack(64,64);
+ have_push_features_p = &have_push_features;
+ }
+ else if (calc_func_mode == CALC_ID || calc_func_mode == CALC_ID_DOM) {
+ sp0a_p = &sp0a_c;
+ sp1a_p = &sp1a_c;
+ have_push_features_p = &have_push_features;
+ /* ann_comment = NULL; */
+ annot_stack = init_stack(64,64);
+ }
+ else if (calc_func_mode == CALC_CODE) {
+ annot_stack = init_stack(64,64);
+ sp0a_p = &sp0a_c;
+ sp1a_p = &sp1a_c;
+ have_push_features_p = &have_push_features;
+ }
+
+ *score_delta = 0;
+ i0_left_end = i1_left_end = -1;
+ NULL_dyn_string(annot_var_dyn);
+ }
+ /* always initialize, updated with no annotations */
+ d1_score = d1_ident = d1_alen = d1_gaplen = 0;
+ d0_score = d0_ident = d0_alen = d0_gaplen = 0;
+
+ lenc = aln->nident = aln->nmismatch =
+ aln->npos = aln->nsim = aln->ngap_q = aln->ngap_l = aln->nfs = op = 0;
+
+ i0 = a_res->min0; /* start in aa0[] */
+ i1 = a_res->min1; /* start in aa1[] */
+
+ /* handle region annotations outside alignment */
+ v_delta = 0;
+ i0_annot = i1_annot = 0;
+ s_annot0_arr_p = s_annot1_arr_p = NULL;
+ if (have_ann) {
+ i1_off = seq_pos(i1, aln->llrev,0) + l_offset;
+ i0_off = seq_pos(i0, aln->qlrev,0) + q_offset;
+
+ if (annot1_p && annot1_p->n_annot > 0) {
+
+ left_domain_list1 = init_domfeat_data(annot1_p);
+ s_annot1_arr_p = annot1_p->s_annot_arr_p;
+ /* skip annotations that end at or before the alignment start; process '-' entries that span it */
+ while (i1_annot < annot1_p->n_annot) {
+ if (s_annot1_arr_p[i1_annot]->pos >= i1_off) {break;}
+ if (s_annot1_arr_p[i1_annot]->end <= i1_off) {i1_annot++; continue;}
+
+ if (s_annot1_arr_p[i1_annot]->label == '-') {
+ process_annot_match(&itmp, aa0_pam2_p, i1_off, i0_off,
+ sp1_p, sp1a_p, sq, s_annot1_arr_p[i1_annot], annot1_p->n_domains, &ann_comment,
+ annot_stack, have_push_features_p, &v_delta,
+ &d1_score, &d1_ident, &d1_alen, &d1_gaplen,
+ &left_domain_head1,
+ &left_domain_list1[i1_annot], &i1_left_end, 0);
+ }
+ i1_annot++;
+ }
+ }
+
+ /* do not need have_ann here, because domain only */
+ if (annot0_p && annot0_p->n_annot>0) {
+ /* aa0 annotations are only set up for CALC_CONS and CALC_CODE; s_annot0_arr_p stays NULL in the CALC_ID modes */
+ if (calc_func_mode == CALC_CONS || calc_func_mode == CALC_CODE) {
+
+ /* inefficient -- the same initiation is done for every
+ query/subj alignment, even though it is always the same --
+ should be done in the build_ares() loop */
+ left_domain_list0 = init_domfeat_data(annot0_p);
+ s_annot0_arr_p = annot0_p->s_annot_arr_p;
+
+ while (i0_annot < annot0_p->n_annot) {
+ if (s_annot0_arr_p[i0_annot]->pos >= i0_off) {break;}
+ if (s_annot0_arr_p[i0_annot]->end <= i0_off) {i0_annot++; continue;}
+
+ if (s_annot0_arr_p[i0_annot]->label == '-') {
+ process_annot_match(&itmp, NULL, i0_off, i1_off,
+ sp0_p, sp0a_p, sq, s_annot0_arr_p[i0_annot], annot0_p->n_domains, &ann_comment,
+ annot_stack, have_push_features_p, &v_delta,
+ &d0_score, &d0_ident, &d0_alen, &d0_gaplen,
+ &left_domain_head0,
+ &left_domain_list0[i0_annot], &i0_left_end, 0);
+ }
+ i0_annot++;
+ }
+ }
+ }
+ }
+ /* done with domains starting before alignment */
+
+ /* handle alignment encoding */
+ rp = a_res->res; /* alignment encoding array */
+ while (i0 < a_res->max0 || i1 < a_res->max1) {
+ /* match/mismatch (aligned residues) */
+ /* here, op is the "current" encoding, and *rp is the next one */
+ if (op == 0 && *rp == 0) {
+ op = *rp++;
+ lenc++;
+
+ if (ppst->pam_pssm) {aa0_pam2_p = ppst->pam2p[0][i0];}
+ else {aa0_pam2_p = ppst->pam2[0][aa0[i0]];}
+
+ itmp=aa0_pam2_p[aa1p[i1]];
+
+ *sp0_p = sq[aa0[i0]];
+ *sp1_p = sq[aa1p[i1]];
+
+ if (have_ann) {
+ have_push_features = 0;
+ *sp0a_p = *sp1a_p = ' ';
+ if (aa0a) *sp0a_p = ann_arr[aa0a[i0]];
+ if (aa1a) *sp1a_p = ann_arr[aa1a[i1]];
+
+ if (s_annot1_arr_p) {
+ if (i1+l_offset == s_annot1_arr_p[i1_annot]->pos) {
+
+ i1_annot = next_annot_match(&itmp, aa0_pam2_p, l_offset+seq_pos(i1,aln->llrev,0),
+ q_offset + seq_pos(i0,aln->qlrev,0), sp1_p, sp1a_p, sq,
+ i1_annot, annot1_p->n_annot, s_annot1_arr_p,
+ &ann_comment, annot_stack, have_push_features_p, &v_delta,
+ &d1_score, &d1_ident, &d1_alen, &d1_gaplen,
+ &left_domain_head1, left_domain_list1, &i1_left_end, 0);
+
+ /* must be out of the loop to capture the last value */
+ if (sq[aa1p[i1]] != *sp1_p) {
+ t_spa = align_type(itmp, *sp0_p, *sp1_p, ppst->nt_align, NULL, ppst->pam_x_id_sim);
+ if (calc_func_mode != CALC_ID && calc_func_mode != CALC_ID_DOM) {
+ comment_var(q_offset+seq_pos(i0,aln->qlrev,0), *sp0_p,
+ l_offset+seq_pos(i1,aln->llrev,0), *sp1_p,
+ sq[aa1p[i1]], sim_sym[t_spa], ann_comment,
+ annot_var_dyn, 1, annot_fmt);
+ }
+ else {
+ sprintf(tmp_str,"%c%d%c;",sq[aa1p[i1]],i1+1,*sp1_p);
+ /* SAFE_STRNCAT(annot_var_s,tmp_str,n_annot_var_s); */
+ dyn_strcat(annot_var_dyn, tmp_str);
+ }
+ }
+ }
+ d1_score += itmp;
+ }
+
+ if (s_annot0_arr_p) {
+ if (i0 + q_offset == s_annot0_arr_p[i0_annot]->pos) {
+
+ i0_annot = next_annot_match(&itmp, aa0_pam2_p, q_offset+seq_pos(i0,aln->qlrev,0),
+ l_offset + seq_pos(i1,aln->llrev,0), sp0_p, sp0a_p, sq,
+ i0_annot, annot0_p->n_annot, s_annot0_arr_p,
+ &ann_comment, annot_stack, have_push_features_p, &v_delta,
+ &d0_score, &d0_ident, &d0_alen, &d0_gaplen,
+ &left_domain_head0, left_domain_list0, &i0_left_end, 0);
+
+ /* must be out of the loop to capture the last value */
+ if (sq[aa0[i0]] != *sp0_p) {
+ t_spa = align_type(itmp, *sp0_p, *sp1_p, ppst->nt_align, NULL, ppst->pam_x_id_sim);
+ if (calc_func_mode != CALC_ID && calc_func_mode != CALC_ID_DOM) {
+
+ comment_var(q_offset+seq_pos(i0,aln->qlrev,0), *sp0_p,
+ l_offset+seq_pos(i1,aln->llrev,0), *sp1_p,
+ sq[aa0[i0]], sim_sym[t_spa], ann_comment,
+ annot_var_dyn, 0, annot_fmt);
+ }
+ else {
+ sprintf(tmp_str,"q%c%d%c;",sq[aa0[i0]],i0+1,*sp0_p);
+ /* SAFE_STRNCAT(annot_var_s,tmp_str,n_annot_var_s); */
+ dyn_strcat(annot_var_dyn, tmp_str);
+ }
+ }
+ }
+ d0_score += itmp;
+ }
+ if (calc_func_mode == CALC_CONS) {sp0a_p++; sp1a_p++;}
+ }
+ /* classify the column; align_type() is given aln here so it can update the counters */
+ *spa_p = align_type(itmp, *sp0_p, *sp1_p, ppst->nt_align, aln, ppst->pam_x_id_sim);
+
+ d1_alen++;
+ d0_alen++;
+ if (*spa_p == M_IDENT) {
+ d1_ident++;
+ d0_ident++;
+ }
+ /* domain ends are handled after the column has been counted */
+ if (s_annot1_arr_p && (i1 + l_offset == i1_left_end)) {
+ i1_annot = next_annot_match(&itmp, aa0_pam2_p, l_offset+seq_pos(i1,aln->llrev,0),
+ q_offset + seq_pos(i0,aln->qlrev,0), sp1_p, sp1a_p, sq,
+ i1_annot, annot1_p->n_annot, s_annot1_arr_p,
+ &ann_comment, annot_stack, have_push_features_p, &v_delta,
+ &d1_score, &d1_ident, &d1_alen, &d1_gaplen,
+ &left_domain_head1, left_domain_list1, &i1_left_end, 0);
+ }
+
+ if (s_annot0_arr_p && (i0 + q_offset == i0_left_end)) {
+ i0_annot = next_annot_match(&itmp, aa0_pam2_p, q_offset+seq_pos(i0,aln->qlrev,0),
+ l_offset + seq_pos(i1,aln->llrev,0), sp0_p, sp0a_p, sq,
+ i0_annot, annot0_p->n_annot, s_annot0_arr_p,
+ &ann_comment, annot_stack, have_push_features_p, &v_delta,
+ &d0_score, &d0_ident, &d0_alen, &d0_gaplen,
+ &left_domain_head0, left_domain_list0, &i0_left_end, 0);
+ }
+
+ if (calc_func_mode == CALC_CODE) {
+ update_code(align_code_dyn, update_data_p, op, *spa_p, *sp0_p, *sp1_p);
+ }
+
+ /* now we have done all the ?modified identity checks, display
+ potential site annotations */
+ if (have_ann && calc_func_mode == CALC_CODE) {
+ add_annot_code(have_ann, *sp0_p, *sp1_p, *sp0a_p, *sp1a_p,
+ q_offset + seq_pos(i0,aln->qlrev,0), l_offset+seq_pos(i1,aln->llrev,0),
+ sim_sym[*spa_p], annot_var_dyn);
+ }
+
+ if (have_push_features && calc_func_mode != CALC_ID) {
+ display_push_features(annot_stack, annot_var_dyn,
+ q_offset+seq_pos(i0,aln->qlrev,0), *sp0_p,
+ l_offset+seq_pos(i1,aln->llrev,0), *sp1_p,
+ sim_sym[*spa_p],
+ a_res->rst.score[ppst->score_ix], a_res->rst.comp, a_res->sw_score,
+ n0, n1,
+ pstat_void, annot_fmt);
+ have_push_features = 0;
+ }
+
+ if (cumm_seq_score) *i_spa++ = itmp; /* update cumulative score */
+ i0++; i1++;
+ if (calc_func_mode == CALC_CONS) { /* only CALC_CONS advances the output cursors; other modes reuse the scratch cells */
+ sp0_p++; sp1_p++; spa_p++;
+ }
+ }
+ else { /* indel */
+ /* include all calc_func_mode's, because i_annot must be incremented in indels */
+ if (op==0) { /* first column of a gap run: charge the gap-open penalty once */
+ op = *rp++;
+ if (cumm_seq_score) *i_spa = ppst->gdelval;
+ d1_score += ppst->gdelval;
+ d0_score += ppst->gdelval;
+ }
+ if (cumm_seq_score) *i_spa++ += ppst->ggapval;
+ d1_score += ppst->ggapval; d1_alen++; d1_gaplen++;
+ d0_score += ppst->ggapval; d0_alen++; d0_gaplen++;
+
+ if (op > 0) { /* insertion in aa0 */
+ *sp1_p = sq[aa1p[i1]];
+ *sp0_p = '-';
+ *spa_p = M_DEL;
+
+ if (calc_func_mode == CALC_CODE) {
+ update_code(align_code_dyn, update_data_p, 2, *spa_p,*sp0_p,*sp1_p);
+ }
+
+ if (have_ann) {
+ have_push_features = 0;
+ *sp0a_p = ' ';
+ if (aa1a) *sp1a_p = ann_arr[aa1a[i1]];
+ else *sp1a_p = ' ';
+ if (s_annot1_arr_p) {
+ if (i1+l_offset == s_annot1_arr_p[i1_annot]->pos || i1+l_offset == i1_left_end) {
+ i1_annot = next_annot_match(&itmp, aa0_pam2_p, l_offset+seq_pos(i1,aln->llrev,0),
+ q_offset+seq_pos(i0,aln->qlrev,0), sp1_p, sp1a_p, sq,
+ i1_annot, annot1_p->n_annot, s_annot1_arr_p,
+ &ann_comment, annot_stack, have_push_features_p, &v_delta,
+ &d1_score, &d1_ident, &d1_alen, &d1_gaplen,
+ &left_domain_head1, left_domain_list1, &i1_left_end,
+ ppst->ggapval+ppst->gdelval);
+ }
+ }
+
+ if (have_ann && calc_func_mode == CALC_CODE) {
+ add_annot_code(have_ann, *sp0_p, *sp1_p, *sp0a_p, *sp1a_p,
+ q_offset + seq_pos(i0,aln->qlrev,0), l_offset+seq_pos(i1,aln->llrev,0),
+ '-', annot_var_dyn);
+ }
+
+ if (have_push_features && calc_func_mode != CALC_ID) {
+ display_push_features(annot_stack, annot_var_dyn,
+ q_offset+seq_pos(i0,aln->qlrev,0), *sp0_p,
+ l_offset+seq_pos(i1,aln->llrev,0), *sp1_p,
+ sim_sym[*spa_p],
+ a_res->rst.score[ppst->score_ix], a_res->rst.comp, a_res->sw_score,
+ n0, n1,
+ pstat_void, annot_fmt);
+ have_push_features = 0;
+ }
+ if (calc_func_mode == CALC_CONS) {
+ sp0a_p++;
+ sp1a_p++;
+ }
+ }
+ if (calc_func_mode == CALC_CONS) {
+ spa_p++;
+ sp0_p++;
+ sp1_p++;
+ }
+ i1++;
+ op--;
+ lenc++;
+ aln->ngap_q++;
+ }
+ else { /* (op < 0), insertion in aa1 */
+ *sp1_p = '-';
+ *spa_p = M_DEL;
+ *sp0_p = sq[aa0[i0]];
+
+ if (calc_func_mode == CALC_CODE) {
+ update_code(align_code_dyn, update_data_p, 1, *spa_p,*sp0_p,*sp1_p);
+ }
+
+ if (have_ann) {
+ have_push_features = 0;
+ *sp1a_p = ' ';
+ if (aa0a) *sp0a_p = ann_arr[aa0a[i0]];
+ else *sp0a_p = ' ';
+ if (s_annot0_arr_p) {
+ if (i0+q_offset == s_annot0_arr_p[i0_annot]->pos || i0+q_offset == i0_left_end) { /* NOTE(review): the call below indexes aa1[], while the rest of this function reads the second sequence through aa1p -- confirm for TFASTA builds */
+ i0_annot = next_annot_match(&itmp, ppst->pam2[0][aa1[i1]], q_offset+seq_pos(i0,aln->qlrev,0),
+ l_offset+seq_pos(i1,aln->llrev,0), sp0_p, sp0a_p, sq,
+ i0_annot, annot0_p->n_annot, s_annot0_arr_p,
+ &ann_comment, annot_stack, have_push_features_p, &v_delta,
+ &d0_score, &d0_ident, &d0_alen, &d0_gaplen,
+ &left_domain_head0, left_domain_list0, &i0_left_end,
+ ppst->ggapval+ppst->gdelval);
+
+ }
+ }
+
+ if (calc_func_mode == CALC_CODE) {
+ add_annot_code(have_ann, *sp0_p, *sp1_p, *sp0a_p, *sp1a_p,
+ q_offset + seq_pos(i0,aln->qlrev,0), l_offset+seq_pos(i1,aln->llrev,0),
+ '-', annot_var_dyn);
+ }
+
+ if (have_push_features && calc_func_mode != CALC_ID) {
+ display_push_features(annot_stack, annot_var_dyn,
+ q_offset+seq_pos(i0,aln->qlrev,0), *sp0_p,
+ l_offset+seq_pos(i1,aln->llrev,0), *sp1_p,
+ sim_sym[*spa_p],
+ a_res->rst.score[ppst->score_ix], a_res->rst.comp, a_res->sw_score,
+ n0, n1, pstat_void, annot_fmt);
+ have_push_features = 0;
+ }
+ if (calc_func_mode == CALC_CONS) {
+ sp0a_p++;
+ sp1a_p++;
+ }
+ }
+
+ i0++;
+ if (calc_func_mode == CALC_CONS) {
+ spa_p++;
+ sp0_p++;
+ sp1_p++;
+ }
+ op++;
+ lenc++;
+ aln->ngap_l++;
+ }
+ }
+ }
+
+ if (calc_func_mode == CALC_CODE) { /* flush the pending run and free update_data_p */
+ close_update_data(align_code_dyn, update_data_p);
+ }
+
+ *score_delta = v_delta;
+
+ *nc = lenc;
+ if (have_ann) {
+ have_push_features = 0;
+ /* check for left ends after alignment */
+ if (annot1_p && i1_left_end > 0) {
+ close_annot_match(-1, annot_stack, have_push_features_p,
+ &d1_score, &d1_ident, &d1_alen, &d1_gaplen,
+ &left_domain_head1, &i1_left_end, 0);
+ }
+
+ if (annot0_p && i0_left_end > 0) {
+ close_annot_match(-1, annot_stack, have_push_features_p,
+ &d0_score, &d0_ident, &d0_alen, &d0_gaplen,
+ &left_domain_head0, &i0_left_end, 0);
+ }
+
+ if (have_push_features && calc_func_mode != CALC_ID) {
+ display_push_features(annot_stack, annot_var_dyn,
+ a_res->max0-1 + q_offset, *sp0_p,
+ a_res->max1-1 + l_offset, *sp1_p,
+ sim_sym[*spa_p],
+ a_res->rst.score[ppst->score_ix], a_res->rst.comp, a_res->sw_score,
+ n0, n1, pstat_void, annot_fmt);
+ }
+ }
+ /* terminate the symbol and annotation output at the current cursor positions */
+ *spa_p = '\0';
+ if (have_ann) {
+ *sp0a_p = *sp1a_p = '\0';
+ }
+ if (calc_func_mode == CALC_CONS) {
+#ifndef LCAL_CONS /* have context around alignment */
+ nd = post_fill_cons(aa0, n0, aa1p, nn1,
+ a_res, ppst, mins, lenc, aln,
+ seqc0, seqc1, seqc0a, seqc1a);
+#else
+ nd = 0;
+#endif
+ lenc = mins + lenc + nd; /* returned length now includes both flanks; *nc was set before this */
+ }
+
+ if (have_ann) {
+ if (left_domain_list0) free(left_domain_list0);
+ if (left_domain_list1) free(left_domain_list1);
+ annot_stack = free_stack(annot_stack);
+ }
+
+ return lenc;
+}
+
+int calc_cons_a(const unsigned char *aa0, int n0,
+		const unsigned char *aa1, int n1,
+		int *nc,
+		struct a_struct *aln,
+		struct a_res_str *a_res,
+		struct pstruct *ppst,
+		char *seqc0, char *seqc1, char *seqca, int *cumm_seq_score,
+		const unsigned char *ann_arr,
+		const unsigned char *aa0a, const struct annot_str *annot0_p, char *seqc0a,
+		const unsigned char *aa1a, const struct annot_str *annot1_p, char *seqc1a,
+		int *score_delta,
+		struct dyn_string_str *annot_var_dyn,
+		struct f_struct *f_str,
+		void *pstat_void)
+{
+  /* calc_cons_a: legacy entry point kept for existing callers.  It
+     reorders its arguments into the calc_cons_u() layout and runs
+     that function in CALC_CONS mode, with display_code 0 and no
+     align_code_dyn buffer.  Returns whatever calc_cons_u() returns. */
+  int total_len;
+
+  total_len = calc_cons_u(aa0, n0, aa1, n1,
+			  a_res, ppst, f_str, pstat_void,
+			  ann_arr, aa0a, annot0_p, aa1a, annot1_p,
+			  CALC_CONS, 0,
+			  nc, seqc0, seqc1, seqca, cumm_seq_score,
+			  seqc0a, seqc1a, aln, score_delta,
+			  annot_var_dyn, NULL);
+
+  return total_len;
+}
+
+void
+calc_astruct(struct a_struct *aln_p, struct a_res_str *a_res_p, struct f_struct *f_str) {
+
+  /* calc_astruct: copy the alignment boundaries from a_res_p into
+     aln_p.  f_str is not used.
+
+     aln_p->calc_last_set is neither tested nor changed here, because
+     calc_astruct, calc_cons_a and calc_code all make exactly the same
+     assignments. */
+
+  /* boundaries in sequence 1 */
+  aln_p->amax1 = a_res_p->max1;
+  aln_p->amin1 = a_res_p->min1;
+
+  /* boundaries in sequence 0 */
+  aln_p->amax0 = a_res_p->max0;
+  aln_p->amin0 = a_res_p->min0;
+}
+
+static struct update_code_str *
+init_update_data(int show_code) {
+  /* allocate the zeroed encoder state passed to update_code() and pick the output format from the show_code bit flags; returns NULL if calloc() fails */
+  struct update_code_str *update_data_p;
+
+  if ((update_data_p = (struct update_code_str *)calloc(1,sizeof(struct update_code_str)))==NULL) {
+    fprintf(stderr,"*** error [%s:%d] - init_update_data(): cannot allocate update_code_str\n",
+	    __FILE__, __LINE__);
+    return NULL;
+  }
+
+  update_data_p->p_op_idx = -1;	/* no previous operation recorded yet */
+  update_data_p->p_op_cnt = 0;	/* no pending run */
+  update_data_p->show_code = show_code;
+  update_data_p->btop_enc = 0;
+
+  if ((show_code & SHOW_CODE_CIGAR) == SHOW_CODE_CIGAR) {	/* CIGAR enc */
+    update_data_p->op_map = cigar_code;
+    update_data_p->cigar_order = 1;	/* count before symbol, see sprintf_code() */
+  }
+  else if ((show_code & SHOW_CODE_BTOP) == SHOW_CODE_BTOP) {	/* btop_enc */
+    update_data_p->op_map = ori_code;
+    update_data_p->cigar_order = 0;
+    update_data_p->btop_enc = 1;
+  }
+  else {	/* orig (ALIGN) enc */
+    update_data_p->op_map = ori_code;
+    update_data_p->cigar_order = 0;	/* symbol before count */
+  }
+
+  if ((show_code & SHOW_CODE_EXT) == SHOW_CODE_EXT) {	/* set for CIGAR/ALIGN, BTOP already set */
+    update_data_p->show_ext = 1;
+  }
+  else {
+    update_data_p->show_ext = 0;
+  }
+
+  return update_data_p;
+}
+
+
+static void
+close_update_data(struct dyn_string_str *align_code_dyn,
+ struct update_code_str *up_dp) {
+ char tmp_cnt[MAX_SSTR];
+ tmp_cnt[0] = '\0';
+ /* flush any pending run to align_code_dyn, then free up_dp; a NULL up_dp is a no-op */
+ if (!up_dp) return;
+
+ if (up_dp->p_op_cnt) {
+ if (up_dp->btop_enc) { /* btop_enc always has a p_op_cnt == 0 unless in run of identical match */
+ sprintf(tmp_cnt,"%d",up_dp->p_op_cnt);
+ up_dp->p_op_cnt = 0;
+ }
+ else {
+ sprintf_code(tmp_cnt,up_dp, up_dp->p_op_idx, up_dp->p_op_cnt);
+ }
+ dyn_strcat(align_code_dyn,tmp_cnt);
+ }
+
+ free(up_dp);
+}
+
+/* update_code() has been modified to work more correctly with
+ ggsearch/glsearch, which, because alignments can start with either
+ insertions or deletions, can produce an initial code of "0=". When
+ that happens, it is ignored and no code is added.
+
+ *align_code_dyn - alignment string (dynamic)
+ op -- encoded operation, currently 0=match, 1-delete, 2-insert, 3-term-match, 4-mismatch
+ op_cnt -- length of run
+ show_code -- SHOW_CODE_CIGAR uses cigar_code, otherwise legacy
+*/
+
+static void
+sprintf_code(char *tmp_str, struct update_code_str *up_dp, int op_idx, int op_cnt) {
+  if (!up_dp->cigar_order) {	/* legacy order: symbol first, then run length */
+    sprintf(tmp_str,"%c%d",up_dp->op_map[op_idx],op_cnt);
+    return;
+  }
+  /* CIGAR order: run length first, then symbol */
+  sprintf(tmp_str,"%d%c",op_cnt,up_dp->op_map[op_idx]);
+}
+
+/* only called for btop alignment encoding, for identity, update
+ count, otherwise, print previous count and current difference.
+ assumes that up_dp->p_op_cnt only tracks identity
+*/
+
+static void
+sprintf_btop(char *tmp_str,
+	     struct update_code_str *up_dp,
+	     int op, int sim_code,
+	     unsigned char sp0, unsigned char sp1)
+{
+  char run_str[MAX_SSTR];
+  run_str[0] = '\0';
+  tmp_str[0] = '\0';
+
+  /* identical match: extend the pending identity run and emit nothing */
+  if (op == 0 && sim_code == M_IDENT) {
+    up_dp->p_op_cnt++;
+    return;
+  }
+
+  /* anything else: write the pending identity count (if any), then the residue pair */
+  if (up_dp->p_op_cnt > 0) {
+    sprintf(run_str,"%d",up_dp->p_op_cnt);
+  }
+  up_dp->p_op_cnt = 0;
+  sprintf(tmp_str,"%s%c%c",run_str,sp0,sp1);
+}
+
+static void
+update_code(struct dyn_string_str *align_code_dyn,
+ struct update_code_str *up_dp, int op,
+ int sim_code, unsigned char sp0, unsigned char sp1)
+{
+ char tmp_cnt[MAX_SSTR];
+ tmp_cnt[0]='\0';
+ /* record one aligned position: extend the pending run in up_dp, or flush it to align_code_dyn and start a new run */
+ /* op == 0 : match state (could involve termination codons);
+ op == 1 : deletion
+ op == 2 : insertion
+ op == 3 : *:*
+ op == 4 : mismatch state (assigned below, never passed in by this code path)
+ */
+
+ if (up_dp->btop_enc) {
+ sprintf_btop(tmp_cnt, up_dp, op, sim_code, sp0, sp1);
+ dyn_strcat(align_code_dyn, tmp_cnt);
+ return;
+ }
+
+ /* not btop_enc */
+ if (up_dp->p_op_cnt == 0) {
+ up_dp->p_op_idx = op;
+ up_dp->p_op_cnt = 1;
+ return;
+ }
+
+ if (op == 1 || op == 2) {
+ if (up_dp->p_op_idx == op) { up_dp->p_op_cnt++;}
+ else {
+ sprintf_code(tmp_cnt,up_dp, up_dp->p_op_idx,up_dp->p_op_cnt);
+ dyn_strcat(align_code_dyn, tmp_cnt);
+ up_dp->p_op_idx = op;
+ up_dp->p_op_cnt = 1;
+ }
+ }
+ else if (op==0 || op == 3) {
+ if (sp0 != '*' && sp1 != '*') { /* default case, not termination */
+ if (up_dp->show_ext) {
+ if (sim_code != M_IDENT) { op = 4;}
+ }
+ }
+ else { /* have a termination codon, output for !SHOW_CODE_CIGAR */
+ if (!up_dp->cigar_order) {
+ if (sp0 == '*' || sp1 == '*') { op = 3;}
+ }
+ else if (up_dp->show_ext && (sp0 != sp1)) { op = 4;}
+ }
+
+ if (op != up_dp->p_op_idx) {
+ sprintf_code(tmp_cnt,up_dp, up_dp->p_op_idx,up_dp->p_op_cnt);
+ dyn_strcat(align_code_dyn, tmp_cnt);
+ up_dp->p_op_idx = op;
+ up_dp->p_op_cnt = 1;
+ }
+ else {
+ up_dp->p_op_cnt++;
+ }
+ }
+}
+
+/* build an array of match/ins/del - length strings */
+/* 5-June-2014 - modified to split "match" encoding into identical (=)
+ and mismatch (X)
+
+ To support domain-based scoring, this function iterates through
+ every aligned position, including insertions and deletions (which
+ are encoded as runs).
+ */
+
+int calc_code(const unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1,
+ struct a_struct *aln,
+ struct a_res_str *a_res,
+ struct pstruct *ppst,
+ struct dyn_string_str *align_code_dyn,
+ /* char *al_str, int al_str_n, */
+ const unsigned char *ann_arr,
+ const unsigned char *aa0a,
+ const struct annot_str *annot0_p,
+ const unsigned char *aa1a,
+ const struct annot_str *annot1_p,
+ struct dyn_string_str *annot_code_dyn,
+ int *score_delta,
+ struct f_struct *f_str,
+ void *pstat_void,
+ int display_code)
+{ /* thin wrapper: forwards to calc_cons_u() with mode CALC_CODE; no display buffers are supplied (NULL) */
+ int nc; /* only supplies storage for the nc pointer argument; not read afterwards */
+
+ return calc_cons_u(
+ aa0, n0, aa1, n1,
+ a_res, ppst, f_str, pstat_void,
+ ann_arr, aa0a, annot0_p, aa1a, annot1_p, CALC_CODE,
+ display_code,
+ &nc, NULL, NULL, NULL, NULL,
+ NULL, NULL, aln, score_delta, annot_code_dyn,
+ align_code_dyn
+ );
+}
+
+/* calc_id never looks at domains or features, only variation */
+
+int calc_id(const unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1,
+ struct a_struct *aln,
+ struct a_res_str *a_res,
+ struct pstruct *ppst,
+ const struct annot_str *annot0_p,
+ const struct annot_str *annot1_p,
+ int *score_delta,
+ struct dyn_string_str *annot_var_dyn,
+ struct f_struct *f_str)
+{ /* thin wrapper: forwards to calc_cons_u() with mode CALC_ID; pstat_void, ann_arr, aa0a and aa1a are passed as NULL */
+ int nc; /* only supplies storage for the nc pointer argument; not read afterwards */
+
+ return calc_cons_u(
+ aa0, n0, aa1, n1,
+ a_res, ppst, f_str, NULL,
+ NULL, NULL, annot0_p, NULL, annot1_p, CALC_ID, 0,
+ &nc, NULL, NULL, NULL, NULL,
+ NULL, NULL, aln, score_delta, annot_var_dyn,
+ NULL
+ );
+}
+
+/* calc_idd makes the same call as calc_id, but passes CALC_ID_DOM instead of CALC_ID */
+
+int calc_idd(const unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1,
+ struct a_struct *aln,
+ struct a_res_str *a_res,
+ struct pstruct *ppst,
+ const struct annot_str *annot0_p,
+ const struct annot_str *annot1_p,
+ int *score_delta,
+ struct dyn_string_str *annot_var_dyn,
+ struct f_struct *f_str)
+{ /* thin wrapper: forwards to calc_cons_u() with mode CALC_ID_DOM; pstat_void, ann_arr, aa0a and aa1a are passed as NULL */
+ int nc; /* only supplies storage for the nc pointer argument; not read afterwards */
+
+ return calc_cons_u(
+ aa0, n0, aa1, n1,
+ a_res, ppst, f_str, NULL,
+ NULL, NULL, annot0_p, NULL, annot1_p, CALC_ID_DOM, 0,
+ &nc, NULL, NULL, NULL, NULL,
+ NULL, NULL, aln, score_delta, annot_var_dyn,
+ NULL
+ );
+}
diff --git a/src/cal_consf.c b/src/cal_consf.c
new file mode 100644
index 0000000..2adbea2
--- /dev/null
+++ b/src/cal_consf.c
@@ -0,0 +1,591 @@
+/* cal_consf.c - routines for printing translated alignments for [t]fast[sf] */
+
+/* copyright (c) 1998, 1999, 2007, 2014 by William R. Pearson and The
+ Rector & Visitors of the University of Virginia */
+
+/* $Id: cal_consf.c 1263 2014-06-25 10:40:39Z wrp $ */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+/* removed from dropfs2.c, dropff2.c April, 2007 */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+
+#include "defs.h"
+#include "param.h"
+
+#include "tatstats.h"
+
+#include "a_mark.h"
+
+#define DROP_INTERN
+#include "drop_func.h"
+
+void update_code(struct dyn_string_str *align_code_dyn, int op, int op_cnt, int fnum, int show_code);
+extern void aancpy(char *to, char *from, int count, const struct pstruct *ppst);
+
+int
+calc_cons_a(const unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1,
+ int *nc,
+ struct a_struct *aln,
+ struct a_res_str *a_res,
+ struct pstruct *ppst,
+ char *seqc0, char *seqc1, char *seqca, int *cumm_seq_score,
+ const unsigned char *ann_arr,
+ const unsigned char *aa0a, const struct annot_str *annot0_p, char *seqc0a,
+ const unsigned char *aa1a, const struct annot_str *annot1_p, char *seqc1a,
+ int *score_delta,
+ struct dyn_string_str *annot_var_dyn,
+ struct f_struct *f_str,
+ void *pstat_void)
+{ /* fills seqc0/seqc1/seqca (and seqc0a/seqc1a when annotated) for display; stores lenc-len_gap in *nc and returns mins+lenc+nd */
+ int i0, i1, nn1, n0t;
+ int op, lenc, len_gap, nd, ns, itmp, p_ac, fnum, o_fnum;
+ int *i_spa;
+ const unsigned char *aa1p;
+ const unsigned char *aa0ap;
+ char *sp0, *sp0a, *sp1, *sp1a, *spa;
+ int *rp;
+ int mins, smins, ntmp;
+ int have_ann = 0;
+
+ *score_delta = 0;
+ NULL_dyn_string(annot_var_dyn);
+ have_ann = (seqc0a != NULL && (aa0a != NULL || aa1a != NULL));
+
+#ifndef TFAST
+ aa1p = aa1;
+ nn1 = n1;
+#else
+ aa1p = f_str->aa1x;
+ nn1 = f_str->n10;
+#endif
+
+ aln->amin0 = a_res->min0 + f_str->aa0t_off;
+ aln->amax0 = a_res->max0 + f_str->aa0t_off;;
+ aln->amin1 = a_res->min1;
+ aln->amax1 = a_res->max1;
+ aln->calc_last_set = 1;
+
+ /* first fill in the ends */
+ n0 -= (f_str->nm0-1);
+
+ if (min(a_res->min0,a_res->min1)<aln->llen || aln->showall==1)
+ /* will we show all the start ?*/
+ if (a_res->min0>=a_res->min1) { /* aa0 extends more to left */
+ smins=0;
+ if (aln->showall==1) mins=a_res->min0;
+ else mins = min(a_res->min0,aln->llen/2);
+ aancpy(seqc0,(char *)f_str->aa0t+a_res->min0-mins,mins,ppst);
+ aln->smin0 = a_res->min0-mins;
+ if ((mins-a_res->min1)>0) {
+ memset(seqc1,' ',mins-a_res->min1);
+ aancpy(seqc1+mins-a_res->min1,(char *)aa1p,a_res->min1,ppst);
+ aln->smin1 = 0;
+ }
+ else {
+ aancpy(seqc1,(char *)aa1p+a_res->min1-mins,mins,ppst);
+ aln->smin1 = a_res->min1-mins;
+ }
+ }
+ else {
+ smins=0;
+ if (aln->showall == 1) mins=a_res->min1;
+ else mins = min(a_res->min1,aln->llen/2);
+ aancpy(seqc1,(char *)(aa1p+a_res->min1-mins),mins,ppst);
+ aln->smin1 = a_res->min1-mins;
+ if ((mins-a_res->min0)>0) {
+ memset(seqc0,' ',mins-a_res->min0);
+ aancpy(seqc0+mins-a_res->min0,(char *)f_str->aa0t,a_res->min0,ppst);
+ aln->smin0 = 0;
+ }
+ else {
+ aancpy(seqc0,(char *)f_str->aa0t+a_res->min0-mins,mins,ppst);
+ aln->smin0 = a_res->min0-mins;
+ }
+ }
+ else {
+ mins= min(aln->llen/2,min(a_res->min0,a_res->min1));
+ smins=mins;
+ aln->smin0=a_res->min0;
+ aln->smin1=a_res->min1;
+ aancpy(seqc0,(char *)f_str->aa0t+a_res->min0-mins,mins,ppst);
+ aancpy(seqc1,(char *)aa1p+a_res->min1-mins,mins,ppst);
+ }
+
+ memset(seqca,M_BLANK,mins);
+ if (have_ann) {
+ /* pad annotation before alignment - this strategy means no
+ annotation before alignment */
+ memset(seqc0a,' ', mins);
+ memset(seqc1a,' ', mins);
+ }
+
+/* now get the middle */
+
+ spa = seqca+mins;
+ if (cumm_seq_score) i_spa = cumm_seq_score+mins;
+ sp0 = seqc0+mins;
+ sp0a = seqc0a+mins;
+ sp1 = seqc1+mins;
+ sp1a = seqc1a+mins;
+ rp = a_res->res;
+ n0t=lenc=len_gap=aln->nident=aln->nmismatch=aln->nsim=aln->npos=aln->ngap_q=aln->ngap_l=op=p_ac= 0;
+ i0 = a_res->min0;
+ i1 = a_res->min1;
+
+ /* op is the previous "match/insert" operator; *rp is the current
+ operator or repeat count */
+
+#if defined(FASTS) || defined(FASTM)
+ o_fnum = f_str->aa0ti[i0];
+ if (aa0a) aa0ap = &aa0a[f_str->nmoff[o_fnum]+i0];
+#endif
+
+ while (i0 < a_res->max0 || i1 < a_res->max1) {
+#if defined(FASTS) || defined(FASTM)
+ fnum = f_str->aa0ti[i0];
+#endif
+
+ if (op == 0 && *rp == 0) { /* previous was match (or start), current is match */
+
+#if defined(FASTS) || defined(FASTM)
+ if (p_ac == 0) { /* previous code was a match */
+ if (fnum != o_fnum) { /* continuing a match, but with a different fragment */
+ if (have_ann) { aa0ap = &aa0a[f_str->nmoff[fnum]];}
+ o_fnum = fnum;
+ }
+ }
+ else {
+ p_ac = 0; o_fnum = fnum = f_str->aa0ti[i0];
+ if (have_ann) {aa0ap = &aa0a[f_str->nmoff[fnum]];}
+ }
+#endif
+ op = *rp++; /* get the next match/insert operator */
+
+ /* get the alignment symbol */
+ if ((itmp=ppst->pam2[0][f_str->aa0t[i0]][aa1p[i1]])<0) { *spa = M_NEG; }
+ else if (itmp == 0) { *spa = M_ZERO;}
+ else {*spa = M_POS;}
+ if (*spa == M_POS) { aln->npos++;}
+ if (*spa == M_ZERO || *spa == M_POS) { aln->nsim++;}
+
+ if (cumm_seq_score) *i_spa++ += itmp;
+
+ *sp0 = ppst->sq[f_str->aa0t[i0++]]; /* get the residues for the consensus */
+
+ if (have_ann) {
+ if (aa0a) {*sp0a++ = ann_arr[*aa0ap++];}
+ else {*sp0a++ = ' ';}
+ if (aa1a) {*sp1a++ = ann_arr[aa1a[i1]];}
+ else {*sp1a++ = ' ';}
+ }
+
+ *sp1 = ppst->sq[aa1p[i1++]];
+ n0t++;
+ lenc++;
+ if (toupper(*sp0) == toupper(*sp1)) {aln->nident++; *spa = M_IDENT;}
+ else {aln->nmismatch++;}
+ sp0++; sp1++; spa++;
+ }
+ else { /* either op != 0 (previous was insert) or *rp != 0
+ (current is insert) */
+ if (op==0) { op = *rp++;} /* previous was match, start insert */
+ /* previous was insert - count through gap */
+#if defined(FASTS) || defined(FASTM)
+ if (p_ac != 1) {
+ p_ac = 1;
+ fnum = f_str->aa0ti[i0];
+ }
+#endif
+ if (have_ann) {
+ *sp0a++ = ' ';
+ if (aa1a) {*sp1a++ = ann_arr[aa1a[i1]];}
+ else {*sp1a++ = ' ';}
+ }
+ *sp0++ = '-';
+ *sp1++ = ppst->sq[aa1p[i1++]];
+ *spa++ = M_DEL;
+ op--;
+ len_gap++;
+ lenc++;
+ }
+ } /* end alignment while() */
+
+ if (have_ann) {*sp0a = *sp1a = '\0';}
+ *spa = '\0';
+ *nc = lenc-len_gap;
+
+ /* now we have the middle, get the right end */
+
+ /* ns should be the length of alignment display in seqc0/0a/1/1a */
+ ns = mins + lenc + aln->llen;
+ /* adjust for the last line */
+ ns -= (itmp = ns %aln->llen);
+ /* add another full line */
+ if (itmp>aln->llen/2) ns += aln->llen;
+ /* the amount to display at the end, now (after lenc) */
+ nd = ns - (mins+lenc);
+ if (nd > max(n0t-a_res->max0,nn1-a_res->max1)) nd = max(n0t-a_res->max0,nn1-a_res->max1);
+
+ if (aln->showall==1) {
+ nd = max(n0t-a_res->max0,nn1-a_res->max1); /* reset for showall=1 */
+ /* get right end */
+ aancpy(seqc0+mins+lenc,(char *)f_str->aa0t+a_res->max0,n0t-a_res->max0,ppst);
+ aancpy(seqc1+mins+lenc,(char *)aa1p+a_res->max1,nn1-a_res->max1,ppst);
+ /* fill with blanks - this is required to use one 'nc' */
+ memset(seqc0+mins+lenc+n0t-a_res->max0,' ',nd-(n0t-a_res->max0));
+ memset(seqc1+mins+lenc+nn1-a_res->max1,' ',nd-(nn1-a_res->max1));
+ }
+ else {
+ if ((nd-(n0t-a_res->max0))>0) {
+ /* finish copying out the sequence */
+ aancpy(seqc0+mins+lenc,(char *)f_str->aa0t+a_res->max0,
+ n0t-a_res->max0,ppst);
+ /* add blanks to pad */
+ memset(seqc0+mins+lenc+n0t-a_res->max0,' ',nd-(n0t-a_res->max0));
+ }
+ else {
+ /* just use up some sequence */
+ aancpy(seqc0+mins+lenc,(char *)f_str->aa0t+a_res->max0,nd,ppst);
+ }
+ if ((nd-(nn1-a_res->max1))>0) {
+ /* finish copying out the sequence */
+ aancpy(seqc1+mins+lenc,(char *)aa1p+a_res->max1,nn1-a_res->max1,ppst);
+ /* add blanks to pad */
+ memset(seqc1+mins+lenc+nn1-a_res->max1,' ',nd-(nn1-a_res->max1));
+ }
+ else {
+ /* just use up some sequence */
+ aancpy(seqc1+mins+lenc,(char *)aa1p+a_res->max1,nd,ppst);
+ }
+ }
+ if (have_ann) {
+ /* also pad the annotation -- this strategy means no annotations
+ in the unaligned region*/
+ memset(seqc0a+mins+lenc,' ',nd);
+ memset(seqc1a+mins+lenc,' ',nd);
+ /*
+ ntmp = nd-(n0t-a_res->max0);
+ if (ntmp > 0) memset(seqc0a+mins+lenc+n0-a_res->max0,' ',ntmp);
+ ntmp = nd-(nn1-a_res->max1);
+ if (ntmp > 0) memset(seqc1a+mins+lenc+nn1-a_res->max1,' ',ntmp);
+ */
+ }
+ /* NOTE(review): the next line overwrites every aln->smin0 value assigned in the left-end code above -- confirm this is intended */
+ aln->smin0 = f_str->aa0t_off;
+
+ return mins+lenc+nd;
+}
+
+void
+calc_astruct(struct a_struct *aln_p, struct a_res_str *a_res_p, struct f_struct *f_str) {
+ /* copy the alignment boundaries from a_res_p into aln_p; amin0/amax0 are shifted by f_str->aa0t_off */
+ /* we do not pay attention to aln_p->calc_last_set, because all the
+ functions (calc_astruct, calc_cons_a, calc_code) use exactly the same
+ assignment */
+
+ aln_p->amin0 = a_res_p->min0 + f_str->aa0t_off;
+ aln_p->amax0 = a_res_p->max0 + f_str->aa0t_off;
+ aln_p->amin1 = a_res_p->min1;
+ aln_p->amax1 = a_res_p->max1;
+}
+
+/* build an array of match/ins/del - length strings */
+int
+calc_code(const unsigned char *aa0, const int n0,
+ const unsigned char *aa1, const int n1,
+ struct a_struct *aln,
+ struct a_res_str *a_res,
+ struct pstruct *ppst,
+ struct dyn_string_str *align_code_dyn,
+ const unsigned char *ann_arr,
+ const unsigned char *aa0a,
+ const struct annot_str *annot0_p,
+ const unsigned char *aa1a,
+ const struct annot_str *annot1_p,
+ struct dyn_string_str *annot_code_dyn,
+ int *score_delta,
+ struct f_struct *f_str,
+ void *pstat_void,
+ int display_code)
+{ /* walks a_res->res, appending run codes to align_code_dyn via update_code() and annotation entries to annot_code_dyn; returns lenc - len_gap */
+ int i0, i1, nn1;
+ int op, lenc, len_gap;
+ int p_ac, op_cnt;
+ const unsigned char *aa1p;
+ const unsigned char *aa0ap;
+ char tmp_cnt[20];
+ char sp0, sp1, spa;
+ unsigned char *sq;
+ int *rp;
+ int mins, smins;
+ int o_fnum,fnum = 0;
+
+ int have_ann = 0;
+ char ann_ch0, ann_ch1;
+ char tmp_astr[MAX_STR];
+ int sim_code;
+ int show_code, annot_fmt;
+ char *sim_sym= aln_map_sym[MX_ACC];
+
+ *score_delta = 0;
+
+ show_code = (display_code & SHOW_CODE_MASK);
+ annot_fmt = 2;
+ if (display_code & SHOW_ANNOT_FULL) {
+ annot_fmt = 1;
+ }
+
+ if (aa0a != NULL && aa1a != NULL) { have_ann = 2;}
+ else if (aa0a != NULL || aa1a != NULL) { have_ann = 1;}
+ else {have_ann = 0;}
+
+ if (ppst->ext_sq_set) {sq = ppst->sqx;}
+ else {sq = ppst->sq;}
+
+#ifndef TFAST
+ aa1p = aa1;
+ nn1 = n1;
+#else
+ aa1p = f_str->aa1x;
+ nn1 = f_str->n10;
+#endif
+
+ /*
+ aln->amin0 = a_res->min0;
+ aln->amin1 = a_res->min1;
+ aln->amax0 = a_res->max0;
+ aln->amax1 = a_res->max1;
+ */
+
+ rp = a_res->res;
+ lenc = len_gap =aln->nident=aln->nmismatch=aln->nsim=aln->npos=aln->ngap_q=aln->ngap_l=aln->nfs=op=p_ac = 0;
+ op_cnt = 0;
+
+ i0 = a_res->min0; /* start in aa0 (f_str->aa0t) */
+ i1 = a_res->min1; /* start in aa1 */
+ tmp_cnt[0]='\0';
+
+#if defined(FASTS) || defined(FASTM)
+ o_fnum = f_str->aa0ti[i0]+1;
+ if (aa0a) { aa0ap = &aa0a[f_str->nmoff[o_fnum]+i0]; }
+#endif
+
+ while (i0 < a_res->max0 || i1 < a_res->max1) {
+ fnum = f_str->aa0ti[i0]+1;
+ if (op == 0 && *rp == 0) { /* previous was match, this is match */
+#if defined(FASTS) || defined(FASTM)
+ if (p_ac == 0) { /* previous code was a match */
+ if (fnum == o_fnum) { op_cnt++; }
+ else { /* continuing a match, but with a different fragment */
+ update_code(align_code_dyn, p_ac, op_cnt, o_fnum, show_code);
+ if (have_ann) aa0ap = &aa0a[f_str->nmoff[fnum]];
+ o_fnum = fnum;
+ op_cnt=1;
+ }
+ }
+ else {
+ update_code(align_code_dyn,p_ac,op_cnt,o_fnum, show_code);
+ op_cnt = 1; p_ac = 0; o_fnum = fnum = f_str->aa0ti[i0] + 1;
+ if (have_ann) {aa0ap = &aa0a[f_str->nmoff[fnum]];}
+ }
+#endif
+ op = *rp++;
+ lenc++;
+ sim_code = M_NEG;
+ if (ppst->pam2[0][f_str->aa0t[i0]][aa1p[i1]]>0) {
+ sim_code = M_POS;
+ aln->npos++;
+ aln->nsim++;
+ }
+ else if (ppst->pam2[0][f_str->aa0t[i0]][aa1p[i1]]==0) {
+ sim_code = M_ZERO;
+ aln->nsim++;
+ }
+
+ sp0 = ppst->sq[f_str->aa0t[i0]];
+ sp1 = ppst->sq[aa1p[i1]];
+ if (toupper(sp0) == toupper(sp1)) {
+ sim_code = M_IDENT;
+ aln->nident++;
+ }
+ else {aln->nmismatch++;}
+
+ /* check for an annotation */
+ if (have_ann) {
+ ann_ch0 = ann_ch1 = '\0';
+ if (have_ann == 2 && (ann_arr[*aa0ap] != ' ' || ann_arr[aa1a[i1]] != ' ')) {
+ ann_ch0 = ann_arr[*aa0ap];
+ if (ann_ch0 == ' ') ann_ch0 = 'X';
+ ann_ch1 = ann_arr[aa1a[i1]];
+ if (ann_ch1 == ' ') ann_ch1 = 'X';
+ }
+ else if (aa0a != NULL && ann_arr[*aa0ap]!=' ') {
+ ann_ch0 = ann_arr[*aa0ap];
+ ann_ch1 = 'X';
+ }
+ else if (aa1a != NULL && ann_arr[aa1a[i1]]!=' ') {
+ ann_ch0 = 'X';
+ ann_ch1 = ann_arr[aa1a[i1]];
+ }
+ aa0ap++;
+ if (ann_ch0) {
+ sprintf(tmp_astr, "|%ld:%ld:%c%c:%c%c%c",
+ aln->q_offset+i0+1,aln->l_offset+i1+1,
+ ann_ch0,ann_ch1,sim_sym[sim_code],sp0,sp1);
+ /* strncat(ann_str, tmp_astr, ann_str_n - strlen(ann_str) - 1); */
+ dyn_strcat(annot_code_dyn, tmp_astr);
+ }
+ }
+ i0++;
+ i1++;
+ }
+ else {
+ if (op==0) op = *rp++;
+ if (p_ac == 1) { op_cnt++;}
+ else {
+ update_code(align_code_dyn,p_ac,op_cnt,o_fnum, show_code);
+#if defined(FASTS) || defined(FASTM)
+ p_ac = 1; /* NOTE(review): p_ac only becomes 1 when FASTS or FASTM is defined */
+ fnum = f_str->aa0ti[i0];
+#endif
+ op_cnt = 1; fnum = f_str->aa0ti[i0] + 1; /* replaces the fnum value assigned inside the #if just above */
+ }
+ op--; lenc++; i1++; len_gap++;
+ }
+ }
+ update_code(align_code_dyn,p_ac,op_cnt,o_fnum, show_code);
+
+ return lenc - len_gap;
+}
+
+/* update_code(): if "op" == 0, this is the end of a match of length
+ "op_cnt" involving fragment "fnum"
+ otherwise, this is a deletion (op==1) or insertion (op==2)
+*/
+
+void
+update_code(struct dyn_string_str *align_code_dyn, int op, int op_cnt, int fnum, int show_code) {
+  /* append one run (operation op, length op_cnt) to align_code_dyn; a zero-length run appends nothing */
+  char align_char[4]={"=-+"};	/* legacy symbols, indexed by op */
+  char cigar_char[4]={"MDI"};	/* CIGAR symbols, indexed by op */
+  char tmp_cnt[32];	/* "%c%d[%d]" can need 26 bytes with full-width ints; the former 20 could overflow */
+
+  if (op_cnt == 0) return;
+
+  if (show_code == SHOW_CODE_CIGAR) {
+    sprintf(tmp_cnt,"%d%c",op_cnt,cigar_char[op]);
+  }
+  else {
+    if (op == 0)	/* match runs also carry the fragment number */
+      sprintf(tmp_cnt,"%c%d[%d]",align_char[op],op_cnt,fnum);
+    else
+      sprintf(tmp_cnt,"%c%d",align_char[op],op_cnt);
+  }
+  dyn_strcat(align_code_dyn, tmp_cnt);
+}
+
+int
+calc_id(const unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1,
+ struct a_struct *aln,
+ struct a_res_str *a_res,
+ struct pstruct *ppst,
+ const struct annot_str *annot0_p,
+ const struct annot_str *annot1_p,
+ int *score_delta,
+ struct dyn_string_str *annot_var_dyn,
+ struct f_struct *f_str)
+{ /* walks a_res->res and fills aln->nident/nmismatch/nsim/npos; returns lenc-len_gap */
+ int i0, i1, nn1;
+ int op, lenc, len_gap;
+ const unsigned char *aa1p;
+ int sp0, sp1;
+ int *rp;
+ int mins, smins;
+
+ *score_delta = 0;
+ NULL_dyn_string(annot_var_dyn);
+
+#ifndef TFAST
+ aa1p = aa1;
+ nn1 = n1;
+#else
+ aa1p = f_str->aa1x;
+ nn1 = f_str->n10;
+#endif
+
+ aln->amin0 = a_res->min0 + f_str->aa0t_off;
+ aln->amax0 = a_res->max0 + f_str->aa0t_off;
+ aln->amin1 = a_res->min1;
+ aln->amax1 = a_res->max1;
+ aln->calc_last_set = 1;
+
+ /* first fill in the ends */
+ n0 -= (f_str->nm0-1); /* NOTE(review): n0 is not read again in this function */
+
+ /* now get the middle */
+ rp = a_res->res;
+ lenc=len_gap=aln->nident=aln->nmismatch=aln->nsim=aln->npos=aln->ngap_q = aln->ngap_l = aln->nfs = op = 0;
+ i0 = a_res->min0;
+ i1 = a_res->min1;
+
+ while (i0 < a_res->max0 || i1 < a_res->max1) {
+ if (op == 0 && *rp == 0) { /* aligned pair: advance both sequences */
+ op = *rp++;
+
+ if (ppst->pam2[0][f_str->aa0t[i0]][aa1p[i1]]>0) {
+ aln->nsim++;
+ aln->npos++;
+ }
+ else if (ppst->pam2[0][f_str->aa0t[i0]][aa1p[i1]]==0) {
+ aln->nsim++;
+ }
+
+ sp0 = ppst->sq[f_str->aa0t[i0++]];
+ sp1 = ppst->sq[aa1p[i1++]];
+ lenc++;
+ if (toupper(sp0) == toupper(sp1)) aln->nident++;
+ else {aln->nmismatch++;}
+ }
+ else { /* gap run: only i1 advances, op counts down the remaining run length */
+ if (op==0) { op = *rp++;}
+ i1++;
+ op--;
+ len_gap++;
+ lenc++;
+ }
+ }
+ return lenc-len_gap;
+}
+
+int
+calc_idd(const unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1,
+ struct a_struct *aln,
+ struct a_res_str *a_res,
+ struct pstruct *ppst,
+ const struct annot_str *annot0_p,
+ const struct annot_str *annot1_p,
+ int *score_delta,
+ struct dyn_string_str *annot_var_dyn,
+ struct f_struct *f_str)
+{ /* in this file calc_idd() simply forwards every argument, unchanged, to calc_id() */
+ return calc_id(aa0,n0,aa1,n1,aln, a_res, ppst, annot0_p, annot1_p, score_delta, annot_var_dyn, f_str);
+}
diff --git a/src/comp_lib9.c b/src/comp_lib9.c
new file mode 100644
index 0000000..1f894b5
--- /dev/null
+++ b/src/comp_lib9.c
@@ -0,0 +1,3052 @@
+/* $Id: comp_lib9.c 1291 2014-08-28 18:32:58Z wrp $ */
+
+/* copyright (c) 1996, 1997, 1998, 1999, 2002, 2014 by William R. Pearson
+ and The Rector & Visitors of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+/*
+ * Jan 17, 2007 - remove #ifdef PRSS - begin better statistics in place
+ * for small libraries, related libraries
+ *
+ * Concurrent read version
+ *
+ * Feb 20, 1998 modifications for prss3
+ *
+ * December, 1998 - DNA searches are now done with forward and reverse
+ * strands
+ */
+
+/* (22-Jan-2011) comp_lib7.c is an extension of comp_lib6.c. Both
+ programs read the library in its entirety before doing the search.
+ comp_lib7.c provides full seqr_chain buffers of results, while
+ comp_lib6.c works with individual seq_records
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <time.h>
+
+#include <limits.h>
+#include <float.h>
+#include <math.h>
+
+#ifdef UNIX
+#include <unistd.h>
+#include <sys/types.h>
+#include <signal.h>
+#endif
+
+#ifdef MPI_SRC
+#include "mpi.h"
+#endif
+
+#include "defs.h"
+
+#include "structs.h" /* mngmsg, libstruct */
+#include "mm_file.h"
+#include "best_stats.h" /* defines beststr */
+
+#include "thr_buf_structs.h"
+#include "drop_func.h"
+
+#define XTERNAL
+#include "uascii.h"
+
+char *mp_verstr="";
+
+/********************************/
+/* extern variable declarations */
+/********************************/
+extern int fa_max_workers;
+extern char *prog_func; /* function label */
+extern char *verstr, *iprompt0, *iprompt1, *iprompt2, *refstr;
+
+/********************************/
+/*extern function declarations */
+/********************************/
+struct lmf_str *open_lib(struct lib_struct *lib_p, int dnaseq, int *sascii, int quiet);
+
+void
+close_lib_list(struct lib_struct *lib_list_p, int free_flag, int mm_force);
+
+int closelib(struct lmf_str *m_fptr, int force);
+
+void *my_srand();
+unsigned int my_nrand(int, void *);
+
+struct seqr_chain *
+new_seqr_chain(int max_chain_seqs, int aa1b_size, struct seqr_chain *old_chain, int maxn, long *lost_memK, int alloc_buf_flg);
+void end_seqr_chain(struct seqr_chain *last_seqr);
+void free_seqr_chain(struct seqr_chain *this_seqr);
+
+struct getlib_str *
+init_getlib_info(struct lib_struct *lib_list_p, int maxn, long max_memK);
+/* void free_getlib_info(struct getlib_str *); */
+
+struct seqr_chain *
+next_seqr_chain(const struct mng_thr *m_bufi, struct getlib_str *getlib_info,
+ struct buf_head *lib_bhead_p,
+ struct mngmsg *m_msp, const struct pstruct *ppst);
+
+struct seq_record *
+next_sequence_p(struct mseq_record **cur_mseq_p, struct seq_record *prev_seq_p,
+ struct seqr_chain *cur_seqr_chain, int maxn);
+
+void reset_seqr_chain(struct seqr_chain *seqr_base);
+
+void
+seqr_chain_work(unsigned char **aa0, unsigned char *aa0s,
+ struct buf_head *lib_bhead_p, struct getlib_str *getlib_info,
+ const struct mng_thr *m_bufi_p, struct mngmsg *m_msp, struct pstruct *ppst,
+ void *pstat_void, struct db_str *ldb, struct hist_str *histp, struct score_count_s *s_info,
+#if !defined(COMP_THR) && !defined(PCOMPLIB)
+ unsigned char *aa1shuff, void *f_str, void *qf_str,
+#endif
+ struct seq_record *best_seqs, struct mseq_record *best_mseqs, struct beststr *best,
+ FILE *fdata);
+
+void init_aa0(unsigned char **aa0, int n0, int nm0,
+ unsigned char **aa0s, unsigned char **aa1s,
+ int qframe, int qshuffle_flg, int max_tot,
+ struct pstruct *ppst, void **f_str, void **qf_str,
+ void *my_rand_state);
+
+extern int ann_scan(unsigned char *, int, unsigned char **, int);
+extern int get_annot(char *sname, struct mngmsg *m_msp, char *bline, long q_offset, int n1,
+ struct annot_str **annot_p, int target, int debug);
+extern int scanseq(unsigned char *seq, int n, char *str);
+extern void re_ascii(int *qascii, int *sascii, int max_ann_arr);
+extern int recode(unsigned char *seq, int n, int *qascii, int nsq);
+extern void revcomp(unsigned char *seq, int n, int *c_nt);
+
+extern void init_ascii(int is_ext, int *sascii, int nsq, int is_dna);
+extern void validate_novel_aa(int *sascii, int nsq, int is_dna);
+extern void qshuffle(unsigned char *aa0, int n0, int nm0, void *);
+extern void free_pam2p(int **);
+
+#ifdef DEBUG
+void check_rbuf(struct buf_head *cur_buf);
+int check_seq_range(unsigned char *aa1b, int n1, int nsq, char *);
+unsigned long adler32(unsigned long, const unsigned char *, unsigned int);
+#endif
+
+/* initialize environment (doinit.c) */
+extern void initenv (int argc, char **argv, struct mngmsg *m_msg,
+ struct pstruct *ppst, unsigned char **aa0);
+
+void fset_vars(struct mngmsg *m_msp, struct pstruct *ppst);
+
+/* print timing information */
+extern void ptime (FILE *, long);
+
+#ifdef COMP_MLIB
+#define QGETLIB (q_file_p->getlib)
+#endif
+
+#define GETLIB (m_file_p->getlib)
+
+int samp_stats_idx (int *pre_nstats, int nstats, void *rand_state);
+
+void
+save_best(struct buf_head *lib_buf, const struct mngmsg *, struct pstruct *ppst,
+ struct db_str *, FILE *fdata, struct hist_str *, void **,
+ struct score_count_s *);
+void
+save_best2(struct buf_head *lib_buf, const struct mngmsg *, struct pstruct *ppst,
+ struct db_str *, FILE *fdata, struct hist_str *, void **,
+ struct score_count_s *);
+
+void
+save_shuf(struct buf_head *lib_buf, int nitt, int shuff_max, int score_ix,
+ struct score_count_s *);
+
+int
+save_align(struct buf_head *lib_bhead_p, struct beststr **bestp_arr);
+
+void
+init_beststats(struct beststr **best, struct beststr ***bestp_arr,
+ struct seq_record **best_seqs, struct mseq_record **best_mseqs,
+ struct stat_str **stats, struct stat_str **rstats,
+ int shuff_max, int link_flag);
+void
+preserve_seq(struct buf2_data_s *, struct seq_record *, struct mseq_record *, struct beststr *);
+
+void
+preserve_seq2(struct beststr *, struct seq_record *, struct mseq_record *, struct beststr *);
+
+void
+buf_do_work(unsigned char **aa0, int n0, struct buf_head *lib_bhead_p,
+ int max_frame, struct pstruct *ppst, void **f_str);
+void
+buf_qshuf_work(unsigned char *aa0s, int n0, struct buf_head *lib_bhead_p,
+ int max_frame, struct pstruct *ppst, void *qf_str, int score_ix);
+void
+buf_shuf_work(unsigned char **aa0, int n0, unsigned char *aa1s, struct buf_head *lib_bhead_p,
+ int max_frame, struct pstruct *ppst, void **f_str,
+ int score_ix, void *rand_state);
+void
+buf_shuf_seq(unsigned char **aa0, int n0,
+ unsigned char **aa1_shuff_b, unsigned char *aa1save, int maxn,
+ struct beststr **bestp_arr, int nbest,
+ struct pstruct *pst, struct mngmsg *m_msp,
+ struct mng_thr *m_thr_p
+#if !defined(COMP_THR) && !defined(PCOMPLIB)
+ , void **f_str
+#endif
+ , struct score_count_s *s_info);
+
+void
+buf_align_seq(unsigned char **aa0, int n0,
+ struct beststr **bestp_arr, int nbest,
+ struct pstruct *ppst, struct mngmsg *m_msp,
+ struct mng_thr *m_bufi_p
+#if !defined(COMP_THR) && !defined(PCOMPLIB)
+ , void **f_str
+#endif
+ );
+
+void
+buf_do_align(unsigned char **aa0, int n0,
+ struct buf_head *lib_bhead_p,
+ struct pstruct *ppst, const struct mngmsg *m_msp,
+ void **f_str);
+
+struct buf_head *
+alloc_comp_bufs (struct mng_thr *m_bufi_p, struct mngmsg *m_msp,
+ int ave_seq_len);
+
+/* statistics functions (prototypes only; definitions live in other source files) */
+extern int /* main() stores the return value in pst.zsflag_f and then tests *pstat_void for NULL */
+process_hist(struct stat_str *sptr, int nstats,
+ const struct mngmsg *m_msg,
+ struct pstruct *ppst,
+ struct hist_str *hist, void **pstat_void, struct score_count_s *s_info, int do_hist);
+
+extern double find_z(int score, double escore, int length, double comp,void *); /* result is stored as beststr zscore/zscore2 in main(); last arg is pstat_void/pstat_void2 */
+extern double zs_to_E(double zs,int n1, int dnaseq, long entries, struct db_str db); /* NOTE(review): presumably z-score to E() conversion; no caller visible here */
+
+void last_stats(const unsigned char *, int, /* unnamed args: main() passes aa0[0] and m_msg.n0 */
+ struct stat_str *sptr, int nstats, /* main() passes stats/nstats, or qstats/nqstats when m_msg.qshuffle is set */
+ struct beststr **bestp_arr, int nbest,
+ const struct mngmsg *m_msg, struct pstruct *ppst,
+ struct hist_str *histp, void *);
+
+int last_calc( unsigned char **aa0, unsigned char *aa1, int maxn, /* return value replaces nbest in main() */
+ struct beststr **bestp_arr, int nbest,
+ const struct mngmsg *m_msg, struct pstruct *ppst,
+ void **f_str, void *rs_str);
+
+void scale_scores(struct beststr **bestp_arr, int nbest, /* per the note at its call in main(), this also sorts bestp_arr */
+ struct db_str,struct pstruct *ppst, void *);
+
+int E1_to_s(double e_val, int n0, int n1, int db_size, void *pu);
+
+extern void pstat_info(char *, int, char *, void *);
+
+extern int shuffle(unsigned char *, unsigned char *, int, void *); /* NOTE(review): trailing void * is presumably the rand_state from my_srand() - confirm */
+extern int shuffle3(unsigned char *, unsigned char *, int, void *);
+extern int rshuffle(unsigned char *, unsigned char *, int); /* unlike its siblings, declared without a trailing void * */
+extern int wshuffle(unsigned char *, unsigned char *, int, int, void *);
+
+extern void set_db_size(int, struct db_str *, struct hist_str *);
+
+extern void /* pre-alignment; NOTE(review): no caller in the visible part of main() */
+pre_load_best(unsigned char *aa1, int maxn,struct beststr **bbp_arr, int nbest,
+ struct mngmsg *m_msp, int debug);
+
+extern char * /* run a command to produce a fasta file; main() treats a NULL return as fatal and passes the result to lib_select() */
+build_lib_db(char *script_file);
+
+extern char * /* run link file; NOTE(review): meaning of the unnamed arguments is not visible here */
+build_link_data(char **, struct mngmsg *, struct beststr **, int);
+
+/* display functions: showbest() takes mutable m_msg/ppst, showalign() takes them const */
+extern void
+showbest (FILE *fp, unsigned char **aa0, unsigned char *aa1, int maxn,
+ struct beststr **bestp_arr, int nbest,
+ int qlib, struct mngmsg *m_msg,struct pstruct *ppst,
+ struct db_str db, char **gstring2p, void **f_str); /* db is passed by value */
+
+extern void
+showalign (FILE *fp, unsigned char **aa0, unsigned char *aa1, int maxn,
+ struct beststr **bestp_arr, int nbest, int qlib,
+ const struct mngmsg *m_msg, const struct pstruct *ppst,
+ char **gstring2p, void **f_str, struct mng_thr *m_bufi_p); /* additionally takes the mng_thr buffer info */
+
+/* misc functions (trailing comments name the source file that defines each one) */
+void h_init(struct pstruct *, struct mngmsg *, char *); /* doinit.c - first initialization call in main() */
+void last_init(struct mngmsg *, struct pstruct *); /* initfa/sw.c - main(): sets up zsflag, output labels, thr_fact, shuffle params, pam matrix */
+void last_params(unsigned char *, int, struct mngmsg *, struct pstruct *); /* called once per query at the top of main()'s search loop */
+int validate_params(const unsigned char *, int, const struct mngmsg *, /* main() treats a zero return as failure and exits */
+ const struct pstruct *,
+ const int *lascii, const int *pascii);
+
+void s_abort(char *, char *); /* compacc.c - NOTE(review): callers in main() carry on as if this never returns; confirm it exits */
+
+/* initfa/sw.c */
+void resetp(struct mngmsg *, struct pstruct *); /* main(): reset algorithm parameters for the alphabet */
+
+void gettitle(char *, char *, int); /* nxgetaa.c */
+void lib_choice(char *lname, int nln, char *flstr, int ldnaseq); /* lib_sel.c - main() calls it when no library name was given and not quiet */
+struct lib_struct *
+lib_select(char *lname, char *ltitle, const char *flstr, int ldnaseq); /* lib_sel.c - main(): returns the list of files to search */
+
+void query_parm(struct mngmsg *, struct pstruct *); /* initfa/sw.c - main() calls it only when !m_msg.quiet */
+
+/* doinit.c - main() loads a markx entry into m_msg with the first and saves m_msg into a markx_str with the second */
+void markx_to_m_msp(struct mngmsg *m_msp, struct markx_str *markx);
+void m_msp_to_markx(struct markx_str *markx, struct mngmsg *m_msp);
+
+/* compacc.c */
+void print_header1(FILE *fd, const char *argv_line, /* main(): prints argv_line, program, version */
+ const struct mngmsg *m_msp, const struct pstruct *ppst);
+void print_header2(FILE *fd, int qlib, char *info_qlabel, unsigned char **aa0, /* main(): prints Query:, Annotation:, Library: */
+ const struct mngmsg *m_msp, const struct pstruct *ppst, char *info_lib_range_p);
+void print_header3(FILE *fd, int qlib, struct mngmsg *m_msp, struct pstruct *ppst);
+
+void print_header4(FILE *fd, char *info_qlabel, char *argv_line, char *info_gstring3,
+ char *info_hstring_p[2], struct mngmsg *m_msp, struct pstruct *ppst);
+void print_header4a(FILE *fd, struct mngmsg *m_msp);
+
+void print_header5(FILE *fd, int qlib, struct db_str *qtt,
+ struct mngmsg *m_msp, struct pstruct *ppst, int in_mem, long tot_memK);
+
+void print_annot_header(FILE *fd, struct mngmsg *m_msp);
+
+void prhist(FILE *, const struct mngmsg *, struct pstruct *, struct hist_str hist, /* hist and db_str are passed by value */
+ int nstats, int sstats, struct db_str, char *, char *, char **, char **, long); /* main() passes tscan-tprev as the final long */
+
+void print_sum(FILE *, struct db_str *qtt, struct db_str *ntt, int in_mem, long tot_memK);
+int reset_maxn(struct mngmsg *, int, int); /* set m_msg.maxt, maxn from maxl; main() passes a 150 overlap and stores the result in maxn */
+
+FILE *outfd; /* Output file; main() sets it to stdout, then to the fopen()ed m_msg.outfile if one is given */
+
+/* this information is global for fsigint() */
+extern long s_time(); /* fetches time */
+long tstart, tscan, tprev, tdone; /* Timing */
+#ifdef COMP_MLIB
+long ttscan, ttdisp; /* main() adds tscan-tprev to ttscan per query; NOTE(review): main() also zeroes both outside any COMP_MLIB guard - confirm COMP_MLIB is always defined */
+#endif
+time_t tdstart, tddone; /* tdstart is set from time(NULL) in main() */
+
+static struct db_str qtt = {0l, 0l, 0}; /* query totals: main() adds m_msg.n0 to length and increments entries per query */
+#ifdef DEBUG
+char ext_qtitle[MAX_STR]; /* DEBUG only: copy of m_msg.qtitle made in main() */
+#endif
+
+#if defined(COMP_THR) || defined(PCOMPLIB)
+/***************************************/
+/* thread global variable declarations */
+/***************************************/
+
+/* functions for getting/sending buffers to threads (thr_sub.c) */
+#ifndef PCOMPLIB
+extern void init_thr(int , struct thr_str *, const struct mngmsg *, struct pstruct *, /* non-PCOMPLIB form: main() passes work_info as the 2nd arg */
+ unsigned char *, struct mng_thr *m_bufi_p);
+extern void start_thr(void); /* main() calls this just before get_rbuf() in the query loop */
+#define RESULTS_BUF reader_buf
+#else /* PCOMPLIB */
+extern void init_thr(int , char *, const struct mngmsg *, struct pstruct *, unsigned char *, struct mng_thr *m_bufi_p); /* PCOMPLIB form: main() passes its info_lib_range buffer as the 2nd arg */
+extern void work_comp(int); /* main() calls this for MPI ranks > 0 (MPI_SRC build), then finalizes and exits */
+#define RESULTS_BUF worker_buf
+#endif /* !PCOMPLIB */
+extern void get_rbuf(struct buf_head **lib_buf, int max_work_buf); /* main() passes m_bufi.max_work_buf */
+extern void put_rbuf(struct buf_head *lib_buf, int max_work_buf);
+extern void wait_rbuf(int max_work_buf);
+extern void rbuf_done(int nthreads);
+extern void put_rbuf_done(int nthreads, struct buf_head *lib_buf,
+ int max_work_buf);
+#ifndef PCOMPLIB
+#undef XTERNAL /* NOTE(review): presumably makes thr_bufs2.h define rather than only declare its variables - confirm in the header */
+#include "thr_bufs2.h"
+#else /* PCOMPLIB */
+#include "pcomp_bufs.h"
+#endif /* !PCOMPLIB */
+#endif /* COMP_THR || PCOMPLIB */
+
+struct buf_head *lib_buf2_list; /* assigned from alloc_comp_bufs() in main() */
+
+/* these variables must be global for comp_thr.c so that save_best()
+ can use them */
+struct beststr **bestp_arr; /* array of pointers; main() re-points each entry at best[] for every query */
+int nbest; /* number of best scores; reset to 0 per query */
+
+struct stat_str *stats; /* array of scores for statistics from real
+ (or shuffled) sequences*/
+struct stat_str *qstats; /* array of scores for shuffled query stats; calloc()ed in main() on first use when m_msg.qshuffle is set */
+struct stat_str *rstats; /* array of scores from shuffled library; main() may realloc() it when shuff_max grows */
+
+ /* these variables are global so they can be set both by the main()
+ program and save_best() in threaded mode.
+ */
+
+int nstats, nqstats, nrstats, pre_nstats, kstats, shuff_tot, sstats; /* main() zeroes all of these except kstats at the start of each query */
+double zbestcut; /* cut off for best z-score; main() resets it to -FLT_MAX per query */
+int bestfull; /* index for selectbest() */
+int stats_done=0; /* flag for z-value processing; cleared per query, set once process_hist() has produced statistics */
+void *rand_state; /* assigned from my_srand() in main() */
+
+static int seq_index=0; /* main() resets this to 0 before the query loop */
+
+void fsigint(); /* NOTE(review): old-style empty parameter list; uses the timing globals, per the comment on them */
+
+/* **************************************************************** */
+/* start of main() program */
+/* **************************************************************** */
+int
+main (int argc, char *argv[])
+{
+ unsigned char *aa0[6], *aa0s;
+ unsigned char *aa1save; /* aa1shuff and aa1save must be distinct */
+ unsigned char *aa1shuff, *aa1shuff_b=NULL; /* for new unthreaded version */
+ char *lib_db_file;
+
+ char tmp_str[MAX_SSTR];
+ char link_title[MAX_LSTR];
+ char *link_lib_str;
+ char *link_lib_file;
+
+ double zs_off_save;
+ int n_sig;
+
+ struct a_res_str *next_ares_p, *cur_ares_p; /* used to free-up old a_res */
+
+ /* status/parameter information */
+ char info_lib_range[MAX_FN];
+ char *info_lib_range_p;
+ char info_pgm_abbr[MAX_SSTR];
+ char info_qlabel[MAX_STR];
+ char *info_gstring2p[2];
+ char info_gstring3[MAX_STR];
+ char *info_hstring_p[2];
+ char fdata_pstat_info[MAX_STR];
+
+#ifdef COMP_MLIB
+ fseek_t qseek;
+ int qlib;
+ struct lib_struct *q_lib_p;
+ struct lmf_str *q_file_p;
+ int sstart, sstop, is;
+#endif
+
+ long next_q_offset;
+ int id;
+ struct lib_struct *lib_list_p, *link_lib_list_p;
+ struct getlib_str *getlib_info, *link_getlib_info;
+
+ struct db_str link_ldb={0,0,0};
+ struct score_count_s link_s_info={0,0,0,0};
+ int utmp; /* user input tmp */
+
+ struct pstruct pst;
+ void *f_str[6], *qf_str; /* different f_str[]'s for forward,reverse */
+ int have_f_str=0;
+
+ /* these variables track buffers of library sequences */
+ struct buf_head *lib_bhead_p;
+
+ struct mng_thr m_bufi; /* has max_work_buf, max_buf2_res,
+ max_chain_seqs, nframe */
+ int ave_seq_len;
+ /* int empty_reader_bufs; */
+#ifdef COMP_THR
+ /* int t_reader_buf_readp; */
+ struct thr_str *work_info;
+#endif
+#ifdef MPI_SRC
+ int mpi_tid;
+#endif
+ /* end of library sequence buffers */
+
+ struct mngmsg m_msg; /* saves most non-param information
+ about library alignment */
+ struct markx_str markx_save; /* saves m_msg values for markx */
+ struct markx_str *cur_markx; /* follow m_msg.markx_list */
+
+ struct hist_str hist2; /* hist str for zsflag > 2 */
+ int zsflag_save; /* save zsflag > 20 */
+ char rline[MAX_FN];
+ char argv_line[MAX_LSTR];
+ int t_quiet;
+
+ int i;
+ FILE *fdata=NULL; /* file for full results */
+ struct beststr *best, *bbp; /* array of best scores */
+
+ /* save sequence meta info for sequences that are not currently available */
+ struct seq_record *best_seqs;
+ struct mseq_record *best_mseqs;
+
+ int leng; /* leng is length of the descriptive line */
+ int maxn; /* size of the library sequence examined */
+ int qlcont; /* continued query sequence */
+ char *bp; /* general purpose string ptr */
+
+ /* this is necessary because of an SGI Irix 64 issue */
+ info_gstring2p[0] = calloc(MAX_STR,sizeof(char));
+ info_gstring2p[1] = calloc(MAX_STR,sizeof(char));
+ info_hstring_p[0] = calloc(MAX_STR,sizeof(char));
+ info_hstring_p[1] = calloc(MAX_STR,sizeof(char));
+
+ if ((bp = strrchr(argv[0],'/'))!=NULL) {
+ strncpy(m_msg.pgm_name,bp+1,sizeof(m_msg.pgm_name));
+ }
+ else {
+ strncpy(m_msg.pgm_name,argv[0],sizeof(m_msg.pgm_name));
+ }
+
+ /* Initialization */
+
+ m_msg.s_info.s_cnt[0] = m_msg.s_info.s_cnt[1] =
+ m_msg.s_info.s_cnt[2] = m_msg.s_info.tot_scores = 0;
+ m_msg.ss_info.s_cnt[0] = m_msg.ss_info.s_cnt[1] =
+ m_msg.ss_info.s_cnt[2] = m_msg.ss_info.tot_scores = 0;
+
+#ifndef SHOW_HELP
+#if defined(UNIX)
+ m_msg.quiet= !isatty(1);
+#else
+ m_msg.quiet = 0;
+#endif
+#else
+ m_msg.quiet = 1;
+#endif
+
+#ifdef MPI_SRC
+ MPI_Init(&argc, &argv);
+ MPI_Comm_rank(MPI_COMM_WORLD,&mpi_tid);
+ if (mpi_tid > 0) {
+ work_comp(mpi_tid);
+ MPI_Finalize();
+ exit(0);
+ }
+#endif
+
+#ifdef PGM_DOC
+ /* document command line */
+ argv_line[0]='\0';
+ for (i=0; i<argc; i++) {
+ SAFE_STRNCAT(argv_line," ",sizeof(argv_line));
+ if (strchr(argv[i],' ')) {
+ SAFE_STRNCAT(argv_line,"\"",sizeof(argv_line));
+ SAFE_STRNCAT(argv_line,argv[i],sizeof(argv_line));
+ SAFE_STRNCAT(argv_line,"\"",sizeof(argv_line));
+ }
+ else {
+ SAFE_STRNCAT(argv_line,argv[i],sizeof(argv_line));
+ }
+ }
+ argv_line[sizeof(argv_line)-1]='\0';
+#endif
+
+ /* first initialization routine - nothing is known */
+ h_init(&pst, &m_msg, info_pgm_abbr);
+
+ m_msg.db.length = m_msg.ldb.length = qtt.length = 0l;
+ m_msg.db.entries = m_msg.db.carry =
+ m_msg.ldb.entries = m_msg.ldb.carry = qtt.entries = qtt.carry = 0;
+ m_msg.pstat_void = m_msg.pstat_void2 = NULL;
+ m_msg.hist.entries = 0;
+
+ f_str[5] = f_str[4] = f_str[3] = f_str[2] = f_str[1] = f_str[0] = NULL;
+ aa0[0] = NULL;
+ rand_state = my_srand();
+
+ /* initialize values in comp_lib9 that are thread/serial specific */
+ fset_vars(&m_msg, &pst);
+
+ /* second initialization - get command line arguments */
+ initenv (argc, argv, &m_msg, &pst, &aa0[0]);
+
+#ifndef PCOMPLIB
+#ifdef COMP_THR
+ if ((work_info=
+ (struct thr_str *)calloc(fa_max_workers,sizeof(struct thr_str)))==NULL) {
+ fprintf(stderr, " cannot allocate work_info[%d]\n",fa_max_workers);
+ exit(1);
+ }
+#else
+ fa_max_workers = 1;
+#endif
+#endif
+
+ ttscan = ttdisp = 0;
+ tstart = tscan = s_time();
+ tdstart = time(NULL);
+
+ /* Allocate space for the query and library sequences */
+ /* pad aa0[] with an extra SEQ_PAD chars for ALTIVEC padding */
+ if (aa0[0]==NULL) {
+ if ((aa0[0] = (unsigned char *)malloc((m_msg.max_tot+1+SEQ_PAD)*sizeof(unsigned char)))
+ == NULL)
+ s_abort ("Unable to allocate query sequence", "");
+ *aa0[0]=0;
+ aa0[0]++;
+ }
+ aa0[5]=aa0[4]=aa0[3]=aa0[2]=aa0[1]=aa0[0];
+
+ if ((aa1save = (unsigned char *)malloc((m_msg.max_tot+1)*sizeof (char))) == NULL) {
+ s_abort ("Unable to allocate library overlap", "");
+ }
+ *aa1save=0;
+ aa1save++;
+
+ /* print argv_line, program, version */
+ print_header1(stdout, argv_line, &m_msg, &pst);
+
+ /* get query information */
+ if (m_msg.tname[0] == '\0') {
+ if (m_msg.quiet == 1)
+ s_abort("Query sequence undefined","");
+ l1: fputs (iprompt1, stdout);
+ fflush (stdout);
+ if (fgets (m_msg.tname, MAX_FN, stdin) == NULL)
+ s_abort ("Unable to read query library name","");
+ m_msg.tname[MAX_FN-1]='\0';
+ if ((bp=strchr(m_msg.tname,'\n'))!=NULL) *bp='\0';
+ if (m_msg.tname[0] == '\0') goto l1;
+ }
+
+ /* **************************************************************** */
+ /* (1) open the query library;
+ (2) get a sequence;
+ (3) check for annotations */
+
+ /* we need a q_lib_p before opening the library */
+ if ((q_lib_p = (struct lib_struct *)calloc(1,sizeof(struct lib_struct)))==NULL) {
+ s_abort(" cannot allocate q_lib_p","");
+ }
+ else {
+ q_lib_p->file_name = m_msg.tname;
+ }
+
+ /* Open query library */
+ if ((q_file_p= open_lib(q_lib_p, m_msg.qdnaseq,qascii,!m_msg.quiet))==NULL) {
+ s_abort(" cannot open library ",m_msg.tname);
+ }
+ /* Fetch first sequence */
+ qlib = 0;
+ m_msg.q_offset = next_q_offset = 0l;
+ qlcont = 0;
+ m_msg.n0 =
+ QGETLIB (aa0[0], MAXTST, m_msg.qtitle, sizeof(m_msg.qtitle),
+ &qseek, &qlcont,q_file_p,&m_msg.q_off);
+ if ((bp=strchr(m_msg.qtitle,' '))!=NULL) *bp='\0';
+ strncpy(info_qlabel,m_msg.qtitle,sizeof(info_qlabel));
+#ifdef DEBUG
+ SAFE_STRNCPY(ext_qtitle, m_msg.qtitle,sizeof(ext_qtitle));
+#endif
+ if (bp != NULL) *bp = ' ';
+ info_qlabel[sizeof(info_qlabel)-1]='\0';
+
+ /* if annotations are included in sequence, remove them */
+ if (m_msg.ann_flg) {
+ m_msg.n0 = ann_scan(aa0[0],m_msg.n0,&m_msg.aa0a,m_msg.qdnaseq);
+ /* cannot do get_annot() here because lascii not initialized */
+ }
+
+ /* if protein and ldb_info.term_code set, add '*' if not there */
+ if (m_msg.ldb_info.term_code && !(m_msg.qdnaseq==SEQT_DNA || m_msg.qdnaseq==SEQT_RNA) &&
+ aa0[0][m_msg.n0-1]!='*') {
+ aa0[0][m_msg.n0++]='*';
+ aa0[0][m_msg.n0]=0;
+ }
+
+ /* if ends with ESS, remove terminal ESS */
+ if (aa0[0][m_msg.n0-1] == ESS) { m_msg.n0--; aa0[0][m_msg.n0]= '\0';}
+
+ /* check for subset */
+ if (q_file_p->opt_text[0]!='\0') {
+ if (q_file_p->opt_text[0]=='-') {
+ sstart=0; sscanf(&q_file_p->opt_text[1],"%d",&sstop);
+ }
+ else {
+ sstart = 0; sstop = -1;
+ sscanf(&q_file_p->opt_text[0],"%d-%d",&sstart,&sstop);
+ sstart--;
+ if (sstop <= 0 ) sstop = BIGNUM;
+ }
+
+ for (id=0,is=sstart; is<min(m_msg.n0,sstop); ) {
+ aa0[0][id++]=aa0[0][is++];
+ }
+ aa0[0][id]=0;
+ m_msg.n0 = min(m_msg.n0,sstop)-sstart;
+ m_msg.q_off += sstart;
+ }
+
+ /* check to see if query has been segmented */
+ if (qlcont) {
+ next_q_offset = m_msg.q_offset + m_msg.n0 - m_msg.q_overlap;
+ }
+ else {
+ next_q_offset = 0l;
+ }
+
+ /* this probably cannot happen any more */
+ if (m_msg.n0 > MAXTST) {
+ fprintf(stderr," sequence truncated to %d\n %s\n",MAXTST,m_msg.sqnam);
+ aa0[0][MAXTST]='\0';
+ m_msg.n0=MAXTST;
+ }
+
+ /* check for protein/DNA alphabet type */
+ if (m_msg.qdnaseq == SEQT_UNK) {
+ /* cannot change the alphabet mapping if a matrix has been set */
+ /* do automatic sequence recognition,but only for sequences > 20 residues */
+ if ( !pst.pam_set && m_msg.n0 > 20 &&
+ (float)scanseq(aa0[0],m_msg.n0,"ACGTUNacgtun")/(float)m_msg.n0 >0.85) {
+ pascii = nascii;
+ m_msg.qdnaseq = SEQT_DNA;
+ }
+ else { /* its protein */
+ pascii = aascii;
+ m_msg.qdnaseq = SEQT_PROT;
+ }
+ /* modify qascii to use encoded version
+ cannot use memcpy() because it loses annotations
+ */
+ re_ascii(qascii,pascii,strlen((char *)&m_msg.ann_arr[1]));
+ init_ascii(pst.ext_sq_set,qascii,pst.nsq,m_msg.qdnaseq);
+ validate_novel_aa(qascii, pst.nsq, m_msg.qdnaseq);
+ m_msg.n0 = recode(aa0[0],m_msg.n0,qascii, pst.nsqx);
+ }
+
+ /* check sequence length -- cannot do before now because query
+ alphabet may change */
+ if (m_msg.n0 <= 0)
+ s_abort ("Query sequence length <= 0: ", m_msg.tname);
+
+ /* reset algorithm parameters for alphabet */
+ resetp (&m_msg, &pst);
+
+#ifndef COMP_MLIB
+ gettitle(m_msg.tname,m_msg.qtitle,sizeof(m_msg.qtitle));
+ if (m_msg.tname[0]=='-' || m_msg.tname[0]=='@') {
+ strncmp(m_msg.tname,m_msg.qtitle,sizeof(m_msg.tname));
+ if ((bp=strchr(m_msg.tname,' '))!=NULL) *bp='\0';
+ }
+#endif
+
+ /* get library file names from argv[2] or by prompting */
+ if (strlen (m_msg.lname) == 0) {
+ if (m_msg.quiet == 1) s_abort("Library name undefined","");
+ lib_choice(m_msg.lname,sizeof(m_msg.lname),m_msg.flstr, m_msg.ldb_info.ldnaseq);
+ }
+
+ if (m_msg.lname[0] == '!') {
+ if ((lib_db_file = build_lib_db(&m_msg.lname[1]))==NULL) {
+ fprintf(stderr,"***[comp_lib9.c] Cannot open/execute %s script\n",&m_msg.lname[1]);
+ exit(1);
+ }
+
+ /* get a list of files to search */
+ lib_list_p = lib_select(lib_db_file, m_msg.ltitle, m_msg.flstr,
+ m_msg.ldb_info.ldnaseq);
+ }
+ else {
+ /* get a list of files to search */
+ lib_list_p = lib_select(m_msg.lname, m_msg.ltitle, m_msg.flstr, m_msg.ldb_info.ldnaseq);
+ }
+
+ /* Get additional parameters here */
+ if (!m_msg.quiet) query_parm (&m_msg, &pst);
+
+ /* set up zsflag, labels for output, thr_fact, shuffle params, pam matrix */
+ last_init(&m_msg, &pst);
+
+ /* allocate beststr best, beststr **bestp_arr, best_seqs/mseqs
+ structures for preserving seq_records, stats structures, */
+ init_beststats(&best, &bestp_arr,
+ &best_seqs, &best_mseqs,
+ &stats, &rstats, m_msg.shuff_max,
+ m_msg.link_lname[0]);
+
+#ifdef UNIX
+ /* set up signals now that input is done */
+ signal(SIGHUP,SIG_IGN);
+#endif
+
+ /* **************************************************************** */
+ /* begin setting things up for threads */
+ /* **************************************************************** */
+ /*
+ This section defines m_bufi.max_chain_seqs, the average number of entries
+ per buffer, and m_bufi.max_work_buf, the total number of buffers
+
+ Use a 2 Mbyte (DEF_WORKER_BUF) buffer for each worker. For
+ proteins, that means 5,000 sequences of length 400 (average).
+ For DNA, that means 2,000 sequences of length 1000.
+
+ To accommodate larger libraries in memory, use more buffers, not
+ bigger buffers.
+
+ Once m_bufi.max_chain_seqs/max_work_buf and ave_seq_len are set,
+ allocate all the communication buffers with alloc_comp_bufs();
+ */
+
+ if (m_msg.ldb_info.ldnaseq== SEQT_DNA) {
+ ave_seq_len = AVE_NT_LEN;
+ m_bufi.max_chain_seqs = DEF_WORKER_BUF/AVE_NT_LEN;
+ }
+ else {
+ ave_seq_len = AVE_AA_LEN;
+ m_bufi.max_chain_seqs = DEF_WORKER_BUF/AVE_AA_LEN;
+ }
+
+ m_bufi.max_chain_seqs /= m_msg.thr_fact;
+ /* make certain all buffers have at least 600 sequences */
+ m_bufi.max_chain_seqs = max(m_bufi.max_chain_seqs,600);
+
+ /* max_work_buf is the number of buffers - if the worker buffers are
+ small, then make lots more buffers */
+
+#ifdef PCOMPLIB /* PCOMPLIB -- one buffer per worker */
+ m_bufi.max_work_buf = fa_max_workers;
+#else /* !PCOMPLIB */
+ m_bufi.max_work_buf = (DEF_WORKER_BUF * 2 * fa_max_workers)/(ave_seq_len * m_bufi.max_chain_seqs);
+ if (m_bufi.max_work_buf < 2*fa_max_workers) m_bufi.max_work_buf = 2*fa_max_workers;
+ if (m_bufi.max_work_buf > 4*fa_max_workers) m_bufi.max_work_buf = 4*fa_max_workers;
+ m_bufi.max_work_buf -= (m_bufi.max_work_buf%fa_max_workers);
+#ifndef COMP_THR
+ /* if not threaded, only one (larger) buffer */
+ m_bufi.max_chain_seqs *= m_bufi.max_work_buf;
+ m_bufi.max_work_buf = 1;
+#endif /* !COMP_THR */
+#endif
+ m_bufi.max_buf2_res = m_bufi.max_chain_seqs * (m_msg.nitt1+1 - m_msg.revcomp);
+
+ /* allocate lib_buf2_lib[] and the associated data and results buffers,
+ as well as reader_buf[] and worker_buf[] */
+
+ lib_buf2_list = alloc_comp_bufs(&m_bufi, &m_msg, ave_seq_len);
+
+ /* initialization of global variables for threads/buffers */
+
+#if defined(COMP_THR) || defined(PCOMPLIB)
+#ifdef DEBUG
+ /* fprintf(stderr," max_work_buf: %d\n", m_bufi.max_work_buf); */
+#endif
+ num_reader_bufs = m_bufi.max_work_buf;
+#endif
+
+#ifdef COMP_THR
+ num_worker_bufs = 0;
+ reader_done = 0;
+ worker_buf_workp = 0;
+ worker_buf_readp = 0;
+ reader_buf_workp = 0;
+ reader_buf_readp = 0;
+
+ start_thread = 1; /* keeps threads from starting */
+#endif
+
+ /* Label the output */
+ if ((bp = (char *) strchr (m_msg.lname, ' ')) != NULL) *bp = '\0';
+ if (m_msg.ltitle[0] == '\0') {
+ strncpy(m_msg.ltitle,m_msg.lname,sizeof(m_msg.ltitle));
+ m_msg.ltitle[sizeof(m_msg.ltitle)-1]='\0';
+ }
+
+ if (m_msg.dfile[0]) {
+ fdata=fopen(m_msg.dfile,"w");
+ fprintf(fdata, "#%s\n",argv_line);
+ }
+
+ /* pre-load the library into a [m]seq_record array */
+ /* initialize outside while(1) { query loop } */
+ m_msg.db.length = 0l;
+ m_msg.db.entries = m_msg.db.carry = 0;
+
+ /* also sets ldb_info.l_overlap, use a fixed 150 residue overlap */
+ m_msg.ldb_info.maxn = maxn = reset_maxn(&m_msg, 150, m_msg.max_tot);
+ pst.maxlen = maxn;
+
+ seq_index = 0;
+
+ outfd = stdout;
+
+ /* in comp_lib9.c, getlib_info saves all the state information
+ required to read the next sequence from the library. */
+ getlib_info = init_getlib_info(lib_list_p, m_msg.ldb_info.maxn,m_msg.max_memK);
+ m_msg.cur_seqr_cnt = 0;
+
+ /* main loop for doing a search, getting the next query */
+ while(1) {
+
+ /* Initialize bestp_arr */
+ for (nbest = 0; nbest < MAX_BEST; nbest++)
+ bestp_arr[nbest] = &best[nbest];
+ nbest = 0;
+
+ qlib++;
+ stats_done = 0;
+
+ zbestcut = -FLT_MAX;
+ nstats = nrstats = pre_nstats = shuff_tot = sstats = 0;
+
+ /* ensure that link_list is NULL for no result */
+ link_lib_list_p = NULL;
+
+ /* get the last parameters */
+ last_params(aa0[0],m_msg.n0, &m_msg, &pst);
+
+ if (!validate_params(aa0[0],m_msg.n0, &m_msg, &pst,
+ lascii, pascii)) {
+ fprintf(stderr," *** ERROR *** validate_params() failed:\n -- %s\n", argv_line);
+ exit(1);
+ }
+
+ /* When approx. E()-scores are calculated (FASTS,FASTM), we still
+ need statistics structures; get them immediately. In this case,
+ find_z() must produce a z_score (large positive is good) from an
+ e_score. */
+
+ if (m_msg.escore_flg) {
+ pst.zsflag_f = process_hist(stats,nstats,&m_msg,&pst,
+ &m_msg.hist,&m_msg.pstat_void,&m_msg.s_info,0);
+ stats_done=1;
+ }
+
+#ifdef COMP_THR
+ init_thr(fa_max_workers, work_info, &m_msg, &pst, aa0[0], &m_bufi);
+#endif
+#ifdef PCOMPLIB
+ info_lib_range_p = &info_lib_range[0];
+ init_thr(fa_max_workers, info_lib_range_p, &m_msg, &pst, aa0[0], &m_bufi);
+#endif
+
+ /* always have qstats available; allocate space for shuffled query scores (if needed) */
+ if (m_msg.qshuffle && qstats==NULL) {
+ if ((qstats =
+ (struct stat_str *)calloc(m_msg.shuff_max+1,sizeof(struct stat_str)))==NULL)
+ s_abort ("Cannot allocate qstats struct","");
+ }
+ nqstats = 0;
+
+ /* format query title */
+ leng = (int)strlen(m_msg.qtitle);
+ if (!(m_msg.markx & MX_M9SUMM) && leng > m_msg.aln.llen) leng -= 10;
+ if (leng > sizeof(m_msg.qtitle)-20) leng -= 20;
+
+ if (!(m_msg.markx & MX_MBLAST2)) {
+ if (m_msg.nm0 <= 1) {
+ sprintf(tmp_str," - %d %s", m_msg.n0, m_msg.sqnam);
+ }
+ else {
+ sprintf(tmp_str," - %d %s in %d fragments", m_msg.n0 - (m_msg.nm0-1), m_msg.sqnam, m_msg.nm0);
+ }
+
+ if (strlen(tmp_str) + leng + 1> sizeof(m_msg.qtitle)) {
+ leng = sizeof(m_msg.qtitle) - strlen(tmp_str) - 1;
+ SAFE_STRNCAT((m_msg.qtitle+leng), tmp_str, sizeof(m_msg.qtitle));
+ }
+ else {SAFE_STRNCAT(m_msg.qtitle, tmp_str, sizeof(m_msg.qtitle));}
+ }
+ if (fdata) {
+ fprintf(fdata,">>>%ld %3d\t%-50s\n",qtt.entries,m_msg.n0,m_msg.qtitle);
+ }
+
+ tprev = s_time();
+
+ qtt.length += m_msg.n0;
+ qtt.entries++;
+
+#if !defined(COMP_THR) && !defined(PCOMPLIB)
+ have_f_str=1;
+
+ /* allocates aa0 arrays, calls init_work, only for non-threaded */
+ init_aa0(aa0, m_msg.n0, m_msg.nm0, &aa0s, &aa1shuff,
+ m_msg.qframe, m_msg.qshuffle, m_msg.max_tot,
+ &pst, &f_str[0], &qf_str, rand_state);
+ aa1shuff_b = aa1shuff-1;
+
+ /* label library size limits -- must be called after init_aa0(),
+ which calls init_work(), which can reset the n1_high/n1_low
+ limits */
+ if (pst.n1_low > 0 && pst.n1_high < BIGNUM)
+ sprintf(info_lib_range," (range: %d-%d)",pst.n1_low,pst.n1_high);
+ else if (pst.n1_low > 0)
+ sprintf(info_lib_range," (range: >%d)",pst.n1_low);
+ else if (pst.n1_high < BIGNUM)
+ sprintf(info_lib_range," (range: <%d)",pst.n1_high);
+ else
+ info_lib_range[0]='\0';
+ info_lib_range[sizeof(info_lib_range)-1]='\0';
+ info_lib_range_p = info_lib_range;
+#endif
+
+#if !defined(COMP_THR) && !defined(PCOMPLIB)
+ lib_bhead_p = lib_buf2_list; /* equivalent to un-threaded get_rbuf() */
+#else /* COMP_THR/PCOMPLIB */
+#ifndef PCOMPLIB
+ start_thr();
+#endif
+ /* now open the library and start reading */
+ /* get a buffer and fill it up */
+ get_rbuf(&lib_bhead_p,m_bufi.max_work_buf);
+#endif
+
+ /* **************************************************************** */
+ /* do the sequence comparison library scan */
+ /* **************************************************************** */
+ seqr_chain_work(aa0, aa0s, lib_bhead_p, getlib_info, &m_bufi, &m_msg, &pst,
+ &m_msg.pstat_void, &m_msg.ldb, &m_msg.hist, &m_msg.s_info,
+#if !defined(COMP_THR) && !defined(PCOMPLIB)
+ aa1shuff, f_str, qf_str,
+#endif
+ best_seqs, best_mseqs, best, fdata);
+
+ /* the initial search is done, do statistics, possible expansion,
+ and alignments */
+
+#if defined(COMP_THR) && !defined(PCOMPLIB)
+ info_lib_range_p = work_info[0].info_lib_range;
+#endif
+
+ m_msg.nbr_seq = m_msg.db.entries;
+ get_param(&pst, info_gstring2p,info_gstring3, &m_msg.s_info);
+
+ /* *************************** */
+ /* analyze the last results */
+ /* *************************** */
+
+#ifndef SAMP_STATS
+ if (!stats_done && nstats > 0) {
+#endif
+ /* we ALWAYS do this if SAMP_STATS, because the statistics may have changed */
+ /* the new incremental sampling produces an nstats that is much
+ too large */
+
+ zsflag_save = pst.zsflag;
+ if (pst.zsflag > 20) {
+ pst.zsflag -= 20;
+ }
+ pst.zsflag_f = process_hist(stats,nstats,&m_msg, &pst,&m_msg.hist,
+ &m_msg.pstat_void, &m_msg.s_info, stats_done);
+ pst.zsflag = zsflag_save;
+
+ if (m_msg.pstat_void != NULL) {
+ stats_done = 1;
+ for (i = 0; i < nbest; i++) {
+ bestp_arr[i]->zscore =
+ find_z(bestp_arr[i]->rst.score[pst.score_ix],
+ bestp_arr[i]->rst.escore, bestp_arr[i]->seq->n1,
+ bestp_arr[i]->rst.comp, m_msg.pstat_void);
+ }
+#ifndef SAMP_STATS
+ }
+ else pst.zsflag = -1;
+#endif
+ }
+
+ /* **************************************************************** */
+ /* do shuffles if too few library sequences or for second estimate */
+ /* **************************************************************** */
+
+ /* if there are not many scores, produce better statistics by shuffling */
+ /* but only if statistics are enabled (7-July-2008) */
+ if (pst.zsflag != 3 && pst.zsflag > -1 &&
+ nbest > 0 && nbest < m_msg.shuff_max) {
+
+ buf_shuf_seq(aa0, m_msg.n0, &aa1shuff_b, aa1save, maxn,
+ bestp_arr, nbest, &pst, &m_msg, &m_bufi
+#if !defined(COMP_THR) && !defined(PCOMPLIB)
+ , f_str
+#endif
+ , &m_msg.ss_info);
+
+ /* (4) analyze rstats */
+ if (pst.zsflag < 10) pst.zsflag += 10;
+ if (pst.zsflag > 20) pst.zsflag -= 10;
+ pst.zsflag_f = process_hist(rstats,nrstats,&m_msg, &pst,&m_msg.hist,
+ &m_msg.pstat_void,&m_msg.ss_info,0);
+ }
+
+ /* **************************************************************** */
+ /* done with shuffling for small sample size */
+ /* **************************************************************** */
+
+ if (!pst.zdb_size_set) pst.zdb_size = m_msg.ldb.entries;
+
+#if defined(COMP_THR) || defined(PCOMPLIB)
+ /* before I call last_calc/showbest/showalign, I need init_work() to
+ get an f_str. This duplicates some code above, which is used in
+ the non-threaded version.
+
+ I have tried to get an f_str from one of the threads, but on
+ some architectures, that f_str is not available to the main thread.
+ */
+
+ if (!have_f_str) {
+ init_work(aa0[0],m_msg.n0,&pst,&f_str[0]);
+ /* f_str[0] = work_info[0].f_str_ap[0]; */
+ have_f_str = 1;
+ f_str[5] = f_str[4] = f_str[3] = f_str[2] = f_str[1] = f_str[0];
+
+ if (m_msg.qframe == 2) {
+ if ((aa0[1]=(unsigned char *)calloc((size_t)m_msg.n0+2+SEQ_PAD,
+ sizeof(unsigned char)))==NULL) {
+ fprintf(stderr," cannot allocate aa0[1][%d] for alignments\n",
+ m_msg.n0+2+SEQ_PAD);
+ }
+ *aa0[1]='\0';
+ aa0[1]++;
+ memcpy(aa0[1],aa0[0],m_msg.n0+1);
+ /* for ALTIVEC/SSE2, must pad with 16 NULL's, but not necessary after calloc() */
+ for (id=0; id<SEQ_PAD; id++) {aa0[1][m_msg.n0+id]=0;}
+
+ revcomp(aa0[1],m_msg.n0,&pst.c_nt[0]);
+ init_work(aa0[1],m_msg.n0,&pst,&f_str[1]);
+ /* f_str[1] = work_info[0].f_str_ap[1]; */
+ }
+ }
+#endif
+
+ /* now we have one set of scaled scores in bestp_arr -
+ for FASTS/F, we need to do some additional processing */
+
+ if (!m_msg.qshuffle) {
+ last_stats(aa0[0], m_msg.n0, stats,nstats, bestp_arr,nbest,
+ &m_msg, &pst, &m_msg.hist, &m_msg.pstat_void);
+ }
+ else {
+ last_stats(aa0[0], m_msg.n0,
+ qstats,nqstats, bestp_arr,nbest, &m_msg, &pst,
+ &m_msg.hist, &m_msg.pstat_void);
+ }
+
+ /* here is a contradiction: if pst.zsflag < 0, then m_msg.pstat_void
+ should be NULL; if it is not, then process_hist() has been called */
+ if (pst.zsflag < 0 && m_msg.pstat_void != NULL) pst.zsflag = 1;
+
+ if (m_msg.last_calc_flg) {
+ /* last_calc may need coefficients from last_stats() */
+ nbest = last_calc(aa0, aa1save, maxn, bestp_arr, nbest, &m_msg, &pst,
+ f_str, m_msg.pstat_void);
+ }
+
+ /* in addition to scaling scores, this sorts bestp_arr[nbest] */
+ scale_scores(bestp_arr,nbest,m_msg.db, &pst,m_msg.pstat_void);
+
+#ifdef DEBUG
+ /* check for bestp_arr corruption */
+ for (i=0; i<nbest; i++) {
+ if (bestp_arr[i]->n1 != bestp_arr[i]->seq->n1) {
+ fprintf(stderr," *** [comp_lib9.c:1120] *** n1 conflict[%d]: n1: %d != seq->n1: %d\n",
+ i, bestp_arr[i]->n1, bestp_arr[i]->seq->n1);
+ }
+ }
+#endif
+
+ /* For large databases, we have good zscores for all the MAX_BEST
+ sequences. Thus, we can sort the scores, and get a list of all
+ sequences with scores better than the E() threshold. If zsflag
+ > 20, then we should shuffle those guys to get an alternative
+ estimate of lambda and K */
+
+ if (pst.zsflag > 20 && nbest >= m_msg.shuff_max) {
+ n_sig = nbest;
+ for (i=0; i<nbest; i++) {
+ if (bestp_arr[i]->rst.escore > pst.e_cut) {
+ n_sig = i;
+ break;
+ }
+ }
+
+ /* if there are no significant hits, shuffle the top 10 */
+ if (n_sig < 10) n_sig = 10;
+
+ /* check to see how many significant sequences there are, and
+ ensure that every sequence is shuffled at least 5 times */
+ if (n_sig * 5 > m_msg.shuff_max) {
+ m_msg.shuff_max = n_sig*5;
+ if (m_msg.shuff_max > MAX_BEST/2) m_msg.shuff_max = MAX_BEST/2;
+ if ((rstats = (struct stat_str *)realloc(rstats, m_msg.shuff_max * sizeof(struct stat_str)))==NULL) {
+ fprintf(stderr, " *** Cannot reallocate rstats[%d] ***\n",m_msg.shuff_max);
+ exit(1);
+ }
+ }
+
+ buf_shuf_seq(aa0, m_msg.n0, &aa1shuff_b, aa1save, maxn,
+ bestp_arr, n_sig, &pst, &m_msg, &m_bufi
+#if !defined(COMP_THR) && !defined(PCOMPLIB)
+ , f_str
+#endif
+ , &m_msg.ss_info);
+
+ zs_off_save = pst.zs_off;
+ /* ensure that hist2.hist_a is initialized properly */
+ hist2.hist_a = NULL;
+ pst.zsflag_f = process_hist(rstats,nrstats,&m_msg, &pst,&hist2,
+ &m_msg.pstat_void2,&m_msg.ss_info, 0);
+ pst.zs_off = zs_off_save;
+
+ for (i=0; i<nbest; i++) {
+ bestp_arr[i]->zscore2 =
+ find_z(bestp_arr[i]->rst.score[pst.score_ix],
+ bestp_arr[i]->rst.escore, bestp_arr[i]->seq->n1,
+ bestp_arr[i]->rst.comp,m_msg.pstat_void2);
+ }
+ }
+
+ get_param(&pst, info_gstring2p,info_gstring3, &m_msg.s_info);
+
+ /* **************************************************************** */
+ /* label Library: output */
+ /* **************************************************************** */
+
+ tscan = s_time();
+
+ /* get query annotations here, before print_header2() */
+ if (m_msg.annot0_sname[0]) {
+ if (get_annot(m_msg.annot0_sname, &m_msg, m_msg.qtitle, m_msg.q_offset+m_msg.q_off-1,m_msg.n0, &m_msg.annot_p, 0, pst.debug_lib) < 0) {
+ fprintf(stderr,"*** error [%s:%d] - %s did not produce annotations\n",__FILE__, __LINE__, m_msg.annot0_sname);
+ m_msg.annot0_sname[0] = '\0';
+ }
+ if (m_msg.annot_p && m_msg.annot_p->n_annot > 0) {
+ m_msg.aa0a = m_msg.annot_p->aa1_ann;
+ }
+ if (!m_msg.ann_arr[0]) {m_msg.ann_arr[0] = ' '; m_msg.ann_arr[1] = '\0';}
+ }
+
+ /* header2 print Query:, Annotation:, Library: */
+ print_header2(stdout, qlib, info_qlabel, aa0, &m_msg, &pst, info_lib_range_p);
+
+ if (m_msg.std_output) {
+ prhist (stdout, &m_msg, &pst, m_msg.hist, nstats, sstats, m_msg.ldb,
+ (pst.zsflag > 20? hist2.stat_info:NULL),info_lib_range_p,
+ info_gstring2p, info_hstring_p, tscan-tprev) ;
+
+ /* print annot header if it cannot be changed by script */
+ if (!(m_msg.markx & (MX_MBLAST2+MX_M8OUT)) &&
+ !(m_msg.annot1_sname[0] || m_msg.annot0_sname[0]))
+ print_annot_header(stdout, &m_msg);
+ }
+#ifdef COMP_MLIB
+ ttscan += tscan-tprev;
+#endif
+
+ /* check to see if there are alternate output files */
+ l3:
+ if (!m_msg.quiet) {
+ printf("Enter filename for results [%s]: ", m_msg.outfile);
+ fflush(stdout);
+ rline[0]='\0';
+ if (fgets(rline,sizeof(rline),stdin)==NULL) goto end_l;
+ else { /* parse rline input */
+ if ((bp=strchr(rline,'\n'))!=NULL) *bp = '\0';
+ if (rline[0]!='\0') strncpy(m_msg.outfile,rline,sizeof(m_msg.outfile));
+ }
+ }
+
+ if (m_msg.outfile[0]!='\0') { /* have an output file name */
+ if ((outfd=fopen(m_msg.outfile,"w"))==NULL) {
+ fprintf(stderr," could not open %s\n",m_msg.outfile);
+ if (!m_msg.quiet) goto l3;
+ else goto l4; /* skip output file */
+ }
+
+ if (m_msg.markx_list==NULL) { /* no -m 9 options, need one */
+ if ((m_msg.markx_list = (struct markx_str *)calloc(1,sizeof(struct markx_str)))==NULL) {
+ fprintf(stderr,"cannot allocate m_msg.markx_list\n");
+ goto l4;
+ }
+ else {
+ m_msp_to_markx(m_msg.markx_list, &m_msg);
+ }
+ }
+ m_msg.markx_list->out_file = m_msg.outfile;
+ m_msg.markx_list->out_fd = m_msg.outfd = outfd;
+ /* save permanent m_msg marks info */
+ }
+
+ l4:
+ m_msp_to_markx(&markx_save, &m_msg);
+ /* for each entry in markx_list */
+ for (cur_markx = m_msg.markx_list; cur_markx;
+ cur_markx = cur_markx->next) {
+
+ if (cur_markx->out_file && cur_markx->out_file[0] && cur_markx->out_fd == NULL) {
+ if ((cur_markx->out_fd=fopen(cur_markx->out_file,"w"))==NULL) {
+ fprintf(stderr," could not open %s\n",cur_markx->out_file);
+ }
+ }
+ if (cur_markx->out_fd == NULL) continue;
+
+ markx_to_m_msp(&m_msg, cur_markx);
+
+ if (qlib==1) print_header1(cur_markx->out_fd, argv_line, &m_msg, &pst);
+ print_header2(cur_markx->out_fd, qlib, info_qlabel, aa0, &m_msg, &pst, info_lib_range_p);
+
+ if (m_msg.std_output) {
+ prhist(cur_markx->out_fd, &m_msg, &pst,m_msg.hist, nstats, sstats, m_msg.db,
+ (pst.zsflag > 20? hist2.stat_info:NULL), info_lib_range_p,
+ info_gstring2p, info_hstring_p, tscan-tprev);
+
+ /* print annot header if it cannot be changed by script */
+ if (!(m_msg.markx & (MX_MBLAST2+MX_M8OUT)) &&
+ !(m_msg.annot1_sname[0] || m_msg.annot0_sname[0]))
+ print_annot_header(cur_markx->out_fd, &m_msg);
+ }
+
+ /* done with this output, restore m_msg */
+ }
+ markx_to_m_msp(&m_msg,&markx_save);
+
+ /* find the lowest scoring alignment to be displayed */
+
+ /* m_msg.nshow always provides the number of alignments to be
+ displayed in quiet mode */
+
+ /* skip entries if -F e_low specified */
+ if (pst.zsflag >= 0) {
+ for (i=0; i<nbest && bestp_arr[i]->rst.escore < m_msg.e_low; i++) {};
+ m_msg.nskip = i;
+ }
+ else {
+ m_msg.nskip = 0;
+ }
+
+ if (m_msg.quiet || m_msg.tot_markx & MX_M9SUMM) {
+
+ /* to determine how many sequences to re-align (either for
+ do_opt() or calc_id() we need to set m_msg.nshow based on
+ e_cut, m_msg.mshow to display the correct number of
+ alignments */
+
+ /* the logic in this section is:
+ -b '$' - show all the results in the database after -F e_top
+ -b "123" - show min(123, e_cut_count)
+ -b "=123" - show min(123,nbest)
+ -b ">123" - show max(123,e_cut_count)
+ m_msg.mshow_set==1 -- has a value been entered "-b 123", "-b =123", "-b >123"
+ m_msg.mshow_min==1 -- the mshow value is the minimum number to display, set with "-b =,>123"
+ */
+
+ if (pst.zsflag >= 0) { /* do we have E()-values? */
+
+ for (i=m_msg.nskip; i<nbest && bestp_arr[i]->rst.escore < m_msg.e_cut; i++) {}
+
+ if (m_msg.mshow_set <= 0) { /* no -b 123 */
+ m_msg.nshow = min(i - m_msg.nskip, nbest-m_msg.nskip);
+ }
+ else { /* mshow_set>0 */
+ if (m_msg.mshow_min == 1) { /* -b '=123" must show exactly m_msg.mshow results */
+ m_msg.nshow = m_msg.mshow;
+ }
+ else if (m_msg.mshow_min == 2 ) { /* -b '>123' show 123 unless e_cut > 123 */
+ m_msg.nshow = max(m_msg.mshow, i-m_msg.nskip);
+ }
+ else {
+ /* -b '123' show no more than 123, limited by e_cut */
+ if (m_msg.mshow >= 0) m_msg.nshow = min(m_msg.mshow, i-m_msg.nskip);
+ else { m_msg.nshow = nbest-m_msg.nskip;} /* -b '$' sets m_msg.mshow == -1 */
+ }
+ }
+ }
+ else { /* we do not have E()-values */
+ if (m_msg.mshow >= 0) { m_msg.nshow = min(m_msg.mshow,nbest); }
+ else { m_msg.nshow = nbest;}
+ }
+
+ if (m_msg.nshow <= 0) { /* no results to display */
+ if (m_msg.std_output) fprintf(outfd,"!! No sequences with E() < %0.5g\n",m_msg.e_cut);
+ m_msg.nshow = 0;
+ goto end_l;
+ }
+ }
+
+ m_msg.pre_load_done = 0;
+
+ /* ******************************************************************/
+ /* check for additional expansion sequences via link script */
+ /* **************************************************************** */
+ if (m_msg.link_lname[0]) {
+ /* guarantee that we have the bline's for the best sequences */
+ pre_load_best(aa1save, maxn, &bestp_arr[m_msg.nskip], m_msg.nshow, &m_msg, pst.debug_lib);
+
+ if ((link_lib_str = build_link_data(&link_lib_file, &m_msg, bestp_arr,pst.debug_lib))==NULL) {
+ goto no_links;
+ };
+
+ /* get a list of files */
+ link_lib_list_p = lib_select(link_lib_str, link_title, m_msg.flstr, m_msg.ldb_info.ldnaseq);
+ if (link_lib_str != NULL) free(link_lib_str);
+
+ link_getlib_info = init_getlib_info(link_lib_list_p, m_msg.ldb_info.maxn,m_msg.max_memK/16);
+ m_msg.cur_seqr_cnt = 0;
+
+ /* before searching with another getlib_info, we need to be
+ certain that all the best_str entries that will be used after
+ the link search have been saved (only necessary if buffers
+ may be reused) */
+ if (getlib_info->use_memory <= 0 && getlib_info->lib_list_p->m_file_p->get_mmap_chain==NULL) {
+ for (i=0; i < m_msg.nshow; i++) {
+ if (bestp_arr[i]->seq->aa1b != NULL) {
+ preserve_seq2(bestp_arr[i],best_seqs, best_mseqs, best);
+ if (bestp_arr[i]->n1 != bestp_arr[i]->seq->n1) {
+ fprintf(stderr,"[*** comp_lib9.c:1334***] -n1:%d != seq->n1:%d\n",
+ bestp_arr[i]->n1, bestp_arr[i]->seq->n1);
+ }
+ }
+ }
+ }
+
+ /* calculate scores for the sequences */
+#if defined(COMP_THR) || defined(PCOMPLIB)
+ for (i=0; i<m_bufi.max_work_buf; i++) {
+ memset(lib_buf2_list[i].buf2_data,0,
+ (size_t)(m_bufi.max_buf2_res+1)*sizeof(struct buf2_data_s));
+ lib_buf2_list[i].hdr.buf2_cnt=
+ lib_buf2_list[i].hdr.have_results=
+ lib_buf2_list[i].hdr.have_best_save =
+ lib_buf2_list[i].hdr.aa1b_used = 0;
+ }
+ num_reader_bufs = m_bufi.max_work_buf;
+#endif
+
+#if defined(COMP_THR) || defined(PCOMPLIB)
+ get_rbuf(&lib_bhead_p,m_bufi.max_work_buf);
+#else
+ lib_bhead_p = lib_buf2_list;
+#endif
+
+ /* **************************************************************** */
+ /* do the sequence comparison scan on the expanded sequences */
+ /* **************************************************************** */
+ seqr_chain_work(aa0, aa0s, lib_bhead_p, link_getlib_info,
+ &m_bufi, &m_msg, &pst, &m_msg.pstat_void,
+ &link_ldb, NULL, &link_s_info,
+#if !defined(COMP_THR) && !defined(PCOMPLIB)
+ aa1shuff, f_str, qf_str,
+#endif
+ best_seqs, best_mseqs, best, NULL);
+
+ m_msg.pre_load_done = 0; /* pre-load has not been done for link library sequences */
+
+#ifdef DEBUG
+ /* check for bestp_arr corruption */
+ for (i=0; i<m_msg.nshow; i++) {
+ if (bestp_arr[i]->n1 != bestp_arr[i]->seq->n1) {
+ fprintf(stderr," *** [comp_lib9.c:1377] *** n1 conflict[%d]: n1: %d != seq->n1: %d\n",
+ i, bestp_arr[i]->n1, bestp_arr[i]->seq->n1);
+ }
+ }
+#endif
+ /* need to resort results, and re-check how many should be displayed */
+ scale_scores(bestp_arr,nbest,m_msg.db, &pst,m_msg.pstat_void);
+
+#ifdef DEBUG
+ /* check for bestp_arr corruption */
+ for (i=0; i<m_msg.nshow; i++) {
+ if (bestp_arr[i]->n1 != bestp_arr[i]->seq->n1) {
+ fprintf(stderr," *** [comp_lib9.c:1389] *** n1 conflict[%d]: n1: %d != seq->n1: %d\n",
+ i, bestp_arr[i]->n1, bestp_arr[i]->seq->n1);
+ }
+ }
+#endif
+
+ if (pst.zsflag >= 0) {
+ /* skip entries if -F e_low specified */
+ for (i=0; i<nbest && bestp_arr[i]->rst.escore < m_msg.e_low; i++) {};
+ m_msg.nskip = i;
+ }
+ else {
+ /* no statistics, just use the same score */
+ m_msg.nskip = 0;
+ }
+
+ if (m_msg.quiet || m_msg.tot_markx & MX_M9SUMM) {
+
+ /* to determine how many sequences to re-align (either for
+ do_opt() or calc_id() we need to modify m_msg.mshow to get
+ the correct number of alignments */
+
+ if (pst.zsflag >= 0) { /* do we have e_values? */
+ for (i=m_msg.nskip; i<nbest && bestp_arr[i]->rst.escore < m_msg.e_cut; i++) {}
+
+ if (m_msg.mshow_set != 1) {
+ m_msg.nshow = min(i - m_msg.nskip, nbest-m_msg.nskip);
+ }
+ else {
+ if (m_msg.mshow_min) { /* must show at least m_msg.mshow results */
+ m_msg.nshow = max(m_msg.mshow, i-m_msg.nskip);
+ }
+ else { /* limit by e_cut */
+ m_msg.nshow = min(m_msg.mshow, i-m_msg.nskip);
+ }
+ }
+ }
+ }
+ }
+ /* done with -e link_lname */
+
+ no_links:
+ /* **************************************************************** */
+ /* if we need alignment info now, pre-load and pre-calculate it */
+ /* **************************************************************** */
+ /* the list of conditions is greatly expanded to allow for variant scores */
+ if (m_msg.quiet &&
+ ((m_msg.stages > 1) ||
+ m_msg.annot0_sname[0] || m_msg.annot1_sname[0] ||
+ (m_msg.tot_markx & (MX_M9SUMM + MX_MBLAST + MX_MBLAST2 + MX_M8OUT + MX_M8COMMENT)))) {
+
+ /* pre-load sequence data for alignments for showbest, showalign */
+ pre_load_best(aa1save, maxn, &bestp_arr[m_msg.nskip], m_msg.nshow, &m_msg, pst.debug_lib);
+
+ /* must calculate repeat_thresh before buf_align_seq */
+ if (pst.do_rep) {
+ if (pst.zsflag >= 0) {
+ for (i=m_msg.nskip; i < m_msg.nskip + m_msg.nshow; i++) {
+ bestp_arr[i]->repeat_thresh =
+ min(E1_to_s(pst.e_cut_r, m_msg.n0, bestp_arr[i]->seq->n1,
+ pst.zdb_size, m_msg.pstat_void),bestp_arr[i]->rst.score[pst.score_ix]);
+ }
+ }
+ else {
+ for (i=m_msg.nskip; i < nbest; i++) {
+ bestp_arr[i]->repeat_thresh = bestp_arr[i]->rst.score[pst.score_ix];
+ }
+ }
+ }
+
+ buf_align_seq(aa0, m_msg.n0, &bestp_arr[m_msg.nskip], m_msg.nshow,
+ &pst, &m_msg, &m_bufi
+#if !defined(COMP_THR) && !defined(PCOMPLIB)
+ , f_str
+#endif
+ );
+
+ /* buf_align_seq can produce higher a_res->rst.score[]s, partly
+ because of variation but also because of a wider optimization
+ band. In earlier versions, the original rst.score was always
+ displayed and sorted on. With variant re-scoring, the
+ alignment score can improve the rst.score, so the rst.score
+ must be updated, and the scores re-sorted */
+
+ for (i=m_msg.nskip; i < m_msg.nskip + m_msg.nshow; i++) {
+ bbp = bestp_arr[i];
+ if (bbp->a_res == NULL) continue;
+ if (bbp->a_res->score_delta > 0) {
+ bbp->rst.score[0] += bbp->a_res->score_delta;
+ bbp->rst.score[1] += bbp->a_res->score_delta;
+ bbp->rst.score[2] += bbp->a_res->score_delta;
+ }
+ }
+ scale_scores(&bestp_arr[m_msg.nskip],m_msg.nshow,m_msg.db, &pst,m_msg.pstat_void);
+ }
+
+ if (!(m_msg.markx & (MX_MBLAST2+MX_M8OUT)) &&
+ (m_msg.annot1_sname[0] || m_msg.annot0_sname[0])) print_annot_header(stdout, &m_msg);
+
+
+ /* **************************************************************** */
+ /* BLAST header output: Database:\n %ld sequences; %ld total letters */
+ /* Query: %s length=%d */
+ /* **************************************************************** */
+ print_header3(stdout, qlib, &m_msg, &pst);
+
+ showbest(stdout, aa0, aa1save, maxn, &bestp_arr[m_msg.nskip], nbest-m_msg.nskip,
+ qtt.entries, &m_msg, &pst,m_msg.db, info_gstring2p, f_str);
+
+ m_msp_to_markx(&markx_save, &m_msg);
+ t_quiet = m_msg.quiet;
+ m_msg.quiet = -1; /* should guarantee 1..m_msg.nshow shown */
+
+ /* set copies of showbest to alternative files */
+ for (cur_markx = m_msg.markx_list; cur_markx; cur_markx = cur_markx->next) {
+ if (cur_markx->out_fd == NULL) continue;
+ markx_to_m_msp(&m_msg, cur_markx);
+ if (!(m_msg.markx & (MX_MBLAST2+MX_M8OUT)) &&
+ (m_msg.annot1_sname[0] || m_msg.annot0_sname[0]))
+ print_annot_header(cur_markx->out_fd, &m_msg);
+ print_header3(cur_markx->out_fd, qlib, &m_msg, &pst);
+ showbest(cur_markx->out_fd, aa0, aa1save, maxn, &bestp_arr[m_msg.nskip], nbest-m_msg.nskip,
+ qtt.entries, &m_msg, &pst, m_msg.db, info_gstring2p, f_str);
+ }
+ m_msg.quiet = t_quiet;
+ markx_to_m_msp(&m_msg, &markx_save);
+
+ /* m_msg.ashow can be -1 or > 0 to show results */
+ if (m_msg.nshow > 0 && m_msg.ashow != 0) {
+
+ rline[0]='N';
+ if (!m_msg.quiet){
+ printf(" Display alignments also? (y/n) [n] "); fflush(stdout);
+ if (fgets(rline,sizeof(rline),stdin)==NULL) goto end_l;
+ }
+ else rline[0]='Y';
+
+ if (toupper((int)rline[0])=='Y') {
+ if (!m_msg.quiet && m_msg.do_showbest) {
+ printf(" number of alignments [%d]? ",m_msg.nshow);
+ fflush(stdout);
+ if (fgets(rline,sizeof(rline),stdin)==NULL) goto end_l;
+ if (rline[0]!=0) sscanf(rline,"%d",&utmp);
+ if (utmp == 0) utmp = -1;
+ m_msg.ashow=min(utmp,m_msg.nshow);
+ }
+ }
+
+ /* **************************************************************** */
+ /* print_header4() : showalign alignment transition */
+ /* >>>query vs library for -m 9, -m 10 */
+ /* "\n" for MX_MBLAST */
+ /* ; pg_name, other info for -m 10 */
+ /* not sent to stdout if outfile specified */
+ /* **************************************************************** */
+ print_header4(outfd, info_qlabel, argv_line, info_gstring3, info_hstring_p, &m_msg, &pst);
+ showalign (outfd, aa0, aa1save, maxn,
+ &bestp_arr[m_msg.nskip], nbest-m_msg.nskip,
+ qtt.entries, &m_msg, &pst, info_gstring2p, f_str, &m_bufi);
+
+ fflush(outfd);
+ }
+
+ m_msp_to_markx(&markx_save, &m_msg);
+ for (cur_markx = m_msg.markx_list; cur_markx; cur_markx=cur_markx->next) {
+ if (cur_markx->out_fd == NULL) continue;
+ if (cur_markx->out_fd == outfd) continue;
+ markx_to_m_msp(&m_msg, cur_markx);
+ print_header4(cur_markx->out_fd, info_qlabel, argv_line, info_gstring3, info_hstring_p, &m_msg, &pst);
+ showalign (cur_markx->out_fd, aa0, aa1save, maxn,
+ &bestp_arr[m_msg.nskip], nbest-m_msg.nskip,
+ qtt.entries, &m_msg, &pst, info_gstring2p, f_str, &m_bufi);
+ fflush(cur_markx->out_fd);
+ }
+ markx_to_m_msp(&m_msg, &markx_save);
+
+ end_l:
+
+ if (m_msg.nshow==0 && m_msg.markx & MX_M8COMMENT) {
+ fprintf(outfd,"# %d hits found\n",m_msg.nshow);
+ }
+
+ /* print >>><<< for correct -m 9 */
+ print_header4a(outfd, &m_msg);
+ for (cur_markx = m_msg.markx_list; cur_markx; cur_markx=cur_markx->next) {
+ if (cur_markx->out_fd == NULL) continue;
+ if (cur_markx->out_fd == outfd) continue;
+ if (m_msg.nshow==0 && cur_markx->markx & MX_M8COMMENT) {
+ fprintf(cur_markx->out_fd,"# %d hits found\n",m_msg.nshow);
+ }
+ print_header4a(cur_markx->out_fd, &m_msg);
+ }
+
+ /* display info, statistics parameters for fdata res file */
+ if (fdata) {
+ fprintf(fdata,"#Algorithm : %s\n",info_gstring2p[0]);
+ fprintf(fdata,"#Parameters : %s\n",info_gstring2p[1]);
+ fprintf(fdata,"#Query: %3ld>>>%-50s\n",qtt.entries-1,m_msg.qtitle);
+ pstat_info(fdata_pstat_info, sizeof(fdata_pstat_info), "#Stat:",m_msg.pstat_void);
+ fputs(fdata_pstat_info,fdata);
+ fflush(fdata);
+ }
+
+#if defined(COMP_THR) || defined(PCOMPLIB)
+ rbuf_done(fa_max_workers);
+#endif
+
+ /* **************************************************************** */
+ /* completely finished with previous query */
+ /* **************************************************************** */
+
+ /* clean up/reinitialize the threads buffers */
+#if defined(COMP_THR) || defined(PCOMPLIB)
+ for (i=0; i<m_bufi.max_work_buf; i++) {
+ lib_buf2_list[i].hdr.buf2_cnt=
+ lib_buf2_list[i].hdr.have_results=
+ lib_buf2_list[i].hdr.have_best_save =
+ lib_buf2_list[i].hdr.aa1b_used = 0;
+ }
+
+ num_reader_bufs = m_bufi.max_work_buf;
+#endif
+
+#if defined(COMP_THR)
+ num_worker_bufs = 0;
+ reader_done = 0;
+ reader_wait = 1;
+ worker_buf_workp = 0;
+ worker_buf_readp = 0;
+ reader_buf_workp = 0;
+ reader_buf_readp = 0;
+
+ start_thread = 1; /* stop thread from starting again */
+#endif
+
+ /* clean up best_seqs */
+ memset(best_seqs,0,(MAX_BEST+1)*sizeof(struct seq_record));
+
+ /* re-initialize lib_buf2_list buffers */
+ for (lib_bhead_p = lib_buf2_list;
+ lib_bhead_p < lib_buf2_list+m_bufi.max_work_buf; lib_bhead_p++) {
+
+ /* this wipes out lib_bhead_p->hdr.buf2[0].seq, .mseq */
+ memset(lib_bhead_p->buf2_data,0,(size_t)(m_bufi.max_buf2_res+1)*sizeof(struct buf2_data_s));
+ /* replace it */
+ lib_bhead_p->hdr.have_results = 0;
+ }
+
+ /* re-initialize library counts */
+ m_msg.ldb.length = 0l;
+ m_msg.ldb.entries = m_msg.ldb.carry = 0;
+
+ /* **************************************************************** */
+ /* free allocated alignment encodings associated with bestp_arr[] */
+ /* needs to deallocate aln_code, ann_code */
+ /* bestp_arr[i]->a_res is a pointer, so it must be free()'d */
+ /* **************************************************************** */
+ for (i=m_msg.nskip; i < m_msg.nskip+m_msg.nshow; i++) {
+ if (bestp_arr[i]->have_ares & 0x2) {
+ cur_ares_p = bestp_arr[i]->a_res;
+ while (cur_ares_p) {
+ if (cur_ares_p->aln_code) free(cur_ares_p->aln_code);
+ if (cur_ares_p->annot_code) free(cur_ares_p->annot_code);
+ if (cur_ares_p->annot_var_s) free(cur_ares_p->annot_var_s);
+ if (cur_ares_p->annot_var_id) free(cur_ares_p->annot_var_id);
+ if (cur_ares_p->annot_var_idd) free(cur_ares_p->annot_var_idd);
+ if (bestp_arr[i]->have_ares & 0x1 && cur_ares_p->res) free(cur_ares_p->res);
+ next_ares_p = cur_ares_p->next;
+ free(cur_ares_p);
+ cur_ares_p = next_ares_p;
+ }
+ bestp_arr[i]->a_res = NULL;
+ }
+ bestp_arr[i]->have_ares = 0;
+ bestp_arr[i]->mseq->bline = NULL;
+ bestp_arr[i]->mseq->bline_max = 0;
+ }
+
+#ifdef DEBUG
+ /* check to see if there are ANY un-reset have_ares */
+ for (i=0; i< nbest; i++) {
+ if (bestp_arr[i]->have_ares) {
+ fprintf(stderr," Un-reset have_ares[%d]: %d\n",i,bestp_arr[i]->have_ares);
+ bestp_arr[i]->have_ares = 0;
+ }
+ }
+#endif
+
+ /* reset align_done flag for next search */
+ m_msg.align_done = 0;
+
+ if (m_msg.qframe == 2) free(aa0[1]-1);
+
+ if (have_f_str) {
+ have_f_str = 0;
+ if (f_str[1]!=f_str[0]) {
+ close_work (aa0[1], m_msg.n0, &pst, &f_str[1]);
+ }
+ close_work (aa0[0], m_msg.n0, &pst, &f_str[0]);
+ }
+#if !defined(COMP_THR) && !defined(PCOMPLIB)
+ if (m_msg.qshuffle) close_work (aa0s, m_msg.n0, &pst, &qf_str);
+#endif
+ if (pst.pam_pssm) {
+ free_pam2p(pst.pam2p[0]);
+ free_pam2p(pst.pam2p[1]);
+ }
+
+ if (aa1shuff_b != NULL) {
+ free(aa1shuff_b);
+ aa1shuff_b = NULL;
+ }
+
+ if (m_msg.aa1save_buf_b != NULL) {
+ free(m_msg.aa1save_buf_b);
+ m_msg.aa1save_buf_b = NULL;
+ }
+
+ if (m_msg.bline_buf_b != NULL) {
+ free(m_msg.bline_buf_b);
+ m_msg.bline_buf_b = NULL;
+ }
+
+ if (link_lib_list_p) {
+ close_lib_list(link_lib_list_p,1,1);
+ free_seqr_chain(link_getlib_info->start_seqr_chain);
+ /* delete the library file */
+ if (!pst.debug_lib) {
+#ifdef UNIX
+ unlink(link_lib_file);
+#else
+ _unlink(link_lib_file);
+#endif
+ }
+ if (link_lib_file) free(link_lib_file);
+ }
+
+ tddone = time(NULL);
+ tdone = s_time();
+ fflush(outfd);
+
+ ttdisp += tdone-tscan;
+
+ /* reset pst parameters to original */
+ pst.zsflag = m_msg.zsflag;
+ pst.zsflag2 = m_msg.zsflag2;
+ pst.n1_low = m_msg.n1_low;
+ pst.n1_high = m_msg.n1_high;
+
+ /* **************************************************************** */
+ /* start the next query */
+ /* **************************************************************** */
+ next_query:
+ m_msg.q_offset = next_q_offset;
+
+ m_msg.n0 =
+ QGETLIB (aa0[0], MAXTST, m_msg.qtitle, sizeof(m_msg.qtitle),
+ &qseek, &qlcont,q_file_p,&m_msg.q_off);
+ if (m_msg.n0 < 0) break;
+ if (m_msg.n0 == 0) {
+ next_q_offset = 0;
+ goto next_query;
+ }
+
+ if ((bp=strchr(m_msg.qtitle,' '))!=NULL) *bp='\0';
+ strncpy(info_qlabel, m_msg.qtitle,sizeof(info_qlabel));
+#ifdef DEBUG
+ SAFE_STRNCPY(ext_qtitle, m_msg.qtitle,sizeof(ext_qtitle));
+#endif
+ if (bp != NULL) *bp=' ';
+ info_qlabel[sizeof(info_qlabel)-1]='\0';
+
+ if (m_msg.ann_flg) {
+ m_msg.n0 = ann_scan(aa0[0],m_msg.n0,&m_msg.aa0a,m_msg.qdnaseq);
+ }
+
+ if (m_msg.ldb_info.term_code && m_msg.qdnaseq==SEQT_PROT &&
+ aa0[0][m_msg.n0-1]!=m_msg.ldb_info.term_code) {
+ aa0[0][m_msg.n0++]=m_msg.ldb_info.term_code;
+ aa0[0][m_msg.n0]=0;
+ }
+
+ /* if ends with ESS, remove terminal ESS */
+ if (aa0[0][m_msg.n0-1] == ESS) { m_msg.n0--; aa0[0][m_msg.n0]= '\0';}
+
+ if (m_msg.outfd) {fputc('\n',stdout);}
+
+ if (qlcont) {
+ next_q_offset = m_msg.q_offset + m_msg.n0 - m_msg.q_overlap;
+ }
+ else {
+ next_q_offset = 0l;
+ }
+
+ /* **************************************************************** */
+ /* have the query, reset the seqr_chain for another scan */
+ /* **************************************************************** */
+ if (getlib_info->use_memory <= 0) {
+ /* re-use the buffer, but re-open the library */
+ getlib_info->eof = 0;
+ getlib_info->lib_list_p = lib_list_p;
+ close_lib_list(getlib_info->lib_list_p,0,0);
+ }
+ else {
+ reset_seqr_chain(getlib_info->start_seqr_chain);
+ }
+ } /* end of while(1) for multiple queries */
+
+ /* **************************************************************** */
+ /* ALL done -- all queries read, all results shown, clean up */
+ /* **************************************************************** */
+#ifdef PCOMPLIB
+ /* tell workers to quit */
+ init_thr(fa_max_workers, NULL, NULL, NULL, NULL, NULL);
+#endif
+
+ close_lib_list(lib_list_p,1,1);
+
+ if (m_msg.lname[0] == '!' && !pst.debug_lib) {
+#ifdef UNIX
+ unlink(lib_db_file);
+#else
+ _unlink(lib_db_file);
+#endif
+ }
+
+ tdone = s_time();
+
+#ifdef DEBUG
+ fprintf(stderr,"[*** comp_lib9.c***] seqr_chains: %d [%d]; aa1b_buffer: %ld [%ld] blocks (lost: %ld); using memory:%d\n",
+ m_msg.cur_seqr_cnt, m_bufi.max_work_buf,
+ getlib_info->tot_memK,getlib_info->max_memK, getlib_info->lost_memK,
+ getlib_info->use_memory);
+#endif
+
+ /* **************************************************************** */
+ /* final summary text */
+ /* 218 residues in 1 query sequences */
+ /* 5190103 residues in 13351 library sequences */
+ /* Tcomplib [36.3.5 Apr, 2011(preload7)] (2 proc) */
+ /* goes to stdout regardless of outfd */
+ /* **************************************************************** */
+ print_header5(stdout, qlib, &qtt, &m_msg, &pst,
+ getlib_info->use_memory, getlib_info->tot_memK);
+
+ m_msp_to_markx(&markx_save, &m_msg);
+ for (cur_markx = m_msg.markx_list; cur_markx; cur_markx=cur_markx->next) {
+ if (cur_markx->out_fd == NULL) continue;
+ markx_to_m_msp(&m_msg, cur_markx);
+ print_header5(cur_markx->out_fd, qlib, &qtt, &m_msg, &pst,
+ getlib_info->use_memory, getlib_info->tot_memK);
+ fflush(cur_markx->out_fd);
+ fclose(cur_markx->out_fd);
+ }
+ markx_to_m_msp(&m_msg, &markx_save);
+
+#ifdef PCOMPLIB
+#ifdef MPI_SRC
+ MPI_Finalize();
+#endif
+#endif
+ exit(0);
+}
+/* **************************************************************** */
+/* end of main() program */
+/* **************************************************************** */
+
+/* fsigint() -- run when the search is interrupted: records the stop
+   times, writes the "interrupted" banner to stdout, to outfd (when it
+   is a different stream) and to stderr, prints the run summary with an
+   all-zero library count, and exits with status 1 */
+void fsigint()
+{
+  struct db_str empty_db;
+
+  /* the summary is printed with zeroed library totals */
+  empty_db.carry = 0;
+  empty_db.length = 0;
+  empty_db.entries = 0;
+
+  /* note when the run stopped */
+  tdone = s_time();
+  tddone = time(NULL);
+
+  /* banner: stdout first, then the results file (if separate), then stderr */
+  fputs(" /*** interrupted ***/\n", stdout);
+  if (outfd != stdout) {
+    fputs("/*** interrupted ***/\n", outfd);
+  }
+  fputs("/*** interrupted ***/\n", stderr);
+
+  /* summary: stdout always, results file only when it is not stdout */
+  print_sum(stdout, &qtt, &empty_db, 0, 0);
+  if (outfd != stdout) {
+    print_sum(outfd, &qtt, &empty_db, 0, 0);
+  }
+
+  exit(1);
+}
+
+/* ******************************************************************* */
+/* alloc_comp_bufs() allocate thread buffers */
+/* a thread buffer can hold all the results from a seqr_chain buffer: */
+/* m_bufi_p->max_buf2_res = nframe * m_bufi_p->max_chain_seqs (preset) */
+/* ******************************************************************* */
+struct buf_head *
+alloc_comp_bufs (struct mng_thr *m_bufi_p, struct mngmsg *m_msp,
+                 int ave_seq_len) {
+  struct buf_head *lib_buf2_list, *cur_buf_p;
+  int i, seq_buf_siz, res_slots;
+
+  /* no seqr_chains have been allocated yet; at least max_work_buf of
+     them must eventually exist (m_msp->cur_seqr_cnt keeps the count) */
+  m_msp->cur_seqr_cnt = 0;
+
+  /* one buf_head per work buffer */
+  lib_buf2_list = (struct buf_head *)calloc((size_t)(m_bufi_p->max_work_buf),
+                                            sizeof(struct buf_head));
+  if (lib_buf2_list == NULL) {
+    fprintf(stderr," cannot allocate lib_buf2_list[%d]\n", m_bufi_p->max_work_buf);
+    exit(1);
+  }
+
+#if defined(COMP_THR) || defined(PCOMPLIB)
+  /* table of buf_head pointers, one slot per work buffer */
+  worker_buf = (struct buf_head **)calloc((size_t)(m_bufi_p->max_work_buf),
+                                          sizeof(struct buf_head *));
+  if (worker_buf == NULL) {
+    fprintf(stderr," cannot allocate **worker_buf[%d]\n", m_bufi_p->max_work_buf);
+    exit(1);
+  }
+#endif
+
+#ifdef COMP_THR
+  /* second pointer table, only present in the threaded build */
+  reader_buf = (struct buf_head **)calloc((size_t)(m_bufi_p->max_work_buf),
+                                          sizeof(struct buf_head *));
+  if (reader_buf == NULL) {
+    fprintf(stderr," cannot allocate **reader_buf[%d]\n", m_bufi_p->max_work_buf);
+    exit(1);
+  }
+#endif
+
+  /* ****************************************************************
+     Each lib_buf2_list[] entry (a buf_head) carries:
+       hdr         - whether results are ready, how many there are,
+                     plus seq_record / aa1b buffer bookkeeping
+       buf2_data[] - seq_record/mseq_record entries
+       buf2_res[]  - score results
+       buf2_ares[] - alignment results
+     **************************************************************** */
+
+  /* sequence buffer size: the larger of (chain seqs * average length)
+     and twice max_tot, and never smaller than max_tot */
+  seq_buf_siz = max(m_bufi_p->max_chain_seqs*ave_seq_len, m_msp->max_tot * 2);
+  if (seq_buf_siz < m_msp->max_tot) {
+    seq_buf_siz = m_msp->max_tot;
+  }
+  m_bufi_p->seq_buf_size = seq_buf_siz;
+
+  /* every per-buffer array gets max_buf2_res+1 entries */
+  res_slots = m_bufi_p->max_buf2_res+1;
+
+  for (i=0; i<m_bufi_p->max_work_buf; i++) {
+    cur_buf_p = lib_buf2_list + i;
+
+    /* alignment results */
+    cur_buf_p->buf2_ares =
+      (struct buf2_ares_s *)calloc(res_slots,sizeof(struct buf2_ares_s));
+    if (cur_buf_p->buf2_ares == NULL) {
+      fprintf(stderr," cannot allocate ares buffer struct %d %d\n",
+              i,m_bufi_p->max_buf2_res+1);
+      exit(1);
+    }
+
+    /* score results */
+    cur_buf_p->buf2_res =
+      (struct buf2_res_s *)calloc(res_slots,sizeof(struct buf2_res_s));
+    if (cur_buf_p->buf2_res == NULL) {
+      fprintf(stderr," cannot allocate res buffer struct %d %d\n",
+              i,m_bufi_p->max_buf2_res+1);
+      exit(1);
+    }
+
+    /* sequence record data */
+    cur_buf_p->buf2_data =
+      (struct buf2_data_s *)calloc(res_slots,sizeof(struct buf2_data_s));
+    if (cur_buf_p->buf2_data == NULL) {
+      fprintf(stderr," cannot allocate buffer struct %d %d\n",
+              i,m_bufi_p->max_buf2_res+1);
+      exit(1);
+    }
+
+    /* header: buffer id and "nothing ready yet" */
+    cur_buf_p->hdr.my_id = i;
+    cur_buf_p->hdr.have_results = 0;
+    cur_buf_p->hdr.seq_record_continuous = 0;	/* should be 1 */
+    /* the aa1b fields are not used with seqr_chains */
+    cur_buf_p->hdr.aa1b_used = 0;
+    cur_buf_p->hdr.aa1b_size = 0;
+
+#if defined(COMP_THR) || defined(PCOMPLIB)
+    RESULTS_BUF[i] = cur_buf_p;
+#endif
+  }
+
+  return lib_buf2_list;
+}
+
+/* **************************************************************** */
+/* new_seqr_chain() allocates space for:
+ struct seq_record seqr_base[max_chain_seqs];
+ struct mseq_record mseqr_base[max_chain_seqs];
+ if buffer required, unsigned char aa1b_base[aa1b_size]
+
+ comp_lib9.c tracks how much space is being used, and, if the space
+ is too large, reuses existing buffers.
+*/
+/* **************************************************************** */
+struct seqr_chain *
+new_seqr_chain(int max_chain_seqs, int aa1b_size, struct seqr_chain *old_seqr_chain,
+               int maxn, long *lost_memK, int alloc_buf_flg) {
+  struct seqr_chain *chain_p;
+  int buf_size;       /* size of the sequence buffer for the new link */
+  int old_has_buf;    /* non-zero when the previous link owns a sized buffer */
+
+  /* the chain link itself */
+  chain_p = (struct seqr_chain *)calloc(1,sizeof(struct seqr_chain));
+  if (chain_p == NULL) {
+    fprintf(stderr," Cannot allocate library seqr_chain\n");
+    exit(1);
+  }
+
+  old_has_buf = (old_seqr_chain != NULL && old_seqr_chain->aa1b_size > 0);
+
+  /* start from the requested size, but inherit the previous link's
+     buffer size when it has one; the first link in a chain resets the
+     lost-memory counter instead */
+  buf_size = aa1b_size;
+  if (old_seqr_chain == NULL) {
+    *lost_memK = 0l;
+  }
+  else {
+    old_seqr_chain->next = chain_p;
+    if (old_has_buf) {
+      buf_size = old_seqr_chain->aa1b_size;
+    }
+  }
+
+  chain_p->cur_seq_cnt = 0;
+  chain_p->max_chain_seqs = max_chain_seqs;
+
+  /* seq_record array */
+  chain_p->seqr_base =
+    (struct seq_record *)calloc((size_t)max_chain_seqs,sizeof(struct seq_record));
+  if (chain_p->seqr_base == NULL) {
+    fprintf(stderr," cannot allocate seq_record buffer %d\n",max_chain_seqs);
+    exit(1);
+  }
+
+  /* mseq_record array */
+  chain_p->mseqr_base =
+    (struct mseq_record *)calloc((size_t)max_chain_seqs,sizeof(struct mseq_record));
+  if (chain_p->mseqr_base == NULL) {
+    fprintf(stderr," cannot allocate mseq_record buffer %d\n",max_chain_seqs);
+    exit(1);
+  }
+
+  /* caller did not ask for a sequence buffer: aa1b_base stays NULL and
+     aa1b_size stays 0 (from calloc) */
+  if (!alloc_buf_flg) {
+    return chain_p;
+  }
+
+  if (old_has_buf) {
+    /* account (in K) for the part of the previous buffer left unused */
+    *lost_memK += ((old_seqr_chain->aa1b_size - old_seqr_chain->aa1b_next)>>10);
+
+    /* (a heuristic that shrank the buffer when sequences were shorter
+       than expected is disabled in this version) */
+
+    /* fewer than a quarter of the record slots were filled, i.e. the
+       sequences were longer than expected: double the buffer */
+    if (old_seqr_chain->cur_seq_cnt < old_seqr_chain->max_chain_seqs/4) {
+      buf_size *= 2;
+    }
+  }
+
+  /* the sequence buffer, with one extra leading byte */
+  chain_p->aa1b_base =
+    (unsigned char *)calloc((size_t)(buf_size+1),sizeof(unsigned char));
+  if (chain_p->aa1b_base == NULL) {
+    fprintf(stderr," cannot allocate buffer %d\n",buf_size+1);
+    exit(1);
+  }
+
+  /* aa1b_base points one past the start of the allocation */
+  chain_p->aa1b_base++;
+  chain_p->aa1b_size = buf_size;
+
+  return chain_p;
+}
+
+/* **************************************************************** */
+/* re-initialize seqr_chain so it can be filled with new sequences */
+/* **************************************************************** */
+void
+reinit_seqr_chain(struct seqr_chain *my_seqr_chain, int max_chain_seqs) {
+  /* Clear the record arrays using the chain's CURRENT max_chain_seqs
+     (end_seqr_chain() may have reduced it to the number of records
+     actually filled).  Skip the clear when the count is not positive:
+     end_seqr_chain() on an empty chain leaves it at -1, and a negative
+     count converted to size_t would become an enormous memset length. */
+  if (my_seqr_chain->max_chain_seqs > 0) {
+    memset(my_seqr_chain->seqr_base, 0,
+           my_seqr_chain->max_chain_seqs*sizeof(struct seq_record));
+    memset(my_seqr_chain->mseqr_base, 0,
+           my_seqr_chain->max_chain_seqs*sizeof(struct mseq_record));
+  }
+
+  /* Not every chain owns a sequence buffer (new_seqr_chain() with
+     alloc_buf_flg==0 leaves aa1b_base NULL and aa1b_size 0, and
+     free_seqr_chain() already checks for that); only clear a buffer
+     that exists.  aa1b_base points one byte past the start of the
+     allocation, hence the -1 / +1. */
+  if (my_seqr_chain->aa1b_base != NULL) {
+    memset(my_seqr_chain->aa1b_base-1, 0, my_seqr_chain->aa1b_size+1);
+  }
+
+  /* this is possible because every seqr_chain has max_chain_seqs seq_records;
+     some have a smaller my_seqr_chain->aa1b_size */
+  my_seqr_chain->max_chain_seqs = max_chain_seqs;
+  my_seqr_chain->cur_seq_cnt = 0;
+  my_seqr_chain->contiguous = 0;
+  my_seqr_chain->aa1b_next = 0;
+  /* cannot reset my_seqr_chain->next because it is needed for chain */
+}
+
+/* **************************************************************** */
+/* end_seqr_chain() -- cleans up cur_seq_cnt, max_chain_seqs when a
+ seqr_chain is full.
+
+ inputs: *last_seqr (the finished seqr_chain)
+
+ modifies: cur_seq_cnt (decremented by one) and max_chain_seqs (set to
+ the decremented cur_seq_cnt, so that later users of the chain see the
+ correct number of seq_records) */
+/* **************************************************************** */
+void
+end_seqr_chain(struct seqr_chain *last_seqr) {
+  /* cur_seq_cnt has been advanced one past the last record handed out;
+     step it back and make that the chain's record limit */
+  last_seqr->max_chain_seqs = --last_seqr->cur_seq_cnt;
+}
+
+/* **************************************************************** */
+/* free_seqr_chain() -- deallocates space allocated in seqr_chain.
+ frees seq_records, mseq_records, and aa1b buffer
+ inputs: seqr_chain
+*/
+/* **************************************************************** */
+void
+free_seqr_chain(struct seqr_chain *this_seqr_chain) {
+  struct seqr_chain *link_p, *after_p;
+
+  link_p = this_seqr_chain;
+  while (link_p != NULL) {
+    /* remember the successor before this link is released */
+    after_p = link_p->next;
+
+    /* some links have no sequence buffer (e.g. with get_mmap_chain);
+       when there is one, aa1b_base sits one byte past the start of the
+       allocation, so step back before freeing */
+    if (link_p->aa1b_base != NULL) {
+      link_p->aa1b_base -= 1;
+      free(link_p->aa1b_base);
+    }
+
+    free(link_p->mseqr_base);
+    free(link_p->seqr_base);
+    free(link_p);
+
+    link_p = after_p;
+  }
+}
+
+/* **************************************************************** */
+/* next_sequence_p() returns a new seq_p and mseq_p from a seqr_chain, or
+ returns NULL if none are available or no space is available in the
+ buffer
+
+ inputs: *cur_seqr_chain, **cur_mseq_p, *old_seq_p
+ (*cur_seqr_chain provides sequence buffer space)
+ modifies: **cur_mseq_p
+ returns: cur_seq_p
+*/
+/* **************************************************************** */
+
+struct seq_record *
+next_sequence_p(struct mseq_record **cur_mseq_p, struct seq_record *old_seq_p,
+ struct seqr_chain *cur_seqr_chain, int maxn) {
+ struct seq_record *new_seq_p;
+
+ /* if we have a previous sequence (which we do for all but the first),
+ use it to update aa1b_next */
+ if (old_seq_p) cur_seqr_chain->aa1b_next += old_seq_p->n1+1;
+
+ /* is there room in cur_seqr_chain (count<max_count, size +maxn < size */
+ if (cur_seqr_chain->cur_seq_cnt < cur_seqr_chain->max_chain_seqs
+ && cur_seqr_chain->aa1b_next + maxn < cur_seqr_chain->aa1b_size) {
+
+ /* update mseq_p */
+ *cur_mseq_p = cur_seqr_chain->mseqr_base + cur_seqr_chain->cur_seq_cnt;
+ /* update seq_p */
+ new_seq_p = cur_seqr_chain->seqr_base + cur_seqr_chain->cur_seq_cnt;
+ /* update aa1b for seq_p */
+ new_seq_p->aa1b = cur_seqr_chain->aa1b_base + cur_seqr_chain->aa1b_next;
+ /* update counter */
+ cur_seqr_chain->cur_seq_cnt++;
+ return new_seq_p;
+ }
+ else {
+ cur_seqr_chain->cur_seq_cnt++;
+ /* out of room in seqr_chain or aa1b_base */
+ return NULL;
+ }
+}
+
+/* **************************************************************** */
+/* getlib_info():
+ inputs: lib_list_p - the list of library files
+ maxn -- max size of sequence read
+ max_memK -- memory block counter max
+
+ output: struct getlib_str *getlib_info
+
+ maintain state information required for successive, asynchronous
+ getlib() calls */
+/* **************************************************************** */
+struct getlib_str *
+init_getlib_info(struct lib_struct *lib_list_p, int maxn,long max_memK) {
+ struct getlib_str *my_getlib_info;
+ unsigned char *aa1save;
+
+ if ((my_getlib_info=(struct getlib_str *)calloc(1,sizeof(struct getlib_str)))==NULL) {
+ s_abort ("Unable to allocate getlib_info", "");
+ }
+
+ if ((aa1save = (unsigned char *)calloc((maxn+1),sizeof (char))) == NULL) {
+ s_abort ("Unable to allocate library overlap", "");
+ }
+ *aa1save= '\0';
+ aa1save++;
+
+ my_getlib_info->aa1save = aa1save;
+ my_getlib_info->lib_list_p = lib_list_p;
+ my_getlib_info->use_memory = 0;
+ my_getlib_info->max_memK = max_memK;
+ my_getlib_info->n_libstr=sizeof(my_getlib_info->libstr);
+ my_getlib_info->loffset = 0l;
+ my_getlib_info->ocont = 0;
+ my_getlib_info->lcont = 0;
+
+ return my_getlib_info;
+}
+
+/* **************************************************************** */
+/* next_seqr_chain reads a library in seqr_cnt chunks, returning when
+ a chunk is full.
+
+ 8-July-2011 -- modified to directly return memory mapped encoded
+ sequences (BLASTDB)
+
+ next_seqr_chain saves enough of the getlib() state to be able to
+ continue reading in getlib_str *getlib_info, which also keeps the
+ information about the current seqr_chain record being used.
+
+ (1) get data structure to save one seq_chain buffer
+ new_seqr_chain(number of buffers= max_chain_seqs, seq_buf_size+1,...)
+ thus, a single seqr_chain() is guaranteed to be contiguous
+ (2) then get a current_seq_p, current_mseq_p
+ (3) getlib() to put the stuff in the current_seq_p (and current_mseq_p)
+ (4) keep doing this until the seq_chain buffer is filled
+*/
+/* **************************************************************** */
+struct seqr_chain *
+next_seqr_chain(const struct mng_thr *m_bufi_p, struct getlib_str *getlib_info,
+ struct buf_head *lib_bhead_p,
+ struct mngmsg *m_msp, const struct pstruct *ppst)
+{
+ struct seqr_chain *my_seqr_chain=NULL;
+ struct lib_struct *cur_lib_p;
+ struct seq_record *current_seq_p, *old_seq_p;
+ struct mseq_record *current_mseq_p;
+ struct lmf_str *m_fd;
+
+ int maxt; /* continued sequence */
+ int sstart, sstop, is, id, i;
+ int igncnt=0; /* count for ignoring sequences warning */
+ struct lmf_str *m_file_p;
+ unsigned char *aa1ptr, *aa1;
+ char *bp;
+ int n1;
+ /* int cont, ocont; */
+ /* long loffset; */
+
+ /* Are the sequences are in memory? if so, just return the next one. */
+ if (getlib_info->use_memory>0) {
+ my_seqr_chain = getlib_info->cur_seqr_chain;
+ if (my_seqr_chain == NULL) {
+ goto return_null;
+ }
+ else {
+ getlib_info->cur_seqr_chain = getlib_info->cur_seqr_chain->next;
+#ifdef DEBUG
+ lib_bhead_p->hdr.my_chain = my_seqr_chain;
+#endif
+ return my_seqr_chain;
+ }
+ }
+
+ /* we do not have complete seqr_chains, so we must do things one
+ record/sequence at a time */
+
+ current_seq_p = old_seq_p = NULL;
+ /* check for eof on current library; return NULL if EOF, or start
+ next library in chain */
+ if (getlib_info->eof) {
+ if (getlib_info->use_memory < 0) {
+ closelib(getlib_info->lib_list_p->m_file_p,1);
+ }
+ if (getlib_info->lib_list_p->next == NULL) {
+ /* finished with everything, use memory next time */
+ if (getlib_info->use_memory >= 0) getlib_info->use_memory = 1;
+ goto return_null; /* ensures that cur_seqr_chain is reset */
+ }
+ else {
+ getlib_info->eof = 0;
+ getlib_info->lib_list_p = getlib_info->lib_list_p->next;
+ }
+ }
+
+ if (getlib_info->lib_list_p == NULL) goto return_null;
+
+ /* check to see if a library is open; if not get one and open it */
+ if (getlib_info->lib_list_p->m_file_p == NULL) {
+ next_lib:
+ cur_lib_p = getlib_info->lib_list_p;
+ if ((cur_lib_p->m_file_p =
+ open_lib(cur_lib_p, m_msp->ldb_info.ldnaseq, lascii, !m_msp->quiet))
+ ==NULL) {
+ fprintf(stderr," cannot open library %s\n",cur_lib_p->file_name);
+ getlib_info->lib_list_p = getlib_info->lib_list_p->next;
+ if (getlib_info->lib_list_p == NULL) {
+ goto return_null;
+ }
+ else {
+ goto next_lib;
+ }
+ }
+ /* these values must be reset for new databases, but otherwise
+ should not be reset, because a sequence can span more than one
+ seqr_chain */
+ getlib_info->loffset = 0l;
+ getlib_info->ocont = 0;
+ }
+
+ /* here we have an open library */
+
+ /* if the library is NCBIBL20 and memory mapped, simply return
+ pointers to the memory map */
+ m_fd = getlib_info->lib_list_p->m_file_p;
+ if (m_fd->get_mmap_chain) {
+ /* get a new seqr_chain */
+ my_seqr_chain =
+ new_seqr_chain(m_bufi_p->max_chain_seqs,(m_bufi_p->seq_buf_size+1),
+ getlib_info->cur_seqr_chain, m_msp->ldb_info.maxn,
+ &getlib_info->lost_memK, 0);
+
+ /* keep track of number of seqr_chains */
+ m_msp->cur_seqr_cnt++;
+
+ /* fill it with memory mapped pointers */
+ if (m_fd->get_mmap_chain(my_seqr_chain, m_fd, &m_msp->db)==EOF) {
+ getlib_info->eof = 1;
+ }
+
+ lib_bhead_p->hdr.my_chain = my_seqr_chain;
+
+ getlib_info->cur_seqr_chain = my_seqr_chain;
+ if (getlib_info->start_seqr_chain == NULL) {
+ getlib_info->start_seqr_chain = my_seqr_chain;
+ }
+ my_seqr_chain->max_chain_seqs = my_seqr_chain->cur_seq_cnt;
+ return my_seqr_chain;
+ }
+
+ /* a seqr_chain contains an array for seq_records, mseq_records, and
+ the aa1b_base buffer for sequences */
+
+ /* this section transitions between the comp_lib6,7 strategy, which
+ kept all the sequence database in memory, simply allocating
+ additional seqr_chains's as required, and the comp_lib5 strategy,
+ which did not use seqr_chains, but linked the sequence buffers
+ with the thread buffers
+
+ seqr_chain buffers are allocated while memory is available and
+ there are fewer than max_work_buf (thread_buf) seqr chains. In
+ addition, each seqr_chain is associated with the thread buffer
+ that will use it.
+
+ If the memory limit is hit, then getlib_info->use_memory is set
+ negative, AND the seqr_chains linked to the thread buffers are
+ used -- each thread buffer is permanently associated with the
+ last seqr_chain that it used. Linking seqr_chains to thread
+ buffers (as is effectively done in comp_lib5) ensures that the
+ same seqr_chain is never used by more than one thread
+ simultaneously. */
+
+ /* if (a) we don't have m_bufi_p->max_work_buf chains or (b) memory
+ is still available, then get a new_seqr_chain */
+
+ if ((getlib_info->use_memory >= 0) &&
+ (getlib_info->start_seqr_chain == NULL || m_msp->cur_seqr_cnt < m_bufi_p->max_work_buf ||
+ getlib_info->tot_memK + ((m_bufi_p->seq_buf_size+1)>>10) < getlib_info->max_memK) /* memK is in 1024 bytes thus >>10 */
+ ) {
+ my_seqr_chain =
+ new_seqr_chain(m_bufi_p->max_chain_seqs,(m_bufi_p->seq_buf_size+1),
+ getlib_info->cur_seqr_chain, m_msp->ldb_info.maxn,
+ &getlib_info->lost_memK, 1);
+ getlib_info->tot_memK += (my_seqr_chain->aa1b_size >> 10);
+ m_msp->cur_seqr_cnt++;
+ lib_bhead_p->hdr.my_chain = my_seqr_chain;
+ }
+ /* if the memory limit has been reached, then keep using the first
+ one (after re-initializing) */
+ else {
+ my_seqr_chain = lib_bhead_p->hdr.my_chain;
+ if (!my_seqr_chain) {
+ fprintf(stderr,"[***%s/next_seqr_chain***] lib_bhead_p->hdr.my_chain==NULL\n",prog_func);
+ exit(1);
+ }
+ reinit_seqr_chain(my_seqr_chain,m_bufi_p->max_chain_seqs);
+ getlib_info->use_memory = -1;
+ }
+
+ getlib_info->cur_seqr_chain = my_seqr_chain;
+
+ if (getlib_info->start_seqr_chain == NULL) {
+ getlib_info->start_seqr_chain = my_seqr_chain;
+ }
+
+ /* (re-)initialize variables from getlib_info */
+ m_file_p = getlib_info->lib_list_p->m_file_p;
+
+ /* then we can count through the seq_records in the
+ seqr_chain and get a getlib() sequence for each one */
+
+ while ((current_seq_p =
+ next_sequence_p(¤t_mseq_p, old_seq_p, my_seqr_chain,
+ m_msp->ldb_info.maxn))) {
+
+ aa1 = current_seq_p->aa1b;
+
+ if (getlib_info->lcont) {
+ maxt = m_msp->ldb_info.maxt3;
+ memcpy(aa1,getlib_info->aa1save,m_msp->ldb_info.l_overlap);
+ aa1ptr= &aa1[m_msp->ldb_info.l_overlap]; /* aa1ptr is where the next GETLIB sequence goes */
+ current_mseq_p->lseek = getlib_info->lseek;
+ }
+ else {
+ maxt = m_msp->ldb_info.maxn;
+ aa1ptr = aa1;
+ }
+
+ n1=GETLIB(aa1ptr, maxt, getlib_info->libstr, getlib_info->n_libstr,
+ &(current_mseq_p->lseek), &getlib_info->lcont, m_file_p, &(current_seq_p->l_off));
+
+ /* if the library is empty, check for another library */
+ if (n1 < 0) {
+ /* set EOF for this file */
+ getlib_info->eof = 1;
+ /* reduce the seqr_chain count */
+ end_seqr_chain(my_seqr_chain);
+ return my_seqr_chain;
+ }
+
+ old_seq_p = current_seq_p;
+
+#ifdef DEBUG
+ /* check for out of range sequence */
+ /*
+ for (id=0; id<n1; id++) {
+ if (aa1[id] > ppst->nsq_e) {
+ fprintf(stderr," *** ERROR *** %s[%d] = %d > %d out of range\n",libstr, id, aa1[id], ppst->nsq_e);
+ aa1[id] = 1;
+ }
+ }
+ */
+#endif
+
+ /* which sequence is it in the entire library? used for debugging */
+ current_seq_p->index = seq_index;
+ current_mseq_p->index = seq_index++;
+ current_mseq_p->m_file_p = (void *)m_file_p;
+ current_mseq_p->cont = getlib_info->ocont+1;
+ current_seq_p->l_offset = getlib_info->loffset;
+
+ if ((bp=strchr(getlib_info->libstr,' '))!=NULL) *bp='\0';
+ strncpy(current_mseq_p->libstr,getlib_info->libstr,MAX_UID); /* get old libstr for lcont>0 */
+
+ /* add termination code for FASTX/FASTY if necessary */
+ if (m_msp->ldb_info.term_code && !getlib_info->lcont &&
+ m_msp->ldb_info.ldnaseq==SEQT_PROT &&
+ aa1ptr[n1-1]!=m_msp->ldb_info.term_code) {
+ aa1ptr[n1++]=m_msp->ldb_info.term_code;
+ aa1ptr[n1]=0;
+ }
+
+ /* check for subset */
+ if (m_file_p->opt_text[0]!='\0') {
+ if (m_file_p->opt_text[0]=='-') {
+ sstart=0; sscanf(&m_file_p->opt_text[1],"%d",&sstop);
+ }
+ else {
+ sstart = 0; sstop = -1;
+ sscanf(&m_file_p->opt_text[0],"%d-%d",&sstart,&sstop);
+ sstart--;
+ if (sstop <= 0 ) sstop = BIGNUM;
+ }
+
+ n1 = min(n1, sstop);
+ for (id=0,is=sstart; is<n1; ) {
+ aa1ptr[id++]=aa1ptr[is++];
+ }
+ aa1ptr[id]='\0';
+ n1 -= sstart;
+ current_seq_p->l_off += sstart;
+ }
+
+ /* update n1 after possible changes */
+ current_seq_p->n1 = n1;
+
+#ifdef DEBUG
+ if (getlib_info->n_libstr <= MAX_UID) {
+ if ((bp=strchr(current_mseq_p->libstr,' '))!=NULL) *bp='\0';
+ }
+ if (aa1[-1]!='\0' || aa1ptr[n1]!='\0') {
+ fprintf(stderr,"%s: aa1[%d] at %ld:%lld missing NULL boundaries: %d %d\n",
+ current_mseq_p->libstr,n1, m_msp->db.entries+1,current_mseq_p->lseek,
+ aa1[-1],aa1ptr[n1]);
+ }
+#endif
+ /* check for a continued sequence and provide a pointer to
+ the n1_tot array if lcont || ocont */
+ getlib_info->n1tot_v += n1;
+ if (getlib_info->lcont && !getlib_info->ocont) { /* get a new pointer */
+ if (getlib_info->n1tot_cnt <= 0) {
+ if ((getlib_info->n1tot_ptr=calloc(1000,sizeof(int)))==NULL) {
+ fprintf(stderr," cannot allocate n1tot_ptr\n");
+ exit(1);
+ }
+ else {getlib_info->n1tot_cnt=1000;}
+ }
+ getlib_info->n1tot_cnt--;
+ getlib_info->n1tot_cur = getlib_info->n1tot_ptr++;
+ }
+ current_mseq_p->n1tot_p = getlib_info->n1tot_cur;
+
+ m_msp->db.entries++;
+ m_msp->db.length += n1;
+ if (m_msp->db.length > LONG_MAX) {
+ m_msp->db.length -= LONG_MAX; m_msp->db.carry++;
+ }
+
+ /* don't count long sequences more than once */
+ if (aa1!=aa1ptr) { /* this is a continuation */
+ current_seq_p->n1 = n1 += m_msp->ldb_info.l_overlap; /* corrected 28-June-2008 */
+ m_msp->db.entries--;
+ }
+
+#ifdef DEBUG
+ current_seq_p->adler32_crc =
+ current_mseq_p->adler32_crc = adler32(1L,current_seq_p->aa1b,current_seq_p->n1);
+
+ /* This finds most reasons for core dumps */
+ if (ppst->debug_lib)
+ for (i=0; i<n1; i++) {
+ if (aa1[i]>ppst->nsqx || aa1[i] <= 0) {
+ fprintf(stderr,
+ "%s residue[%d/%d] %d range (%d) lcont/ocont: %d/%d\n%s\n",
+ current_mseq_p->libstr,i,current_seq_p->n1,aa1[i],ppst->nsq,
+ getlib_info->lcont,getlib_info->ocont,aa1ptr+i);
+ aa1[i]=0;
+ n1=i-1;
+ break;
+ }
+ }
+#endif
+
+ if ( n1 <= 1) {goto loop2;}
+
+ if (getlib_info->lcont) {
+ memcpy(getlib_info->aa1save,&aa1[n1-m_msp->ldb_info.l_overlap],m_msp->ldb_info.l_overlap);
+ }
+
+ /* all done with the previous getlib(), now get a new
+ current_seq_p (current_mseq_p) and reset aa1 */
+ /* next_sequence_p knows if there is space in the aa1b buffer */
+
+ loop2:
+ if (getlib_info->lcont) {
+ /* this must be n1, which is the old value, not current_seq_p->n1 */
+ getlib_info->loffset += n1 - m_msp->ldb_info.l_overlap;
+
+ /* ocont must be preserved across seqr_chains */
+ getlib_info->ocont = getlib_info->lcont;
+
+ /* if lcont>0, then lseek must be preserved; getlib() functions
+ do not set lseek (loffset) if lcont>0 */
+ getlib_info->lseek = current_mseq_p->lseek;
+ }
+ else {
+ if (getlib_info->ocont) {
+ *getlib_info->n1tot_cur = getlib_info->n1tot_v;
+ }
+ getlib_info->ocont = 0;
+ getlib_info->loffset = 0l;
+ getlib_info->n1tot_v = 0;
+ getlib_info->n1tot_cur = NULL;
+ }
+ } /* end cur_lib_p */
+
+ end_seqr_chain(my_seqr_chain);
+ return my_seqr_chain;
+
+ return_null:
+ getlib_info->cur_seqr_chain = getlib_info->start_seqr_chain;
+ return NULL;
+}
+
+ /* reset_seqr_chain() prepares the seqr_chain list (in memory) for
+    another search: in every link, cur_seq_cnt is set to that link's
+    max_chain_seqs */
+ void
+ reset_seqr_chain(struct seqr_chain *seqr_base) {
+   struct seqr_chain *link_p = seqr_base;
+
+   while (link_p) {
+     link_p->cur_seq_cnt = link_p->max_chain_seqs;
+     link_p = link_p->next;
+   }
+ }
+
+ /* close_lib_list() walks the list of libraries and calls closelib()
+    on every entry that has an m_file_p.  The m_file_p (and its lline
+    buffer) is freed when mm_force is set or when closelib() left
+    ->libf NULL.  With free_flag set, the list entries themselves
+    (file_name and the lib_struct) are freed as well. */
+ void
+ close_lib_list(struct lib_struct *lib_list_p, int free_flag, int mm_force) {
+   struct lib_struct *lib_p, *following_p;
+
+   lib_p = lib_list_p;
+   while (lib_p != NULL) {
+     /* pick up the successor first; lib_p may be freed below */
+     following_p = lib_p->next;
+
+     if (lib_p->m_file_p != NULL) {
+       closelib(lib_p->m_file_p,mm_force);
+       if (mm_force || lib_p->m_file_p->libf == NULL) {
+         free(lib_p->m_file_p->lline);
+         free(lib_p->m_file_p);
+         lib_p->m_file_p = NULL;
+       }
+     }
+
+     if (free_flag) {
+       free(lib_p->file_name);
+       free(lib_p);
+     }
+
+     lib_p = following_p;
+   }
+ }
+
+ /* preserve_seq() copies the seq/mseq records of a thread buffer entry
+    into best_seqs[]/best_mseqs[] so that the thread buffer seq/mseq
+    can be re-used.
+
+    The destination slot has the same index that
+    lib_buf2_dp->best_save has in best[].  best_save, and the entries
+    that follow it on the bbp_link list for as long as they refer to
+    the same seq record with the same n1, are re-pointed to the copies.
+    The copied seq_record gets aa1b = NULL.
+ */
+
+ void preserve_seq(struct buf2_data_s *lib_buf2_dp,
+                   struct seq_record *best_seqs,
+                   struct mseq_record *best_mseqs,
+                   struct beststr *best) {
+   struct seq_record *kept_seq_p, *old_seq_p;
+   struct mseq_record *kept_mseq_p;
+   struct beststr *link_bbp;
+
+   /* the record the best[] entries currently point at */
+   old_seq_p = lib_buf2_dp->best_save->seq;
+
+   /* the thread buffer entry knows where its result is saved in best[] */
+   kept_seq_p = best_seqs + (lib_buf2_dp->best_save - best);
+   kept_mseq_p = best_mseqs + (lib_buf2_dp->best_save - best);
+
+   lib_buf2_dp->best_save->seq = kept_seq_p;
+   lib_buf2_dp->best_save->mseq = kept_mseq_p;
+
+   /* re-point linked entries that share the same sequence */
+   link_bbp = lib_buf2_dp->best_save->bbp_link;
+   while ((link_bbp != NULL) && (link_bbp->seq == old_seq_p)
+          && (link_bbp->n1 == old_seq_p->n1)) {
+     link_bbp->seq = kept_seq_p;
+     link_bbp->mseq = kept_mseq_p;
+     link_bbp = link_bbp->bbp_link;
+   }
+
+   memcpy(kept_seq_p,lib_buf2_dp->seq,sizeof(struct seq_record));
+   memcpy(kept_mseq_p,lib_buf2_dp->mseq,sizeof(struct mseq_record));
+   kept_seq_p->aa1b = NULL;
+ }
+
+ /* preserve_seq2() copies the seq/mseq records referenced by a beststr
+    entry into best_seqs[]/best_mseqs[] so that the original seq/mseq
+    storage can be re-used.
+
+    The destination slot has the same index that bbp has in best[].
+    The entries that follow bbp on the bbp_link list are re-pointed to
+    the copies for as long as they refer to the same seq record with
+    the same n1; bbp itself is re-pointed last, after the copy has been
+    made from bbp->seq / bbp->mseq.  The copied seq_record gets
+    aa1b = NULL.
+ */
+
+ void preserve_seq2(struct beststr *bbp,
+                    struct seq_record *best_seqs,
+                    struct mseq_record *best_mseqs,
+                    struct beststr *best) {
+   struct seq_record *kept_seq_p, *old_seq_p;
+   struct mseq_record *kept_mseq_p;
+   struct beststr *link_bbp;
+
+   old_seq_p = bbp->seq;
+
+   /* destination slots share bbp's index in best[] */
+   kept_seq_p = best_seqs + (bbp - best);
+   kept_mseq_p = best_mseqs + (bbp - best);
+
+   /* re-point linked entries that share the same sequence */
+   link_bbp = bbp->bbp_link;
+   while ((link_bbp != NULL) && (link_bbp->seq == old_seq_p)
+          && (link_bbp->n1 == old_seq_p->n1)) {
+     link_bbp->seq = kept_seq_p;
+     link_bbp->mseq = kept_mseq_p;
+     link_bbp = link_bbp->bbp_link;
+   }
+
+   /* copy while bbp still points at the source records */
+   memcpy(kept_seq_p,bbp->seq,sizeof(struct seq_record));
+   memcpy(kept_mseq_p,bbp->mseq,sizeof(struct mseq_record));
+   kept_seq_p->aa1b = NULL;
+
+   bbp->seq = kept_seq_p;
+   bbp->mseq = kept_mseq_p;
+ }
+
+/* **************************************************************** */
+/* seqr_chain_work() (comp_lib8/9.c, comp_lib7e.c) corresponds to */
+/* comp_lib5e.c/getlib_buf_work() */
+/* in current versions, these functions do all the initial library */
+/* sequence similarity scan -- they exist to be used with expansion */
+/* link libraries */
+/* **************************************************************** */
+void
+seqr_chain_work(unsigned char **aa0, unsigned char *aa0s, struct buf_head *lib_bhead_p,
+ struct getlib_str *getlib_info, const struct mng_thr *m_bufi_p,
+ struct mngmsg *m_msp, struct pstruct *ppst, void *pstat_void,
+ struct db_str *ldb, struct hist_str *histp, struct score_count_s *s_info,
+#if !defined(COMP_THR) && !defined(PCOMPLIB)
+ unsigned char *aa1shuff, void *f_str, void *qf_str,
+#endif
+ struct seq_record *best_seqs, struct mseq_record *best_mseqs,
+ struct beststr *best, FILE *fdata) {
+
+ struct seqr_chain *current_seqr_chain_p;
+ struct buf2_data_s *lib_buf2_dp;
+ struct seq_record *current_seq_p;
+ struct mseq_record *current_mseq_p;
+ int i, itt, jstats;
+ int seqr_rec_cnt; /* count through sequences in seqr_chain */
+ int buf2_shuff_mask; /* pre-calculate when shuffles requested */
+
+ /* set shuffle mask for -z > 10 */
+ buf2_shuff_mask = 0;
+ if (ppst->zsflag >= 10 && ppst->zsflag < 20) {
+ buf2_shuff_mask = BUF2_DOSHUF;
+ }
+
+ /* must have a valid lib_bhead_p starting into this loop */
+ /* start the search */
+
+ while ((current_seqr_chain_p
+ = next_seqr_chain(m_bufi_p, getlib_info, lib_bhead_p,
+ m_msp, ppst))) {
+
+ /* to enable the transition from everything in memory (comp_lib7e)
+ to buffers read as neeed (comp_lib5e), comp_lib9.c requires that
+ lib_bhead_p accommodate a full seqr_chain */
+
+ lib_bhead_p->hdr.buf2_cnt = 0;
+ lib_bhead_p->hdr.have_results = 0;
+ lib_bhead_p->hdr.stop_work = 0;
+ lib_bhead_p->hdr.buf2_type= (BUF2_DOWORK | buf2_shuff_mask);
+ lib_buf2_dp = lib_bhead_p->buf2_data;
+ lib_bhead_p->hdr.aa1b_start = current_seqr_chain_p->aa1b_base;
+ lib_bhead_p->hdr.aa1b_used = 0;
+ lib_bhead_p->hdr.seqr_cnt = 0;
+ lib_bhead_p->hdr.seq_b = current_seqr_chain_p->seqr_base;
+
+ current_seq_p = current_seqr_chain_p->seqr_base;
+ current_mseq_p = current_seqr_chain_p->mseqr_base;
+ seqr_rec_cnt = current_seqr_chain_p->cur_seq_cnt;
+
+ while (seqr_rec_cnt-- > 0) {
+ lib_buf2_dp->seq = current_seq_p;
+ lib_buf2_dp->mseq = current_mseq_p;
+ lib_bhead_p->hdr.aa1b_used += current_seq_p->n1+1;
+
+#ifdef DEBUG
+ if (current_seq_p->aa1b[-1] != '\0') {
+ fprintf(stderr," invalid current_seq_p->aa1b[-1] = %d\n",current_seq_p->aa1b[-1]);
+ }
+#endif
+ /* check to see whether this score (or a shuff score) should
+ be included in statistics */
+
+ jstats = samp_stats_idx(&pre_nstats, nstats, rand_state);
+
+#ifdef PCOMPLIB
+ lib_buf2_dp->seq_dup = 0; /* mark first ->seq as original, not duplicate */
+#endif
+ for (itt=m_msp->revcomp; itt<=m_msp->nitt1; itt++) {
+ lib_buf2_dp->frame = itt;
+ lib_buf2_dp->stats_idx = jstats;
+ lib_buf2_dp++; /* point to next buf2 */
+ lib_bhead_p->hdr.buf2_cnt++;
+
+ /* point to the current sequence */
+#ifdef PCOMPLIB
+ lib_buf2_dp->seq_dup = 1; /* mark duplicates */
+#endif
+ lib_buf2_dp->seq = current_seq_p;
+ lib_buf2_dp->mseq = current_mseq_p;
+ } /* for (itt .. */
+ lib_bhead_p->hdr.seqr_cnt++;
+ current_seq_p++; /* ready for the next seq_p, necessary
+ to re-initialize lib_bhead_p->aa1b_base */
+ current_mseq_p++;
+ }
+
+ /* now I have a full lib_bhead_p buffer, which corresponds to a
+ seqr_chain; send it off and get a new one */
+
+#if defined(COMP_THR) || defined(PCOMPLIB) /* if COMP_THR/PCOMPLIB - fill and empty buffers */
+ /* provide filled buffer to workers */
+ lib_bhead_p->hdr.have_data = 1;
+ lib_bhead_p->hdr.seq_record_continuous = 1;
+ put_rbuf(lib_bhead_p,m_bufi_p->max_work_buf);
+ get_rbuf(&lib_bhead_p,m_bufi_p->max_work_buf); /* get an empty buffer to fill */
+#else /* just do the searches */
+ if (lib_bhead_p->hdr.buf2_type & BUF2_DOWORK) {
+ buf_do_work(aa0, m_msp->n0, lib_bhead_p, m_msp->nitt1, ppst, f_str);
+ if (m_msp->qshuffle)
+ buf_qshuf_work(aa0s,m_msp->n0, lib_bhead_p, m_msp->nitt1, ppst, qf_str, ppst->score_ix);
+ }
+ if (lib_bhead_p->hdr.buf2_type & BUF2_DOSHUF) {
+ buf_shuf_work(aa0,m_msp->n0, aa1shuff, lib_bhead_p, m_msp->nitt1, ppst,
+ f_str, ppst->score_ix,rand_state);
+ }
+#endif
+
+ /* "empty" buffers have results that must be processed */
+ if (lib_bhead_p->hdr.buf2_cnt && lib_bhead_p->hdr.have_results) {
+ save_best2(lib_bhead_p,m_msp, ppst, ldb, fdata,
+ histp, pstat_void, s_info);
+
+ if (getlib_info->use_memory <= 0 && getlib_info->lib_list_p->m_file_p->get_mmap_chain==NULL) {
+ /* this section of code is only used for re-cycled buffers */
+ if (lib_bhead_p->hdr.have_best_save) {
+ lib_buf2_dp = lib_bhead_p->buf2_data;
+ while (lib_bhead_p->hdr.buf2_cnt--) {
+ if (lib_buf2_dp->best_save != NULL) {
+ preserve_seq(lib_buf2_dp, best_seqs, best_mseqs, best);
+ }
+ lib_buf2_dp->best_save = NULL;
+ lib_buf2_dp++;
+ }
+ lib_bhead_p->hdr.have_best_save = 0;
+ }
+ }
+ }
+ lib_bhead_p->hdr.buf2_cnt = 0;
+ lib_bhead_p->hdr.have_data=0;
+ } /* end seqr_chain loop */
+
+#if defined(COMP_THR) || defined(PCOMPLIB)
+ /* send off the final data buffer */
+ lib_bhead_p->hdr.have_data = 1; /* ignored if buf2_cnt <= 0 */
+ put_rbuf(lib_bhead_p,m_bufi_p->max_work_buf);
+
+ /* wait for the threads to finish */
+ wait_rbuf(m_bufi_p->max_work_buf);
+ /* wait_rbuf(m_bufi_p->max_work_buf - empty_reader_bufs); */
+
+ /* save the final results */
+ /* this loop assumes that the seq_record/mseq_record buffers pointed
+ to by RESULTS_BUF[i] are independent; thus there must be at least
+ num_reader_bufs/m_bufi.max_work_buf seqr chains.
+ */
+ for (i=0; i < num_reader_bufs; i++) {
+ save_best2(RESULTS_BUF[i],m_msp, ppst, ldb, fdata,
+ histp, pstat_void, s_info);
+
+ if (getlib_info->use_memory <= 0 && getlib_info->lib_list_p && getlib_info->lib_list_p->m_file_p->get_mmap_chain==NULL) {
+ /* this section of code is only used for re-cycled buffers */
+ if (lib_bhead_p->hdr.have_best_save) {
+ lib_buf2_dp = lib_bhead_p->buf2_data;
+ while (lib_bhead_p->hdr.buf2_cnt--) {
+ if (lib_buf2_dp->best_save != NULL) {
+ preserve_seq(lib_buf2_dp, best_seqs, best_mseqs, best);
+ }
+ lib_buf2_dp->best_save = NULL;
+ lib_buf2_dp++;
+ }
+ lib_bhead_p->hdr.have_best_save = 0;
+ }
+ }
+ RESULTS_BUF[i]->hdr.buf2_cnt = RESULTS_BUF[i]->hdr.have_results = 0;
+ }
+#else /* !defined(COMP_THR || PCOMPLIB) */
+ if (lib_bhead_p->hdr.buf2_type & BUF2_DOWORK) {
+ buf_do_work(aa0, m_msp->n0, lib_bhead_p, m_msp->nitt1, ppst, f_str);
+ if (m_msp->qshuffle)
+ buf_qshuf_work(aa0s,m_msp->n0, lib_bhead_p, m_msp->nitt1, ppst, qf_str, ppst->score_ix);
+ }
+
+ if (lib_bhead_p->hdr.buf2_type & BUF2_DOSHUF) {
+ buf_shuf_work(aa0, m_msp->n0, aa1shuff, lib_bhead_p, m_msp->nitt1,
+ ppst, f_str, ppst->score_ix, rand_state);
+ }
+
+ save_best2(lib_bhead_p,m_msp, ppst, ldb, fdata,
+ histp, pstat_void, s_info);
+
+ lib_bhead_p->hdr.buf2_cnt = lib_bhead_p->hdr.have_results = 0;
+#endif
+}
+
+/* **************************************************************** */
+/* display_result() allows more flexible definitions of whether a
+ result is displayed. Orginally, it was simply
+ bbp->rst.escore<m_msg.e_low, but we may want to require at least
+ one score display */
+/* **************************************************************** */
+int
+display_result(struct beststr *bbp, struct mngmsg *m_msp, int lt_flag) {
+
+ if (lt_flag) {
+ if (bbp->rst.escore <= m_msp->e_cut) return 1;
+ else return 0;
+ }
+ else {
+ if (bbp->rst.escore > m_msp->e_low) return 1;
+ else return 0;
+ }
+}
+
+/* **************************************************************** */
+/* fset_vars() is similar to initenv(), but sets some m_msp, ppst
+ values based on threaded/unthreaded variables
+ initfa.c and doinit.c do not know about COMP_THR/PCOMPLIB.
+ MAX_MEMK varies depending on COMP_THR in defs.h
+ */
+/* **************************************************************** */
+void fset_vars(struct mngmsg *m_msp, struct pstruct *ppst) {
+ long l_tmp;
+ char *cptr, ctmp;
+
+ m_msp->max_memK = MAX_MEMK;
+ if ((cptr=getenv("LIB_MEMK"))!=NULL) {
+ ctmp = '\0';
+ sscanf(cptr,"%ld%c",&l_tmp, &ctmp);
+ if (l_tmp <= 0) m_msp->max_memK = BIGNUM;
+ else {
+ l_tmp *= 1024;
+ if (ctmp == 'G') l_tmp *= 1024;
+ m_msp->max_memK = l_tmp;
+ }
+ }
+}
+
+/* **************************************************************** */
+/* one-time initialization of best, bestp_arr, best_seq, best_mseq,
+ stats, rstats */
+/* **************************************************************** */
+void
+init_beststats(struct beststr **best, struct beststr ***bestp_arr,
+ struct seq_record **best_seqs, struct mseq_record **best_mseqs,
+ struct stat_str **stats, struct stat_str **rstats,
+ int shuff_max, int link_flag) {
+
+ /* Allocate space for saved scores */
+ if ((*best =
+ (struct beststr *)calloc((MAX_BEST+1),sizeof(struct beststr)))==NULL) {
+ s_abort("Cannot allocate best struct","");
+ }
+ if ((*bestp_arr =
+ (struct beststr **)malloc((MAX_BEST+1)*sizeof(struct beststr *)))==NULL) {
+ s_abort("Cannot allocate bestp_arr","");
+ }
+
+ /* initialize high score boundary */
+ (*bestp_arr)[0] = *best;
+ (*best)[0].rst.score[0]=(*best)[0].rst.score[1]=(*best)[0].rst.score[2]= INT_MAX;
+ (*best)[0].rst.escore=FLT_MIN; /* for E()-values, lower is best */
+ (*best)[0].zscore=FLT_MAX; /* for Z-scores, bigger is best */
+
+ (*best)++; (*bestp_arr)++; /* ensures that the 0-th value is extreme */
+
+ /* save best score sequence info -- used if there is not enough memory for library */
+ if ((*best_seqs =
+ (struct seq_record *)calloc((MAX_BEST+1),sizeof(struct seq_record)))==NULL) {
+ s_abort("Cannot allocate best_seqs","");
+ }
+
+ if ((*best_mseqs =
+ (struct mseq_record *)calloc((MAX_BEST+1),sizeof(struct mseq_record)))==NULL) {
+ s_abort("Cannot allocate best_seqs","");
+ }
+
+ /* allocate space for sampled scores */
+ if ((*stats =
+ (struct stat_str *)calloc(MAX_STATS,sizeof(struct stat_str)))==NULL) {
+ s_abort ("Cannot allocate stats struct","");
+ }
+
+ /* allocate space for shuffled library scores */
+ if ((*rstats =
+ (struct stat_str *)calloc(shuff_max,sizeof(struct stat_str)))==NULL) {
+ s_abort ("Cannot allocate rstats struct","");
+ }
+}
+
+ #ifdef DEBUG
+ /* check_rbuf() -- DEBUG-only consistency check of a buffer.
+    For each of the hdr.buf2_cnt entries in cur_buf->buf2_data, reports
+    on stderr either a NULL ->seq / ->seq->aa1b pointer, or a mismatch
+    between the stored ->seq->adler32_crc and adler32() recomputed over
+    ->seq->aa1b[0..n1-1].  The CRC is only checked when the pointers
+    are valid. */
+ void
+ check_rbuf(struct buf_head *cur_buf) {
+   int buf_cnt, index;
+   struct buf2_data_s *cur_buf2_dp;
+
+   index = 0;
+   buf_cnt = cur_buf->hdr.buf2_cnt;
+   cur_buf2_dp = cur_buf->buf2_data;
+
+   while (buf_cnt-- > 0) {
+     if (cur_buf2_dp->seq == NULL || cur_buf2_dp->seq->aa1b == NULL) {
+       fprintf(stderr, "*** [%s/check_rbuf] NULL buffer->seq entry: at %d\n",prog_func, index);
+     }
+     /* only dereference ->seq / ->aa1b once they are known to be set */
+     else if (cur_buf2_dp->seq->adler32_crc != adler32(1L,cur_buf2_dp->seq->aa1b, cur_buf2_dp->seq->n1)) {
+       fprintf(stderr, "*** [%s/check_rbuf] CRC mismatch at %d (%d)\n",prog_func, index,cur_buf2_dp->seq->n1);
+     }
+     cur_buf2_dp++;
+     index++;
+   }
+ }
+ #endif
diff --git a/src/compacc2.c b/src/compacc2.c
new file mode 100644
index 0000000..2f06072
--- /dev/null
+++ b/src/compacc2.c
@@ -0,0 +1,4119 @@
+/* $Id: compacc2.c 1280 2014-08-21 00:47:55Z wrp $ */
+
+/* copyright (c) 1996, 1997, 1998, 1999, 2014 by William R. Pearson and
+ The Rector & Visitors of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+/* Concurrent read version */
+
+#include <stdio.h>
+#include <stdlib.h>
+#if defined(UNIX)
+#include <unistd.h>
+#endif
+#if defined(UNIX) || defined(WIN32)
+#include <sys/types.h>
+#endif
+
+#include <limits.h>
+#include <ctype.h>
+#include <float.h>
+
+#include <string.h>
+#include <time.h>
+#include <math.h>
+
+#include "defs.h"
+#include "param.h"
+#include "structs.h"
+
+#include "mm_file.h"
+#include "best_stats.h"
+
+#define XTERNAL
+#include "uascii.h"
+#include "upam.h"
+#undef XTERNAL
+
+#ifdef DEBUG
+extern char ext_qtitle[];
+#endif
+
+extern void abort ();
+
+#include "drop_func.h" /* get init_work() */
+/* drop_func.h includes dyn_string.h */
+
+void revcomp(unsigned char *seq, int n, int *c_nt);
+extern void qshuffle(unsigned char *aa0, int n0, int nm0, void *);
+#ifdef DEBUG
+unsigned long adler32(unsigned long, const unsigned char *, unsigned int);
+#endif
+
+void
+s_annot_to_aa1a(long offset, int n1, struct annot_str *annot_p, unsigned char *ann_arr, char *tmp_line);
+
+extern void add_annot_def(struct mngmsg *m_msp, char *line, int qa_flag);
+int add_annot_char(unsigned char *ann_arr, char ctmp_label);
+
+int get_annot(char *sname, struct mngmsg *, char *bline, long offset, int n1,
+ struct annot_str **annot_p,int target, int debug);
+int
+get_annot_list(char *sname, struct mngmsg *m_msp, struct beststr **bestp_arr,
+ int nbest,int target, int debug);
+void
+print_sum(FILE *fd, struct db_str *qtt, struct db_str *ntt, int in_mem, long mem_use);
+int
+check_seq_range(unsigned char *aa1b, int n1, int nsq, char *str);
+/* print timing information */
+extern void ptime (FILE *, long);
+
+/* init_aa0() - prepare the query buffers and per-frame search state.
+
+   This function consolidates code in comp_lib4.c for non-threaded, and
+   in work_thr2.c (threads) and work_comp2.c (worker nodes).
+
+   aa0[0] must already hold the n0-residue query with room for SEQ_PAD
+   trailing bytes, which are zeroed here.  On return:
+     aa0[1..5]  alias aa0[0], except that aa0[1] is a newly allocated
+                copy passed through revcomp() when qframe == 2
+     f_str[0]   has been set up by init_work(); f_str[1..5] alias it,
+                except that f_str[1] is set up from aa0[1] when
+                qframe == 2
+     *aa0s      (only if qshuffle_flg) is a qshuffle()d copy of the
+                query, and *qf_str has been set up from it
+     *aa1s      points one byte into a zeroed buffer of max_tot+1
+                bytes (space for shuffled library sequences)
+   Every buffer allocated here starts with a '\0' byte and the pointer
+   handed back is advanced past it.  Any allocation failure is fatal.
+*/
+
+void
+init_aa0(unsigned char **aa0, int n0, int nm0,
+         unsigned char **aa0s, unsigned char **aa1s,
+         int qframe, int qshuffle_flg, int max_tot,
+         struct pstruct *ppst, void **f_str, void **qf_str,
+         void *my_rand_state) {
+  int id;
+
+  /* note that aa[5,4,3,2] are never used, but are provided so that frame
+     can range from 0 .. 5; likewise for f_str[5..2] */
+
+  aa0[5] = aa0[4] = aa0[3] = aa0[2] = aa0[1] = aa0[0];
+
+  /* zero out for SSE2/ALTIVEC -- make sure this is ALWAYS done */
+  for (id=0; id < SEQ_PAD; id++) aa0[0][n0+id] = '\0';
+
+  init_work (aa0[0], n0, ppst, &f_str[0]);
+  f_str[5] = f_str[4] = f_str[3] = f_str[2] = f_str[1] = f_str[0];
+
+  if (qframe == 2) {
+    if ((aa0[1]=(unsigned char *)calloc((size_t)n0+2+SEQ_PAD,sizeof(unsigned char)))==NULL) {
+      fprintf(stderr,"*** error [%s:%d] - cannot allocate aa01[%d]\n", __FILE__, __LINE__, n0);
+      /* fatal, like the other allocations below; previously execution
+         continued and wrote through the NULL pointer */
+      exit(1);
+    }
+    *aa0[1]='\0';
+    aa0[1]++;
+    memcpy(aa0[1],aa0[0],n0+1);
+    /* for ALTIVEC/SSE2, must pad with 16 NULL's */
+    for (id=0; id<SEQ_PAD; id++) {aa0[1][n0+id]=0;}
+    revcomp(aa0[1],n0,ppst->c_nt);
+    init_work (aa0[1], n0, ppst, &f_str[1]);
+  }
+
+  if (qshuffle_flg) {
+    if ((*aa0s=(unsigned char *)calloc(n0+2+SEQ_PAD,sizeof(char)))==NULL) {
+      fprintf(stderr,"*** error [%s:%d] - cannot allocate aa0s[%d]\n",__FILE__, __LINE__, n0+2);
+      exit(1);
+    }
+    **aa0s='\0';
+    (*aa0s)++;
+    memcpy(*aa0s,aa0[0],n0);
+    qshuffle(*aa0s,n0,nm0, my_rand_state);
+    /* for SSE2/ALTIVEC, must pad with 16 NULL's */
+    for (id=0; id<SEQ_PAD; id++) {(*aa0s)[n0+id]=0;}
+    init_work (*aa0s, n0, ppst, qf_str);
+  }
+
+  /* always allocate shuffle space */
+  if((*aa1s=calloc(max_tot+1,sizeof(char))) == NULL) {
+    fprintf(stderr,"*** error [%s:%d] - unable to allocate shuffled library sequence [%d]\n", __FILE__, __LINE__, max_tot);
+    exit(1);
+  }
+  else {
+    **aa1s=0;
+    (*aa1s)++;
+  }
+}
+
+/* reset_maxn() - settle the library read-buffer length and the
+   overlap constants that depend on it.
+
+   Because it is used to pre-allocate space, maxn has various
+   constraints.  For "simple" comparisons, it is simply the length of
+   the longest library sequence.  But for translated comparisons, it
+   must be 3 or 6X the length of the query sequence.
+
+   In addition, it can be reduced (m_msp->ldb_info.maxn > 0) to make
+   certain that sequences are read in smaller chunks, and maxn affects
+   how large overlaps must be when sequences are read in chunks.
+
+   Sets m_msp->ldb_info.l_overlap and .maxt3 (and .maxn when it has to
+   be raised to 3*n0), exits if the query is too long, and returns the
+   adjusted maxn.
+*/
+
+int
+reset_maxn(struct mngmsg *m_msp, int over_len, int maxn) {
+  int same_or_nt_query;
+
+  /* reduce maxn if requested */
+  if (m_msp->ldb_info.maxn > 0 && maxn > m_msp->ldb_info.maxn) {
+    maxn = m_msp->ldb_info.maxn;
+  }
+
+  /* !TFAST - either FASTA or FASTX */
+  same_or_nt_query = (m_msp->qdnaseq == m_msp->ldb_info.ldnaseq
+                      || m_msp->qdnaseq == SEQT_DNA
+                      || m_msp->qdnaseq == SEQT_RNA);
+
+  if (same_or_nt_query) {
+    if (m_msp->n0 > m_msp->max_tot - m_msp->ldb_info.dupn) {
+      fprintf(stderr,"*** error [%s:%d] - query sequence is too long %d > %d - %d %s\n",
+              __FILE__, __LINE__,
+              m_msp->n0,
+              m_msp->max_tot, m_msp->ldb_info.dupn,
+              m_msp->sqnam);
+      exit(1);
+    }
+
+    m_msp->ldb_info.l_overlap = over_len;
+    m_msp->ldb_info.maxt3 = maxn - m_msp->ldb_info.l_overlap;
+    return maxn;
+  }
+
+  /* everything below is the TFAST case */
+  if (m_msp->n0 > MAXTST) {
+    fprintf(stderr,"*** error [%s:%d] - query sequence is too long %d %s\n",
+            __FILE__, __LINE__, m_msp->n0,m_msp->sqnam);
+    exit(1);
+  }
+
+  /* n0*3 for the three frames - this will only happen if maxn has
+     been set low manually */
+  if (m_msp->n0*3 > maxn) {
+    if (!(m_msp->n0*4+2 < m_msp->max_tot)) {	/* m_msg0*3 + m_msg0 */
+      fprintf(stderr,"*** error [%s:%d] - query sequence too long for translated search: %d * 4 > %d %s\n",
+              __FILE__, __LINE__, m_msp->n0,maxn, m_msp->sqnam);
+      exit(1);
+    }
+
+    fprintf(stderr,
+            "*** error [%s:%d] - query sequence too long for library segment: %d - resetting to %d\n",
+            __FILE__, __LINE__,
+            maxn,m_msp->n0*3);
+    maxn = m_msp->ldb_info.maxn = m_msp->n0*3;
+  }
+
+  /* set up some constants for overlaps: maxt3 becomes a multiple of
+     three, plus one */
+  m_msp->ldb_info.l_overlap = 3*over_len;
+  m_msp->ldb_info.maxt3 = maxn - m_msp->ldb_info.l_overlap - 3;
+  m_msp->ldb_info.maxt3 -= m_msp->ldb_info.maxt3 % 3;
+  m_msp->ldb_info.maxt3++;
+
+  /* same rounding for maxn itself */
+  maxn -= 3;
+  maxn -= maxn % 3;
+  maxn++;
+
+  return maxn;
+}
+
+
+/* scanseq() - count the residues of seq[0..n-1] that belong to the
+   set named by the characters of str.
+
+   Each character of str is mapped through qascii[] to a residue code;
+   the return value is the number of positions in seq whose code is
+   one of those.  Codes that do not fit the 128-entry lookup table are
+   ignored rather than read or written out of bounds.
+*/
+int
+scanseq(unsigned char *seq, int n, char *str) {
+  int tot, i, code, str_len;
+  char aaray[128];	/* this must be set > nsq */
+
+  memset(aaray, 0, sizeof(aaray));
+
+  /* measure str once, not on every pass through the loop */
+  str_len = (int)strlen(str);
+  for (i=0; i < str_len; i++) {
+    /* index with an unsigned value: a plain char may be negative */
+    code = qascii[(unsigned char)str[i]];
+    if (code >= 0 && code < (int)sizeof(aaray)) aaray[code] = 1;
+  }
+
+  for (i=tot=0; i<n; i++) {
+    if (seq[i] < sizeof(aaray)) tot += aaray[seq[i]];
+  }
+  return tot;
+}
+
+/* subs_env() - copy src to dest, expanding ${NAME} references.
+
+   dest       receives the result; always '\0'-terminated and never
+              longer than dest_size-1 characters (silently truncated)
+   src        input string; it is modified temporarily while scanning
+              but is restored before returning
+   dest_size  size of dest in bytes; nothing is done if it is <= 0
+
+   Every ${NAME} is replaced by getenv("NAME"), or by nothing if NAME
+   is not set.  A '$' that is not followed by '{' is copied unchanged.
+   An unterminated "${" is reported on stderr and the rest of src is
+   copied literally.
+*/
+
+void subs_env(char *dest, char *src, int dest_size) {
+  char *last_src, *bp, *bp1, *env_val;
+
+  if (dest == NULL || dest_size <= 0) return;
+  *dest = '\0';
+  if (src == NULL) return;
+
+  last_src = src;
+  bp = strchr(src,'$');
+
+  while (bp != NULL && strlen(dest) < (size_t)(dest_size-1)) {
+    /* append (not overwrite with) the literal text before this '$',
+       so that text and values from earlier references are kept */
+    *bp = '\0';
+    strncat(dest, last_src, dest_size - strlen(dest) - 1);
+    *bp = '$';
+
+    if (*(bp+1) != '{') {	/* bare '$' - pass it through */
+      strncat(dest, "$", dest_size - strlen(dest) - 1);
+      bp += 1;
+    }
+    else {	/* have ${ENV} - put it in */
+      if ((bp1 = strchr(bp+2,'}'))==NULL) {
+        fprintf(stderr, "*** error [%s:%d] - Unterminated ENV: %s\n",
+                __FILE__, __LINE__, src);
+        /* copy the broken reference literally instead of repeating
+           the text that precedes it */
+        last_src = bp;
+        break;
+      }
+      *bp1 = '\0';
+      env_val = getenv(bp+2);
+      *bp1 = '}';	/* restore src whether or not NAME is set */
+      if (env_val != NULL) {
+        strncat(dest, env_val, dest_size - strlen(dest) - 1);
+      }
+      bp = bp1+1;	/* bump bp even if getenv == NULL */
+    }
+    last_src = bp;
+
+    /* now get the next ${ENV} if present */
+    bp = strchr(last_src,'$');
+  }
+
+  /* now copy the last stuff */
+  strncat(dest, last_src, dest_size - strlen(dest) - 1);
+}
+
+
+/* selectbest() - partial partition of bptr[0..n-1] on rst.score[0],
+   larger scores first, repeated until the partition point meets
+   rank k.
+
+   The inner scans have no explicit bounds check; they rely on the
+   comparisons themselves to stop them.
+*/
+void
+selectbest(struct beststr **bptr, int k, int n)	/* k is rank in array */
+{
+  int pivot, up, down, lo, hi;
+  struct beststr *held;
+
+  lo = 0;
+  hi = n - 1;
+
+  while (lo < hi) {
+    pivot = bptr[hi]->rst.score[0];
+    up = lo - 1;
+    down = hi;
+    for (;;) {
+      do { up++; } while (bptr[up]->rst.score[0] > pivot);
+      do { down--; } while (bptr[down]->rst.score[0] < pivot);
+      held = bptr[up];
+      bptr[up] = bptr[down];
+      bptr[down] = held;
+      if (!(down > up)) break;
+    }
+    /* undo the last exchange and drop the pivot into place */
+    bptr[down] = bptr[up];
+    bptr[up] = bptr[hi];
+    bptr[hi] = held;
+
+    if (up >= k) hi = up - 1;
+    if (up <= k) lo = up + 1;
+  }
+}
+
+/* selectbestz() - same partial partition as selectbest(), but keyed
+   on the zscore field (larger zscore first).
+
+   The inner scans have no explicit bounds check; they rely on the
+   comparisons themselves to stop them.
+*/
+void
+selectbestz(struct beststr **bptr, int k, int n)	/* k is rank in array */
+{
+  int up, down, lo, hi;
+  struct beststr *held;
+  double pivot;
+
+  lo = 0;
+  hi = n - 1;
+
+  while (lo < hi) {
+    pivot = bptr[hi]->zscore;
+    up = lo - 1;
+    down = hi;
+    for (;;) {
+      do { up++; } while (bptr[up]->zscore > pivot);
+      do { down--; } while (bptr[down]->zscore < pivot);
+      held = bptr[up];
+      bptr[up] = bptr[down];
+      bptr[down] = held;
+      if (!(down > up)) break;
+    }
+    /* undo the last exchange and drop the pivot into place */
+    bptr[down] = bptr[up];
+    bptr[up] = bptr[hi];
+    bptr[hi] = held;
+
+    if (up >= k) hi = up - 1;
+    if (up <= k) lo = up + 1;
+  }
+}
+
+/* improved shellsort with high-performance increments */
+/*
+shellsort(itemType a[], int l, int r)
+{ int i, j, k, h; itemType v;
+ int incs[16] = { 1391376, 463792, 198768, 86961, 33936,
+ 13776, 4592, 1968, 861, 336,
+ 112, 48, 21, 7, 3, 1 };
+ for ( k = 0; k < 16; k++)
+ for (h = incs[k], i = l+h; i <= r; i++) {
+ v = a[i]; j = i;
+ while (j > h && a[j-h] > v) {
+ a[j] = a[j-h]; j -= h;
+ }
+ a[j] = v;
+ }
+}
+*/
+
+/* sortbestz() - shell sort of bptr[0..nbest-1] into decreasing zscore
+   order, using a fixed table of increments and moving each element
+   once per insertion instead of exchanging neighbours.
+*/
+void sortbestz(struct beststr **bptr, int nbest)
+{
+  static const int gaps[14] = { 198768, 86961, 33936,
+                                13776, 4592, 1968, 861, 336,
+                                112, 48, 21, 7, 3, 1 };
+  int g, gap, pos, slot;
+  struct beststr *moving;
+  double key;
+
+  for (g = 0; g < 14; g++) {
+    gap = gaps[g];
+    for (pos = gap; pos < nbest; pos++) {
+      moving = bptr[pos];
+      key = moving->zscore;
+      /* slide smaller-scoring entries down by one gap */
+      for (slot = pos; slot >= gap && bptr[slot-gap]->zscore < key; slot -= gap) {
+        bptr[slot] = bptr[slot-gap];
+      }
+      bptr[slot] = moving;
+    }
+  }
+}
+
+
+/* sortbesti() - shell sort of bptr[0..nbest-1] on seq->index, largest
+   index first.  The key is compared as a double. */
+void sortbesti(struct beststr **bptr, int nbest)
+{
+  static const int gaps[12] = { 33936, 13776, 4592, 1968, 861, 336,
+                                112, 48, 21, 7, 3, 1 };
+  int g, gap, pos, slot;
+  struct beststr *moving;
+  double key;
+
+  for (g = 0; g < 12; g++) {
+    gap = gaps[g];
+    for (pos = gap; pos < nbest; pos++) {
+      moving = bptr[pos];
+      key = moving->seq->index;
+      /* slide entries with a smaller index down by one gap */
+      for (slot = pos; slot >= gap && bptr[slot-gap]->seq->index < key; slot -= gap) {
+        bptr[slot] = bptr[slot-gap];
+      }
+      bptr[slot] = moving;
+    }
+  }
+}
+
+/* sortbeste() - shell sort of bptr[0..nbest-1] into increasing
+   rst.escore order (lower is better).  The leading run of entries
+   with escore <= 2.0*DBL_MIN is then re-sorted by zscore with
+   sortbestz().
+*/
+void
+sortbeste(struct beststr **bptr, int nbest)
+{
+  static const int gaps[14] = { 198768, 86961, 33936,
+                                13776, 4592, 1968, 861, 336,
+                                112, 48, 21, 7, 3, 1 };
+  int g, gap, pos, slot, n_zero;
+  struct beststr *moving;
+  double key;
+
+  for (g = 0; g < 14; g++) {
+    gap = gaps[g];
+    for (pos = gap; pos < nbest; pos++) {
+      moving = bptr[pos];
+      key = moving->rst.escore;
+      /* slide entries with a larger E() down by one gap */
+      for (slot = pos; slot >= gap && bptr[slot-gap]->rst.escore > key; slot -= gap) {
+        bptr[slot] = bptr[slot-gap];
+      }
+      bptr[slot] = moving;
+    }
+  }
+
+  /* sometimes there are many high scores with E()==0.0, sort
+     those by z() score */
+  for (n_zero = 0;
+       n_zero < nbest && bptr[n_zero]->rst.escore <= 2.0*DBL_MIN;
+       n_zero++) {
+    /* just counting */
+  }
+  if (n_zero > 1) sortbestz(bptr, n_zero);
+}
+
+extern char *prog_func;
+extern char *verstr, *iprompt0, *refstr, *mp_verstr;
+extern long tstart, tscan, tprev, tdone; /* Timing */
+#ifdef COMP_MLIB
+extern long ttscan, ttdisp;
+#endif
+extern time_t tdstart, tddone;
+
+/* ****************************************************************
+ print command line arguments (argv_line)
+ possibly HTML header
+ !BLAST
+ please cite
+ version
+ BLAST
+ Reference version
+**************************************************************** */
+/* print_header1() - write the run-level banner to fd, once per run.
+
+   In order: the command line as a '#' comment (only if PGM_DOC is
+   defined and neither MX_M8OUT nor MX_MBLAST2 is set), the "lav"
+   header for MX_M11OUT, the opening HTML for MX_HTML, the program /
+   version / citation lines for standard output, and the BLAST-style
+   program line for MX_MBLAST2.  ppst is not used.
+*/
+void
+print_header1(FILE *fd, const char *argv_line,
+              const struct mngmsg *m_msp, const struct pstruct *ppst) {
+  int have_ref;
+
+  /* a citation is shown only when refstr is a non-empty string */
+  have_ref = (refstr != NULL && refstr[0] != '\0');
+
+#ifdef PGM_DOC
+  if ((m_msp->markx & (MX_M8OUT+MX_MBLAST2)) == 0) {
+    fprintf(fd, "#%s\n",argv_line);
+  }
+#endif
+
+  if (m_msp->markx & MX_M11OUT) {
+    fprintf(fd, "#:lav\n\nd {\n \"%s\"\n}\n",argv_line+1);
+  }
+
+  if (m_msp->markx & MX_HTML) {
+#ifdef HTML_HEAD
+    fprintf(fd,"<html>\n<head>\n<title>%s Results</title>\n</head>\n<body>\n",prog_func);
+#endif
+    fprintf(fd,"<pre>\n");
+  }
+
+  if (m_msp->std_output) {
+    fprintf(fd,"%s\n",iprompt0);
+    if (!have_ref) {
+      fprintf(fd," version %s%s\n",verstr,mp_verstr);
+    }
+    else {
+      fprintf(fd," version %s%s\nPlease cite:\n %s\n",verstr,mp_verstr,refstr);
+    }
+  }
+
+  if (m_msp->markx & MX_MBLAST2) {
+    if (!have_ref) {
+      fprintf(fd,"%s %s%s\n\n", prog_func, verstr, mp_verstr);
+    }
+    else {
+      fprintf(fd,"%s %s%s\n\nReference: %s\n\n", prog_func, verstr, mp_verstr, refstr);
+    }
+  }
+
+  fflush(fd);
+}
+
+/* ****************************************************************
+ MX_HTML: <pre>
+ Query:
+ 1>>>accession description # aa
+ Annotation:
+ Library:
+**************************************************************** */
+/* print_header2() - write the per-query header to fd.
+
+   For standard output: the "Query:" file line (first query only), the
+   numbered ">>>" title line with a strand note, an "Annotation:" line
+   listing annotated query positions, and the "Library:" line with its
+   size.  Otherwise, when both MX_M8OUT and MX_M8COMMENT are set, the
+   '#' comment lines.  Closes the <pre> block for MX_HTML and flushes
+   fd.  info_qlabel is not used.
+*/
+void
+print_header2(FILE *fd, int qlib, char *info_qlabel, unsigned char **aa0,
+              const struct mngmsg *m_msp, const struct pstruct *ppst,
+              const char * info_lib_range_p) {
+  int pos;
+  char tmp_str[MAX_STR];
+  double total_res;
+
+  /* if (m_msp->markx & MX_HTML) fputs("<pre>\n",fd); */
+
+  if (!m_msp->std_output) {
+    if ((m_msp->markx & (MX_M8OUT + MX_M8COMMENT)) == (MX_M8OUT+MX_M8COMMENT)) {
+      fprintf(fd,"# %s %s%s\n",prog_func,verstr,mp_verstr);
+      fprintf(fd,"# Query: %s\n",m_msp->qtitle);
+      fprintf(fd,"# Database: %s\n",m_msp->ltitle);
+    }
+  }
+  else {
+    if (qlib==1) {
+      fprintf(fd,"Query: %s\n", m_msp->tname);
+    }
+
+    /* strand note: only a single-frame DNA/RNA query gets one */
+    tmp_str[0]='\0';
+    if ((m_msp->qdnaseq == SEQT_DNA || m_msp->qdnaseq == SEQT_RNA)
+        && m_msp->qframe==1) {
+      strncpy(tmp_str," (forward-only)",sizeof(tmp_str));
+      tmp_str[sizeof(tmp_str)-1]='\0';
+    }
+
+    fprintf(fd,"%3d>>>%s%s\n", qlib,
+            m_msp->qtitle,
+            (m_msp->revcomp ? " (reverse complement)" : tmp_str));
+
+    /* check for annotation */
+    if (m_msp->ann_flg && m_msp->aa0a != NULL) {
+      fprintf(fd,"Annotation: ");
+      for (pos=0; pos<m_msp->n0; pos++) {
+        if (!m_msp->aa0a[pos]) continue;
+        if (m_msp->ann_arr[m_msp->aa0a[pos]] == ' ') continue;
+        fprintf(fd,"|%ld:%c%c",
+                pos+m_msp->q_off,m_msp->ann_arr[m_msp->aa0a[pos]],ppst->sq[aa0[0][pos]]);
+      }
+      fprintf(fd,"\n");
+    }
+
+    fprintf(fd,"Library: %s%s\n", m_msp->ltitle,info_lib_range_p);
+
+    if (m_msp->db.carry!=0) {
+      /* residue count overflowed a long: rebuild it as a double */
+      total_res = (double)m_msp->db.carry*(double)LONG_MAX + (double)m_msp->db.length;
+      fprintf(fd, " %.0f residues in %5ld library sequences\n", total_res, m_msp->db.entries);
+    }
+    else {
+      fprintf(fd, " %7ld residues in %5ld sequences\n", m_msp->db.length, m_msp->db.entries);
+    }
+  }
+
+  if (m_msp->markx & MX_HTML) fputs("</pre>\n",fd);
+  fflush(fd);
+}
+
+/* **************************************************************** */
+/* before showbest */
+/* **************************************************************** */
+/* print_header3() - BLAST-style (MX_MBLAST2) header written before
+   the list of best scores: the database summary for the first query
+   only, then the query title and length.  Does nothing for other
+   output formats.  ppst is not used.
+*/
+void print_header3(FILE *fd, int qlib, struct mngmsg *m_msp, struct pstruct *ppst) {
+
+  if ((m_msp->markx & MX_MBLAST2) == 0) return;
+
+  if (qlib == 1) {
+    fprintf(fd, "\nDatabase: %s\n %12ld sequences; %ld total letters\n\n\n",
+            m_msp->ltitle, m_msp->db.entries, m_msp->db.length);
+  }
+
+  fprintf(fd, "\nQuery= %s\nLength=%d\n", m_msp->qtitle, m_msp->n0);
+}
+
+
+/* **************************************************************** */
+/* alignment transition */
+/* **************************************************************** */
+/* print_header4() - write the ">>>query, length type vs library"
+   line that introduces the alignments.
+
+   With MX_M10FORM the line is followed by the "; pg_*" records and
+   the strings info_gstring3, info_hstring_p[0] and info_hstring_p[1].
+   Without it the line is written only for standard output when one
+   of MX_AMAP, MX_HTML or MX_M9SUMM is set.  ppst is not used.
+*/
+void print_header4(FILE *fd, char *info_qlabel, char *argv_line, char *info_gstring3, char *info_hstring_p[2],
+                   struct mngmsg *m_msp, struct pstruct *ppst) {
+
+  if (!(m_msp->markx & MX_M10FORM)) {
+    if (m_msp->std_output && (m_msp->markx & (MX_AMAP+ MX_HTML + MX_M9SUMM))) {
+      fprintf(fd,"\n>>>%s%s, %d %s vs %s library\n",
+              info_qlabel,(m_msp->revcomp ? "_rev":"\0"), m_msp->n0,
+              m_msp->sqnam,m_msp->lname);
+    }
+    return;
+  }
+
+  /* MX_M10FORM */
+  fprintf(fd,"\n>>>%s%s, %d %s vs %s library\n",
+          info_qlabel,(m_msp->revcomp ? "-":"\0"), m_msp->n0, m_msp->sqnam,
+          m_msp->lname);
+  fprintf(fd,"; pg_name: %s\n",m_msp->pgm_name);
+  fprintf(fd,"; pg_ver: %s%s\n",verstr,mp_verstr);
+  fprintf(fd,"; pg_argv: %s",argv_line);
+  fputs(info_gstring3,fd);
+  fputs(info_hstring_p[0],fd);
+  fputs(info_hstring_p[1],fd);
+}
+
+/* print_header4a() - write the ">>><<<" marker line to outfd.  It is
+   written only for MX_M10FORM or MX_M9SUMM output, and never when
+   MX_M8OUT is set or show_code is SHOW_CODE_ID.
+*/
+void print_header4a(FILE *outfd, struct mngmsg *m_msp) {
+  if (m_msp->markx & MX_M8OUT) return;
+  if (!(m_msp->markx & (MX_M10FORM+MX_M9SUMM))) return;
+  if (m_msp->show_code == SHOW_CODE_ID) return;
+
+  fprintf(outfd,">>><<<\n");
+}
+
+/* print_header5() - write the end-of-run trailer to fd.
+
+   Depending on m_msp->markx this is: the BLAST-style statistics
+   block, the "processed N queries" comment, the ">>>///" terminator,
+   the print_sum() summary (standard output only, wrapped in <pre> for
+   HTML), the closing HTML tags (if HTML_HEAD is defined), and the
+   BLAST-style database / matrix / gap-penalty footer.
+*/
+void print_header5(FILE *fd, int qlib, struct db_str *qtt,
+                   struct mngmsg *m_msp, struct pstruct *ppst,
+                   int in_mem, long tot_memK) {
+  int is_html, is_blast2;
+
+  is_html = ((m_msp->markx & MX_HTML) != 0);
+  is_blast2 = ((m_msp->markx & MX_MBLAST2) != 0);
+
+  /* for MX_MBLAST2, show some statistics results */
+  if (is_blast2) {
+    fprintf(fd,"\n\nLambda K H\n");
+    fprintf(fd," %6.3f %6.3f %6.3f\n\n",ppst->pLambda,ppst->pK,ppst->pH);
+    fprintf(fd,"\nGapped\nLambda\n");
+    fprintf(fd," %6.3f %6.3f %6.3f\n",ppst->pLambda,ppst->pK,ppst->pH);
+    fprintf(fd,"\nEffective search space used: %ld\n\n",m_msp->db.entries);
+  }
+
+  if (m_msp->markx & MX_M8COMMENT) {
+    fprintf(fd, "# %s processed %d queries\n",prog_func,qlib);
+  }
+
+  if (!(m_msp->markx & MX_M8OUT) && !is_html
+      && (m_msp->markx & (MX_M10FORM+MX_M9SUMM))) {
+    fprintf(fd,">>>///\n");
+  }
+
+  if (is_html) fputs("<pre>",fd);
+  if (m_msp->std_output) {
+    print_sum(fd, qtt, &m_msp->db, in_mem, tot_memK);
+  }
+  if (is_html) fputs("</pre>\n",fd);
+#ifdef HTML_HEAD
+  if (is_html) fprintf(fd,"</body>\n</html>\n");
+#endif
+
+  if (is_blast2) {
+    fprintf(fd,"\n Database: %s\n",m_msp->ltitle);
+    fprintf(fd," Number of letters in database: %ld\n",m_msp->db.length);
+    fprintf(fd," Number of sequences in database: %ld\n",m_msp->db.entries);
+    fprintf(fd,"\n\n\nMatrix: %s\n",ppst->pam_name);
+    fprintf(fd,"Gap Penalties: Existence: %d, Extension: %d\n",ppst->gdelval, ppst->ggapval);
+  }
+}
+
+/* print_annot_header() - list the annotation symbols that have a
+   definition, one "symbol : definition" line each, wrapped in <pre>
+   for HTML output.  Nothing is written unless ann_arr_def[1] is set.
+*/
+void
+print_annot_header(FILE *fd, struct mngmsg *m_msp) {
+  int sym;
+  int is_html;
+
+  if (!m_msp->ann_arr_def[1]) return;
+
+  is_html = ((m_msp->markx & MX_HTML) != 0);
+
+  if (is_html) {fprintf(fd,"<pre>");}
+  fprintf(fd, "Annotation symbols:\n");
+
+  /* ann_arr[] is walked from entry 1 up to its terminating 0 */
+  sym = 1;
+  while (m_msp->ann_arr[sym]) {
+    if (m_msp->ann_arr_def[sym]) {
+      fprintf(fd, " %c : %s\n",m_msp->ann_arr[sym], m_msp->ann_arr_def[sym]);
+    }
+    sym++;
+  }
+
+  if (is_html) {fputs("</pre><hr />\n",fd);}
+}
+
+extern int fa_max_workers;
+
+/* print_sum() - write the end-of-run summary to fd.
+
+   qtt       totals for the query sequences (length, entries)
+   ntt       totals for the library; when ntt->carry is non-zero the
+             residue count is rebuilt as carry*LONG_MAX + length
+   in_mem    non-zero if the "in memory" note should be shown
+   tot_memK  memory figure for that note, shown as tot_memK >> 20
+             with a "G" suffix
+
+   Also reports the start/done times (globals tdstart, tddone), the
+   scan and display times, and the function and version used.
+*/
+void
+print_sum(FILE *fd, struct db_str *qtt, struct db_str *ntt, int in_mem, long tot_memK)
+{
+  double db_tt;
+  char tstr1[26], tstr2[26];
+  char memstr[256];
+  const char *ct;
+
+  /* ctime() returns NULL for a time it cannot represent; show an
+     empty time then rather than handing NULL to strncpy() */
+  ct = ctime(&tdstart);
+  strncpy(tstr1, (ct != NULL) ? ct : "", sizeof(tstr1));
+  ct = ctime(&tddone);
+  strncpy(tstr2, (ct != NULL) ? ct : "", sizeof(tstr2));
+  /* cut each string at position 24, where ctime() puts its newline */
+  tstr1[24]=tstr2[24]='\0';
+
+  /* Print timing to output file as well */
+
+  fprintf(fd, "\n%ld residues in %ld query sequences\n", qtt->length, qtt->entries);
+  if (ntt->carry == 0)
+    fprintf(fd, "%ld residues in %ld library sequences\n", ntt->length, ntt->entries);
+  else {
+    db_tt = (double)ntt->carry*(double)LONG_MAX + (double)ntt->length;
+    fprintf(fd, "%.0f residues in %ld library sequences\n", db_tt, ntt->entries);
+  }
+
+  /* memstr is only used by the COMP_THR and PCOMPLIB variants below */
+  memstr[0]='\0';
+  if (tot_memK && in_mem != 0) {
+    sprintf(memstr," in memory [%ldG]",(tot_memK >> 20));
+  }
+
+#if !defined(COMP_THR) && !defined(PCOMPLIB)
+  fprintf(fd," Scomplib [%s%s]\n start: %s done: %s\n",verstr,mp_verstr,tstr1,tstr2);
+#endif
+#if defined(COMP_THR)
+  fprintf(fd," Tcomplib [%s%s] (%d proc%s)\n start: %s done: %s\n", verstr, mp_verstr,
+          fa_max_workers, memstr, tstr1,tstr2);
+#endif
+#if defined(PCOMPLIB)
+  fprintf(fd," Pcomplib [%s%s] (%d proc%s)\n start: %s done: %s\n", verstr, mp_verstr,
+          fa_max_workers, memstr, tstr1,tstr2);
+#endif
+#ifndef COMP_MLIB
+  fprintf(fd," Scan time: ");
+  ptime(fd, tscan - tprev);
+  fprintf (fd," Display time: ");
+  ptime (fd, tdone - tscan);
+#else
+  fprintf(fd," Total Scan time: ");
+  ptime(fd, ttscan);
+  fprintf (fd," Total Display time: ");
+  ptime (fd, ttdisp);
+#endif
+  fprintf (fd,"\n");
+  fprintf (fd, "\nFunction used was %s [%s%s]\n", prog_func,verstr,mp_verstr);
+}
+
+extern double zs_to_Ec(double zs, long entries);
+extern double zs_to_bit(double zs, int n0, int n1);
+extern double zs_to_p(double zs);
+
+#include "aln_structs.h"
+
+/* prhist() - print the score histogram and the statistics summary.
+
+   Writes to fd, in order: an optional text histogram of hist.hist_a[]
+   with observed ('=') and expected ('*') counts, the library size,
+   the statistics description, the Kolmogorov-Smirnov statistic, the
+   algorithm and parameter strings (info_gstring2[0], [1]) and the
+   scan time.  When MX_M10FORM is set, the same information is also
+   formatted into info_hstring[0] and info_hstring[1].
+
+   If ppst->zsflag_f < 0 only the library size, algorithm and
+   parameters are printed.  The histogram is drawn only when
+   nstats > 20, m_msp->nohist is not set and hist.maxh > 1.
+*/
+void
+prhist(FILE *fd, const struct mngmsg *m_msp,
+ struct pstruct *ppst,
+ struct hist_str hist,
+ int nstats, int sstats,
+ struct db_str ntt,
+ char *stat_info2,
+ char *lib_range,
+ char **info_gstring2,
+ char **info_hstring,
+ long tscan)
+{
+ int i,j,hl,hll, el, ell, ev;
+ char hline[80], pch, *bp;
+ int mh1, mht;
+ int maxval, maxvalt, dotsiz, ddotsiz,doinset;
+ double cur_e, prev_e, f_int;
+ double max_dev, x_tmp;
+ double db_tt;
+ int n_chi_sq, cum_hl=0, max_i=0, max_dev_i;
+ double zs10_off;
+
+
+ if (m_msp->markx & MX_HTML) fputs("<pre>\n",fd);
+ else {fprintf(fd,"\n");}
+
+ /* no statistics: report library size, algorithm and parameters only */
+ if (ppst->zsflag_f < 0) {
+ if (!m_msp->nohist) {
+ fprintf(fd, " %7ld residues in %5ld sequences", ntt.length,ntt.entries);
+ fprintf(fd, "%s\n",lib_range);
+ }
+ fprintf(fd,"Algorithm: %s\nParameters: %s\n",info_gstring2[0],info_gstring2[1]);
+ return;
+ }
+
+ if (nstats > 20) {
+ zs10_off = ppst->zs_off * 10.0;
+
+ max_dev = 0.0;
+ mh1 = hist.maxh-1; /* max value for histogram */
+ mht = (3*hist.maxh-3)/4 - 1; /* x-coordinate for expansion */
+ n_chi_sq = 0;
+
+ if (!m_msp->nohist && mh1 > 0) {
+ /* maxval: tallest bin overall; maxvalt: tallest bin from mht on */
+ for (i=0,maxval=0,maxvalt=0; i<hist.maxh; i++) {
+ if (hist.hist_a[i] > maxval) maxval = hist.hist_a[i];
+ if (i >= mht && hist.hist_a[i]>maxvalt) maxvalt = hist.hist_a[i];
+ }
+ cum_hl = -hist.hist_a[0];
+ /* sequences per '=': main plot scaled to 60 columns, inset to 50 */
+ dotsiz = (maxval-1)/60+1;
+ ddotsiz = (maxvalt-1)/50+1;
+ /* draw the inset only if it has a finer scale than the main plot */
+ doinset = (ddotsiz < dotsiz && dotsiz > 2);
+
+ if (ppst->zsflag_f>=0)
+ fprintf(fd," opt E()\n");
+ else
+ fprintf(fd," opt\n");
+
+ prev_e = zs_to_Ec((double)(hist.min_hist-hist.histint/2)-zs10_off,hist.entries);
+ /* one output line per histogram bin */
+ for (i=0; i<=mh1; i++) {
+ /* the first and last bins are marked '<' and '>' */
+ pch = (i==mh1) ? '>' : ' ';
+ pch = (i==0) ? '<' : pch;
+ hll = hl = hist.hist_a[i];
+ if (ppst->zsflag_f>=0) {
+ cum_hl += hl;
+ f_int = (double)(i*hist.histint+hist.min_hist)+(double)hist.histint/2.0;
+ cur_e = zs_to_Ec(f_int-zs10_off,hist.entries);
+ /* expected count for this bin: difference of successive zs_to_Ec() values, rounded */
+ ev = el = ell = (int)(cur_e - prev_e + 0.5);
+ /* track the largest |cumulative observed - expected| for the K-S statistic */
+ if (hl > 0 && i > 5 && i < (90-hist.min_hist)/hist.histint) {
+ x_tmp = fabs(cum_hl - cur_e);
+ if ( x_tmp > max_dev) {
+ max_dev = x_tmp;
+ max_i = i;
+ }
+ n_chi_sq++;
+ }
+ /* scale the expected count to plot columns (60 main, 40 inset) */
+ if ((el=(el+dotsiz-1)/dotsiz) > 60) el = 60;
+ if ((ell=(ell+ddotsiz-1)/ddotsiz) > 40) ell = 40;
+ fprintf(fd,"%c%3d %5d %5d:",
+ pch,(i<mh1)?(i)*hist.histint+hist.min_hist :
+ mh1*hist.histint+hist.min_hist,hl,ev);
+ }
+ else fprintf(fd,"%c%3d %5d :",
+ pch,(i<mh1)?(i)*hist.histint+hist.min_hist :
+ mh1*hist.histint+hist.min_hist,hl);
+
+ /* scale the observed count the same way and draw it with '=' */
+ if ((hl=(hl+dotsiz-1)/dotsiz) > 60) hl = 60;
+ if ((hll=(hll+ddotsiz-1)/ddotsiz) > 40) hll = 40;
+ for (j=0; j<hl; j++) hline[j]='=';
+ /* mark the expected count with '*', padding with blanks if it lies beyond the bar */
+ if (ppst->zsflag_f>=0) {
+ if (el <= hl ) {
+ if (el > 0) hline[el-1]='*';
+ hline[hl]='\0';
+ }
+ else {
+ for (j = hl; j < el; j++) hline[j]=' ';
+ hline[el-1]='*';
+ hline[hl=el]='\0';
+ }
+ }
+ else hline[hl] = 0;
+ /* the second line carries the legend for the main plot */
+ if (i==1) {
+ for (j=hl; j<10; j++) hline[j]=' ';
+ sprintf(&hline[10]," one = represents %d library sequences",dotsiz);
+ }
+ /* legend for the inset, two lines above where it starts */
+ if (doinset && i == mht-2) {
+ for (j = hl; j < 10; j++) hline[j]=' ';
+ sprintf(&hline[10]," inset = represents %d library sequences",ddotsiz);
+ }
+ /* inset: a second bar drawn after a ':' in column 10 */
+ if (i >= mht&& doinset ) {
+ for (j = hl; j < 10; j++) hline[j]=' ';
+ hline[10]=':';
+ for (j = 11; j<11+hll; j++) hline[j]='=';
+ hline[11+hll]='\0';
+ if (ppst->zsflag_f>=0) {
+ if (ell <= hll) hline[10+ell]='*';
+ else {
+ for (j = 11+hll; j < 10+ell; j++) hline[j]=' ';
+ hline[10+ell] = '*';
+ hline[11+ell] = '\0';
+ }
+ }
+ }
+
+ fprintf(fd,"%s\n",hline);
+ prev_e = cur_e;
+ }
+ }
+ /* score at which the largest deviation was seen */
+ max_dev_i = max_i*hist.histint+hist.min_hist;
+ }
+ else {
+ /* too few scores for a histogram */
+ max_dev = 0.0;
+ n_chi_sq = 0;
+ max_i = 0;
+ max_dev_i = 0;
+ }
+
+ if (ppst->zsflag_f >=0 ) {
+ if (!m_msp->nohist) {
+ if (ntt.carry==0) {
+ fprintf(fd, " %7ld residues in %5ld sequences", ntt.length, ntt.entries);
+ }
+ else {
+ /* rebuild the residue total as carry*LONG_MAX + length */
+ db_tt = (double)ntt.carry*(double)LONG_MAX + (double)ntt.length;
+ fprintf(fd, " %.0f residues in %5ld library sequences", db_tt, ntt.entries);
+ }
+ fprintf(fd, "%s\n",lib_range);
+ }
+ fprintf(fd,"Statistics: %s\n",hist.stat_info);
+ if (stat_info2) {
+ fprintf(fd," Statistics E2: %s\n",stat_info2);
+ }
+
+#ifdef SAMP_STATS
+ fprintf(fd," statistics sampled from %ld (%d) to %ld sequences\n",
+ (hist.entries > nstats ? nstats : hist.entries),sstats, hist.entries);
+#else
+ fprintf(fd," statistics extrapolated from %ld to %ld sequences\n",
+ (hist.entries > nstats ? nstats : hist.entries),hist.entries);
+#endif
+
+ if (!m_msp->nohist && cum_hl > 0) {
+ fprintf(fd," Kolmogorov-Smirnov statistic: %6.4f (N=%d) at %3d\n",
+ max_dev/(float)cum_hl, n_chi_sq,max_dev_i);
+ }
+ if (m_msp->markx & MX_M10FORM) {
+ /* replace the newlines in hist.stat_info with blanks */
+ while ((bp=strchr(hist.stat_info,'\n'))!=NULL) *bp=' ';
+ /* avoid dividing by zero (or a negative count) below */
+ if (cum_hl <= 0) cum_hl = -1;
+ sprintf(info_hstring[0],"; mp_extrap: %d %ld\n; mp_stats: %s\n; mp_KS: %6.4f (N=%d) at %3d\n",
+ MAX_STATS,hist.entries,hist.stat_info,max_dev/(float)cum_hl,
+ n_chi_sq,max_dev_i);
+ }
+ }
+
+ if (m_msp->markx & MX_M10FORM) {
+ /* the first newline in the parameter string is blanked while formatting, then put back */
+ if ((bp = strchr(info_gstring2[1],'\n'))!=NULL) *bp = ' ';
+ sprintf(info_hstring[1],"; mp_Algorithm: %s\n; mp_Parameters: %s\n",info_gstring2[0],info_gstring2[1]);
+ if (bp != NULL ) *bp = '\n';
+ }
+
+ if (ppst->other_info != NULL) {
+ fputs(ppst->other_info, fd);
+ }
+
+ fprintf(fd,"Algorithm: %s\nParameters: %s\n",info_gstring2[0],info_gstring2[1]);
+
+ fprintf (fd," Scan time: ");
+ ptime(fd,tscan);
+ fprintf(fd,"\n");
+ /* close the HTML block here unless a library annotation script is configured */
+ if (!m_msp->annot1_sname[0] && m_msp->markx & MX_HTML) {
+ fputs("</pre>\n<hr />\n",fd);
+ }
+
+ fflush(fd);
+}
+
+extern char prog_name[], *verstr;
+
+#ifdef PCOMPLIB
+#include "mpi.h"
+#endif
+
+void s_abort (char *p, char *p1)
+{
+ int i;
+
+ fprintf (stderr, "\n***[%s] %s%s***\n", prog_name, p, p1);
+#ifdef PCOMPLIB
+ MPI_Abort(MPI_COMM_WORLD,1);
+ MPI_Finalize();
+#endif
+ exit (1);
+}
+
+void w_abort (char *p, char *p1)
+{
+ fprintf (stderr, "\n***[%s] %s%s***\n\n", prog_name, p, p1);
+ exit (1);
+}
+
+extern struct a_res_str *
+build_ares_code(unsigned char *aa0, int n0,
+ unsigned char *aa1, struct seq_record *seq,
+ int frame, int *have_ares, int repeat_thresh,
+ const struct mngmsg *m_msp, struct pstruct *ppst,
+ void *f_str
+ );
+
+extern struct lmf_str *
+re_openlib(struct lmf_str *, int outtty);
+
+#define MAX_BLINE 2048
+#define RANLIB (m_fptr->ranlib)
+
+extern int
+re_getlib(unsigned char *, struct annot_str **,
+ int, int, int, int, int, long *, long *,
+ struct lmf_str *m_fptr);
+
+/*
+ pre_load_best loads a set of sequences using re_getlib
+
+ it should be used for getting sequences for shuffling, and for showbest() if m_msg->quiet
+
+ it both opens the m_file_p buffer, gets the bline[] descriptions,
+ and reads the actual sequences. In reading the sequences, it
+ should first allocate one large buffer so that individual buffers do not need to be freed.
+*/
+
+/* pre_load_best() - fetch descriptions and sequences for the nbest
+   entries of bbp_arr[].
+
+   aa1save  scratch buffer that re_getlib() reads each sequence into
+   maxn     passed through to re_getlib()
+   bbp_arr  the best-score entries; each one's mseq->bline is pointed
+            into a shared description buffer, and seq->aa1b (if it was
+            NULL) into a shared sequence buffer
+   nbest    number of entries in bbp_arr
+   m_msp    run state; aa1save_buf_b and bline_buf_b receive the two
+            shared buffers, and pre_load_done is set on completion
+   debug    passed through to get_annot_list()
+
+   Returns immediately if m_msp->pre_load_done is already set.  Exits
+   if a buffer cannot be allocated or a library cannot be re-opened.
+*/
+void
+pre_load_best(unsigned char *aa1save, int maxn,
+ struct beststr **bbp_arr, int nbest,
+ struct mngmsg *m_msp, int debug)
+{
+ int i, n1, bl_len, tmp_bline_len, l_llen;
+ int seq_buf_len;
+ char bline[MAX_BLINE];
+ unsigned char *seq_buf_p;
+ char *bline_buf_p;
+
+ struct beststr *bbp;
+ struct lmf_str *m_fptr;
+
+ /*
+ calculate how much room we need for sequences and blines
+ */
+
+ if (m_msp->pre_load_done) return;
+
+ /* start at 1 to leave room for the leading '\0' byte */
+ seq_buf_len = 1;
+ for (i=0; i<nbest; i++) {
+ /* we are not (currently) allocating more than n1+1, because alignment is not vectorized,
+ if it were vectorized, we would need n+16
+ */
+#ifdef DEBUG
+ if (bbp_arr[i]->n1 != bbp_arr[i]->seq->n1) {
+ fprintf(stderr,"*** error [%s:%d] - n1 (%d) != seq->n1 (%d)\n",
+ __FILE__, __LINE__, bbp_arr[i]->n1, bbp_arr[i]->seq->n1);
+ }
+#endif
+
+ /* only sequences that are not already in memory need space */
+ if (bbp_arr[i]->seq->aa1b == NULL) {
+ seq_buf_len += bbp_arr[i]->seq->n1 + 1;
+ }
+ }
+
+ /* have required sequence space (seq_buf_len), allocate it */
+
+ if ((m_msp->aa1save_buf_b=(unsigned char *)calloc(seq_buf_len, sizeof(char)))==NULL) {
+ fprintf(stderr, "*** error [%s:%d] - cannot allocate space[%d] for sequence encoding\n",
+ __FILE__, __LINE__, seq_buf_len);
+ exit(1);
+ }
+ else {
+ seq_buf_p = m_msp->aa1save_buf_b+1; /* ensure there is an initial '\0' */
+ }
+
+ /* adjust description line length */
+ l_llen = m_msp->aln.llen;
+ if ((m_msp->markx & MX_M9SUMM) && m_msp->show_code != SHOW_CODE_ID) {
+ l_llen += 40;
+ if (l_llen > 200) l_llen=200;
+ }
+
+ /* full-length descriptions for MX_M10FORM or long_info; otherwise limited to l_llen-5 */
+ tmp_bline_len = sizeof(bline)-1;
+ if (!(m_msp->markx & MX_M10FORM) && !m_msp->long_info) {tmp_bline_len = l_llen-5;}
+
+ /* allocate more bline than we need for simplicity */
+ if ((bline_buf_p=m_msp->bline_buf_b=(char *)calloc(nbest*tmp_bline_len, sizeof(char)))==NULL) {
+ fprintf(stderr, "*** error [%s:%d] - cannot allocate space[%d] for bline descriptions\n",
+ __FILE__, __LINE__, nbest*tmp_bline_len);
+ exit(1);
+ }
+
+ for (i=0; i<nbest; i++) {
+ bbp = bbp_arr[i];
+
+ if ((m_fptr=re_openlib(bbp->mseq->m_file_p,!m_msp->quiet))==NULL) {
+ fprintf(stderr,"*** error [%s:%d] - cannot re-open %s\n",
+ __FILE__, __LINE__, bbp->mseq->m_file_p->lb_name);
+ exit(1);
+ }
+ /* read the description line for this entry into bline[] */
+ RANLIB(bline,tmp_bline_len,bbp->mseq->lseek,bbp->mseq->libstr,m_fptr);
+ bl_len = strlen(bline);
+ bbp->mseq->bline = bline_buf_p;
+ bbp->mseq->bline_max = m_msp->aln.llen;
+ /* no '\0' is copied; the calloc()ed buffer supplies the terminator */
+ strncpy(bbp->mseq->bline, bline, bl_len);
+ bline_buf_p += bl_len+1;
+
+ /* make sure we get annotation if present, and sequence if necessary */
+ if (bbp->seq->aa1b==NULL || (m_msp->ann_flg==1 && bbp->seq->annot_p==NULL)) {
+ n1 = re_getlib(aa1save, (m_msp->ann_flg==1) ? &(bbp->seq->annot_p) : NULL,
+ maxn,m_msp->ldb_info.maxt3, m_msp->ldb_info.l_overlap,
+ bbp->mseq->cont,m_msp->ldb_info.term_code,
+ &bbp->seq->l_offset,&bbp->seq->l_off,bbp->mseq->m_file_p);
+ /* a length mismatch is reported but processing continues */
+ if (n1 != bbp->seq->n1) {
+ fprintf(stderr,"*** error [%s:%d] - n1[%d/%d] != n1[%d] from re_getlib() at %s [maxn:%d/maxt3:%d]\n",
+ __FILE__, __LINE__,
+ bbp->n1, bbp->seq->n1, n1, bbp->mseq->libstr, maxn, m_msp->ldb_info.maxt3);
+ }
+
+#ifdef DEBUG
+ if (adler32(1L,aa1save,n1)!=bbp->adler32_crc) {
+ fprintf(stderr,"*** error [%s:%d] - adler32_crc from re_getlib() at %d(%d): %s\n",
+ __FILE__, __LINE__,
+ bbp->mseq->index,bbp->n1, bline);
+ }
+#endif
+
+ /* if we don't have the sequence in the aa1b buffer, copy it from re_getlib */
+ if (bbp->seq->aa1b == NULL) {
+ bbp->seq->aa1b = seq_buf_p;
+ memcpy(bbp->seq->aa1b, aa1save, bbp->seq->n1+1);
+ seq_buf_p += bbp->seq->n1+1;
+ }
+ }
+ }
+
+ /* here, we are getting query annots after all the bptr[]s have been processed */
+ /* moved to comp_lib9.c */
+ /*
+ if (m_msp->annot0_sname[0]) {
+ if (get_annot(m_msp->annot0_sname, m_msp, m_msp->qtitle, m_msp->q_offset+m_msp->q_off-1,m_msp->n0, &m_msp->annot_p, 0, debug) < 0) {
+ fprintf(stderr,"*** error [%s:%d] - %s did not produce annotations\n",__FILE__, __LINE__, m_msp->annot0_sname);
+ m_msp->annot0_sname[0] = '\0';
+ }
+ if (m_msp->annot_p && m_msp->annot_p->n_annot > 0) {
+ m_msp->aa0a = m_msp->annot_p->aa1_ann;
+ }
+ if (!m_msp->ann_arr[0]) {m_msp->ann_arr[0] = ' '; m_msp->ann_arr[1] = '\0';}
+ }
+ */
+
+ /* if we have a variant annotation script, execute it and capture the output */
+ /* must do after bline is set */
+ if (m_msp->annot1_sname[0]) {
+ if (get_annot_list(m_msp->annot1_sname, m_msp, bbp_arr, nbest, 1, debug)< 0) {
+ fprintf(stderr,"*** error [%s:%d] - %s did not produce annotations for %s\n",__FILE__, __LINE__, m_msp->annot1_sname,m_msp->qtitle);
+ /* clear the script name so it is not used again */
+ m_msp->annot1_sname[0] = '\0';
+ };
+ if (!m_msp->ann_arr[0]) {m_msp->ann_arr[0] = ' '; m_msp->ann_arr[1] = '\0';}
+ }
+
+ m_msp->pre_load_done = 1;
+}
+
+/* merge_ares_chains()
+
+ seeks to merge two ares chains, producing a single chain that is
+ sorted by sw_score.
+
+ Strategy -- choose the chain with the highest score, and go down
+ it until the head of the other chain has higher score, then link
+ the other chain to the main chain, breaking the first, and
+ continue the process.
+
+ The two pointers, max_next and alt_next, keep track of the best
+ and the alternate chain
+ */
+
+
+#undef SHOW_MERGE_CHAIN
+
+/* merge_ares_chains() - splice two ->next chains into one.
+
+   cur_ares, tmp_ares  heads of the two chains; either may be NULL
+   score_ix            which rst.score[] entry to order by
+   msg                 label used only by the SHOW_MERGE_CHAIN trace
+
+   The chain whose head has the higher score becomes the main chain;
+   it is followed until the head of the other chain scores higher,
+   the other chain is linked in at that point, and the roles swap.
+   Nodes are relinked in place, nothing is copied or freed.  Returns
+   the head of the merged chain, or the non-NULL argument unchanged
+   when the other one is NULL.
+*/
+struct a_res_str *
+merge_ares_chains(struct a_res_str *cur_ares,
+                  struct a_res_str *tmp_ares,
+                  int score_ix,
+                  const char *msg)
+{
+  struct a_res_str *max_next, *max_ares, *alt_ares, *prev_next;
+
+  if (!tmp_ares) return cur_ares;
+  /* an empty current chain used to be dereferenced below */
+  if (!cur_ares) return tmp_ares;
+
+#ifdef SHOW_MERGE_CHAIN
+  fprintf(stderr,"cur_ares->");
+  for (max_next = cur_ares; max_next; max_next = max_next->next) {
+    fprintf(stderr,"%d->",max_next->rst.score[score_ix]);
+  }
+
+  fprintf(stderr,"||\n");
+  fprintf(stderr,"tmp_ares->");
+  for (max_next = tmp_ares; max_next; max_next = max_next->next) {
+    fprintf(stderr,"%d->",max_next->rst.score[score_ix]);
+  }
+  fprintf(stderr,"||\n");
+#endif
+
+  /* start with the maximum score */
+
+  if (cur_ares->rst.score[score_ix] >= tmp_ares->rst.score[score_ix]) {
+    max_ares = max_next = prev_next = cur_ares;
+    alt_ares = tmp_ares;
+  }
+  else {
+    max_ares = max_next = prev_next = tmp_ares;
+    alt_ares = cur_ares;
+  }
+
+  while (max_next && alt_ares) {
+    /* this is guaranteed true for the first iteration */
+    while (max_next && max_next->rst.score[score_ix] >= alt_ares->rst.score[score_ix]) {
+      prev_next = max_next;
+      max_next = max_next->next;
+    }
+    if (max_next==NULL) break;
+    else {	/* max_next->rst.score[score_ix] no longer greater, switch
+		   pointers */
+      prev_next->next = alt_ares;
+      alt_ares = max_next;
+      max_next = prev_next->next;
+    }
+  }
+
+  /* we quit whenever max_next or alt_ares == NULL; if
+     (max_next==NULL), then continue adding the rest of alt_ares */
+
+  if (max_next==NULL) {
+    prev_next->next = alt_ares;
+  }
+
+
+#ifdef SHOW_MERGE_CHAIN
+  fprintf(stderr,"[%s] merge_ares->",msg);
+  for (max_next = max_ares; max_next; max_next = max_next->next) {
+    fprintf(stderr,"%d->",max_next->rst.score[score_ix]);
+  }
+  fprintf(stderr,"||\n\n");
+#endif
+
+  return max_ares;
+}
+
+/* shuffle() -- copies the n residues of from[] into to[] (the copy is
+   skipped when both name the same buffer), then permutes to[0..n-1]
+   in place, taking each swap partner from my_nrand(); finally writes
+   a 0 at to[n] */
+
+extern int my_nrand(int, void *);
+
+void
+shuffle(unsigned char *from, unsigned char *to, int n, void *rand_state)
+{
+  int last, pick;
+  unsigned char held;
+
+  if (from != to) { memcpy((void *)to, (void *)from, n); }
+
+  /* after the decrement, last is the final slot of the part of to[]
+     that has not been fixed yet */
+  last = n;
+  while (last-- > 0) {
+    pick = my_nrand(last + 1, rand_state);
+    held = to[pick];
+    to[pick] = to[last];
+    to[last] = held;
+  }
+  to[n] = 0;
+}
+
+/* shuffle3() -- shuffles DNA sequences as codons
+
+   Copies the n residues of from[] into to[] (skipped when both name
+   the same buffer) and then permutes the n/3 complete triplets of
+   to[], moving each triplet as a unit.  Any n%3 residues left over
+   at the end stay where they are.  A 0 is written at to[n].
+
+   Codon c occupies to[3*c] .. to[3*c+2]; the last not-yet-fixed codon
+   in iteration i is therefore codon i-1, starting at 3*(i-1).
+*/
+void
+shuffle3(unsigned char *from, unsigned char *to, int n, void *rand_state)
+{
+  int i, j, k, i3, j3;
+  unsigned char tmp;
+  int n3;
+
+  if (from != to) memcpy((void *)to,(void *)from,n);
+
+  n3 = n/3;
+
+  for (i=n3; i>0; i--) {
+    j = my_nrand(i, rand_state);
+    i3 = (i-1)*3;       /* first base of the last unfixed codon */
+    j3 = j*3;           /* first base of the randomly chosen codon */
+    for (k=0; k<3; k++) {
+      tmp = to[j3+k];
+      to[j3+k] = to[i3+k];
+      to[i3+k] = tmp;
+    }
+  }
+  to[n] = 0;
+}
+
+/* rshuffle() -- "shuffles" by reversing the sequence
+
+   Writes from[n-1], from[n-2], ... from[0] to to[0..n-1] and a '\0'
+   at to[n].  from and to must be different buffers: the copy is not
+   safe in place.
+*/
+void
+rshuffle(unsigned char *from, unsigned char *to, int n)
+{
+  /* one past the last residue; pre-decrement below so that the first
+     byte copied is from[n-1], not from[n] */
+  unsigned char *ptr = from + n;
+
+  while (n-- > 0) {
+    *to++ = *--ptr;
+  }
+  *to = '\0';
+}
+
+/* selects which end the windows of wshuffle() are anchored to; it is
+   flipped on every call (file-scope state, shared by all callers) */
+static int ieven = 0;
+
+/* wshuffle_window() -- permutes win[0..len-1] in place, taking each
+   swap partner from my_nrand(); does nothing when len <= 0 */
+static void
+wshuffle_window(unsigned char *win, int len, void *rand_state)
+{
+  int i, j;
+  unsigned char tmp;
+
+  for (i=len; i>0; i--) {
+    j = my_nrand(i, rand_state);
+    tmp = win[j];
+    win[j] = win[i-1];
+    win[i-1] = tmp;
+  }
+}
+
+/* wshuffle() -- copies from[] to to[] and shuffles within windows
+
+   The n residues are divided into consecutive windows of wsiz
+   residues and each window is shuffled independently; the n%wsiz
+   residues that do not fill a window are shuffled as one short
+   window.  When ieven is set the windows are laid out from the
+   start of the sequence (short window at the end), otherwise from
+   the end (short window at the start).  A 0 is written at to[n].
+
+   wsiz values below 1 are treated as 1, which leaves the order of
+   the residues unchanged.
+*/
+void
+wshuffle(unsigned char *from, unsigned char *to, int n, int wsiz, void *rand_state)
+{
+  int k, mm;
+
+  if (from != to) memcpy((void *)to,(void *)from,n);
+
+  /* n%wsiz below would divide by zero */
+  if (wsiz < 1) wsiz = 1;
+
+  mm = n%wsiz;
+
+  if (ieven) {
+    /* "<=" so that the last full window is shuffled as well when n
+       is an exact multiple of wsiz */
+    for (k=0; k<=(n-wsiz); k += wsiz) {
+      wshuffle_window(&to[k], wsiz, rand_state);
+    }
+    wshuffle_window(&to[n-mm], mm, rand_state);
+    ieven = 0;
+  }
+  else {
+    for (k=n; k>=wsiz; k -= wsiz) {
+      wshuffle_window(&to[k-wsiz], wsiz, rand_state);
+    }
+    wshuffle_window(&to[0], mm, rand_state);
+    ieven = 1;
+  }
+  to[n] = 0;
+}
+
+/* sfn_cmp() -- looks for a value present in both 0-terminated int
+   lists q[] and s[]
+
+   The two lists are stepped through together, always advancing the
+   one whose current value is smaller (this finds every shared value
+   only if both lists are in ascending order -- presumably they are;
+   verify against the callers).
+
+   returns: the first shared value met, or 0 once either list ends
+*/
+int
+sfn_cmp(int *q, int *s)
+{
+  for (;;) {
+    if (*q == *s) { return *q; }
+    if (*q == 0 || *s == 0) { return 0; }
+    if (*q < *s) { q++; }
+    else { s++; }
+  }
+}
+
+#ifndef MPI_SRC
+/* revcomp() -- reverse-complements seq[0..n-1] in place
+
+   c_nt[] maps each residue code to its complement.  The two ends are
+   walked towards each other, exchanging complemented residues; for
+   odd n the middle residue is complemented where it stands.  A 0 is
+   written at seq[n].
+*/
+void
+revcomp(unsigned char *seq, int n, int *c_nt)
+{
+  unsigned char held;
+  int lo = 0;
+  int hi = n - 1;
+
+  while (lo < hi) {
+    held = c_nt[seq[lo]];
+    seq[lo] = c_nt[seq[hi]];
+    seq[hi] = held;
+    lo++;
+    hi--;
+  }
+  /* the indices meet only when n is odd */
+  if (lo == hi) {
+    seq[lo] = c_nt[seq[lo]];
+  }
+  seq[n]=0;
+}
+#endif
+
+/* check to see whether this score (or a shuff score) should
+ be included in statistics
+
+ pre_nstats -- in/out count of the scores offered so far; it is
+ incremented here
+ nstats -- number of samples actually held (only read in the
+ SAMP_STATS_LESS build)
+ rand_state -- passed through to my_nrand()
+
+ returns: -1 if the score is not to be sampled, otherwise the index
+ chosen for it. In the default (non SAMP_STATS_LESS) build a
+ candidate index >= MAX_STATS is turned into -1.
+*/
+int samp_stats_idx (int *pre_nstats, int nstats, void *rand_state) {
+ int jstats = -1;
+
+ /* this code works when every score can be used for statistics
+ estimates, but fails for fasta/[t]fast[xy] where only a fraction
+ of scores are used */
+
+ /* the first MAX_STATS scores offered are all taken, in order */
+ if (*pre_nstats < MAX_STATS) {
+ jstats = (*pre_nstats)++;
+ }
+
+ /* here, the problem is that while we may have pre_nstats
+ possible samplings, in some cases (-M subsets, fasta,
+ [t]fast[xy] we don't have MAX_STATS samples yet. Until we
+ have MAX_STATS, we want more. But the stats_idx strategy
+ means that there may be additional samples in the buffers
+ that are not reflected in nstats.
+ */
+
+ else {
+#ifdef SAMP_STATS_LESS
+ /* now we have MAX_STATS samples
+ we want to sample 1/2 of 60K - 120K, 1/3 of 120K - 180K, etc */
+ /* check every 15K to see if we have gone past the next threshold */
+
+ /* pre_nstats cannot be incremented before the % to ensure
+ that stats_inc is incremented exactly at 60000, 120000, etc.
+ use ">=" in case increment comes later
+ tests suggest the first 60K are sampled about 25% more
+ than the rest
+ */
+ /* NOTE(review): stats_inc is not declared in this function;
+ presumably a file-scope variable -- confirm. Also note that
+ *pre_nstats is only incremented when the else-if condition below
+ is evaluated, i.e. not while nstats < MAX_STATS. */
+ if (nstats < MAX_STATS) {
+ jstats = MAX_STATS - my_nrand(MAX_STATS - nstats, rand_state)-1;
+ }
+ else if (((*pre_nstats)++ % (MAX_STATS/4)) == 0 &&
+ *pre_nstats >= stats_inc * MAX_STATS) {
+ stats_inc = (*pre_nstats / MAX_STATS) + 1;
+ }
+ if ((*pre_nstats % stats_inc) == 0) {
+ jstats = my_nrand(MAX_STATS, rand_state);
+ }
+#else
+ /* this sampling strategy calls my_nrand() for every
+ sequence > 60K, but provides a very uniform sampling */
+ /* draw against the incremented count of scores offered; a draw
+ that lands at or beyond MAX_STATS means "do not sample" */
+ jstats = my_nrand(++(*pre_nstats), rand_state);
+ if (jstats >= MAX_STATS) { jstats = -1;}
+#endif
+ }
+ return jstats;
+}
+
+/* **************************************************************** */
+/* build_link_data -- produces fasta file from m_msp->
+   (1) generate a temporary file name
+   (2) write out accessions \t expects to the temporary file
+   (3) run script against temporary file, producing fasta_file_expansion_file
+   (4) return fasta expansion filename for standard fasta openlib().
+
+   link_lib_file_p -- out: *link_lib_file_p receives the (calloc'ed)
+                      name of the file holding the script output, or
+                      NULL on failure
+   m_msp           -- supplies link_lname (the script, optionally
+                      prefixed by '!' and/or '@' and followed by
+                      " <libtype>"), nskip and nshow
+   bestp_arr       -- the first nskip+nshow entries are written out
+   debug           -- when non-zero the accession file is kept
+
+   returns: the expansion library string ("[@]file[ libtype]",
+            calloc'ed), or NULL if no links are available or on any
+            failure.  Always NULL when UNIX is not defined.
+*/
+/* **************************************************************** */
+char *
+build_link_data(char **link_lib_file_p,
+                struct mngmsg *m_msp, struct beststr **bestp_arr,
+                int debug) {
+  int i, status;
+  char tmp_line[MAX_SSTR];
+  char link_acc_file[MAX_STR];
+  int link_acc_fd;
+  char *link_lib_file;
+  char *link_lib_str;
+  char link_script[MAX_LSTR];
+  int link_lib_type;
+  char *bp, *link_bp;
+  FILE *link_fd=NULL;   /* file for link accessions */
+
+#ifndef UNIX
+  return NULL;
+#else
+  /* get two tmpfiles, one for accessions, one for library */
+  link_acc_file[0] = '\0';
+
+  if ((link_lib_file=(char *)calloc(MAX_STR,sizeof(char)))==NULL) {
+    fprintf(stderr,"*** error [%s:%d] - [build_link_data] Cannot allocate link_lib_file",
+            __FILE__, __LINE__);
+    goto no_links;      /* free(NULL) there is harmless */
+  }
+  link_lib_file[0] = '\0';
+
+  if ((bp=getenv("TMP_DIR"))!=NULL) {
+    strncpy(link_acc_file,bp,sizeof(link_acc_file));
+    link_acc_file[sizeof(link_acc_file)-1] = '\0';
+    SAFE_STRNCAT(link_acc_file,"/",sizeof(link_acc_file));
+  }
+
+  SAFE_STRNCAT(link_acc_file,"link_acc_XXXXXX",sizeof(link_acc_file));
+  link_acc_fd = mkstemp(link_acc_file);
+  if (link_acc_fd < 0) {
+    fprintf(stderr,"*** error [%s:%d] - Cannot create link_acc_file: %s\n",
+            __FILE__, __LINE__, link_acc_file);
+    goto no_links;
+  }
+
+  /* the library file is the accession file name plus ".lib";
+     strncpy() does not terminate when the source fills the buffer */
+  strncpy(link_lib_file,link_acc_file,MAX_STR);
+  link_lib_file[MAX_STR-1] = '\0';
+  SAFE_STRNCAT(link_lib_file,".lib",MAX_STR);
+
+  /* write out accessions to link_acc_file */
+  if ((link_fd =fdopen(link_acc_fd,"w"))==NULL) {
+    fprintf(stderr,"*** error [%s:%d] - Cannot open link_acc_file: %s\n",
+            __FILE__, __LINE__, link_acc_file);
+    close(link_acc_fd); /* no stream took ownership of the descriptor */
+    if (!debug) unlink(link_acc_file);
+    goto no_links;
+  }
+
+  for (i=0; i<m_msp->nskip + m_msp->nshow; i++) {
+    /* print only the first word of bline, then restore the blank */
+    if ((bp=strchr(bestp_arr[i]->mseq->bline,' '))!=NULL) {
+      *bp = '\0';
+    }
+    fprintf(link_fd,"%s\t%.3g\n",bestp_arr[i]->mseq->bline,bestp_arr[i]->rst.escore);
+    if (bp != NULL) *bp=' ';
+  }
+  fclose(link_fd);
+
+  /* build link_script link_acc_file > link_lib_file */
+  /* check for indirect */
+  link_bp = &m_msp->link_lname[0];
+  if (*link_bp == '!') {
+    link_bp++;
+  }
+  if (*link_bp == '@') {
+    link_bp++;
+  }
+
+  /* remove library type; stays 0 when absent or not a number */
+  link_lib_type = 0;
+  if ((bp=strchr(link_bp,' '))!=NULL) {
+    *bp = '\0';
+    sscanf(bp+1,"%d",&link_lib_type);
+  }
+
+  strncpy(link_script,link_bp,sizeof(link_script));
+  link_script[sizeof(link_script)-1] = '\0';
+  SAFE_STRNCAT(link_script," ",sizeof(link_script));
+  SAFE_STRNCAT(link_script,link_acc_file,sizeof(link_script));
+  SAFE_STRNCAT(link_script," >",sizeof(link_script));
+  SAFE_STRNCAT(link_script,link_lib_file,sizeof(link_script));
+
+  /* un-edit m_msp->link_lname */
+  if (bp != NULL) *bp = ' ';
+
+  /* run link_script link_acc_file > link_lib_file */
+  status = system(link_script);
+  if (!debug) {
+    unlink(link_acc_file);
+  }
+
+  if (status == NO_FILE_EXIT) {  /* my specific return for no links */
+    goto no_links;
+  }
+
+  if (status < 0 || status == 127) {
+    fprintf(stderr,"*** error [%s:%d] - script: %s failed\n",
+            __FILE__, __LINE__,link_script);
+    goto no_links;
+  }
+
+  /* only checking that the script output can be opened */
+  if ((link_fd=fopen(link_lib_file,"r"))==NULL) {
+    goto no_links;
+  }
+  else fclose(link_fd);
+
+  if ((link_lib_str=(char *)calloc(MAX_STR,sizeof(char)))==NULL) {
+    fprintf(stderr,"*** error [%s:%d] - [build_link_data] Cannot allocate link_lib_str",
+            __FILE__, __LINE__);
+    /* the caller will never learn the file name, so remove it here */
+    if (!debug) unlink(link_lib_file);
+    goto no_links;
+  }
+
+  /* build the file string (possibly @link_lib_file libtype) */
+  /* NOTE(review): this tests link_lname[0], so a "!@script" name
+     (both prefixes are skipped above) does not get the '@' -- confirm
+     that is intended */
+  link_lib_str[0]='\0';
+  if (m_msp->link_lname[0] == '@') {
+    SAFE_STRNCAT(link_lib_str,"@",MAX_STR);
+  }
+  SAFE_STRNCAT(link_lib_str,link_lib_file,MAX_STR);
+  if (link_lib_type > 0) {
+    sprintf(tmp_line," %d",link_lib_type);
+    SAFE_STRNCAT(link_lib_str,tmp_line,MAX_STR);
+  }
+
+  *link_lib_file_p = link_lib_file;
+  return link_lib_str;
+
+ no_links:
+  free(link_lib_file);
+  *link_lib_file_p = NULL;
+  return NULL;
+#endif
+}
+
+/* **************************************************************** */
+/* build_lib_db -- produces fasta file from script
+ (1) generate a temporary file name lib_db_file
+ (2) run script producing data in lib_db_file
+
+ returns: the expansion library file name
+ **db_str_file_p is the name of the file with the data
+ that will be removed.
+*/
+/* **************************************************************** */
+char *
+build_lib_db(char *script_file) {
+ int i, status;
+ char tmp_line[MAX_SSTR];
+ char *lib_db_file, *lib_db_str;
+ char lib_db_script[MAX_LSTR];
+ int lib_db_indirect;
+ int lib_db_type;
+ int lib_db_str_len;
+ char *bp, *lib_bp;
+
+ if ((lib_db_file=(char *)calloc(MAX_STR,sizeof(char)))==NULL) {
+ fprintf(stderr,"*** error [%s:%d] - [build_lib_db] Cannot allocate lib_db_file",
+ __FILE__, __LINE__);
+ goto no_lib;
+ }
+
+ if ((bp=getenv("TMP_DIR"))!=NULL) {
+ strncpy(lib_db_file,bp,MAX_STR);
+ lib_db_file[sizeof(lib_db_file)-1] = '\0';
+ SAFE_STRNCAT(lib_db_file,"/",sizeof(lib_db_file));
+ }
+
+ SAFE_STRNCAT(lib_db_file,"lib_db_XXXXXX",MAX_STR);
+ mktemp(lib_db_file);
+ lib_db_str_len = strlen(lib_db_file)+1;
+
+ /* check for indirect */
+ lib_bp = script_file;
+ if (*lib_bp == '@') {
+ lib_bp++;
+ lib_db_str_len++;
+ }
+ /* remove library type */
+ if ((bp=strchr(lib_bp,' '))!=NULL) {
+ *bp = '\0';
+ sscanf(bp+1,"%d",&lib_db_type);
+ lib_db_str_len += (strlen(bp+1)+1);
+ }
+ else {
+ lib_db_type = 0;
+ }
+
+ strncpy(lib_db_script,lib_bp,sizeof(lib_db_script));
+ lib_db_script[sizeof(lib_db_script)-1] = '\0';
+ SAFE_STRNCAT(lib_db_script," >",sizeof(lib_db_script));
+ SAFE_STRNCAT(lib_db_script,lib_db_file,sizeof(lib_db_script));
+
+ if (bp != NULL) *bp = ' ';
+
+ /* run lib_db_script link_acc_file > lib_db_file */
+ status = system(lib_db_script);
+
+ if (status == NO_FILE_EXIT) { /* my specific return for no links */
+ goto no_lib;
+ }
+
+ if (status < 0 || status == 127) {
+ fprintf(stderr,"*** error [%s:%d] - [build_lib_db] script: %s failed\n",
+ __FILE__, __LINE__, lib_db_script);
+ goto no_lib;
+ }
+
+ /* build the file string (possibly @lib_db_str libtype) */
+ if ((lib_db_str=calloc(lib_db_str_len+1,sizeof(char)))==NULL) {
+ fprintf(stderr,"*** error [%s:%d] - [build_lib_db] cannot allocate lib_db_str[%d]\n",
+ __FILE__, __LINE__, lib_db_str_len+1);
+ goto no_lib;
+ }
+ lib_db_str[0]='\0';
+ if (*script_file == '@') {
+ SAFE_STRNCAT(lib_db_str,"@",MAX_STR);
+ }
+ SAFE_STRNCAT(lib_db_str,lib_db_file,MAX_STR);
+ if (lib_db_type > 0) {
+ sprintf(tmp_line," %d",lib_db_type);
+ SAFE_STRNCAT(lib_db_str,tmp_line,MAX_STR);
+ }
+
+ return lib_db_str;
+
+ no_lib:
+ return NULL;
+}
+
+/* used to temporarily allocate annotation array in next_annot_entry();
+ set up by init_tmp_annot() and enlarged by update_tmp_annot() */
+struct annot_mstr {
+ int max_annot; /* number of entries tmp_arr_p currently has room for */
+ struct annot_entry *tmp_arr_p; /* the scratch array; NULL means "not allocated yet" to init_tmp_annot() */
+};
+
+/* init_tmp_annot(this, size) initializes the structure used to track annots
+
+   Allocates the scratch array only when there is none (tmp_arr_p is
+   NULL or max_annot is not positive); an existing array is left
+   untouched.  The array gets room for size entries, but never fewer
+   than 32.
+
+   returns: 1 if an array is available, 0 if the allocation failed
+            (max_annot is then reset to 0)
+*/
+int
+init_tmp_annot(struct annot_mstr *this, int size) {
+
+  /* only reset if array is NULL */
+  if (this->tmp_arr_p == NULL || this->max_annot <= 0) {
+    this->max_annot = (size > 32) ? size : 32;
+    if ((this->tmp_arr_p=(struct annot_entry *)calloc(this->max_annot, sizeof(struct annot_entry)))==NULL) {
+      fprintf(stderr,"*** error [%s:%d] - cannot allocate annot_entry[%d]\n",
+              __FILE__,__LINE__,this->max_annot);
+      this->max_annot = 0;      /* no room, not room for max_annot entries */
+      return 0;
+    }
+  }
+  return 1;
+}
+
+/* update_tmp_annot(this) enlarges the scratch array by half its size
+   (by at least one entry)
+
+   returns: 1 on success; 0 if the reallocation failed, in which case
+            the existing array and max_annot are left as they were
+*/
+int
+update_tmp_annot(struct annot_mstr *this) {
+  struct annot_entry *new_arr_p;
+  int new_max;
+
+  new_max = this->max_annot + (this->max_annot/2);
+  /* max_annot/2 is 0 for max_annot < 2 */
+  if (new_max <= this->max_annot) new_max = this->max_annot + 1;
+
+  /* realloc() leaves the old block allocated when it fails, so the
+     result must not overwrite the only pointer to it */
+  if ((new_arr_p = (struct annot_entry *)realloc(this->tmp_arr_p, new_max*sizeof(struct annot_entry)))==NULL) {
+    fprintf(stderr,"[*** error [%s:%d] - cannot reallocate tmp_ann_astr[%d]\n",
+            __FILE__, __LINE__, new_max);
+    return 0;
+  }
+  this->tmp_arr_p = new_arr_p;
+  this->max_annot = new_max;
+  return 1;
+}
+
+/* forward declaration: get_annot_list() calls next_annot_entry()
+ ahead of its definition further down */
+struct annot_str *
+next_annot_entry(FILE *annot_fd, char *tmp_line, int n_tmp_line,
+ struct annot_str *annot_p,
+ struct annot_mstr *mtmp_annot_p,
+ struct mngmsg *m_msp, int target);
+
+/* **************************************************************** */
+/* get_annot_list -- produces fasta file from sname
+   if sname[0]=='<', read the file directly, goto (4)
+   if sname[0]=='!', run a script
+   (1) generate a temporary file name
+   (2) write out list of blines
+   (3) run the script in sname[] against temporary file, producing table of annotations
+
+   (4) read in the annotations and merge them into beststr
+   (5) return number of annotations
+
+   sname     -- "!script" or "<file"
+   bestp_arr -- the nbest entries to annotate
+   target    -- passed through to next_annot_entry()
+   debug     -- when non-zero the temporary files are kept
+
+   returns: the number of sequences that received annotations, or -1
+            on any failure (every n_annot in bestp_arr[] is then
+            reset to 0).  Always 0 when UNIX is not defined.
+*/
+/* **************************************************************** */
+
+int
+get_annot_list(char *sname, struct mngmsg *m_msp, struct beststr **bestp_arr, int nbest,
+               int target, int debug) {
+  int i, status;
+  long l_offset;
+  char tmp_line[MAX_STR];
+  char annot_bline_file[MAX_STR];
+  int annot_bline_fd;
+  char *annot_descr_file;
+  char annot_script[MAX_LSTR];
+  struct annot_str *annot_p;
+  char *bp;
+  int annot_seq_cnt;
+  FILE *annot_fd=NULL;  /* file for annot accessions */
+  struct annot_mstr mtmp_annot;
+
+#ifndef UNIX
+  return 0;
+#else
+
+  /* nothing is held yet; this lets no_annots release whatever is
+     held at the point of failure */
+  annot_descr_file = NULL;
+  mtmp_annot.tmp_arr_p = NULL;
+  mtmp_annot.max_annot = 0;
+
+  if (sname[0] == '!') {
+
+    /* get two tmpfiles, one for bline, one for returned annotations
+       (but it would make more sense to use popen() to get the
+       annotations back
+    */
+
+    annot_bline_file[0] = '\0';
+
+    if ((annot_descr_file=(char *)calloc(MAX_STR,sizeof(char)))==NULL) {
+      fprintf(stderr,"*** error [%s:%d] - [get_annot_list] Cannot allocate annot_file",
+              __FILE__, __LINE__);
+      goto no_annots;
+    }
+    annot_descr_file[0] = '\0';
+
+    if ((bp=getenv("TMP_DIR"))!=NULL) {
+      strncpy(annot_bline_file,bp,sizeof(annot_bline_file));
+      annot_bline_file[sizeof(annot_bline_file)-1] = '\0';
+      SAFE_STRNCAT(annot_bline_file,"/",sizeof(annot_bline_file));
+    }
+
+    SAFE_STRNCAT(annot_bline_file,"annot_bline_XXXXXX",sizeof(annot_bline_file));
+    annot_bline_fd = mkstemp(annot_bline_file);
+    if (annot_bline_fd < 0) {
+      fprintf(stderr,"*** error [%s:%d] - Cannot create annot_bline_file: %s\n",__FILE__, __LINE__, annot_bline_file);
+      goto no_annots;
+    }
+
+    /* the annotation file is the bline file name plus ".annot";
+       strncpy() does not terminate when the source fills the buffer */
+    strncpy(annot_descr_file,annot_bline_file,MAX_STR);
+    annot_descr_file[MAX_STR-1] = '\0';
+    SAFE_STRNCAT(annot_descr_file,".annot",MAX_STR);
+
+    /* write out accessions to annot_bline_file */
+    if ((annot_fd =fdopen(annot_bline_fd,"w"))==NULL) {
+      fprintf(stderr,"*** error [%s:%d] - Cannot open annot_bline_file: %s\n",__FILE__, __LINE__, annot_bline_file);
+      close(annot_bline_fd);    /* no stream took ownership of the descriptor */
+      if (!debug) unlink(annot_bline_file);
+      goto no_annots;
+    }
+
+    for (i=0; i<nbest; i++) {
+      if (bestp_arr[i]->mseq->annot_req_flag) { continue; }
+      /* print bline up to the first blank at or after DESCR_OFFSET,
+         then restore the blank */
+      if ((strlen(bestp_arr[i]->mseq->bline) > DESCR_OFFSET) &&
+          (bp=strchr(bestp_arr[i]->mseq->bline+DESCR_OFFSET,' '))!=NULL) {*bp = '\0';}
+      else {bp = NULL;}
+      /* provide sequence length with offset, but only if offset is positive */
+      l_offset = bestp_arr[i]->seq->l_offset+bestp_arr[i]->seq->l_off -1;
+      if (l_offset < 0) { l_offset = 0;}
+      fprintf(annot_fd,"%s\t%ld\n",bestp_arr[i]->mseq->bline,
+              l_offset + bestp_arr[i]->seq->n1);
+      if (bp != NULL) *bp=' ';
+      bestp_arr[i]->mseq->annot_req_flag = 1;
+    }
+    fclose(annot_fd);
+    annot_fd = NULL;    /* so that no_annots does not close it again */
+
+    subs_env(annot_script, sname+1, sizeof(annot_script));
+    annot_script[sizeof(annot_script)-1] = '\0';
+    SAFE_STRNCAT(annot_script," ",sizeof(annot_script));
+    SAFE_STRNCAT(annot_script,annot_bline_file,sizeof(annot_script));
+    SAFE_STRNCAT(annot_script," >",sizeof(annot_script));
+    SAFE_STRNCAT(annot_script,annot_descr_file,sizeof(annot_script));
+
+    /* run annot_script annot_bline_file > annot_descr_file */
+    status = system(annot_script);
+    if (!debug) {
+      unlink(annot_bline_file);
+    }
+
+    if (status == NO_FILE_EXIT) {  /* my specific return for no annots */
+      goto no_annots;
+    }
+
+    if (status < 0 || status == 127) {
+      fprintf(stderr,"*** error [%s:%d] - script: %s failed\n",
+              __FILE__, __LINE__, annot_script);
+      goto no_annots;
+    }
+  }
+  else if (sname[0] == '<') {
+    annot_descr_file = sname+1; /* points into sname; never freed */
+  }
+  else {
+    fprintf(stderr,"*** error [%s:%d] - %s not script (!) or file (<)\n",__FILE__, __LINE__, sname);
+    goto no_annots;
+  }
+
+  /* read annotation file */
+
+  if ((annot_fd=fopen(annot_descr_file,"r"))==NULL) {
+    goto no_annots;
+  }
+
+  /* be sure to ask for annotation once */
+  for (i=0; i<nbest; i++) {
+    bestp_arr[i]->mseq->annot_req_flag = 1;
+  }
+  /* we have some annotations */
+  /* the annotation script MUST return the annotations ordered as in annot_descr_file,
+     in "fasta" form:
+
+     >bline_descr
+     pos<tab>label<tab>value?<tab>comment (which is not read in this version)
+     1 *
+     11 V N
+  */
+
+  /* now read the annotation/variant file */
+
+  /* read #comments, =annot_defs at beginning of file */
+  tmp_line[0] = '#';
+  while (tmp_line[0] == '#' || tmp_line[0] == '=') {
+    if (tmp_line[0] == '=') add_annot_def(m_msp, tmp_line+1,1);
+    if (fgets(tmp_line, sizeof(tmp_line), annot_fd)==NULL) {
+      fprintf(stderr,"*** error [%s:%d] - premature annotation file end (%s)\n",
+              __FILE__,__LINE__, annot_descr_file);
+      goto no_annots;
+    }
+  }
+
+  annot_seq_cnt = 0;
+
+  /* now read in the annotations, but only the first time if asked multiple times */
+  for (i=0; i<nbest; i++) {
+    if (!bestp_arr[i]->mseq->annot_req_flag) {
+      continue;
+    }
+    bestp_arr[i]->mseq->annot_req_flag = 0;
+
+    /* tmp_line holds the ">description" line that ended the previous
+       entry (or the file header) */
+    if ((bp=strchr(tmp_line,'\n'))!=NULL) *bp = '\0';
+    if ((bp=strchr(tmp_line,'\t'))!=NULL) *bp = '\0';
+    if (tmp_line[0] != '>' || strncmp(&tmp_line[1], bestp_arr[i]->mseq->bline, strlen(&tmp_line[1])) != 0) {
+      fprintf(stderr,"*** error [%s:%d] - %s description mismatch (%s:%s)\n",
+              __FILE__,__LINE__,annot_descr_file, tmp_line, bestp_arr[i]->mseq->bline);
+      goto no_annots;
+    }
+
+    annot_p = next_annot_entry(annot_fd, tmp_line, sizeof(tmp_line), bestp_arr[i]->seq->annot_p, &mtmp_annot, m_msp, target);
+
+    if (annot_p) {
+      bestp_arr[i]->seq->annot_p = annot_p;
+      s_annot_to_aa1a(bestp_arr[i]->seq->l_offset + bestp_arr[i]->seq->l_off - 1,
+                      bestp_arr[i]->seq->n1, annot_p,m_msp->ann_arr, bestp_arr[i]->mseq->libstr);
+      annot_seq_cnt++;
+      /* the scratch array now belongs to annot_p; force a new one */
+      mtmp_annot.tmp_arr_p = NULL;
+    }
+    else {
+      if (bestp_arr[i]->seq->annot_p) {
+        bestp_arr[i]->seq->annot_p->n_annot = 0;
+      }
+    }
+  }
+
+  if (mtmp_annot.tmp_arr_p) free(mtmp_annot.tmp_arr_p);
+
+  fclose(annot_fd);
+  if (sname[0]=='!') {
+    if (!debug) {
+      unlink(annot_descr_file);
+    }
+    free(annot_descr_file);
+  }
+  return annot_seq_cnt;
+
+ no_annots:
+  /* release whatever was held when the failure occurred */
+  if (annot_fd) fclose(annot_fd);
+  if (mtmp_annot.tmp_arr_p) free(mtmp_annot.tmp_arr_p);
+
+  for (i=0; i<nbest; i++) {
+    if (bestp_arr[i]->seq->annot_p) {
+      if (bestp_arr[i]->seq->annot_p->n_annot > 0) {
+        bestp_arr[i]->seq->annot_p->n_annot = 0;
+      }
+    }
+  }
+  if (sname[0] == '!' && annot_descr_file != NULL) {
+    /* the shell creates the redirection target even if the script fails */
+    if (!debug && annot_descr_file[0]) unlink(annot_descr_file);
+    free(annot_descr_file);
+  }
+  return -1;
+#endif
+}
+
+/* sort_annots() -- orders the n_annot pointers in s_annot[] by
+   increasing ->pos, using a gapped insertion sort with a fixed
+   sequence of six gaps that ends in 1; only the pointers are moved */
+void sort_annots(struct annot_entry **s_annot, int n_annot)
+{
+  static const int gaps[6] = { 112, 48, 21, 7, 3, 1 };
+  int pass, step, hi, slot;
+  int key;
+  struct annot_entry *moving;
+
+  for (pass = 0; pass < 6; pass++) {
+    step = gaps[pass];
+    for (hi = step; hi < n_annot; hi++) {
+      moving = s_annot[hi];
+      key = moving->pos;
+      /* slide larger entries of this gap-chain up by one step */
+      for (slot = hi; slot >= step && s_annot[slot - step]->pos > key; slot -= step) {
+        s_annot[slot] = s_annot[slot - step];
+      }
+      s_annot[slot] = moving;
+    }
+  }
+}
+
+/* next_annot_entry() -- reads the annotation lines of one sequence
+
+   On entry tmp_line holds the ">description" line of the entry (it is
+   saved for error messages); lines are then read from annot_fd until
+   the next line starting with '>' or end of file.  That '>' line is
+   left in tmp_line for the caller.
+
+   Each annotation line must be tab delimited:
+     pos <tab> label [<tab> value-or-end [<tab> comment]]
+   pos (and end) are converted to 0-offset.  A '-' label is a
+   start/stop pair and is stored as two entries, '[' at pos and ']'
+   at end.  '#' lines are ignored, '=' lines are handed to
+   add_annot_def().
+
+   annot_p       -- existing annot_str to fill, or NULL to have one
+                    allocated
+   mtmp_annot_p  -- scratch array; when annotations are returned the
+                    array belongs to the returned annot_str and the
+                    caller must not free or reuse it
+   target        -- stored in every entry
+
+   returns: annot_p (possibly newly allocated) when at least one
+            annotation was read, otherwise NULL
+*/
+struct annot_str *
+next_annot_entry(FILE *annot_fd, char *tmp_line, int n_tmp_line, struct annot_str *annot_p,
+                 struct annot_mstr *mtmp_annot_p, struct mngmsg *m_msp, int target) {
+
+  char ctmp_label, ctmp_value, tmp_comment[MAX_STR], annot_acc[MAX_STR];
+  char *bp;
+  int f_pos, f_end;
+  int i_ann, l_doms, r_doms;
+  int n_annot = 0;
+
+  struct annot_entry *tmp_ann_entry_arr, **s_tmp_ann_entry_arr;
+  struct annot_entry *shrunk_arr;
+
+  SAFE_STRNCPY(annot_acc, tmp_line, sizeof(annot_acc));
+
+  if (init_tmp_annot(mtmp_annot_p, 32)==0) return NULL;
+
+  tmp_ann_entry_arr = mtmp_annot_p->tmp_arr_p;
+
+  /* read through each annotation in file */
+  while (fgets(tmp_line, n_tmp_line, annot_fd)!=NULL ) {
+    if (tmp_line[0] == '>') break;      /* start of new annotation */
+    if (tmp_line[0] == '#') continue;   /* ignore comments */
+    if (tmp_line[0] == '=') {   /* symbol definition */
+      add_annot_def(m_msp, tmp_line+1,1);
+      continue;
+    }
+
+    if (n_annot >= mtmp_annot_p->max_annot - 1) {
+      /* try to expand annotation array */
+      if (update_tmp_annot(mtmp_annot_p)==0) {
+        return NULL;
+      }
+      tmp_ann_entry_arr = mtmp_annot_p->tmp_arr_p;
+    }
+
+    /* sscanf cannot give strings with blanks */
+    /* sscanf(tmp_line,"%d %c %c %s", &f_pos, &ctmp_label, &ctmp_value, tmp_comment); */
+    tmp_comment[0] = '\0';
+    if ((bp=strchr(tmp_line,'\r')) || (bp=strchr(tmp_line,'\n'))) *bp='\0';    /* clean up EOL */
+    if ((bp=strchr(tmp_line,'\t'))==NULL) continue;     /* fields MUST be tab delimited */
+
+    f_pos=atoi(tmp_line) - 1;   /* get first field -- f_pos, converted to 0-offset */
+    ctmp_label = bp[1];         /* get second field -- ctmp_label */
+    /* defaults for lines that stop after the label ("1<tab>*") */
+    f_end = f_pos;
+    ctmp_value = '\0';
+    if ((bp=strchr(bp+1,'\t'))!=NULL) {  /* next field could be f_end or ctmp_value */
+      if (ctmp_label == '-') { f_end = atoi(bp+1) -1; }
+      else {ctmp_value = bp[1]; }       /* have variant, not coordinate */
+      if ((bp=strchr(bp+1,'\t'))!=NULL) {       /* if last <tab>, get comment */
+        strncpy(tmp_comment,bp+1,sizeof(tmp_comment));
+        tmp_comment[sizeof(tmp_comment)-1] = '\0';
+      }
+    }
+
+    tmp_ann_entry_arr[n_annot].pos = f_pos;
+    tmp_ann_entry_arr[n_annot].end = f_end;
+    tmp_ann_entry_arr[n_annot].label=ctmp_label;
+    tmp_ann_entry_arr[n_annot].value=ctmp_value;
+    tmp_ann_entry_arr[n_annot].comment = NULL;
+    tmp_ann_entry_arr[n_annot].target = target;
+    tmp_ann_entry_arr[n_annot].start = NULL;
+
+    if (tmp_comment[0]) {
+      if ((tmp_ann_entry_arr[n_annot].comment=(char *)calloc(strlen(tmp_comment)+1,sizeof(char)))!=NULL) {
+        strncpy(tmp_ann_entry_arr[n_annot].comment,tmp_comment,strlen(tmp_comment));
+      }
+    }
+    if (ctmp_label== 'V') {
+      /* map the .value from ascii to encoded residue */
+      /* this must be lascii, not qascii, because the script
+         describes the library, not the query, and for FASTX/TFASTX,
+         those are different */
+      tmp_ann_entry_arr[n_annot].value = lascii[tmp_ann_entry_arr[n_annot].value];
+    }
+    else if (ctmp_label == '-') { /* if ctmp_label == '-', have f_end, which must be added with ']' */
+      /* make sure start/stop characters are in annotation alphabet */
+      i_ann = add_annot_char(m_msp->ann_arr, '[');
+      if (i_ann > 0) {
+        qascii['['] = NANN + i_ann;
+        m_msp->ann_arr_def[i_ann] = NULL;
+      }
+
+      tmp_ann_entry_arr[n_annot].label='[';
+      n_annot++;
+
+      if (n_annot >= mtmp_annot_p->max_annot - 1) {
+        if (update_tmp_annot(mtmp_annot_p)==0) {
+          return NULL;
+        }
+        tmp_ann_entry_arr = mtmp_annot_p->tmp_arr_p;
+      }
+
+      /* only required for start - stop annotations; requires sort
+         later (which is not currently used */
+      tmp_ann_entry_arr[n_annot].pos = f_end;
+      tmp_ann_entry_arr[n_annot].end = f_end;
+      tmp_ann_entry_arr[n_annot].label=']';
+      tmp_ann_entry_arr[n_annot].value=ctmp_value;
+      tmp_ann_entry_arr[n_annot].comment = NULL;
+      tmp_ann_entry_arr[n_annot].target = target;
+      /* the matching '[' is always the entry just before; the address
+         is recomputed once the array has reached its final place */
+      tmp_ann_entry_arr[n_annot].start = &tmp_ann_entry_arr[n_annot-1];
+
+      i_ann = add_annot_char(m_msp->ann_arr, ']');
+      if (i_ann > 0) {
+        qascii[']'] = NANN + i_ann;
+        m_msp->ann_arr_def[i_ann] = NULL;
+      }
+    }
+    /* every other label, explicit '[' and ']' included, just has to
+       be registered in the annotation alphabet */
+    else if ((i_ann = add_annot_char(m_msp->ann_arr, ctmp_label)) > 0) {
+      m_msp->ann_arr_def[i_ann] = NULL;
+      qascii[ctmp_label] = NANN + i_ann;
+    }
+    n_annot++;
+  }
+
+  if (n_annot) {  /* if we have annotations, save them and set tmp_ann_entry_arr = NULL */
+    /* trim the array; if that fails the untrimmed array (which has
+       room for at least n_annot+1 entries) is simply kept */
+    shrunk_arr = (struct annot_entry *)realloc(tmp_ann_entry_arr, (n_annot+1)*sizeof(struct annot_entry));
+    if (shrunk_arr != NULL) {
+      tmp_ann_entry_arr = shrunk_arr;
+      /* keep the owner in step, in case the block moved */
+      mtmp_annot_p->tmp_arr_p = shrunk_arr;
+      mtmp_annot_p->max_annot = n_annot+1;
+    }
+
+    /* the array may have moved since the ']' entries were written;
+       point each one at its '[' again */
+    for (i_ann=1; i_ann < n_annot; i_ann++) {
+      if (tmp_ann_entry_arr[i_ann].start != NULL) {
+        tmp_ann_entry_arr[i_ann].start = &tmp_ann_entry_arr[i_ann-1];
+      }
+    }
+
+    if ((s_tmp_ann_entry_arr = (struct annot_entry **)calloc((n_annot+1),sizeof(struct annot_entry *)))==NULL) {
+      fprintf(stderr,"*** error [%s:%d] - cannot alloc s_tmp_ann_entry_arr[%d]",
+              __FILE__,__LINE__, n_annot+1);
+      exit(1);
+    }
+
+    /* pair every domain start/stop */
+    /* (1) count number of domains/check for consistency */
+    l_doms = r_doms = 0;
+    for (i_ann=0; i_ann < n_annot; i_ann++) {
+      if (tmp_ann_entry_arr[i_ann].label == '[') l_doms++;
+      if (tmp_ann_entry_arr[i_ann].label == ']') r_doms++;
+    }
+    if (l_doms != r_doms) {
+      fprintf(stderr,"*** error [%s:%d] - unpaired domains: %s %d != %d\n",
+              __FILE__,__LINE__, annot_acc, l_doms, r_doms);
+#ifdef DEBUG
+      for (i_ann=0; i_ann < n_annot; i_ann++) {
+        if (tmp_ann_entry_arr[i_ann].label == '[')
+          fprintf(stderr, "%ld %c %s\n",tmp_ann_entry_arr[i_ann].pos,tmp_ann_entry_arr[i_ann].label,tmp_ann_entry_arr[i_ann].comment);
+        if (tmp_ann_entry_arr[i_ann].label == ']')
+          fprintf(stderr, "%ld %c\n",tmp_ann_entry_arr[i_ann].pos,tmp_ann_entry_arr[i_ann].label);
+      }
+#endif
+    }
+    else if (l_doms > 0) {
+      /* NOTE(review): tmp_domain_entry_arr is not declared in this
+         function and is not attached to annot_p below; presumably a
+         file-scope variable -- confirm who owns and frees it */
+      if ((tmp_domain_entry_arr = (struct annot_entry *)calloc((l_doms+1),sizeof(struct annot_entry)))==NULL) {
+        fprintf(stderr,"*** error [%s:%d] - cannot alloc s_tmp_ann_entry_arr[%d]",
+                __FILE__,__LINE__, l_doms+1);
+      }
+      else {
+        l_doms = 0;
+        /* only the n_annot filled entries; entry n_annot was never written */
+        for (i_ann=0; i_ann < n_annot; i_ann++) {
+          if (tmp_ann_entry_arr[i_ann].label == '[') {
+            tmp_domain_entry_arr[l_doms].pos = tmp_ann_entry_arr[i_ann].pos;
+            tmp_domain_entry_arr[l_doms].label = '-';
+            tmp_domain_entry_arr[l_doms].comment = tmp_ann_entry_arr[i_ann].comment;
+          }
+          else if (tmp_ann_entry_arr[i_ann].label == ']') {
+            tmp_domain_entry_arr[l_doms].end = tmp_ann_entry_arr[i_ann].pos;
+            l_doms++;
+          }
+        }
+      }
+    }
+
+    for (i_ann=0; i_ann < n_annot+1; i_ann++) {
+      s_tmp_ann_entry_arr[i_ann] = &tmp_ann_entry_arr[i_ann];
+    }
+
+    sort_annots(s_tmp_ann_entry_arr,n_annot);
+
+    /* now allocate an annot_p if necessary, and link tmp_ann_entry_arr to it */
+    if (annot_p || (annot_p = calloc(1,sizeof(struct annot_str)))!=NULL) {
+      annot_p->annot_arr_p = tmp_ann_entry_arr;
+      annot_p->s_annot_arr_p = s_tmp_ann_entry_arr;
+      annot_p->n_annot = n_annot;
+      annot_p->n_domains = l_doms;
+      /* set to NULL to re-initialize */
+    }
+    else {
+      /* nothing will ever point at the sorted index */
+      free(s_tmp_ann_entry_arr);
+    }
+  }
+  else {
+    annot_p = NULL;
+  }
+  return annot_p;
+}
+
+
+/* **************************************************************** */
+/* add_annot_char(ann_arr, ctmp_label) --
+
+   ann_arr is the 0-terminated list of annotation characters; an
+   empty list is first given a single blank at index 0.
+
+   (1) append ctmp_label to ann_arr if it is not there yet
+   (2) return the index it was stored at
+
+   returns 0 both when the character was already present and when
+   the list already holds MAX_FN characters (an error is printed in
+   the second case).
+
+   NOTE(review): the terminator can be written at index MAX_FN, so
+   ann_arr presumably has at least MAX_FN+1 bytes -- verify.
+*/
+/* **************************************************************** */
+
+int
+add_annot_char(unsigned char *ann_arr, char ctmp_label) {
+  int n_used;
+
+  if (ann_arr[0] == '\0') {
+    ann_arr[0] = ' ';
+    ann_arr[1] = '\0';
+  }
+
+  /* already known: nothing to add */
+  if (strchr((char *)ann_arr,ctmp_label) != NULL) {
+    return 0;
+  }
+
+  n_used = strlen((char *)ann_arr);
+
+  /* no room for another character */
+  if (n_used >= MAX_FN) {
+    fprintf(stderr,"*** error [%s:%d] - too many annotation characters: len(%s) + %c > %d\n",
+            __FILE__, __LINE__, ann_arr, ctmp_label, MAX_FN-1);
+    return 0;
+  }
+
+  ann_arr[n_used] = ctmp_label;         /* add the character */
+  ann_arr[n_used+1] = '\0';             /* guarantee null termination */
+  return n_used;
+}
+
+/* **************************************************************** */
+/* get_annot -- produces fasta file from m_msp->sname script
+   (modified 20-Sept-2012 to not use intermediate file)
+
+   if sname[0]=='!', run "script bline_id length" through popen()
+   if sname[0]=='<', read the named file
+   (1) read in the annotations and put them in struct annot_entry;
+   (2) modify *annot_p to point to structure
+   (3) return number of annotations
+
+   bline      -- description line; a leading '>' is dropped and it is
+                 cut at the first blank at or after DESCR_OFFSET
+   offset, n1 -- passed to s_annot_to_aa1a()
+   target     -- passed to next_annot_entry()
+
+   returns: the number of annotations read, 0 if there were none, -1
+            if the source could not be opened or is malformed.
+            Always 0 when UNIX is not defined.
+*/
+/* **************************************************************** */
+
+int
+get_annot(char *sname, struct mngmsg *m_msp, char *bline, long offset, int n1, struct annot_str **annot_p,
+          int target, int debug) {
+
+  char tmp_line[MAX_STR];
+  char bline_descr[MAX_STR];
+  char annot_data_file[MAX_LSTR];
+  char annot_script[MAX_LSTR];
+  long q_offset;
+  int script_len;
+
+  char *bp;
+  FILE *annot_fd=NULL;  /* file for annot accessions */
+  struct annot_mstr mtmp_annot;
+
+#ifndef UNIX
+  return 0;
+#else
+
+  if (sname[0] == '!') {
+    /* popen implementation */
+
+    annot_data_file[0] = '\0';
+
+    if (bline[0] == '>') {
+      SAFE_STRNCPY(bline_descr, bline+1,sizeof(bline_descr));
+    }
+    else {
+      SAFE_STRNCPY(bline_descr, bline,sizeof(bline_descr));
+    }
+    if ((strlen(bline_descr) > DESCR_OFFSET) &&
+        (bp=strchr(bline_descr+DESCR_OFFSET,' '))!=NULL) {*bp = '\0';}
+    else {bp = NULL;}
+
+    q_offset = m_msp->q_offset + m_msp->q_off - 1;
+    if (q_offset < 0) { q_offset = 0;}
+
+    /* NOTE(review): bline_descr goes to the shell inside double
+       quotes only; a description containing '"', '`' or '$' would be
+       interpreted by the shell -- confirm descriptions are trusted */
+    script_len = snprintf(annot_script, sizeof(annot_script), "%s \"%s\" %ld",
+                          sname+1, bline_descr,q_offset+m_msp->n0);
+    if (script_len < 0 || script_len >= (int)sizeof(annot_script)) {
+      /* do not run a command line that was cut short */
+      fprintf(stderr,"*** error [%s:%d] - annotation command too long: %s\n",__FILE__, __LINE__, sname);
+      goto no_annots;
+    }
+
+    annot_fd = popen(annot_script,"r");
+  }
+  else if (sname[0] == '<') {
+    SAFE_STRNCPY(annot_data_file,sname+1,sizeof(annot_data_file));
+    annot_fd=fopen(annot_data_file,"r");
+  }
+  else {
+    fprintf(stderr,"*** error [%s:%d] - %s not script (!) or file (<)\n",__FILE__, __LINE__, sname);
+    goto no_annots;
+  }
+
+  if (!annot_fd) {
+    goto no_annots;
+  }
+  else {        /* read the annotations into the array */
+
+    /* read #comments, =annot_defs at beginning of file */
+    tmp_line[0] = '#';
+    while (tmp_line[0] == '#' || tmp_line[0] == '=') {
+      if (tmp_line[0] == '=') add_annot_def(m_msp, tmp_line+1,1);
+      if (fgets(tmp_line, sizeof(tmp_line), annot_fd)==NULL) {
+        fprintf(stderr,"*** error [%s:%d] - premature annotation file end (%s)\n",
+                __FILE__,__LINE__, annot_data_file);
+        goto close_no_annots;
+      }
+    }
+
+    /* set mtmp_annot to be initialized */
+    mtmp_annot.tmp_arr_p = NULL;
+    mtmp_annot.max_annot = 0;
+
+    /* the first line that is not a comment or definition must be the description */
+    if (tmp_line[0] != '>') {
+      fprintf(stderr,"*** error [%s:%d] - no %s description: [%s]\n",
+              __FILE__,__LINE__,annot_data_file, tmp_line);
+      goto close_no_annots;
+    }
+
+    *annot_p = next_annot_entry(annot_fd, tmp_line, sizeof(tmp_line), *annot_p, &mtmp_annot, m_msp, target);
+
+    if (sname[0] == '!') {
+      pclose(annot_fd);
+    }
+    else {
+      fclose(annot_fd);
+    }
+
+    /* now allocate an annot_p if necessary, and link tmp_ann_entry_arr to it */
+    if (*annot_p) {
+      s_annot_to_aa1a(offset, n1, (*annot_p),m_msp->ann_arr,"get_annot");
+      return (*annot_p)->n_annot;
+    }
+    else {
+      if (mtmp_annot.tmp_arr_p) free(mtmp_annot.tmp_arr_p);
+      return 0;
+    }
+  }
+
+ close_no_annots:
+  /* the stream was opened; a popen() stream must go through pclose() */
+  if (sname[0] == '!') {
+    pclose(annot_fd);
+  }
+  else {
+    fclose(annot_fd);
+  }
+
+ no_annots:
+  return -1;
+#endif
+}
+
+/* s_annot_to_aa1a -- takes an annot_entry[] and converts it to an *aa1_ann
+
+   offset   - subtracted from each site position to obtain its array index
+              (a negative offset is incremented by one before use)
+   n1       - sequence length; the array is allocated with n1+2 entries
+   annot_p  - annotations to convert; annot_p->aa1_ann receives the array
+   ann_arr  - NUL-terminated list of the annotation labels to encode
+   tmp_line - text appended to the out-of-range diagnostics
+
+   'V' entries are skipped.  '-' entries mark a region: '[' is stored at
+   ->pos and ']' at ->end, both used as array indexes without subtracting
+   offset.  All other entries are stored at (pos - offset) when their
+   label is present in ann_arr.
+
+   If the allocation fails an error is reported and annot_p->aa1_ann is
+   left unchanged.
+ */
+void
+s_annot_to_aa1a(long offset, int n1, struct annot_str *annot_p, unsigned char *ann_arr, char *tmp_line) {
+  unsigned char *aa1a_tmp;
+  long aa1a_len;	/* number of entries allocated in aa1a_tmp[] */
+  int i;
+  struct annot_entry *this_annot;
+
+  if ((aa1a_tmp = (unsigned char *)calloc(n1+2,sizeof(char)))==NULL) {
+    fprintf(stderr,"*** error [%s:%d] - cannot allocate aa1a_ann[%d] array\n",
+            __FILE__, __LINE__, n1);
+    return;
+  }
+  aa1a_len = (long)n1 + 2;
+
+  if (offset < 0) offset++;
+
+  for (i=0; i < annot_p->n_annot; i++) {
+    this_annot = &annot_p->annot_arr_p[i];
+    /* skip VAR labels */
+    if (this_annot->label == 'V') { continue; }
+    if (this_annot->label == '-') {
+      /* region boundaries index aa1a_tmp[] directly, so each one must be
+         inside the allocation before it is written */
+      if (this_annot->pos >= 0 && this_annot->pos < aa1a_len) {
+        aa1a_tmp[this_annot->pos]=qascii['['] - NANN;
+      }
+      else {
+        fprintf(stderr, "*** error [%s:%d] - region start [%ld] out of range: %d : %s\n",
+                __FILE__, __LINE__, (long)this_annot->pos, n1, tmp_line);
+      }
+      if (this_annot->end >= 0 && this_annot->end < aa1a_len) {
+        aa1a_tmp[this_annot->end]=qascii[']'] - NANN;
+      }
+      else {
+        fprintf(stderr, "*** error [%s:%d] - region end [%ld] out of range: %d : %s\n",
+                __FILE__, __LINE__, (long)this_annot->end, n1, tmp_line);
+      }
+      continue;
+    }
+    /* only labels listed in ann_arr are encoded */
+    if (strchr((char *)ann_arr, this_annot->label)==NULL) {continue;}
+    if (this_annot->pos - offset < n1) {
+      if (this_annot->pos >= offset) { /* not an error, but annotation must be in range */
+        aa1a_tmp[this_annot->pos - offset]=qascii[this_annot->label] - NANN;
+      }
+    }
+    else {
+      fprintf(stderr, "*** error [%s:%d] - this_annot->pos:[%ld - %ld] out of range: %d : %s\n",
+              __FILE__, __LINE__, this_annot->pos,offset, n1, tmp_line);
+    }
+  }
+  annot_p->aa1_ann = aa1a_tmp;
+}
+
+/* save_best captures much of the complexity of saving the best scores
+ and appropriately sampling the scores for statistical analysis. It
+ does the following:
+
+ (1) update s_info counts for functions like fasta/x/y that don't
+ optimize every score
+
+ (2) for every result in the buffer:
+ (a) decide if it should be used for statistical sampling
+ (b) if the number of samples > MAX_STATS, then run
+ process_hist() and update all the zscores
+ (c) reset everything for next sequence
+
+ (3) must ensure that -BIGNUM are never in best[]
+
+*/
+
+#include "thr_buf_structs.h"
+#ifndef PCOMPLIB
+#define RESULTS_BUF reader_buf
+#define XTERNAL
+#include "thr_bufs2.h"
+#else
+#define RESULTS_BUF worker_buf
+#include "pcomp_bufs.h"
+#endif
+
+extern char *prog_func; /* function label */
+extern int fa_max_workers;
+extern struct buf_head *lib_buf2_list;
+#ifdef DEBUG
+void check_rbuf(struct buf_head *cur_buf);
+#endif
+extern void get_rbuf(struct buf_head **lib_buf, int max_work_buf);
+extern void put_rbuf(struct buf_head *lib_buf, int max_work_buf);
+extern void wait_rbuf(int max_work_buf);
+extern void rbuf_done(int nthreads);
+extern void put_rbuf_done(int nthreads, struct buf_head *lib_buf,
+ int max_work_buf);
+extern int
+process_hist(struct stat_str *sptr, int nstats,
+ const struct mngmsg *m_msg,
+ struct pstruct *ppst,
+ struct hist_str *hist, void **pstat_void, struct score_count_s *s_info, int do_hist);
+
+extern void addhistz(double, struct hist_str *); /* scaleswn.c */
+void selectbestz(struct beststr **, int, int );
+extern double find_z(int score, double escore, int length, double comp, void *);
+extern double zs_to_E(double zs,int n1, int dnaseq, long entries, struct db_str db);
+extern struct beststr **bestp_arr; /* array of pointers */
+extern int nbest;
+extern int nstats, nqstats, nrstats, pre_nstats, kstats, shuff_tot, sstats;
+extern double zbestcut; /* cut off for best z-score */
+extern int bestfull; /* index for selectbest() */
+extern int stats_done; /* flag for z-value processing */
+extern void *rand_state;
+extern struct stat_str *stats; /* array of scores for statistics from real
+ (or shuffled) sequences*/
+extern struct stat_str *qstats; /* array of scores for shuffled query stats */
+extern struct stat_str *rstats; /* array of scores from shuffled library */
+
+/* in the current version (fasta_35_01) save_best is used by both
+ threaded and unthreaded versions */
+
+/* COPY_RST_P(d,s) -- copy the listed ->rst fields (score[0..2],
+   valid_stat, comp, H, escore, segnum, seglen) from *s to *d.  Only
+   these fields are copied.  d and s are pointer expressions; each is
+   parenthesized so that an argument such as p+1 or *pp binds as
+   intended.  Note that both arguments are evaluated several times. */
+#define COPY_RST_P(d,s) \
+{ (d)->rst.score[0] = (s)->rst.score[0]; \
+  (d)->rst.score[1] = (s)->rst.score[1]; \
+  (d)->rst.score[2] = (s)->rst.score[2]; \
+  (d)->rst.valid_stat = (s)->rst.valid_stat; \
+  (d)->rst.comp = (s)->rst.comp; \
+  (d)->rst.H = (s)->rst.H; \
+  (d)->rst.escore = (s)->rst.escore; \
+  (d)->rst.segnum = (s)->rst.segnum; \
+  (d)->rst.seglen = (s)->rst.seglen; \
+}
+
+/* save_best -- fold one buffer of search results into the global
+ best-score list (bestp_arr[]/nbest) and the statistics samples
+ (stats[]/nstats, qstats[]/nqstats).
+
+ lib_bhead_p - buffer with parallel buf2_data[]/buf2_res[] arrays;
+ nothing is done unless hdr.have_results is set and
+ hdr.buf2_cnt > 0
+ m_msp - read for qshuffle, nitt1 (frame that ends a set of
+ results for one sequence) and shuff_max
+ ppst - read for score_ix and zsflag; zsflag_f is written
+ after process_hist()
+ ldb - entries/length/carry are updated once per sequence
+ fdata - if non-NULL, one text line per result is written
+ histp - histogram updated through addhistz() once stats are done
+ pstat_voidp - statistics state passed to find_z()/process_hist()
+ s_info - running score counts, incremented from the buffer
+
+ Globals written: shuff_tot, nstats, sstats, kstats, stats_done,
+ nqstats, nbest, bestfull, zbestcut, stats[], qstats[], bestp_arr[].
+*/
+void
+save_best(struct buf_head *lib_bhead_p,
+ const struct mngmsg *m_msp, struct pstruct *ppst,
+ struct db_str *ldb, FILE *fdata,
+ struct hist_str *histp, void **pstat_voidp,
+ struct score_count_s *s_info)
+{
+ double zscore;
+ int i_score;
+ struct beststr *bbp;
+ struct buf2_data_s *rbuf_dp, *lib_buf2_dp;
+ struct buf2_res_s *rbuf_rp, *lib_buf2_rp;
+ int i, t_best, t_rbest, t_qrbest, tm_best, t_n1, sc_ix;
+ int t_valid_stat, tr_valid_stat, use_shuff, zsflag_save;
+ double e_score, tm_escore, t_rescore, t_qrescore;
+ int buf2_cnt;
+
+ if (!lib_bhead_p->hdr.have_results) return;
+ if ((buf2_cnt=lib_bhead_p->hdr.buf2_cnt) <= 0) return;
+ lib_buf2_dp = lib_bhead_p->buf2_data;
+ lib_buf2_rp = lib_bhead_p->buf2_res;
+
+ /* add this buffer's counts to the running totals */
+ shuff_tot += lib_bhead_p->hdr.shuff_cnt;
+ s_info->s_cnt[0] += lib_bhead_p->s_cnt_info.s_cnt[0];
+ s_info->s_cnt[1] += lib_bhead_p->s_cnt_info.s_cnt[1];
+ s_info->s_cnt[2] += lib_bhead_p->s_cnt_info.s_cnt[2];
+ s_info->tot_scores += lib_bhead_p->s_cnt_info.tot_scores;;
+
+ sc_ix = ppst->score_ix;
+
+ /* t_* values hold the best result seen so far for the current
+ sequence (across its frames); they are reset when the frame
+ equal to m_msp->nitt1 has been processed */
+ t_best = t_rbest = t_qrbest = -BIGNUM;
+ tm_escore = t_rescore = t_qrescore = FLT_MAX;
+ t_valid_stat = tr_valid_stat = 0;
+ /* zsflag values 10..19 select shuffled-sequence scores (r_rst) for
+ the statistics sample */
+ if (ppst->zsflag >= 10 && ppst->zsflag < 20) { use_shuff = 1;}
+ else { use_shuff = 0;}
+
+#ifdef DEBUG
+ if (fdata) {
+ fprintf(fdata,">save_best: %d\n",buf2_cnt);
+ }
+#endif
+
+ buf2_cnt = lib_bhead_p->hdr.buf2_cnt;
+ while (buf2_cnt--) { /* count down the number of results */
+ rbuf_rp = lib_buf2_rp++; /* step through the results buffer */
+ rbuf_dp = lib_buf2_dp++; /* step through the data buffer */
+
+ /* perhaps should use explicit flag to indicate no score */
+ if (rbuf_rp->rst.score[0] == -BIGNUM) continue;
+
+ /* i_score: current raw sorting score */
+ i_score = rbuf_rp->rst.score[sc_ix];
+ /* e_score, current escore */
+ e_score = rbuf_rp->rst.escore;
+
+ /* this should be done in the thread, and a sorted set of indexes
+ should be produced by the thread, so we just go down the list
+ to the zscore threshold */
+ /* until stats_done is set, the raw score stands in for the z-score */
+ zscore = (double)i_score;
+ if (stats_done) {
+ zscore=find_z(i_score, e_score, rbuf_dp->seq->n1,(double)rbuf_rp->rst.comp,
+ *pstat_voidp);
+ }
+
+ /* we have complex logic to decide:
+ (a) for multiframe results, which is the best
+ (b) information about valid stats
+ we should simply return a stats array where all this is figured
+ out in the thread.
+ */
+ t_n1 = rbuf_dp->seq->n1;
+ /* NOTE(review): tm_best is assigned only on this line; it would be
+ read unset below if no i_score exceeded -BIGNUM -- confirm that
+ cannot happen */
+ if (i_score > t_best) tm_best = t_best = i_score;
+ if (e_score < tm_escore) tm_escore = e_score;
+ if (rbuf_rp->rst.valid_stat > t_valid_stat) {
+ t_valid_stat = 1;
+ }
+
+ /* this stuff happens only for fasts/fastm/fastf
+ again, the t_qrbest stuff should be done in the thread
+ rather than check for every hit, run through the loop
+ only if necessary.
+ */
+ if (m_msp->qshuffle) {
+ if (rbuf_rp->qr_score > t_qrbest)
+ t_qrbest = rbuf_rp->qr_score;
+ if (rbuf_rp->qr_escore < t_qrescore)
+ t_qrescore = rbuf_rp->qr_escore;
+
+ /* at the last frame, append the best shuffled-query score to
+ qstats[] if it is positive and there is room */
+ if (rbuf_dp->frame == m_msp->nitt1 && t_qrbest > 0 && nqstats < m_msp->shuff_max) {
+ qstats[nqstats].n1 = rbuf_dp->seq->n1; /* save the best score */
+ qstats[nqstats].comp = rbuf_rp->rst.comp;
+ qstats[nqstats].H = rbuf_rp->rst.H;
+ qstats[nqstats].escore = t_qrescore;
+ qstats[nqstats++].score = t_qrbest;
+ t_qrbest = -BIGNUM; /* reset t_qrbest, t_qrescore */
+ t_qrescore = FLT_MAX;
+ }
+ } /* m_msp->qshuffle */
+
+ if (use_shuff) {
+ /* this check is required because some sequences scheduled to be
+ used for statistics may not in fact be returning a score (if
+ they are outside the -M range, for example.
+ */
+ if (rbuf_rp->r_rst.score[0] == -BIGNUM) { tr_valid_stat = 0; }
+ if (rbuf_rp->r_rst.valid_stat > tr_valid_stat) {
+ tr_valid_stat = 1;
+ }
+ if (rbuf_rp->r_rst.score[sc_ix] > t_rbest) {
+ t_rbest = rbuf_rp->r_rst.score[sc_ix];
+ t_rescore = rbuf_rp->r_rst.escore;
+ }
+ }
+
+ /* need to look for frame 0 if TFASTA, then save stats at frame 6 */
+ /* optional dump of this result; negative shuffled scores are
+ written as -1 */
+ if (fdata) {
+ fprintf(fdata,
+ "%-12s %6d %d %.5f %.5f %4d %4d %4d %2d %2d %4d %4d %4d %2d %2d %5d %8lld\n",
+ rbuf_dp->mseq->libstr, rbuf_dp->seq->n1,rbuf_dp->frame,rbuf_rp->rst.comp,rbuf_rp->rst.H,
+ rbuf_rp->rst.score[0],rbuf_rp->rst.score[1],rbuf_rp->rst.score[2],
+ t_valid_stat, rbuf_rp->rst.alg_info,
+ (rbuf_rp->r_rst.score[0]<0 ? -1 : rbuf_rp->r_rst.score[0]),
+ (rbuf_rp->r_rst.score[1]<0 ? -1 : rbuf_rp->r_rst.score[1]),
+ (rbuf_rp->r_rst.score[2]<0 ? -1 : rbuf_rp->r_rst.score[2]),
+ tr_valid_stat, rbuf_rp->r_rst.alg_info,
+ rbuf_dp->stats_idx, rbuf_dp->mseq->lseek);
+ }
+
+ /* statistics done for best score of set */
+
+ /* count the sequence and its length once, at the last frame */
+ if (rbuf_dp->frame == m_msp->nitt1) {
+ ldb->entries++;
+ ldb->length += t_n1;
+ /* NOTE(review): this carry test can only fire if ldb->length is
+ wider than long -- confirm its declared type */
+ if (ldb->length > LONG_MAX) {
+ ldb->length -= LONG_MAX; ldb->carry++;
+ }
+ }
+
+ if (rbuf_dp->frame == m_msp->nitt1 && ppst->zsflag >= 0) {
+ /* if this sample should be used for statistics */
+ if (use_shuff) t_valid_stat = tr_valid_stat;
+ if (t_valid_stat) {
+ /* we've got our initial MAX_STATS values */
+ if (nstats >= MAX_STATS) {
+ if (!stats_done) {
+ /* process_hist() is called with zsflag reduced by 20 when it
+ is above 20; the caller's value is restored afterwards */
+ zsflag_save = ppst->zsflag;
+ if (ppst->zsflag > 20) {
+ ppst->zsflag -= 20;
+ }
+ ppst->zsflag_f = process_hist(stats,nstats,m_msp, ppst,
+ histp, pstat_voidp,s_info, 0);
+ ppst->zsflag = zsflag_save;
+ kstats = nstats;
+ if (ppst->zsflag >= 0) { /* this is redundant, but rare */
+ stats_done = 1;
+ /* replace the raw scores saved as z-scores with real ones.
+ NOTE(review): the bound is nstats although bestp_arr[]
+ holds nbest entries -- confirm this is intended */
+ for (i=0; i< nstats; i++) {
+ bestp_arr[i]->zscore =
+ find_z(bestp_arr[i]->rst.score[ppst->score_ix],
+ bestp_arr[i]->rst.escore, bestp_arr[i]->seq->n1,
+ bestp_arr[i]->rst.comp, *pstat_voidp);
+ }
+ }
+ }
+ }
+ else {
+ /* this logic allows stats_idx to be over-ruled for searches
+ where every query does not generate a score */
+ rbuf_dp->stats_idx = nstats;
+ nstats++;
+ }
+ }
+
+ /* store the best score of this sequence in its stats[] slot */
+ if (rbuf_dp->stats_idx >= 0 && t_valid_stat) {
+ if (rbuf_dp->stats_idx >= MAX_STATS || nstats > MAX_STATS) {
+ fprintf(stderr, "*** error [%s:%d] - nstats index [%d] out of range [%d,%d]\n",
+ __FILE__, __LINE__,
+ rbuf_dp->stats_idx, nstats,MAX_STATS);
+ }
+ else { /* stats_idx is in range */
+ sstats++;
+ stats[rbuf_dp->stats_idx].n1 = t_n1;
+ stats[rbuf_dp->stats_idx].comp = rbuf_rp->rst.comp;
+ stats[rbuf_dp->stats_idx].H = rbuf_rp->rst.H;
+ if (use_shuff) { /* use shuffled score */
+ stats[rbuf_dp->stats_idx].escore = t_rescore;
+ stats[rbuf_dp->stats_idx].score = t_rbest;
+ }
+ else { /* real score, not shuffled */
+ stats[rbuf_dp->stats_idx].escore = tm_escore;
+ stats[rbuf_dp->stats_idx].score = tm_best;
+ }
+ } /* end stats_idx in range */
+ } /* end have valid stats_idx */
+
+ if (t_valid_stat && stats_done && histp) {
+ addhistz(find_z(t_best, tm_escore, rbuf_dp->seq->n1, (double) rbuf_rp->rst.comp,
+ *pstat_voidp), histp);
+ }
+ /* reset best scores */
+ /* (t_qrbest/t_qrescore are not reset here; they are reset only
+ when a qstats[] entry is saved above) */
+ t_best = t_rbest = -BIGNUM;
+ tm_escore = t_rescore = FLT_MAX;
+ t_valid_stat = tr_valid_stat = 0;
+ }
+
+ /*
+ if (rbuf_rp->rst.score[ppst->score_ix] > 200) {
+ fprintf(stderr, "high score[%d]: %s %d: %d\n", rbuf_dp->seq->index,
+ rbuf_dp->mseq->libstr, rbuf_dp->seq->n1, rbuf_rp->rst.score[ppst->score_ix]);
+ }
+ */
+
+ /* keep this result if it beats the current cutoff */
+ if (zscore > zbestcut) {
+ /* list full: keep nbest - MAX_BEST/4 entries and raise the cutoff
+ to the z-score at the new boundary (selectbestz() presumably
+ partitions bestp_arr[] around that rank -- TODO confirm) */
+ if (nbest >= MAX_BEST) {
+ bestfull = nbest-MAX_BEST/4;
+ selectbestz(bestp_arr,bestfull-1,nbest);
+ zbestcut = bestp_arr[bestfull-1]->zscore;
+ nbest = bestfull;
+ }
+ bbp = bestp_arr[nbest++];
+
+ COPY_RST_P(bbp, rbuf_rp);
+
+ bbp->seq = rbuf_dp->seq;
+ bbp->mseq = rbuf_dp->mseq;
+ bbp->n1 = rbuf_dp->seq->n1;
+#ifdef DEBUG
+ bbp->adler32_crc = rbuf_dp->seq->adler32_crc;
+#endif
+ /* rbuf_dp->best_save is set after a rbuf_dp is entered into best_str */
+ /* NOTE(review): when best_save is NULL, bbp->bbp_link is left at
+ whatever value the reused slot held -- confirm it is cleared
+ elsewhere */
+ if (rbuf_dp->best_save) {
+ /* a previous rbuf_dp->seq is in best_str at best_save */
+ if (rbuf_dp->best_save->seq == rbuf_dp->seq) {
+ /* the best_save->seq matches the rbuf_dp->seq */
+ bbp->bbp_link = rbuf_dp->best_save;
+ /* bbp_link tells where this ->seq can be found */
+ }
+ else {
+ bbp->bbp_link = NULL;
+ }
+ }
+ rbuf_dp->best_save = bbp;
+ lib_bhead_p->hdr.have_best_save = 1;
+ bbp->zscore = zscore;
+ bbp->frame = rbuf_dp->frame;
+ }
+ }
+}
+
+/* save_best2 -- variant of save_best() that relies on the per-result
+ is_valid_stat flag instead of tracking the best score per sequence
+ itself.  The buffer is walked in separate passes:
+
+ (1) if fdata is non-NULL, write one text line per scored result
+ (2) save results with zscore > zbestcut in bestp_arr[]
+ (3) for results with is_valid_stat set, fill stats[]
+ (4) if m_msp->qshuffle, fill qstats[]
+ (5) once nstats >= MAX_STATS and stats are not yet done, run
+ process_hist() and recompute saved z-scores
+
+ Arguments are the same as for save_best().  Nothing is done unless
+ lib_bhead_p->hdr.have_results is set and hdr.buf2_cnt > 0.
+*/
+void
+save_best2(struct buf_head *lib_bhead_p,
+ const struct mngmsg *m_msp, struct pstruct *ppst,
+ struct db_str *ldb, FILE *fdata,
+ struct hist_str *histp, void **pstat_voidp,
+ struct score_count_s *s_info)
+{
+ double zscore;
+ int i_score;
+ struct beststr *bbp;
+ struct buf2_data_s *rbuf_dp, *lib_buf2_dp;
+ struct buf2_res_s *rbuf_rp, *lib_buf2_rp;
+ int i, sc_ix;
+ /* t_valid_stat is declared but not used in this function */
+ int t_valid_stat, use_shuff, zsflag_save;
+ double e_score;
+ int buf2_cnt;
+
+ if (!lib_bhead_p->hdr.have_results) return;
+ if ((buf2_cnt = lib_bhead_p->hdr.buf2_cnt) <= 0) return;
+
+ /*
+#ifdef DEBUG
+ fprintf(stderr," save_best2: lib_bhead_p->buf2_data[0]->mseq->index/lseek: %d,%lld\n",
+ lib_bhead_p->buf2_data[0].mseq->index,lib_bhead_p->buf2_data[0].mseq->lseek);
+#endif
+ */
+ /* zsflag values 10..19 select shuffled-sequence scores (r_rst) for
+ the statistics sample */
+ if (ppst->zsflag >= 10 && ppst->zsflag < 20) { use_shuff = 1;}
+ else {use_shuff = 0;}
+
+ /* add this buffer's counts to the running totals */
+ shuff_tot += lib_bhead_p->hdr.shuff_cnt;
+ s_info->s_cnt[0] += lib_bhead_p->s_cnt_info.s_cnt[0];
+ s_info->s_cnt[1] += lib_bhead_p->s_cnt_info.s_cnt[1];
+ s_info->s_cnt[2] += lib_bhead_p->s_cnt_info.s_cnt[2];
+ s_info->tot_scores += lib_bhead_p->s_cnt_info.tot_scores;;
+ sc_ix = ppst->score_ix;
+
+ /* save the raw data if requested */
+ if (fdata) {
+#ifdef DEBUG
+ fprintf(fdata,">save_best: %d\n",buf2_cnt);
+#endif
+ lib_buf2_dp = lib_bhead_p->buf2_data;
+ lib_buf2_rp = lib_bhead_p->buf2_res;
+ buf2_cnt = lib_bhead_p->hdr.buf2_cnt;
+ while (buf2_cnt--) { /* count down the number of results */
+
+ rbuf_rp = lib_buf2_rp++; /* step through the results buffer */
+ rbuf_dp = lib_buf2_dp++; /* step through the data buffer */
+
+ /* perhaps should use explicit flag to indicate no score */
+ if (rbuf_rp->rst.score[0] == -BIGNUM) continue;
+
+ /* is_valid_stat is written in both valid-stat columns; negative
+ shuffled scores are written as -1 */
+ fprintf(fdata,
+ "%-12s %6d %d %.5f %.5f %4d %4d %4d %2d %2d %4d %4d %4d %2d %2d %5d %8lld\n",
+ rbuf_dp->mseq->libstr, rbuf_dp->seq->n1,rbuf_dp->frame,rbuf_rp->rst.comp,rbuf_rp->rst.H,
+ rbuf_rp->rst.score[0],rbuf_rp->rst.score[1],rbuf_rp->rst.score[2],
+ rbuf_rp->is_valid_stat, rbuf_rp->rst.alg_info,
+ (rbuf_rp->r_rst.score[0]<0 ? -1 : rbuf_rp->r_rst.score[0]),
+ (rbuf_rp->r_rst.score[1]<0 ? -1 : rbuf_rp->r_rst.score[1]),
+ (rbuf_rp->r_rst.score[2]<0 ? -1 : rbuf_rp->r_rst.score[2]),
+ rbuf_rp->is_valid_stat, rbuf_rp->r_rst.alg_info,
+ rbuf_dp->stats_idx, rbuf_dp->mseq->lseek);
+ }
+ }
+
+ /* save the high-scoring data */
+ lib_buf2_rp = lib_bhead_p->buf2_res;
+ lib_buf2_dp = lib_bhead_p->buf2_data;
+ buf2_cnt = lib_bhead_p->hdr.buf2_cnt;
+ while (buf2_cnt--) {
+ rbuf_rp = lib_buf2_rp++;
+ rbuf_dp = lib_buf2_dp++;
+
+ /* perhaps should use explicit flag to indicate no score */
+ if (rbuf_rp->rst.score[0] == -BIGNUM) continue;
+
+ /* i_score: current raw sorting score */
+ i_score = rbuf_rp->rst.score[sc_ix];
+ /* e_score, current escore */
+ e_score = rbuf_rp->rst.escore;
+ /* this should be done in the thread, and a sorted set of indexes
+ should be produced by the thread, so we just go down the list
+ to the zscore threshold */
+ /* until stats_done is set, the raw score stands in for the z-score */
+ zscore = (double)i_score;
+ if (stats_done) {
+ zscore=find_z(i_score, e_score, rbuf_dp->seq->n1,(double)rbuf_rp->rst.comp,
+ *pstat_voidp);
+ }
+
+ /* count the sequence and its length once, at the last frame */
+ if (rbuf_dp->frame == m_msp->nitt1) {
+ ldb->entries++;
+ ldb->length += rbuf_dp->seq->n1;
+ /* NOTE(review): this carry test can only fire if ldb->length is
+ wider than long -- confirm its declared type */
+ if (ldb->length > LONG_MAX) {
+ ldb->length -= LONG_MAX; ldb->carry++;
+ }
+ }
+
+ if (zscore > zbestcut) {
+ /* list full: keep nbest - MAX_BEST/4 entries and raise the cutoff
+ to the z-score at the new boundary (selectbestz() presumably
+ partitions bestp_arr[] around that rank -- TODO confirm) */
+ if (nbest >= MAX_BEST) {
+ bestfull = nbest-MAX_BEST/4;
+ selectbestz(bestp_arr,bestfull-1,nbest);
+ zbestcut = bestp_arr[bestfull-1]->zscore;
+ nbest = bestfull;
+ }
+ bbp = bestp_arr[nbest++];
+
+ COPY_RST_P(bbp, rbuf_rp);
+
+ bbp->seq = rbuf_dp->seq;
+ bbp->mseq = rbuf_dp->mseq;
+ bbp->n1 = rbuf_dp->seq->n1;
+#ifdef DEBUG
+ bbp->adler32_crc = rbuf_dp->seq->adler32_crc;
+#endif
+ /* rbuf_dp->best_save is set after a rbuf_dp is entered into best_str */
+ /* NOTE(review): when best_save is NULL, bbp->bbp_link is left at
+ whatever value the reused slot held -- confirm it is cleared
+ elsewhere */
+ if (rbuf_dp->best_save) {
+ /* a previous rbuf_dp->seq is in best_str at best_save */
+ if (rbuf_dp->best_save->seq == rbuf_dp->seq) {
+ /* the best_save->seq matches the rbuf_dp->seq */
+ bbp->bbp_link = rbuf_dp->best_save;
+ /* bbp_link tells where this ->seq can be found */
+ }
+ else {
+ bbp->bbp_link = NULL;
+ }
+ }
+ rbuf_dp->best_save = bbp;
+ lib_bhead_p->hdr.have_best_save = 1;
+ bbp->zscore = zscore;
+ bbp->frame = rbuf_dp->frame;
+ }
+ }
+
+ /* process results for statistics */
+ lib_buf2_dp = lib_bhead_p->buf2_data;
+ lib_buf2_rp = lib_bhead_p->buf2_res;
+ buf2_cnt = lib_bhead_p->hdr.buf2_cnt;
+ while (buf2_cnt--) { /* count down the number of results */
+ rbuf_dp = lib_buf2_dp++; /* step through the results buffer */
+ rbuf_rp = lib_buf2_rp++; /* step through the results buffer */
+
+ /* only results flagged as the statistics score are sampled */
+ if (!rbuf_rp->is_valid_stat) { continue;}
+
+
+ if (use_shuff) {
+ i_score = rbuf_rp->r_rst.score[sc_ix];
+ e_score = rbuf_rp->r_rst.escore;
+ }
+ else {
+ i_score = rbuf_rp->rst.score[sc_ix];
+ e_score = rbuf_rp->rst.escore;
+ }
+
+ /* the incoming stats_idx is range-checked before it may be
+ replaced below */
+ if (rbuf_dp->stats_idx >= MAX_STATS || nstats > MAX_STATS) {
+ fprintf(stderr, "*** error [%s:%d] - nstats index [%d] out of range [%d,%d]\n",
+ __FILE__, __LINE__,
+ rbuf_dp->stats_idx, nstats,MAX_STATS);
+ continue;
+ }
+
+ if (nstats < MAX_STATS) {
+ /* this logic allows stats_idx to be over-ruled for searches
+ where every query does not generate a score */
+ rbuf_dp->stats_idx = nstats;
+ nstats++;
+ }
+
+ if (stats_done && histp) {
+ addhistz(find_z(i_score, e_score, rbuf_dp->seq->n1, (double) rbuf_rp->rst.comp,
+ *pstat_voidp), histp);
+ }
+
+ /* once stats[] is full, stats_idx keeps its incoming value; a
+ negative value means this result has no stats[] slot */
+ if (rbuf_dp->stats_idx < 0) {
+ continue;
+ }
+
+ sstats++;
+ stats[rbuf_dp->stats_idx].n1 = rbuf_dp->seq->n1;
+ stats[rbuf_dp->stats_idx].comp = rbuf_rp->rst.comp;
+ stats[rbuf_dp->stats_idx].H = rbuf_rp->rst.H;
+ stats[rbuf_dp->stats_idx].escore = e_score;
+ stats[rbuf_dp->stats_idx].score = i_score;
+ }
+
+
+ /* fill the qstats[] array if m_msp->qshuffle */
+ if (m_msp->qshuffle && nqstats < m_msp->shuff_max) {
+ lib_buf2_dp = lib_bhead_p->buf2_data;
+ lib_buf2_rp = lib_bhead_p->buf2_res;
+ buf2_cnt = lib_bhead_p->hdr.buf2_cnt;
+ while (buf2_cnt--) {
+ rbuf_rp = lib_buf2_rp++; /* step through the results buffer */
+ rbuf_dp = lib_buf2_dp++;
+
+ /* the room check is repeated because nqstats grows in this loop */
+ if (rbuf_rp->is_valid_stat && rbuf_rp->qr_score > 0
+ && nqstats < m_msp->shuff_max) {
+ qstats[nqstats].n1 = rbuf_dp->seq->n1; /* save the best score */
+ qstats[nqstats].comp = rbuf_rp->rst.comp;
+ qstats[nqstats].H = rbuf_rp->rst.H;
+ qstats[nqstats].escore = rbuf_rp->qr_escore;
+ qstats[nqstats++].score = rbuf_rp->qr_score;
+ }
+ } /* m_msp->qshuffle */
+ }
+
+ /* check if we have enough data to do stats */
+ if (!stats_done && nstats >= MAX_STATS) {
+ /* process_hist() is called with zsflag reduced by 20 when it is
+ above 20; the caller's value is restored afterwards */
+ zsflag_save = ppst->zsflag;
+ if (ppst->zsflag > 20) {
+ ppst->zsflag -= 20;
+ }
+ ppst->zsflag_f = process_hist(stats,nstats,m_msp, ppst,
+ histp, pstat_voidp,s_info, 0);
+ ppst->zsflag = zsflag_save;
+ kstats = nstats;
+ stats_done = 1;
+ /* replace the raw scores saved as z-scores with real ones.
+ NOTE(review): the bound is nstats although bestp_arr[] holds
+ nbest entries -- confirm this is intended */
+ for (i=0; i< nstats; i++) {
+ bestp_arr[i]->zscore =
+ find_z(bestp_arr[i]->rst.score[ppst->score_ix],
+ bestp_arr[i]->rst.escore, bestp_arr[i]->seq->n1,
+ bestp_arr[i]->rst.comp, *pstat_voidp);
+ }
+ }
+
+}
+
+/* save_shuf -- collect shuffled-sequence scores (r_rst) from one result
+ buffer into the global rstats[] sample.
+
+ lib_bhead_p - buffer with parallel buf2_data[]/buf2_res[] arrays
+ nitt1 - frame value that ends the set of results for one sequence
+ shuff_max - capacity of rstats[]
+ sc_ix - index of the r_rst.score[] entry to use
+ s_info - running score counts, incremented from the buffer
+
+ The first shuff_max samples are appended (nrstats counts them).
+ After that a slot is drawn with my_nrand(++kstats, rand_state) and
+ the sample is dropped when the drawn slot is >= shuff_max.
+*/
+void
+save_shuf(struct buf_head *lib_bhead_p, int nitt1, int shuff_max, int sc_ix,
+ struct score_count_s *s_info)
+{
+ struct buf2_data_s *rbuf_dp, *lib_buf2_dp;
+ struct buf2_res_s *rbuf_rp, *lib_buf2_rp;
+ int t_valid_stat;
+ int t_rbest;
+ double t_rescore;
+ int buf2_cnt, jstats;
+ /* function-local counter kept across calls; it hides the file-level
+ extern kstats inside this function */
+ static int kstats=0;
+
+
+ lib_buf2_dp = lib_bhead_p->buf2_data;
+ lib_buf2_rp = lib_bhead_p->buf2_res;
+ buf2_cnt = lib_bhead_p->hdr.buf2_cnt;
+
+ s_info->s_cnt[0] += lib_bhead_p->s_cnt_info.s_cnt[0];
+ s_info->s_cnt[1] += lib_bhead_p->s_cnt_info.s_cnt[1];
+ s_info->s_cnt[2] += lib_bhead_p->s_cnt_info.s_cnt[2];
+
+ s_info->tot_scores += lib_bhead_p->s_cnt_info.tot_scores;
+ /* this is done because we are not using r_rst->valid_stat to limit selection of scores */
+ /* s_info->s_cnt[sc_ix] = s_info->tot_scores; */
+
+ t_rbest = -BIGNUM;
+ t_valid_stat = 0;
+
+ while (buf2_cnt--) { /* count down the number of results */
+ rbuf_dp = lib_buf2_dp++; /* step through the results buffer */
+ rbuf_rp = lib_buf2_rp++; /* step through the results buffer */
+
+ /* perhaps should use explicit flag to indicate no score */
+ if (rbuf_rp->r_rst.score[0] == -BIGNUM) continue;
+
+ /* track the best shuffled score (and its escore) for this sequence */
+ if (rbuf_rp->r_rst.score[sc_ix] > t_rbest) {
+ t_rbest = rbuf_rp->r_rst.score[sc_ix];
+ t_rescore = rbuf_rp->r_rst.escore;
+ }
+
+ /* NOTE(review): t_valid_stat is never set back to 0 inside this
+ loop, so once one valid score is seen every later sequence is
+ treated as valid -- confirm this is intended */
+ if (rbuf_rp->r_rst.valid_stat > t_valid_stat) {
+ t_valid_stat = 1;
+ }
+
+ /* statistics done for best score of set */
+ /* currently no check for rst->valid_stat, which causes
+ over-estimates of shuffles */
+
+ if (rbuf_dp->frame == nitt1) {
+ if (t_valid_stat) {
+ if (nrstats < shuff_max ) { kstats = jstats = nrstats++; }
+ else { /* randomly replace */
+ jstats = my_nrand(++kstats,rand_state);
+ if (jstats >= shuff_max) goto done;
+ }
+
+ rstats[jstats].n1 = rbuf_dp->seq->n1;
+ rstats[jstats].comp = rbuf_rp->r_rst.comp;
+ rstats[jstats].H = rbuf_rp->r_rst.H;
+ rstats[jstats].escore = t_rescore;
+ rstats[jstats].score = t_rbest;
+ /* NOTE(review): t_rbest is reset only inside the t_valid_stat
+ branch; while t_valid_stat is 0 the best score carries over
+ to the next sequence -- confirm */
+ done:
+ t_rbest = -BIGNUM;
+ }
+ }
+ }
+}
+
+/* save_align -- copy the alignment results held in one buffer into the
+   bestp_arr[] entries they belong to.
+
+   Each buf2_ares[] entry names its target through ->best_idx.  An entry
+   is copied (have_ares and a_res) only when the target has no a_res
+   yet; in DEBUG builds a second result for the same target is reported.
+
+   Afterwards the buffer is marked empty (hdr.have_results and
+   hdr.buf2_cnt are set to 0).
+
+   Returns the number of buffer entries examined, or 0 when the buffer
+   holds no results.
+*/
+int
+save_align(struct buf_head *lib_bhead_p, struct beststr **bestp_arr)
+{
+  struct buf2_ares_s *ares_p;
+  struct beststr *best_p;
+  int n_results, k;
+
+  if (!lib_bhead_p->hdr.have_results) return 0;
+
+  n_results = lib_bhead_p->hdr.buf2_cnt;
+  if (n_results <= 0) return 0;
+
+  ares_p = lib_bhead_p->buf2_ares;
+  for (k = 0; k < n_results; k++, ares_p++) {
+    best_p = bestp_arr[ares_p->best_idx];
+
+    if (best_p->a_res == NULL) {
+      best_p->have_ares = ares_p->have_ares;
+      best_p->a_res = ares_p->a_res;
+    }
+#ifdef DEBUG
+    else {
+      fprintf(stderr,"*** error [%s:%d] - attempt to re-save a_res for [%d]: %s\n",
+              __FILE__, __LINE__, ares_p->best_idx, best_p->mseq->bline);
+    }
+#endif
+  }
+
+  /* the buffer has been consumed */
+  lib_bhead_p->hdr.have_results = 0;
+  lib_bhead_p->hdr.buf2_cnt = 0;
+  return n_results;
+}
+
+/* buf_do_work fills in the lib_bhead_p->buf2_res[] array with the
+ do_work() results,
+
+ inputs: **aa0, n0 (query)
+ lib_bhead_p->buf2_data lib_bhead_p->hdr.buf2_cnt library sequences
+ max_frame (used to set statistics info)
+ ppst,
+ void **f_str prepared by init_work() (indexed by frame)
+
+ results: lib_bhead_p->buf2_res[]
+
+ included in buf2_res[] is is_valid_stat, which captures the
+ logic required to decide whether a value should be saved
+ in the stats[] buffer. This complexity mostly arises
+ because there can be more scores than sequences, but there
+ can be only one statistics score per sequence (the best score).
+
+ lib_bhead_p->s_cnt_info is zeroed before the scan and
+ lib_bhead_p->hdr.have_results is set to 1 afterwards.
+*/
+void
+buf_do_work(unsigned char **aa0, int n0,
+ struct buf_head *lib_bhead_p,
+ int max_frame,
+ struct pstruct *ppst, void **f_str) {
+
+ int buf2_cnt;
+ unsigned long atmp; /* used only in the DEBUG CRC check */
+ struct buf2_data_s *lib_buf2_dp;
+ struct buf2_res_s *lib_buf2_rp, *t_best_rp;
+ int t_best, sc_ix;
+ double t_escore;
+
+ sc_ix = ppst->score_ix;
+
+ lib_bhead_p->s_cnt_info.s_cnt[0] = lib_bhead_p->s_cnt_info.s_cnt[1] =
+ lib_bhead_p->s_cnt_info.s_cnt[2] = lib_bhead_p->s_cnt_info.tot_scores = 0;
+
+ buf2_cnt = lib_bhead_p->hdr.buf2_cnt;
+ lib_buf2_dp = lib_bhead_p->buf2_data;
+ lib_buf2_rp = lib_bhead_p->buf2_res;
+
+ /* best result seen so far for the current sequence; reset each time
+ the frame equal to max_frame has been processed */
+ t_best_rp = NULL;
+ t_best = -BIGNUM;
+ t_escore = 1000.0;
+
+ while (buf2_cnt-- > 0) {
+
+ /* -BIGNUM marks "no score" until do_work() fills the entry in */
+ lib_buf2_rp->rst.score[0] =
+ lib_buf2_rp->rst.score[1] =
+ lib_buf2_rp->rst.score[2] = -BIGNUM;
+
+ lib_buf2_rp->is_valid_stat = 0;
+
+ /* sequences outside [n1_low, n1_high] are not scored */
+ if (lib_buf2_dp->seq->n1 < ppst->n1_low ||
+ lib_buf2_dp->seq->n1 > ppst->n1_high ) {
+ /* tells save_best() there is no stats score here -- not
+ necessary as -BIGNUM indicates no score */
+ lib_buf2_dp->stats_idx = -1;
+ goto next_seq;
+ }
+
+#ifdef DEBUG
+ if (check_seq_range(lib_buf2_dp->seq->aa1b, lib_buf2_dp->seq->n1,
+ ppst->nsqx, "buf_do_work()")) {
+ fprintf(stderr, "*** error [%s:%d] - [%s/buf_do_work] range error at: %d/%d (n1:%d)\n",
+ __FILE__, __LINE__,
+ prog_func,lib_bhead_p->hdr.buf2_cnt - (buf2_cnt+1),
+ lib_bhead_p->hdr.buf2_cnt, lib_buf2_dp->seq->n1);
+ goto next_seq;
+ };
+
+ /* also check for adler32_crc match */
+ if (lib_buf2_dp->seq->adler32_crc != (atmp=adler32(1L,lib_buf2_dp->seq->aa1b, lib_buf2_dp->seq->n1))) {
+ fprintf(stderr, "*** error [%s:%d] - [%s/buf_do_work] CRC error [%lu!=%lu] at: %d/%d (n1:%d/l_offset:%ld)\n",
+ __FILE__, __LINE__,
+ prog_func,lib_buf2_dp->seq->adler32_crc, atmp,
+ lib_bhead_p->hdr.buf2_cnt - (buf2_cnt+1),
+ lib_bhead_p->hdr.buf2_cnt,lib_buf2_dp->seq->n1,
+ lib_buf2_dp->seq->l_offset);
+ goto next_seq;
+ }
+#endif
+
+ /* score this frame; counts accumulate in lib_bhead_p->s_cnt_info */
+ do_work (aa0[lib_buf2_dp->frame], n0,
+ lib_buf2_dp->seq->aa1b, lib_buf2_dp->seq->n1,
+ lib_buf2_dp->frame, ppst, f_str[lib_buf2_dp->frame], 0, 0,
+ &(lib_buf2_rp->rst), &(lib_bhead_p->s_cnt_info));
+
+ /* t_best_rp ends up on the last entry that improved either the
+ escore or the score for this sequence */
+ if (lib_buf2_rp->rst.valid_stat) {
+ if (lib_buf2_rp->rst.escore < t_escore) {
+ t_escore = lib_buf2_rp->rst.escore;
+ t_best_rp = lib_buf2_rp;
+ }
+ if (lib_buf2_rp->rst.score[sc_ix] > t_best) {
+ t_best = lib_buf2_rp->rst.score[sc_ix];
+ t_best_rp = lib_buf2_rp;
+ }
+ }
+
+ /* last frame of the sequence: flag its best entry, then reset */
+ if (lib_buf2_dp->frame == max_frame) {
+ if (t_best_rp!=NULL) {
+ t_best_rp->is_valid_stat = 1;
+ t_best_rp = NULL;
+ }
+ t_best = -BIGNUM;
+ t_escore = 1000.0;
+ }
+
+ next_seq:
+ lib_buf2_dp++;
+ lib_buf2_rp++;
+ }
+
+ /* place to produce z_scores */
+ /* place to produce sorted array */
+
+ lib_bhead_p->hdr.have_results = 1;
+}
+
+/* buf_do_align -- build an alignment result (a_res) for every library
+   sequence in the buffer.
+
+   inputs:  **aa0 (query, indexed by frame), n0
+            lib_bhead_p->buf2_data[], lib_bhead_p->hdr.buf2_cnt
+            ppst, m_msp
+            **f_str (indexed by frame)
+
+   results: lib_bhead_p->buf2_ares[].a_res / .have_ares from
+            build_ares_code(); when m_msp->stages > 1,
+            buf2_res[].rst.score[2] is first refreshed from do_opt().
+            lib_bhead_p->hdr.have_results is set to 1 on exit.
+
+   In DEBUG builds a NULL aa1b stops the scan early; range and CRC
+   problems are only reported.
+*/
+void
+buf_do_align(unsigned char **aa0, int n0,
+             struct buf_head *lib_bhead_p,
+             struct pstruct *ppst, const struct mngmsg *m_msp,
+             void **f_str) {
+
+  struct buf2_data_s *dp;
+  struct buf2_res_s *rp;
+  struct buf2_ares_s *ap;
+  struct rstruct opt_rst;
+  int n_total, k;
+
+  n_total = lib_bhead_p->hdr.buf2_cnt;
+  dp = lib_bhead_p->buf2_data;
+  rp = lib_bhead_p->buf2_res;
+  ap = lib_bhead_p->buf2_ares;
+
+  /* k is the position of the current entry in the buffer */
+  for (k = 0; k < n_total; k++, dp++, ap++, rp++) {
+
+    if ( m_msp->stages > 1) {
+      /* this is not typically done unless m_msp->stages > 1 */
+      do_opt (aa0[dp->frame], n0, dp->seq->aa1b, dp->seq->n1,
+              dp->frame, ppst, f_str[dp->frame], &opt_rst);
+      rp->rst.score[2]=opt_rst.score[2];
+    }
+
+#ifdef DEBUG
+    if (dp->seq->aa1b == NULL) {
+      fprintf(stderr,"*** error [%s:%d] - [buf_do_align] null aa1b\n",__FILE__, __LINE__);
+      ap->a_res = NULL;
+      break;
+    }
+    if (check_seq_range(dp->seq->aa1b, dp->seq->n1,
+                        ppst->nsqx, "buf_do_align()")) {
+      fprintf(stderr, "*** error [%s:%d] - [%s/buf_do_align] range error at: %d/%d (n1:%d)\n",
+              __FILE__, __LINE__,
+              prog_func, k,
+              lib_bhead_p->hdr.buf2_cnt, dp->seq->n1);
+    }
+
+    /* also check for adler32_crc match */
+    if (dp->seq->adler32_crc != adler32(1L,dp->seq->aa1b, dp->seq->n1)) {
+      fprintf(stderr, "*** error [%s:%d] - [%s/buf_do_align] CRC error at: %d/%d (n1:%d)\n",
+              __FILE__, __LINE__,
+              prog_func, k,
+              lib_bhead_p->hdr.buf2_cnt, dp->seq->n1);
+    }
+#endif
+
+    ap->a_res = build_ares_code(aa0[dp->frame], m_msp->n0,
+                                dp->seq->aa1b, dp->seq,
+                                dp->frame, &ap->have_ares,
+                                dp->repeat_thresh, m_msp, ppst, f_str[dp->frame] );
+  }
+
+  lib_bhead_p->hdr.have_results = 1;
+}
+
+/* buf_qshuf_work -- score the shuffled query aa0s against every library
+   sequence in the buffer and record, per sequence, the best shuffled
+   score and the lowest escore.
+
+   inputs:  aa0s, n0 (shuffled query)
+            lib_bhead_p->buf2_data[], lib_bhead_p->hdr.buf2_cnt
+            max_frame (frame that ends the results for one sequence)
+            ppst, qf_str
+            ix_score (index into rst.score[])
+
+   results: qr_score / qr_escore of the buf2_res[] entry that has
+            is_valid_stat set
+
+   Sequences with n1 outside [ppst->n1_low, ppst->n1_high] are skipped
+   and the running best values are reset.
+*/
+void
+buf_qshuf_work(unsigned char *aa0s, int n0,
+               struct buf_head *lib_bhead_p,
+               int max_frame,
+               struct pstruct *ppst, void *qf_str,
+               int ix_score)
+{
+  struct buf2_data_s *dp;
+  struct buf2_res_s *rp, *flagged_rp;
+  struct rstruct shuf_rst;
+  struct score_count_s scratch_cnt;	/* counts from do_work(), not used */
+  int n_left;
+  int best_score;
+  double best_escore;
+
+  dp = lib_bhead_p->buf2_data;
+  rp = lib_bhead_p->buf2_res;
+
+  flagged_rp = NULL;
+  best_score = -BIGNUM;
+  best_escore = 1000.0;
+
+  for (n_left = lib_bhead_p->hdr.buf2_cnt; n_left > 0; n_left--, dp++, rp++) {
+    shuf_rst.score[0] = shuf_rst.score[1] = shuf_rst.score[2] = -BIGNUM;
+    shuf_rst.valid_stat = 0;
+
+    if (dp->seq->n1 < ppst->n1_low || dp->seq->n1 > ppst->n1_high) {
+      /* not scored: forget anything collected so far */
+      flagged_rp = NULL;
+      best_score = -BIGNUM;
+      best_escore = 1000.0;
+      continue;
+    }
+
+    do_work (aa0s, n0,
+             dp->seq->aa1b, dp->seq->n1,
+             dp->frame, ppst, qf_str, 1, 0,
+             &shuf_rst, &scratch_cnt);
+
+    /* buf_qshuf_work() is always called after buf_do_work(), which
+       sets rp->is_valid_stat */
+    if (rp->is_valid_stat) {
+      flagged_rp = rp;
+    }
+
+    if (shuf_rst.escore < best_escore) {
+      best_escore = shuf_rst.escore;
+    }
+    if (shuf_rst.score[ix_score] > best_score) {
+      best_score = shuf_rst.score[ix_score];
+    }
+
+    if (dp->frame != max_frame) {
+      continue;
+    }
+
+    /* last frame of this sequence: store the best values, then reset */
+    if (flagged_rp!=NULL) {
+      flagged_rp->qr_score = best_score;
+      flagged_rp->qr_escore = best_escore;
+      flagged_rp = NULL;
+    }
+#ifdef DEBUG
+    else {
+      fprintf(stderr,"*** error [%s:%d] - tq_best_rp NULL at: %ld\n",
+              __FILE__, __LINE__, rp - lib_bhead_p->buf2_res);
+    }
+#endif
+    best_score = -BIGNUM;
+    best_escore = 1000.0;
+  }
+}
+
+/* buf_shuf_work -- shuffle every eligible library sequence in the
+   buffer, score the shuffled copy, and flag the best shuffled result of
+   each sequence.
+
+   inputs:  **aa0 (query, indexed by frame), n0
+            aa1s (work space that receives the shuffled sequence)
+            lib_bhead_p->buf2_data[], lib_bhead_p->hdr.buf2_cnt
+            max_frame (frame that ends the results for one sequence)
+            ppst (zs_win > 0 selects wshuffle(); otherwise shuffle_dna3
+                  selects shuffle3() over shuffle())
+            **f_str (indexed by frame)
+            ix_score (not used; ppst->score_ix is used instead)
+            rand_state (passed to the shuffle routines)
+
+   results: lib_bhead_p->buf2_res[].r_rst from do_work()
+            buf2_res[].is_valid_stat is cleared for every entry and then
+            set on the best shuffled entry of each sequence
+            lib_bhead_p->hdr.shuff_cnt = number of sequences shuffled
+            lib_bhead_p->hdr.have_results = 1
+            lib_bhead_p->s_cnt_info is zeroed and then filled by do_work()
+
+   Entries with stats_idx < 0 or n1 outside [ppst->n1_low,
+   ppst->n1_high] are skipped and the running best values are reset.
+*/
+void
+buf_shuf_work(unsigned char **aa0, int n0, unsigned char *aa1s, struct buf_head *lib_bhead_p,
+              int max_frame, struct pstruct *ppst, void **f_str,
+              int ix_score, void *rand_state)
+{
+  int buf2_cnt;
+  int shuff_cnt;
+  struct buf2_data_s *lib_buf2_dp;
+  struct buf2_res_s *lib_buf2_rp, *tr_best_rp;
+  int tr_best, sc_ix;
+  double tr_escore;
+
+  sc_ix = ppst->score_ix;
+
+  lib_bhead_p->s_cnt_info.s_cnt[0] = lib_bhead_p->s_cnt_info.s_cnt[1] =
+    lib_bhead_p->s_cnt_info.s_cnt[2] = lib_bhead_p->s_cnt_info.tot_scores = 0;
+
+  shuff_cnt = 0;
+  buf2_cnt = lib_bhead_p->hdr.buf2_cnt;
+  lib_buf2_dp = lib_bhead_p->buf2_data;
+  lib_buf2_rp = lib_bhead_p->buf2_res;
+
+  /* best shuffled result seen so far for the current sequence */
+  tr_best_rp = NULL;
+  tr_best = -BIGNUM;
+  tr_escore = 1000.0;
+
+  while (buf2_cnt-- > 0) {
+    /* -BIGNUM marks "no score" until do_work() fills the entry in */
+    lib_buf2_rp->r_rst.score[0] = lib_buf2_rp->r_rst.score[1] =
+      lib_buf2_rp->r_rst.score[2] = -BIGNUM;
+    lib_buf2_rp->r_rst.valid_stat = lib_buf2_rp->is_valid_stat = 0;
+
+    if ((lib_buf2_dp->stats_idx < 0) || lib_buf2_dp->seq->n1 < ppst->n1_low ||
+        lib_buf2_dp->seq->n1 > ppst->n1_high ) {
+      lib_buf2_dp++;
+      lib_buf2_rp++;
+      tr_best_rp = NULL;
+      tr_best = -BIGNUM;
+      tr_escore = 1000.0;
+      continue;
+    }
+
+    shuff_cnt++;
+    if (ppst->zs_win > 0) {
+      wshuffle(lib_buf2_dp->seq->aa1b,aa1s,lib_buf2_dp->seq->n1,ppst->zs_win, rand_state);
+    }
+    else {
+      if (ppst->shuffle_dna3) {shuffle3(lib_buf2_dp->seq->aa1b,aa1s,lib_buf2_dp->seq->n1, rand_state);}
+      else {shuffle(lib_buf2_dp->seq->aa1b,aa1s,lib_buf2_dp->seq->n1, rand_state);}
+    }
+
+    /* rshuffle(lib_buf2_dp->seq->aa1b,aa1s,lib_buf2_dp->seq->n1); */
+
+#ifdef DEBUG
+    /* label names this routine so check_seq_range() diagnostics point
+       at the right caller */
+    if (check_seq_range(aa1s, lib_buf2_dp->seq->n1,
+                        ppst->nsqx, "buf_shuf_work()")) {
+      fprintf(stderr, "*** error [%s:%d] - [%s/buf_do_shuff] range error at: %d/%d (n1:%d)\n",
+              __FILE__, __LINE__,
+              prog_func,lib_bhead_p->hdr.buf2_cnt - (buf2_cnt+1),
+              lib_bhead_p->hdr.buf2_cnt, lib_buf2_dp->seq->n1);
+    };
+#endif
+
+    /* score the shuffled copy; counts accumulate in s_cnt_info */
+    do_work (aa0[lib_buf2_dp->frame], n0,
+             aa1s, lib_buf2_dp->seq->n1,
+             lib_buf2_dp->frame, ppst, f_str[lib_buf2_dp->frame], 0, 1,
+             &lib_buf2_rp->r_rst, &(lib_bhead_p->s_cnt_info));
+
+    /* tr_best_rp ends up on the last entry that improved either the
+       escore or the score for this sequence */
+    if (lib_buf2_rp->r_rst.valid_stat) {
+      if (lib_buf2_rp->r_rst.escore < tr_escore) {
+        tr_escore = lib_buf2_rp->r_rst.escore;
+        tr_best_rp = lib_buf2_rp;
+      }
+      if (lib_buf2_rp->r_rst.score[sc_ix] > tr_best) {
+        tr_best = lib_buf2_rp->r_rst.score[sc_ix];
+        tr_best_rp = lib_buf2_rp;
+      }
+    }
+
+    /* last frame of the sequence: flag its best entry, then reset */
+    if (lib_buf2_dp->frame == max_frame) {
+      if (tr_best_rp!=NULL) {
+        tr_best_rp->is_valid_stat = 1;
+        tr_best_rp = NULL;
+      }
+      tr_best = -BIGNUM;
+      tr_escore = 1000.0;
+    }
+
+    lib_buf2_dp++;
+    lib_buf2_rp++;
+  }
+  lib_bhead_p->hdr.shuff_cnt = shuff_cnt;
+  lib_bhead_p->hdr.have_results = 1;
+}
+
+/* buf_shuf_seq is designed to:
+ (1) take a list of sequences (specified by bestp_arr[0..nbest-1])
+ (2) collect them from the database if they are not already available
+ (3) send them to the threads or shuffle them directly and calculate scores
+
+ Inputs, as used in the body below:
+ aa0 - query sequence array, handed unchanged to buf_shuf_work()
+ n0 - not referenced in this function; m_msp->n0 is passed on instead
+ aa1shuff_b - address of the buffer that receives the library
+ sequences; it is realloc()ed (unthreaded build, when
+ n1lib_req >= maxn) or replaced by a fresh calloc()
+ (COMP_THR/PCOMPLIB build)
+ aa1save - scratch buffer: re_getlib() reads into it, and the
+ unthreaded build also passes it to buf_shuf_work()
+ bestp_arr, nbest - the hits whose library sequences are shuffled
+ m_bufi_p - supplies max_buf2_res and max_work_buf
+ f_str - only present in the unthreaded build
+ s_info - passed through to save_shuf()
+
+ The process exits (exit(1)) when an allocation fails or, in the
+ COMP_THR/PCOMPLIB build, when n1lib_req < 2.
+*/
+
+void
+buf_shuf_seq(unsigned char **aa0, int n0,
+ unsigned char **aa1shuff_b, unsigned char *aa1save, int maxn,
+ struct beststr **bestp_arr, int nbest,
+ struct pstruct *ppst, struct mngmsg *m_msp,
+ struct mng_thr *m_bufi_p
+#if !defined(COMP_THR) && !defined(PCOMPLIB)
+ , void **f_str
+#endif
+ , struct score_count_s *s_info)
+{
+ unsigned char *aa1shuff;
+ struct beststr *bbp, **tmp_bestp;
+ char l_bline[MAX_SSTR];
+ int n1lib_req, shuff_mult;
+ long loffset, l_off;
+ int n1, itt;
+ int max_do_cnt, ndiff, prev_index;
+ int istats;
+ int i, j;
+
+ /* these variables track buffers of library sequences */
+ int cur_buf_size, max_buf_size;
+ struct buf_head *lib_bhead_p;
+ struct buf2_data_s *lib_buf2_dp;
+ struct buf2_res_s *lib_buf2_rp;
+
+/* (1) get the sequences into a buffer - the sequence information is
+ currently in the bestp_arr - find out how many we have, and how
+ many we will need - the number to shuffle */
+
+/* figure out how much space we need, first checking whether we have
+ dups; tmp_bestp[] is a private copy of the pointers in bestp_arr[],
+ so sorting it does not reorder the caller's array */
+ if ((tmp_bestp = (struct beststr **)calloc(nbest, sizeof(struct beststr *)))==NULL) {
+ fprintf(stderr,"*** error [%s:%d] - %s/buf_shuf_seq() *** cannot allocate tmp_bestp[%d]\n",
+ __FILE__, __LINE__, prog_name, nbest);
+ exit(1);
+ }
+ for (i = 0; i < nbest; i++) {
+ tmp_bestp[i] = bestp_arr[i];
+ }
+
+ /* sort tmp_bestp[] by sequence index, so duplicates are adjacent */
+ sortbesti(tmp_bestp, nbest);
+
+ /* count number of different sequence indices, get required space
+ without dups; an entry is counted only when its index is greater
+ than the previous one, which relies on the sort above. Each
+ distinct sequence reserves n1 + 2 bytes in n1lib_req. */
+ prev_index = -1;
+ n1lib_req = ndiff = 0;
+ for (i = 0; i < nbest; i++) {
+ if (tmp_bestp[i]->seq->index > prev_index) {
+ prev_index = tmp_bestp[i]->seq->index;
+ n1lib_req += tmp_bestp[i]->n1+ 2;
+ ndiff++;
+ }
+ }
+
+#if !defined(COMP_THR) && !defined(PCOMPLIB)
+ if (n1lib_req >= maxn) { /* we need new space, aa1shuff is too small */
+ if ((*aa1shuff_b = aa1shuff =
+ (unsigned char *)realloc(*aa1shuff_b, n1lib_req*sizeof(char)))==NULL) {
+ fprintf(stderr,"*** error [%s:%d] - cannot realloc aa1shuff[%d]\n",
+ __FILE__, __LINE__, n1lib_req);
+ exit(1);
+ }
+ }
+ else { aa1shuff = *aa1shuff_b;}
+ *aa1shuff = '\0'; /* first byte of the buffer is a 0 sentinel; sequences start after it */
+ aa1shuff++;
+
+#else
+ if (n1lib_req < 2) {
+ fprintf(stderr,"*** error [%s:%d] - [%s/buf_shuf_seq] no residues to shuffle: %d (%d)\n",
+ __FILE__, __LINE__,
+ prog_func,n1lib_req,ndiff);
+ exit(1);
+ }
+
+ if ((*aa1shuff_b = aa1shuff = /* NOTE(review): any previous *aa1shuff_b is overwritten, not freed, here - confirm the caller handles that */
+ (unsigned char *)calloc(n1lib_req,sizeof(char)))==NULL) {
+ fprintf(stderr,"*** error [%s:%d] - cannot calloc aa1shuff[%d]\n",
+ __FILE__, __LINE__, n1lib_req);
+ exit(1);
+ }
+ *aa1shuff = '\0';
+ aa1shuff++;
+#endif
+
+ shuff_mult = (m_msp->shuff_max+1)/ndiff; /* shuffles per distinct sequence; NOTE(review): ndiff == 0 is not guarded in the unthreaded build */
+ istats = 0;
+
+ /* setup lib_bhead buffers for shuffle comparisons */
+#if defined(COMP_THR) || defined(PCOMPLIB) /* threaded/parallel */
+ /* max_do_cnt can be smaller than max_buf2_cnt, but not larger */
+ max_do_cnt = min(m_bufi_p->max_buf2_res,
+ m_msp->shuff_max / (2 * fa_max_workers));
+ /* we don't have a left over one, so we need one */
+ get_rbuf(&lib_bhead_p,m_bufi_p->max_work_buf);
+#else /* not threaded */
+ max_do_cnt = m_bufi_p->max_buf2_res;
+ lib_bhead_p = lib_buf2_list; /* equivalent to un-threaded get_rbuf() */
+#endif
+ max_buf_size = n1lib_req;
+ cur_buf_size = 0;
+ lib_bhead_p->hdr.buf2_cnt = 0;
+ lib_bhead_p->hdr.have_results = 0;
+ lib_bhead_p->hdr.stop_work = 0;
+ lib_bhead_p->hdr.buf2_type=BUF2_DOSHUF;
+ lib_bhead_p->hdr.seq_record_continuous = 0;
+ lib_buf2_dp = lib_bhead_p->buf2_data;
+ lib_buf2_rp = lib_bhead_p->buf2_res;
+
+ /* read sequences into shuffle buffer
+ NOTE(review): this walks tmp_bestp[0..ndiff-1], i.e. the first
+ ndiff entries of the sorted array; when duplicates are present
+ these are not necessarily the ndiff distinct sequences counted
+ above - confirm this is intended */
+
+ for (i = 0; i < ndiff; i++) {
+ bbp = tmp_bestp[i];
+ if (bbp->seq->aa1b == NULL) {
+ /* get the sequence */
+ (bbp->mseq->m_file_p->ranlib)(l_bline, sizeof(l_bline),
+ bbp->mseq->lseek,bbp->mseq->libstr,bbp->mseq->m_file_p);
+ n1 = re_getlib(aa1save,NULL, maxn,m_msp->ldb_info.maxt3,
+ m_msp->ldb_info.l_overlap,bbp->mseq->cont,m_msp->ldb_info.term_code,
+ &loffset,&l_off,bbp->mseq->m_file_p);
+
+ /* fprintf(stderr, " %d gets %d %d\n",i,tmp_bestp[i]->seq->n1,n1); */
+
+ memcpy(aa1shuff, aa1save, n1+1); /* copy n1 residues plus one trailing byte into the shared buffer */
+ bbp->seq->aa1b = aa1shuff;
+ aa1shuff += n1 + 1;
+ }
+
+ /* lib_buf2_dp is used up by scores, the sequence is not sent multiple times */
+ cur_buf_size += bbp->seq->n1+1;
+ for (j = 0; j < shuff_mult; j++ ) {
+ for (itt = m_msp->revcomp; itt <= m_msp->nitt1; itt++) {
+#ifdef PCOMPLIB
+ lib_buf2_dp->seq_dup = 0; /* mark first ->seq as original, not duplicate */
+#endif
+ lib_buf2_dp->seq = bbp->seq;
+ /* this invalidates lib_buf2_p->seq */
+ lib_buf2_dp->stats_idx = istats++;
+ lib_buf2_dp->frame = itt;
+ lib_buf2_dp++; /* point to next buf2 */
+ lib_buf2_rp++; /* point to next buf2 */
+ lib_bhead_p->hdr.buf2_cnt++;
+
+ if (lib_bhead_p->hdr.buf2_cnt >= max_do_cnt ||
+ cur_buf_size >= max_buf_size) {
+/* (2) send sequences for shuffling */
+#if defined(COMP_THR) || defined(PCOMPLIB) /* threaded - fill and empty buffers */
+ /* provide empty buffer to workers */
+ lib_bhead_p->hdr.aa1b_used = cur_buf_size;
+ lib_bhead_p->hdr.have_data = 1;
+ lib_bhead_p->hdr.seq_record_continuous = 0;
+ put_rbuf(lib_bhead_p,m_bufi_p->max_work_buf);
+ get_rbuf(&lib_bhead_p,m_bufi_p->max_work_buf);
+#else /* non-thread - just do the searches */
+ if (lib_bhead_p->hdr.buf2_type & BUF2_DOSHUF) {
+ buf_shuf_work(aa0,m_msp->n0, aa1save, lib_bhead_p,
+ m_msp->nitt1, ppst, f_str, ppst->score_ix, rand_state);
+ }
+#endif
+/* (3) save results in the rstats structure */
+ if (lib_bhead_p->hdr.buf2_cnt > 0 && lib_bhead_p->hdr.have_results) {
+ save_shuf(lib_bhead_p,m_msp->nitt1,m_msp->shuff_max,ppst->score_ix,s_info);
+ }
+
+ lib_bhead_p->s_cnt_info.s_cnt[0] = lib_bhead_p->s_cnt_info.s_cnt[1] =
+ lib_bhead_p->s_cnt_info.s_cnt[2] = lib_bhead_p->s_cnt_info.tot_scores = 0;
+
+ lib_bhead_p->hdr.buf2_cnt = 0;
+ cur_buf_size = 0;
+ lib_bhead_p->hdr.have_results = 0;
+ lib_bhead_p->hdr.buf2_type=BUF2_DOSHUF;
+ lib_bhead_p->hdr.seq_record_continuous = 0; /* seq_records are coming from bestptr in any order */
+ lib_bhead_p->hdr.stop_work = 0;
+ lib_buf2_dp = lib_bhead_p->buf2_data; /* lib_buf2_rp is not rewound here; in this function it is only incremented, never dereferenced */
+ }
+ } /* for (itt .. */
+ }
+ } /* done with tmp_bestp[] */
+ free(tmp_bestp);
+
+#if defined(COMP_THR) || defined(PCOMPLIB) /* if COMP_THR/PCOMPLIB - fill and empty buffers */
+ /* check last buffers for any results */
+ lib_bhead_p->hdr.seq_record_continuous = 0;
+ put_rbuf(lib_bhead_p,m_bufi_p->max_work_buf);
+
+ /* wait for the threads to finish */
+
+ wait_rbuf(m_bufi_p->max_work_buf);
+ /*
+ fprintf(stderr, " num_reader[%d]-empty[%d]: %d\tnrstats: %d\n",
+ num_reader_bufs,empty_reader_bufs,
+ num_reader_bufs-empty_reader_bufs, nrstats);
+ */
+
+ for (i=0; i < num_reader_bufs; i++) {
+ if (RESULTS_BUF[i]->hdr.buf2_cnt > 0 && RESULTS_BUF[i]->hdr.have_results) {
+ save_shuf(RESULTS_BUF[i],m_msp->nitt1, m_msp->shuff_max, ppst->score_ix, s_info);
+ RESULTS_BUF[i]->hdr.buf2_cnt = RESULTS_BUF[i]->hdr.have_results = 0;
+ }
+ }
+#else /* just do the searches */
+ /* aa1save is used for shuffles, not aa1shuf, because aa1shuf
+ has library sequences; this final pass handles whatever is
+ left in the buffer after the loop above and is run
+ unconditionally */
+ buf_shuf_work(aa0,m_msp->n0, aa1save, lib_bhead_p,
+ m_msp->nitt1, ppst, f_str, ppst->score_ix, rand_state);
+
+ save_shuf(lib_bhead_p,m_msp->nitt1,m_msp->shuff_max, ppst->score_ix, s_info);
+ lib_bhead_p->hdr.buf2_cnt = lib_bhead_p->hdr.have_results = 0;
+#endif
+}
+
+/* buf_align_seq is structurally almost identical to buf_shuf_seq,
+ except that the appropriate sequences are pre-loaded into bbp->seq
+ (and ->bline), and it gets bbp->a_res, rather than scores
+
+ Each bestp_arr[i] becomes one buf2_data/buf2_ares entry (with
+ best_idx = i). A buffer is handed off for alignment whenever it
+ holds max_align_cnt entries or cur_buf_size reaches
+ max_buf_size - m_msp->ldb_info.maxn. Results are collected with
+ save_align(), m_msp->align_done is set to 1, and a count mismatch
+ or a NULL bestp_arr[i]->a_res is only reported on stderr.
+ n0 is not referenced in this function; m_msp->n0 is what is
+ passed to buf_do_align(). */
+
+void
+buf_align_seq(unsigned char **aa0, int n0,
+ struct beststr **bestp_arr, int nbest,
+ struct pstruct *ppst, struct mngmsg *m_msp,
+ struct mng_thr *m_bufi_p
+#if !defined(COMP_THR) && !defined(PCOMPLIB)
+ , void **f_str
+#endif
+ )
+{
+ struct beststr *bbp;
+ int max_align_cnt;
+ int i, n_pre_align;
+ int cur_buf_size, max_buf_size;
+ struct buf_head *lib_bhead_p;
+ struct buf2_data_s *lib_buf2_dp;
+ struct buf2_ares_s *lib_buf2_ap;
+
+ /* setup lib_bhead buffers for alignments */
+#if defined(COMP_THR) || defined(PCOMPLIB) /* threaded */
+ /* max_align_cnt can be smaller than max_buf2_res, but not larger */
+#ifdef COMP_THR
+ max_align_cnt = min(m_bufi_p->max_buf2_res,
+ nbest / (4 * fa_max_workers));
+#else
+ max_align_cnt = min(m_bufi_p->max_buf2_res, nbest / fa_max_workers);
+#endif
+ if (max_align_cnt < 1) max_align_cnt = 1; /* the integer divisions above can yield 0 */
+
+ get_rbuf(&lib_bhead_p,m_bufi_p->max_work_buf);
+#else /* not threaded */
+ max_align_cnt = m_bufi_p->max_buf2_res;
+ lib_bhead_p = lib_buf2_list; /* equivalent to un-threaded get_rbuf() */
+#endif
+
+ max_buf_size = lib_bhead_p->hdr.aa1b_size;
+ lib_bhead_p->hdr.buf2_cnt = 0;
+ lib_bhead_p->hdr.have_results = 0;
+ lib_bhead_p->hdr.stop_work = 0;
+ lib_bhead_p->hdr.buf2_type=BUF2_DOALIGN;
+ lib_buf2_dp = lib_bhead_p->buf2_data;
+ lib_buf2_ap = lib_bhead_p->buf2_ares;
+
+ /* read sequences into align buffer */
+
+ n_pre_align = 0;
+ cur_buf_size = 0;
+ for (i = 0; i < nbest; i++) {
+ bbp = bestp_arr[i];
+
+ /* this invalidates lib_buf2_p->seq */
+ lib_buf2_dp->seq = bbp->seq;
+ cur_buf_size += bbp->seq->n1+1;
+ lib_buf2_dp->frame = bbp->frame;
+ lib_buf2_dp->repeat_thresh = bbp->repeat_thresh;
+#ifdef PCOMPLIB
+ lib_buf2_dp->seq_dup = 0;
+#endif
+ lib_buf2_ap->have_ares = 0;
+ lib_buf2_ap->a_res = NULL;
+ lib_buf2_ap->best_idx = i; /* index back into bestp_arr[]; presumably how save_align() finds the matching hit - confirm */
+ lib_buf2_dp++; /* point to next buf2_data */
+ lib_buf2_ap++; /* point to next buf2_ares */
+ lib_bhead_p->hdr.buf2_cnt++;
+
+ if (lib_bhead_p->hdr.buf2_cnt >= max_align_cnt ||
+ cur_buf_size >= max_buf_size - m_msp->ldb_info.maxn) {
+/* (2) send sequences for alignment */
+#if defined(COMP_THR) || defined(PCOMPLIB) /* threaded - fill and empty buffers */
+ /* provide empty buffer to workers */
+ lib_bhead_p->hdr.seqr_cnt = lib_bhead_p->hdr.buf2_cnt; /* for alignments, they are the same */
+ lib_bhead_p->hdr.have_data = 1;
+ lib_bhead_p->hdr.aa1b_used = cur_buf_size;
+ lib_bhead_p->hdr.seq_record_continuous = 0;
+ put_rbuf(lib_bhead_p,m_bufi_p->max_work_buf);
+ get_rbuf(&lib_bhead_p,m_bufi_p->max_work_buf);
+#else /* non-thread - just do the searches */
+ buf_do_align(aa0, m_msp->n0, lib_bhead_p, ppst, m_msp, f_str);
+#endif
+
+/* (3) save alignments */
+ if (lib_bhead_p->hdr.buf2_cnt > 0 && lib_bhead_p->hdr.have_results) {
+ n_pre_align += save_align(lib_bhead_p,bestp_arr);
+ }
+
+ cur_buf_size = 0;
+ max_buf_size = lib_bhead_p->hdr.aa1b_size;
+ lib_bhead_p->hdr.buf2_cnt = 0;
+ lib_bhead_p->hdr.have_results = 0;
+ lib_bhead_p->hdr.buf2_type=BUF2_DOALIGN;
+ lib_bhead_p->hdr.stop_work = 0;
+ lib_buf2_dp = lib_bhead_p->buf2_data;
+ lib_buf2_ap = lib_bhead_p->buf2_ares;
+ }
+ } /* done with bestp_arr[] */
+
+#if defined(COMP_THR) || defined(PCOMPLIB) /* if COMP_THR - fill and empty buffers */
+ /* check last buffers for any results */
+ lib_bhead_p->hdr.seqr_cnt = lib_bhead_p->hdr.buf2_cnt; /* for alignments, they are the same */
+ lib_bhead_p->hdr.have_data = 1;
+ lib_bhead_p->hdr.aa1b_used = cur_buf_size;
+ lib_bhead_p->hdr.seq_record_continuous = 0;
+ put_rbuf(lib_bhead_p,m_bufi_p->max_work_buf);
+
+ /* wait for the threads to finish */
+
+ wait_rbuf(m_bufi_p->max_work_buf);
+
+ for (i=0; i < num_reader_bufs; i++) {
+ if (RESULTS_BUF[i]->hdr.buf2_cnt > 0 && RESULTS_BUF[i]->hdr.have_results) {
+ n_pre_align += save_align(RESULTS_BUF[i],bestp_arr);
+ RESULTS_BUF[i]->hdr.buf2_cnt = RESULTS_BUF[i]->hdr.have_results = 0;
+ }
+ }
+#else /* just do the searches */
+ buf_do_align(aa0, m_msp->n0, lib_bhead_p, ppst, m_msp, f_str); /* final, unconditional pass over whatever is left in the buffer */
+ n_pre_align += save_align(lib_bhead_p,bestp_arr);
+ lib_bhead_p->hdr.buf2_cnt = lib_bhead_p->hdr.have_results = 0;
+#endif
+
+ m_msp->align_done = 1;
+
+ if (n_pre_align != nbest) { /* consistency checks below only report; they do not abort */
+ fprintf(stderr,"*** error [%s:%d] - n_pre_align:%d != nbest: %d\n",
+ __FILE__, __LINE__, n_pre_align, nbest);
+ }
+ for (i=0; i < nbest; i++) {
+ if (bestp_arr[i]->a_res == NULL) {
+ fprintf(stderr, "*** error [%s:%d] - have NULL a_res: %d\n",
+ __FILE__, __LINE__, i);
+ }
+ }
+}
+
+/* check_seq_range() reports whether any of the first n1 residue codes
+   in aa1b[] is greater than nsq.
+
+   returns 1 if at least one code is out of range, 0 otherwise
+   str is accepted for the caller's label but is not used here
+*/
+int
+check_seq_range(unsigned char *aa1b, int n1, int nsq, char *str) {
+  int pos;
+  int found_bad = 0;
+
+  /* the whole range is always scanned; there is no early exit */
+  for (pos = 0; pos < n1; pos++) {
+    if (aa1b[pos] > nsq) {
+      found_bad = 1;
+    }
+  }
+  return found_bad;
+}
+
+struct stack_str { /* growable LIFO stack of void * values (see init_stack/push_stack/pop_stack) */
+ void **stack; /* array of stored pointers; slots [0, top) are in use */
+ int size; /* number of slots currently allocated in stack[] */
+ int inc; /* number of slots added each time push_stack() grows stack[] */
+ int top; /* index of the next free slot (== number of stored items) */
+};
+
+/* init_stack() creates an empty pointer stack with room for size
+   entries that grows by inc entries at a time.
+
+   returns the new stack, or NULL (after a message on stderr) if
+   either allocation fails
+*/
+struct stack_str *init_stack(int size, int inc) {
+  struct stack_str *new_stack;
+
+  new_stack = (struct stack_str *)calloc(1,sizeof(struct stack_str));
+  if (new_stack == NULL) {
+    fprintf(stderr,"*** error [%s:%d] - cannot allocate stack\n",
+            __FILE__, __LINE__);
+    return NULL;
+  }
+
+  new_stack->stack = (void *)calloc(size,sizeof(void *));
+  if (new_stack->stack == NULL) {
+    fprintf(stderr,"*** error [%s:%d] - cannot allocate stack->stack[%d]\n",
+            __FILE__, __LINE__,size);
+    /* do not leak the header when the slot array cannot be had */
+    free(new_stack);
+    return NULL;
+  }
+
+  new_stack->top = 0;
+  new_stack->inc = inc;
+  new_stack->size = size;
+  return new_stack;
+}
+
+/* push_stack() appends value to the top of stack, enlarging the slot
+   array when it is full.
+
+   - a NULL stack is silently ignored
+   - the array always grows by at least one slot, even when ->inc was
+     given as <= 0, so the store below can never land past the end
+   - if realloc() fails the message is printed and the value is NOT
+     pushed; the existing array, ->size and ->top are left untouched,
+     so the stack stays usable (and is not leaked)
+*/
+void push_stack(struct stack_str *stack, void *value) {
+  void **grown;
+  int new_size;
+
+  if (!stack) return;
+  if (stack->top >= stack->size) {
+    new_size = stack->size + ((stack->inc > 0) ? stack->inc : 1);
+    if (new_size <= stack->top) { new_size = stack->top + 1; }
+
+    /* keep the old pointer until realloc() is known to have succeeded */
+    grown = (void **)realloc(stack->stack, (size_t)new_size*sizeof(void *));
+    if (grown == NULL) {
+      fprintf(stderr,"*** error [%s:%d] - cannot re-allocate stack to [%d]\n",
+              __FILE__, __LINE__, new_size);
+      return;
+    }
+    stack->stack = grown;
+    stack->size = new_size;
+  }
+  stack->stack[stack->top++] = value;
+}
+
+/* pop_stack() removes and returns the most recently pushed value.
+
+   returns NULL for a NULL stack (reported on stderr only when DEBUG
+   is defined) and for an empty stack; ->top is never left negative
+*/
+void * pop_stack(struct stack_str *stack) {
+  if (stack == NULL) {
+#ifdef DEBUG
+    fprintf(stderr," *** error [%s:%d] - pop_stack NULL stack\n",__FILE__, __LINE__);
+#endif
+    return NULL;
+  }
+
+  /* nothing stored: normalize the counter and report "empty" */
+  if (stack->top <= 0) {
+    stack->top = 0;
+    return NULL;
+  }
+
+  stack->top -= 1;
+  return stack->stack[stack->top];
+}
+
+/* free_stack() releases the slot array and the stack header; the
+   stored pointers themselves are not freed.  Always returns NULL, so
+   the result can be assigned back to the caller's pointer. */
+void * free_stack(struct stack_str *stack) {
+  if (stack != NULL) {
+    if (stack->stack != NULL) {
+      free(stack->stack);
+    }
+    free(stack);
+  }
+  return NULL;
+}
+
+/* init_dyn_string() creates an empty, growable character string with
+   an initial capacity of size bytes that grows in steps of (at
+   least) inc bytes.
+
+   - the buffer holds characters, so it is sized with sizeof(char);
+     ->mx_size is the true capacity in bytes
+   - size < 1 is raised to 1 so there is always room for the
+     terminating '\0' (calloc() zero-fills, so the string starts empty)
+   - inc < 1 is raised to 1 so the doubling loops in
+     dyn_strcat()/dyn_strcpy() always make progress
+
+   returns the new string, or NULL (after a message on stderr) if
+   either allocation fails
+*/
+struct dyn_string_str *
+init_dyn_string(int size, int inc) {
+  struct dyn_string_str *dyn_string;
+
+  if (size < 1) { size = 1; }
+  if (inc < 1) { inc = 1; }
+
+  if ((dyn_string=(struct dyn_string_str *)calloc(1,sizeof(struct dyn_string_str)))==NULL) {
+    fprintf(stderr,"*** error [%s:%d] - cannot allocate dyn_string\n",
+            __FILE__, __LINE__);
+    return NULL;
+  }
+
+  if ((dyn_string->string=(void *)calloc(size,sizeof(char)))==NULL) {
+    fprintf(stderr,"*** error [%s:%d] - cannot allocate dyn_string->string[%d]\n",
+            __FILE__, __LINE__,size);
+    free(dyn_string);
+    return NULL;
+  }
+
+  dyn_string->c_size = 0;
+  dyn_string->inc = inc;
+  dyn_string->mx_size = size;
+  return dyn_string;
+}
+
+/* dyn_strcat() appends value to dyn_string, enlarging the buffer
+   first when the result (plus '\0') would not fit.
+
+   - a NULL dyn_string or a NULL value is ignored
+   - ->inc is doubled until it is at least strlen(value); an ->inc
+     below 1 is first raised to 1 so that loop terminates
+   - if realloc() fails the message is printed and nothing is
+     appended; the existing buffer, ->mx_size and ->c_size are left
+     untouched instead of being lost
+*/
+void
+dyn_strcat(struct dyn_string_str *dyn_string, char *value) {
+  size_t add_len, needed, new_size;
+  char *grown;
+
+  if (!dyn_string || !value) return;
+
+  add_len = strlen(value);
+  needed = add_len + dyn_string->c_size + 1;
+
+  if (needed >= dyn_string->mx_size) {
+    if (dyn_string->inc < 1) { dyn_string->inc = 1; }
+    while (dyn_string->inc < add_len) { dyn_string->inc *= 2; }
+
+    new_size = dyn_string->mx_size + dyn_string->inc;
+    if (new_size < needed) { new_size = needed; }
+
+    /* keep the old pointer until realloc() is known to have succeeded */
+    grown = (char *)realloc(dyn_string->string, new_size);
+    if (grown == NULL) {
+      fprintf(stderr,"*** error [%s:%d] - cannot re-allocate dyn_string to [%d]\n",
+              __FILE__, __LINE__, (int)new_size);
+      return;
+    }
+    if (dyn_string->string == NULL) {
+      /* there was no previous buffer: start from an empty string */
+      grown[0] = '\0';
+      dyn_string->c_size = 0;
+    }
+    dyn_string->string = grown;
+    dyn_string->mx_size = new_size;
+  }
+  SAFE_STRNCAT(dyn_string->string,value,dyn_string->mx_size);
+  dyn_string->c_size += add_len;
+}
+
+/* dyn_strcpy() replaces the contents of dyn_string with value,
+   enlarging the buffer first when value (plus '\0') would not fit.
+
+   - a NULL dyn_string or a NULL value is ignored
+   - ->c_size is set to the new length, so that a following
+     dyn_strcat() bases its growth decision on the real contents
+   - ->inc is doubled until it is at least strlen(value); an ->inc
+     below 1 is first raised to 1 so that loop terminates
+   - if realloc() fails the message is printed and nothing is copied;
+     the existing buffer and sizes are left untouched
+*/
+void dyn_strcpy(struct dyn_string_str *dyn_string, char *value) {
+  size_t add_len, new_size;
+  char *grown;
+
+  if (!dyn_string || !value) return;
+
+  add_len = strlen(value);
+
+  if (add_len + 1 >= dyn_string->mx_size) {
+    if (dyn_string->inc < 1) { dyn_string->inc = 1; }
+    while (dyn_string->inc < add_len) { dyn_string->inc *= 2; }
+
+    new_size = dyn_string->mx_size + dyn_string->inc;
+    if (new_size < add_len + 1) { new_size = add_len + 1; }
+
+    /* keep the old pointer until realloc() is known to have succeeded */
+    grown = (char *)realloc(dyn_string->string, new_size);
+    if (grown == NULL) {
+      fprintf(stderr,"*** error [%s:%d] - cannot re-allocate dyn_string to [%d]\n",
+              __FILE__, __LINE__, (int)new_size);
+      return;
+    }
+    dyn_string->string = grown;
+    dyn_string->mx_size = new_size;
+  }
+  SAFE_STRNCPY(dyn_string->string,value,dyn_string->mx_size);
+  /* the buffer is now known to hold all of value */
+  dyn_string->c_size = add_len;
+}
+
+/* free_dyn_string() releases the character buffer (if any) and then
+   the dyn_string header; a NULL argument is a no-op. */
+void free_dyn_string(struct dyn_string_str *dyn_string) {
+  if (dyn_string != NULL) {
+    if (dyn_string->string != NULL) {
+      free(dyn_string->string);
+    }
+    free(dyn_string);
+  }
+}
+
+#include "a_mark.h"
+
+/* *itmp has the current alignment score, if *annot_arr[i_annot].label='V',
+ this can be increased (total increase in *v_delta)
+ *pam2aa0v[.value] gives possibly better pam score for variant
+ *ip is position in annotated sequence (&i0 for annot0_p)
+ *ia is position in aligned sequence (&i1 for annot0_p)
+ sp1 is the array for the (possibly modified) displayed sequence
+ sp1a is the array for the associated annotation
+ sq maps encoded residues to displayed characters
+ i_annot -- current annotation index in annot0_p->annot_arr_p[i_annot]
+ annot_arr = annot0/1_p->annot_arr_p
+ annot_stack = save current annotation
+ *have_push_features = set for annotations pushed in stack (not 'V')
+ *v_delta = change in score from variant at this position
+ **region_p = set for '[' region start
+ init_score -- used to initialize tmp_region_p->score.
+
+*/
+
+int
+next_annot_match(int *itmp, int *pam2aa0v, long ip, long ia, char *sp1, char *sp1a, const unsigned char *sq,
+                 int i_annot, int n_annot, struct annot_entry **annot_arr, char **ann_comment,
+                 void *annot_stack, int *have_push_features, int *v_delta,
+                 struct annot_entry **region_p, struct annot_entry *tmp_region_p,
+                 int init_score) {
+  struct annot_entry *cur_p;	/* annotation being examined */
+  int var_score;		/* score obtainable with the variant residue */
+
+  if (ann_comment) { *ann_comment = NULL; }
+
+  /* consume every annotation whose ->pos equals ip, starting at
+     i_annot; the index of the first unconsumed entry is returned */
+  for ( ; i_annot < n_annot; i_annot++) {
+    cur_p = annot_arr[i_annot];
+    if (cur_p->pos != ip) break;
+
+    switch (cur_p->label) {
+    case 'V':
+      /* variant: use it only if it improves the score at this position */
+      var_score = pam2aa0v[cur_p->value];
+      if (var_score > *itmp) {
+        *v_delta += (var_score - *itmp);
+        *itmp = var_score;
+        *sp1 = sq[cur_p->value];
+        if (sp1a) { *sp1a = 'V'; }
+        if (ann_comment) { *ann_comment = cur_p->comment; }
+      }
+      break;
+
+    case '[':
+      /* region start: take a private copy in *tmp_region_p, reset its
+         running totals, and publish it through *region_p */
+      if (region_p) {
+        memcpy(tmp_region_p, cur_p, sizeof(struct annot_entry));
+        tmp_region_p->a_pos = ia;
+        tmp_region_p->score = init_score;
+        tmp_region_p->n_ident = tmp_region_p->n_aln = 0;
+        *region_p = tmp_region_p;
+      }
+      break;
+
+    default:
+      /* ']' is always handed to push_stack(); any other label only
+         when the caller supplied an annot_stack */
+      if (cur_p->label == ']' || annot_stack) {
+        if (have_push_features) { *have_push_features = 1; }
+        push_stack(annot_stack, cur_p);
+      }
+      break;
+    }
+  } /* everything at this alignment position is checked */
+  return i_annot;
+}
+
+/* returns M_NEG, M_ZERO, M_POS, M_IDENT, M_DEL (a_mark.h)
+   updates *aln->nsim, npos, nident, nmismatch (and ngap_q/ngap_l for
+   nucleotide 'N' mismatches) when aln is not NULL
+
+   score        -- substitution score for the aligned pair
+   sp0, sp1     -- the two aligned residue characters
+   nt_align     -- non-zero for nucleotide alignments
+   pam_x_id_sim -- > 0: N:N (nucleotide) / X:X (protein) count as similar
+                   < 0: N:N / X:X are not counted as identical
+
+   the residues are upper-cased once, through unsigned char, because
+   toupper() is undefined for negative char values
+*/
+int align_type(int score, char sp0, char sp1, int nt_align, struct a_struct *aln, int pam_x_id_sim) {
+  int spa_val;
+  int up0, up1;		/* upper-cased residues */
+  int is_nx_pair;	/* N:N (nucleotide) or X:X (protein) */
+
+  up0 = toupper((unsigned char)sp0);
+  up1 = toupper((unsigned char)sp1);
+  is_nx_pair = (nt_align) ? (up0 == 'N' && up1 == 'N') : (up0 == 'X' && up1 == 'X');
+
+  if (score<0) {
+    spa_val = M_NEG;
+  }
+  else if (score == 0) {
+    spa_val = M_ZERO;
+    if (aln) aln->nsim++;
+  }
+  else {
+    spa_val = M_POS;
+    if (aln) {aln->nsim++; aln->npos++;}
+  }
+
+  /* correct for score < 0 with 'N:N'/'X:X' */
+  if (pam_x_id_sim > 0 && is_nx_pair) { /* > 0 -> identical, similar */
+    spa_val = M_POS;
+    if (aln) {
+      aln->nsim++;
+    }
+  }
+
+  /* assume a mismatch; undone below if the pair turns out identical */
+  if (aln) aln->nmismatch++;
+  if (up0 == up1) {
+    spa_val = M_IDENT;
+    if (aln) {
+      aln->nident++;
+      aln->nmismatch--;
+    }
+  }
+  else if (nt_align) {
+    /* T and U are treated as the same nucleotide */
+    if ((up0 == 'T' && up1 == 'U') || (up0 == 'U' && up1 == 'T')) {
+      spa_val = M_IDENT;
+      if (aln) {
+        aln->nident++;
+        aln->nmismatch--;
+      }
+    }
+    /* add to gap count for 'N' matches ?? */
+    else if (aln && up0 == 'N') aln->ngap_q++;
+    else if (aln && up1 == 'N') aln->ngap_l++;
+  }
+
+  /* correct nident, nmismatch for N:N / X:X */
+  if (pam_x_id_sim < 0 && is_nx_pair) { /* < 0 -> not identical */
+    if (aln) {
+      aln->nident--;
+      aln->nmismatch++;
+    }
+  }
+
+  return spa_val;
+}
+
+/* seq_pos works with comment_var()/display_push_features()/do_url1() where
+   i_offset = nn for reversed sequences
+   off = 0 for 0 based offsets, 1 for 1-based offsets
+
+   forward (rev == 0): pos is returned unchanged and off is ignored
+   reversed (rev != 0): returns (-pos - 1) + off
+ */
+int
+seq_pos(int pos, int rev, int off) {
+  return (rev) ? ((-pos - 1) + off) : pos;
+}
+
+/* target = 0 (aa0), 1 (aa1)
+
+   d_type = display_type (annot_fmt in cal_cons.c):
+   1 (long text), d1_fmt = " Variant: %ld%c%c%ld%c : %c%ld%c";
+   2 (-m 9c code) sprintf(tmp_str, "|%c%c:%ld%c%c%ld%c",
+
+   i0_pos/i1_pos have already been converted to reverse coordinate if necessary
+
+   The text is appended to annot_var_dyn; any other d_type appends
+   nothing.  The positions are long, so they are printed with %ld.
+   ann_comment (d_type 1 only) is appended directly rather than
+   formatted through tmp_str, so its length is not limited by
+   MAX_LSTR.
+*/
+void comment_var (long i0_pos, char sp0, long i1_pos, char sp1, char o_sp1,
+                  char sim_char, const char *ann_comment,
+                  struct dyn_string_str *annot_var_dyn, int target, int d_type)
+{
+  char tmp_str[MAX_LSTR], ann_ch0, ann_ch1;
+  char *d1_fmt;
+
+  if (d_type == 1) {
+    if (target ==1) {
+      d1_fmt = " Variant: %ld%c%c%ld%c : %c%ld%c";
+      sprintf(tmp_str,d1_fmt,
+              i0_pos+1, sp0, sim_char, i1_pos+1,sp1, o_sp1,i1_pos+1,sp1);
+    }
+    else {
+      d1_fmt = " qVariant: %ld%c%c%ld%c : %c%ld%c";
+      sprintf(tmp_str,d1_fmt,
+              i0_pos+1, sp0, sim_char, i1_pos+1,sp1, o_sp1,i1_pos+1,sp0);
+    }
+
+    dyn_strcat(annot_var_dyn, tmp_str);
+
+    if (ann_comment) {
+      /* dyn_strcat() takes a non-const char * but is only asked to read it */
+      dyn_strcat(annot_var_dyn, " : ");
+      dyn_strcat(annot_var_dyn, (char *)ann_comment);
+    }
+
+    dyn_strcat(annot_var_dyn, "\n");
+  }
+  else if (d_type == 2) {
+    /* the 'V' goes on the side that carries the variant */
+    if (target == 1) {
+      ann_ch0 = 'X';
+      ann_ch1 = 'V';
+    }
+    else {
+      ann_ch0 = 'V';
+      ann_ch1 = 'X';
+    }
+
+    sprintf(tmp_str, "|%c%c:%ld%c%c%ld%c",
+            ann_ch0,ann_ch1,
+            i0_pos+1,sp0, sim_char,i1_pos+1,sp1);
+    dyn_strcat(annot_var_dyn, tmp_str);
+  }
+}
+
+/* display_push_features() empties annot_stack and appends a text
+   description of each popped annotation to annot_var_dyn.
+
+   ']' entries close a region: the region comes from *region1_p
+       (->target == 1) or *region0_p (otherwise) and is reported with
+       its score, bits, fraction identical and Q value; the region's
+       ->score is then reset to 0.  A missing region is reported on
+       stderr and skipped.
+   other entries that have a comment are reported as sites, for
+       d_type == 1 only.
+
+   d_type = 1 (long text) or 2 (-m 9c code); for any other value a
+   region adds nothing to the output.
+
+   i0_pos/i1_pos are long and are printed with %ld.  Comments are
+   appended with dyn_strcat() instead of being formatted into the
+   fixed-size tmp_lstr, and a region without a comment prints an
+   empty comment field.
+*/
+void
+display_push_features(void *annot_stack, struct dyn_string_str *annot_var_dyn,
+                      long i0_pos, char sp0, long i1_pos, char sp1, char sym,
+                      struct annot_entry **region0_p,
+                      struct annot_entry **region1_p,
+                      int tot_score, double comp, int n0, int n1,
+                      void *pstat_void, int d_type) {
+  struct annot_entry *this_annot_p;
+  double lbits, total_bits, zscore, lprob, lpercid;
+  char *ann_comment, *bp;
+  struct annot_entry *region_p;
+  char tmp_lstr[MAX_LSTR], tmp_sstr[MAX_SSTR];
+  int q_min, l_min;
+  char *dt1_fmt, *dt2_fmt;
+
+  zscore = find_z(tot_score, 1.0, n1, comp, pstat_void);
+  total_bits = zs_to_bit(zscore, n0, n1);
+
+  while ((this_annot_p = (struct annot_entry *)pop_stack(annot_stack))!=NULL) {
+
+    if (this_annot_p->label == ']') {
+      if (this_annot_p->target == 1) {
+        region_p = *region1_p;
+        if (!region_p) {
+          fprintf(stderr,"*** error [%s:%d] *** -- target==1 but region1_p is null\n",__FILE__, __LINE__);
+#ifdef DEBUG
+          fprintf(stderr,"*** qtitle: %s\n",ext_qtitle);
+#endif
+          continue;
+        }
+        q_min = region_p->a_pos+1;
+        l_min = region_p->pos+1;
+        dt2_fmt = "|XR:%d-%ld:%d-%ld:s=%d;b=%.1f;I=%.3f;Q=%.1f";
+      }
+      else {
+        region_p = *region0_p;
+        if (!region_p) {
+          fprintf(stderr,"*** error [%s:%d] *** -- target==0 but region0_p is null\n",__FILE__, __LINE__);
+#ifdef DEBUG
+          fprintf(stderr,"*** qtitle: %s\n",ext_qtitle);
+#endif
+          continue;
+        }
+        q_min = region_p->pos+1;
+        l_min = region_p->a_pos+1;
+        dt2_fmt = "|RX:%d-%ld:%d-%ld:s=%d;b=%.1f;I=%.3f;Q=%.1f";
+      }
+
+      if (region_p->score < 0) {
+        lbits = 0.0;
+        lprob = 1.0;
+      }
+      else {
+        /* the region's share of the total bit score */
+        lbits = total_bits * (double)region_p->score/tot_score;
+        zscore = find_z(region_p->score, 1.0, n1, comp, pstat_void);
+        lprob = zs_to_p(zscore);
+      }
+
+      /* convert the probability to -10*log10(p), clamped at both ends */
+      if (lprob > 0.99) lprob = 0.0;
+      else if (lprob < 1e-300) lprob = 3000.0;
+      else lprob = -10.0*log(lprob)/log(10.0);
+
+      if (region_p->n_aln > 0) {
+        lpercid = ((double)region_p->n_ident)/(double)region_p->n_aln;
+      }
+      else lpercid = -1.0;
+
+      /* nothing is appended for an unknown d_type */
+      tmp_lstr[0] = '\0';
+
+      if (d_type == 1) {
+        if (this_annot_p->target == 0) {dt1_fmt = " qRegion: %d-%ld:%d-%ld : score=%d; bits=%.1f; Id=%.3f; Q=%.1f : ";}
+        else {dt1_fmt = " Region: %d-%ld:%d-%ld : score=%d; bits=%.1f; Id=%.3f; Q=%.1f : ";}
+        sprintf(tmp_lstr, dt1_fmt, q_min, i0_pos+1,
+                l_min, i1_pos+1, region_p->score, lbits, lpercid, lprob);
+        dyn_strcat(annot_var_dyn, tmp_lstr);
+
+        /* the comment goes straight to the dynamic string, so its
+           length cannot overflow tmp_lstr */
+        if (region_p->comment) { dyn_strcat(annot_var_dyn, region_p->comment); }
+        dyn_strcat(annot_var_dyn, "\n");
+      }
+      else {
+        if (d_type == 2) {
+          sprintf(tmp_lstr,dt2_fmt,
+                  q_min, i0_pos+1,
+                  l_min, i1_pos+1, region_p->score, lbits,lpercid, lprob);
+
+          if (region_p->comment) {
+            /* only the first word of the comment is used */
+            SAFE_STRNCPY(tmp_sstr,region_p->comment,sizeof(tmp_sstr));
+            if ((bp=strchr(tmp_sstr,' '))!=NULL) { *bp = '\0';}
+            SAFE_STRNCAT(tmp_lstr,";C=",sizeof(tmp_lstr));
+            SAFE_STRNCAT(tmp_lstr,tmp_sstr,sizeof(tmp_lstr));
+          }
+        }
+        dyn_strcat(annot_var_dyn, tmp_lstr);
+      }
+      region_p->score = 0;
+    }
+    else if ((ann_comment = this_annot_p->comment)) {
+      if (d_type == 1 ) {
+        if (this_annot_p->target == 0) {dt1_fmt = " qSite:%c : %ld%c%c%ld%c : ";}
+        else {dt1_fmt = " Site:%c : %ld%c%c%ld%c : ";}
+        sprintf(tmp_lstr,dt1_fmt, this_annot_p->label,i0_pos+1, sp0,
+                sym, i1_pos+1, sp1);
+        dyn_strcat(annot_var_dyn, tmp_lstr);
+        dyn_strcat(annot_var_dyn, ann_comment);
+        dyn_strcat(annot_var_dyn, "\n");
+      }
+    }
+  }
+}
diff --git a/src/compacc2e.c b/src/compacc2e.c
new file mode 100644
index 0000000..594b930
--- /dev/null
+++ b/src/compacc2e.c
@@ -0,0 +1,4316 @@
+/* $Id: compacc2.c 1280 2014-08-21 00:47:55Z wrp $ */
+/* $Revision: 1280 $ */
+
+/* copyright (c) 1996, 1997, 1998, 1999, 2014 by William R. Pearson and
+ The Rector & Visitors of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+/* Concurrent read version */
+
+#include <stdio.h>
+#include <stdlib.h>
+#if defined(UNIX)
+#include <unistd.h>
+#endif
+#if defined(UNIX) || defined(WIN32)
+#include <sys/types.h>
+#endif
+
+#include <limits.h>
+#include <ctype.h>
+#include <float.h>
+
+#include <string.h>
+#include <time.h>
+#include <math.h>
+
+#include "defs.h"
+#include "param.h"
+#include "structs.h"
+
+#include "mm_file.h"
+#include "best_stats.h"
+
+#define XTERNAL
+#include "uascii.h"
+#include "upam.h"
+#undef XTERNAL
+
+#ifdef DEBUG
+extern char ext_qtitle[];
+#endif
+
+extern void abort ();
+
+#include "drop_func.h" /* get init_work() */
+/* drop_func.h includes dyn_string.h */
+
+void revcomp(unsigned char *seq, int n, int *c_nt);
+extern void qshuffle(unsigned char *aa0, int n0, int nm0, void *);
+#ifdef DEBUG
+unsigned long adler32(unsigned long, const unsigned char *, unsigned int);
+#endif
+
+void
+s_annot_to_aa1a(long offset, int n1, struct annot_str *annot_p, unsigned char *ann_arr, char *tmp_line);
+
+extern void add_annot_def(struct mngmsg *m_msp, char *line, int qa_flag);
+int add_annot_char(unsigned char *ann_arr, char ctmp_label);
+
+int get_annot(char *sname, struct mngmsg *, char *bline, long offset, int n1,
+ struct annot_str **annot_p,int target, int debug);
+int
+get_annot_list(char *sname, struct mngmsg *m_msp, struct beststr **bestp_arr,
+ int nbest,int target, int debug);
+void
+print_sum(FILE *fd, struct db_str *qtt, struct db_str *ntt, int in_mem, long mem_use);
+int
+check_seq_range(unsigned char *aa1b, int n1, int nsq, char *str);
+/* print timing information */
+extern void ptime (FILE *, long);
+
+/* this function consolidates code in comp_lib4.c for non-threaded, and in
+ work_thr2.c (threads) and work_comp2.c (worker nodes)
+*/
+
+/* init_aa0() - set up the query buffers and the per-frame work areas
+
+   aa0       aa0[0] holds the query (n0 residues); aa0[1..5] are aliased
+             to aa0[0], except that aa0[1] becomes a newly allocated
+             reverse-complement copy when qframe == 2
+   aa0s      (out) shuffled copy of the query, only if qshuffle_flg != 0
+   aa1s      (out) shuffle space of max_tot+1 bytes, always allocated
+   f_str     work areas filled in by init_work(), one per frame
+   qf_str    work area for the shuffled query
+
+   Each buffer allocated here begins with a '\0' guard byte and the
+   stored pointer is advanced past it.  Every allocation failure is
+   fatal (exit(1)).
+*/
+void
+init_aa0(unsigned char **aa0, int n0, int nm0,
+	 unsigned char **aa0s, unsigned char **aa1s,
+	 int qframe, int qshuffle_flg, int max_tot,
+	 struct pstruct *ppst, void **f_str, void **qf_str,
+	 void *my_rand_state) {
+  int id;
+
+  /* note that aa[5,4,3,2] are never used, but are provided so that frame
+     can range from 0 .. 5; likewise for f_str[5..2] */
+
+  aa0[5] = aa0[4] = aa0[3] = aa0[2] = aa0[1] = aa0[0];
+
+  /* zero out for SSE2/ALTIVEC -- make sure this is ALWAYS done */
+  for (id=0; id < SEQ_PAD; id++) aa0[0][n0+id] = '\0';
+
+  init_work (aa0[0], n0, ppst, &f_str[0]);
+  f_str[5] = f_str[4] = f_str[3] = f_str[2] = f_str[1] = f_str[0];
+
+  if (qframe == 2) {
+    if ((aa0[1]=(unsigned char *)calloc((size_t)n0+2+SEQ_PAD,sizeof(unsigned char)))==NULL) {
+      fprintf(stderr,"*** error [%s:%d] - cannot allocate aa01[%d]\n", __FILE__, __LINE__, n0);
+      /* the buffer is written immediately below, so failure must be fatal,
+	 as it is for the other allocations in this function */
+      exit(1);
+    }
+    *aa0[1]='\0';
+    aa0[1]++;
+    memcpy(aa0[1],aa0[0],n0+1);
+    /* for ALTIVEC/SSE2, must pad with 16 NULL's */
+    for (id=0; id<SEQ_PAD; id++) {aa0[1][n0+id]=0;}
+    revcomp(aa0[1],n0,ppst->c_nt);
+    init_work (aa0[1], n0, ppst, &f_str[1]);
+  }
+
+  if (qshuffle_flg) {
+    if ((*aa0s=(unsigned char *)calloc((size_t)n0+2+SEQ_PAD,sizeof(char)))==NULL) {
+      fprintf(stderr,"*** error [%s:%d] - cannot allocate aa0s[%d]\n",__FILE__, __LINE__, n0+2);
+      exit(1);
+    }
+    **aa0s='\0';
+    (*aa0s)++;
+    memcpy(*aa0s,aa0[0],n0);
+    qshuffle(*aa0s,n0,nm0, my_rand_state);
+    /* for SSE2/ALTIVEC, must pad with 16 NULL's */
+    for (id=0; id<SEQ_PAD; id++) {(*aa0s)[n0+id]=0;}
+    init_work (*aa0s, n0, ppst, qf_str);
+  }
+
+  /* always allocate shuffle space */
+  if((*aa1s=calloc(max_tot+1,sizeof(char))) == NULL) {
+    fprintf(stderr,"*** error [%s:%d] - unable to allocate shuffled library sequence [%d]\n", __FILE__, __LINE__, max_tot);
+    exit(1);
+  }
+  else {
+    **aa1s=0;
+    (*aa1s)++;
+  }
+}
+
+/* because it is used to pre-allocate space, maxn has various
+ constraints. For "simple" comparisons, it is simply the length of
+ the longest library sequence. But for translated comparisons, it
+ must be 3 or 6X the length of the query sequence.
+
+ In addition, however, it can be reduced to make certain that
+ sequences are read in smaller chunks. And, maxn affects how large
+ overlaps must be when sequences are read in chunks.
+*/
+
+/* reset_maxn() - derive the library segment length and overlap settings
+
+   m_msp     search state: ldb_info.maxn (optional cap), n0, max_tot and
+             ldb_info.dupn are read; ldb_info.l_overlap and ldb_info.maxt3
+             are written (and ldb_info.maxn, when the segment must be
+             enlarged for a translated search)
+   over_len  requested overlap between successive segments
+   maxn      proposed maximum library segment length
+
+   Returns the (possibly adjusted) maxn; exits if the query cannot fit.
+*/
+int
+reset_maxn(struct mngmsg *m_msp, int over_len, int maxn) {
+
+  /* reduce maxn if requested */
+  if (m_msp->ldb_info.maxn > 0 && m_msp->ldb_info.maxn < maxn) maxn = m_msp->ldb_info.maxn;
+
+  if (m_msp->qdnaseq==m_msp->ldb_info.ldnaseq || m_msp->qdnaseq==SEQT_DNA ||
+      m_msp->qdnaseq == SEQT_RNA) {/* !TFAST - either FASTA or FASTX */
+
+    if (m_msp->n0 > m_msp->max_tot - m_msp->ldb_info.dupn) {
+      fprintf(stderr,"*** error [%s:%d] - query sequence is too long %d > %d - %d %s\n",
+	      __FILE__, __LINE__,
+	      m_msp->n0,
+	      m_msp->max_tot, m_msp->ldb_info.dupn,
+	      m_msp->sqnam);
+      exit(1);
+    }
+
+    m_msp->ldb_info.l_overlap = over_len;
+    m_msp->ldb_info.maxt3 = maxn-m_msp->ldb_info.l_overlap;
+  }
+  else {	/* is TFAST */
+    if (m_msp->n0 > MAXTST) {
+      fprintf(stderr,"*** error [%s:%d] - query sequence is too long %d %s\n",
+	      __FILE__, __LINE__, m_msp->n0,m_msp->sqnam);
+      exit(1);
+    }
+
+    if (m_msp->n0*3 > maxn ) {	/* n0*3 for the three frames - this
+				   will only happen if maxn has been
+				   set low manually */
+
+      if (m_msp->n0*4+2 < m_msp->max_tot) { /* m_msg0*3 + m_msg0 */
+	/* recoverable: enlarge the segment to hold all three frames */
+	fprintf(stderr,
+		"*** error [%s:%d] - query sequence too long for library segment: %d - resetting to %d\n",
+		__FILE__, __LINE__,
+		maxn,m_msp->n0*3);
+	maxn = m_msp->ldb_info.maxn = m_msp->n0*3;
+      }
+      else {
+	/* report the limit that was actually tested (max_tot, not maxn) */
+	fprintf(stderr,"*** error [%s:%d] - query sequence too long for translated search: %d * 4 > %d %s\n",
+		__FILE__, __LINE__, m_msp->n0,m_msp->max_tot, m_msp->sqnam);
+	exit(1);
+      }
+    }
+
+    /* set up some constants for overlaps */
+    m_msp->ldb_info.l_overlap = 3*over_len;
+    m_msp->ldb_info.maxt3 = maxn-m_msp->ldb_info.l_overlap-3;
+    /* round down to a multiple of 3, then add 1 */
+    m_msp->ldb_info.maxt3 -= m_msp->ldb_info.maxt3%3;
+    m_msp->ldb_info.maxt3++;
+
+    maxn = maxn - 3; maxn -= maxn%3; maxn++;
+  }
+  return maxn;
+}
+
+
+/* scanseq() - count the residues in seq[0..n-1] whose code matches one
+   of the characters listed in str (each character is mapped to a
+   residue code through qascii[]).
+
+   Returns the number of matching positions.
+*/
+int
+scanseq(unsigned char *seq, int n, char *str) {
+  int tot,i;
+  char aaray[128];		/* this must be set > nsq */
+  const unsigned char *sp;
+
+  memset(aaray, 0, sizeof(aaray));
+
+  /* walk str once (no strlen() per iteration) and index with an unsigned
+     value, so a byte >= 0x80 can never become a negative subscript.
+     NOTE(review): qascii[] is assumed to cover 7-bit codes only, so
+     such bytes are skipped -- confirm against uascii.h */
+  for (sp = (const unsigned char *)str; *sp != '\0'; sp++) {
+    if (*sp & 0x80) continue;
+    aaray[qascii[*sp]]=1;
+  }
+
+  for (i=tot=0; i<n; i++) tot += aaray[seq[i]];
+  return tot;
+}
+
+/* subs_env takes a string, possibly with ${ENV}, and looks up all the
+ potential environment variables and substitutes them into the
+ string */
+
+/* subs_env() - copy src to dest, replacing every ${NAME} with the value
+   of the environment variable NAME.
+
+   dest       output buffer, always '\0'-terminated on return
+   src        input string; modified temporarily while scanning, but
+              restored before returning
+   dest_size  size of dest in bytes; output is truncated to fit
+
+   A '$' that is not followed by '{' is copied literally.  An unset
+   variable expands to nothing.  An unterminated "${" is reported on
+   stderr and the remainder of src is copied unchanged.
+*/
+void subs_env(char *dest, char *src, int dest_size) {
+  char *last_src, *bp, *bp1, *env_val;
+
+  if (dest_size <= 0) return;
+
+  *dest = '\0';
+  last_src = src;
+  bp = strchr(src,'$');
+
+  while (bp != NULL && strlen(dest) < (size_t)(dest_size-1)) {
+    /* append (not overwrite) the literal text before this '$' */
+    *bp = '\0';
+    strncat(dest, last_src, dest_size - strlen(dest) - 1);
+    *bp = '$';
+
+    if (*(bp+1) != '{') {	/* lone '$' - keep it */
+      strncat(dest, "$", dest_size - strlen(dest) - 1);
+      bp += 1;
+    }
+    else {	/* have ${ENV} - put it in */
+      if ((bp1 = strchr(bp+2,'}'))==NULL) {
+	fprintf(stderr, "*** error [%s:%d] - Unterminated ENV: %s\n",
+		__FILE__, __LINE__, src);
+	/* copy the rest literally, starting at the '$', so the text
+	   already appended is not duplicated */
+	last_src = bp;
+	break;
+      }
+      *bp1 = '\0';
+      env_val = getenv(bp+2);
+      *bp1 = '}';		/* always restore src, even if unset */
+      if (env_val != NULL) {
+	strncat(dest, env_val, dest_size - strlen(dest) - 1);
+      }
+      bp = bp1+1;	/* bump bp even if getenv == NULL */
+    }
+    last_src = bp;
+
+    /* now get the next ${ENV} if present */
+    bp = strchr(last_src,'$');
+  }
+
+  /* now copy the last stuff */
+  strncat(dest, last_src, dest_size - strlen(dest) - 1);
+}
+
+
+/* selectbest() - partial sort (quickselect) of bptr[0..n-1] by
+   rst.score[0], largest first, narrowing the range around rank k.
+
+   bptr  array of n pointers, reordered in place
+   k     rank of interest in the array
+   n     number of entries
+*/
+void
+selectbest(struct beststr **bptr, int k, int n)	/* k is rank in array */
+{
+  int v, i, j, l, r;
+  struct beststr *tmptr;
+
+  l=0; r=n-1;
+
+  while ( r > l ) {
+    v = bptr[r]->rst.score[0];	/* pivot: last element of the range */
+    i = l-1;
+    j = r;
+    do {
+      /* the upward scan is stopped by the pivot itself at bptr[r] */
+      while (bptr[++i]->rst.score[0] > v) ;
+      /* the downward scan has no such sentinel: stop at l so that
+	 bptr[l-1] (bptr[-1] when l==0) is never read */
+      while (bptr[--j]->rst.score[0] < v) { if (j == l) break; }
+      tmptr = bptr[i]; bptr[i]=bptr[j]; bptr[j]=tmptr;
+    } while (j > i);
+    /* undo the final (crossed) exchange and drop the pivot at i */
+    bptr[j]=bptr[i]; bptr[i]=bptr[r]; bptr[r]=tmptr;
+    if (i>=k) r = i-1;
+    if (i<=k) l = i+1;
+  }
+}
+
+/* selectbestz() - partial sort (quickselect) of bptr[0..n-1] by zscore,
+   largest first, narrowing the range around rank k.
+
+   bptr  array of n pointers, reordered in place
+   k     rank of interest in the array
+   n     number of entries
+*/
+void
+selectbestz(struct beststr **bptr, int k, int n)	/* k is rank in array */
+{
+  int i, j, l, r;
+  struct beststr *tmptr;
+  double v;
+
+  l=0; r=n-1;
+
+  while ( r > l ) {
+    v = bptr[r]->zscore;	/* pivot: last element of the range */
+    i = l-1;
+    j = r;
+    do {
+      /* the upward scan is stopped by the pivot itself at bptr[r] */
+      while (bptr[++i]->zscore > v) ;
+      /* the downward scan has no such sentinel: stop at l so that
+	 bptr[l-1] (bptr[-1] when l==0) is never read */
+      while (bptr[--j]->zscore < v) { if (j == l) break; }
+      tmptr = bptr[i]; bptr[i]=bptr[j]; bptr[j]=tmptr;
+    } while (j > i);
+    /* undo the final (crossed) exchange and drop the pivot at i */
+    bptr[j]=bptr[i]; bptr[i]=bptr[r]; bptr[r]=tmptr;
+    if (i>=k) r = i-1;
+    if (i<=k) l = i+1;
+  }
+}
+
+/* improved shellsort with high-performance increments */
+/*
+shellsort(itemType a[], int l, int r)
+{ int i, j, k, h; itemType v;
+ int incs[16] = { 1391376, 463792, 198768, 86961, 33936,
+ 13776, 4592, 1968, 861, 336,
+ 112, 48, 21, 7, 3, 1 };
+ for ( k = 0; k < 16; k++)
+ for (h = incs[k], i = l+h; i <= r; i++) {
+ v = a[i]; j = i;
+ while (j > h && a[j-h] > v) {
+ a[j] = a[j-h]; j -= h;
+ }
+ a[j] = v;
+ }
+}
+*/
+
+/* ?improved? version of sortbestz using optimal increments and fewer
+ exchanges */
+/* sortbestz() - shell sort of bptr[0..nbest-1] into decreasing zscore
+   order, using a fixed table of gap sizes */
+void sortbestz(struct beststr **bptr, int nbest)
+{
+  static const int gap_tab[14] = { 198768, 86961, 33936,
+				   13776, 4592, 1968, 861, 336,
+				   112, 48, 21, 7, 3, 1 };
+  int g, step, pos, slot;
+  struct beststr *held;
+  double key;
+
+  for (g = 0; g < 14; g++) {
+    step = gap_tab[g];
+    for (pos = step; pos < nbest; pos++) {
+      held = bptr[pos];
+      key = held->zscore;
+      /* slide smaller-scoring entries down until held fits */
+      for (slot = pos; slot >= step && bptr[slot-step]->zscore < key; slot -= step) {
+	bptr[slot] = bptr[slot-step];
+      }
+      bptr[slot] = held;
+    }
+  }
+}
+
+
+/* sort based on sequence index */
+/* sortbesti() - shell sort of bptr[0..nbest-1] on seq->index; entries
+   with larger index values are moved toward the front */
+void sortbesti(struct beststr **bptr, int nbest)
+{
+  static const int gap_tab[12] = { 33936, 13776, 4592, 1968, 861, 336,
+				   112, 48, 21, 7, 3, 1 };
+  int g, step, pos, slot;
+  struct beststr *held;
+  double key;
+
+  for (g = 0; g < 12; g++) {
+    step = gap_tab[g];
+    for (pos = step; pos < nbest; pos++) {
+      held = bptr[pos];
+      key = held->seq->index;
+      /* slide entries with a smaller index down until held fits */
+      for (slot = pos; slot >= step && bptr[slot-step]->seq->index < key; slot -= step) {
+	bptr[slot] = bptr[slot-step];
+      }
+      bptr[slot] = held;
+    }
+  }
+}
+
+/* sortbeste() - shell sort of bptr[0..nbest-1] into increasing
+   rst.escore order; the leading run of entries whose escore is
+   <= 2.0*DBL_MIN is then re-sorted with sortbestz() */
+void
+sortbeste(struct beststr **bptr, int nbest)
+{
+  static const int gap_tab[14] = { 198768, 86961, 33936,
+				   13776, 4592, 1968, 861, 336,
+				   112, 48, 21, 7, 3, 1 };
+  int g, step, pos, slot, n_zero;
+  struct beststr *held;
+  double key;
+
+  for (g = 0; g < 14; g++) {
+    step = gap_tab[g];
+    for (pos = step; pos < nbest; pos++) {
+      held = bptr[pos];
+      key = held->rst.escore;
+      /* slide larger E() entries down until held fits */
+      for (slot = pos; slot >= step && bptr[slot-step]->rst.escore > key; slot -= step) {
+	bptr[slot] = bptr[slot-step];
+      }
+      bptr[slot] = held;
+    }
+  }
+
+  /* sometimes there are many high scores with E()==0.0, sort
+     those by z() score */
+
+  for (n_zero = 0; n_zero < nbest; n_zero++) {
+    if (!(bptr[n_zero]->rst.escore <= 2.0*DBL_MIN)) break;
+  }
+  if (n_zero > 1) sortbestz(bptr,n_zero);
+}
+
+extern char *prog_func;
+extern char *verstr, *iprompt0, *refstr, *mp_verstr;
+extern long tstart, tscan, tprev, tdone; /* Timing */
+#ifdef COMP_MLIB
+extern long ttscan, ttdisp;
+#endif
+extern time_t tdstart, tddone;
+
+/* ****************************************************************
+ print command line arguments (argv_line)
+ possibly HTML header
+ !BLAST
+ please cite
+ version
+ BLAST
+ Reference version
+**************************************************************** */
+/* print_header1() - first part of the output header: the echoed command
+   line, the optional HTML preamble, and the program/version/citation
+   lines for the standard and MX_MBLAST2 formats */
+void
+print_header1(FILE *fd, const char *argv_line,
+	      const struct mngmsg *m_msp, const struct pstruct *ppst) {
+  int cite_ref;
+
+  /* non-zero when there is a non-empty reference string to print */
+  cite_ref = (refstr != NULL && refstr[0] != '\0');
+
+#ifdef PGM_DOC
+  if ((m_msp->markx & MX_M8COMMENT) || !(m_msp->markx & (MX_M8OUT+MX_MBLAST2))) {
+    fprintf(fd, "#%s\n",argv_line);
+  }
+#endif
+
+  if (m_msp->markx & MX_M11OUT) {
+    fprintf(fd, "#:lav\n\nd {\n \"%s\"\n}\n",argv_line+1);
+  }
+
+  if (m_msp->markx & MX_HTML) {
+#ifdef HTML_HEAD
+    fprintf(fd,"<html>\n<head>\n<title>%s Results</title>\n</head>\n<body>\n",prog_func);
+#endif
+    fputs("<pre>\n",fd);
+  }
+
+  if (m_msp->std_output) {
+    fprintf(fd,"%s\n",iprompt0);
+    fprintf(fd," version %s%s\n",verstr,mp_verstr);
+    if (cite_ref) {
+      fprintf(fd,"Please cite:\n %s\n",refstr);
+    }
+  }
+
+  if (m_msp->markx & MX_MBLAST2) {
+    fprintf(fd,"%s %s%s\n\n", prog_func, verstr, mp_verstr);
+    if (cite_ref) {
+      fprintf(fd,"Reference: %s\n\n", refstr);
+    }
+  }
+
+  fflush(fd);
+}
+
+/* ****************************************************************
+ MX_HTML: <pre>
+ Query:
+ 1>>>accession description # aa
+ Annotation:
+ Library:
+**************************************************************** */
+/* print_header2() - per-query header: query file name (first query
+   only), query title with strand note, annotated query positions, and
+   the library title and size; or, for commented -m 8 output, the three
+   '#' comment lines */
+void
+print_header2(FILE *fd, int qlib, char *info_qlabel, unsigned char **aa0,
+	      const struct mngmsg *m_msp, const struct pstruct *ppst,
+	      const char * info_lib_range_p) {
+  int pos;
+  char strand_str[MAX_STR];
+  double total_res;
+
+  /* if (m_msp->markx & MX_HTML) fputs("<pre>\n",fd); */
+
+  if (!m_msp->std_output) {
+    if ((m_msp->markx & (MX_M8OUT + MX_M8COMMENT)) == (MX_M8OUT+MX_M8COMMENT)) {
+      fprintf(fd,"# %s %s%s\n",prog_func,verstr,mp_verstr);
+      fprintf(fd,"# Query: %s\n",m_msp->qtitle);
+      fprintf(fd,"# Database: %s\n",m_msp->ltitle);
+    }
+  }
+  else {
+    if (qlib==1) {
+      fprintf(fd,"Query: %s\n", m_msp->tname);
+    }
+
+    /* strand note used when the query is not reverse-complemented */
+    strand_str[0]='\0';
+    if ((m_msp->qdnaseq == SEQT_DNA || m_msp->qdnaseq == SEQT_RNA)
+	&& m_msp->qframe==1) {
+      strncpy(strand_str," (forward-only)",sizeof(strand_str));
+      strand_str[sizeof(strand_str)-1]='\0';
+    }
+
+    fprintf(fd,"%3d>>>%s%s\n", qlib,
+	    m_msp->qtitle,
+	    (m_msp->revcomp ? " (reverse complement)" : strand_str));
+
+    /* check for annotation */
+    if (m_msp->ann_flg && m_msp->aa0a != NULL) {
+      fputs("Annotation: ",fd);
+      for (pos=0; pos<m_msp->n0; pos++) {
+	if (!m_msp->aa0a[pos]) continue;
+	if (m_msp->ann_arr[m_msp->aa0a[pos]] == ' ') continue;
+	fprintf(fd,"|%ld:%c%c",
+		pos+m_msp->q_off,m_msp->ann_arr[m_msp->aa0a[pos]],ppst->sq[aa0[0][pos]]);
+      }
+      fputs("\n",fd);
+    }
+
+    fprintf(fd,"Library: %s%s\n", m_msp->ltitle,info_lib_range_p);
+
+    if (m_msp->db.carry!=0) {
+      /* residue count overflowed a long: rebuild it as a double */
+      total_res = (double)m_msp->db.carry*(double)LONG_MAX + (double)m_msp->db.length;
+      fprintf(fd, " %.0f residues in %5ld library sequences\n", total_res, m_msp->db.entries);
+    }
+    else {
+      fprintf(fd, " %7ld residues in %5ld sequences\n", m_msp->db.length, m_msp->db.entries);
+    }
+  }
+
+  if (m_msp->markx & MX_HTML) fputs("</pre>\n",fd);
+  fflush(fd);
+}
+
+/* **************************************************************** */
+/* before showbest */
+/* **************************************************************** */
+/* print_header3() - MX_MBLAST2 only: database summary (first query
+   only) followed by the "Query=" / "Length=" lines */
+void print_header3(FILE *fd, int qlib, struct mngmsg *m_msp, struct pstruct *ppst) {
+
+  /* every other output format prints nothing here */
+  if (!(m_msp->markx & MX_MBLAST2)) return;
+
+  if (qlib == 1) {
+    fprintf(fd, "\nDatabase: %s\n %12ld sequences; %ld total letters\n\n\n",
+	    m_msp->ltitle, m_msp->db.entries, m_msp->db.length);
+  }
+  fprintf(fd, "\nQuery= %s\nLength=%d\n", m_msp->qtitle, m_msp->n0);
+}
+
+
+/* **************************************************************** */
+/* alignment transition */
+/* **************************************************************** */
+/* print_header4() - print the ">>>query, length type vs library" line
+   that precedes the alignments; for MX_M10FORM also the "; pg_*" lines
+   and the three caller-supplied info strings */
+void print_header4(FILE *fd, char *info_qlabel, char *argv_line, char *info_gstring3, char *info_hstring_p[2],
+		   struct mngmsg *m_msp, struct pstruct *ppst) {
+
+  if (m_msp->markx & MX_M10FORM) {
+    /* -m 10: reverse-complement queries are flagged with "-" */
+    fprintf(fd,"\n>>>%s%s, %d %s vs %s library\n",
+	    info_qlabel,(m_msp->revcomp ? "-":"\0"), m_msp->n0, m_msp->sqnam,
+	    m_msp->lname);
+    fprintf(fd,"; pg_name: %s\n",m_msp->pgm_name);
+    fprintf(fd,"; pg_ver: %s%s\n",verstr,mp_verstr);
+    fprintf(fd,"; pg_argv: %s",argv_line);
+    fputs(info_gstring3,fd);
+    fputs(info_hstring_p[0],fd);
+    fputs(info_hstring_p[1],fd);
+  }
+  else if (m_msp->std_output && (m_msp->markx & (MX_AMAP+ MX_HTML + MX_M9SUMM))) {
+    /* other formats: reverse-complement queries are flagged "_rev" */
+    fprintf(fd,"\n>>>%s%s, %d %s vs %s library\n",
+	    info_qlabel,(m_msp->revcomp ? "_rev":"\0"), m_msp->n0,
+	    m_msp->sqnam,m_msp->lname);
+  }
+}
+
+/* print_header4a() - print the ">>><<<" marker line for -m 9 / -m 10
+   style output, unless -m 8 output or an id-only show_code is active */
+void print_header4a(FILE *outfd, struct mngmsg *m_msp) {
+  if (m_msp->markx & MX_M8OUT) return;
+  if (!(m_msp->markx & (MX_M10FORM+MX_M9SUMM))) return;
+  if (m_msp->show_code == SHOW_CODE_ID || m_msp->show_code == SHOW_CODE_IDD) return;
+
+  fprintf(outfd,">>><<<\n");
+}
+
+/* print_header5() - closing section of the output: BLAST-style
+   statistics, the -m 8 comment trailer, the ">>>///" terminator, the
+   run summary from print_sum(), and the BLAST-style database/matrix
+   footer */
+void print_header5(FILE *fd, int qlib, struct db_str *qtt,
+		   struct mngmsg *m_msp, struct pstruct *ppst,
+		   int in_mem, long tot_memK) {
+
+  /* for MX_MBLAST2, show some statistics results */
+  if (m_msp->markx & MX_MBLAST2) {
+    fprintf(fd,"\n\nLambda K H\n"
+	    " %6.3f %6.3f %6.3f\n\n",ppst->pLambda,ppst->pK,ppst->pH);
+    fprintf(fd,"\nGapped\nLambda\n"
+	    " %6.3f %6.3f %6.3f\n",ppst->pLambda,ppst->pK,ppst->pH);
+    fprintf(fd,"\nEffective search space used: %ld\n\n",m_msp->db.entries);
+  }
+
+  if (m_msp->markx & MX_M8COMMENT) {
+    fprintf(fd, "# %s processed %d queries\n",prog_func,qlib);
+  }
+
+  /* terminator for -m 9 / -m 10 output, not used with -m 8 or HTML */
+  if ((m_msp->markx & (MX_M10FORM+MX_M9SUMM))
+      && !(m_msp->markx & MX_M8OUT) && !(m_msp->markx & MX_HTML)) {
+    fprintf(fd,">>>///\n");
+  }
+
+  if (m_msp->markx & MX_HTML) fputs("<pre>",fd);
+  if (m_msp->std_output) {
+    print_sum(fd, qtt, &m_msp->db, in_mem, tot_memK);
+  }
+  if (m_msp->markx & MX_HTML) fputs("</pre>\n",fd);
+#ifdef HTML_HEAD
+  if (m_msp->markx & MX_HTML) fprintf(fd,"</body>\n</html>\n");
+#endif
+
+  if (m_msp->markx & MX_MBLAST2) {
+    fprintf(fd,"\n Database: %s\n",m_msp->ltitle);
+    fprintf(fd," Number of letters in database: %ld\n",m_msp->db.length);
+    fprintf(fd," Number of sequences in database: %ld\n",m_msp->db.entries);
+    fprintf(fd,"\n\n\nMatrix: %s\n",ppst->pam_name);
+    fprintf(fd,"Gap Penalties: Existence: %d, Extension: %d\n",ppst->gdelval, ppst->ggapval);
+  }
+}
+
+/* print_annot_header() - list each annotation symbol that has a
+   definition; nothing is printed unless ann_arr_def[1] is set */
+void
+print_annot_header(FILE *fd, struct mngmsg *m_msp) {
+  int sym_ix;
+
+  if (!m_msp->ann_arr_def[1]) return;
+
+  if (m_msp->markx & MX_HTML) {fputs("<pre>",fd);}
+  fputs("Annotation symbols:\n",fd);
+
+  sym_ix = 1;
+  while (m_msp->ann_arr[sym_ix]) {
+    if (m_msp->ann_arr_def[sym_ix]) {
+      fprintf(fd, " %c : %s\n",m_msp->ann_arr[sym_ix], m_msp->ann_arr_def[sym_ix]);
+    }
+    sym_ix++;
+  }
+
+  if (m_msp->markx & MX_HTML) {fputs("</pre><hr />\n",fd);}
+}
+
+extern int fa_max_workers;
+
+/* print_sum() - print the end-of-run summary: query and library sizes,
+   the program variant with start/done timestamps, scan and display
+   times, and the function/version line.
+
+   fd        output stream
+   qtt       query totals (length, entries)
+   ntt       library totals (length, entries, carry)
+   in_mem    non-zero if the library was held in memory
+   tot_memK  memory used; shown (shifted right by 20) only when in_mem
+             is set and the value is non-zero
+*/
+void
+print_sum(FILE *fd, struct db_str *qtt, struct db_str *ntt, int in_mem, long tot_memK)
+{
+  double db_tt;
+  char tstr1[26], tstr2[26];
+  char memstr[256];
+  const char *ct_p;
+
+  /* ctime() may return NULL for a time it cannot represent; print an
+     empty timestamp in that case rather than dereferencing NULL */
+  ct_p = ctime(&tdstart);
+  strncpy(tstr1,(ct_p != NULL) ? ct_p : "",sizeof(tstr1));
+  ct_p = ctime(&tddone);
+  strncpy(tstr2,(ct_p != NULL) ? ct_p : "",sizeof(tstr2));
+  /* drop the trailing '\n' that ctime() appends */
+  tstr1[24]=tstr2[24]='\0';
+
+  /* Print timing to output file as well */
+
+  fprintf(fd, "\n%ld residues in %ld query sequences\n", qtt->length, qtt->entries);
+  if (ntt->carry == 0)
+    fprintf(fd, "%ld residues in %ld library sequences\n", ntt->length, ntt->entries);
+  else {
+    /* residue count overflowed a long: rebuild it as a double */
+    db_tt = (double)ntt->carry*(double)LONG_MAX + (double)ntt->length;
+    fprintf(fd, "%.0f residues in %ld library sequences\n", db_tt, ntt->entries);
+  }
+
+  memstr[0]='\0';
+  if (tot_memK && in_mem != 0) {
+    sprintf(memstr," in memory [%ldG]",(tot_memK >> 20));
+  }
+
+#if !defined(COMP_THR) && !defined(PCOMPLIB)
+  fprintf(fd," Scomplib [%s%s]\n start: %s done: %s\n",verstr,mp_verstr,tstr1,tstr2);
+#endif
+#if defined(COMP_THR)
+  fprintf(fd," Tcomplib [%s%s] (%d proc%s)\n start: %s done: %s\n", verstr, mp_verstr,
+	  fa_max_workers, memstr, tstr1,tstr2);
+#endif
+#if defined(PCOMPLIB)
+  fprintf(fd," Pcomplib [%s%s] (%d proc%s)\n start: %s done: %s\n", verstr, mp_verstr,
+	  fa_max_workers, memstr, tstr1,tstr2);
+#endif
+#ifndef COMP_MLIB
+  fprintf(fd," Scan time: ");
+  ptime(fd, tscan - tprev);
+  fprintf (fd," Display time: ");
+  ptime (fd, tdone - tscan);
+#else
+  fprintf(fd," Total Scan time: ");
+  ptime(fd, ttscan);
+  fprintf (fd," Total Display time: ");
+  ptime (fd, ttdisp);
+#endif
+  fprintf (fd,"\n");
+  fprintf (fd, "\nFunction used was %s [%s%s]\n", prog_func,verstr,mp_verstr);
+}
+
+extern double zs_to_Ec(double zs, long entries);
+extern double zs_to_bit(double zs, int n0, int n1);
+extern double zs_to_p(double zs);
+
+#include "aln_structs.h"
+
+void
+prhist(FILE *fd, const struct mngmsg *m_msp,
+ struct pstruct *ppst,
+ struct hist_str hist,
+ int nstats, int sstats,
+ struct db_str ntt,
+ char *stat_info2,
+ char *lib_range,
+ char **info_gstring2,
+ char **info_hstring,
+ long tscan)
+{ /* prhist() - print the score histogram and the statistics summary.
+     With zsflag_f < 0 only the library size and the Algorithm/Parameters
+     lines are printed.  Otherwise, when nstats > 20 and histograms are
+     enabled, one row is printed per bin with the observed count ('=')
+     and the expected count ('*'), followed by the statistics lines, the
+     Algorithm/Parameters lines and the scan time.  For MX_M10FORM the
+     same information is also formatted into info_hstring[0] and
+     info_hstring[1]. */
+ int i,j,hl,hll, el, ell, ev;
+ char hline[80], pch, *bp;
+ int mh1, mht;
+ int maxval, maxvalt, dotsiz, ddotsiz,doinset;
+ double cur_e, prev_e, f_int;
+ double max_dev, x_tmp;
+ double db_tt;
+ int n_chi_sq, cum_hl=0, max_i=0, max_dev_i;
+ double zs10_off;
+
+
+ if (m_msp->markx & MX_HTML) fputs("<pre>\n",fd);
+ else {fprintf(fd,"\n");}
+
+ if (ppst->zsflag_f < 0) { /* no statistics: short form, then return */
+ if (!m_msp->nohist) {
+ fprintf(fd, " %7ld residues in %5ld sequences", ntt.length,ntt.entries);
+ fprintf(fd, "%s\n",lib_range);
+ }
+ fprintf(fd,"Algorithm: %s\nParameters: %s\n",info_gstring2[0],info_gstring2[1]);
+ return;
+ }
+
+ if (nstats > 20) { /* enough scores to show a histogram */
+ zs10_off = ppst->zs_off * 10.0;
+
+ max_dev = 0.0;
+ mh1 = hist.maxh-1; /* max value for histogram */
+ mht = (3*hist.maxh-3)/4 - 1; /* x-coordinate for expansion */
+ n_chi_sq = 0;
+
+ if (!m_msp->nohist && mh1 > 0) {
+ for (i=0,maxval=0,maxvalt=0; i<hist.maxh; i++) { /* tallest bin overall, and tallest from mht on */
+ if (hist.hist_a[i] > maxval) maxval = hist.hist_a[i];
+ if (i >= mht && hist.hist_a[i]>maxvalt) maxvalt = hist.hist_a[i];
+ }
+ cum_hl = -hist.hist_a[0]; /* so the running total below leaves out bin 0 */
+ dotsiz = (maxval-1)/60+1; /* sequences per '=' in the main plot */
+ ddotsiz = (maxvalt-1)/50+1; /* sequences per '=' in the inset */
+ doinset = (ddotsiz < dotsiz && dotsiz > 2); /* inset only if it has a finer scale */
+
+ if (ppst->zsflag_f>=0)
+ fprintf(fd," opt E()\n");
+ else
+ fprintf(fd," opt\n");
+
+ prev_e = zs_to_Ec((double)(hist.min_hist-hist.histint/2)-zs10_off,hist.entries);
+ for (i=0; i<=mh1; i++) { /* one output row per histogram bin */
+ pch = (i==mh1) ? '>' : ' ';
+ pch = (i==0) ? '<' : pch;
+ hll = hl = hist.hist_a[i];
+ if (ppst->zsflag_f>=0) {
+ cum_hl += hl;
+ f_int = (double)(i*hist.histint+hist.min_hist)+(double)hist.histint/2.0;
+ cur_e = zs_to_Ec(f_int-zs10_off,hist.entries);
+ ev = el = ell = (int)(cur_e - prev_e + 0.5); /* expected count for this bin, rounded */
+ if (hl > 0 && i > 5 && i < (90-hist.min_hist)/hist.histint) {
+ x_tmp = fabs(cum_hl - cur_e); /* gap between cumulative observed and cur_e */
+ if ( x_tmp > max_dev) {
+ max_dev = x_tmp;
+ max_i = i;
+ }
+ n_chi_sq++;
+ }
+ if ((el=(el+dotsiz-1)/dotsiz) > 60) el = 60; /* scale to columns and clamp */
+ if ((ell=(ell+ddotsiz-1)/ddotsiz) > 40) ell = 40;
+ fprintf(fd,"%c%3d %5d %5d:",
+ pch,(i<mh1)?(i)*hist.histint+hist.min_hist :
+ mh1*hist.histint+hist.min_hist,hl,ev);
+ }
+ else fprintf(fd,"%c%3d %5d :",
+ pch,(i<mh1)?(i)*hist.histint+hist.min_hist :
+ mh1*hist.histint+hist.min_hist,hl);
+
+ if ((hl=(hl+dotsiz-1)/dotsiz) > 60) hl = 60; /* observed bar length, clamped */
+ if ((hll=(hll+ddotsiz-1)/ddotsiz) > 40) hll = 40;
+ for (j=0; j<hl; j++) hline[j]='=';
+ if (ppst->zsflag_f>=0) { /* mark the expected count with '*' */
+ if (el <= hl ) {
+ if (el > 0) hline[el-1]='*';
+ hline[hl]='\0';
+ }
+ else {
+ for (j = hl; j < el; j++) hline[j]=' ';
+ hline[el-1]='*';
+ hline[hl=el]='\0';
+ }
+ }
+ else hline[hl] = 0;
+ if (i==1) { /* second row carries the scale legend */
+ for (j=hl; j<10; j++) hline[j]=' ';
+ sprintf(&hline[10]," one = represents %d library sequences",dotsiz);
+ }
+ if (doinset && i == mht-2) { /* legend for the inset scale */
+ for (j = hl; j < 10; j++) hline[j]=' ';
+ sprintf(&hline[10]," inset = represents %d library sequences",ddotsiz);
+ }
+ if (i >= mht&& doinset ) { /* redraw this row at the inset scale from column 10 */
+ for (j = hl; j < 10; j++) hline[j]=' ';
+ hline[10]=':';
+ for (j = 11; j<11+hll; j++) hline[j]='=';
+ hline[11+hll]='\0';
+ if (ppst->zsflag_f>=0) {
+ if (ell <= hll) hline[10+ell]='*';
+ else {
+ for (j = 11+hll; j < 10+ell; j++) hline[j]=' ';
+ hline[10+ell] = '*';
+ hline[11+ell] = '\0';
+ }
+ }
+ }
+
+ fprintf(fd,"%s\n",hline);
+ prev_e = cur_e;
+ }
+ }
+ max_dev_i = max_i*hist.histint+hist.min_hist; /* score at the bin of largest deviation */
+ }
+ else { /* too few scores: no histogram, no deviation statistic */
+ max_dev = 0.0;
+ n_chi_sq = 0;
+ max_i = 0;
+ max_dev_i = 0;
+ }
+
+ if (ppst->zsflag_f >=0 ) {
+ if (!m_msp->nohist) {
+ if (ntt.carry==0) {
+ fprintf(fd, " %7ld residues in %5ld sequences", ntt.length, ntt.entries);
+ }
+ else { /* length overflowed a long: rebuild the total as a double */
+ db_tt = (double)ntt.carry*(double)LONG_MAX + (double)ntt.length;
+ fprintf(fd, " %.0f residues in %5ld library sequences", db_tt, ntt.entries);
+ }
+ fprintf(fd, "%s\n",lib_range);
+ }
+ fprintf(fd,"Statistics: %s\n",hist.stat_info);
+ if (stat_info2) {
+ fprintf(fd," Statistics E2: %s\n",stat_info2);
+ }
+
+#ifdef SAMP_STATS
+ fprintf(fd," statistics sampled from %ld (%d) to %ld sequences\n",
+ (hist.entries > nstats ? nstats : hist.entries),sstats, hist.entries);
+#else
+ fprintf(fd," statistics extrapolated from %ld to %ld sequences\n",
+ (hist.entries > nstats ? nstats : hist.entries),hist.entries);
+#endif
+
+ if (!m_msp->nohist && cum_hl > 0) {
+ fprintf(fd," Kolmogorov-Smirnov statistic: %6.4f (N=%d) at %3d\n",
+ max_dev/(float)cum_hl, n_chi_sq,max_dev_i);
+ }
+ if (m_msp->markx & MX_M10FORM) {
+ while ((bp=strchr(hist.stat_info,'\n'))!=NULL) *bp=' '; /* flatten stat_info onto one line */
+ if (cum_hl <= 0) cum_hl = -1; /* keep the divisor below non-zero */
+ sprintf(info_hstring[0],"; mp_extrap: %d %ld\n; mp_stats: %s\n; mp_KS: %6.4f (N=%d) at %3d\n",
+ MAX_STATS,hist.entries,hist.stat_info,max_dev/(float)cum_hl,
+ n_chi_sq,max_dev_i);
+ }
+ }
+
+ if (m_msp->markx & MX_M10FORM) {
+ if ((bp = strchr(info_gstring2[1],'\n'))!=NULL) *bp = ' '; /* hide the first newline while formatting */
+ sprintf(info_hstring[1],"; mp_Algorithm: %s\n; mp_Parameters: %s\n",info_gstring2[0],info_gstring2[1]);
+ if (bp != NULL ) *bp = '\n'; /* and put it back */
+ }
+
+ if (ppst->other_info != NULL) {
+ fputs(ppst->other_info, fd);
+ }
+
+ fprintf(fd,"Algorithm: %s\nParameters: %s\n",info_gstring2[0],info_gstring2[1]);
+
+ fprintf (fd," Scan time: ");
+ ptime(fd,tscan);
+ fprintf(fd,"\n");
+ if (!m_msp->annot1_sname[0] && m_msp->markx & MX_HTML) { /* the <pre> stays open when an annotation script is set */
+ fputs("</pre>\n<hr />\n",fd);
+ }
+
+ fflush(fd);
+}
+
+extern char prog_name[], *verstr;
+
+#ifdef PCOMPLIB
+#include "mpi.h"
+#endif
+
+void s_abort (char *p, char *p1)
+{
+ int i;
+
+ fprintf (stderr, "\n***[%s] %s%s***\n", prog_name, p, p1);
+#ifdef PCOMPLIB
+ MPI_Abort(MPI_COMM_WORLD,1);
+ MPI_Finalize();
+#endif
+ exit (1);
+}
+
+void w_abort (char *p, char *p1)
+{
+ fprintf (stderr, "\n***[%s] %s%s***\n\n", prog_name, p, p1);
+ exit (1);
+}
+
+extern struct a_res_str *
+build_ares_code(unsigned char *aa0, int n0,
+ unsigned char *aa1, struct seq_record *seq,
+ int frame, int *have_ares, int repeat_thresh,
+ const struct mngmsg *m_msp, struct pstruct *ppst,
+ void *f_str
+ );
+
+extern struct lmf_str *
+re_openlib(struct lmf_str *, int outtty);
+
+#define MAX_BLINE 2048
+#define RANLIB (m_fptr->ranlib)
+
+extern int
+re_getlib(unsigned char *, struct annot_str **,
+ int, int, int, int, int, long *, long *,
+ struct lmf_str *m_fptr);
+
+/*
+ pre_load_best loads a set of sequences using re_getlib
+
+ it should be used for getting sequences for shuffling, and for showbest() if m_msg->quiet
+
+ it both opens the m_file_p buffer, gets the bline[] descriptions,
+ and reads the actual sequences. In reading the sequences, it
+ should first allocate one large buffer so that individual buffers do not need to be freed.
+*/
+
+void
+pre_load_best(unsigned char *aa1save, int maxn,
+ struct beststr **bbp_arr, int nbest,
+ struct mngmsg *m_msp, int debug)
+{ /* pre_load_best() - for each of the nbest entries in bbp_arr[], fetch
+     the description line (RANLIB) and, where it is not already in
+     memory, the sequence (re_getlib), storing both in two buffers
+     allocated here and recorded in m_msp.  Then run the library
+     annotation script, if one is configured.  Does nothing if
+     m_msp->pre_load_done is already set. */
+ int i, n1, bl_len, tmp_bline_len, l_llen;
+ int seq_buf_len;
+ char bline[MAX_BLINE];
+ unsigned char *seq_buf_p;
+ char *bline_buf_p;
+
+ struct beststr *bbp;
+ struct lmf_str *m_fptr;
+
+ /*
+ calculate how much room we need for sequences and blines
+ */
+
+ if (m_msp->pre_load_done) return;
+
+ seq_buf_len = 1; /* one byte for the leading '\0' */
+ for (i=0; i<nbest; i++) {
+ /* we are not (currently) allocating more than n1+1, because alignment is not vectorized,
+ if it were vectorized, we would need n+16
+ */
+#ifdef DEBUG
+ if (bbp_arr[i]->n1 != bbp_arr[i]->seq->n1) {
+ fprintf(stderr,"*** error [%s:%d] - n1 (%d) != seq->n1 (%d)\n",
+ __FILE__, __LINE__, bbp_arr[i]->n1, bbp_arr[i]->seq->n1);
+ }
+#endif
+
+ if (bbp_arr[i]->seq->aa1b == NULL) { /* only sequences not yet in memory need space */
+ seq_buf_len += bbp_arr[i]->seq->n1 + 1;
+ }
+ }
+
+ /* have required sequence space (seq_buf_len), allocate it */
+
+ if ((m_msp->aa1save_buf_b=(unsigned char *)calloc(seq_buf_len, sizeof(char)))==NULL) {
+ fprintf(stderr, "*** error [%s:%d] - cannot allocate space[%d] for sequence encoding\n",
+ __FILE__, __LINE__, seq_buf_len);
+ exit(1);
+ }
+ else {
+ seq_buf_p = m_msp->aa1save_buf_b+1; /* ensure there is an initial '\0' */
+ }
+
+ /* adjust description line length */
+ l_llen = m_msp->aln.llen;
+ if ((m_msp->markx & MX_M9SUMM) && m_msp->show_code != SHOW_CODE_ID && m_msp->show_code != SHOW_CODE_IDD) {
+ l_llen += 40;
+ if (l_llen > 200) l_llen=200;
+ }
+
+ tmp_bline_len = sizeof(bline)-1;
+ if (!(m_msp->markx & MX_M10FORM) && !m_msp->long_info) {tmp_bline_len = l_llen-5;} /* shorter descriptions unless -m 10 or long_info */
+
+ /* allocate more bline than we need for simplicity */
+ if ((bline_buf_p=m_msp->bline_buf_b=(char *)calloc(nbest*tmp_bline_len, sizeof(char)))==NULL) {
+ fprintf(stderr, "*** error [%s:%d] - cannot allocate space[%d] for bline descriptions\n",
+ __FILE__, __LINE__, nbest*tmp_bline_len);
+ exit(1);
+ }
+
+ for (i=0; i<nbest; i++) {
+ bbp = bbp_arr[i];
+
+ if ((m_fptr=re_openlib(bbp->mseq->m_file_p,!m_msp->quiet))==NULL) {
+ fprintf(stderr,"*** error [%s:%d] - cannot re-open %s\n",
+ __FILE__, __LINE__, bbp->mseq->m_file_p->lb_name);
+ exit(1);
+ }
+ RANLIB(bline,tmp_bline_len,bbp->mseq->lseek,bbp->mseq->libstr,m_fptr);
+ bl_len = strlen(bline);
+ bbp->mseq->bline = bline_buf_p;
+ bbp->mseq->bline_max = m_msp->aln.llen;
+ strncpy(bbp->mseq->bline, bline, bl_len); /* no terminator copied: the calloc()ed buffer supplies it */
+ bline_buf_p += bl_len+1;
+
+ /* make sure we get annotation if present, and sequence if necessary */
+ if (bbp->seq->aa1b==NULL || (m_msp->ann_flg==1 && bbp->seq->annot_p==NULL)) {
+ n1 = re_getlib(aa1save, (m_msp->ann_flg==1) ? &(bbp->seq->annot_p) : NULL,
+ maxn,m_msp->ldb_info.maxt3, m_msp->ldb_info.l_overlap,
+ bbp->mseq->cont,m_msp->ldb_info.term_code,
+ &bbp->seq->l_offset,&bbp->seq->l_off,bbp->mseq->m_file_p);
+ if (n1 != bbp->seq->n1) { /* a length mismatch is reported but not fatal */
+ fprintf(stderr,"*** error [%s:%d] - n1[%d/%d] != n1[%d] from re_getlib() at %s [maxn:%d/maxt3:%d]\n",
+ __FILE__, __LINE__,
+ bbp->n1, bbp->seq->n1, n1, bbp->mseq->libstr, maxn, m_msp->ldb_info.maxt3);
+ }
+
+#ifdef DEBUG
+ if (adler32(1L,aa1save,n1)!=bbp->adler32_crc) {
+ fprintf(stderr,"*** error [%s:%d] - adler32_crc from re_getlib() at %d(%d): %s\n",
+ __FILE__, __LINE__,
+ bbp->mseq->index,bbp->n1, bline);
+ }
+#endif
+
+ /* if we don't have the sequence in the aa1b buffer, copy it from re_getlib */
+ if (bbp->seq->aa1b == NULL) {
+ bbp->seq->aa1b = seq_buf_p;
+ memcpy(bbp->seq->aa1b, aa1save, bbp->seq->n1+1);
+ seq_buf_p += bbp->seq->n1+1;
+ }
+ }
+ }
+
+ /* here, we are getting query annots after all the bptr[]s have been processed */
+ /* moved to comp_lib9.c */
+ /*
+ if (m_msp->annot0_sname[0]) {
+ if (get_annot(m_msp->annot0_sname, m_msp, m_msp->qtitle, m_msp->q_offset+m_msp->q_off-1,m_msp->n0, &m_msp->annot_p, 0, debug) < 0) {
+ fprintf(stderr,"*** error [%s:%d] - %s did not produce annotations\n",__FILE__, __LINE__, m_msp->annot0_sname);
+ m_msp->annot0_sname[0] = '\0';
+ }
+ if (m_msp->annot_p && m_msp->annot_p->n_annot > 0) {
+ m_msp->aa0a = m_msp->annot_p->aa1_ann;
+ }
+ if (!m_msp->ann_arr[0]) {m_msp->ann_arr[0] = ' '; m_msp->ann_arr[1] = '\0';}
+ }
+ */
+
+ /* if we have a variant annotation script, execute it and capture the output */
+ /* must do after bline is set */
+ if (m_msp->annot1_sname[0]) {
+ if (get_annot_list(m_msp->annot1_sname, m_msp, bbp_arr, nbest, 1, debug)< 0) {
+ fprintf(stderr,"*** error [%s:%d] - %s did not produce annotations for %s\n",__FILE__, __LINE__, m_msp->annot1_sname,m_msp->qtitle);
+ m_msp->annot1_sname[0] = '\0'; /* clear the script name after a failure */
+ };
+ if (!m_msp->ann_arr[0]) {m_msp->ann_arr[0] = ' '; m_msp->ann_arr[1] = '\0';}
+ }
+
+ m_msp->pre_load_done = 1;
+}
+
+/* merge_ares_chains()
+
+ seeks to merge two ares chains, producing a single chain that is
+ sorted by sw_score.
+
+ Strategy -- choose the chain with the highest score, and go down
+ it until the head of the other chain has higher score, then link
+ the other chain to the main chain, breaking the first, and
+ continue the process.
+
+ The two pointers, max_next and alt_next, keep track of the best
+ and the alternate chain
+ */
+
+
+#undef SHOW_MERGE_CHAIN
+
+struct a_res_str *
+merge_ares_chains(struct a_res_str *cur_ares,
+                  struct a_res_str *tmp_ares,
+                  int score_ix,
+                  const char *msg)
+{
+  /* max_ares  -- head of the merged chain (the value returned)
+     max_next  -- cursor on the chain currently being followed
+     prev_next -- last node already placed in the merged chain
+     alt_ares  -- head of the chain that is not being followed
+     msg is only used for the SHOW_MERGE_CHAIN trace output */
+  struct a_res_str *max_next, *max_ares, *alt_ares, *prev_next;
+
+  /* merging with an empty chain returns the other chain unchanged;
+     both tests are needed because the head scores are compared below */
+  if (!tmp_ares) return cur_ares;
+  if (!cur_ares) return tmp_ares;
+
+#ifdef SHOW_MERGE_CHAIN
+  fprintf(stderr,"cur_ares->");
+  for (max_next = cur_ares; max_next; max_next = max_next->next) {
+    fprintf(stderr,"%d->",max_next->rst.score[score_ix]);
+  }
+
+  fprintf(stderr,"||\n");
+  fprintf(stderr,"tmp_ares->");
+  for (max_next = tmp_ares; max_next; max_next = max_next->next) {
+    fprintf(stderr,"%d->",max_next->rst.score[score_ix]);
+  }
+  fprintf(stderr,"||\n");
+#endif
+
+  /* start with the chain whose head has the maximum score */
+
+  if (cur_ares->rst.score[score_ix] >= tmp_ares->rst.score[score_ix]) {
+    max_ares = max_next = prev_next = cur_ares;
+    alt_ares = tmp_ares;
+  }
+  else {
+    max_ares = max_next = prev_next = tmp_ares;
+    alt_ares = cur_ares;
+  }
+
+  while (max_next && alt_ares) {
+    /* follow the current chain while its score is at least the score
+       at the head of the other chain; this is guaranteed true for
+       the first iteration */
+    while (max_next && max_next->rst.score[score_ix] >= alt_ares->rst.score[score_ix]) {
+      prev_next = max_next;
+      max_next = max_next->next;
+    }
+    if (max_next==NULL) break;
+    else {	/* max_next->rst.score[score_ix] no longer greater, switch
+                   pointers: splice the other chain in after prev_next and
+                   make the remainder of this chain the alternate */
+      prev_next->next = alt_ares;
+      alt_ares = max_next;
+      max_next = prev_next->next;
+    }
+  }
+
+  /* we quit whenever max_next or alt_ares == NULL; if
+     (max_next==NULL), then continue adding the rest of alt_ares */
+
+  if (max_next==NULL) {
+    prev_next->next = alt_ares;
+  }
+
+
+#ifdef SHOW_MERGE_CHAIN
+  fprintf(stderr,"[%s] merge_ares->",msg);
+  for (max_next = max_ares; max_next; max_next = max_next->next) {
+    fprintf(stderr,"%d->",max_next->rst.score[score_ix]);
+  }
+  fprintf(stderr,"||\n\n");
+#endif
+
+  return max_ares;
+}
+
+/* copies from from to to shuffling */
+
+extern int my_nrand(int, void *);
+
+void
+shuffle(unsigned char *from, unsigned char *to, int n, void *rand_state)
+{
+  /* Copy n residues from from[] into to[] (unless they are the same
+     buffer), then permute to[0..n-1] in place from the last position
+     down, and write a 0 terminator at to[n].
+     my_nrand(k, rand_state) supplies the position swapped with k-1
+     (presumably a value in 0..k-1 -- confirm against my_nrand()). */
+  int remaining, pick;
+  unsigned char held;
+
+  if (to != from) {
+    memcpy((void *)to,(void *)from,n);
+  }
+
+  remaining = n;
+  while (remaining > 0) {
+    pick = my_nrand(remaining, rand_state);
+    held = to[remaining-1];
+    to[remaining-1] = to[pick];
+    to[pick] = held;
+    remaining--;
+  }
+  to[n] = 0;
+}
+
+/* shuffle3() -- shuffles DNA sequences as codons
+
+   Copies n residues from from[] to to[] (unless they are the same
+   buffer) and permutes the n/3 complete codons of to[]; residues
+   left over after the last complete codon stay in place.  to[n] is
+   set to 0.
+
+   my_nrand(i, rand_state) is used as in shuffle(): its result is
+   taken as the index (0..i-1) of the codon exchanged with codon i-1.
+*/
+void
+shuffle3(unsigned char *from, unsigned char *to, int n, void *rand_state)
+{
+  int i, j, k, i3, j3; unsigned char tmp;
+  int n3;
+
+  if (from != to) memcpy((void *)to,(void *)from,n);
+
+  n3 = n/3;
+
+  for (i=n3; i>0; i--) {
+    j = my_nrand(i, rand_state);
+    /* codon i-1 starts at residue (i-1)*3; using i*3-1 would straddle
+       two codons and touch to[n] and to[n+1] on the first pass */
+    i3 = (i-1)*3;
+    j3 = j*3;
+    for (k=0; k<3; k++) {
+      tmp = to[j3+k];
+      to[j3+k] = to[i3+k];
+      to[i3+k] = tmp;
+    }
+  }
+  to[n] = 0;
+}
+
+/* rshuffle() -- "shuffles" by reversing the sequence
+
+   Writes from[n-1], from[n-2], ... from[0] into to[0..n-1] and
+   terminates to[] with '\0' at to[n].  from and to may be the same
+   buffer, in which case the sequence is reversed in place.
+   For n <= 0 only the terminator is written, at to[0].
+*/
+void
+rshuffle(unsigned char *from, unsigned char *to, int n)
+{
+  int lo, hi;
+  unsigned char tmp;
+
+  if (n <= 0) {
+    *to = '\0';
+    return;
+  }
+
+  if (from == to) {
+    /* in place: exchange residues from both ends */
+    for (lo = 0, hi = n-1; lo < hi; lo++, hi--) {
+      tmp = to[lo];
+      to[lo] = to[hi];
+      to[hi] = tmp;
+    }
+  }
+  else {
+    /* the last residue is from[n-1]; from[n] is the terminator */
+    for (lo = 0; lo < n; lo++) {
+      to[lo] = from[n-1-lo];
+    }
+  }
+  to[n] = '\0';
+}
+
+/* selects which end of the sequence gets the short window in
+   wshuffle(); file-scope, so it is shared by every caller */
+static int ieven = 0;
+
+/* shuffle_window_() -- permutes win[0..len-1] in place using my_nrand() */
+static void
+shuffle_window_(unsigned char *win, int len, void *rand_state)
+{
+  int i, j;
+  unsigned char tmp;
+
+  for (i=len; i>0; i--) {
+    j = my_nrand(i, rand_state);
+    tmp = win[j];
+    win[j] = win[i-1];
+    win[i-1] = tmp;
+  }
+}
+
+/* wshuffle() -- copies from[] to to[], shuffling within windows
+
+   The n residues are permuted inside consecutive windows of wsiz
+   residues, so residues never leave their window.  When n is not a
+   multiple of wsiz the n%wsiz remaining residues form a short window,
+   placed at the end when ieven is set and at the start otherwise;
+   ieven is flipped on every call.  to[n] is set to 0.
+   A window size < 1 copies the sequence without shuffling.
+*/
+void
+wshuffle(unsigned char *from, unsigned char *to, int n, int wsiz, void *rand_state)
+{
+  int k, mm;
+
+  if (from != to) memcpy((void *)to,(void *)from,n);
+
+  if (wsiz < 1) {	/* avoid n%0 and a non-terminating window loop */
+    to[n] = 0;
+    return;
+  }
+
+  mm = n%wsiz;
+
+  if (ieven) {
+    /* full windows from the start; "k+wsiz <= n" includes the last
+       full window when n is an exact multiple of wsiz */
+    for (k=0; k+wsiz <= n; k += wsiz) {
+      shuffle_window_(&to[k], wsiz, rand_state);
+    }
+    /* short window at the end */
+    shuffle_window_(&to[n-mm], mm, rand_state);
+    ieven = 0;
+  }
+  else {
+    /* full windows from the end */
+    for (k=n; k>=wsiz; k -= wsiz) {
+      shuffle_window_(&to[k-wsiz], wsiz, rand_state);
+    }
+    /* short window at the start */
+    shuffle_window_(&to[0], mm, rand_state);
+    ieven = 1;
+  }
+  to[n] = 0;
+}
+
+/* sfn_cmp() -- looks for a value present in both q[] and s[]
+
+   Both arrays end with a 0 entry.  The walk always advances the
+   array holding the smaller current value, so it relies on both
+   arrays being in ascending order.
+   returns: the first shared value found, or 0 if either array is
+   exhausted first.
+*/
+int
+sfn_cmp(int *q, int *s)
+{
+  for (;;) {
+    /* equal heads: shared value (or 0 when both arrays are at their end) */
+    if (*q == *s) { return *q; }
+    /* one array is exhausted without a match */
+    if (*q == 0 || *s == 0) { return 0; }
+    if (*q < *s) { q++; }
+    else { s++; }
+  }
+}
+
+#ifndef MPI_SRC
+/* revcomp() -- reverse-complements seq[0..n-1] in place
+
+   c_nt[] maps each residue code to its complement.  Residues are
+   exchanged from both ends towards the middle, complementing each
+   one as it is moved; an unpaired middle residue is complemented
+   where it stands.  seq[n] is set to 0.
+*/
+void
+revcomp(unsigned char *seq, int n, int *c_nt)
+{
+  int left = 0, right = n-1;
+  unsigned char swapped;
+
+  while (left < right) {
+    swapped = c_nt[seq[left]];
+    seq[left] = c_nt[seq[right]];
+    seq[right] = swapped;
+    left++;
+    right--;
+  }
+  if (left == right) {	/* odd length: middle residue */
+    seq[left] = c_nt[seq[left]];
+  }
+  seq[n]=0;
+}
+#endif
+
+/* samp_stats_idx() -- check to see whether this score (or a shuff score)
+ should be included in statistics
+
+ pre_nstats -- in/out: count of candidate scores offered so far;
+ incremented by this function
+ nstats -- only read when SAMP_STATS_LESS is defined
+ rand_state -- passed through to my_nrand()
+
+ returns: jstats, -1 when the score should not be saved, otherwise
+ the slot to use (presumably an index into the caller's
+ MAX_STATS-entry sample buffer -- confirm against caller)
+*/
+int samp_stats_idx (int *pre_nstats, int nstats, void *rand_state) {
+ int jstats = -1;
+
+ /* this code works when every score can be used for statistics
+ estimates, but fails for fasta/[t]fast[xy] where only a fraction
+ of scores are used */
+
+ /* the first MAX_STATS candidates are each given their own slot */
+ if (*pre_nstats < MAX_STATS) {
+ jstats = (*pre_nstats)++;
+ }
+
+ /* here, the problem is that while we may have pre_nstats
+ possible samplings, in some cases (-M subsets, fasta,
+ [t]fast[xy] we don't have MAX_STATS samples yet. Until we
+ have MAX_STATS, we want more. But the stats_idx strategy
+ means that there may be additional samples in the buffers
+ that are not reflected in nstats.
+ */
+
+ else {
+#ifdef SAMP_STATS_LESS
+ /* now we have MAX_STATS samples
+ we want to sample 1/2 of 60K - 120K, 1/3 of 120K - 180K, etc */
+ /* check every 15K to see if we have gone past the next threshold */
+
+ /* pre_nstats cannot be incremented before the % to ensure
+ that stats_inc is incremented exactly at 60000, 120000, etc.
+ use ">=" in case increment comes later
+ tests suggest the first 60K are sampled about 25% more
+ than the rest
+ */
+ /* NOTE(review): stats_inc is not declared in this function; it
+ must be a variable defined elsewhere in the file */
+ if (nstats < MAX_STATS) {
+ jstats = MAX_STATS - my_nrand(MAX_STATS - nstats, rand_state)-1;
+ }
+ else if (((*pre_nstats)++ % (MAX_STATS/4)) == 0 &&
+ *pre_nstats >= stats_inc * MAX_STATS) {
+ stats_inc = (*pre_nstats / MAX_STATS) + 1;
+ }
+ if ((*pre_nstats % stats_inc) == 0) {
+ jstats = my_nrand(MAX_STATS, rand_state);
+ }
+#else
+ /* this sampling strategy calls my_nrand() for every
+ sequence > 60K, but provides a very uniform sampling */
+ /* draw a slot among all candidates seen so far; a draw at or
+ beyond MAX_STATS means this candidate is not kept */
+ jstats = my_nrand(++(*pre_nstats), rand_state);
+ if (jstats >= MAX_STATS) { jstats = -1;}
+#endif
+ }
+ return jstats;
+}
+
+/* **************************************************************** */
+/* build_link_data -- produces fasta file from m_msp->
+ (1) generate a temporary file name
+ (2) write out accessions \t expects to the temporary file
+ (3) run script against temporary file, producing fasta_file_expansion_file
+ (4) return fasta expansion filename for standard fasta openlib().
+
+ returns: the expansion library file name
+ **link_lib_file_p is the name of the file with the data
+ that will be removed.
+*/
+/* **************************************************************** */
+/* build_link_data()
+
+   link_lib_file_p -- out: set to the malloc()ed name of the temporary
+                      library file on success, to NULL on failure
+   m_msp           -- supplies link_lname (the script, optionally
+                      prefixed by '!' and/or '@' and followed by
+                      " <libtype>"), nskip and nshow
+   bestp_arr       -- the first nskip+nshow entries are written out
+   debug           -- when non-zero the accession file is not removed
+
+   returns: a malloc()ed "[@]<link_lib_file>[ <libtype>]" string, or
+            NULL when no linked library was produced
+*/
+char *
+build_link_data(char **link_lib_file_p,
+                struct mngmsg *m_msp, struct beststr **bestp_arr,
+                int debug) {
+  int i, status;
+  char tmp_line[MAX_SSTR];
+  char link_acc_file[MAX_STR];
+  int link_acc_fd;
+  char *link_lib_file;
+  char *link_lib_str;
+  char link_script[MAX_LSTR];
+  int link_lib_type = 0;
+  char *bp, *link_bp;
+  FILE *link_fd=NULL;		/* file for link accessions */
+
+#ifndef UNIX
+  return NULL;
+#else
+  /* get two tmpfiles, one for accessions, one for library */
+  link_acc_file[0] = '\0';
+
+  if ((link_lib_file=(char *)calloc(MAX_STR,sizeof(char)))==NULL) {
+    fprintf(stderr,"*** error [%s:%d] - [build_link_data] Cannot allocate link_lib_file",
+            __FILE__, __LINE__);
+    /* nothing to free yet; do not fall through and write to NULL */
+    *link_lib_file_p = NULL;
+    return NULL;
+  }
+  link_lib_file[0] = '\0';
+
+  if ((bp=getenv("TMP_DIR"))!=NULL) {
+    strncpy(link_acc_file,bp,sizeof(link_acc_file));
+    link_acc_file[sizeof(link_acc_file)-1] = '\0';
+    SAFE_STRNCAT(link_acc_file,"/",sizeof(link_acc_file));
+  }
+
+  SAFE_STRNCAT(link_acc_file,"link_acc_XXXXXX",sizeof(link_acc_file));
+  link_acc_fd = mkstemp(link_acc_file);
+  strncpy(link_lib_file,link_acc_file,MAX_STR);
+  link_lib_file[MAX_STR-1] = '\0';	/* terminate the buffer just copied into */
+  SAFE_STRNCAT(link_lib_file,".lib",MAX_STR);
+
+  /* write out accessions to link_acc_file */
+  if (link_acc_fd < 0 || (link_fd =fdopen(link_acc_fd,"w"))==NULL) {
+    fprintf(stderr,"*** error [%s:%d] - Cannot open link_acc_file: %s\n",
+            __FILE__, __LINE__, link_acc_file);
+    if (link_acc_fd >= 0) close(link_acc_fd);
+    goto no_links;
+  }
+
+  /* one "<accession>\t<expect>" line per hit; the accession is the
+     bline up to its first blank, which is temporarily cut there */
+  for (i=0; i<m_msp->nskip + m_msp->nshow; i++) {
+    if ((bp=strchr(bestp_arr[i]->mseq->bline,' '))!=NULL) {
+      *bp = '\0';
+    }
+    fprintf(link_fd,"%s\t%.3g\n",bestp_arr[i]->mseq->bline,bestp_arr[i]->rst.escore);
+    if (bp != NULL) *bp=' ';
+  }
+  fclose(link_fd);
+
+  /* build link_script link_acc_file > link_lib_file */
+  /* check for indirect */
+  link_bp = &m_msp->link_lname[0];
+  if (*link_bp == '!') {
+    link_bp++;
+  }
+  if (*link_bp == '@') {
+    link_bp++;
+  }
+
+  /* remove library type; it stays 0 when absent or not a number */
+  if ((bp=strchr(link_bp,' '))!=NULL) {
+    *bp = '\0';
+    if (sscanf(bp+1,"%d",&link_lib_type) != 1) {
+      link_lib_type = 0;
+    }
+  }
+
+  strncpy(link_script,link_bp,sizeof(link_script));
+  link_script[sizeof(link_script)-1] = '\0';
+  SAFE_STRNCAT(link_script," ",sizeof(link_script));
+  SAFE_STRNCAT(link_script,link_acc_file,sizeof(link_script));
+  SAFE_STRNCAT(link_script," >",sizeof(link_script));
+  SAFE_STRNCAT(link_script,link_lib_file,sizeof(link_script));
+
+  /* un-edit m_msp->link_lname */
+  if (bp != NULL) *bp = ' ';
+
+  /* run link_script link_acc_file > link_lib_file */
+  status = system(link_script);
+  if (!debug) {
+    unlink(link_acc_file);	/* this branch is only compiled with UNIX */
+  }
+
+  if (status == NO_FILE_EXIT) {	/* my specific return for no links */
+    goto no_links;
+  }
+
+  if (status < 0 || status == 127) {
+    fprintf(stderr,"*** error [%s:%d] - script: %s failed\n",
+            __FILE__, __LINE__,link_script);
+    goto no_links;
+  }
+
+  /* make sure the script left a readable library file */
+  if ((link_fd=fopen(link_lib_file,"r"))==NULL) {
+    goto no_links;
+  }
+  else fclose(link_fd);
+
+  if ((link_lib_str=(char *)calloc(MAX_STR,sizeof(char)))==NULL) {
+    fprintf(stderr,"*** error [%s:%d] - [build_link_data] Cannot allocate link_lib_str",
+            __FILE__, __LINE__);
+    goto no_links;
+  }
+
+  /* build the file string (possibly @link_lib_file libtype) */
+  link_lib_str[0]='\0';
+  if (m_msp->link_lname[0] == '@') {
+    SAFE_STRNCAT(link_lib_str,"@",MAX_STR);
+  }
+  SAFE_STRNCAT(link_lib_str,link_lib_file,MAX_STR);
+  if (link_lib_type > 0) {
+    snprintf(tmp_line,sizeof(tmp_line)," %d",link_lib_type);
+    SAFE_STRNCAT(link_lib_str,tmp_line,MAX_STR);
+  }
+
+  *link_lib_file_p = link_lib_file;
+  return link_lib_str;
+
+ no_links:
+  free(link_lib_file);
+  *link_lib_file_p = NULL;
+  return NULL;
+#endif
+}
+
+/* **************************************************************** */
+/* build_lib_db -- produces fasta file from script
+ (1) generate a temporary file name lib_db_file
+ (2) run script producing data in lib_db_file
+
+ returns: the expansion library file name
+ **db_str_file_p is the name of the file with the data
+ that will be removed.
+*/
+/* **************************************************************** */
+/* build_lib_db()
+
+   script_file -- "[@]<script>[ <libtype>]"; the string is edited
+                  while the command is built and restored afterwards
+
+   The script is run with its output redirected to a temporary file
+   in $TMP_DIR (or the current directory when TMP_DIR is not set).
+
+   returns: a malloc()ed "[@]<temporary file>[ <libtype>]" string, or
+            NULL on failure.  The temporary file name is only
+            available as part of the returned string.
+*/
+char *
+build_lib_db(char *script_file) {
+  int status;
+#ifdef UNIX
+  int lib_db_fd;
+#endif
+  char tmp_line[MAX_SSTR];
+  char *lib_db_file = NULL, *lib_db_str = NULL;
+  char lib_db_script[MAX_LSTR];
+  int lib_db_type = 0;
+  int lib_db_str_len;
+  char *bp, *lib_bp;
+
+  if ((lib_db_file=(char *)calloc(MAX_STR,sizeof(char)))==NULL) {
+    fprintf(stderr,"*** error [%s:%d] - [build_lib_db] Cannot allocate lib_db_file",
+            __FILE__, __LINE__);
+    goto no_lib;
+  }
+
+  /* lib_db_file is a pointer, so its capacity is MAX_STR, not
+     sizeof(lib_db_file) */
+  if ((bp=getenv("TMP_DIR"))!=NULL) {
+    strncpy(lib_db_file,bp,MAX_STR);
+    lib_db_file[MAX_STR-1] = '\0';
+    SAFE_STRNCAT(lib_db_file,"/",MAX_STR);
+  }
+
+  SAFE_STRNCAT(lib_db_file,"lib_db_XXXXXX",MAX_STR);
+#ifdef UNIX
+  /* mkstemp() creates the file, so the name cannot be taken by
+     someone else before the script writes to it */
+  if ((lib_db_fd = mkstemp(lib_db_file)) < 0) {
+    fprintf(stderr,"*** error [%s:%d] - [build_lib_db] cannot create temporary file: %s\n",
+            __FILE__, __LINE__, lib_db_file);
+    goto no_lib;
+  }
+  close(lib_db_fd);
+#else
+  mktemp(lib_db_file);
+#endif
+  lib_db_str_len = strlen(lib_db_file)+1;
+
+  /* check for indirect */
+  lib_bp = script_file;
+  if (*lib_bp == '@') {
+    lib_bp++;
+    lib_db_str_len++;
+  }
+  /* remove library type; it stays 0 when absent or not a number */
+  if ((bp=strchr(lib_bp,' '))!=NULL) {
+    *bp = '\0';
+    if (sscanf(bp+1,"%d",&lib_db_type) != 1) {
+      lib_db_type = 0;
+    }
+    lib_db_str_len += (strlen(bp+1)+1);
+  }
+
+  strncpy(lib_db_script,lib_bp,sizeof(lib_db_script));
+  lib_db_script[sizeof(lib_db_script)-1] = '\0';
+  SAFE_STRNCAT(lib_db_script," >",sizeof(lib_db_script));
+  SAFE_STRNCAT(lib_db_script,lib_db_file,sizeof(lib_db_script));
+
+  /* un-edit script_file */
+  if (bp != NULL) *bp = ' ';
+
+  /* run lib_db_script > lib_db_file */
+  status = system(lib_db_script);
+
+  if (status == NO_FILE_EXIT) {	/* my specific return for no links */
+    goto no_lib;
+  }
+
+  if (status < 0 || status == 127) {
+    fprintf(stderr,"*** error [%s:%d] - [build_lib_db] script: %s failed\n",
+            __FILE__, __LINE__, lib_db_script);
+    goto no_lib;
+  }
+
+  /* build the file string (possibly @lib_db_str libtype) */
+  if ((lib_db_str=calloc(lib_db_str_len+1,sizeof(char)))==NULL) {
+    fprintf(stderr,"*** error [%s:%d] - [build_lib_db] cannot allocate lib_db_str[%d]\n",
+            __FILE__, __LINE__, lib_db_str_len+1);
+    goto no_lib;
+  }
+  /* every append is bounded by the size actually allocated */
+  lib_db_str[0]='\0';
+  if (*script_file == '@') {
+    SAFE_STRNCAT(lib_db_str,"@",lib_db_str_len+1);
+  }
+  SAFE_STRNCAT(lib_db_str,lib_db_file,lib_db_str_len+1);
+  if (lib_db_type > 0) {
+    snprintf(tmp_line,sizeof(tmp_line)," %d",lib_db_type);
+    SAFE_STRNCAT(lib_db_str,tmp_line,lib_db_str_len+1);
+  }
+
+  /* the name has been copied into lib_db_str */
+  free(lib_db_file);
+  return lib_db_str;
+
+ no_lib:
+  free(lib_db_file);
+  return NULL;
+}
+
+/* used to temporarily allocate annotation array in next_annot_entry();
+ filled in by init_tmp_annot() and enlarged by update_tmp_annot() */
+struct annot_mstr {
+ int max_annot; /* number of entries allocated in tmp_arr_p[] */
+ struct annot_entry *tmp_arr_p; /* the array itself; NULL when none is allocated */
+};
+
+/* init_tmp_annot(this, size) -- initializes the structure used to track annots
+
+   Allocates a zeroed annot_entry array when *this does not hold one
+   (tmp_arr_p == NULL or max_annot <= 0); an existing array is kept
+   as it is.  The array gets size entries, but never fewer than 32.
+
+   returns: 1 on success, 0 if the array cannot be allocated
+*/
+int
+init_tmp_annot(struct annot_mstr *this, int size) {
+
+  /* only reset if array is NULL */
+  if (this->tmp_arr_p == NULL || this->max_annot <= 0) {
+    this->max_annot = (size > 32) ? size : 32;
+    if ((this->tmp_arr_p=(struct annot_entry *)calloc(this->max_annot, sizeof(struct annot_entry)))==NULL) {
+      fprintf(stderr,"*** error [%s:%d] - cannot allocate annot_entry[%d]\n",
+              __FILE__,__LINE__,this->max_annot);
+      return 0;
+    }
+  }
+  return 1;
+}
+
+/* update_tmp_annot(this) -- enlarges this->tmp_arr_p by half its size
+   (by at least one entry)
+
+   returns: 1 on success; 0 if the array cannot be enlarged, in which
+            case the existing array and max_annot are left unchanged
+*/
+int
+update_tmp_annot(struct annot_mstr *this) {
+  int new_max;
+  struct annot_entry *new_arr_p;
+
+  new_max = this->max_annot + (this->max_annot/2);
+  if (new_max <= this->max_annot) { new_max = this->max_annot + 1; }
+
+  /* realloc() leaves the old block allocated when it fails, so keep
+     the old pointer until the new one is known to be valid */
+  if ((new_arr_p = (struct annot_entry *)realloc(this->tmp_arr_p, new_max*sizeof(struct annot_entry)))==NULL) {
+    fprintf(stderr,"[*** error [%s:%d] - cannot reallocate tmp_ann_astr[%d]\n",
+            __FILE__, __LINE__, new_max);
+    return 0;
+  }
+  this->tmp_arr_p = new_arr_p;
+  this->max_annot = new_max;
+  return 1;
+}
+
+struct annot_str *
+next_annot_entry(FILE *annot_fd, char *tmp_line, int n_tmp_line,
+ struct annot_str *annot_p,
+ struct annot_mstr *mtmp_annot_p, /* this structure will end up in the seq_record */
+ struct mngmsg *m_msp, int target);
+
+/* **************************************************************** */
+/* get_annot_list -- produces fasta file from sname
+ if sname[0]=='<', read the file directly, goto (4)
+ if sname[0]=='!', run a script
+ (1) generate a temporary file name
+ (2) write out list of blines
+ (3) run m_msp->annot1_sname[] script against temporary file, producing table of annotations
+
+ (4) read in the annotations and merge them into beststr
+ (5) return number of annotations
+*/
+/* **************************************************************** */
+
+/* get_annot_list()
+
+   sname     -- "!script" to run a script, or "<file" to read a file
+   m_msp     -- receives annotation symbol definitions (add_annot_def())
+   bestp_arr -- nbest hits; annotations are attached to ->seq->annot_p
+   target    -- passed to next_annot_entry() (query 0 / library 1)
+   debug     -- when non-zero the temporary files are not removed
+
+   returns: the number of sequences that received annotations, or -1
+            on any error (every hit's n_annot is then reset to 0)
+*/
+int
+get_annot_list(char *sname, struct mngmsg *m_msp, struct beststr **bestp_arr, int nbest,
+               int target, int debug) {
+  int i, status;
+  long l_offset;
+  char tmp_line[MAX_STR];
+  char annot_bline_file[MAX_STR];
+  int annot_bline_fd;
+  char *annot_descr_file = NULL;
+  char annot_script[MAX_LSTR];
+  struct annot_str *annot_p;
+  char *bp;
+  int annot_seq_cnt;
+  FILE *annot_fd=NULL;		/* file for annot accessions */
+  struct annot_mstr mtmp_annot;	/* allows annot_arr_p to be expanded */
+
+#ifndef UNIX
+  return 0;
+#else
+
+  /* set mtmp_annot to be initialized; done first so that the error
+     exit can always test it */
+  mtmp_annot.tmp_arr_p = NULL;
+  mtmp_annot.max_annot = 0;
+
+  if (sname[0] == '!') {
+
+    /* get two tmpfiles, one for bline, one for returned annotations
+       (but it would make more sense to use popen() to get the
+       annotations back
+    */
+
+    annot_bline_file[0] = '\0';
+
+    if ((annot_descr_file=(char *)calloc(MAX_STR,sizeof(char)))==NULL) {
+      fprintf(stderr,"*** error [%s:%d] - [get_annot_list] Cannot allocate annot_file",
+              __FILE__, __LINE__);
+      goto no_annots;
+    }
+    annot_descr_file[0] = '\0';
+
+    if ((bp=getenv("TMP_DIR"))!=NULL) {
+      strncpy(annot_bline_file,bp,sizeof(annot_bline_file));
+      annot_bline_file[sizeof(annot_bline_file)-1] = '\0';
+      SAFE_STRNCAT(annot_bline_file,"/",sizeof(annot_bline_file));
+    }
+
+    SAFE_STRNCAT(annot_bline_file,"annot_bline_XXXXXX",sizeof(annot_bline_file));
+    annot_bline_fd = mkstemp(annot_bline_file);
+    strncpy(annot_descr_file,annot_bline_file,MAX_STR);
+    annot_descr_file[MAX_STR-1] = '\0';	/* terminate the buffer just copied into */
+    SAFE_STRNCAT(annot_descr_file,".annot",MAX_STR);
+
+    /* write out accessions to annot_bline_file */
+    if (annot_bline_fd < 0 || (annot_fd =fdopen(annot_bline_fd,"w"))==NULL) {
+      fprintf(stderr,"*** error [%s:%d] - Cannot open annot_bline_file: %s\n",__FILE__, __LINE__, annot_bline_file);
+      if (annot_bline_fd >= 0) close(annot_bline_fd);
+      goto no_annots;
+    }
+
+    for (i=0; i<nbest; i++) {
+      /* annot_req_flag marks a sequence that was already requested */
+      if (bestp_arr[i]->mseq->annot_req_flag) { continue; }
+      /* temporarily cut the bline at the first blank after DESCR_OFFSET */
+      if ((strlen(bestp_arr[i]->mseq->bline) > DESCR_OFFSET) &&
+          (bp=strchr(bestp_arr[i]->mseq->bline+DESCR_OFFSET,' '))!=NULL) {*bp = '\0';}
+      else {bp = NULL;}
+      /* provide sequence length with offset, but only if offset is positive */
+      l_offset = bestp_arr[i]->seq->l_offset+bestp_arr[i]->seq->l_off -1;
+      if (l_offset < 0) { l_offset = 0;}
+      fprintf(annot_fd,"%s\t%ld\n",bestp_arr[i]->mseq->bline,
+              l_offset + bestp_arr[i]->seq->n1);
+      if (bp != NULL) *bp=' ';
+      bestp_arr[i]->mseq->annot_req_flag = 1;
+    }
+    fclose(annot_fd);
+    annot_fd = NULL;		/* closed; the error exit must not close it again */
+
+    subs_env(annot_script, sname+1, sizeof(annot_script));
+    annot_script[sizeof(annot_script)-1] = '\0';
+    SAFE_STRNCAT(annot_script," ",sizeof(annot_script));
+    SAFE_STRNCAT(annot_script,annot_bline_file,sizeof(annot_script));
+    SAFE_STRNCAT(annot_script," >",sizeof(annot_script));
+    SAFE_STRNCAT(annot_script,annot_descr_file,sizeof(annot_script));
+
+    /* run annot_script annot_bline_file > annot_descr_file */
+    status = system(annot_script);
+    if (!debug) {
+      unlink(annot_bline_file);	/* this branch is only compiled with UNIX */
+    }
+
+    if (status == NO_FILE_EXIT) {	/* my specific return for no annots */
+      goto no_annots;
+    }
+
+    if (status < 0 || status == 127) {
+      fprintf(stderr,"*** error [%s:%d] - script: %s failed\n",
+              __FILE__, __LINE__, annot_script);
+      goto no_annots;
+    }
+  }
+  else if (sname[0] == '<') {
+    annot_descr_file = sname+1;	/* not allocated here, never freed here */
+  }
+  else {
+    fprintf(stderr,"*** error [%s:%d] - %s not script (!) or file (<)\n",__FILE__, __LINE__, sname);
+    goto no_annots;
+  }
+
+  /* read annotation file */
+
+  if ((annot_fd=fopen(annot_descr_file,"r"))==NULL) {
+    goto no_annots;
+  }
+
+  /* be sure to ask for annotation once */
+  for (i=0; i<nbest; i++) {
+    bestp_arr[i]->mseq->annot_req_flag = 1;
+  }
+  /* we have some annotations */
+  /* the annotation script MUST return the annotations ordered as in annot_descr_file,
+     in "fasta" form:
+
+     >bline_descr
+     pos TAB label TAB value? TAB comment (which is not read in this version)
+     1	*
+     11	V	N
+  */
+
+  /* now read the annotation/variant file */
+
+  /* read #comments, =annot_defs at beginning of file; the loop stops
+     with the first other line (expected to be a '>' line) in tmp_line */
+  tmp_line[0] = '#';
+  while (tmp_line[0] == '#' || tmp_line[0] == '=') {
+    if (tmp_line[0] == '=') add_annot_def(m_msp, tmp_line+1,1);
+    if (fgets(tmp_line, sizeof(tmp_line), annot_fd)==NULL) {
+      fprintf(stderr,"*** error [%s:%d] - premature annotation file end (%s)\n",
+              __FILE__,__LINE__, annot_descr_file);
+      goto no_annots;
+    }
+  }
+
+  annot_seq_cnt = 0;
+
+  /* now read in the annotations, but only the first time if asked multiple times */
+  for (i=0; i<nbest; i++) {
+    if (!bestp_arr[i]->mseq->annot_req_flag) {
+      continue;
+    }
+    bestp_arr[i]->mseq->annot_req_flag = 0;
+
+    /* tmp_line holds the '>' line of the next entry; it must be a
+       prefix of this hit's bline */
+    if ((bp=strchr(tmp_line,'\n'))!=NULL) *bp = '\0';
+    if ((bp=strchr(tmp_line,'\t'))!=NULL) *bp = '\0';
+    if (tmp_line[0] != '>' || strncmp(&tmp_line[1], bestp_arr[i]->mseq->bline, strlen(&tmp_line[1])) != 0) {
+      fprintf(stderr,"*** error [%s:%d] - %s description mismatch (%s:%s)\n",
+              __FILE__,__LINE__,annot_descr_file, tmp_line, bestp_arr[i]->mseq->bline);
+      goto no_annots;
+    }
+
+    annot_p = next_annot_entry(annot_fd, tmp_line, sizeof(tmp_line), bestp_arr[i]->seq->annot_p, &mtmp_annot, m_msp, target);
+
+    if (annot_p) {
+      bestp_arr[i]->seq->annot_p = annot_p;
+      s_annot_to_aa1a(bestp_arr[i]->seq->l_offset + bestp_arr[i]->seq->l_off - 1,
+                      bestp_arr[i]->seq->n1, annot_p,m_msp->ann_arr, bestp_arr[i]->mseq->libstr);
+      annot_seq_cnt++;
+      mtmp_annot.tmp_arr_p = NULL; /* prevents tmp_arr_p from being freed */
+    }
+    else {
+      if (bestp_arr[i]->seq->annot_p) {
+        bestp_arr[i]->seq->annot_p->n_annot = 0;
+      }
+    }
+  }
+
+  if (mtmp_annot.tmp_arr_p) free(mtmp_annot.tmp_arr_p);
+
+  fclose(annot_fd);
+  if (sname[0]=='!') {
+    if (!debug) {
+      unlink(annot_descr_file);
+    }
+    free(annot_descr_file);
+  }
+  return annot_seq_cnt;
+
+ no_annots:
+  /* release whatever was acquired before the error */
+  if (annot_fd) fclose(annot_fd);
+  if (mtmp_annot.tmp_arr_p) free(mtmp_annot.tmp_arr_p);
+
+  for (i=0; i<nbest; i++) {
+    if (bestp_arr[i]->seq->annot_p) {
+      if (bestp_arr[i]->seq->annot_p->n_annot > 0) {
+        bestp_arr[i]->seq->annot_p->n_annot = 0;
+      }
+    }
+  }
+  if (sname[0] == '!') {
+    /* the script output file is temporary on this path as well */
+    if (annot_descr_file && !debug) unlink(annot_descr_file);
+    free(annot_descr_file);
+  }
+  return -1;
+#endif
+}
+
+/* sort_annots() -- sorts the n_annot pointers in s_annot[] by
+   increasing ->pos, using a gapped insertion sort with the fixed gap
+   sequence 112, 48, 21, 7, 3, 1.  Only the pointers are moved. */
+void sort_annots(struct annot_entry **s_annot, int n_annot)
+{
+  int gaps[6] = { 112, 48, 21, 7, 3, 1 };
+  int pass, step, cur, slot;
+  int key;
+  struct annot_entry *moving;
+
+  for (pass = 0; pass < 6; pass++) {
+    step = gaps[pass];
+    for (cur = step; cur < n_annot; cur++) {
+      moving = s_annot[cur];
+      key = s_annot[cur]->pos;
+      /* shift larger entries up by one step until the slot for
+         moving is found */
+      for (slot = cur; slot >= step && s_annot[slot-step]->pos > key; slot -= step) {
+        s_annot[slot] = s_annot[slot - step];
+      }
+      s_annot[slot] = moving;
+    }
+  }
+}
+
+/* next_annot_entry -- reads and parses one annotation entry
+   reads lines from file *annot_fd
+   saves results to annot_str *annot_p (initially allocated as mtmp_annot_p, and expanded using mtmp_annot_p)
+   also initializes annot_arr_p, s_annot_arr_p
+
+   tmp_line[n_tmp_line] -- on entry the '>' line of this entry (kept
+                   for error messages); on return the '>' line of the
+                   next entry, or the last line read at end of file
+   annot_p      -- existing annot_str to fill in, or NULL to allocate one
+   mtmp_annot_p -- expandable work array; always points to the current
+                   array on return, so the caller can free it when
+                   NULL is returned
+   target       -- 1: values are encoded with lascii, otherwise qascii
+
+   returns: annot_p with n_annot entries, or NULL when the entry has
+            no annotations, has an unpaired '[', or memory runs out
+*/
+
+struct annot_str *
+next_annot_entry(FILE *annot_fd, char *tmp_line, int n_tmp_line, struct annot_str *annot_p,
+                 struct annot_mstr *mtmp_annot_p, struct mngmsg *m_msp, int target) {
+
+  char ctmp_label, ctmp_value, tmp_comment[MAX_STR], annot_acc[MAX_STR];
+  char *bp;
+  int f_pos, f_end;
+  int i_ann, l_doms;
+  int n_annot = 0;
+  int last_left_bracket = -1;
+  struct annot_entry *tmp_ann_entry_arr, *fit_ann_entry_arr, **s_tmp_ann_entry_arr;
+
+  int *t_ascii = lascii;	/* generally, annotation target is library */
+  if (target != 1) t_ascii = qascii;	/* for TFAST, target is only query */
+
+  SAFE_STRNCPY(annot_acc, tmp_line, sizeof(annot_acc));
+
+  /* initialize a 32-entry annot_entry array */
+  /* temporary, expandable place for annotations */
+  if (init_tmp_annot(mtmp_annot_p, 32)==0) return NULL;
+  tmp_ann_entry_arr = mtmp_annot_p->tmp_arr_p;
+
+  l_doms = 0;
+
+  /* read through each annotation in file */
+  while (fgets(tmp_line, n_tmp_line, annot_fd)!=NULL ) {
+    if (tmp_line[0] == '>') goto next_bline;	/* start of new annotation */
+    if (tmp_line[0] == '#') continue;		/* ignore comments */
+    if (tmp_line[0] == '=') {			/* symbol definition */
+      add_annot_def(m_msp, tmp_line+1,1);
+      continue;
+    }
+
+    if (n_annot >= mtmp_annot_p->max_annot - 1) {
+      /* try to expand annotation array */
+      if (update_tmp_annot(mtmp_annot_p)==0) {
+        return NULL;
+      }
+      tmp_ann_entry_arr = mtmp_annot_p->tmp_arr_p;
+    }
+
+    /* sscanf cannot give strings with blanks */
+    /* sscanf(tmp_line,"%d %c %c %s", &f_pos, &ctmp_label, &ctmp_value, tmp_comment); */
+    tmp_comment[0] = '\0';
+    if ((bp=strchr(tmp_line,'\r')) || (bp=strchr(tmp_line,'\n'))) *bp='\0';  /* clean up EOL */
+    if ((bp=strchr(tmp_line,'\t'))==NULL) {	/* fields MUST be tab delimited */
+      continue;					/* no tab */
+    }
+
+    f_pos=atoi(tmp_line) - 1;	/* get first field -- f_pos, converted to 0-offset */
+    ctmp_label = bp[1];		/* get second field -- ctmp_label */
+    /* defaults for lines that stop after the label, so that nothing
+       is left over from the previous line */
+    f_end = f_pos;
+    ctmp_value = '\0';
+    if ((bp=strchr(bp+1,'\t'))!=NULL) {	/* next field could be f_end or ctmp_value */
+      if (ctmp_label == '-') { f_end = atoi(bp+1) -1;}
+      else {ctmp_value = bp[1];}	/* have variant, not coordinate */
+      if ((bp=strchr(bp+1,'\t'))!=NULL) {	/* if last <tab>, get comment */
+        strncpy(tmp_comment,bp+1,sizeof(tmp_comment)-1);
+        tmp_comment[sizeof(tmp_comment)-1] = '\0';
+      }
+    }
+
+    if (ctmp_label != ']') {	/* anything except ']' needs to be recorded */
+      tmp_ann_entry_arr[n_annot].pos = f_pos;
+      tmp_ann_entry_arr[n_annot].end = f_end;
+      tmp_ann_entry_arr[n_annot].label=ctmp_label;
+      tmp_ann_entry_arr[n_annot].value=ctmp_value;
+      tmp_ann_entry_arr[n_annot].comment = NULL;
+      tmp_ann_entry_arr[n_annot].target = target;	/* query (0) or library (1) */
+    }
+    else {	/* ctmp_label == ']' -- closing domain */
+      if (last_left_bracket < 0) {
+        fprintf(stderr,"*** error [%s:%d] -- next_annot_entry(%s) - ']' without '[': %s\n",
+                __FILE__,__LINE__, annot_acc, tmp_line);
+        continue;
+      }
+      /* turn the pending '[' entry into a complete '-' domain */
+      tmp_ann_entry_arr[last_left_bracket].end = f_pos;
+      tmp_ann_entry_arr[last_left_bracket].label = '-';
+      last_left_bracket = -1;
+    }
+
+    if (f_end < f_pos) {
+      fprintf(stderr,"*** error [%s:%d] -- %s: domain start (%d) > domain end (%d)\n",
+              __FILE__,__LINE__, annot_acc, f_pos+1, f_end+1);
+      continue;
+    }
+
+    /* a ']' line does not get an entry of its own, so a comment saved
+       for it would be lost when the slot is reused */
+    if (tmp_comment[0] && ctmp_label != ']') {
+      if ((tmp_ann_entry_arr[n_annot].comment=(char *)calloc(strlen(tmp_comment)+1,sizeof(char)))!=NULL) {
+        strncpy(tmp_ann_entry_arr[n_annot].comment,tmp_comment,strlen(tmp_comment));
+      }
+    }
+
+    /* fill in other information if appropriate */
+    /* characters are cast to unsigned char before being used as
+       array indices, because plain char may be negative */
+    if (ctmp_label== 'V') {
+      /* map the .value from ascii to encoded residue */
+      /* in general, it is lascii == qascii, but for TFAST, lascii != qascii */
+      tmp_ann_entry_arr[n_annot].value = t_ascii[(unsigned char)ctmp_value];
+    }
+    else if (ctmp_label == '-') {	/* have beginning and end of domain */
+      l_doms++;
+      i_ann = add_annot_char(m_msp->ann_arr, '[');
+      if (i_ann > 0) {
+        qascii['['] = NANN + i_ann;
+        m_msp->ann_arr_def[i_ann] = NULL;	/* set definition string NULL */
+      }
+      i_ann = add_annot_char(m_msp->ann_arr, ']');
+      if (i_ann > 0) {
+        qascii[']'] = NANN + i_ann;
+        m_msp->ann_arr_def[i_ann] = NULL;
+      }
+    }
+    else if (ctmp_label == '[') {	/* beginning of domain */
+      l_doms++;
+      last_left_bracket = n_annot;
+      i_ann = add_annot_char(m_msp->ann_arr, ctmp_label);
+      if (i_ann > 0) {
+        qascii['['] = NANN + i_ann;
+        m_msp->ann_arr_def[i_ann] = NULL;
+      }
+    }
+    else if (ctmp_label == ']') {	/* end of domain */
+      i_ann = add_annot_char(m_msp->ann_arr, ctmp_label);
+      if (i_ann > 0) {
+        qascii[']'] = NANN + i_ann;
+        m_msp->ann_arr_def[i_ann] = NULL;
+      }
+      continue;	/* no n_annot++, */
+    }
+    else if ((i_ann = add_annot_char(m_msp->ann_arr, ctmp_label)) > 0) {	/* not V[], active site mark */
+      m_msp->ann_arr_def[i_ann] = NULL;
+      qascii[(unsigned char)ctmp_label] = NANN + i_ann;
+    }
+    n_annot++;
+  }
+
+ next_bline:
+  if (n_annot) {  /* if we have annotations, save them and set tmp_ann_entry_arr = NULL */
+
+    /* check for unpaired '['; unpaired ']' was checked earlier */
+
+    for (i_ann=0; i_ann < n_annot; i_ann++) {
+      if (tmp_ann_entry_arr[i_ann].label == '[') {
+        fprintf(stderr,"*** error [%s:%d] -- next_annot_entry(%s) - unpaired '[' %d:%s\n",
+                __FILE__,__LINE__, annot_acc, i_ann, tmp_ann_entry_arr[i_ann].comment);
+        return NULL;
+      }
+    }
+
+    /* everything is paired properly */
+    /* re-allocate to exact space; if that fails the larger array is
+       still valid and is kept.  mtmp_annot_p is updated so that it
+       never refers to a block realloc() has released */
+    fit_ann_entry_arr = (struct annot_entry *)realloc(tmp_ann_entry_arr, (n_annot+1)*sizeof(struct annot_entry));
+    if (fit_ann_entry_arr != NULL) {
+      tmp_ann_entry_arr = fit_ann_entry_arr;
+      mtmp_annot_p->tmp_arr_p = fit_ann_entry_arr;
+    }
+
+    /* provide sorted array */
+    if ((s_tmp_ann_entry_arr = (struct annot_entry **)calloc((n_annot+1),sizeof(struct annot_entry *)))==NULL) {
+      fprintf(stderr,"*** error [%s:%d] -- next_annot_entry(%s) - cannot alloc s_tmp_ann_entry_arr[%d]",
+              __FILE__,__LINE__, annot_acc, n_annot+1);
+      return NULL;
+    }
+
+    for (i_ann=0; i_ann < n_annot+1; i_ann++) {
+      s_tmp_ann_entry_arr[i_ann] = &tmp_ann_entry_arr[i_ann];
+    }
+
+    sort_annots(s_tmp_ann_entry_arr,n_annot);
+
+    /* now allocate an annot_p if necessary, and link tmp_ann_entry_arr to it */
+    if (annot_p == NULL && (annot_p = calloc(1,sizeof(struct annot_str)))==NULL) {
+      free(s_tmp_ann_entry_arr);	/* the caller still owns the entry array */
+      return NULL;
+    }
+    annot_p->annot_arr_p = tmp_ann_entry_arr;
+    annot_p->s_annot_arr_p = s_tmp_ann_entry_arr;
+    annot_p->n_annot = n_annot;
+    annot_p->n_domains = l_doms;
+  }
+  else {
+    annot_p = NULL;
+  }
+  return annot_p;
+}
+
+
+/* **************************************************************** */
+/* add_annot_char(ann_arr, ctmp_label) --
+
+ (1) add annotation character to ann_arr if not present
+ (2) return i_ann if added
+*/
+/* **************************************************************** */
+
+/* Appends ctmp_label to the annotation alphabet ann_arr[] unless it
+   is already there.  An empty ann_arr[] is first set to " ".
+   returns: the index at which the character was stored, or 0 when it
+   was already present or ann_arr[] already holds MAX_FN characters. */
+int
+add_annot_char(unsigned char *ann_arr, char ctmp_label) {
+  size_t n_used;
+
+  if (ann_arr[0] == '\0') {
+    ann_arr[0] = ' ';
+    ann_arr[1] = '\0';
+  }
+
+  /* already known: nothing to add */
+  if (strchr((char *)ann_arr,ctmp_label)!=NULL) {
+    return 0;
+  }
+
+  /* check for room for another character */
+  n_used = strlen((char *)ann_arr);
+  if (n_used >= MAX_FN) {
+    fprintf(stderr,"*** error [%s:%d] -- add_annot_char - too many annotation characters: len(%s) + %c > %d\n",
+            __FILE__, __LINE__, ann_arr, ctmp_label, MAX_FN-1);
+    return 0;
+  }
+
+  ann_arr[n_used] = ctmp_label;	/* add the character */
+  ann_arr[n_used+1] = '\0';	/* guarantee null termination */
+  return (int)n_used;
+}
+
+/* **************************************************************** */
+/* get_annot -- produces fasta file from m_msp->sname script
+ (modified 20-Sept-2012 to not use intermediate file)
+
+ # (1) generate a temporary file name
+ # (2) write out one bline (or portion that include accession)
+ # (3) run sname[] script against temporary file, producing table of annotations
+ (1) run script bline_id
+ (4) read in the annotations and put them in struct annot_entry;
+ (5) modify *annot_p to point to structure
+ (6) return number of annotations
+*/
+/* **************************************************************** */
+
+/* get_annot()
+
+   sname    -- "!script": run "script "<bline id>" <length>" through
+               popen() and read its output; "<file": read the file
+   bline    -- description line; a leading '>' is skipped and the
+               text is cut at the first blank after DESCR_OFFSET
+   offset, n1 -- passed to s_annot_to_aa1a()
+   annot_p  -- in/out: replaced by the result of next_annot_entry()
+   target   -- passed to next_annot_entry()
+   debug    -- not used here
+
+   returns: the number of annotations read, 0 if there are none, -1
+            on error
+*/
+int
+get_annot(char *sname, struct mngmsg *m_msp, char *bline, long offset, int n1, struct annot_str **annot_p,
+          int target, int debug) {
+
+  char tmp_line[MAX_STR];
+  char bline_descr[MAX_STR];
+  char annot_data_file[MAX_LSTR];
+  char annot_script[MAX_LSTR];
+  long q_offset;
+
+  char *bp;
+  FILE *annot_fd=NULL;		/* file for annot accessions */
+  struct annot_mstr mtmp_annot;
+
+#ifndef UNIX
+  /* need pipes, system() */
+  return 0;
+#else
+
+  /* set mtmp_annot to be initialized */
+  mtmp_annot.tmp_arr_p = NULL;
+  mtmp_annot.max_annot = 0;
+
+  if (sname[0] == '!') {
+    /* popen implementation */
+
+    annot_data_file[0] = '\0';
+
+    if (bline[0] == '>') {
+      SAFE_STRNCPY(bline_descr, bline+1,sizeof(bline_descr));
+    }
+    else {
+      SAFE_STRNCPY(bline_descr, bline,sizeof(bline_descr));
+    }
+    if ((strlen(bline_descr) > DESCR_OFFSET) &&
+        (bp=strchr(bline_descr+DESCR_OFFSET,' '))!=NULL) {*bp = '\0';}
+    else {bp = NULL;}
+
+    q_offset = m_msp->q_offset + m_msp->q_off - 1;
+    if (q_offset < 0) { q_offset = 0;}
+    /* bounded: sname and bline_descr together can be longer than
+       annot_script[] */
+    /* NOTE(review): bline_descr is placed inside double quotes on a
+       shell command line without escaping -- confirm that it cannot
+       contain '"', '$', '`' or '\' */
+    snprintf(annot_script,sizeof(annot_script),"%s \"%s\" %ld",sname+1, bline_descr,q_offset+m_msp->n0);
+    annot_script[sizeof(annot_script)-1] = '\0';
+
+    annot_fd = popen(annot_script,"r");
+  }
+  else if (sname[0] == '<') {
+    SAFE_STRNCPY(annot_data_file,sname+1,sizeof(annot_data_file));
+    annot_fd=fopen(annot_data_file,"r");
+  }
+  else {
+    fprintf(stderr,"*** error [%s:%d] -- get_annot() - %s not script (!) or file (<)\n",__FILE__, __LINE__, sname);
+    goto no_annots;
+  }
+
+  if (!annot_fd) {
+    goto no_annots;
+  }
+  else {	/* read the annotations into the array */
+
+    /* read #comments, =annot_defs at beginning of file */
+    tmp_line[0] = '#';
+    while (tmp_line[0] == '#' || tmp_line[0] == '=') {
+      if (tmp_line[0] == '=') add_annot_def(m_msp, tmp_line+1,1);
+      if (fgets(tmp_line, sizeof(tmp_line), annot_fd)==NULL) {
+        fprintf(stderr,"*** error [%s:%d] -- get_annot() - premature annotation file end (%s)\n",
+                __FILE__,__LINE__, annot_data_file);
+        goto no_annots;
+      }
+    }
+
+    /* the first line that is not a comment or definition must be
+       the '>' description line */
+    if (tmp_line[0] != '>') {
+      fprintf(stderr,"*** error [%s:%d] -- get_annot() - no %s description: [%s]\n",
+              __FILE__,__LINE__,annot_data_file, tmp_line);
+      goto no_annots;
+    }
+
+    *annot_p = next_annot_entry(annot_fd, tmp_line, sizeof(tmp_line), *annot_p, &mtmp_annot, m_msp, target);
+
+    if (sname[0] == '!') {
+      pclose(annot_fd);
+    }
+    else {
+      fclose(annot_fd);
+    }
+
+    /* now allocate an annot_p if necessary, and link tmp_ann_entry_arr to it */
+    if (*annot_p) {
+      s_annot_to_aa1a(offset, n1, (*annot_p),m_msp->ann_arr,"get_annot");
+      return (*annot_p)->n_annot;
+    }
+    else {
+      if (mtmp_annot.tmp_arr_p) free(mtmp_annot.tmp_arr_p);
+      return 0;
+    }
+  }
+
+ no_annots:
+  /* the stream may already be open when the file content is rejected */
+  if (annot_fd) {
+    if (sname[0] == '!') { pclose(annot_fd); }
+    else { fclose(annot_fd); }
+  }
+  return -1;
+#endif
+}
+
+/* s_annot_to_aa1a -- takes an annot_entry[] and converts it to an *aa1_ann
+
+   offset   : subtracted from every annotation position to get an index
+              into the new array (incremented by one first when negative)
+   n1       : number of valid positions; n1+2 bytes are allocated
+   annot_p  : annotation list; annot_p->aa1_ann receives the new array
+   ann_arr  : NUL-terminated set of annotation labels that are kept
+   tmp_line : context string echoed in the out-of-range diagnostics
+
+   'V' entries are skipped.  '-' entries mark a region: '[' is stored at
+   the start and ']' at the end, both clipped to the window
+   (this_annot->pos / ->end are rewritten when they are clipped).  Any
+   other label found in ann_arr is stored at its own position.
+
+   If the array cannot be allocated an error is printed and
+   annot_p->aa1_ann is left unchanged.
+ */
+void
+s_annot_to_aa1a(long offset, int n1, struct annot_str *annot_p, unsigned char *ann_arr, char *tmp_line) {
+  unsigned char *aa1a_tmp;
+  int i;
+  struct annot_entry *this_annot;
+
+  if ((aa1a_tmp = (unsigned char *)calloc(n1+2,sizeof(char)))==NULL) {
+    fprintf(stderr,"*** error [%s:%d] -- s_annot_to_aa1a() - cannot allocate aa1a_ann[%d] array\n",
+            __FILE__, __LINE__, n1);
+    return;
+  }
+
+  if (offset < 0) offset++;
+
+  for (i=0; i < annot_p->n_annot; i++) {
+    this_annot = &annot_p->annot_arr_p[i];
+
+    /* skip VAR labels */
+    if (this_annot->label == 'V') { continue; }
+
+    if (this_annot->label == '-') {	/* region annotation: mark both ends */
+      if (this_annot->pos - offset >= 0) {
+        if (this_annot->pos - offset < n1) {
+          aa1a_tmp[this_annot->pos-offset] = qascii['['] - NANN;
+        }
+        else {
+          fprintf(stderr,"*** error [%s:%d] -- s_annot_to_aa1a() - attempt to write off end of aa1a_tmp[%d]: %ld -- %s\n",
+                  __FILE__,__LINE__, n1, this_annot->pos - offset, tmp_line);
+          continue;
+        }
+      }
+      else {
+        /* region starts before the window; drop it if it also ends
+           before the window, otherwise clip the start to position 0 */
+        if (this_annot->end - offset < 0) continue;
+        aa1a_tmp[0] = qascii['['] - NANN;
+        this_annot->pos = offset;
+      }
+
+      if (this_annot->end - offset < n1) {
+        aa1a_tmp[this_annot->end-offset]=qascii[']'] - NANN;
+      }
+      else {
+        /* region runs past the window; clip the end to the last
+           position.  The n1 > 0 test keeps an empty window from
+           writing aa1a_tmp[-1]. */
+        if (n1 > 0) { aa1a_tmp[n1-1] = qascii[']'] - NANN; }
+        this_annot->end = offset+n1-1;
+      }
+      continue;
+    }
+
+    /* site annotation: only labels listed in ann_arr are stored */
+    if (strchr((char *)ann_arr, this_annot->label)==NULL) {continue;}
+    if (this_annot->pos - offset < n1) {
+      if (this_annot->pos >= offset) {	/* not an error, but annotation must be in range */
+        aa1a_tmp[this_annot->pos - offset]=qascii[this_annot->label] - NANN;
+      }
+    }
+    else {
+      fprintf(stderr, "*** error [%s:%d] -- s_annot_to_aa1a() - this_annot->pos:[%ld - %ld] out of range: %d : %s\n",
+              __FILE__, __LINE__, this_annot->pos,offset, n1, tmp_line);
+    }
+  }
+  annot_p->aa1_ann = aa1a_tmp;
+}
+
+/* save_best captures much of the complexity of saving the best scores
+ and appropriately sampling the scores for statistical analysis. It
+ does the following:
+
+ (1) update s_info counts for functions like fasta/x/y that don't
+ optimize every score
+
+ (2) for every result in the buffer:
+ (a) decide if it should be used for statistical sampling
+ (b) if the number of samples > MAX_STATS, then run
+ process_hist() and update all the zscores
+ (c) reset everything for next sequence
+
+ (3) must ensure that -BIGNUM are never in best[]
+
+*/
+
+#include "thr_buf_structs.h"
+#ifndef PCOMPLIB
+#define RESULTS_BUF reader_buf
+#define XTERNAL
+#include "thr_bufs2.h"
+#else
+#define RESULTS_BUF worker_buf
+#include "pcomp_bufs.h"
+#endif
+
+extern char *prog_func; /* function label */
+extern int fa_max_workers;
+extern struct buf_head *lib_buf2_list;
+#ifdef DEBUG
+void check_rbuf(struct buf_head *cur_buf);
+#endif
+extern void get_rbuf(struct buf_head **lib_buf, int max_work_buf);
+extern void put_rbuf(struct buf_head *lib_buf, int max_work_buf);
+extern void wait_rbuf(int max_work_buf);
+extern void rbuf_done(int nthreads);
+extern void put_rbuf_done(int nthreads, struct buf_head *lib_buf,
+ int max_work_buf);
+extern int
+process_hist(struct stat_str *sptr, int nstats,
+ const struct mngmsg *m_msg,
+ struct pstruct *ppst,
+ struct hist_str *hist, void **pstat_void, struct score_count_s *s_info, int do_hist); /* return value is stored in ppst->zsflag_f by save_best()/save_best2() */
+
+extern void addhistz(double, struct hist_str *); /* scaleswn.c */
+void selectbestz(struct beststr **, int, int ); /* called by save_best()/save_best2() when nbest reaches MAX_BEST */
+extern double find_z(int score, double escore, int length, double comp, void *);
+extern double zs_to_E(double zs,int n1, int dnaseq, long entries, struct db_str db);
+extern struct beststr **bestp_arr; /* array of pointers */
+extern int nbest; /* index of the next bestp_arr[] slot filled by save_best()/save_best2() */
+extern int nstats, nqstats, nrstats, pre_nstats, kstats, shuff_tot, sstats; /* nstats, nqstats, nrstats index the next
+ free entry of stats[], qstats[], rstats[]; shuff_tot sums hdr.shuff_cnt;
+ sstats counts stats[] writes (see save_best(), save_shuf()) */
+extern double zbestcut; /* cut off for best z-score */
+extern int bestfull; /* index for selectbest() */
+extern int stats_done; /* flag for z-value processing */
+extern void *rand_state; /* passed to my_nrand() in save_shuf() */
+extern struct stat_str *stats; /* array of scores for statistics from real
+ (or shuffled) sequences*/
+extern struct stat_str *qstats; /* array of scores for shuffled query stats */
+extern struct stat_str *rstats; /* array of scores from shuffled library */
+
+/* in the current version (fasta_35_01) save_best is used by both
+ threaded and unthreaded versions */
+
+/* COPY_RST_P(d,s) -- copy the score fields of (s)->rst into (d)->rst:
+   score[0..2], valid_stat, comp, H, escore, segnum and seglen.
+   d and s are pointers to structures that have an rst member.
+   Both arguments are parenthesized so that expression arguments
+   (e.g. p+1 or *pp) bind correctly; each argument is still evaluated
+   several times, so it must not have side effects.  The plain { }
+   block form is kept so existing "COPY_RST_P(a,b);" uses are unchanged. */
+#define COPY_RST_P(d,s) \
+{ (d)->rst.score[0] = (s)->rst.score[0]; \
+  (d)->rst.score[1] = (s)->rst.score[1]; \
+  (d)->rst.score[2] = (s)->rst.score[2]; \
+  (d)->rst.valid_stat = (s)->rst.valid_stat; \
+  (d)->rst.comp = (s)->rst.comp; \
+  (d)->rst.H = (s)->rst.H; \
+  (d)->rst.escore = (s)->rst.escore; \
+  (d)->rst.segnum = (s)->rst.segnum; \
+  (d)->rst.seglen = (s)->rst.seglen; \
+}
+
+void
+save_best(struct buf_head *lib_bhead_p,
+ const struct mngmsg *m_msp, struct pstruct *ppst,
+ struct db_str *ldb, FILE *fdata,
+ struct hist_str *histp, void **pstat_voidp,
+ struct score_count_s *s_info)
+{ /* save_best -- fold one buffer of results (lib_bhead_p) into the
+ global best-score array (bestp_arr[]) and the statistics arrays
+ (stats[], qstats[]).
+
+ lib_bhead_p : buffer header; buf2_data[] / buf2_res[] hold
+ hdr.buf2_cnt parallel entries
+ m_msp : read for qshuffle, nitt1 and shuff_max
+ ppst : read for score_ix and zsflag; zsflag_f is written
+ with the process_hist() result
+ ldb : entries/length/carry are updated when
+ frame == m_msp->nitt1
+ fdata : if non-NULL, one line per scored result is written
+ histp : passed to process_hist() and addhistz()
+ pstat_voidp : statistics state passed to process_hist()/find_z()
+ s_info : the buffer's s_cnt[]/tot_scores are added to it
+
+ Nothing is done unless hdr.have_results is set and
+ hdr.buf2_cnt > 0. Entries with rst.score[0] == -BIGNUM are
+ skipped.
+ */
+ double zscore;
+ int i_score;
+ struct beststr *bbp;
+ struct buf2_data_s *rbuf_dp, *lib_buf2_dp;
+ struct buf2_res_s *rbuf_rp, *lib_buf2_rp;
+ int i, t_best, t_rbest, t_qrbest, tm_best, t_n1, sc_ix;
+ int t_valid_stat, tr_valid_stat, use_shuff, zsflag_save;
+ double e_score, tm_escore, t_rescore, t_qrescore;
+ int buf2_cnt;
+
+ if (!lib_bhead_p->hdr.have_results) return;
+ if ((buf2_cnt=lib_bhead_p->hdr.buf2_cnt) <= 0) return;
+ lib_buf2_dp = lib_bhead_p->buf2_data;
+ lib_buf2_rp = lib_bhead_p->buf2_res;
+
+ shuff_tot += lib_bhead_p->hdr.shuff_cnt;
+ s_info->s_cnt[0] += lib_bhead_p->s_cnt_info.s_cnt[0];
+ s_info->s_cnt[1] += lib_bhead_p->s_cnt_info.s_cnt[1];
+ s_info->s_cnt[2] += lib_bhead_p->s_cnt_info.s_cnt[2];
+ s_info->tot_scores += lib_bhead_p->s_cnt_info.tot_scores;;
+
+ sc_ix = ppst->score_ix;
+
+ t_best = t_rbest = t_qrbest = -BIGNUM;
+ tm_escore = t_rescore = t_qrescore = FLT_MAX;
+ t_valid_stat = tr_valid_stat = 0;
+ if (ppst->zsflag >= 10 && ppst->zsflag < 20) { use_shuff = 1;} /* use r_rst (shuffled) scores for stats[] */
+ else { use_shuff = 0;}
+
+#ifdef DEBUG
+ if (fdata) {
+ fprintf(fdata,">save_best: %d\n",buf2_cnt);
+ }
+#endif
+
+ buf2_cnt = lib_bhead_p->hdr.buf2_cnt; /* same value that was loaded in the guard above */
+ while (buf2_cnt--) { /* count down the number of results */
+ rbuf_rp = lib_buf2_rp++; /* step through the results buffer */
+ rbuf_dp = lib_buf2_dp++; /* step through the data buffer */
+
+ /* perhaps should use explicit flag to indicate no score */
+ if (rbuf_rp->rst.score[0] == -BIGNUM) continue;
+
+ /* i_score: current raw sorting score */
+ i_score = rbuf_rp->rst.score[sc_ix];
+ /* e_score, current escore */
+ e_score = rbuf_rp->rst.escore;
+
+ /* this should be done in the thread, and a sorted set of indexes
+ should be produced by the thread, so we just go down the list
+ to the zscore threshold */
+ zscore = (double)i_score; /* raw score is used as the z-score until stats_done */
+ if (stats_done) {
+ zscore=find_z(i_score, e_score, rbuf_dp->seq->n1,(double)rbuf_rp->rst.comp,
+ *pstat_voidp);
+ }
+
+ /* we have complex logic to decide:
+ (a) for multiframe results, which is the best
+ (b) information about valid stats
+ we should simply return a stats array where all this is figured
+ out in the thread.
+ */
+ t_n1 = rbuf_dp->seq->n1;
+ if (i_score > t_best) tm_best = t_best = i_score; /* NOTE(review): tm_best is assigned only here and is
+ not reset with t_best below -- confirm it is always set before stats[].score reads it */
+ if (e_score < tm_escore) tm_escore = e_score;
+ if (rbuf_rp->rst.valid_stat > t_valid_stat) {
+ t_valid_stat = 1;
+ }
+
+ /* this stuff happens only for fasts/fastm/fastf
+ again, the t_qrbest stuff should be done in the thread
+ rather than check for every hit, run through the loop
+ only if necessary.
+ */
+ if (m_msp->qshuffle) {
+ if (rbuf_rp->qr_score > t_qrbest)
+ t_qrbest = rbuf_rp->qr_score;
+ if (rbuf_rp->qr_escore < t_qrescore)
+ t_qrescore = rbuf_rp->qr_escore;
+
+ if (rbuf_dp->frame == m_msp->nitt1 && t_qrbest > 0 && nqstats < m_msp->shuff_max) {
+ qstats[nqstats].n1 = rbuf_dp->seq->n1; /* save the best score */
+ qstats[nqstats].comp = rbuf_rp->rst.comp;
+ qstats[nqstats].H = rbuf_rp->rst.H;
+ qstats[nqstats].escore = t_qrescore;
+ qstats[nqstats++].score = t_qrbest;
+ t_qrbest = -BIGNUM; /* reset t_qrbest, t_qrescore */
+ t_qrescore = FLT_MAX;
+ }
+ } /* m_msp->qshuffle */
+
+ if (use_shuff) {
+ /* this check is required because some sequences scheduled to be
+ used for statistics may not in fact be returning a score (if
+ they are outside the -M range, for example.
+ */
+ if (rbuf_rp->r_rst.score[0] == -BIGNUM) { tr_valid_stat = 0; }
+ if (rbuf_rp->r_rst.valid_stat > tr_valid_stat) {
+ tr_valid_stat = 1;
+ }
+ if (rbuf_rp->r_rst.score[sc_ix] > t_rbest) {
+ t_rbest = rbuf_rp->r_rst.score[sc_ix];
+ t_rescore = rbuf_rp->r_rst.escore;
+ }
+ }
+
+ /* need to look for frame 0 if TFASTA, then save stats at frame 6 */
+ if (fdata) {
+ fprintf(fdata,
+ "%-12s %6d %d %.5f %.5f %4d %4d %4d %2d %2d %4d %4d %4d %2d %2d %5d %8lld\n",
+ rbuf_dp->mseq->libstr, rbuf_dp->seq->n1,rbuf_dp->frame,rbuf_rp->rst.comp,rbuf_rp->rst.H,
+ rbuf_rp->rst.score[0],rbuf_rp->rst.score[1],rbuf_rp->rst.score[2],
+ t_valid_stat, rbuf_rp->rst.alg_info,
+ (rbuf_rp->r_rst.score[0]<0 ? -1 : rbuf_rp->r_rst.score[0]),
+ (rbuf_rp->r_rst.score[1]<0 ? -1 : rbuf_rp->r_rst.score[1]),
+ (rbuf_rp->r_rst.score[2]<0 ? -1 : rbuf_rp->r_rst.score[2]),
+ tr_valid_stat, rbuf_rp->r_rst.alg_info,
+ rbuf_dp->stats_idx, rbuf_dp->mseq->lseek);
+ }
+
+ /* statistics done for best score of set */
+
+ if (rbuf_dp->frame == m_msp->nitt1) {
+ ldb->entries++;
+ ldb->length += t_n1;
+ if (ldb->length > LONG_MAX) { /* NOTE(review): can only be true if ldb->length is wider than long -- confirm its type */
+ ldb->length -= LONG_MAX; ldb->carry++;
+ }
+ }
+
+ if (rbuf_dp->frame == m_msp->nitt1 && ppst->zsflag >= 0) {
+ /* if this sample should be used for statistics */
+ if (use_shuff) t_valid_stat = tr_valid_stat;
+ if (t_valid_stat) {
+ /* we've got our initial MAX_STATS values */
+ if (nstats >= MAX_STATS) {
+ if (!stats_done) {
+ zsflag_save = ppst->zsflag; /* zsflag is temporarily reduced by 20 for process_hist(), then restored */
+ if (ppst->zsflag > 20) {
+ ppst->zsflag -= 20;
+ }
+ ppst->zsflag_f = process_hist(stats,nstats,m_msp, ppst,
+ histp, pstat_voidp,s_info, 0);
+ ppst->zsflag = zsflag_save;
+ kstats = nstats;
+ if (ppst->zsflag >= 0) { /* this is redundant, but rare */
+ stats_done = 1;
+ for (i=0; i< nstats; i++) { /* NOTE(review): loop bound is nstats, not nbest -- confirm
+ bestp_arr[i]->seq is valid for every i < nstats */
+ bestp_arr[i]->zscore =
+ find_z(bestp_arr[i]->rst.score[ppst->score_ix],
+ bestp_arr[i]->rst.escore, bestp_arr[i]->seq->n1,
+ bestp_arr[i]->rst.comp, *pstat_voidp);
+ }
+ }
+ }
+ }
+ else {
+ /* this logic allows stats_idx to be over-ruled for searches
+ where every query does not generate a score */
+ rbuf_dp->stats_idx = nstats;
+ nstats++;
+ }
+ }
+
+ if (rbuf_dp->stats_idx >= 0 && t_valid_stat) {
+ if (rbuf_dp->stats_idx >= MAX_STATS || nstats > MAX_STATS) {
+ fprintf(stderr, "*** error [%s:%d] - nstats index [%d] out of range [%d,%d]\n",
+ __FILE__, __LINE__,
+ rbuf_dp->stats_idx, nstats,MAX_STATS);
+ }
+ else { /* stats_idx is in range */
+ sstats++;
+ stats[rbuf_dp->stats_idx].n1 = t_n1;
+ stats[rbuf_dp->stats_idx].comp = rbuf_rp->rst.comp;
+ stats[rbuf_dp->stats_idx].H = rbuf_rp->rst.H;
+ if (use_shuff) { /* use shuffled score */
+ stats[rbuf_dp->stats_idx].escore = t_rescore;
+ stats[rbuf_dp->stats_idx].score = t_rbest;
+ }
+ else { /* real score, not shuffled */
+ stats[rbuf_dp->stats_idx].escore = tm_escore;
+ stats[rbuf_dp->stats_idx].score = tm_best;
+ }
+ } /* end stats_idx in range */
+ } /* end have valid stats_idx */
+
+ if (t_valid_stat && stats_done && histp) {
+ addhistz(find_z(t_best, tm_escore, rbuf_dp->seq->n1, (double) rbuf_rp->rst.comp,
+ *pstat_voidp), histp);
+ }
+ /* reset best scores */
+ t_best = t_rbest = -BIGNUM;
+ tm_escore = t_rescore = FLT_MAX;
+ t_valid_stat = tr_valid_stat = 0;
+ }
+
+ /*
+ if (rbuf_rp->rst.score[ppst->score_ix] > 200) {
+ fprintf(stderr, "high score[%d]: %s %d: %d\n", rbuf_dp->seq->index,
+ rbuf_dp->mseq->libstr, rbuf_dp->seq->n1, rbuf_rp->rst.score[ppst->score_ix]);
+ }
+ */
+
+ if (zscore > zbestcut) {
+ if (nbest >= MAX_BEST) { /* bestp_arr[] is full: drop MAX_BEST/4 entries and raise zbestcut to the
+ zscore at the new boundary (selectbestz() presumably moves the best entries to the front -- confirm) */
+ bestfull = nbest-MAX_BEST/4;
+ selectbestz(bestp_arr,bestfull-1,nbest);
+ zbestcut = bestp_arr[bestfull-1]->zscore;
+ nbest = bestfull;
+ }
+ bbp = bestp_arr[nbest++];
+
+ COPY_RST_P(bbp, rbuf_rp);
+
+ bbp->seq = rbuf_dp->seq;
+ bbp->mseq = rbuf_dp->mseq;
+ bbp->n1 = rbuf_dp->seq->n1;
+#ifdef DEBUG
+ bbp->adler32_crc = rbuf_dp->seq->adler32_crc;
+#endif
+ /* rbuf_dp->best_save is set after a rbuf_dp is entered into best_str */
+ if (rbuf_dp->best_save) { /* NOTE(review): bbp->bbp_link is left unchanged when best_save is NULL --
+ confirm the slot's bbp_link is cleared elsewhere */
+ /* a previous rbuf_dp->seq is in best_str at best_save */
+ if (rbuf_dp->best_save->seq == rbuf_dp->seq) {
+ /* the best_save->seq matches the rbuf_dp->seq */
+ bbp->bbp_link = rbuf_dp->best_save;
+ /* bbp_link tells where this ->seq can be found */
+ }
+ else {
+ bbp->bbp_link = NULL;
+ }
+ }
+ rbuf_dp->best_save = bbp;
+ lib_bhead_p->hdr.have_best_save = 1;
+ bbp->zscore = zscore;
+ bbp->frame = rbuf_dp->frame;
+ }
+ }
+}
+
+void
+save_best2(struct buf_head *lib_bhead_p,
+ const struct mngmsg *m_msp, struct pstruct *ppst,
+ struct db_str *ldb, FILE *fdata,
+ struct hist_str *histp, void **pstat_voidp,
+ struct score_count_s *s_info)
+{ /* save_best2 -- same arguments as save_best(), but the buffer is
+ processed in separate passes and the per-result is_valid_stat flag
+ decides which results are used for statistics:
+ (1) if fdata is non-NULL, write one line per scored result
+ (2) copy results with zscore > zbestcut into bestp_arr[] and
+ update ldb when frame == m_msp->nitt1
+ (3) store results with is_valid_stat set in stats[]
+ (4) if m_msp->qshuffle, store qr_score/qr_escore in qstats[]
+ (5) once nstats >= MAX_STATS and stats are not yet done, run
+ process_hist() and recompute bestp_arr[]->zscore
+ Nothing is done unless hdr.have_results is set and
+ hdr.buf2_cnt > 0.
+ */
+ double zscore;
+ int i_score;
+ struct beststr *bbp;
+ struct buf2_data_s *rbuf_dp, *lib_buf2_dp;
+ struct buf2_res_s *rbuf_rp, *lib_buf2_rp;
+ int i, sc_ix;
+ int t_valid_stat, use_shuff, zsflag_save; /* t_valid_stat is not referenced in this function */
+ double e_score;
+ int buf2_cnt;
+
+ if (!lib_bhead_p->hdr.have_results) return;
+ if ((buf2_cnt = lib_bhead_p->hdr.buf2_cnt) <= 0) return;
+
+ /*
+#ifdef DEBUG
+ fprintf(stderr," save_best2: lib_bhead_p->buf2_data[0]->mseq->index/lseek: %d,%lld\n",
+ lib_bhead_p->buf2_data[0].mseq->index,lib_bhead_p->buf2_data[0].mseq->lseek);
+#endif
+ */
+ if (ppst->zsflag >= 10 && ppst->zsflag < 20) { use_shuff = 1;} /* use r_rst (shuffled) scores for stats[] */
+ else {use_shuff = 0;}
+
+ shuff_tot += lib_bhead_p->hdr.shuff_cnt;
+ s_info->s_cnt[0] += lib_bhead_p->s_cnt_info.s_cnt[0];
+ s_info->s_cnt[1] += lib_bhead_p->s_cnt_info.s_cnt[1];
+ s_info->s_cnt[2] += lib_bhead_p->s_cnt_info.s_cnt[2];
+ s_info->tot_scores += lib_bhead_p->s_cnt_info.tot_scores;;
+ sc_ix = ppst->score_ix;
+
+ /* save the raw data if requested */
+ if (fdata) {
+#ifdef DEBUG
+ fprintf(fdata,">save_best: %d\n",buf2_cnt);
+#endif
+ lib_buf2_dp = lib_bhead_p->buf2_data;
+ lib_buf2_rp = lib_bhead_p->buf2_res;
+ buf2_cnt = lib_bhead_p->hdr.buf2_cnt;
+ while (buf2_cnt--) { /* count down the number of results */
+
+ rbuf_rp = lib_buf2_rp++; /* step through the results buffer */
+ rbuf_dp = lib_buf2_dp++; /* step through the data buffer */
+
+ /* perhaps should use explicit flag to indicate no score */
+ if (rbuf_rp->rst.score[0] == -BIGNUM) continue;
+
+ fprintf(fdata,
+ "%-12s %6d %d %.5f %.5f %4d %4d %4d %2d %2d %4d %4d %4d %2d %2d %5d %8lld\n",
+ rbuf_dp->mseq->libstr, rbuf_dp->seq->n1,rbuf_dp->frame,rbuf_rp->rst.comp,rbuf_rp->rst.H,
+ rbuf_rp->rst.score[0],rbuf_rp->rst.score[1],rbuf_rp->rst.score[2],
+ rbuf_rp->is_valid_stat, rbuf_rp->rst.alg_info,
+ (rbuf_rp->r_rst.score[0]<0 ? -1 : rbuf_rp->r_rst.score[0]),
+ (rbuf_rp->r_rst.score[1]<0 ? -1 : rbuf_rp->r_rst.score[1]),
+ (rbuf_rp->r_rst.score[2]<0 ? -1 : rbuf_rp->r_rst.score[2]),
+ rbuf_rp->is_valid_stat, rbuf_rp->r_rst.alg_info,
+ rbuf_dp->stats_idx, rbuf_dp->mseq->lseek);
+ }
+ }
+
+ /* save the high-scoring data */
+ lib_buf2_rp = lib_bhead_p->buf2_res;
+ lib_buf2_dp = lib_bhead_p->buf2_data;
+ buf2_cnt = lib_bhead_p->hdr.buf2_cnt;
+ while (buf2_cnt--) {
+ rbuf_rp = lib_buf2_rp++;
+ rbuf_dp = lib_buf2_dp++;
+
+ /* perhaps should use explicit flag to indicate no score */
+ if (rbuf_rp->rst.score[0] == -BIGNUM) continue;
+
+ /* i_score: current raw sorting score */
+ i_score = rbuf_rp->rst.score[sc_ix];
+ /* e_score, current escore */
+ e_score = rbuf_rp->rst.escore;
+ /* this should be done in the thread, and a sorted set of indexes
+ should be produced by the thread, so we just go down the list
+ to the zscore threshold */
+ zscore = (double)i_score; /* raw score is used as the z-score until stats_done */
+ if (stats_done) {
+ zscore=find_z(i_score, e_score, rbuf_dp->seq->n1,(double)rbuf_rp->rst.comp,
+ *pstat_voidp);
+ }
+
+ if (rbuf_dp->frame == m_msp->nitt1) {
+ ldb->entries++;
+ ldb->length += rbuf_dp->seq->n1;
+ if (ldb->length > LONG_MAX) { /* NOTE(review): can only be true if ldb->length is wider than long -- confirm its type */
+ ldb->length -= LONG_MAX; ldb->carry++;
+ }
+ }
+
+ if (zscore > zbestcut) {
+ if (nbest >= MAX_BEST) { /* bestp_arr[] is full: drop MAX_BEST/4 entries and raise zbestcut to the
+ zscore at the new boundary (selectbestz() presumably moves the best entries to the front -- confirm) */
+ bestfull = nbest-MAX_BEST/4;
+ selectbestz(bestp_arr,bestfull-1,nbest);
+ zbestcut = bestp_arr[bestfull-1]->zscore;
+ nbest = bestfull;
+ }
+ bbp = bestp_arr[nbest++];
+
+ COPY_RST_P(bbp, rbuf_rp);
+
+ bbp->seq = rbuf_dp->seq;
+ bbp->mseq = rbuf_dp->mseq;
+ bbp->n1 = rbuf_dp->seq->n1;
+#ifdef DEBUG
+ bbp->adler32_crc = rbuf_dp->seq->adler32_crc;
+#endif
+ /* rbuf_dp->best_save is set after a rbuf_dp is entered into best_str */
+ if (rbuf_dp->best_save) { /* NOTE(review): bbp->bbp_link is left unchanged when best_save is NULL --
+ confirm the slot's bbp_link is cleared elsewhere */
+ /* a previous rbuf_dp->seq is in best_str at best_save */
+ if (rbuf_dp->best_save->seq == rbuf_dp->seq) {
+ /* the best_save->seq matches the rbuf_dp->seq */
+ bbp->bbp_link = rbuf_dp->best_save;
+ /* bbp_link tells where this ->seq can be found */
+ }
+ else {
+ bbp->bbp_link = NULL;
+ }
+ }
+ rbuf_dp->best_save = bbp;
+ lib_bhead_p->hdr.have_best_save = 1;
+ bbp->zscore = zscore;
+ bbp->frame = rbuf_dp->frame;
+ }
+ }
+
+ /* process results for statistics */
+ lib_buf2_dp = lib_bhead_p->buf2_data;
+ lib_buf2_rp = lib_bhead_p->buf2_res;
+ buf2_cnt = lib_bhead_p->hdr.buf2_cnt;
+ while (buf2_cnt--) { /* count down the number of results */
+ rbuf_dp = lib_buf2_dp++; /* step through the data buffer */
+ rbuf_rp = lib_buf2_rp++; /* step through the results buffer */
+
+ if (!rbuf_rp->is_valid_stat) { continue;} /* only results flagged is_valid_stat contribute to stats[] */
+
+
+ if (use_shuff) {
+ i_score = rbuf_rp->r_rst.score[sc_ix];
+ e_score = rbuf_rp->r_rst.escore;
+ }
+ else {
+ i_score = rbuf_rp->rst.score[sc_ix];
+ e_score = rbuf_rp->rst.escore;
+ }
+
+ if (rbuf_dp->stats_idx >= MAX_STATS || nstats > MAX_STATS) {
+ fprintf(stderr, "*** error [%s:%d] - nstats index [%d] out of range [%d,%d]\n",
+ __FILE__, __LINE__,
+ rbuf_dp->stats_idx, nstats,MAX_STATS);
+ continue;
+ }
+
+ if (nstats < MAX_STATS) {
+ /* this logic allows stats_idx to be over-ruled for searches
+ where every query does not generate a score */
+ rbuf_dp->stats_idx = nstats;
+ nstats++;
+ }
+
+ if (stats_done && histp) {
+ addhistz(find_z(i_score, e_score, rbuf_dp->seq->n1, (double) rbuf_rp->rst.comp,
+ *pstat_voidp), histp);
+ }
+
+ if (rbuf_dp->stats_idx < 0) { /* no stats[] slot assigned to this entry */
+ continue;
+ }
+
+ sstats++;
+ stats[rbuf_dp->stats_idx].n1 = rbuf_dp->seq->n1;
+ stats[rbuf_dp->stats_idx].comp = rbuf_rp->rst.comp;
+ stats[rbuf_dp->stats_idx].H = rbuf_rp->rst.H;
+ stats[rbuf_dp->stats_idx].escore = e_score;
+ stats[rbuf_dp->stats_idx].score = i_score;
+ }
+
+
+ /* fill the qstats[] array if m_msp->qshuffle */
+ if (m_msp->qshuffle && nqstats < m_msp->shuff_max) {
+ lib_buf2_dp = lib_bhead_p->buf2_data;
+ lib_buf2_rp = lib_bhead_p->buf2_res;
+ buf2_cnt = lib_bhead_p->hdr.buf2_cnt;
+ while (buf2_cnt--) {
+ rbuf_rp = lib_buf2_rp++; /* step through the results buffer */
+ rbuf_dp = lib_buf2_dp++;
+
+ if (rbuf_rp->is_valid_stat && rbuf_rp->qr_score > 0
+ && nqstats < m_msp->shuff_max) {
+ qstats[nqstats].n1 = rbuf_dp->seq->n1; /* save the best score */
+ qstats[nqstats].comp = rbuf_rp->rst.comp;
+ qstats[nqstats].H = rbuf_rp->rst.H;
+ qstats[nqstats].escore = rbuf_rp->qr_escore;
+ qstats[nqstats++].score = rbuf_rp->qr_score;
+ }
+ } /* m_msp->qshuffle */
+ }
+
+ /* check if we have enough data to do stats */
+ if (!stats_done && nstats >= MAX_STATS) {
+ zsflag_save = ppst->zsflag; /* zsflag is temporarily reduced by 20 for process_hist(), then restored */
+ if (ppst->zsflag > 20) {
+ ppst->zsflag -= 20;
+ }
+ ppst->zsflag_f = process_hist(stats,nstats,m_msp, ppst,
+ histp, pstat_voidp,s_info, 0);
+ ppst->zsflag = zsflag_save;
+ kstats = nstats;
+ stats_done = 1;
+ for (i=0; i< nstats; i++) { /* NOTE(review): loop bound is nstats, not nbest -- confirm
+ bestp_arr[i]->seq is valid for every i < nstats */
+ bestp_arr[i]->zscore =
+ find_z(bestp_arr[i]->rst.score[ppst->score_ix],
+ bestp_arr[i]->rst.escore, bestp_arr[i]->seq->n1,
+ bestp_arr[i]->rst.comp, *pstat_voidp);
+ }
+ }
+
+}
+
+/* save_shuf -- collect shuffled-library scores (r_rst) from one results
+   buffer into rstats[].
+
+   lib_bhead_p : buffer header; buf2_data[]/buf2_res[] hold hdr.buf2_cnt entries
+   nitt1       : frame value at which the best score of a set is saved
+   shuff_max   : capacity of rstats[]
+   sc_ix       : index of the r_rst.score[] entry used for ranking
+   s_info      : the buffer's s_cnt[]/tot_scores are added to it
+
+   While nrstats < shuff_max scores are appended; after that a slot is
+   drawn with my_nrand(++kstats) and the score replaces that slot only
+   if the draw is below shuff_max.
+ */
+void
+save_shuf(struct buf_head *lib_bhead_p, int nitt1, int shuff_max, int sc_ix,
+          struct score_count_s *s_info)
+{
+  struct buf2_data_s *dp;
+  struct buf2_res_s *rp;
+  int n_left, slot;
+  int have_valid = 0;		/* set once any r_rst.valid_stat is seen */
+  int best_r = -BIGNUM;		/* best shuffled score of the current set */
+  double best_r_escore;		/* escore that goes with best_r */
+  static int kstats=0;		/* draw range for random replacement */
+
+  /* add this buffer's score counts to the running totals */
+  s_info->s_cnt[0] += lib_bhead_p->s_cnt_info.s_cnt[0];
+  s_info->s_cnt[1] += lib_bhead_p->s_cnt_info.s_cnt[1];
+  s_info->s_cnt[2] += lib_bhead_p->s_cnt_info.s_cnt[2];
+  s_info->tot_scores += lib_bhead_p->s_cnt_info.tot_scores;
+  /* this is done because we are not using r_rst->valid_stat to limit selection of scores */
+  /* s_info->s_cnt[sc_ix] = s_info->tot_scores; */
+
+  dp = lib_bhead_p->buf2_data;
+  rp = lib_bhead_p->buf2_res;
+
+  /* walk the data and results buffers in parallel */
+  for (n_left = lib_bhead_p->hdr.buf2_cnt; n_left--; dp++, rp++) {
+
+    /* perhaps should use explicit flag to indicate no score */
+    if (rp->r_rst.score[0] == -BIGNUM) continue;
+
+    if (rp->r_rst.score[sc_ix] > best_r) {
+      best_r = rp->r_rst.score[sc_ix];
+      best_r_escore = rp->r_rst.escore;
+    }
+
+    if (rp->r_rst.valid_stat > have_valid) {
+      have_valid = 1;
+    }
+
+    /* statistics done for best score of set */
+    /* currently no check for rst->valid_stat, which causes
+       over-estimates of shuffles */
+    if (dp->frame != nitt1 || !have_valid) continue;
+
+    if (nrstats < shuff_max) {	/* room left: append */
+      kstats = slot = nrstats++;
+    }
+    else {			/* randomly replace */
+      slot = my_nrand(++kstats,rand_state);
+    }
+
+    if (slot < shuff_max) {
+      rstats[slot].n1 = dp->seq->n1;
+      rstats[slot].comp = rp->r_rst.comp;
+      rstats[slot].H = rp->r_rst.H;
+      rstats[slot].escore = best_r_escore;
+      rstats[slot].score = best_r;
+    }
+
+    best_r = -BIGNUM;		/* start the next set */
+  }
+}
+
+/* save_align -- move the alignment results (a_res, have_ares) in
+   lib_bhead_p->buf2_ares[] to the bestp_arr[] entries named by
+   best_idx.  An entry that already has an a_res is not overwritten.
+   The buffer is then marked empty (have_results = 0, buf2_cnt = 0).
+
+   returns the number of buf2_ares[] entries examined, or 0 if the
+   buffer had no results.
+ */
+int
+save_align(struct buf_head *lib_bhead_p, struct beststr **bestp_arr)
+{
+  struct buf2_ares_s *ares_p;
+  struct beststr *best_p;
+  int n_res, ix;
+
+  if (!lib_bhead_p->hdr.have_results) return 0;
+
+  n_res = lib_bhead_p->hdr.buf2_cnt;
+  if (n_res <= 0) return 0;
+
+  ares_p = lib_bhead_p->buf2_ares;
+  for (ix = 0; ix < n_res; ix++, ares_p++) {
+    best_p = bestp_arr[ares_p->best_idx];
+    if (best_p->a_res == NULL) {
+      best_p->have_ares = ares_p->have_ares;
+      best_p->a_res = ares_p->a_res;
+    }
+#ifdef DEBUG
+    else {
+      fprintf(stderr,"*** error [%s:%d] - attempt to re-save a_res for [%d]: %s\n",
+              __FILE__, __LINE__, ares_p->best_idx, best_p->mseq->bline);
+    }
+#endif
+  }
+
+  /* the buffer has been consumed */
+  lib_bhead_p->hdr.have_results = 0;
+  lib_bhead_p->hdr.buf2_cnt = 0;
+  return n_res;
+}
+
+/* buf_do_work -- score the query against every library sequence in the
+   buffer with do_work() and store the results in lib_bhead_p->buf2_res[].
+
+   inputs:  **aa0, n0 (query, one aa0[] per frame)
+            lib_bhead_p->buf2_data, lib_bhead_p->hdr.buf2_cnt (library sequences)
+            max_frame (frame value that ends a set of scores)
+            ppst
+            f_str[] (one work structure per frame)
+
+   results: lib_bhead_p->buf2_res[], lib_bhead_p->s_cnt_info,
+            lib_bhead_p->hdr.have_results = 1
+
+   buf2_res[].is_valid_stat marks the one result of each set that
+   should be saved in the stats[] buffer.  This is needed because
+   there can be more scores than sequences, but there can be only one
+   statistics score per sequence (the best score).
+
+   Sequences with n1 outside [ppst->n1_low, ppst->n1_high] are not
+   scored; their stats_idx is set to -1 and their scores stay -BIGNUM.
+*/
+void
+buf_do_work(unsigned char **aa0, int n0,
+            struct buf_head *lib_bhead_p,
+            int max_frame,
+            struct pstruct *ppst, void **f_str) {
+
+  int n_left;
+  unsigned long atmp;
+  struct buf2_data_s *dp;
+  struct buf2_res_s *rp, *best_rp;
+  int best_score, sc_ix;
+  double best_escore;
+
+  sc_ix = ppst->score_ix;
+
+  /* clear the per-buffer score counts */
+  lib_bhead_p->s_cnt_info.tot_scores = 0;
+  lib_bhead_p->s_cnt_info.s_cnt[2] = 0;
+  lib_bhead_p->s_cnt_info.s_cnt[1] = 0;
+  lib_bhead_p->s_cnt_info.s_cnt[0] = 0;
+
+  dp = lib_bhead_p->buf2_data;
+  rp = lib_bhead_p->buf2_res;
+
+  best_rp = NULL;
+  best_score = -BIGNUM;
+  best_escore = 1000.0;
+
+  for (n_left = lib_bhead_p->hdr.buf2_cnt; n_left-- > 0; dp++, rp++) {
+
+    /* start from "no score" */
+    rp->rst.score[2] = -BIGNUM;
+    rp->rst.score[1] = -BIGNUM;
+    rp->rst.score[0] = -BIGNUM;
+    rp->is_valid_stat = 0;
+
+    if (dp->seq->n1 < ppst->n1_low || dp->seq->n1 > ppst->n1_high) {
+      /* tells save_best() there is no stats score here -- not
+         necessary as -BIGNUM indicates no score */
+      dp->stats_idx = -1;
+      continue;
+    }
+
+#ifdef DEBUG
+    if (check_seq_range(dp->seq->aa1b, dp->seq->n1,
+                        ppst->nsqx, "buf_do_work()")) {
+      fprintf(stderr, "*** error [%s:%d] - [%s/buf_do_work] range error at: %d/%d (n1:%d)\n",
+              __FILE__, __LINE__,
+              prog_func,lib_bhead_p->hdr.buf2_cnt - (n_left+1),
+              lib_bhead_p->hdr.buf2_cnt, dp->seq->n1);
+      continue;
+    }
+
+    /* also check for adler32_crc match */
+    if (dp->seq->adler32_crc != (atmp=adler32(1L,dp->seq->aa1b, dp->seq->n1))) {
+      fprintf(stderr, "*** error [%s:%d] - [%s/buf_do_work] CRC error [%lu!=%lu] at: %d/%d (n1:%d/l_offset:%ld)\n",
+              __FILE__, __LINE__,
+              prog_func,dp->seq->adler32_crc, atmp,
+              lib_bhead_p->hdr.buf2_cnt - (n_left+1),
+              lib_bhead_p->hdr.buf2_cnt,dp->seq->n1,
+              dp->seq->l_offset);
+      continue;
+    }
+#endif
+
+    do_work (aa0[dp->frame], n0,
+             dp->seq->aa1b, dp->seq->n1,
+             dp->frame, ppst, f_str[dp->frame], 0, 0,
+             &(rp->rst), &(lib_bhead_p->s_cnt_info));
+
+    /* remember the result with the lowest escore / highest score of the set */
+    if (rp->rst.valid_stat) {
+      if (rp->rst.escore < best_escore) {
+        best_escore = rp->rst.escore;
+        best_rp = rp;
+      }
+      if (rp->rst.score[sc_ix] > best_score) {
+        best_score = rp->rst.score[sc_ix];
+        best_rp = rp;
+      }
+    }
+
+    /* last frame of the set: flag the winner and start over */
+    if (dp->frame == max_frame) {
+      if (best_rp != NULL) {
+        best_rp->is_valid_stat = 1;
+        best_rp = NULL;
+      }
+      best_score = -BIGNUM;
+      best_escore = 1000.0;
+    }
+  }
+
+  /* place to produce z_scores */
+  /* place to produce sorted array */
+
+  lib_bhead_p->hdr.have_results = 1;
+}
+
+/* buf_do_align -- build the alignment encoding for every sequence in
+   the buffer with build_ares_code() and store it in
+   lib_bhead_p->buf2_ares[].a_res / .have_ares.
+
+   When m_msp->stages > 1, do_opt() is run first and its score[2] is
+   copied into buf2_res[].rst.score[2].
+   Sets lib_bhead_p->hdr.have_results = 1 when done.
+ */
+void
+buf_do_align(unsigned char **aa0, int n0,
+             struct buf_head *lib_bhead_p,
+             struct pstruct *ppst, const struct mngmsg *m_msp,
+             void **f_str) {
+
+  int n_left;
+  struct buf2_data_s *dp;
+  struct buf2_res_s *rp;
+  struct buf2_ares_s *ap;
+  struct rstruct rst;
+
+  dp = lib_bhead_p->buf2_data;
+  rp = lib_bhead_p->buf2_res;
+  ap = lib_bhead_p->buf2_ares;
+
+  for (n_left = lib_bhead_p->hdr.buf2_cnt; n_left-- > 0; dp++, ap++, rp++) {
+    if ( m_msp->stages > 1) {
+      /* this is not typically done unless m_msp->stages > 1 */
+      do_opt (aa0[dp->frame], n0, dp->seq->aa1b, dp->seq->n1,
+              dp->frame, ppst, f_str[dp->frame], &rst);
+      rp->rst.score[2]=rst.score[2];
+    }
+
+#ifdef DEBUG
+    if (dp->seq->aa1b == NULL) {
+      fprintf(stderr,"*** error [%s:%d] - [buf_do_align] null aa1b\n",__FILE__, __LINE__);
+      ap->a_res = NULL;
+      break;
+    }
+    if (check_seq_range(dp->seq->aa1b, dp->seq->n1,
+                        ppst->nsqx, "buf_do_align()")) {
+      fprintf(stderr, "*** error [%s:%d] - [%s/buf_do_align] range error at: %d/%d (n1:%d)\n",
+              __FILE__, __LINE__,
+              prog_func,lib_bhead_p->hdr.buf2_cnt - (n_left+1),
+              lib_bhead_p->hdr.buf2_cnt, dp->seq->n1);
+    }
+
+    /* also check for adler32_crc match */
+    if (dp->seq->adler32_crc != adler32(1L,dp->seq->aa1b, dp->seq->n1)) {
+      fprintf(stderr, "*** error [%s:%d] - [%s/buf_do_align] CRC error at: %d/%d (n1:%d)\n",
+              __FILE__, __LINE__,
+              prog_func,lib_bhead_p->hdr.buf2_cnt - (n_left+1),
+              lib_bhead_p->hdr.buf2_cnt, dp->seq->n1);
+    }
+#endif
+
+    ap->a_res = build_ares_code(aa0[dp->frame], m_msp->n0,
+                                dp->seq->aa1b, dp->seq,
+                                dp->frame, &ap->have_ares,
+                                dp->repeat_thresh, m_msp, ppst, f_str[dp->frame] );
+  }
+
+  lib_bhead_p->hdr.have_results = 1;
+}
+
+/* buf_qshuf_work -- score the shuffled query (aa0s) against every
+   library sequence in the buffer and record the best shuffled-query
+   score/escore of each set in buf2_res[].qr_score / .qr_escore.
+
+   The values are stored in the result entry of the set that has
+   is_valid_stat set.  Sequences with n1 outside
+   [ppst->n1_low, ppst->n1_high] are skipped and restart the set.
+ */
+void
+buf_qshuf_work(unsigned char *aa0s, int n0,
+               struct buf_head *lib_bhead_p,
+               int max_frame,
+               struct pstruct *ppst, void *qf_str,
+               int ix_score)
+{
+  int n_left;
+  struct buf2_data_s *dp;
+  struct buf2_res_s *rp, *target_rp;
+  struct rstruct rrst;
+  struct score_count_s q_scnt_info;
+  int q_best;
+  double q_escore;
+
+  dp = lib_bhead_p->buf2_data;
+  rp = lib_bhead_p->buf2_res;
+
+  target_rp = NULL;
+  q_best = -BIGNUM;
+  q_escore = 1000.0;
+
+  for (n_left = lib_bhead_p->hdr.buf2_cnt; n_left-- > 0; dp++, rp++) {
+    rrst.score[2] = -BIGNUM;
+    rrst.score[1] = -BIGNUM;
+    rrst.score[0] = -BIGNUM;
+    rrst.valid_stat = 0;
+
+    if (dp->seq->n1 < ppst->n1_low || dp->seq->n1 > ppst->n1_high ) {
+      /* not scored: forget anything collected for this set */
+      target_rp = NULL;
+      q_best = -BIGNUM;
+      q_escore = 1000.0;
+      continue;
+    }
+
+    do_work (aa0s, n0,
+             dp->seq->aa1b, dp->seq->n1,
+             dp->frame, ppst, qf_str, 1, 0,
+             &rrst, &q_scnt_info);
+
+    /* buf_qshuf_work() is always called after buf_do_work(), which
+       sets rp->is_valid_stat */
+    if (rp->is_valid_stat) {
+      target_rp = rp;
+    }
+
+    if (rrst.escore < q_escore) {
+      q_escore = rrst.escore;
+    }
+    if (rrst.score[ix_score] > q_best) {
+      q_best = rrst.score[ix_score];
+    }
+
+    if (dp->frame != max_frame) continue;
+
+    /* last frame of the set: store the best values and start over */
+    if (target_rp != NULL) {
+      target_rp->qr_score = q_best;
+      target_rp->qr_escore = q_escore;
+      target_rp = NULL;
+    }
+#ifdef DEBUG
+    else {
+      fprintf(stderr,"*** error [%s:%d] - tq_best_rp NULL at: %ld\n",
+              __FILE__, __LINE__, rp - lib_bhead_p->buf2_res);
+    }
+#endif
+    q_best = -BIGNUM;
+    q_escore = 1000.0;
+  }
+}
+
+/* buf_shuf_work -- shuffle every library sequence in the buffer into
+   aa1s, score the query against the shuffled copy with do_work(), and
+   store the result in lib_bhead_p->buf2_res[].r_rst.
+
+   aa0[], n0   : query (one aa0[] per frame)
+   aa1s        : work buffer that receives each shuffled sequence
+   lib_bhead_p : buffer of library sequences; hdr.shuff_cnt and
+                 hdr.have_results are set on return
+   max_frame   : frame value that ends a set of scores
+   ppst        : zs_win > 0 selects wshuffle(); otherwise shuffle3() is
+                 used when shuffle_dna3 is set, else shuffle()
+   f_str[]     : one work structure per frame
+   ix_score    : not used; the score index is taken from ppst->score_ix
+   rand_state  : passed to the shuffle functions
+
+   is_valid_stat is cleared for every entry and then set on the best
+   shuffled result of each set.  Entries with stats_idx < 0 or with n1
+   outside [ppst->n1_low, ppst->n1_high] are skipped and restart the set.
+ */
+void
+buf_shuf_work(unsigned char **aa0, int n0, unsigned char *aa1s, struct buf_head *lib_bhead_p,
+              int max_frame, struct pstruct *ppst, void **f_str,
+              int ix_score, void *rand_state)
+{
+  int buf2_cnt;
+  int shuff_cnt;
+  struct buf2_data_s *lib_buf2_dp;
+  struct buf2_res_s *lib_buf2_rp, *tr_best_rp;
+  int tr_best, sc_ix;
+  double tr_escore;
+
+  sc_ix = ppst->score_ix;
+
+  lib_bhead_p->s_cnt_info.s_cnt[0] = lib_bhead_p->s_cnt_info.s_cnt[1] =
+    lib_bhead_p->s_cnt_info.s_cnt[2] = lib_bhead_p->s_cnt_info.tot_scores = 0;
+
+  shuff_cnt = 0;
+  buf2_cnt = lib_bhead_p->hdr.buf2_cnt;
+  lib_buf2_dp = lib_bhead_p->buf2_data;
+  lib_buf2_rp = lib_bhead_p->buf2_res;
+
+  tr_best_rp = NULL;
+  tr_best = -BIGNUM;
+  tr_escore = 1000.0;
+
+  while (buf2_cnt-- > 0) {
+    lib_buf2_rp->r_rst.score[0] = lib_buf2_rp->r_rst.score[1] =
+      lib_buf2_rp->r_rst.score[2] = -BIGNUM;
+    lib_buf2_rp->r_rst.valid_stat = lib_buf2_rp->is_valid_stat = 0;
+
+    if ((lib_buf2_dp->stats_idx < 0) || lib_buf2_dp->seq->n1 < ppst->n1_low ||
+        lib_buf2_dp->seq->n1 > ppst->n1_high ) {
+      /* not shuffled: forget anything collected for this set */
+      lib_buf2_dp++;
+      lib_buf2_rp++;
+      tr_best_rp = NULL;
+      tr_best = -BIGNUM;
+      tr_escore = 1000.0;
+      continue;
+    }
+
+    shuff_cnt++;
+    if (ppst->zs_win > 0) {
+      wshuffle(lib_buf2_dp->seq->aa1b,aa1s,lib_buf2_dp->seq->n1,ppst->zs_win, rand_state);
+    }
+    else {
+      if (ppst->shuffle_dna3) {shuffle3(lib_buf2_dp->seq->aa1b,aa1s,lib_buf2_dp->seq->n1, rand_state);}
+      else {shuffle(lib_buf2_dp->seq->aa1b,aa1s,lib_buf2_dp->seq->n1, rand_state);}
+    }
+
+    /* rshuffle(lib_buf2_dp->seq->aa1b,aa1s,lib_buf2_dp->seq->n1); */
+
+#ifdef DEBUG
+    /* the labels name this function so a range error in the shuffled
+       sequence is not attributed to buf_do_align() */
+    if (check_seq_range(aa1s, lib_buf2_dp->seq->n1,
+                        ppst->nsqx, "buf_shuf_work()")) {
+      fprintf(stderr, "*** error [%s:%d] - [%s/buf_shuf_work] range error at: %d/%d (n1:%d)\n",
+              __FILE__, __LINE__,
+              prog_func,lib_bhead_p->hdr.buf2_cnt - (buf2_cnt+1),
+              lib_bhead_p->hdr.buf2_cnt, lib_buf2_dp->seq->n1);
+    }
+#endif
+
+    do_work (aa0[lib_buf2_dp->frame], n0,
+             aa1s, lib_buf2_dp->seq->n1,
+             lib_buf2_dp->frame, ppst, f_str[lib_buf2_dp->frame], 0, 1,
+             &lib_buf2_rp->r_rst, &(lib_bhead_p->s_cnt_info));
+
+    /* remember the result with the lowest escore / highest score of the set */
+    if (lib_buf2_rp->r_rst.valid_stat) {
+      if (lib_buf2_rp->r_rst.escore < tr_escore) {
+        tr_escore = lib_buf2_rp->r_rst.escore;
+        tr_best_rp = lib_buf2_rp;
+      }
+      if (lib_buf2_rp->r_rst.score[sc_ix] > tr_best) {
+        tr_best = lib_buf2_rp->r_rst.score[sc_ix];
+        tr_best_rp = lib_buf2_rp;
+      }
+    }
+
+    /* last frame of the set: flag the winner and start over */
+    if (lib_buf2_dp->frame == max_frame) {
+      if (tr_best_rp!=NULL) {
+        tr_best_rp->is_valid_stat = 1;
+        tr_best_rp = NULL;
+      }
+      tr_best = -BIGNUM;
+      tr_escore = 1000.0;
+    }
+
+    lib_buf2_dp++;
+    lib_buf2_rp++;
+  }
+  lib_bhead_p->hdr.shuff_cnt = shuff_cnt;
+  lib_bhead_p->hdr.have_results = 1;
+}
+
+/* buf_shuf_seq is designed to:
+ (1) take a list of sequences (specified by bptr[])
+ (2) collect them from the database if they are not already available
+ (3) send them to the threads or shuffle them directly and calculate scores
+*/
+
+/* Arguments, as used in the body below:
+ aa0, n0 - query (n0 itself is not referenced; m_msp->n0 is used)
+ aa1shuff_b - address of the buffer that receives library sequences;
+ re-allocated (unthreaded) or allocated (threaded) here
+ aa1save - buffer re_getlib() reads into; also the shuffle scratch
+ space in the unthreaded build
+ maxn - size compared against the space required, and passed
+ to re_getlib()
+ bestp_arr, nbest - the sequences to be shuffled
+ f_str - only present in unthreaded builds
+ s_info - passed to save_shuf()
+*/
+void
+buf_shuf_seq(unsigned char **aa0, int n0,
+ unsigned char **aa1shuff_b, unsigned char *aa1save, int maxn,
+ struct beststr **bestp_arr, int nbest,
+ struct pstruct *ppst, struct mngmsg *m_msp,
+ struct mng_thr *m_bufi_p
+#if !defined(COMP_THR) && !defined(PCOMPLIB)
+ , void **f_str
+#endif
+ , struct score_count_s *s_info)
+{
+ unsigned char *aa1shuff;
+ struct beststr *bbp, **tmp_bestp;
+ char l_bline[MAX_SSTR];
+ int n1lib_req, shuff_mult;
+ long loffset, l_off;
+ int n1, itt;
+ int max_do_cnt, ndiff, prev_index;
+ int istats;
+ int i, j;
+
+ /* these variables track buffers of library sequences */
+ int cur_buf_size, max_buf_size;
+ struct buf_head *lib_bhead_p;
+ struct buf2_data_s *lib_buf2_dp;
+ struct buf2_res_s *lib_buf2_rp;
+
+/* (1) get the sequences into a buffer - the sequence information is
+ currently in the bestp_arr - find out how many we have, and how
+ many we will need - the number to shuffle */
+
+/* figure out how much space we need, first checking whether we have
+ dups */
+ /* work on a private copy so that bestp_arr[] keeps its order */
+ if ((tmp_bestp = (struct beststr **)calloc(nbest, sizeof(struct beststr *)))==NULL) {
+ fprintf(stderr,"*** error [%s:%d] - %s/buf_shuf_seq() *** cannot allocate tmp_bestp[%d]\n",
+ __FILE__, __LINE__, prog_name, nbest);
+ exit(1);
+ }
+ for (i = 0; i < nbest; i++) {
+ tmp_bestp[i] = bestp_arr[i];
+ }
+
+ /* sort tmp_bestp[] by sequence index, so duplicates are adjacent */
+ sortbesti(tmp_bestp, nbest);
+
+ /* count number of different sequence indices, get required space
+ without dups */
+ /* each new index adds n1+2 bytes to the space requirement */
+ prev_index = -1;
+ n1lib_req = ndiff = 0;
+ for (i = 0; i < nbest; i++) {
+ if (tmp_bestp[i]->seq->index > prev_index) {
+ prev_index = tmp_bestp[i]->seq->index;
+ n1lib_req += tmp_bestp[i]->n1+ 2;
+ ndiff++;
+ }
+ }
+
+#if !defined(COMP_THR) && !defined(PCOMPLIB)
+ /* unthreaded: grow the caller's buffer only when it is too small */
+ if (n1lib_req >= maxn) { /* we need new space, aa1shuff is too small */
+ if ((*aa1shuff_b = aa1shuff =
+ (unsigned char *)realloc(*aa1shuff_b, n1lib_req*sizeof(char)))==NULL) {
+ fprintf(stderr,"*** error [%s:%d] - cannot realloc aa1shuff[%d]\n",
+ __FILE__, __LINE__, n1lib_req);
+ exit(1);
+ }
+ }
+ else { aa1shuff = *aa1shuff_b;}
+ /* the first byte is set to 0 and sequences are stored after it */
+ *aa1shuff = '\0';
+ aa1shuff++;
+
+#else
+ /* threaded/parallel: nothing to shuffle is a fatal error here */
+ if (n1lib_req < 2) {
+ fprintf(stderr,"*** error [%s:%d] - [%s/buf_shuf_seq] no residues to shuffle: %d (%d)\n",
+ __FILE__, __LINE__,
+ prog_func,n1lib_req,ndiff);
+ exit(1);
+ }
+
+ /* threaded/parallel: a fresh buffer is always allocated */
+ if ((*aa1shuff_b = aa1shuff =
+ (unsigned char *)calloc(n1lib_req,sizeof(char)))==NULL) {
+ fprintf(stderr,"*** error [%s:%d] - cannot calloc aa1shuff[%d]\n",
+ __FILE__, __LINE__, n1lib_req);
+ exit(1);
+ }
+ *aa1shuff = '\0';
+ aa1shuff++;
+#endif
+
+ /* number of shuffles done for each distinct sequence */
+ /* NOTE(review): ndiff is 0 when nbest is 0; the unthreaded build has
+ no guard before this division - confirm callers never pass nbest == 0 */
+ shuff_mult = (m_msp->shuff_max+1)/ndiff;
+ istats = 0;
+
+ /* setup lib_bhead buffers for shuffle comparisons */
+#if defined(COMP_THR) || defined(PCOMPLIB) /* threaded/parallel */
+ /* max_do_cnt can be smaller than max_buf2_cnt, but not larger */
+ max_do_cnt = min(m_bufi_p->max_buf2_res,
+ m_msp->shuff_max / (2 * fa_max_workers));
+ /* we don't have a left over one, so we need one */
+ get_rbuf(&lib_bhead_p,m_bufi_p->max_work_buf);
+#else /* not threaded */
+ max_do_cnt = m_bufi_p->max_buf2_res;
+ lib_bhead_p = lib_buf2_list; /* equivalent to un-threaded get_rbuf() */
+#endif
+ max_buf_size = n1lib_req;
+ cur_buf_size = 0;
+ lib_bhead_p->hdr.buf2_cnt = 0;
+ lib_bhead_p->hdr.have_results = 0;
+ lib_bhead_p->hdr.stop_work = 0;
+ lib_bhead_p->hdr.buf2_type=BUF2_DOSHUF;
+ lib_bhead_p->hdr.seq_record_continuous = 0;
+ lib_buf2_dp = lib_bhead_p->buf2_data;
+ lib_buf2_rp = lib_bhead_p->buf2_res;
+
+ /* read sequences into shuffle buffer */
+
+ /* NOTE(review): this walks tmp_bestp[0..ndiff-1]; if the sorted array
+ contains duplicates, those slots are not necessarily the ndiff
+ distinct sequences counted above - confirm this is intended */
+ for (i = 0; i < ndiff; i++) {
+ bbp = tmp_bestp[i];
+ if (bbp->seq->aa1b == NULL) {
+ /* get the sequence */
+ (bbp->mseq->m_file_p->ranlib)(l_bline, sizeof(l_bline),
+ bbp->mseq->lseek,bbp->mseq->libstr,bbp->mseq->m_file_p);
+ n1 = re_getlib(aa1save,NULL, maxn,m_msp->ldb_info.maxt3,
+ m_msp->ldb_info.l_overlap,bbp->mseq->cont,m_msp->ldb_info.term_code,
+ &loffset,&l_off,bbp->mseq->m_file_p);
+
+ /* fprintf(stderr, " %d gets %d %d\n",i,tmp_bestp[i]->seq->n1,n1); */
+
+ /* copy n1+1 bytes out of aa1save and point the seq record at the copy */
+ memcpy(aa1shuff, aa1save, n1+1);
+ bbp->seq->aa1b = aa1shuff;
+ aa1shuff += n1 + 1;
+ }
+
+ /* lib_buf2_dp is used up by scores, the sequence is not sent multiple times */
+ cur_buf_size += bbp->seq->n1+1;
+ /* one buf2 entry per shuffle (shuff_mult) and per frame (revcomp..nitt1) */
+ for (j = 0; j < shuff_mult; j++ ) {
+ for (itt = m_msp->revcomp; itt <= m_msp->nitt1; itt++) {
+#ifdef PCOMPLIB
+ lib_buf2_dp->seq_dup = 0; /* mark first ->seq as original, not duplicate */
+#endif
+ lib_buf2_dp->seq = bbp->seq;
+ /* this invalidates lib_buf2_p->seq */
+ lib_buf2_dp->stats_idx = istats++;
+ lib_buf2_dp->frame = itt;
+ lib_buf2_dp++; /* point to next buf2 */
+ lib_buf2_rp++; /* point to next buf2 */
+ lib_bhead_p->hdr.buf2_cnt++;
+
+ /* buffer full (entry count or residue count): process it now */
+ if (lib_bhead_p->hdr.buf2_cnt >= max_do_cnt ||
+ cur_buf_size >= max_buf_size) {
+/* (2) send sequences for shuffling */
+#if defined(COMP_THR) || defined(PCOMPLIB) /* threaded - fill and empty buffers */
+ /* provide empty buffer to workers */
+ lib_bhead_p->hdr.aa1b_used = cur_buf_size;
+ lib_bhead_p->hdr.have_data = 1;
+ lib_bhead_p->hdr.seq_record_continuous = 0;
+ put_rbuf(lib_bhead_p,m_bufi_p->max_work_buf);
+ get_rbuf(&lib_bhead_p,m_bufi_p->max_work_buf);
+#else /* non-thread - just do the searches */
+ if (lib_bhead_p->hdr.buf2_type & BUF2_DOSHUF) {
+ buf_shuf_work(aa0,m_msp->n0, aa1save, lib_bhead_p,
+ m_msp->nitt1, ppst, f_str, ppst->score_ix, rand_state);
+ }
+#endif
+/* (3) save results in the rstats structure */
+ if (lib_bhead_p->hdr.buf2_cnt > 0 && lib_bhead_p->hdr.have_results) {
+ save_shuf(lib_bhead_p,m_msp->nitt1,m_msp->shuff_max,ppst->score_ix,s_info);
+ }
+
+ /* reset the buffer header so that it can be filled again */
+ lib_bhead_p->s_cnt_info.s_cnt[0] = lib_bhead_p->s_cnt_info.s_cnt[1] =
+ lib_bhead_p->s_cnt_info.s_cnt[2] = lib_bhead_p->s_cnt_info.tot_scores = 0;
+
+ lib_bhead_p->hdr.buf2_cnt = 0;
+ cur_buf_size = 0;
+ lib_bhead_p->hdr.have_results = 0;
+ lib_bhead_p->hdr.buf2_type=BUF2_DOSHUF;
+ lib_bhead_p->hdr.seq_record_continuous = 0; /* seq_records are coming from bestptr in any order */
+ lib_bhead_p->hdr.stop_work = 0;
+ /* only lib_buf2_dp is rewound here; lib_buf2_rp is not read in this function */
+ lib_buf2_dp = lib_bhead_p->buf2_data;
+ }
+ } /* for (itt .. */
+ }
+ } /* done with tmp_bestp[] */
+ free(tmp_bestp);
+
+#if defined(COMP_THR) || defined(PCOMPLIB) /* if COMP_THR/PCOMPLIB - fill and empty buffers */
+ /* check last buffers for any results */
+ lib_bhead_p->hdr.seq_record_continuous = 0;
+ put_rbuf(lib_bhead_p,m_bufi_p->max_work_buf);
+
+ /* wait for the threads to finish */
+
+ wait_rbuf(m_bufi_p->max_work_buf);
+ /*
+ fprintf(stderr, " num_reader[%d]-empty[%d]: %d\tnrstats: %d\n",
+ num_reader_bufs,empty_reader_bufs,
+ num_reader_bufs-empty_reader_bufs, nrstats);
+ */
+
+ /* collect results from every buffer that still holds some */
+ for (i=0; i < num_reader_bufs; i++) {
+ if (RESULTS_BUF[i]->hdr.buf2_cnt > 0 && RESULTS_BUF[i]->hdr.have_results) {
+ save_shuf(RESULTS_BUF[i],m_msp->nitt1, m_msp->shuff_max, ppst->score_ix, s_info);
+ RESULTS_BUF[i]->hdr.buf2_cnt = RESULTS_BUF[i]->hdr.have_results = 0;
+ }
+ }
+#else /* just do the searches */
+ /* aa1save is used for shuffles, not aa1shuf, because aa1shuf
+ has library sequences */
+ /* process whatever is left in the partially filled buffer */
+ buf_shuf_work(aa0,m_msp->n0, aa1save, lib_bhead_p,
+ m_msp->nitt1, ppst, f_str, ppst->score_ix, rand_state);
+
+ save_shuf(lib_bhead_p,m_msp->nitt1,m_msp->shuff_max, ppst->score_ix, s_info);
+ lib_bhead_p->hdr.buf2_cnt = lib_bhead_p->hdr.have_results = 0;
+#endif
+}
+
+/* buf_align_seq is structurally almost identical to buf_shuf_seq,
+ except that the appropriate sequences are pre-loaded into bbp->seq
+ (and ->bline), and it gets bbp->a_res, rather than scores */
+
+/* Arguments, as used in the body below:
+ aa0 - query, handed to buf_do_align() (unthreaded build)
+ n0 - not referenced; m_msp->n0 is used instead
+ bestp_arr, nbest - the hits to align; bestp_arr[i]->a_res is checked
+ for NULL at the end
+ f_str - only present in unthreaded builds
+ Sets m_msp->align_done = 1 before returning.
+*/
+void
+buf_align_seq(unsigned char **aa0, int n0,
+ struct beststr **bestp_arr, int nbest,
+ struct pstruct *ppst, struct mngmsg *m_msp,
+ struct mng_thr *m_bufi_p
+#if !defined(COMP_THR) && !defined(PCOMPLIB)
+ , void **f_str
+#endif
+ )
+{
+ struct beststr *bbp;
+ int max_align_cnt;
+ int i, n_pre_align;
+ int cur_buf_size, max_buf_size;
+ struct buf_head *lib_bhead_p;
+ struct buf2_data_s *lib_buf2_dp;
+ struct buf2_ares_s *lib_buf2_ap;
+
+ /* setup lib_bhead buffers for alignments */
+#if defined(COMP_THR) || defined(PCOMPLIB) /* threaded */
+ /* max_do_cnt can be smaller than max_buf2_res, but not larger */
+#ifdef COMP_THR
+ max_align_cnt = min(m_bufi_p->max_buf2_res,
+ nbest / (4 * fa_max_workers));
+#else
+ max_align_cnt = min(m_bufi_p->max_buf2_res, nbest / fa_max_workers);
+#endif
+ /* the integer divisions above can give 0; always allow one alignment per buffer */
+ if (max_align_cnt < 1) max_align_cnt = 1;
+
+ get_rbuf(&lib_bhead_p,m_bufi_p->max_work_buf);
+#else /* not threaded */
+ max_align_cnt = m_bufi_p->max_buf2_res;
+ lib_bhead_p = lib_buf2_list; /* equivalent to un-threaded get_rbuf() */
+#endif
+
+ max_buf_size = lib_bhead_p->hdr.aa1b_size;
+ lib_bhead_p->hdr.buf2_cnt = 0;
+ lib_bhead_p->hdr.have_results = 0;
+ lib_bhead_p->hdr.stop_work = 0;
+ lib_bhead_p->hdr.buf2_type=BUF2_DOALIGN;
+ lib_buf2_dp = lib_bhead_p->buf2_data;
+ lib_buf2_ap = lib_bhead_p->buf2_ares;
+
+ /* read sequences into align buffer */
+
+ /* n_pre_align accumulates the counts returned by save_align() */
+ n_pre_align = 0;
+ cur_buf_size = 0;
+ for (i = 0; i < nbest; i++) {
+ bbp = bestp_arr[i];
+
+ /* this invalidates lib_buf2_p->seq */
+ lib_buf2_dp->seq = bbp->seq;
+ cur_buf_size += bbp->seq->n1+1;
+ lib_buf2_dp->frame = bbp->frame;
+ lib_buf2_dp->repeat_thresh = bbp->repeat_thresh;
+#ifdef PCOMPLIB
+ lib_buf2_dp->seq_dup = 0;
+#endif
+ /* best_idx records which bestp_arr[] entry this alignment slot belongs to */
+ lib_buf2_ap->have_ares = 0;
+ lib_buf2_ap->a_res = NULL;
+ lib_buf2_ap->best_idx = i;
+ lib_buf2_dp++; /* point to next buf2_data */
+ lib_buf2_ap++; /* point to next buf2_ares */
+ lib_bhead_p->hdr.buf2_cnt++;
+
+ /* buffer full (entry count, or residue count within maxn of the
+ buffer size): process it now */
+ if (lib_bhead_p->hdr.buf2_cnt >= max_align_cnt ||
+ cur_buf_size >= max_buf_size - m_msp->ldb_info.maxn) {
+/* (2) send sequences for alignment */
+#if defined(COMP_THR) || defined(PCOMPLIB) /* threaded - fill and empty buffers */
+ /* provide empty buffer to workers */
+ lib_bhead_p->hdr.seqr_cnt = lib_bhead_p->hdr.buf2_cnt; /* for alignments, they are the same */
+ lib_bhead_p->hdr.have_data = 1;
+ lib_bhead_p->hdr.aa1b_used = cur_buf_size;
+ lib_bhead_p->hdr.seq_record_continuous = 0;
+ put_rbuf(lib_bhead_p,m_bufi_p->max_work_buf);
+ get_rbuf(&lib_bhead_p,m_bufi_p->max_work_buf);
+#else /* non-thread - just do the searches */
+ buf_do_align(aa0, m_msp->n0, lib_bhead_p, ppst, m_msp, f_str);
+#endif
+
+/* (3) save alignments */
+ if (lib_bhead_p->hdr.buf2_cnt > 0 && lib_bhead_p->hdr.have_results) {
+ n_pre_align += save_align(lib_bhead_p,bestp_arr);
+ }
+
+ /* reset the buffer header and rewind both cursors for the next fill */
+ cur_buf_size = 0;
+ max_buf_size = lib_bhead_p->hdr.aa1b_size;
+ lib_bhead_p->hdr.buf2_cnt = 0;
+ lib_bhead_p->hdr.have_results = 0;
+ lib_bhead_p->hdr.buf2_type=BUF2_DOALIGN;
+ lib_bhead_p->hdr.stop_work = 0;
+ lib_buf2_dp = lib_bhead_p->buf2_data;
+ lib_buf2_ap = lib_bhead_p->buf2_ares;
+ }
+ } /* done with bestp_arr[] */
+
+#if defined(COMP_THR) || defined(PCOMPLIB) /* if COMP_THR - fill and empty buffers */
+ /* check last buffers for any results */
+ lib_bhead_p->hdr.seqr_cnt = lib_bhead_p->hdr.buf2_cnt; /* for alignments, they are the same */
+ lib_bhead_p->hdr.have_data = 1;
+ lib_bhead_p->hdr.aa1b_used = cur_buf_size;
+ lib_bhead_p->hdr.seq_record_continuous = 0;
+ put_rbuf(lib_bhead_p,m_bufi_p->max_work_buf);
+
+ /* wait for the threads to finish */
+
+ wait_rbuf(m_bufi_p->max_work_buf);
+
+ /* collect alignments from every buffer that still holds results */
+ for (i=0; i < num_reader_bufs; i++) {
+ if (RESULTS_BUF[i]->hdr.buf2_cnt > 0 && RESULTS_BUF[i]->hdr.have_results) {
+ n_pre_align += save_align(RESULTS_BUF[i],bestp_arr);
+ RESULTS_BUF[i]->hdr.buf2_cnt = RESULTS_BUF[i]->hdr.have_results = 0;
+ }
+ }
+#else /* just do the searches */
+ /* process whatever is left in the partially filled buffer */
+ buf_do_align(aa0, m_msp->n0, lib_bhead_p, ppst, m_msp, f_str);
+ n_pre_align += save_align(lib_bhead_p,bestp_arr);
+ lib_bhead_p->hdr.buf2_cnt = lib_bhead_p->hdr.have_results = 0;
+#endif
+
+ m_msp->align_done = 1;
+
+ /* consistency checks: both problems are reported but not treated as fatal */
+ if (n_pre_align != nbest) {
+ fprintf(stderr,"*** error [%s:%d] - n_pre_align:%d != nbest: %d\n",
+ __FILE__, __LINE__, n_pre_align, nbest);
+ }
+ for (i=0; i < nbest; i++) {
+ if (bestp_arr[i]->a_res == NULL) {
+ fprintf(stderr, "*** error [%s:%d] - have NULL a_res: %d\n",
+ __FILE__, __LINE__, i);
+ }
+ }
+}
+
+/* check_seq_range() scans the first n1 residues of aa1b[] and returns
+   1 if any residue code is greater than nsq, 0 otherwise.
+
+   str is accepted for a (disabled) diagnostic message and is not
+   otherwise used.
+*/
+int
+check_seq_range(unsigned char *aa1b, int n1, int nsq, char *str) {
+  int pos;
+  int out_of_range = 0;
+
+  /* the whole sequence is scanned; one bad residue is enough to flag it */
+  for (pos = 0; pos < n1; pos++) {
+    if ((int)aa1b[pos] > nsq) {
+      out_of_range = 1;
+    }
+  }
+
+  return out_of_range;
+}
+
+/* growable LIFO stack of void * values used by init_stack(),
+ push_stack(), pop_stack(), get_stack_len() and free_stack() */
+struct stack_str {
+ void **stack; /* array of stored pointers */
+ int size; /* number of slots currently allocated in stack[] */
+ int inc; /* number of slots added when stack[] is full */
+ int top; /* index of the next free slot == number of stored values */
+};
+
+/* init_stack() allocates an empty stack with room for size pointers;
+   inc is the number of slots added each time push_stack() has to grow
+   it.  Returns NULL (after a message on stderr) if either allocation
+   fails.
+*/
+struct stack_str *init_stack(int size, int inc) {
+  struct stack_str *new_stack;
+
+  new_stack = (struct stack_str *)calloc(1, sizeof(*new_stack));
+  if (new_stack == NULL) {
+    fprintf(stderr,"*** error [%s:%d] - cannot allocate stack\n",
+	    __FILE__, __LINE__);
+    return NULL;
+  }
+
+  new_stack->stack = (void *)calloc(size, sizeof(void *));
+  if (new_stack->stack == NULL) {
+    fprintf(stderr,"*** error [%s:%d] - cannot allocate stack->stack[%d]\n",
+	    __FILE__, __LINE__,size);
+    /* do not leak the header when the slot array cannot be allocated */
+    free(new_stack);
+    return NULL;
+  }
+
+  new_stack->top = 0;
+  new_stack->inc = inc;
+  new_stack->size = size;
+  return new_stack;
+}
+
+/* push_stack() stores value on top of stack, growing the slot array
+   when it is full.
+
+   A NULL stack is ignored.  If the array cannot be grown, a message is
+   written to stderr and value is NOT stored; the stack and everything
+   already on it remain valid.
+*/
+void push_stack(struct stack_str *stack, void *value) {
+  void **grown;
+  int new_size;
+
+  if (!stack) return;
+
+  if (stack->top >= stack->size) {
+    /* always grow by at least one slot, even if inc is 0 or negative,
+       so that the store below cannot land past the end of the array */
+    new_size = stack->size + ((stack->inc > 0) ? stack->inc : 1);
+    if (new_size <= stack->top) { new_size = stack->top + 1; }
+
+    /* use a temporary: on failure realloc() leaves the old block
+       allocated, and overwriting stack->stack with NULL would both leak
+       it and make the next push/pop dereference NULL */
+    grown = (void **)realloc(stack->stack, (size_t)new_size*sizeof(void *));
+    if (grown == NULL) {
+      fprintf(stderr,"*** error [%s:%d] - cannot re-allocate stack to [%d]\n",
+	      __FILE__, __LINE__, new_size);
+      return;
+    }
+    stack->stack = grown;
+    stack->size = new_size;	/* only record the new size once we own it */
+  }
+
+  stack->stack[stack->top++] = value;
+}
+
+/* get_stack_len() returns the number of values currently on stack;
+   a NULL stack has length 0 */
+int get_stack_len(struct stack_str *stack) {
+  return (stack != NULL) ? stack->top : 0;
+}
+
+
+/* pop_stack() removes and returns the most recently pushed value.
+   Returns NULL for a NULL stack or an empty stack; an empty (or
+   negative) top is left at 0.
+*/
+void * pop_stack(struct stack_str *stack) {
+  if (stack == NULL) {
+#ifdef DEBUG
+    fprintf(stderr," *** error [%s:%d] - pop_stack NULL stack\n",__FILE__, __LINE__);
+#endif
+    return NULL;
+  }
+
+  /* nothing stored: normalize top and report "empty" */
+  if (stack->top <= 0) {
+    stack->top = 0;
+    return NULL;
+  }
+
+  stack->top -= 1;
+  return stack->stack[stack->top];
+}
+
+/* free_stack() releases the slot array and the stack header (the
+   stored values themselves are not freed).  Always returns NULL, so
+   callers can write  stack = free_stack(stack);
+*/
+void * free_stack(struct stack_str *stack) {
+  if (stack != NULL) {
+    free(stack->stack);		/* free(NULL) is a no-op */
+    free(stack);
+  }
+  return NULL;
+}
+
+/* init_dyn_string() allocates an empty, zero-filled dynamic string.
+
+   size - initial capacity recorded in mx_size
+   inc  - growth step recorded in inc
+
+   Returns NULL (after a message on stderr) if either allocation fails.
+*/
+struct dyn_string_str *
+init_dyn_string(int size, int inc) {
+  struct dyn_string_str *new_dyn;
+
+  new_dyn = (struct dyn_string_str *)calloc(1, sizeof(*new_dyn));
+  if (new_dyn == NULL) {
+    fprintf(stderr,"*** error [%s:%d] - cannot allocate dyn_string\n",
+	    __FILE__, __LINE__);
+    return NULL;
+  }
+
+  /* NOTE(review): the buffer is allocated as size*sizeof(void *) bytes
+     although only mx_size = size is recorded as its capacity */
+  new_dyn->string = (void *)calloc(size, sizeof(void *));
+  if (new_dyn->string == NULL) {
+    fprintf(stderr,"*** error [%s:%d] - cannot allocate dyn_string->string[%d]\n",
+	    __FILE__, __LINE__,size);
+    free(new_dyn);
+    return NULL;
+  }
+
+  new_dyn->mx_size = size;
+  new_dyn->inc = inc;
+  new_dyn->c_size = 0;
+  return new_dyn;
+}
+
+/* reset_dyn_string() empties dyn_string without releasing its buffer.
+
+   A NULL dyn_string, or one whose buffer is NULL, is tolerated (the
+   other dyn_string functions accept NULL as well).  The first byte is
+   always cleared, so the string is empty afterwards even if c_size did
+   not reflect the stored text.
+*/
+void
+reset_dyn_string(struct dyn_string_str *dyn_string) {
+
+  if (!dyn_string) return;
+
+  if (dyn_string->string != NULL) {
+    if (dyn_string->c_size > 0) {
+      memset(dyn_string->string,0,dyn_string->c_size);
+    }
+    /* c_size can be stale (dyn_strcpy() does not maintain it), so
+       terminate explicitly rather than relying on the memset alone */
+    if (dyn_string->mx_size > 0) {
+      dyn_string->string[0] = '\0';
+    }
+  }
+  dyn_string->c_size = 0;
+}
+
+/* dyn_strcat() appends value to dyn_string, enlarging the buffer when
+   the result (plus terminator) would not fit.
+
+   A NULL dyn_string or NULL value is ignored.  The growth step
+   dyn_string->inc is doubled until it covers the text being added, and
+   stays at that larger value for later calls.  If the buffer cannot be
+   enlarged, a message is written to stderr and the string is left
+   unchanged.
+*/
+void
+dyn_strcat(struct dyn_string_str *dyn_string, char *value) {
+  size_t add_len;
+  char *new_string;
+
+  if (!dyn_string || !value) return;
+
+  add_len = strlen(value);
+
+  /* a missing buffer holds nothing, whatever the counters say */
+  if (dyn_string->string == NULL) {
+    dyn_string->mx_size = 0;
+    dyn_string->c_size = 0;
+  }
+
+  if (add_len + dyn_string->c_size + 1 >= dyn_string->mx_size) {
+    /* an increment of 0 would never grow by doubling (0*2 == 0) */
+    if (dyn_string->inc < 1) { dyn_string->inc = 1; }
+    /* make one step large enough for the new text and its terminator */
+    while (dyn_string->inc < add_len + 1) { dyn_string->inc *= 2; }
+
+    dyn_string->mx_size += dyn_string->inc;
+    /* use a temporary: on failure realloc() leaves the old block
+       allocated, so the existing text must not be thrown away */
+    if ((new_string = (char *)realloc(dyn_string->string, dyn_string->mx_size))==NULL) {
+      fprintf(stderr,"*** error [%s:%d] - cannot re-allocate dyn_string to [%d]\n",
+	      __FILE__, __LINE__, dyn_string->mx_size);
+      dyn_string->mx_size -= dyn_string->inc;	/* capacity did not change */
+      return;
+    }
+    /* a brand-new buffer is uninitialized: make it an empty string */
+    if (dyn_string->string == NULL) { new_string[0] = '\0'; }
+    dyn_string->string = new_string;
+  }
+
+  SAFE_STRNCAT(dyn_string->string,value,dyn_string->mx_size);
+  dyn_string->c_size += add_len;
+}
+
+/* dyn_strcpy() replaces the contents of dyn_string with value,
+   enlarging the buffer when value (plus terminator) would not fit.
+
+   A NULL dyn_string or NULL value is ignored.  c_size is set to the
+   length of the copied text so that later dyn_strcat() and
+   reset_dyn_string() calls see the correct length.  If the buffer
+   cannot be enlarged, a message is written to stderr and the string is
+   left unchanged.
+*/
+void dyn_strcpy(struct dyn_string_str *dyn_string, char *value) {
+  size_t add_len;
+  char *new_string;
+
+  if (!dyn_string || !value) return;
+
+  add_len = strlen(value);
+
+  /* a missing buffer has no capacity, whatever mx_size says */
+  if (dyn_string->string == NULL) { dyn_string->mx_size = 0; }
+
+  if (add_len + 1>= dyn_string->mx_size) {
+    /* an increment of 0 would never grow by doubling (0*2 == 0) */
+    if (dyn_string->inc < 1) { dyn_string->inc = 1; }
+    /* make one step large enough for the text and its terminator */
+    while (dyn_string->inc < add_len + 1) { dyn_string->inc *= 2; }
+
+    dyn_string->mx_size += dyn_string->inc;
+    /* use a temporary: on failure realloc() leaves the old block
+       allocated, so the existing text must not be thrown away */
+    if ((new_string = (char *)realloc(dyn_string->string, dyn_string->mx_size))==NULL) {
+      fprintf(stderr,"*** error [%s:%d] - cannot re-allocate dyn_string to [%d]\n",
+	      __FILE__, __LINE__, dyn_string->mx_size);
+      dyn_string->mx_size -= dyn_string->inc;	/* capacity did not change */
+      return;
+    }
+    dyn_string->string = new_string;
+  }
+
+  SAFE_STRNCPY(dyn_string->string,value,dyn_string->mx_size);
+  /* the original never updated c_size, leaving the length stale */
+  dyn_string->c_size = add_len;
+}
+
+/* free_dyn_string() releases the text buffer and the dyn_string
+   header; a NULL dyn_string is ignored */
+void free_dyn_string(struct dyn_string_str *dyn_string) {
+  if (dyn_string != NULL) {
+    free(dyn_string->string);	/* free(NULL) is a no-op */
+    free(dyn_string);
+  }
+}
+
+#include "a_mark.h"
+
+/* init_domfeat_data() allocates a zero-filled array of
+   annot_p->n_annot+1 domfeat_data entries and ties entry i to
+   annot_p->annot_arr_p[i], with no successor (next == NULL) and no end
+   position (end_pos == -1).  The extra last entry is left zeroed.
+
+   Returns the array, or NULL (after a message on stderr) if it cannot
+   be allocated.
+*/
+struct domfeat_data *
+init_domfeat_data(const struct annot_str *annot_p) {
+  struct domfeat_data *dom_arr, *dom_p;
+  int idx;
+
+  dom_arr = (struct domfeat_data *)calloc(annot_p->n_annot+1, sizeof(struct domfeat_data));
+  if (dom_arr == NULL) {
+    fprintf(stderr,"*** error [%s:%d] - calc_cons_u(): cannot allocate left_domain_list [%d]\n", __FILE__, __LINE__, annot_p->n_annot+1);
+    return NULL;
+  }
+
+  /* entries start unlinked; they are chained later as domains open */
+  for (idx = 0, dom_p = dom_arr; idx < annot_p->n_annot; idx++, dom_p++) {
+    dom_p->annot_entry_p = annot_p->annot_arr_p+idx;
+    dom_p->end_pos = -1;
+    dom_p->next = NULL;
+  }
+
+  return dom_arr;
+}
+
+/* close_annot_match() closes the leading entries of the open-domain
+   list *left_domain_head_p.
+
+   Starting at the head, each domain receives the pending deltas
+   (*d_score_p, *d_ident_p, *d_alen_p, *d_gaplen_p) and is pushed on
+   annot_stack; *have_push_features_p (if not NULL) is set whenever a
+   domain is pushed.  The walk stops at the first domain that ends
+   beyond ia (only tested when ia > 0); with ia <= 0 every domain is
+   closed.
+
+   On return *left_domain_head_p is the first domain still open (or
+   NULL) and *left_end_p is its end_pos (or -1).  init_score is not
+   used here.
+*/
+void
+close_annot_match (int ia, void *annot_stack, int *have_push_features_p,
+		   int *d_score_p, int *d_ident_p, int *d_alen_p, int *d_gaplen_p,
+		   struct domfeat_data **left_domain_head_p,
+		   long *left_end_p, int init_score) {
+
+  struct domfeat_data *dom_p;
+
+  dom_p = *left_domain_head_p;
+  while (dom_p != NULL && !(ia > 0 && dom_p->end_pos > ia)) {
+    /* fold the pending deltas into the domain being closed */
+    dom_p->score += *d_score_p;
+    dom_p->n_ident += *d_ident_p;
+    dom_p->n_alen += *d_alen_p;
+    dom_p->n_gaplen += *d_gaplen_p;
+
+    if (have_push_features_p != NULL) { *have_push_features_p = 1; }
+    push_stack(annot_stack, dom_p);
+
+    dom_p = dom_p->next;
+  }
+
+  if (dom_p == NULL) {
+    *left_end_p = -1;
+  }
+  else {
+    *left_end_p = dom_p->end_pos;
+  }
+
+  *left_domain_head_p = dom_p;
+}
+
+/* next_annot_match()/process_annot_match() */
+
+/*
+ *itmp has the current alignment score, if *annot_arr[i_annot].label='V',
+ this can be increased (total increase in *v_delta)
+ *pam2aa0v[.value] gives possibly better pam score for variant
+ *ip is position in annotated sequence (&i0 for annot0_p)
+ *ia is position in aligned sequence (&i1 for annot0_p)
+ sp1 is the array for the (possibly modified) displayed sequence
+ sp1a is the array for the associated annotation
+ sq maps encoded residues to displayed characters
+ i_annot -- current annotation index in annot0_p->annot_arr_p[i_annot]
+ annot_arr = annot0/1_p->annot_arr_p
+ annot_stack = save current annotation
+ *have_push_features_p = set for annotations pushed in stack (not 'V')
+ *v_delta = change in score from variant at this position
+ **region_p = set for '[' region start
+ init_score -- used to initialize tmp_region_p->score.
+
+ currently, next_annot_match() is not called before alignment starts or after it ends
+*/
+
+/* process_annot_match processes a single annot_p->pos == ip entry,
+ returning 0 for a left_end == ip match (not using i_annot), or a 1
+ if an i_annot was consumed
+*/
+/*
+ within the alignment, next_annot_match() provides the while() loop to
+ process several matches at the same location
+
+ before/after the alignment, different while() loops call
+ process_annot_match()
+*/
+
+/* Returns 0 when ip matched *left_end_p and open domains were closed
+ with close_annot_match() (annot_arr_p is not examined in that case);
+ returns 1 when the annotation entry annot_arr_p was handled:
+ label 'V' - variant residue: may raise *itmp and replace *sp1
+ label '-' - domain start: left_domain_p is linked into the list
+ headed by *left_domain_head_p
+ other - residue feature: left_domain_p is pushed on annot_stack
+ n_annots is not referenced in this function.
+*/
+int
+process_annot_match(int *itmp, int *pam2aa0v,
+ long ip, long ia, char *sp1, char *sp1a, const unsigned char *sq,
+ struct annot_entry *annot_arr_p, int n_annots, char **ann_comment,
+ void *annot_stack, int *have_push_features_p, int *v_delta,
+ int *d_score_p, int *d_ident_p, int *d_alen_p, int *d_gaplen_p,
+ struct domfeat_data **left_domain_head_p,
+ struct domfeat_data *left_domain_p,
+ long *left_end_p, int init_score) {
+ int v_tmp;
+ int new_left_domain_end;
+ struct domfeat_data *this_dom, *prev_dom, *new_dom;
+
+ /* refresh *left_end_p from the head of the open-domain list */
+ if (*left_domain_head_p) { /* do we have a domain_link_chain? */
+ *left_end_p = (*left_domain_head_p)->end_pos;
+ }
+
+ if (ip == *left_end_p) { /* do this first before starting any new domains */
+ /* NOTE(review): ip is long but close_annot_match() takes an int - confirm positions fit */
+ close_annot_match(ip, annot_stack, have_push_features_p,
+ d_score_p, d_ident_p, d_alen_p, d_gaplen_p,
+ left_domain_head_p, left_end_p, init_score);
+ /* the deltas were added to the closed domains; start accumulating again */
+ *d_ident_p = *d_alen_p = *d_gaplen_p = 0;
+ *d_score_p = init_score;
+ return 0;
+ }
+ else {
+ /* initialize domfeat_data (scoring, boundary) information */
+ /* n_gaplen is not assigned here - presumably still 0 from the
+ calloc() in init_domfeat_data(); TODO confirm */
+ left_domain_p->next = NULL;
+ left_domain_p->score = 0;
+ left_domain_p->n_ident = 0;
+ left_domain_p->n_alen = 0;
+ left_domain_p->pos = ip;
+ left_domain_p->a_pos = ia;
+
+ if (annot_arr_p->label == 'V') { /* label == 'V' */
+ /* use the variant residue only if it scores better than the
+ current residue; record the improvement in *v_delta */
+ v_tmp = pam2aa0v[annot_arr_p->value];
+ if (v_tmp > *itmp) {
+ *v_delta += (v_tmp- *itmp);
+ *itmp = v_tmp;
+ *sp1 = sq[annot_arr_p->value];
+ if (sp1a) *sp1a = 'V';
+ if (ann_comment) *ann_comment = annot_arr_p->comment;
+ }
+ }
+ else if (annot_arr_p->label == '-') {
+
+ /* initialize this dom_entry */
+ left_domain_p->score = init_score;
+ left_domain_p->n_ident = left_domain_p->n_alen = 0;
+ *left_end_p = left_domain_p->end_pos = annot_arr_p->end;
+
+ if (*left_domain_head_p == NULL) {
+ /* no open domains: the new domain becomes the whole list */
+ *left_domain_head_p = left_domain_p;
+ *d_score_p = init_score;
+ }
+ else {
+ /* we already have a domain list - update scores for "live"
+ domains and insert new domain */
+
+ new_left_domain_end = annot_arr_p->end;
+ new_dom = prev_dom = NULL;
+
+ /* this loop tries to do two things:
+ (1) update the scores for all the currently active domains
+ (2) find the place to insert the new domain
+ */
+
+ for (this_dom = *left_domain_head_p; this_dom; this_dom = this_dom->next) {
+ /* here we update the scores */
+ this_dom->score += *d_score_p;
+ this_dom->n_ident += *d_ident_p;
+ this_dom->n_alen += *d_alen_p;
+ this_dom->n_gaplen += *d_gaplen_p;
+
+ /* then we check for an insertion location */
+ /* new_dom is overwritten on every match, so it ends up as the
+ predecessor of the LAST domain that ends beyond the new one */
+ if (this_dom->end_pos > new_left_domain_end) {
+ /* this_dom is beyond the new_left_domain_end, so link it to the previous domain */
+ new_dom = prev_dom;
+ }
+ prev_dom = this_dom;
+ }
+ /* all the scores are updated and new_dom is NULL (for beginning/end) or insertion location */
+
+ *d_ident_p = *d_alen_p = *d_gaplen_p = 0;
+ *d_score_p = init_score;
+
+ /* initialize this dom_entry */
+ left_domain_p->score = init_score;
+ left_domain_p->n_ident = left_domain_p->n_alen = 0;
+ left_domain_p->end_pos = annot_arr_p->end;
+
+ if (new_dom) { /* new_dom is NULL if the new domain goes first/last */
+ left_domain_p->next = new_dom->next;
+ new_dom->next = left_domain_p;
+ }
+ else { /* new_dom is NULL for start OR end, prev_dom has end of list */
+ if (prev_dom->end_pos < new_left_domain_end) { /* goes into the end of the list */
+ prev_dom->next = left_domain_p;
+ }
+ else {
+ /* place at start of list */
+ left_domain_p->next = *left_domain_head_p;
+ *left_domain_head_p = left_domain_p;
+ }
+ }
+ } /* done with domain insertion/ domain_end update */
+ *left_end_p = (*left_domain_head_p)->end_pos;
+ }/* all done with '[' */
+ else if (annot_stack) { /* not [-]V -- residue feature */
+ if (have_push_features_p) { *have_push_features_p = 1; }
+ push_stack(annot_stack, left_domain_p);
+ }
+ return 1;
+ }
+}
+
+/* next_annot_match() handles everything that happens at position ip:
+   it calls process_annot_match() for as long as the next annotation
+   (annot_arr[i_annot]) sits at ip, or ip equals *left_end_p.
+
+   *ann_comment (if ann_comment is not NULL) is cleared first.
+   Returns the updated annotation index i_annot.
+*/
+int
+next_annot_match(int *itmp, int *pam2aa0v,
+		 long ip, long ia, char *sp1, char *sp1a, const unsigned char *sq,
+		 int i_annot, int n_annot, struct annot_entry **annot_arr, char **ann_comment,
+		 void *annot_stack, int *have_push_features_p, int *v_delta,
+		 int *d_score_p, int *d_ident_p, int *d_alen_p, int *d_gaplen_p,
+		 struct domfeat_data **left_domain_head_p,
+		 struct domfeat_data *left_domain_p,
+		 long *left_end_p, int init_score) {
+  int annot_here, n_used;
+
+  if (ann_comment != NULL) { *ann_comment = NULL; }
+
+  /* count through the annotations at this position (long ip) */
+  for (;;) {
+    annot_here = (i_annot < n_annot && ip == annot_arr[i_annot]->pos);
+    if (!annot_here && ip != *left_end_p) { break; }
+
+    /* process_annot_match() returns 1 if it consumed annot_arr[i_annot],
+       0 if it only closed domains ending at ip */
+    n_used = process_annot_match(itmp, pam2aa0v, ip, ia, sp1, sp1a, sq,
+				 annot_arr[i_annot], n_annot, ann_comment,
+				 annot_stack, have_push_features_p, v_delta,
+				 d_score_p, d_ident_p, d_alen_p, d_gaplen_p,
+				 left_domain_head_p, &left_domain_p[i_annot],
+				 left_end_p, init_score);
+    i_annot += n_used;
+  }
+
+  return i_annot;
+}
+
+/* returns M_NEG, M_ZERO, M_POS, M_IDENT, M_DEL (a_mark.h)
+ updates *aln->nsim, npos, nident, nmismatch
+
+*/
+/* align_type() classifies one aligned residue pair and updates the
+   alignment counters.
+
+   score        - substitution score for the pair
+   sp0, sp1     - the two displayed residues (compared without case)
+   nt_align     - non-zero for nucleotide alignments ('N' is the
+                  unknown residue, T and U count as identical);
+                  zero otherwise ('X' is the unknown residue)
+   aln          - counters to update (nsim, npos, nident, nmismatch,
+                  ngap_q, ngap_l); may be NULL
+   pam_x_id_sim - > 0: N:N / X:X count as similar;
+                  < 0: N:N / X:X are removed from the identity count
+
+   Returns M_NEG, M_ZERO, M_POS or M_IDENT.
+*/
+int align_type(int score, char sp0, char sp1, int nt_align, struct a_struct *aln, int pam_x_id_sim) {
+  int spa_val;
+  int up0, up1;
+  int unknown_pair;
+
+  /* toupper() is only defined for unsigned char values (and EOF), so
+     convert before calling it; do it once instead of per comparison */
+  up0 = toupper((unsigned char)sp0);
+  up1 = toupper((unsigned char)sp1);
+
+  /* both residues are the "unknown" symbol of the alphabet in use */
+  unknown_pair = (nt_align && up0=='N' && up1=='N') ||
+    (!nt_align && up0=='X' && up1=='X');
+
+  if (score<0) {
+    spa_val = M_NEG;
+  }
+  else if (score == 0) {
+    spa_val = M_ZERO;
+    if (aln) aln->nsim++;
+  }
+  else {
+    spa_val = M_POS;
+    if (aln) {aln->nsim++; aln->npos++;}
+  }
+
+  /* correct for score < 0 with 'N:N'/'X:X' */
+  if (pam_x_id_sim > 0 && unknown_pair) {  /* > 0 -> identical, similar */
+    spa_val = M_POS;
+    if (aln) {
+      aln->nsim++;
+    }
+  }
+
+  /* assume a mismatch; taken back below if the pair is identical */
+  if (aln) aln->nmismatch++;
+  if (up0 == up1) {
+    spa_val = M_IDENT;
+    if (aln) {
+      aln->nident++;
+      aln->nmismatch--;
+    }
+  }
+  else if (nt_align) {
+    if ((up0 == 'T' && up1 == 'U') || (up0 == 'U' && up1 == 'T')) {
+      spa_val = M_IDENT;
+      if (aln) {
+	aln->nident++;
+	aln->nmismatch--;
+      }
+    }
+    /* add to gap count for 'N' matches ?? */
+    else if (aln && up0 == 'N') aln->ngap_q++;
+    else if (aln && up1 == 'N') aln->ngap_l++;
+  }
+
+  /* correct nident, nmismatch for N:N / X:X */
+  if (pam_x_id_sim < 0 && unknown_pair) {  /* < 0 -> not identical */
+    if (aln) {
+      aln->nident--;
+      aln->nmismatch++;
+    }
+  }
+
+  return spa_val;
+}
+
+/* seq_pos works with comment_var()/display_push_features()/do_url1() where
+ i_offset = nn for reversed sequences
+ off = 0 for 0 based offsets, 1 for 1-based offsets
+ */
+/* seq_pos() returns pos unchanged for forward sequences (rev == 0) and
+   -pos-1+off for reversed ones; off is ignored when rev == 0 */
+int
+seq_pos(int pos, int rev, int off) {
+  return rev ? (-pos-1 + off) : pos;
+}
+
+/* comment_var() appends the description of one variant residue to
+   annot_var_dyn.
+
+   target = 0 (aa0), 1 (aa1)
+
+   d_type = display_type (annot_fmt in cal_cons.c):
+   1 (long text),   " Variant: %ld%c%c%ld%c : %c%ld%c", followed by
+                    " : <ann_comment>" when ann_comment is not NULL,
+                    and a newline
+   2 (-m 9c code)   "|%c%c:%ld%c%c%ld%c"
+   any other value appends nothing
+
+   i0_pos/i1_pos have already been converted to reverse coordinate if necessary
+*/
+void comment_var (long i0_pos, char sp0, long i1_pos, char sp1, char o_sp1,
+		  char sim_char, const char *ann_comment,
+		  struct dyn_string_str *annot_var_dyn, int target, int d_type)
+{
+  char tmp_str[MAX_LSTR], ann_ch0, ann_ch1;
+  const char *d1_fmt;
+
+  if (d_type == 1) {
+    /* the positions are long, so the conversions must be %ld (the
+       original used %d, which does not match the argument type) */
+    if (target ==1) {
+      d1_fmt = " Variant: %ld%c%c%ld%c : %c%ld%c";
+      sprintf(tmp_str,d1_fmt,
+	      i0_pos+1, sp0, sim_char, i1_pos+1,sp1, o_sp1,i1_pos+1,sp1);
+    }
+    else {
+      d1_fmt = " qVariant: %ld%c%c%ld%c : %c%ld%c";
+      sprintf(tmp_str,d1_fmt,
+	      i0_pos+1, sp0, sim_char, i1_pos+1,sp1, o_sp1,(target?i1_pos+1:i0_pos+1),sp0);
+    }
+
+    dyn_strcat(annot_var_dyn, tmp_str);
+
+    if (ann_comment) {
+      /* append the comment directly: it has no length limit, so it
+	 must not be formatted into the fixed-size tmp_str[] */
+      dyn_strcat(annot_var_dyn, " : ");
+      dyn_strcat(annot_var_dyn, (char *)ann_comment);
+    }
+
+    dyn_strcat(annot_var_dyn, "\n");
+  }
+  else if (d_type == 2) {
+    /* 'V' marks the annotated sequence, 'X' the other one */
+    if (target == 1) {
+      ann_ch0 = 'X';
+      ann_ch1 = 'V';
+    }
+    else {
+      ann_ch0 = 'V';
+      ann_ch1 = 'X';
+    }
+
+    sprintf(tmp_str, "|%c%c:%ld%c%c%ld%c",
+	    ann_ch0,ann_ch1,
+	    i0_pos+1,sp0, sim_char,i1_pos+1,sp1);
+    dyn_strcat(annot_var_dyn, tmp_str);
+  }
+}
+
+/* display push features is designed to display both individual
+   residue features (active site, variant, modification site) and
+   domain boundaries.  Domain boundary information is displayed when
+   the domain is closed or the alignment boundary has been exceeded
+   (for open domains).  Thus, it uses the current site (i0_pos/i1_pos)
+   for the end, and the domfeat_data information popped from
+   annot_stack for the start and the scores.
+
+   Every entry on annot_stack is popped; the text for each is appended
+   to annot_var_dyn.
+
+   d_type == 1 : full display ([q]Region: %d-%ld:%d-%ld : score=%d ... )
+                 and [q]Site lines for residue features with a comment
+   d_type == 2 : -m9C/-m8CC calc_code "|RX:%d-%ld:%d-%ld:s=%d;b=%.1f;I=%.3f;Q=%.1f";
+   d_type == 3 : -m9I CALC_ID_DOM short domains, variants
+                 (domain names, only for target == 1)
+
+   A domain without a comment is shown with an empty description.
+*/
+
+void
+display_push_features(void *annot_stack, struct dyn_string_str *annot_var_dyn,
+		      long i0_pos, char sp0, long i1_pos, char sp1, char sym,
+		      int tot_score, double comp, int sw_score, int n0, int n1,
+		      void *pstat_void, int d_type) {
+  struct domfeat_data *this_dom_p;
+  double lbits, total_bits, zscore, lprob, lpercid;
+  char *ann_comment, *bp;
+  const char *dom_comment;
+  char tmp_lstr[MAX_LSTR], tmp_str[MAX_STR];
+  int q_min, l_min;
+  const char *dt1_fmt, *dt2_fmt;
+  double tot_sw_norm;
+
+  /* factor that rescales a domain score to the scale of tot_score */
+  tot_sw_norm = (double)tot_score/(double)sw_score;
+
+  zscore = find_z(tot_score, 1.0, n1, comp, pstat_void);
+  total_bits = zs_to_bit(zscore, n0, n1);
+
+  while ((this_dom_p = (struct domfeat_data *)pop_stack(annot_stack))!=NULL) {
+
+    /* start every entry with an empty string, so that nothing
+       uninitialized is appended when no branch below formats it */
+    tmp_lstr[0] = '\0';
+
+    if (this_dom_p->annot_entry_p->label == '-') {
+      /* domain: pos is the coordinate in the annotated sequence,
+	 a_pos the coordinate in the sequence aligned to it */
+      if (this_dom_p->annot_entry_p->target == 1) {
+	q_min = this_dom_p->a_pos+1;
+	l_min = this_dom_p->pos+1;
+	dt2_fmt = "|XR:%d-%ld:%d-%ld:s=%d;b=%.1f;I=%.3f;Q=%.1f";
+      }
+      else {
+	q_min = this_dom_p->pos+1;
+	l_min = this_dom_p->a_pos+1;
+	dt2_fmt = "|RX:%d-%ld:%d-%ld:s=%d;b=%.1f;I=%.3f;Q=%.1f";
+      }
+
+      if (this_dom_p->score < 0) {
+	lbits = 0.0;
+	lprob = 1.0;
+      }
+      else {
+	lbits = total_bits * (double)this_dom_p->score/sw_score;
+	zscore = find_z(this_dom_p->score * tot_sw_norm, 1.0, n1, comp, pstat_void);
+	lprob = zs_to_p(zscore);
+      }
+
+      /* convert the probability to -10*log10(p), limited to [0, 3000] */
+      if (lprob > 0.99) lprob = 0.0;
+      else if (lprob < 1e-300) lprob = 3000.0;
+      else lprob = -10.0*log(lprob)/log(10.0);
+
+      if (this_dom_p->n_alen - this_dom_p->n_gaplen > 0) {
+	lpercid = ((double)this_dom_p->n_ident)/(double)(this_dom_p->n_alen-this_dom_p->n_gaplen);
+      }
+      else lpercid = 0.0;	/* was -1.0, but 0.0 for consistency with annot_blast_btop2.pl */
+
+      /* "" (not '\0', which is a null pointer) when there is no comment */
+      dom_comment = (this_dom_p->annot_entry_p->comment) ? this_dom_p->annot_entry_p->comment : "";
+
+      if (d_type == 1) {
+	if (this_dom_p->annot_entry_p->target == 0) {
+	  dt1_fmt = " qRegion: %d-%ld:%d-%ld : score=%d; bits=%.1f; Id=%.3f; Q=%.1f : %s\n";
+	} else {
+	  dt1_fmt = " Region: %d-%ld:%d-%ld : score=%d; bits=%.1f; Id=%.3f; Q=%.1f : %s\n";
+	}
+	/* the comment has no length limit: bound the write to tmp_lstr[] */
+	snprintf(tmp_lstr, sizeof(tmp_lstr), dt1_fmt, q_min, i0_pos+1,
+		 l_min, i1_pos+1, this_dom_p->score, lbits, lpercid, lprob,
+		 dom_comment);
+      }
+      else if (d_type == 2) {
+	snprintf(tmp_lstr, sizeof(tmp_lstr), dt2_fmt,
+		 q_min, i0_pos+1,
+		 l_min, i1_pos+1, this_dom_p->score, lbits,lpercid, lprob);
+
+	if (this_dom_p->annot_entry_p->comment) {
+	  /* only the first word of the comment is used as the ";C=" tag */
+	  SAFE_STRNCPY(tmp_str,this_dom_p->annot_entry_p->comment,sizeof(tmp_str));
+	  if ((bp=strchr(tmp_str,' '))!=NULL) { *bp = '\0';}
+	  SAFE_STRNCAT(tmp_lstr,";C=",sizeof(tmp_lstr));
+	  SAFE_STRNCAT(tmp_lstr,tmp_str,sizeof(tmp_lstr));
+	}
+      }
+      else if (d_type == 3 && this_dom_p->annot_entry_p->target == 1) {	/* CALC_ID_DOM domain names */
+	SAFE_STRNCPY(tmp_str,dom_comment,sizeof(tmp_str));
+	/* comment out to allow spaces in domain names */
+	if ((bp=strchr(tmp_str,' ')) != NULL) { *bp='\0';}
+	snprintf(tmp_lstr, sizeof(tmp_lstr), "%s;",tmp_str);
+      }
+
+      dyn_strcat(annot_var_dyn, tmp_lstr);
+    }
+    else if ((ann_comment = this_dom_p->annot_entry_p->comment)) {
+      /* residue feature: only shown in the full display */
+      if (d_type == 1 ) {
+	if (this_dom_p->annot_entry_p->target == 0) {dt1_fmt = " qSite:%c : %ld%c%c%ld%c : %s\n";}
+	else {dt1_fmt = " Site:%c : %ld%c%c%ld%c : %s\n";}
+	snprintf(tmp_lstr, sizeof(tmp_lstr), dt1_fmt, this_dom_p->annot_entry_p->label,i0_pos+1, sp0,
+		 sym, i1_pos+1, sp1, ann_comment);
+	dyn_strcat(annot_var_dyn, tmp_lstr);
+      }
+    }
+  }
+}
diff --git a/src/dec_pthr_subs.c b/src/dec_pthr_subs.c
new file mode 100644
index 0000000..f9e2ad7
--- /dev/null
+++ b/src/dec_pthr_subs.c
@@ -0,0 +1,246 @@
+/* $Id: dec_pthr_subs.c 625 2011-03-23 17:21:38Z wrp $ */
+
+/* copyright (c) 1996, 1997, 1998, 1999 by William R. Pearson and the
+ Rector & Visitors of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+
+/* this file isolates the pthreads calls from the main program */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <sys/types.h>
+#include <signal.h>
+
+#include "param.h"
+
+#include <pthread.h>
+#define XTERNAL
+#include "thr.h"
+#undef XTERNAL
+#include "pthr_subs.h"
+
+extern void work_thread (struct thr_str *work_info);
+
+/* init_thr(): initialise the three mutex/condition-variable pairs, then create nthreads threads running work_thread() */
+
+void init_thr(int nthreads, struct thr_str *work_info)
+{
+ int status, i; /* NOTE(review): 'i' is declared but never used in this function */
+ pthread_attr_t thread_attr;
+
+ if (nthreads > MAX_WORKERS) { /* more threads requested than MAX_WORKERS: report and exit */
+ fprintf ( stderr," cannot start %d threads, max: %d\n",
+ nthreads, MAX_WORKERS);
+ exit(1);
+ }
+
+ /* mutex and condition variable initialisation (reader, worker and start-up objects) */
+
+ status = pthread_mutex_init(&reader_mutex, pthread_mutexattr_default);
+ check(status,"Reader_mutex init bad status\n");
+
+ status = pthread_mutex_init(&worker_mutex, pthread_mutexattr_default);
+ check(status,"Worker_mutex init bad status\n");
+
+ status = pthread_cond_init(&reader_cond_var, pthread_condattr_default);
+ check(status,"Reader_cond_var init bad status\n");
+
+ status = pthread_cond_init(&worker_cond_var, pthread_condattr_default);
+ check(status,"Worker_cond_var init bad status\n");
+
+ status = pthread_mutex_init(&start_mutex, pthread_mutexattr_default);
+ check(status,"Start_mutex init bad status\n");
+
+ status = pthread_cond_init(&start_cond_var, pthread_condattr_default);
+ check(status,"Start_cond_var init bad status\n");
+
+ /* change stacksize on threads */ /***************************/
+
+ status = pthread_attr_create( &thread_attr );
+ check(status,"attribute create bad status\n");
+
+ status = pthread_attr_setstacksize( &thread_attr, 1000000); /* stack size requested for every worker */
+ check(status,"stacksize change bad status\n");
+
+ /* start the worker threads */
+
+ for (work_info->worker=0; work_info->worker < nthreads; /* work_info->worker doubles as the loop counter */
+ work_info->worker++) {
+ /**********************/
+ status=pthread_create(&threads[work_info->worker],thread_attr,
+ (pthread_startroutine_t)&work_thread,
+ (pthread_addr_t)work_info); /* every thread is handed the same work_info pointer */
+ check(status,"Pthread_create failed\n");
+ }
+}
+
+void start_thr()
+{
+ int status;
+
+ /* tell threads to proceed: clear the start_thread predicate under start_mutex and wake every waiter on start_cond_var (see wait_thr()) */
+ status = pthread_mutex_lock(&start_mutex);
+ check(status,"Start_mutex lock bad status in main\n");
+
+ start_thread = 0; /* lower predicate */
+
+ status = pthread_cond_broadcast(&start_cond_var);
+ check(status,"Start_cond_var broadcast bad status in main\n"); /* previously this status was overwritten unchecked */
+ status = pthread_mutex_unlock(&start_mutex);
+ check(status,"Start_mutex unlock bad status in main\n");
+}
+
+void get_rbuf(struct buf_head **cur_buf, int max_work_buf) /* take the next entry of reader_buf[] into *cur_buf, blocking while num_reader_bufs is 0 */
+{
+ int status;
+
+ status = pthread_mutex_lock(&reader_mutex); /* lock reader_buf structure */
+
+ check(status,"Reader_mutex lock in master bad status\n");
+
+ /* no reader bufs: wait for signal to proceed (put_wbuf() signals reader_cond_var) */
+ while (num_reader_bufs == 0) {
+ pthread_cond_wait(&reader_cond_var,&reader_mutex); /* NOTE(review): return status is not checked here */
+ }
+
+ *cur_buf = reader_buf[reader_buf_readp]; /* get the buffer address */
+ reader_buf_readp = (reader_buf_readp+1)%(max_work_buf); /* increment index, wrapping at max_work_buf */
+ num_reader_bufs--;
+
+ status = pthread_mutex_unlock(&reader_mutex); /* unlock structure */
+ check(status,"Reader_mutex unlock in master bad status\n");
+}
+
+void put_rbuf(struct buf_head *cur_buf, int max_work_buf)
+{
+ int status;
+
+ /* put_rbuf(): queue cur_buf on worker_buf[] and wake one worker; this function itself never blocks on a condition */
+ status = pthread_mutex_lock(&worker_mutex); /* lock worker_buf_structure */
+ check(status,"Worker_mutex lock in master bad status\n");
+
+ /* Put buffer onto available for workers list (circular, wraps at max_work_buf) */
+ worker_buf[worker_buf_readp] = cur_buf;
+ worker_buf_readp = (worker_buf_readp+1)%(max_work_buf);
+ num_worker_bufs++; /* increment number of buffers available to workers */
+
+ /* Signal one worker to wake and start work */
+ status = pthread_cond_signal(&worker_cond_var);
+ check(status,"Worker_cond_var signal bad status in master\n"); /* previously this status was overwritten unchecked */
+ status = pthread_mutex_unlock(&worker_mutex);
+ check(status,"Worker_mutex unlock in master bad status\n");
+}
+
+void put_rbuf_done(int nthreads, struct buf_head *cur_buf, int max_work_buf)
+{
+ int status, i;
+ void *exit_value; /* receives each thread's exit value from pthread_join(); not used afterwards */
+
+ /* put_rbuf_done(): queue the final buffer, set reader_done, wake all workers, then join and detach nthreads threads */
+ status = pthread_mutex_lock(&worker_mutex); /* lock worker_buf_structure */
+ check(status,"Worker_mutex lock in master bad status\n");
+
+ /* Put buffer onto available for workers list (circular, wraps at max_work_buf) */
+ worker_buf[worker_buf_readp] = cur_buf;
+ worker_buf_readp = (worker_buf_readp+1)%(max_work_buf);
+ num_worker_bufs++; /* increment number of buffers available to workers */
+
+ /* Mark the reader as finished and wake ALL workers, so that idle ones see reader_done in get_wbuf() */
+
+ reader_done = 1;
+ status = pthread_cond_broadcast(&worker_cond_var);
+ check(status,"Worker_cond_var broadcast bad status in master\n"); /* previously this status was overwritten unchecked */
+ status = pthread_mutex_unlock(&worker_mutex);
+ check(status,"Worker_mutex unlock in master bad status\n");
+
+ /* wait for every worker thread to exit */
+
+ for (i=0; i < nthreads; i++) {
+ status = pthread_join( threads[i], &exit_value);
+ check(status,"Pthread_join bad status\n");
+
+ status = pthread_detach( &threads[i]); /* detach takes the thread's address in this (DEC-style) pthreads interface */
+ check(status,"Pthread_detach bad status\n");
+ }
+}
+
+void wait_thr() /* block the caller until start_thread becomes 0 (start_thr() clears it and broadcasts) */
+{
+ int status;
+
+ /* Wait on master to give start signal */
+ status = pthread_mutex_lock(&start_mutex);
+ check(status,"Start_mutex lock bad status in worker\n");
+
+ while (start_thread) { /* predicate is re-tested after every wake-up */
+ status = pthread_cond_wait(&start_cond_var, &start_mutex);
+ check(status,"Start_cond_wait bad status in worker\n");
+ }
+
+ status = pthread_mutex_unlock(&start_mutex);
+ check(status,"Start_mutex unlock bad status in worker\n");
+}
+
+int get_wbuf(struct buf_head **cur_buf, int max_work_buf) /* fetch the next queued buffer into *cur_buf; returns 1, or 0 when none are queued and reader_done is set */
+{
+ int status;
+
+ /* get a buffer to work on */
+ status = pthread_mutex_lock(&worker_mutex);
+ check(status,"First worker_mutex lock in worker bad status\n");
+
+ /* No worker_bufs available: wait for reader to produce some */
+ while (num_worker_bufs == 0) {
+ /* Exit if reader has finished */
+ if (reader_done) {
+ pthread_mutex_unlock(&worker_mutex); /* release the lock before the early return; status not checked */
+ return 0;
+ }
+ pthread_cond_wait(&worker_cond_var,&worker_mutex); /* woken by put_rbuf() (signal) or put_rbuf_done() (broadcast) */
+ } /* end while */
+
+ /* Get the buffer from list */
+ *cur_buf = worker_buf[worker_buf_workp];
+ worker_buf_workp = (worker_buf_workp+1)%(max_work_buf); /* circular index, wraps at max_work_buf */
+ num_worker_bufs--;
+
+ status = pthread_mutex_unlock(&worker_mutex);
+ check(status,"First worker_mutex unlock in worker bad status\n");
+ return 1;
+}
+
+void put_wbuf(struct buf_head *cur_buf, int max_work_buf) /* return cur_buf to reader_buf[] and wake the reader if the list had been empty */
+{
+ int status;
+
+ /* put buffer back on list for reader */
+ status = pthread_mutex_lock(&reader_mutex);
+ check(status,"Reader_mutex lock in worker bad status\n");
+
+ reader_buf[reader_buf_workp] = cur_buf;
+ reader_buf_workp = (reader_buf_workp+1)%(max_work_buf); /* circular index, wraps at max_work_buf */
+ num_reader_bufs++;
+
+ /* the list was empty before this insert (count is now 1): wake reader, which may be waiting in get_rbuf() */
+ if (num_reader_bufs == 1) {
+ pthread_cond_signal(&reader_cond_var); /* NOTE(review): return status is not checked here */
+ }
+
+ status = pthread_mutex_unlock(&reader_mutex);
+ check(status,"Reader_mutex unlock in worker bad status\n");
+}
diff --git a/src/dec_pthr_subs.h b/src/dec_pthr_subs.h
new file mode 100644
index 0000000..fd2b8e4
--- /dev/null
+++ b/src/dec_pthr_subs.h
@@ -0,0 +1,42 @@
+
+/* $Id: dec_pthr_subs.h 625 2011-03-23 17:21:38Z wrp $ */
+/* $Revision: 625 $ */
+
+#include <pthread.h>
+
+#define check(status,string) \
+ do { if ((status) == -1) perror(string); } while (0) /* error macro for thread calls; one statement, so safe under an unbraced if/else */
+/* Storage for the shared thread objects is defined only when XTERNAL is not set; the extern declarations that follow are always visible */
+#ifndef XTERNAL
+pthread_t threads[MAX_WORKERS];
+
+/* mutex stuff */
+
+pthread_mutex_t reader_mutex; /* empty buffer pointer structure lock */
+pthread_mutex_t worker_mutex; /* full buffer pointer structure lock */
+
+/* condition variable stuff */
+
+pthread_cond_t reader_cond_var; /* condition variable for reader */
+pthread_cond_t worker_cond_var; /* condition variable for workers */
+
+pthread_mutex_t start_mutex; /* start-up synchronisation lock */
+pthread_cond_t start_cond_var; /* start-up synchronisation condition variable */
+#endif /* !XTERNAL -- previously the externs below were also inside this block, so XTERNAL builds saw no declarations */
+
+extern pthread_t threads[];
+
+/* mutex stuff */
+
+extern pthread_mutex_t reader_mutex;
+extern pthread_mutex_t worker_mutex;
+
+/* condition variable stuff */
+
+extern pthread_cond_t reader_cond_var;
+extern pthread_cond_t worker_cond_var;
+
+extern pthread_mutex_t start_mutex;
+extern pthread_cond_t start_cond_var;
+extern int start_thread; /* start predicate; only declared here, its definition is not in this header */
+
diff --git a/src/defs.h b/src/defs.h
new file mode 100644
index 0000000..aae77e1
--- /dev/null
+++ b/src/defs.h
@@ -0,0 +1,171 @@
+/* Concurrent read version */
+
+/* $Id: defs.h 1261 2014-06-11 19:38:36Z wrp $ */
+/* $Revision: 1261 $ */
+
+#ifdef SUNOS /* SUNOS builds additionally include <sys/stdtypes.h> */
+#include <sys/stdtypes.h>
+#endif
+
+#ifndef IS_BIG_ENDIAN /* unless the build already set it, derive IS_BIG_ENDIAN from the compiler's macros */
+#if defined(__BIG_ENDIAN__) || defined(_BIG_ENDIAN)
+#define IS_BIG_ENDIAN
+#else
+#undef IS_BIG_ENDIAN
+#endif
+#endif
+
+#if !defined(MAX_WORKERS) && !defined(PCOMPLIB) /* default MAX_WORKERS for builds without PCOMPLIB */
+#define MAX_WORKERS 1
+#endif
+#if defined(PCOMPLIB) && !defined(MAXWRKR) /* default MAXWRKR for PCOMPLIB builds */
+#define MAXWRKR 64
+#endif
+
+#define SAFE_STRNCPY(dest,src,dest_len) do { strncpy((dest),(src),(dest_len)); (dest)[(dest_len)-1]='\0'; } while (0) /* bounded copy that always NUL-terminates dest; a single statement, so safe under an unbraced if/else (dest and dest_len are evaluated twice) */
+#define SAFE_STRNCAT(str,cat,str_len) strncat((str),(cat),(str_len)-strlen(str)-1) /* append cat to str without exceeding a buffer of str_len bytes (str is evaluated twice) */
+
+/* constants associated with displaying annotation links */
+#ifndef DESCR_OFFSET
+#define DESCR_OFFSET 20
+#endif
+
+#define NO_FILE_EXIT 4 /* NOTE(review): presumably a process exit status for a missing file -- confirm against callers */
+
+
+/* 3-Oct-2003 - we can now have 2 nucleotide query types, DNA
+ and RNA. pst.dnaseq can also be SEQT_RNA.
+ ldnaseq can only be DNA */
+
+#define SEQT_DNA 1
+#define SEQT_RNA 3 /* DNA and RNA seqtypes must be odd */
+
+#define SEQT_PROT 0
+#define SEQT_UNK -1 /* sequence type not yet known */
+#define SEQT_OTHER 2
+
+#ifndef DEF_NMLEN /* default name length; may be overridden by the build */
+#define DEF_NMLEN 6
+#endif
+
+#define DEF_MIN_BITS 40 /* minimum number of bits required, appropriate for swissprot */
+
+/* unfortunately, there is an important relationship between MAXTRN and
+ MAXLIB embedded here. MAXTRN must be >= (MAXLIB)/3
+ or it will be possible for a translated DNA sequence to be longer
+ than the translation space available */
+
+#define MAX_STR 512 /* standard label/message buffer */
+#define MAX_SSTR 32 /* short string */
+#define MAX_LSTR 4096 /* long label/message buffer */
+#define MAX_FN 120 /* maximum size of a file name */
+#define MAX_CH 40 /* maximum number of library choices */
+#ifndef SMALLMEM
+#define MAX_LF 2000 /* maximum number of library files */
+#else
+#define MAX_LF 80 /* maximum number of library files (SMALLMEM build) */
+#endif
+
+#ifndef MAX_MEMK /* library memory limit in units of 1K (i.e. bytes = MAX_MEMK << 10); may be overridden by the build */
+#if defined(BIG_LIB64) && (defined(COMP_THR) || defined(PCOMPLIB))
+#define MAX_MEMK (8*1024*1024) /* 8 GB (<<10) for library in memory; parenthesized so the macro is safe inside larger expressions */
+#else
+#define MAX_MEMK (2*1024*1024) /* 2 GB (<<10) for library in memory */
+#endif
+#endif
+
+/* padding at the end of sequences for ALTIVEC, other vector
+ processors */
+#define SEQ_PAD 16
+
+#define MAX_UID 20 /* length of libstr, used for character keys with SQL */
+
+#define BUF_MULT 2 /* increase to increase the number of buffers */
+#define DEF_WORKER_BUF 6000000
+#define AVE_AA_LEN 400
+#define AVE_NT_LEN 1200
+
+#define MAX_RSTATS 500 /* number of random shuffle stats */
+#define MIN_LOCAL_LEN 33 /* minimum length for addn'l local alignments
+ (should be in pstruct)*/
+#ifndef SMALLMEM /* normal limits; the #else branch holds the reduced SMALLMEM limits */
+#define MAXTST 40000 /* longest query */
+#define MAXLIB 150000 /* longest library sequence*/
+#define MAXLIB_P 45000
+#define MIN_RES 2000 /* minimum amount allocated for alignment */
+#ifndef TFAST
+#define MAXTRN 45000 /* buffer for fastx translation */
+#else
+#define MAXTRN 165000 /* buffer for tfastx translation, must be > 3 * MAXTST */
+#endif
+#define SEQDUP 150 /* future - overlap */
+#ifndef PCOMPLIB /* note: both branches of this test currently define the same MAX_BEST/MAX_STATS values */
+#ifndef MAX_BEST
+#define MAX_BEST 60000 /* max number of best scores */
+#endif
+#define MAX_STATS 60000
+#else
+#ifndef MAX_BEST
+#define MAX_BEST 60000 /* max number of best scores */
+#endif
+#define MAX_STATS 60000
+#endif
+#define BIGNUM 1000000000
+#ifndef MAXINT
+#define MAXINT 2147483647
+#endif
+#define MAXLN 120 /* size of a library name */
+#else
+#define MAXTST 1500
+#define MAXLIB 10000
+#define MAXLIB_P MAXLIB
+#define MIN_RES 1000
+#ifndef TFAST
+#define MAXTRN 4500
+#else
+#define MAXTRN 11500
+#endif
+#define SEQDUP 300
+#define MAX_BEST 2000
+#define MAX_STATS 20000
+#define BIGNUM 32767
+#define MAXINT 32767
+#define MAXLN 40 /* size of a library name */
+#endif
+#if !defined(TFAST) /* MAXDIAG is the query limit plus the library (or, for TFAST, translation) limit */
+#define MAXDIAG (MAXTST+MAXLIB)
+#else
+#define MAXDIAG (MAXTST+MAXTRN)
+#endif
+
+#define MAXPAM 600 /* maximum allowable size of the pam matrix */
+#define PROF_MAX 500
+#define ALF_MAX 30
+
+#define max(a,b) (((a) > (b)) ? (a) : (b)) /* function-like macro: the selected argument is evaluated twice */
+#define min(a,b) (((a) < (b)) ? (a) : (b)) /* function-like macro: the selected argument is evaluated twice */
+
+#define MX_ATYPE 7 /* markx==0,1,2 7=> no alignment */
+#define MX_ASEP 8 /* markx==3 - separate lines */
+#define MX_AMAP 16 /* markx==4,5 - graphic map */
+#define MX_HTML 32 /* markx==6 - HTML */
+#define MX_M9SUMM 64 /* markx==9(c) */
+#define MX_M10FORM 128 /* markx==10 - verbose output */
+#define MX_M11OUT 256 /* markx==11 - lalign lav */
+#define MX_M8OUT 512 /* markx==8 blast8 output */
+#define MX_M8COMMENT 1024 /* markx==8 blast8 output */
+#define MX_MBLAST 2048 /* markx=B blast output */
+#define MX_MBLAST2 4096 /* markx=BB more blast output */
+#define MX_ANNOT_COORD 16384 /* -m 0, use -m 0B for both */
+#define MX_ANNOT_MID 32768 /* markx 0M, 1M, 2M annotations in middle */
+#define MX_RES_ALIGN_SCORE (1<<20) /* show residue alignment score, not alignment */
+
+/* codes for -m 9 */
+#define SHOW_CODE_ID 1 /* identity only */
+#define SHOW_CODE_IDD 2 /* identity with domains */
+#define SHOW_CODE_ALIGN 4 /* encoded alignment */
+#define SHOW_CODE_CIGAR 8 /* CIGAR vs old encoded alignment */
+#define SHOW_CODE_BTOP 16 /* BLAST BTOP encoding */
+#define SHOW_CODE_MASK 12 /* use higher bits for annotation format */
+#define SHOW_CODE_EXT 16 /* encode identity, mismatch state; NOTE(review): same value as SHOW_CODE_BTOP -- confirm this is intended */
+#define SHOW_ANNOT_FULL 32 /* show full-length annot in calc_code */
diff --git a/src/doinit.c b/src/doinit.c
new file mode 100644
index 0000000..f64b3a0
--- /dev/null
+++ b/src/doinit.c
@@ -0,0 +1,975 @@
+/* doinit.c general and function-specific initializations */
+
+/* $Id: doinit.c 1267 2014-07-29 13:50:40Z wrp $ */
+
+/* copyright (c) 1996, 1997, 1998, 2014 by William R. Pearson and the
+ Rector & Visitors of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+/* this file performs general initializations of search parameters
+
+ In addition, it calls several functions in init??.c that provide
+ program-specific initializations:
+
+ f_initenv() - called from initenv()
+ f_getopt() - called from initenv() during a getopt() scan
+ f_getarg() - called from initenv() after the getopt() scan
+
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#if defined(UNIX) || defined(_MACH)
+#include <unistd.h>
+#endif
+#ifndef PCOMPLIB
+#ifdef IRIX
+#include <sys/sysmp.h>
+#endif
+#else
+#include "msg.h" /* need for FIRSTNODE */
+#ifdef MPI_SRC
+#include "mpi.h"
+#endif
+#endif
+
+#ifdef WIN32
+#include <windows.h>
+#endif
+
+#include "defs.h"
+#include "param.h"
+#include "upam.h" /* required for 'U' option change of nascii */
+
+#include "structs.h"
+
+#define XTERNAL
+#include "uascii.h"
+#undef XTERNAL
+
+#ifdef UNIX
+#include <getopt.h>
+#else
+extern int optind; /* used by getopt() */
+extern char *optarg;
+#endif
+
+char prog_name[MAX_FN]; /* copy of argv[0], filled in by initenv() */
+
+extern void f_initenv(struct mngmsg *, struct pstruct *, unsigned char **);
+extern void f_lastenv(struct mngmsg *, struct pstruct *);
+extern void f_getopt(char, char *, struct mngmsg *, struct pstruct *);
+extern void f_getarg(int, char **, int, struct mngmsg *, struct pstruct *);
+extern void show_help(char *, int pgm_id);
+extern void show_all_help(char *pgm_name, int pgm_id);
+void g_init_opts(struct mngmsg *, struct pstruct *);
+void subs_env(char *dest, char *src, int dest_size);
+
+void add_ascii_ann(int *qascii, unsigned char *ann_arr);
+static int set_markx(int markx, int val, char c);
+static void pre_parse_markx(char *opt_arg, struct mngmsg *m_msp);
+static void parse_markx(char *opt_arg, struct markx_str *this_markx);
+static void get_annot_def_file(struct mngmsg *m_msp, char *fa_annot_env);
+void markx_to_m_msp(struct mngmsg *m_msp, struct markx_str *this_markx);
+void m_msp_to_markx(struct markx_str *this_markx, struct mngmsg *m_msp);
+
+int optcnt;
+int fa_max_workers=MAX_WORKERS; /* worker/thread limit; initenv() resets it from the platform and from -T */
+#ifdef PCOMPLIB
+int worker_1=0; /* first and last worker of a "-T n-m" range, parsed in initenv() */
+int worker_n=0;
+#endif
+
+extern struct opt_def_str f_options[]; /* function-specific option table, defined elsewhere */
+
+void set_opt_disp_defs(char opt_char, struct opt_def_str *options, int type,
+ int i_param1, int i_param2,
+ double d_param1, double d_param2, char *s_param);
+
+/* ****************************************************************
+ The option/-help system has been substantially restructured to
+ allow more consistent -h/-help messages.
+
+ There are now two global arrays, opt_def_str g_options (global
+ options, parsed in doinit.c), and opt_def_str f_options
+ (function-specific options, parsed in initfa.c)
+
+ struct opt_def_str {
+ char opt_char; # getopt single character option letter
+ int has_arg; # does it have an option?
+ char *opt_str; # getopt_long (future) long option name
+ char *opt_descr_s; # short description of option
+ char *opt_descr_l; # long description of option (if NULL, use opt_descr_s)
+ int opt_rank; # rank of option (not used)
+ int fmt_type; # fmt type (for defaults): 1,2 ints, 3,4 doubles
+ int i_param1; # int default1
+ int i_param2;
+ double d_param1; # double default1
+ double d_param2;
+ };
+
+ the g_opt_string and f_opt_string's parsed by getopt() are built
+ from these structures, guaranteeing that the options and help
+ messages are kept in sync.
+
+ long options descriptions (opt_descr_l) are saved in static arrays
+ (e.g. m_opt_descr[] in doinit.c, z_opt_descr[], s_opt_descr[] in
+ initfa.c
+
+ The default option values, which are displayed from i_param[1,2],
+ d_param[1,2], are set by g_init_opts() and f_init_opts() using
+ set_opt_disp_defs(). g_init_opts()/f_init_opts() should be called
+ as late as possible in the program.
+
+ **************************************************************** */
+
+static char m_opt_descr[] ="Output/alignment format;\n 0 - standard \":. \" alignment; 1 - \" xX\"; 2 - \".MS..\"; 3 - separate >fasta entries;\n 4 - \"---\" alignment map; 5 - 0+4; 6 - <html>;\n 8 - BLAST tabular; 8C commented BLAST tabular; 8CC BLAST tab CIGAR, 8CD BLAST tab CIGAR ext; 8CB BLAST tab BTOP\n B - BLAST Query/Sbjct alignments; BB - complete BLAST output;\n 9 - FASTA tabular; 9c - FASTA tabular encoded; 9C FASTA tabular CIGAR encoded; 9B FASTA tabul [...]
+
+struct opt_def_str g_options[] = { /* global options handled in initenv(); NOTE(review): initializers carry a trailing 12th value, presumably the s_param default -- confirm against struct opt_def_str */
+ {'C', 1, "aname_length", "length of the query/sbjct name in alignments", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
+ {'D', 0, "debug", "enable debugging output", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
+ {'e', 1, "expand", "expand_script to extend hits", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
+ {'F', 1, "evalue_min", "min E()-value displayed", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
+#if defined(PCOMPLIB) || !defined(SHOW_HIST) /* -H toggles whichever histogram default this build uses */
+ {'H', 0, "histogram", "show histogram", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
+#else
+ {'H', 0, "nohist", "no histogram", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
+#endif
+ {'i', 0, "revcomp", "search with reverse-complement", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
+#ifdef SHOW_HELP
+ {'I', 0, "interact", "interactive mode", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
+#endif
+ {'l', 1, "fastlibs", "FASTLIBS abbreviation file", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
+ {'L', 0, "long_info", "long library descriptions", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
+ {'m', 1, "outfmt", "output format", &m_opt_descr[0], 0, 0, 0, 0, 0.0, 0.0, NULL},
+ {'N', 1, "lib_length", "max library length before overlapping", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
+ {'o', 1, "offsets", "offset coordinates of query/subject", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
+ {'O', 1, "out", "write results to file", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
+#ifndef SHOW_HELP
+ {'q', 0, "quiet", "quiet -- do not prompt", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
+ {'Q', 0, "\0", "quiet -- do not prompt", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
+#else
+ {'q', 0, "quiet", "quiet [default] -- do not prompt", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
+ {'Q', 0, "\0", "quiet [default] -- do not prompt", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
+#endif
+ {'R', 1, "results_file", "raw score file", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
+ {'T', 1, "threads", "max threads/workers", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
+ {'v', 1, "shuffle_window", "shuffle window size", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
+ {'V', 1, "annotation", "annotation characters in query/library for aligments", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
+ {'w', 1, "aln_width", "width of alignment display", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
+ {'Z', 1, "db_size", "database size for E()-value", "[library entries] database size for E()-value", 0, 0, 0, 0, 0.0, 0.0, NULL},
+ {'\0', 0, "", "", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL} /* terminating entry: opt_char '\0' */
+};
+
+/* set default option values for help: copies the current m_msp/ppst settings into g_options via set_opt_disp_defs() */
+void g_init_opts(struct mngmsg *m_msp, struct pstruct *ppst) {
+ set_opt_disp_defs('C', g_options, 1, m_msp->nmlen, 0, 0.0, 0.0, NULL);
+ set_opt_disp_defs('F', g_options, 3, 0, 0, m_msp->e_low, 0.0, NULL); /* type 3: a double default */
+ set_opt_disp_defs('m', g_options, 1, m_msp->markx, 0, 0.0, 0.0, NULL);
+ set_opt_disp_defs('o', g_options, 2, (int)m_msp->sq0off, (int)m_msp->sq1off, 0.0, 0.0,NULL); /* type 2: two int defaults */
+ set_opt_disp_defs('T', g_options, 1, fa_max_workers, 0, 0.0, 0.0, NULL);
+ set_opt_disp_defs('v', g_options, 1, ppst->zs_win, 0, 0.0, 0.0, NULL);
+ set_opt_disp_defs('w', g_options, 1, m_msp->aln.llen, 0, 0.0, 0.0, NULL);
+}
+
+void
+build_optstr(char *opt_str, int opt_len, struct opt_def_str *opt_defs); /* defined elsewhere; initenv() uses it to build the getopt() strings */
+
+static int long_info_set=0; /* set to 1 when -L is seen in initenv() */
+static int llen_set = 0; /* set to 1 when -w is seen in initenv() */
+static int markx_set = 0; /* set to 1 when -m is seen in initenv() */
+
+/* initenv () initializes the environment: sets the defaults in *m_msp and *ppst, parses the command-line options with getopt() (global ones here, function-specific ones via f_getopt()), then hands the remaining arguments to f_getarg() */
+void initenv (int argc, char **argv, struct mngmsg *m_msp,
+ struct pstruct *ppst, unsigned char **aa0)
+{
+ char *cptr, *bp, *bp1;
+ int copt;
+#ifdef WIN32
+ SYSTEM_INFO siSysInfo;
+#endif
+
+ /* options for all search functions */
+ /* char *g_optstr = "b:BC:d:DE:F:HiK:l:Lm:N:O:QqR:T:v:V:w:W:X:Z:"; */
+
+ char g_optstring[MAX_STR];
+ char f_optstring[MAX_STR];
+ char optstring[MAX_STR];
+
+ /* help functions exit(); try first */
+ if (argc == 1) {
+ show_help(m_msp->pgm_name, ppst->pgm_id);
+ }
+ if (argc > 1 && (strcmp(argv[1],"-help")==0 || strcmp(argv[1],"--help")==0)) { /* argc guard: never pass a NULL argv[1] to strcmp() */
+ show_all_help(m_msp->pgm_name, ppst->pgm_id);
+ }
+
+ build_optstr(g_optstring, sizeof(g_optstring), g_options); /* was sizeof(f_optstring): same size, wrong buffer named */
+ build_optstr(f_optstring, sizeof(f_optstring), f_options);
+
+/* these initializations will be used by all functions */
+
+ /* prog_name[] is only used for error messages */
+ strncpy(prog_name,argv[0],sizeof(prog_name));
+ prog_name[sizeof(prog_name)-1]='\0';
+
+#ifdef PCOMPLIB
+#ifdef MPI_SRC
+ MPI_Comm_size(MPI_COMM_WORLD,&fa_max_workers);
+ if (fa_max_workers <= 1) {
+ fprintf(stderr," nnodes = %d; no workers available\n",fa_max_workers);
+ exit(1);
+ }
+ else {
+ fa_max_workers -= FIRSTNODE;
+ fprintf(stderr," have %d workers\n",fa_max_workers);
+ }
+#endif
+#else /* not PCOMPLIB */
+#if defined(IRIX)
+ fa_max_workers = sysmp(MP_NPROCS);
+#else
+#if defined(WIN32)
+ GetSystemInfo(&siSysInfo);
+ fa_max_workers = siSysInfo.dwNumberOfProcessors;
+#endif
+#if defined(UNIX) || defined(HAVE_SYSCONF)
+ fa_max_workers = sysconf(_SC_NPROCESSORS_CONF);
+#endif /* UNIX || SYSCONF */
+#endif /* !IRIX */
+#endif /* !PCOMPLIB */
+
+ m_msp->ltitle[0] = '\0';
+
+ if ((cptr=getenv("FASTLIBS"))!=NULL) { /* the FASTLIBS environment variable supplies the default for -l */
+ strncpy(m_msp->flstr,cptr,MAX_FN);
+ m_msp->flstr[MAX_FN-1] = '\0';
+ }
+ else m_msp->flstr[0]='\0';
+
+ m_msp->std_output = 1;
+ m_msp->hist.hist_a = NULL;
+ m_msp->outfile[0] = '\0';
+ m_msp->outfd = NULL;
+ m_msp->ldb_info.ldnaseq = SEQT_PROT; /* library is protein */
+ m_msp->n1_low = ppst->n1_low = 0;
+ m_msp->n1_high = ppst->n1_high = BIGNUM;
+ m_msp->ql_start = 1; /* start with first query sequence */
+ m_msp->ql_stop = BIGNUM; /* end with the last query sequence */
+ m_msp->aa1save_buf_b = NULL;
+ m_msp->bline_buf_b = NULL;
+
+ m_msp->pamd1 = MAXSQ;
+ m_msp->pamd2 = MAXSQ;
+
+ m_msp->ldb_info.term_code = 0;
+
+ ppst->tr_type = 0;
+ ppst->debug_lib = 0;
+ m_msp->nshow = 20;
+ ppst->max_repeat = 50;
+ m_msp->nohist = 1;
+#if defined(PCOMPLIB)
+ m_msp->mshow = 20;
+#else
+#ifdef SHOW_HIST
+ m_msp->nohist = 0;
+#endif
+ m_msp->mshow = 50;
+#endif
+ m_msp->do_showbest = 1;
+ m_msp->ashow = -1;
+ m_msp->ashow_set = 0;
+ m_msp->nmlen = DEF_NMLEN;
+ m_msp->z_bits = 1;
+ m_msp->tot_ident = 0;
+ m_msp->mshow_set = 0;
+ m_msp->mshow_min = 0;
+ m_msp->aln.llen = 60;
+ m_msp->aln.llcntx = 30;
+ m_msp->aln.llcntx_set = 0;
+ m_msp->e_low = 0.0;
+ m_msp->e_cut_set = 0;
+ m_msp->revcomp = 0;
+ m_msp->long_info = 0;
+ m_msp->ldb_info.maxn = 0;
+ m_msp->ldb_info.dupn = SEQDUP;
+ m_msp->dfile[0] = '\0';
+ m_msp->tname[0] = '\0';
+ m_msp->lname[0] = '\0';
+ m_msp->link_lname[0] = '\0';
+ m_msp->show_code = 0;
+ m_msp->tot_show_code = 0;
+ m_msp->aln.showall = 0;
+ m_msp->markx = 0;
+ m_msp->tot_markx = 0;
+ m_msp->markx_list = NULL;
+ m_msp->align_done = 0;
+ m_msp->sq0off = m_msp->sq1off = 1;
+ strncpy(m_msp->sqnam,"aa",4);
+ strncpy(m_msp->sqtype,"protein",10);
+
+ /* annotation info */
+ m_msp->ann_flg = 0;
+ memset(m_msp->ann_arr,'\0',MAX_FN);
+ m_msp->ann_arr_def[0] = NULL;
+ m_msp->ann_arr_def[1] = NULL;
+ m_msp->annot0_sname[0]='\0';
+ m_msp->annot1_sname[0]='\0';
+ m_msp->annot_p = NULL;
+ m_msp->aa0a = NULL;
+
+ ppst->LK_set = 0;
+ ppst->e_cut = m_msp->e_cut = 10.0;
+ ppst->e_cut_r = ppst->e_cut / 10.0;
+ ppst->do_rep = 1;
+ ppst->zs_win = 0;
+ ppst->show_ident = 0;
+
+ ppst->zdb_size = -1;
+ ppst->zdb_size_set = 0;
+ ppst->dnaseq = SEQT_PROT; /* default is protein */
+ ppst->nt_align = 0;
+
+ ppst->other_info = NULL;
+
+ g_init_opts(m_msp, ppst);
+
+ f_initenv (m_msp, ppst, aa0);
+
+ SAFE_STRNCPY (optstring, g_optstring, sizeof (optstring)); /* getopt() is given the global string followed by the function-specific one */
+ SAFE_STRNCAT (optstring, f_optstring, sizeof (optstring));
+
+ while ((copt = getopt (argc, argv, optstring)) != EOF)
+ {
+ if (strchr (g_optstring, copt) != NULL)
+ {
+ switch (copt) { /* switches for all options */
+ case 'C':
+ sscanf(optarg,"%d",&m_msp->nmlen);
+ if (m_msp->nmlen > MAX_UID-1) m_msp->nmlen = MAX_UID-1;
+ break;
+ case 'D': ppst->debug_lib = 1;
+ break;
+ case 'e':
+ strncpy(m_msp->link_lname, optarg, MAX_LSTR); m_msp->link_lname[MAX_LSTR-1]='\0'; /* strncpy() does not terminate on truncation */
+ break;
+ case 'F':
+ sscanf(optarg,"%lg",&m_msp->e_low);
+ m_msp->e_cut_set = 1;
+ break;
+#if defined(PCOMPLIB) || !defined(SHOW_HIST)
+ case 'H':
+ m_msp->nohist = 0; break;
+#else
+ case 'H':
+ m_msp->nohist = 1; break;
+#endif
+ case 'i':
+ m_msp->revcomp = 1; break;
+ case 'I':
+ m_msp->quiet = 0; break;
+ case 'l':
+ strncpy(m_msp->flstr,optarg,MAX_FN);
+ m_msp->flstr[MAX_FN-1]='\0';
+ break;
+ case 'L':
+ m_msp->long_info = 1;
+ long_info_set = 1;
+ break;
+ case 'm':
+ pre_parse_markx(optarg, m_msp);
+ markx_set = 1;
+ break;
+ case 'N':
+ sscanf(optarg,"%d",&m_msp->ldb_info.maxn);
+ break;
+ case 'o':
+ sscanf (optarg,"%ld %ld",&m_msp->sq0off,&m_msp->sq1off); break;
+ case 'O':
+ strncpy(m_msp->outfile,optarg,MAX_FN);
+ m_msp->outfile[MAX_FN-1]='\0';
+ break;
+ case 'q':
+ case 'Q':
+ m_msp->quiet = 1;
+ break;
+ case 'R':
+ strncpy (m_msp->dfile, optarg, MAX_FN);
+ m_msp->dfile[MAX_FN-1]='\0';
+ break;
+ case 'T':
+#ifdef PCOMPLIB
+ if (strchr(optarg,'-') != NULL) { /* "-T n-m": explicit worker range */
+ sscanf(optarg,"%d-%d",&worker_1,&worker_n);
+ if (worker_1 > worker_n) {
+ worker_1 = worker_n = 0;
+ }
+ }
+ else
+#endif
+ sscanf (optarg, "%d", &fa_max_workers);
+ if (fa_max_workers < 0) fa_max_workers=1;
+ break;
+ case 'v':
+ sscanf (optarg,"%d",&ppst->zs_win);
+ break;
+ case 'V':
+ if (optarg[0] == '=') { /* "-V =file": annotation definitions read from a file */
+ get_annot_def_file(m_msp, optarg+1);
+ }
+ else if ((cptr = getenv("FA_ANNOT_DEF"))) {
+ get_annot_def_file(m_msp, cptr);
+ }
+ else if (optarg[0] == 'q' && (optarg[1]=='!' || optarg[1]=='<')) { /* "q!..." / "q<...": stored (without the 'q') in annot0_sname */
+ strncpy(m_msp->annot0_sname,optarg+1,MAX_LSTR); m_msp->annot0_sname[MAX_LSTR-1]='\0'; /* ensure termination on truncation */
+ m_msp->ann_flg = 2;
+ }
+ else if (optarg[0]=='!' || optarg[0]=='<') { /* "!..." / "<...": stored in annot1_sname */
+ strncpy(m_msp->annot1_sname,optarg,MAX_LSTR); m_msp->annot1_sname[MAX_LSTR-1]='\0'; /* ensure termination on truncation */
+ m_msp->ann_flg = 2;
+ }
+ else { /* otherwise the argument is a list of annotation characters, stored from ann_arr[1] */
+ strncpy((char *)m_msp->ann_arr+1,optarg,MAX_FN-2);
+ m_msp->ann_arr[0]='\0';
+ m_msp->ann_arr[MAX_FN-2]='\0';
+ m_msp->ann_arr_n = strlen((char *)m_msp->ann_arr+1);
+ if (m_msp->ann_flg ==0) m_msp->ann_flg = 1;
+ }
+
+ if (strlen((char *)m_msp->ann_arr) > 0) {
+ add_ascii_ann(qascii, m_msp->ann_arr);
+ }
+
+ break;
+/*
+ case 'V':
+ fprintf(stderr," -V option not currently supported in parallel\n");
+ break;
+*/
+ case 'w':
+ sscanf (optarg,"%d",&m_msp->aln.llen);
+ if (m_msp->aln.llen < 10) m_msp->aln.llen = 10; /* clamp the alignment width to 10..200 */
+ if (m_msp->aln.llen > 200) m_msp->aln.llen = 200;
+ if (!m_msp->aln.llcntx_set) m_msp->aln.llcntx = m_msp->aln.llen/2;
+ llen_set = 1;
+ break;
+ case 'Z':
+ sscanf(optarg,"%ld",&ppst->zdb_size);
+ ppst->zdb_size_set = 1;
+ break;
+ }
+ }
+ else if (strchr (f_optstring, copt))
+ f_getopt (copt, optarg, m_msp, ppst);
+ }
+ optind--; /* step back one so that argv[optind+1] below is the first non-option argument */
+
+ if (!markx_set || !(m_msp->markx & (MX_ATYPE+MX_ANNOT_COORD+MX_ANNOT_MID))) {
+ m_msp->markx = set_markx(m_msp->markx, 0, '\0');
+ }
+
+ /* done with options, check for initializations in initfa.c
+ (set sascii alphabet) */
+ f_lastenv (m_msp, ppst);
+
+ if (argc - optind < 3) return; /* fewer than two non-option arguments: nothing more to record */
+ m_msp->tnamesize = sizeof (m_msp->tname);
+ if (argc - optind > 1) {strncpy (m_msp->tname, argv[optind + 1],MAX_FN); m_msp->tname[MAX_FN-1]='\0';}
+ if (argc - optind > 2) {strncpy(m_msp->lname, argv[optind + 2],MAX_LSTR); m_msp->lname[MAX_LSTR-1]='\0';}
+ f_getarg (argc, argv, optind, m_msp, ppst);
+}
+
+/* ann_scan scans an aa0 query sequence if -V ann_chars, and returns
+ an edited query sequence and allocates aa0a[n_n0+2] space for the
+ annotation; the return value is the number of residues left in aa0 */
+
+int
+ann_scan(unsigned char *aa0, int n0, unsigned char **aa0a_p, int seqtype)
+{
+ unsigned char *aa0p, *aa0d, *aa0ad; /* read cursor, sequence write cursor, annotation write cursor */
+ int n_n0;
+
+ /* count how many "real" residues */
+
+ if (seqtype==SEQT_UNK) {
+ /* with SEQT_UNK, annotation characters are all < @,
+ while sequence chars are all > @ */
+ for (n_n0=0, aa0p = aa0; aa0p < aa0+n0; aa0p++) {
+ if (*aa0p > '@' || *aa0p == ESS ) n_n0++; /* ESS captures ',' in sequence */
+ }
+ }
+ else {
+ /* if the sequence type is known, then annotation chars are > NANN */
+ for (n_n0=0, aa0p = aa0; aa0p < aa0+n0; aa0p++) {
+ if (*aa0p < NANN ) n_n0++;
+ }
+ }
+
+ if (n_n0 == n0) { /* every character is a residue: no annotation array is needed */
+ *aa0a_p = NULL;
+ return n_n0;
+ }
+
+ aa0d = aa0; /* the sequence is compacted in place, overwriting aa0 */
+ /* n_n0 has the real sequence length */
+ if ((*aa0a_p = calloc(n_n0+2, sizeof(char)))==NULL) {
+ fprintf(stderr," cannot allocate annotation sequence: %d\n",n_n0);
+
+ /* this section is for failure, simply copy the correct sequence
+ and ignore the annotations */
+ if (seqtype==SEQT_UNK) {
+ for (aa0p = aa0; aa0p < aa0+n0; aa0p++) {
+ if (*aa0p > '@' || *aa0p == ESS) {*aa0d++ = *aa0p;}
+ }
+ }
+ else {
+ for (aa0p = aa0; aa0p < aa0+n0; aa0p++) {
+ if (*aa0p < NANN) {*aa0d++ = *aa0p;}
+ }
+ }
+ *aa0d = '\0';
+ return n_n0;
+ }
+
+ /* have aa0a_p annotation array allocated */
+ aa0ad = *aa0a_p;
+ if (seqtype==SEQT_UNK) {
+ for (aa0p = aa0; aa0p<aa0+n0; aa0p++) {
+ if (*aa0p > '@' || *aa0p == ESS) {*aa0d++ = *aa0p; *aa0ad++='\0';}
+ else if (aa0ad > *aa0a_p) { aa0ad[-1] = *aa0p - NANN;} /* annotation is attached to the preceding residue; one before any residue is dropped */
+ }
+ }
+ else {
+ for (aa0p = aa0; aa0p<aa0+n0; aa0p++) {
+ if (*aa0p < NANN) {*aa0d++ = *aa0p; *aa0ad++='\0';}
+ else if (aa0ad > *aa0a_p) { aa0ad[-1] = *aa0p - NANN;} /* annotation is attached to the preceding residue; one before any residue is dropped */
+ }
+ }
+ *aa0ad = *aa0d = '\0'; /* terminate both the annotation array and the compacted sequence */
+ return n_n0;
+}
+
+/* renamed from ann_ascii() Feb, 2008 to allow ann_ascii[] */
+/* add_ascii_ann() registers the annotation characters in ann_arr[1..]
+   in the qascii[] lookup table: each character whose qascii[] entry is
+   still NA is given the next code, starting at NANN+1.  ann_arr[0] is
+   set to ' ' as a placeholder.  If '*' is one of the annotation
+   characters, qascii['*'] is first reset to NA so that it is assigned
+   an annotation code as well. */
+void
+add_ascii_ann(int *qascii, unsigned char *ann_arr)
+{
+ unsigned char *ann_p;
+ int ann_ix = NANN+1;
+
+ /* no annotation characters at all */
+ /* NOTE(review): both stores below target ann_arr[0], so the ' ' is
+    immediately overwritten and the array is left unchanged; the second
+    store may have been intended for ann_arr[1] -- confirm against
+    callers before changing */
+ if (ann_arr[0] == '\0' && ann_arr[1]=='\0') {
+ ann_arr[0] = ' ';
+ ann_arr[0] = '\0';
+ return;
+ }
+
+ ann_arr[0] = ' ';
+
+ /* make '*' available as an annotation character */
+ if (strchr((char *)ann_arr+1,'*')) {qascii['*'] = NA;}
+
+ /* assign consecutive codes to characters not already mapped */
+ for (ann_p = ann_arr+1; *ann_p; ann_p++) {
+ if (qascii[*ann_p] == NA) { qascii[*ann_p] = ann_ix++;}
+ }
+}
+
+/* add_annot_def() parses one annotation description line.
+
+   line[0] is the annotation symbol; the text after the first ':' (if
+   any) is its definition.  The line is truncated in place at the first
+   '\r' (or, if there is none, the first '\n').
+
+   The symbol is appended to m_msp->ann_arr[] unless it is already
+   present (ann_arr[0] is initialized to ' ' if the array is empty).
+   When the symbol is new and qa_flag is set, qascii[symbol] is set to
+   NANN + (index of the symbol in ann_arr).
+
+   m_msp->ann_arr_def[index] is set to a calloc()'ed copy of the
+   definition, or to NULL when the line has no ':' (or the allocation
+   fails).  A previous definition for the same symbol is not freed. */
+void add_annot_def(struct mngmsg *m_msp, char *line, int qa_flag) {
+  char *bp;
+  int i_ann;
+  unsigned char ann_ch;
+  size_t def_len;
+
+  /* strip the line terminator */
+  if ((bp=strchr(line,'\r')) !=NULL || (bp=strchr(line,'\n')) != NULL) {
+    *bp = '\0';
+  }
+
+  if (m_msp->ann_arr[0]=='\0') {
+    m_msp->ann_arr[0] = ' ';
+    m_msp->ann_arr[1] = '\0';
+  }
+
+  /* line is a (possibly signed) char *; use an unsigned copy of the
+     symbol wherever it is used as an array index, so that characters
+     >= 0x80 cannot produce a negative index into qascii[] */
+  ann_ch = (unsigned char)line[0];
+
+  /* set the character */
+  i_ann = strlen((char *)m_msp->ann_arr);
+  if ((bp = strchr((char *)m_msp->ann_arr,line[0]))!=NULL) {
+    /* symbol already known -- reuse its slot */
+    i_ann = (unsigned char *)bp - m_msp->ann_arr;
+  }
+  else {
+    m_msp->ann_arr[i_ann] = ann_ch;
+    m_msp->ann_arr[i_ann+1] = '\0';	/* required for strchr(ann_arr) to work */
+    if (qa_flag) qascii[ann_ch] = NANN + i_ann;
+  }
+
+  if ((bp=strchr(line,':'))!=NULL) {
+    /* allocate space for definitions */
+    def_len = strlen(bp+1);
+    if ((m_msp->ann_arr_def[i_ann]=(char *)calloc(def_len+1,sizeof(char)))!=NULL) {
+      /* read in the definitions and associate with symbol; the
+         calloc()'ed buffer already supplies the terminating '\0' */
+      strncpy(m_msp->ann_arr_def[i_ann], bp+1, def_len);
+    }
+  }
+  else {
+    m_msp->ann_arr_def[i_ann] = NULL;
+  }
+}
+
+/* get_annot_def_file() loads annotation symbol definitions from a file.
+
+   fa_annot_env holds the file name, optionally followed by a space and
+   more text; only the part before the first space is used (the space
+   is temporarily overwritten and restored before returning).  The name
+   is passed through subs_env() before the file is opened.  Every line
+   of the file is handed to add_annot_def() with qa_flag==0.
+
+   If the file was read and m_msp->ann_arr then holds more than the
+   leading placeholder character, m_msp->ann_flg is set to 1.  If the
+   file cannot be opened, an error is printed and nothing else changes. */
+static void
+get_annot_def_file(struct mngmsg *m_msp, char *fa_annot_env) {
+  FILE *annot_fp;
+  char *space_p;
+  char def_line[MAX_STR];
+  char annot_fname[MAX_STR];
+  int file_read = 0;
+
+  /* isolate the file name */
+  space_p = strchr(fa_annot_env,' ');
+  if (space_p != NULL) *space_p = '\0';
+
+  subs_env(annot_fname, fa_annot_env, sizeof(annot_fname));
+
+  annot_fp = fopen(annot_fname,"r");
+  if (annot_fp == NULL) {
+    fprintf(stderr,"*** error *** annotation definition file: %s not found\n",
+	    annot_fname);
+  }
+  else {
+    /* one definition per line */
+    while (fgets(def_line, sizeof(def_line), annot_fp)!=NULL) {
+      add_annot_def(m_msp, def_line, 0);
+    }
+    fclose(annot_fp);
+    file_read = 1;
+  }
+
+  /* put the caller's string back the way it was */
+  if (space_p != NULL) *space_p = ' ';
+
+  if (file_read && strlen((char *)m_msp->ann_arr)>1) m_msp->ann_flg = 1;
+}
+
+/* apply the annotation modifier character c to markx:
+   'M' sets MX_ANNOT_MID and clears MX_ANNOT_COORD,
+   'B' sets both MX_ANNOT_COORD and MX_ANNOT_MID,
+   any other character sets MX_ANNOT_COORD */
+static int
+set_markx_annot(int markx, char c) {
+  switch (c) {
+  case 'M':
+    markx |= MX_ANNOT_MID;
+    markx &= (~MX_ANNOT_COORD);
+    break;
+  case 'B':
+    markx |= MX_ANNOT_COORD;
+    markx |= MX_ANNOT_MID;
+    break;
+  default:
+    markx |= MX_ANNOT_COORD;
+    break;
+  }
+  return markx;
+}
+
+/* set_markx() returns markx updated for output format number val and
+   modifier character c (the number and first trailing character of a
+   -m option).
+
+   val < 3      : annotation bits from c, MX_HTML if c=='H', plus
+                  (MX_ATYPE & val)
+   val 3, 4, 5  : MX_ATYPE+MX_ASEP, MX_ATYPE+MX_AMAP, MX_AMAP
+   val 6 or 'H' : MX_HTML plus annotation bits from c (this test is made
+                  before the val 8..11 tests, so 'H' takes precedence)
+   val 8..11    : MX_M9SUMM+MX_M8OUT, MX_M9SUMM, MX_M10FORM, MX_M11OUT
+   anything else leaves markx unchanged */
+int
+set_markx(int markx, int val, char c) {
+
+  if (val < 3) {
+    markx = set_markx_annot(markx, c);
+    if (c=='H') {
+      markx |= MX_HTML;
+    }
+    return markx | (MX_ATYPE & val);
+  }
+
+  switch (val) {
+  case 3:
+    return markx | (MX_ATYPE + MX_ASEP);
+  case 4:
+    return markx | (MX_ATYPE + MX_AMAP);
+  case 5:
+    return markx | MX_AMAP;
+  default:
+    break;
+  }
+
+  if (val == 6 || c=='H') {
+    markx |= (MX_HTML);
+    return set_markx_annot(markx, c);
+  }
+
+  switch (val) {
+  case 8:
+    markx |= MX_M9SUMM+MX_M8OUT;
+    break;
+  case 9:
+    markx |= MX_M9SUMM;
+    break;
+  case 10:
+    markx |= MX_M10FORM;
+    break;
+  case 11:
+    markx |= MX_M11OUT;
+    break;
+  default:
+    break;
+  }
+
+  return markx;
+}
+
+/* pre_parse_markx() processes the argument of one -m option.
+
+   opt_arg is either a comma separated list of output formats
+   ("9c,10") or, for output to a separate file, "F<formats> <file>".
+
+   - Without 'F': the settings are parsed into the first entry of
+     m_msp->markx_list (created, and placed at the head of the list, if
+     necessary) starting from the current m_msp values, and the result
+     is copied back into m_msp.
+   - With 'F': a new markx_str is allocated, initialized to defaults
+     (not to the current m_msp values, which an earlier -m may have
+     changed), given a copy of the file name, and appended to the end
+     of m_msp->markx_list.  opt_arg is truncated in place at the space
+     that precedes the file name.
+
+   m_msp->tot_markx and m_msp->tot_show_code accumulate the markx and
+   show_code bits of every format seen.
+
+   Only the opt_arg parameter is examined (never the getopt() global
+   optarg), and a markx_str allocated for -m F is released again if the
+   option turns out to be unusable. */
+void
+pre_parse_markx(char *opt_arg, struct mngmsg *m_msp) {
+  char *bp, *last_bp;
+  struct markx_str *tmp_markx, *cur_markx, *last_markx;
+  int is_file_fmt;
+
+  is_file_fmt = (opt_arg[0] == 'F');
+
+  if (!is_file_fmt && m_msp->markx_list != NULL) {
+    tmp_markx = m_msp->markx_list;
+  }
+  else {
+    if ((tmp_markx = (struct markx_str *)calloc(1,sizeof(struct markx_str)))==NULL) {
+      fprintf(stderr,"[error] Cannot allocate markx_list\n");
+      return;
+    }
+
+    /* initialize markx to m_msg defaults -- we do not use m_msp
+       directly, because it might have been changed by an earlier -m
+       out_fmt */
+
+    tmp_markx->nohist = 1;
+    if (m_msp->ashow_set) {tmp_markx->ashow = m_msp->ashow;}
+    else {tmp_markx->ashow = -1;}
+
+    tmp_markx->show_code = 0;
+    if (long_info_set) tmp_markx->long_info = 1;
+    else tmp_markx->long_info = 0;
+    if (llen_set) {
+      tmp_markx->aln_llen = m_msp->aln.llen;
+      tmp_markx->aln_llcntx = m_msp->aln.llcntx;
+      tmp_markx->aln_llcntx_set = m_msp->aln.llcntx_set;
+    }
+    else {
+      tmp_markx->aln_llen = 60;
+      if (m_msp->aln.llcntx_set) {
+	tmp_markx->aln_llcntx = m_msp->aln.llcntx;
+	tmp_markx->aln_llcntx_set = m_msp->aln.llcntx_set;
+      }
+      else {
+	tmp_markx->aln_llcntx = 30;
+	tmp_markx->aln_llcntx_set = 0;
+      }
+    }
+    tmp_markx->std_output = 1;
+  }
+
+  /* first check for -m "F file" format */
+  if (is_file_fmt) {
+    /* tmp_markx is always freshly allocated on this path, so it must
+       be freed before any error return */
+    if ((bp=strchr(opt_arg+1,' '))==NULL) {
+      fprintf(stderr,"-m F missing file name: %s\n",opt_arg);
+      free(tmp_markx);
+      return;
+    }
+    /* allocate space for file name */
+    if ((tmp_markx->out_file = calloc(strlen(bp+1)+1,sizeof(char)))==NULL) {
+      fprintf(stderr,"[error] Cannot allocate markx->out_file\n");
+      free(tmp_markx);
+      return;
+    }
+    strncpy(tmp_markx->out_file, bp+1, strlen(bp+1));
+    *bp = '\0';		/* format list now ends before the file name */
+
+    last_bp = opt_arg+1;
+  }
+  else {
+    last_bp = opt_arg;
+    /* start from the values currently in effect */
+    m_msp_to_markx(tmp_markx, m_msp);
+  }
+
+  /* parse each comma separated format; the ',' is restored afterwards */
+  while ((bp=strchr(last_bp,','))!=NULL) {
+    *bp = '\0';
+    parse_markx(last_bp, tmp_markx);
+    *bp = ',';
+    last_bp = bp+1;
+  }
+
+  if (*last_bp) parse_markx(last_bp, tmp_markx);
+
+  if (m_msp->markx_list!=NULL) {
+    if (is_file_fmt) {
+      /* if file name, add this to the end of the list */
+      last_markx = m_msp->markx_list;
+      for (cur_markx=m_msp->markx_list->next; cur_markx; cur_markx = cur_markx->next) {
+	last_markx = cur_markx;
+      }
+      last_markx->next = tmp_markx;
+    }
+    else if (tmp_markx != m_msp->markx_list) {
+      /* if no file name, then make this the first in the list,
+	 unless it is already there */
+      cur_markx = m_msp->markx_list;
+      m_msp->markx_list = tmp_markx;
+      tmp_markx->next = cur_markx;
+    }
+  }
+  else {
+    m_msp->markx_list = tmp_markx;
+  }
+
+  m_msp->tot_markx |= tmp_markx->markx;
+  m_msp->tot_show_code |= tmp_markx->show_code;
+
+  /* if no -m F, save options into m_msp */
+  if (!is_file_fmt) {
+    markx_to_m_msp(m_msp, tmp_markx);
+  }
+
+  return;
+}
+
+/* parse_markx() interprets a single -m format string (one element of a
+   comma separated list) and records the result in *this.
+
+   "B..."  BLAST style alignment output; "BB" selects complete BLAST
+           output (no standard output, 65 residue lines)
+   "A..."  adds MX_RES_ALIGN_SCORE
+   other   parsed as <number><char><char>; the number is passed to
+           set_markx(), with 7 and values above 11 treated as 0.
+           For 9 the first character, and for 8 the second character,
+           selects the alignment encoding (show_code); "8C" also adds
+           MX_M8COMMENT.  8 and 11 turn off standard output. */
+void
+parse_markx(char *optarg, struct markx_str *this) {
+  int fmt_num = 0;
+  char fmt_mod = '\0', fmt_mod2 = '\0';
+
+  switch (optarg[0]) {
+  case 'B':	/* BLAST alignment output */
+    this->markx = MX_MBLAST;
+    this->aln_llcntx = 0;
+    this->aln_llcntx_set = 1;
+    this->long_info=1;
+    this->ashow = -1;
+    if (optarg[1] == 'B') {	/* complete BLAST output */
+      this->markx += MX_MBLAST2;
+      this->nohist = 1;
+      this->aln_llen = 65;
+      this->std_output = 0;
+      return;
+    }
+    if (optarg[1] != '8') {return;}	/* done with BLAST alignment output */
+    /* NOTE(review): the scan starts at the 'B', so %d cannot match and
+       fmt_num/fmt_mod/fmt_mod2 keep their initial values; possibly
+       optarg+1 was intended -- confirm before changing */
+    sscanf(optarg,"%d%c%c",&fmt_num,&fmt_mod,&fmt_mod2);
+    break;
+  case 'A':
+    this->markx += MX_RES_ALIGN_SCORE;
+    this->aln_llcntx = 0;
+    this->aln_llcntx_set = 1;
+    return;
+  default:
+    sscanf(optarg,"%d%c%c",&fmt_num,&fmt_mod,&fmt_mod2);
+    break;
+  }
+
+  /* -m 9<x>: alignment encoding chosen by the first modifier */
+  if (fmt_num==9) {
+    switch (fmt_mod) {
+    case 'c': this->show_code = SHOW_CODE_ALIGN; break;
+    case 'd': this->show_code = SHOW_CODE_ALIGN + SHOW_CODE_EXT; break;
+    case 'C': this->show_code = SHOW_CODE_CIGAR; break;
+    case 'D': this->show_code = SHOW_CODE_CIGAR + SHOW_CODE_EXT; break;
+    case 'B': this->show_code = SHOW_CODE_BTOP; break;
+    case 'i': this->show_code = SHOW_CODE_ID; break;
+    case 'I': this->show_code = SHOW_CODE_IDD; break;
+    default: break;
+    }
+  }
+
+  /* above 6 only 8, 9, 10 and 11 are recognized */
+  if (fmt_num == 7 || fmt_num > 11) fmt_num = 0;
+
+  this->markx = set_markx(this->markx,fmt_num,fmt_mod);
+
+  if (fmt_num == 11) { this->std_output = 0;}
+
+  if (fmt_num == 8) {
+    this->std_output = 0;
+    this->ashow = 0;
+    if (fmt_mod=='C') { this->markx += MX_M8COMMENT;}
+    /* -m 8<x><y>: alignment encoding chosen by the second modifier */
+    switch (fmt_mod2) {
+    case 'c': this->show_code = SHOW_CODE_ALIGN; break;
+    case 'd': this->show_code = SHOW_CODE_ALIGN + SHOW_CODE_EXT; break;
+    case 'C': this->show_code = SHOW_CODE_CIGAR; break;
+    case 'D': this->show_code = SHOW_CODE_CIGAR + SHOW_CODE_EXT; break;
+    case 'B': this->show_code = SHOW_CODE_BTOP; break;
+    default: break;
+    }
+  }
+}
+
+/* copy the output-format settings held in a markx_str (*this) into
+   the corresponding fields of m_msp */
+void
+markx_to_m_msp(struct mngmsg *m_msp, struct markx_str *this) {
+
+  /* output format selection */
+  m_msp->markx      = this->markx;
+  m_msp->show_code  = this->show_code;
+  m_msp->std_output = this->std_output;
+
+  /* what is displayed */
+  m_msp->nohist     = this->nohist;
+  m_msp->ashow      = this->ashow;
+  m_msp->long_info  = this->long_info;
+
+  /* alignment line layout */
+  m_msp->aln.llen       = this->aln_llen;
+  m_msp->aln.llcntx     = this->aln_llcntx;
+  m_msp->aln.llcntx_set = this->aln_llcntx_set;
+}
+
+/* copy the output-format settings currently in m_msp into a
+   markx_str (*this); the inverse of markx_to_m_msp() */
+void
+m_msp_to_markx(struct markx_str *this, struct mngmsg *m_msp) {
+
+  /* output format selection */
+  this->markx      = m_msp->markx;
+  this->show_code  = m_msp->show_code;
+  this->std_output = m_msp->std_output;
+
+  /* what is displayed */
+  this->nohist     = m_msp->nohist;
+  this->ashow      = m_msp->ashow;
+  this->long_info  = m_msp->long_info;
+
+  /* alignment line layout */
+  this->aln_llen       = m_msp->aln.llen;
+  this->aln_llcntx     = m_msp->aln.llcntx;
+  this->aln_llcntx_set = m_msp->aln.llcntx_set;
+}
+
+/* build_optstr() writes the getopt() option string for the option
+   table opt_defs[] (terminated by an entry with opt_char=='\0') into
+   opt_str: each option character, followed by ':' when the option
+   takes an argument.
+
+   max_len is treated as the size of the opt_str buffer, including the
+   terminating '\0'.  If the table does not fit, an error is printed
+   and the string is truncated at the last option that did fit; an
+   option character is never written without its ':' and the
+   terminator always stays inside the buffer. */
+
+void
+build_optstr(char *opt_str, int max_len, struct opt_def_str *opt_defs) {
+  int i, opt_len = 0;
+  int n_need;
+  char *opt_pos;
+
+  /* no room even for the terminator */
+  if (max_len <= 0) return;
+
+  opt_pos = opt_str;
+  for (i=0; opt_defs[i].opt_char != '\0'; i++) {
+    /* characters required for this option, plus the final '\0' */
+    n_need = (opt_defs[i].has_arg ? 2 : 1) + 1;
+    if (opt_len + n_need > max_len) {
+      fprintf(stderr," *** error -- options too long %d >= %d\n", opt_len, max_len);
+      break;
+    }
+    *opt_pos++ = opt_defs[i].opt_char;
+    opt_len++;
+    if (opt_defs[i].has_arg) {
+      *opt_pos++ = ':';
+      opt_len++;
+    }
+  }
+  *opt_pos = '\0';
+}
+
+/* set_opt_disp_defs() stores display values for option opt_char in the
+   option table options[] (terminated by an entry with opt_char=='\0').
+   Every entry whose opt_char matches is updated.
+
+   type selects which values are stored (and is saved as fmt_type):
+     1  i_param1
+     2  i_param1, i_param2
+     3  d_param1
+     4  d_param1, d_param2
+     5  s_param -- a calloc()'ed copy of the string, or NULL when
+        s_param is NULL or the copy cannot be allocated
+   For any other type only fmt_type is set. */
+void
+set_opt_disp_defs(char opt_char, struct opt_def_str *options, int type,
+		  int i_param1, int i_param2,
+		  double d_param1, double d_param2,
+		  char *s_param) {
+  struct opt_def_str *this_opt;
+  size_t s_len;
+
+  for (this_opt = options; this_opt->opt_char != '\0'; this_opt++) {
+    if (this_opt->opt_char != opt_char) continue;
+
+    this_opt->fmt_type = type;
+    switch (type) {
+    case 1:
+      this_opt->i_param1 = i_param1;
+      break;
+    case 2:
+      this_opt->i_param1 = i_param1;
+      this_opt->i_param2 = i_param2;
+      break;
+    case 3:
+      this_opt->d_param1 = d_param1;
+      break;
+    case 4:
+      this_opt->d_param1 = d_param1;
+      this_opt->d_param2 = d_param2;
+      break;
+    case 5:
+      if (s_param != NULL) {
+	s_len = strlen(s_param);
+	this_opt->s_param = (char *)calloc(s_len+1,sizeof(char));
+	if (this_opt->s_param != NULL) {
+	  /* calloc() has already provided the terminating '\0' */
+	  strncpy(this_opt->s_param,s_param,s_len);
+	}
+	else {
+	  /* leave s_param NULL rather than copying through a NULL pointer */
+	  fprintf(stderr,"[error] Cannot allocate option string for -%c\n",opt_char);
+	}
+      }
+      else this_opt->s_param = NULL;
+      break;
+    }
+  }
+}
diff --git a/src/drop_func.h b/src/drop_func.h
new file mode 100644
index 0000000..f9e8611
--- /dev/null
+++ b/src/drop_func.h
@@ -0,0 +1,185 @@
+/* drop_func.h */
+
+/* $Id: drop_func.h 1196 2013-07-19 20:18:21Z wrp $ */
+/* $Revision: 1196 $ */
+
+/* copyright (c) 2005, 2014 by William R. Pearson and The Rector & Visitors
+ of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+/* functions provided by each of the drop files */
+
+#ifdef DEBUG
+unsigned long adler32(unsigned long, const unsigned char *, unsigned int);
+#endif
+
+void /* initializes f_struct **f_arg */
+init_work (unsigned char *aa0, int n0,
+ struct pstruct *ppst,
+#ifndef DROP_INTERN
+ void **f_arg
+#else
+ struct f_struct **f_arg
+#endif
+);
+
+
+void /* frees memory allocated in f_struct */
+close_work (const unsigned char *aa0, int n0,
+ struct pstruct *ppst,
+#ifndef DROP_INTERN
+ void **f_arg
+#else
+ struct f_struct **f_arg
+#endif
+);
+
+void /* documents search function, parameters */
+get_param (const struct pstruct *pstr,
+ char **pstring1, char *pstring2,
+ struct score_count_s *);
+
+void /* calculates alignment score(s), returns them in rst */
+do_work (const unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1,
+ int frame,
+ const struct pstruct *ppst,
+#ifndef DROP_INTERN
+ void *f_arg,
+#else
+ struct f_struct *f_arg,
+#endif
+ int qr_flg, int shuff_flg, struct rstruct *rst,
+ struct score_count_s *);
+
+void /* calculates optimal alignment score */
+do_opt (const unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1,
+ int frame,
+ struct pstruct *ppst,
+#ifndef DROP_INTERN
+ void *f_arg,
+#else
+ struct f_struct *f_arg,
+#endif
+ struct rstruct *rst
+ );
+
+struct a_res_str * /* produces encoding of alignment */
+do_walign (const unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1,
+ int frame, int repeat_thresh,
+ struct pstruct *ppst,
+#ifndef DROP_INTERN
+ void *f_arg,
+#else
+ struct f_struct *f_arg,
+#endif
+ int *have_ares);
+
+void
+pre_cons(const unsigned char *aa, int n, int frame,
+#ifndef DROP_INTERN
+ void *f_arg
+#else
+ struct f_struct *f_arg
+#endif
+ );
+
+void
+aln_func_vals(int frame, struct a_struct *aln);
+
+#include "dyn_string.h"
+
+/* calc_cons_a - takes aa0, aa1, a_res, and produces seqc0, seqc1,
+ * and seqc0a, seqc1a - the annotated sequences
+ */
+int
+calc_cons_a(const unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1,
+ int *nc,
+ struct a_struct *aln,
+ struct a_res_str *a_res,
+ struct pstruct *ppst,
+ char *seqc0, char *seqc1, char *seqca, int *seqc_score,
+ const unsigned char *ann_arr,
+ const unsigned char *aa0a, const struct annot_str *annot0_p, char *seqc0a,
+ const unsigned char *aa1a, const struct annot_str *annot1_p, char *seqc1a,
+ int *score_delta,
+ struct dyn_string_str *annot_var_dyn,
+#ifndef DROP_INTERN
+ void *f_arg,
+#else
+ struct f_struct *f_arg,
+#endif
+ void *pstat_void
+ );
+
+int /* returns lenc - length of alignment */
+calc_code(const unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1,
+ struct a_struct *aln,
+ struct a_res_str *a_res,
+ struct pstruct *ppst,
+ struct dyn_string_str *align_code_dyn,
+ const unsigned char *ann_arr,
+ const unsigned char *aa0a,
+ const struct annot_str *annot0_p,
+ const unsigned char *aa1a,
+ const struct annot_str *annot1_p,
+ struct dyn_string_str *ann_str_dyn,
+ int *score_delta,
+#ifndef DROP_INTERN
+ void *f_arg,
+#else
+ struct f_struct *f_arg,
+#endif
+ void *pstat_void,
+ int code_fmt
+ );
+
+int /* returns lenc - length of alignment */
+calc_id(const unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1,
+ struct a_struct *aln,
+ struct a_res_str *a_res,
+ struct pstruct *ppst,
+ const struct annot_str *annot0_p,
+ const struct annot_str *annot1_p,
+ int *score_delta,
+ struct dyn_string_str *annot_var_dyn,
+#ifndef DROP_INTERN
+ void *f_arg
+#else
+ struct f_struct *f_arg
+#endif
+ );
+
+int /* returns lenc - length of alignment */
+calc_idd(const unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1,
+ struct a_struct *aln,
+ struct a_res_str *a_res,
+ struct pstruct *ppst,
+ const struct annot_str *annot0_p,
+ const struct annot_str *annot1_p,
+ int *score_delta,
+ struct dyn_string_str *annot_var_dyn,
+#ifndef DROP_INTERN
+ void *f_arg
+#else
+ struct f_struct *f_arg
+#endif
+ );
diff --git a/src/dropff2.c b/src/dropff2.c
new file mode 100644
index 0000000..cd2145c
--- /dev/null
+++ b/src/dropff2.c
@@ -0,0 +1,1394 @@
+/* $Id: dropff2.c 989 2012-07-24 19:37:38Z wrp $ */
+
+/* copyright (c) 1998, 1999, 2014 by William R. Pearson and The Rector &
+ Visitors of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+/* this code implements the "fastf" algorithm, which is designed to
+ deconvolve mixtures of protein sequences derived from mixed-peptide
+ Edman sequencing. The expected input is:
+
+ >test | 40001 90043 | mgstm1
+ MGCEN,
+ MIDYP,
+ MLLAY,
+ MLLGY
+
+ Where the ','s indicate the length/end of the sequencing cycle
+ data. Thus, in this example, the sequence is from a mixture of 4
+ peptides, M was found in the first position, G,I, and L(2) at the second,
+ C,D, L(2) at the third, etc.
+
+ Because the sequences are derived from mixtures, there need not be
+ any partial sequence "MGCEN", the actual deconvolved sequence might be
+ "MLDGN".
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdlib.h>
+#include <math.h>
+#include <ctype.h>
+
+#include "defs.h"
+#include "param.h"
+#include "structs.h"
+#include "tatstats.h"
+
+#define EOSEQ 0
+#define ESS 59
+#define MAXHASH 32
+#define NMAP MAXHASH+1
+#define NMAP_X 23 /* re-code NMAP for 'X' */
+#define NMAP_Z 24 /* re-code NMAP for '*' */
+
+#ifndef MAXSAV
+#define MAXSAV 10
+#endif
+
+#define DROP_INTERN
+#include "drop_func.h"
+
+static char *verstr="4.21 May 2006 (ajm/wrp)";
+
+int shscore(unsigned char *aa0, const int n0, int **pam2, int nsq);
+
+#ifdef TFAST
+extern int aatran(const unsigned char *ntseq, unsigned char *aaseq,
+ const int maxs, const int frame);
+#endif
+
+/* hash-chain entry used for f_str->harr[] (chain heads, indexed by hash
+   value) and f_str->link[] (indexed by aa0[] offset).  As filled in by
+   init_work(): 'next' is the aa0[] offset of a query residue and 'pos'
+   is that residue's position counted from the most recent separator;
+   both are initialized to -1, and do_fastf() stops following a chain
+   when pos < 0 */
+struct hlstr { int next, pos;};
+
+void savemax(struct dstruct *, struct f_struct *);
+
+static int m0_spam(unsigned char *, const unsigned char *, int, struct savestr *,
+ int **, struct f_struct *);
+static int m1_spam(unsigned char *, int,
+ const unsigned char *, int,
+ struct savestr *, int **, int, struct f_struct *);
+
+int sconn(struct savestr **v, int nsave, int cgap,
+ struct f_struct *, struct rstruct *, const struct pstruct *,
+ const unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1,
+ int opt_prob);
+
+void kpsort(struct savestr **, int);
+void kssort(struct savestr **, int);
+void kpsort(struct savestr **, int);
+
+int
+sconn_a(unsigned char *, int, int, struct f_struct *,
+ struct a_res_str *);
+
+/* initialize for fasta */
+
+/* init_work() allocates and fills the f_struct work area used by the
+   other functions in this file and returns it through *f_arg.
+
+   aa0, n0  encoded query; ESS (',') or 0 separates the mixed-peptide
+            fragments.  The query is copied into f_str->aa0 and all
+            further work (including replacing ESS by 0) is done on the
+            copy.
+   ppst     search parameters.  This function modifies ppst: sw_flag is
+            cleared, hsq[NMAP_X]/hsq[NMAP_Z] are reset to 1 if they
+            were NMAP, and param_u.fa.cgap is set from shscore().
+
+   Any allocation failure prints a message and calls exit(1). */
+
+void
+init_work (unsigned char *aa0, int n0,
+	   struct pstruct *ppst,
+	   struct f_struct **f_arg)
+{
+  int mhv;
+  int hmax;
+  int i0, ii0, hv;
+  struct f_struct *f_str;
+
+  int maxn0;
+  int i;
+  struct savestr *vmptr;
+  int *res;
+  int nsq;
+
+  nsq = ppst->nsqx;
+
+  f_str = (struct f_struct *) calloc(1, sizeof(struct f_struct));
+  if(f_str == NULL) {
+    fprintf(stderr, "Couldn't calloc f_str\n");
+    exit(1);
+  }
+
+  ppst->sw_flag = 0;
+
+  /* fastf3 cannot work with lowercase symbols as low complexity;
+     thus, NMAP must be disabled; this depends on aascii['X'] */
+  if (ppst->hsq[NMAP_X] == NMAP ) {ppst->hsq[NMAP_X]=1;}
+  if (ppst->hsq[NMAP_Z] == NMAP ) {ppst->hsq[NMAP_Z]=1;}
+
+  /* this does not work for share ppst structs, as in threads */
+  /*else {fprintf(stderr," cannot find 'X'==NMAP\n");} */
+
+  /* largest hash value below NMAP determines the hash table size */
+  for (i0 = 1, mhv = -1; i0 <= ppst->nsq; i0++)
+    if (ppst->hsq[i0] < NMAP && ppst->hsq[i0] > mhv) mhv = ppst->hsq[i0];
+
+  if (mhv <= 0) {
+    fprintf (stderr, " maximum hsq <=0 %d\n", mhv);
+    exit (1);
+  }
+
+  /* kshft = number of bits needed to hold mhv */
+  for (f_str->kshft = 0; mhv > 0; mhv /= 2)
+    f_str->kshft++;
+
+/*      kshft = 2;	*/
+  hmax = (1 << f_str->kshft);
+  f_str->hmask = (hmax >> f_str->kshft) - 1;
+
+  /* private, modifiable copy of the query */
+  if ((f_str->aa0 = (unsigned char *) calloc(n0+1, sizeof(char))) == NULL) {
+    fprintf (stderr, " cannot allocate f_str->aa0 array; %d\n",n0+1);
+    exit (1);
+  }
+  for (i=0; i<n0; i++) f_str->aa0[i] = aa0[i];
+  aa0 = f_str->aa0;
+
+  if ((f_str->aa0t = (unsigned char *) calloc(n0+1, sizeof(char))) == NULL) {
+    fprintf (stderr, " cannot allocate f_str0->aa0t array; %d\n",n0+1);
+    exit (1);
+  }
+  f_str->aa0ix = 0;
+
+  if ((f_str->harr = (struct hlstr *) calloc (hmax, sizeof (struct hlstr))) == NULL) {
+    fprintf (stderr, " cannot allocate hash array; hmax: %d hmask: %d\n",
+	     hmax,f_str->hmask);
+    exit (1);
+  }
+  if ((f_str->pamh1 = (int *) calloc (nsq+1, sizeof (int))) == NULL) {
+    fprintf (stderr, " cannot allocate pamh1 array\n");
+    exit (1);
+  }
+  if ((f_str->pamh2 = (int *) calloc (hmax, sizeof (int))) == NULL) {
+    fprintf (stderr, " cannot allocate pamh2 array\n");
+    exit (1);
+  }
+  if ((f_str->link = (struct hlstr *) calloc (n0, sizeof (struct hlstr))) == NULL) {
+    fprintf (stderr, " cannot allocate hash link array");
+    exit (1);
+  }
+
+  for (i0 = 0; i0 < hmax; i0++) {
+    f_str->harr[i0].next = -1;
+    f_str->harr[i0].pos = -1;
+  }
+
+  for (i0 = 0; i0 < n0; i0++) {
+    f_str->link[i0].next = -1;
+    f_str->link[i0].pos = -1;
+  }
+
+  /* encode the aa0 array */
+  /*
+    this code has been modified to allow for mixed peptide sequences
+    aa0[] = 5 8 9 3 4 NULL 5 12 3 7 2 NULL
+    the 'NULL' character resets the hash position counter, to indicate that
+    any of several residues can be in the same position.
+    We also need to keep track of the number of times this has happened, so that
+    we can redivide the sequence later
+
+    i0 counts through the sequence
+    ii0 counts through the hashed sequence
+
+    */
+
+  f_str->nm0 = 1;
+  f_str->nmoff = -1;
+  hv = 0;
+  for (i0= ii0 = 0; i0 < n0; i0++, ii0++) {
+    /* reset the counter and start hashing again */
+    if (aa0[i0] == ESS || aa0[i0] == 0) {
+      aa0[i0] = 0;	/* set ESS to 0 */
+      /*      fprintf(stderr," converted ',' to 0\n");*/
+      i0++;		/* skip over the blank */
+      f_str->nm0++;
+      if (f_str->nmoff < 0) f_str->nmoff = i0;
+      hv = 0;
+      ii0 = 0;
+      /* a separator at the very end of the query leaves nothing to
+         hash; without this test link[n0] -- one element past the
+         allocation -- would be written */
+      if (i0 >= n0) break;
+    }
+    hv = ppst->hsq[aa0[i0]];
+    /* push this residue onto the front of the chain for hv */
+    f_str->link[i0].next = f_str->harr[hv].next;
+    f_str->link[i0].pos = f_str->harr[hv].pos;
+    f_str->harr[hv].next = i0;
+    f_str->harr[hv].pos = ii0;
+    f_str->pamh2[hv] = ppst->pam2[0][aa0[i0]][aa0[i0]];
+  }
+  if (f_str-> nmoff < 0) f_str->nmoff = n0;
+
+
+#ifdef DEBUG
+  /*
+  fprintf(stderr," nmoff: %d/%d nm0: %d\n", f_str->nmoff, n0,f_str->nm0);
+  */
+#endif
+
+/*
+#ifdef DEBUG
+  fprintf(stderr," hmax: %d\n",hmax);
+  for ( hv=0; hv<hmax; hv++)
+    fprintf(stderr,"%2d %c %3d %3d\n",hv,
+	    (hv > 0 && hv < ppst->nsq ) ? ppst->sq[ppst->hsq[hv]] : ' ',
+	    f_str->harr[hv].pos,f_str->harr[hv].next);
+  fprintf(stderr,"----\n");
+  for ( hv=0; hv<n0; hv++)
+    fprintf(stderr,"%2d: %3d %3d\n",hv,
+	    f_str->link[hv].pos,f_str->link[hv].next);
+#endif
+*/
+
+  /* storage for the MAXSAV best diagonal runs */
+  f_str->maxsav = MAXSAV;
+  if ((f_str->vmax = (struct savestr *)
+       calloc(MAXSAV,sizeof(struct savestr)))==NULL) {
+    fprintf(stderr, "Couldn't allocate vmax[%d].\n",f_str->maxsav);
+    exit(1);
+  }
+
+  if ((f_str->vptr = (struct savestr **)
+       calloc(MAXSAV,sizeof(struct savestr *)))==NULL) {
+    fprintf(stderr, "Couldn't allocate vptr[%d].\n",f_str->maxsav);
+    exit(1);
+  }
+
+  for (vmptr = f_str->vmax; vmptr < &f_str->vmax[MAXSAV]; vmptr++) {
+    vmptr->used = (int *) calloc(n0, sizeof(int));
+    if(vmptr->used == NULL) {
+      fprintf(stderr, "Couldn't alloc vmptr->used\n");
+      exit(1);
+    }
+  }
+
+/* this has been modified from 0..<ppst->nsq to 1..<=ppst->nsq because the
+   pam2[0][0] is now undefined for consistency with blast
+*/
+
+  for (i0 = 1; i0 <= ppst->nsq; i0++)
+    f_str->pamh1[i0] = ppst->pam2[0][i0][i0];
+
+  /* joining threshold: a third of the self-score of the first
+     fragment, but no more than bestmax/4 */
+  ppst->param_u.fa.cgap = shscore(aa0,f_str->nmoff-1,ppst->pam2[0],ppst->nsq)/3;
+  if (ppst->param_u.fa.cgap > ppst->param_u.fa.bestmax/4)
+    ppst->param_u.fa.cgap = ppst->param_u.fa.bestmax/4;
+
+  f_str->ndo = 0;
+  f_str->noff = n0-1;
+  if (f_str->diag==NULL)
+    f_str->diag = (struct dstruct *) calloc ((size_t)MAXDIAG,
+					     sizeof (struct dstruct));
+
+  if (f_str->diag == NULL)
+    {
+      fprintf (stderr, " cannot allocate diagonal arrays: %ld\n",
+	       (long) MAXDIAG * (long) (sizeof (struct dstruct)));
+      exit (1);
+    }
+
+#ifdef TFAST
+  if ((f_str->aa1x =(unsigned char *)calloc((size_t)ppst->maxlen+2,
+					    sizeof(unsigned char)))
+      == NULL) {
+    fprintf (stderr, "cannot allocate aa1x array %d\n", ppst->maxlen+2);
+    exit (1);
+  }
+  f_str->aa1x++;	/* close_work() frees aa1x - 1 */
+#endif
+
+  /* allocate space for the scoring arrays */
+  maxn0 = max(3*n0/2,MIN_RES);
+  if ((res = (int *)calloc((size_t)maxn0,sizeof(int)))==NULL) {
+    fprintf(stderr,"cannot allocate alignment results array %d\n",maxn0);
+    exit(1);
+  }
+  f_str->res = res;
+  f_str->max_res = maxn0;
+
+  /* Tatusov Statistics Setup */
+
+  /* initialize priors array. */
+  if((f_str->priors = (double *)calloc(ppst->nsq+1, sizeof(double))) == NULL) {
+    fprintf(stderr, "Couldn't allocate priors array.\n");
+    exit(1);
+  }
+  calc_priors(f_str->priors, ppst, f_str, NULL, 0, ppst->pseudocts);
+
+  f_str->dotat = 0;
+  f_str->shuff_cnt = ppst->shuff_node;
+
+  /* End of Tatusov Statistics Setup */
+
+  *f_arg = f_str;
+}
+
+
+/* get_param() describes the search program and its parameters.
+
+   pstring1[0]  program name and version (with " init1" appended when
+                ppstr->param_u.fa.iniflag is set)
+   pstring1[1]  scoring matrix name, its pam_h:pam_l values and the
+                join threshold (param_u.fa.cgap)
+   pstring2     if not NULL, the same information as "; pg_..." lines
+                (markx==10 format)
+
+   s_info is not used here. */
+void
+get_param (const struct pstruct *ppstr,
+	   char **pstring1, char *pstring2,
+	   struct score_count_s *s_info)
+{
+#ifdef TFAST
+  char *pg_str="TFASTF";
+#else
+  char *pg_str="FASTF";
+#endif
+
+  sprintf (pstring1[0], "%s (%s)",pg_str,verstr);
+  sprintf (pstring1[1], "%s matrix (%d:%d), join: %d",
+	   ppstr->pam_name, ppstr->pam_h,ppstr->pam_l,ppstr->param_u.fa.cgap);
+
+  if (ppstr->param_u.fa.iniflag) {
+    strcat(pstring1[0]," init1");
+  }
+
+  /* the markx==10 version is optional */
+  if (pstring2 == NULL) return;
+
+  sprintf (pstring2,
+	   "; pg_name_alg: %s\n; pg_ver_rel: %s\n; pg_matrix: %s (%d:%d)\n"
+	   "; pg_join: %d\n",
+	   pg_str,verstr, ppstr->pam_name, ppstr->pam_h,ppstr->pam_l,
+	   ppstr->param_u.fa.cgap);
+}
+
+/* close_work() releases everything referenced by *f_arg, frees the
+   f_struct itself and sets *f_arg to NULL.  Nothing is done when
+   *f_arg is already NULL.  aa0, n0 and ppst are not used. */
+void
+close_work (const unsigned char *aa0, const int n0,
+	    struct pstruct *ppst,
+	    struct f_struct **f_arg)
+{
+  struct f_struct *work = *f_arg;
+  int isav;
+
+  if (work == NULL) return;
+
+  /* per-entry arrays first, then the tables that hold them */
+  for (isav = 0; isav < MAXSAV; isav++) {
+    free(work->vmax[isav].used);
+  }
+  free(work->vmax);
+  free(work->vptr);
+
+  /* query copies and hash tables */
+  free(work->aa0);
+  free(work->aa0t);
+  free(work->harr);
+  free(work->link);
+  free(work->pamh1);
+  free(work->pamh2);
+
+  /* scoring work space */
+  free(work->diag);
+  free(work->res);
+  free(work->priors);
+#ifdef TFAST
+  free(work->aa1x - 1); /* allocated, then aa1x++'ed */
+#endif
+
+  free(work);
+  *f_arg = NULL;
+}
+
+/* do_fastf() scores one library sequence aa1[0..n1-1] against the
+   hashed query aa0[0..n0-1].
+
+   Steps: (1) every residue of aa1 is looked up in the query hash
+   chains (f_str->harr/link) and runs are accumulated per diagonal in
+   f_str->diag; (2) the best runs are kept in f_str->vmax by savemax();
+   (3) saved runs are re-scored with m0_spam() and then m1_spam();
+   (4) sconn() combines them.  Results are returned in rst->score[]:
+   score[1] is the best single saved run, score[0] and score[2] the
+   larger of that and the sconn() value.
+
+   Returns 1, or -1 (with scores of -1 and escore 2.0) when
+   n0+n1+1 >= MAXDIAG.  An empty library sequence (n1 < 1) or no saved
+   run with a positive score gives scores of 0 and escore 1.0.
+
+   hoff is not referenced in this function. */
+int do_fastf (unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1,
+ const struct pstruct *ppst, struct f_struct *f_str,
+ struct rstruct *rst, int *hoff, int opt_prob)
+{
+ int nd; /* diagonal array size */
+ int lhval;
+ int kfact;
+ register struct dstruct *dptr;
+ register int tscor;
+ register struct dstruct *diagp;
+ struct dstruct *dpmax;
+ register int lpos;
+ int tpos, npos;
+ struct savestr *vmptr;
+ int scor, tmp;
+ int im, ib, nsave;
+ int cmps (); /* comparison routine for ksort */
+ const int *hsq;
+
+ hsq = ppst->hsq;
+
+ /* nothing to compare against */
+ if (n1 < 1) {
+ rst->score[0] = rst->score[1] = rst->score[2] = 0;
+ rst->escore = 1.0;
+ rst->segnum = 0;
+ rst->seglen = 0;
+ return 1;
+ }
+
+ /* the diagonal array has MAXDIAG entries */
+ if (n0+n1+1 >= MAXDIAG) {
+ fprintf(stderr,"n0,n1 too large: %d, %d\n",n0,n1);
+ rst->score[0] = rst->score[1] = rst->score[2] = -1;
+ rst->escore = 2.0;
+ rst->segnum = 0;
+ rst->seglen = 0;
+ return -1;
+ }
+
+ nd = n0 + n1;
+
+ /* clear only the diagonals beyond those reset at the end of the
+    previous call (f_str->ndo) */
+ dpmax = &f_str->diag[nd];
+ for (dptr = &f_str->diag[f_str->ndo]; dptr < dpmax;) {
+ dptr->stop = -1;
+ dptr->dmax = NULL;
+ dptr++->score = 0;
+ }
+
+ /* initialize the saved segment structures */
+ for (vmptr = f_str->vmax; vmptr < &f_str->vmax[MAXSAV]; vmptr++) {
+ vmptr->score = 0;
+ memset(vmptr->used, 0, n0 * sizeof(int));
+ }
+
+ f_str->lowmax = f_str->vmax;
+ f_str->lowscor = 0;
+
+ /* start hashing */
+
+ /* diagp advances with lpos, so &diagp[-tpos] is the diagonal for
+    library position lpos and query position tpos */
+ diagp = &f_str->diag[f_str->noff];
+ for (lhval = lpos = 0; lpos < n1; lpos++, diagp++) {
+ /* skip library residues whose hash value is >= NMAP */
+ if (hsq[aa1[lpos]]>=NMAP) {
+ lpos++ ; diagp++;
+ while (lpos < n1 && hsq[aa1[lpos]]>=NMAP) {lpos++; diagp++;}
+ if (lpos >= n1) break;
+ lhval = 0;
+ }
+ lhval = hsq[aa1[lpos]];
+ /* walk the chain of query positions with this hash value; a
+    negative pos ends the chain */
+ for (tpos = f_str->harr[lhval].pos, npos = f_str->harr[lhval].next;
+ tpos >= 0; tpos = f_str->link[npos].pos, npos = f_str->link[npos].next) {
+ /* tscor gets position of end of current lpos diag run */
+ if ((tscor = (dptr = &diagp[-tpos])->stop) >= 0) {
+ tscor++; /* move forward one */
+ if ((tscor -= lpos) <= 0) { /* check for size of gap to this hit - */
+ /* includes implicit -1 mismatch penalty */
+ scor = dptr->score; /* current score of this run */
+ if ((tscor += (kfact = f_str->pamh2[lhval])) < 0 &&
+ f_str->lowscor < scor) /* if updating tscor makes run worse, */
+ savemax (dptr, f_str); /* save it */
+
+ if ((tscor += scor) >= kfact) { /* add to current run if continuing */
+ /* is better than restart (kfact) */
+ dptr->score = tscor;
+ dptr->stop = lpos;
+ }
+ else {
+ dptr->score = kfact; /* starting over is better */
+ dptr->start = (dptr->stop = lpos);
+ }
+ }
+ else { /* continue current run */
+ dptr->score += f_str->pamh1[aa0[tpos]];
+ dptr->stop = lpos;
+ }
+ }
+ else { /* no diagonal run yet */
+ dptr->score = f_str->pamh2[lhval];
+ dptr->start = (dptr->stop = lpos);
+ }
+ } /* end tpos */
+ } /* end lpos */
+
+ /* save any remaining run that beats the current threshold, and
+    reset every diagonal for the next call */
+ for (dptr = f_str->diag; dptr < dpmax;) {
+ if (dptr->score > f_str->lowscor) savemax (dptr, f_str);
+ dptr->stop = -1;
+ dptr->dmax = NULL;
+ dptr++->score = 0;
+ }
+ f_str->ndo = nd;
+
+/*
+ at this point all of the elements of aa1[lpos]
+ have been searched for elements of aa0[tpos]
+ with the results in diag[dpos]
+*/
+
+ /* set up pointers for sorting */
+
+ /* re-score each saved run with m0_spam() and collect the ones in use */
+ for (nsave = 0, vmptr = f_str->vmax; vmptr < &f_str->vmax[MAXSAV]; vmptr++) {
+ if (vmptr->score > 0) {
+ vmptr->score = m0_spam (aa0, aa1, n1, vmptr, ppst->pam2[0], f_str);
+ f_str->vptr[nsave++] = vmptr;
+ }
+ }
+
+ /* sort them */
+ kssort (f_str->vptr, nsave);
+
+
+#ifdef DEBUG
+ /*
+ for (ib=0; ib<nsave; ib++) {
+ fprintf(stderr,"0: %4d-%4d 1: %4d-%4d dp: %d score: %d\n",
+ f_str->noff+f_str->vptr[ib]->start-f_str->vptr[ib]->dp,
+ f_str->noff+f_str->vptr[ib]->stop-f_str->vptr[ib]->dp,
+ f_str->vptr[ib]->start,f_str->vptr[ib]->stop,
+ f_str->vptr[ib]->dp,f_str->vptr[ib]->score);
+ for (im=f_str->vptr[ib]->start; im<=f_str->vptr[ib]->stop; im++)
+ fprintf(stderr," %c:%c",ppst->sq[aa0[f_str->noff+im-f_str->vptr[ib]->dp]],
+ ppst->sq[aa1[im]]);
+ fputc('\n',stderr);
+ }
+ fprintf(stderr,"---\n");
+ */
+ /* now use m_spam to re-evaluate */
+ /*
+ for (tpos = 0; tpos < n0; tpos++) {
+ fprintf(stderr,"%c:%2d ",ppst->sq[aa0[tpos]],aa0[tpos]);
+ if (tpos %10 == 9) fputc('\n',stderr);
+ }
+ fputc('\n',stderr);
+ */
+#endif
+
+ /* second re-scoring pass, in sorted order */
+ f_str->aa0ix = 0;
+ for (ib=0; ib < nsave; ib++) {
+ if ((vmptr=f_str->vptr[ib])->score > 0) {
+ vmptr->score = m1_spam (aa0, n0, aa1, n1, vmptr,
+ ppst->pam2[0], ppst->pam_l, f_str);
+ }
+ }
+ /* reset aa0 - modified by m1_spam */
+ for (tpos = 0; tpos < n0; tpos++) {
+ if (aa0[tpos] >= 32) aa0[tpos] -= 32;
+ }
+
+ kssort(f_str->vptr,nsave);
+
+ /* drop trailing entries whose score is no longer positive */
+ for ( ; nsave > 0; nsave--)
+ if (f_str->vptr[nsave-1]->score >0) break;
+
+ if (nsave <= 0) {
+ f_str->nsave = 0;
+ rst->score[0] = rst->score[1] = rst->score[2] = 0;
+ rst->escore = 1.0;
+
+ return 1;
+ }
+ else f_str->nsave = nsave;
+
+
+#ifdef DEBUG
+ /*
+ fprintf(stderr,"n0: %d; n1: %d; noff: %d\n",n0,n1,f_str->noff);
+ for (ib=0; ib<nsave; ib++) {
+ fprintf(stderr,"0: %4d-%4d 1: %4d-%4d dp: %d score: %d\n",
+ f_str->noff+f_str->vptr[ib]->start-f_str->vptr[ib]->dp,
+ f_str->noff+f_str->vptr[ib]->stop-f_str->vptr[ib]->dp,
+ f_str->vptr[ib]->start,f_str->vptr[ib]->stop,
+ f_str->vptr[ib]->dp,f_str->vptr[ib]->score);
+ for (im=f_str->vptr[ib]->start; im<=f_str->vptr[ib]->stop; im++)
+ fprintf(stderr," %c:%c",ppst->sq[aa0[f_str->noff+im-f_str->vptr[ib]->dp]],
+ ppst->sq[aa1[im]]);
+ fputc('\n',stderr);
+ }
+
+ fprintf(stderr,"---\n");
+ */
+#endif
+
+ /* combine the saved runs */
+ scor = sconn (f_str->vptr, nsave, ppst->param_u.fa.cgap, f_str,
+ rst, ppst, aa0, n0, aa1, n1, opt_prob);
+
+ /* find the best single run */
+ for (vmptr=f_str->vptr[0],ib=1; ib<nsave; ib++)
+ if (f_str->vptr[ib]->score > vmptr->score) vmptr=f_str->vptr[ib];
+
+ rst->score[1] = vmptr->score;
+ rst->score[0] = rst->score[2] = max (scor, vmptr->score);
+
+ return 1;
+}
+
+/* do_work() scores library sequence aa1 against the query held in
+   f_str (the aa0 argument itself is not used; f_str->aa0 is) and
+   returns the scores in rst.
+
+   - A shuffled comparison (qr_flg==1) requested after the shuffle
+     budget f_str->shuff_cnt is exhausted returns scores of -1,
+     escore 2.0 and valid_stat 0 without being counted.
+   - Otherwise the comparison is counted in s_info and valid_stat is 1.
+   - opt_prob, passed to do_fastf(), is 1 for any shuffled comparison
+     (which also decrements shuff_cnt); otherwise 0 when zsflag is 2 or
+     12, else 1 when f_str->dotat is set or zsflag is 4 or 14.
+   - An empty library sequence gives scores of -1 and escore 2.0.
+   - With TFAST defined, aa1 is first translated with aatran() for the
+     given frame.
+
+   shuff_flg is not used. */
+void do_work (const unsigned char *aa0, int n0,
+	      const unsigned char *aa1, int n1,
+	      int frame,
+	      const struct pstruct *ppst, struct f_struct *f_str,
+	      int qr_flg, int shuff_flg, struct rstruct *rst,
+	      struct score_count_s *s_info)
+{
+  int opt_prob;
+  int hoff;
+#ifdef TFAST
+  int n10, i;
+#endif
+
+  /* out of shuffles */
+  if (qr_flg==1 && f_str->shuff_cnt <= 0) {
+    rst->escore = 2.0;
+    rst->score[0]=rst->score[1]=rst->score[2]= -1;
+    rst->valid_stat = 0;
+    return;
+  }
+
+  s_info->s_cnt[ppst->score_ix]++;
+  s_info->tot_scores++;
+
+  rst->valid_stat = 1;
+
+  if (qr_flg) {
+    opt_prob = 1;
+    /*    if (frame==1) */
+    f_str->shuff_cnt--;
+  }
+  else if (ppst->zsflag == 2 || ppst->zsflag == 12) {
+    opt_prob = 0;
+  }
+  else if (f_str->dotat || ppst->zsflag == 4 || ppst->zsflag == 14 ) {
+    opt_prob = 1;
+  }
+  else {
+    opt_prob = 0;
+  }
+
+  if (n1 < 1) {
+    rst->score[0] = rst->score[1] = rst->score[2] = -1;
+    rst->escore = 2.0;
+    return;
+  }
+
+#ifdef TFAST
+  n10=aatran(aa1,f_str->aa1x,n1,frame);
+  if (ppst->debug_lib) {
+    /* check the translated residues */
+    for (i=0; i<n10; i++) {
+      if (f_str->aa1x[i]>ppst->nsq) {
+	fprintf(stderr,
+		"residue[%d/%d] %d range (%d)\n",i,n1,
+		f_str->aa1x[i],ppst->nsq);
+	f_str->aa1x[i]=0;
+	n10=i-1;
+      }
+    }
+  }
+
+  do_fastf (f_str->aa0, n0, f_str->aa1x, n10, ppst, f_str, rst, &hoff, opt_prob);
+#else	/* FASTF */
+  do_fastf (f_str->aa0, n0, aa1, n1, ppst, f_str, rst, &hoff, opt_prob);
+#endif
+
+  rst->comp = rst->H = -1.0;
+
+}
+
+/* do_opt - re-run do_fastf() on one library sequence with
+   ppst->param_u.fa.optflag temporarily forced to 1 and opt_prob set
+   to 1.  The previous optflag value is restored before returning.
+   The query is taken from f_str->aa0; the aa0 argument is not used.
+ */
+void do_opt (const unsigned char *aa0, int n0,
+             const unsigned char *aa1, int n1,
+             int frame,
+             struct pstruct *ppst,
+             struct f_struct *f_str,
+             struct rstruct *rst)
+{
+  const unsigned char *lib_seq = aa1;
+  int lib_len = n1;
+  int hoff;
+  int saved_optflag = ppst->param_u.fa.optflag;
+
+  ppst->param_u.fa.optflag = 1;
+
+#ifdef TFAST
+  /* compare against the translation rather than the DNA */
+  lib_len = aatran(aa1, f_str->aa1x, n1, frame);
+  lib_seq = f_str->aa1x;
+#endif
+
+  do_fastf(f_str->aa0, n0, lib_seq, lib_len, ppst, f_str, rst, &hoff, 1);
+
+  ppst->param_u.fa.optflag = saved_optflag;
+}
+
+/* savemax - record the current run on diagonal *dptr in the table of
+   saved runs (f_str->vmax).
+
+   If dptr->dmax already refers to a saved run on the same diagonal
+   with the same start, that entry is extended (and its score raised
+   if the new score is higher).  Otherwise the run overwrites the
+   lowest-scoring entry, f_str->lowmax.  Whenever the lowmax entry
+   has been changed, vmax[0..MAXSAV-1] is rescanned so that
+   f_str->lowmax / f_str->lowscor again describe the lowest entry.
+
+   Written with a prototype-style parameter list; old-style (K&R)
+   definitions are no longer valid in current C.
+ */
+void
+savemax (struct dstruct *dptr, struct f_struct *f_str)
+{
+  int dpos;
+  struct savestr *vmptr;
+  int i;
+
+  /* index of this diagonal within f_str->diag */
+  dpos = (int) (dptr - f_str->diag);
+
+/* check to see if this is the continuation of a run that is already saved */
+
+  if ((vmptr = dptr->dmax) != NULL && vmptr->dp == dpos &&
+      vmptr->start == dptr->start)
+    {
+      vmptr->stop = dptr->stop;
+      if ((i = dptr->score) <= vmptr->score)
+        return;                 /* extended, but not improved */
+      vmptr->score = i;
+      if (vmptr != f_str->lowmax)
+        return;                 /* the lowest entry is unchanged */
+    }
+  else
+    {
+      /* new run: replace the current lowest-scoring entry */
+      i = f_str->lowmax->score = dptr->score;
+      f_str->lowmax->dp = dpos;
+      f_str->lowmax->start = dptr->start;
+      f_str->lowmax->stop = dptr->stop;
+      dptr->dmax = f_str->lowmax;
+    }
+
+  /* the lowest entry changed - find the new lowest score */
+  for (vmptr = f_str->vmax; vmptr < &f_str->vmax[MAXSAV]; vmptr++)
+    if (vmptr->score < i)
+      {
+        i = vmptr->score;
+        f_str->lowmax = vmptr;
+      }
+  f_str->lowscor = i;
+}
+
+/* this version of spam() is designed to work with a collection of
+ subfragments, selecting the best amino acid at each position so
+ that, from each subfragment, each position is only used once.
+
+ As a result, m1_spam needs to know the number of fragments
+ (f_str->nm0) and the spacing between fragments in aa0
+ (f_str->nmoff).
+
+ In addition, it now requires a global alignment to the fragment
+ and resets the start and stop positions
+
+ Arguments:
+   aa0, n0 - query fragments.  A residue chosen for a column is
+             marked by adding 32 to its code, so the caller has to
+             subtract 32 again afterwards.  n0 is not used here.
+   aa1, n1 - library sequence and its length
+   dmax    - the saved run; ->start, ->stop and ->used[] are updated
+   pam2    - scoring matrix, indexed [query residue][library residue]
+   pam_l   - score charged for a column in which no unmarked query
+             residue scores higher
+   f_str   - supplies noff, nmoff and nm0
+
+ Returns the sum of the per-column scores.
+
+ */
+
+static int
+m1_spam (unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1,
+ struct savestr *dmax, int **pam2, int pam_l,
+ struct f_struct *f_str)
+{
+ int tpos, lpos, im, ii, nm, ci;
+ int tot, ctot, pv;
+
+ struct {
+ int start, stop, score;
+ } curv, maxv; /* only their .score members are zeroed below; otherwise unused */
+ unsigned char *aa0p;
+ const unsigned char *aa1p;
+
+ lpos = dmax->start; /* position in library sequence */
+ tpos = lpos - dmax->dp + f_str->noff; /* position in query sequence */
+ /* force global alignment, reset start*/
+ if (tpos < lpos) {
+ lpos = dmax->start -= tpos;
+ tpos = 0;
+ }
+ else {
+ tpos -= lpos;
+ lpos = dmax->start = 0;
+ }
+
+ dmax->stop = dmax->start + (f_str->nmoff -2 - tpos);
+ if (dmax->stop > n1) dmax->stop = n1; /* NOTE(review): the loop below is inclusive of ->stop, so this cap allows aa1[n1] to be read; m0_spam caps at n1-1 - confirm which is intended */
+
+ /*
+ if (dmax->start < 0) {
+ tpos = -dmax->start;
+ lpos = dmax->start=0;
+ }
+ else tpos = 0;
+ */
+
+ aa1p = &aa1[lpos];
+ aa0p = &aa0[tpos];
+
+ nm = f_str->nm0;
+
+ tot = curv.score = maxv.score = 0;
+ for (; lpos <= dmax->stop; lpos++,aa0p++,aa1p++) {
+ ctot = pam_l; /* column score unless some fragment does better */
+ ci = -1; /* offset of the chosen residue; -1 = none chosen */
+ for (im = 0, ii=0; im < nm; im++,ii+=f_str->nmoff) { /* same column in each fragment */
+ if (aa0p[ii] < 32 && (pv = pam2[aa0p[ii]][*aa1p]) > ctot) { /* codes >= 32 were already used */
+ ctot = pv;
+ ci = ii;
+/* fprintf(stderr, "lpos: %d im: %d ii: %d ci: %d ctot: %d pi: %d pv: %d\n", lpos, im, ii, ci, ctot, aa0p[ii], pam2[aa0p[ii]][*aa1p]); */
+ }
+ }
+ tot += ctot;
+ if (ci >= 0 && aa0p[ci] < 32) {
+#ifdef DEBUG
+/* fprintf(stderr, "used: lpos: %d ci: %d : %c\n", lpos, ci, sq[aa0p[ci]]); */
+#endif
+ aa0p[ci] += 32; /* mark the residue as used */
+ dmax->used[&aa0p[ci] - aa0] = 1; /* record its index within aa0 */
+ }
+ }
+ return tot;
+}
+
+int ma_spam (unsigned char *aa0, int n0, const unsigned char *aa1,
+ struct savestr *dmax, struct pstruct *ppst,
+ struct f_struct *f_str)
+{ /* ma_spam - alignment-time variant of the column scoring.
+
+   For every library position from dmax->start to dmax->stop the
+   best-scoring unmarked residue among the nm0 fragments is chosen
+   (floor score ppst->pam_l), marked by adding 32 to its code in aa0,
+   and copied into f_str->aa0t starting at offset f_str->aa0ix.
+   When at least one residue was copied, dmax->dp is shifted by
+   aa0ix and aa0ix is advanced by nmoff.
+
+   The caller has to subtract 32 from the marked aa0 residues again.
+
+   Returns the sum of the per-column scores.
+ */
+ int **pam2;
+ int tpos, lpos, im, ii, nm, ci, lp0;
+ int tot, ctot, pv;
+ struct {
+ int start, stop, score;
+ } curv, maxv;
+ const unsigned char *aa1p;
+ unsigned char *aa0p, *aa0pt;
+ int aa0t_flg; /* set once any residue has been copied to aa0t */
+
+ pam2 = ppst->pam2[0];
+ aa0t_flg = 0;
+
+ lpos = dmax->start; /* position in library sequence */
+ tpos = lpos - dmax->dp + f_str->noff; /* position in query sequence */
+ lp0 = lpos = dmax->start;
+ aa1p = &aa1[lpos];
+ aa0p = &aa0[tpos]; /* real aa0 sequence */
+
+ /* the destination aa0 sequence (without nulls) */
+ aa0pt = &f_str->aa0t[f_str->aa0ix];
+
+ curv.start = lpos;
+ nm = f_str->nm0;
+
+ /* sometimes, tpos may be > 0, with lpos = 0 - fill with 'X' */
+ if (lpos == 0 && tpos > 0)
+ for (ii = 0; ii < tpos; ii++) *aa0pt++ = 31; /* filler character */
+
+ tot = curv.score = maxv.score = 0;
+ for (; lpos <= dmax->stop; lpos++) {
+ ctot = ppst->pam_l; /* column score unless some fragment does better */
+ ci = -1; /* offset of the chosen residue; -1 = none chosen */
+ for (im = 0, ii=0; im < nm; im++,ii+=f_str->nmoff) { /* same column in each fragment */
+ if (aa0p[ii] < 32 && (pv = pam2[aa0p[ii]][*aa1p]) > ctot) { /* codes >= 32 were already used */
+ ctot = pv;
+ ci = ii;
+ }
+ }
+ tot += ctot;
+ if (ci >= 0) {
+ if (ci >= n0) {fprintf(stderr," warning - ci off end %d/%d\n",ci,n0);}
+ else {
+ *aa0pt++ = aa0p[ci]; /* copy the chosen residue, then mark it used */
+ aa0p[ci] += 32;
+ aa0t_flg=1;
+ }
+ }
+ aa0p++; aa1p++;
+ }
+
+ if (aa0t_flg) {
+ dmax->dp -= f_str->aa0ix; /* shift ->dp for aa0t */
+ if ((ci=(int)(aa0pt-f_str->aa0t)) > n0) {
+ fprintf(stderr," warning - aapt off %d/%d end\n",ci,n0);
+ }
+ else
+ *aa0pt++ = 0; /* skip over NULL */
+
+ aa0pt = &f_str->aa0t[f_str->aa0ix];
+ aa1p = &aa1[lp0];
+
+ /*
+ for (im = 0; im < f_str->nmoff; im++)
+ fprintf(stderr,"%c:%c,",ppst->sq[aa0pt[im]],ppst->sq[aa1p[im]]);
+ fprintf(stderr,"- %3d (%3d:%3d)\n",dmax->score,f_str->aa0ix,lp0);
+ */
+
+ f_str->aa0ix += f_str->nmoff; /* update offset into aa0t */
+ }
+ /*
+ fprintf(stderr," ma_spam returning: %d\n",tot);
+ */
+ return tot;
+}
+
+static int
+m0_spam (unsigned char *aa0, const unsigned char *aa1, int n1,
+ struct savestr *dmax, int **pam2,
+ struct f_struct *f_str)
+{ /* m0_spam - score a saved run without marking any query residue.
+
+   For each library position the highest pam2 score over the nm0
+   fragments (spaced nmoff apart in aa0) is added to the total; the
+   per-column floor is -10000, so every column contributes an actual
+   matrix value whenever nm0 > 0.  dmax->start, and in one case
+   dmax->stop, are adjusted on the way.
+
+   Returns the summed score.
+ */
+ int tpos, lpos, lend, im, ii, nm;
+ int tot, ctot, pv;
+ struct {
+ int start, stop, score;
+ } curv, maxv;
+ const unsigned char *aa0p, *aa1p;
+
+ lpos = dmax->start; /* position in library sequence */
+ tpos = lpos - dmax->dp + f_str->noff; /* position in query sequence */
+ if (tpos > 0) {
+ if (lpos-tpos >= 0) {
+ lpos = dmax->start -= tpos; /* force global alignment, reset start*/
+ tpos = 0;
+ }
+ else {
+ tpos -= lpos;
+ lpos = dmax->start = 0;
+ }
+ }
+
+ nm = f_str->nm0;
+ lend = dmax->stop;
+ if (n1 - (lpos + f_str->nmoff-2) < 0 ) { /* fragment would run past the end of the library */
+ lend = dmax->stop = (lpos - tpos) + f_str->nmoff-2;
+ if (lend >= n1) lend = n1-1; /* note: caps the loop bound only; dmax->stop keeps the uncapped value */
+ }
+
+ aa1p = &aa1[lpos];
+ aa0p = &aa0[tpos];
+
+ curv.start = lpos;
+
+ tot = curv.score = maxv.score = 0;
+ for (; lpos <= lend; lpos++) {
+ ctot = -10000;
+ for (im = 0, ii=0; im < nm; im++,ii+=f_str->nmoff) { /* same column in each fragment */
+ if ((pv = pam2[aa0p[ii]][*aa1p]) > ctot) {
+ ctot = pv;
+ }
+ }
+ tot += ctot;
+ aa0p++; aa1p++;
+ }
+
+ /* reset dmax if necessary */
+
+ return tot;
+}
+
+/* sconn links up non-overlapping alignments and calculates the score
+
+   v[0..n-1] - saved runs; re-sorted in place by library start
+   cgap      - when opt_prob is 0, runs scoring below cgap are skipped
+   opt_prob  - when non-zero, a run is joined to an earlier chain only
+               if calc_tatusov() does not report a worse probability,
+               and the candidate list is ordered by that probability
+               instead of by summed score
+   rst       - receives escore, segnum and seglen
+
+   Returns the score of the entry at the head of the candidate list,
+   or 0 when no run was accepted.
+ */
+
+int sconn (struct savestr **v, int n, int cgap, struct f_struct *f_str,
+ struct rstruct *rst, const struct pstruct *ppst,
+ const unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1,
+ int opt_prob)
+{
+ int i, si, cmpp (); /* cmpp is declared but not used in this function */
+ struct slink *start, *sl, *sj, *so, sarr[MAXSAV];
+ int lstart, plstop;
+ double tatprob;
+
+ /* sarr[] saves each alignment score/position, and provides a link
+ back to the previous alignment that maximizes the score */
+
+ /* sort the score left to right in lib pos */
+ kpsort (v, n);
+
+ start = NULL;
+
+ /* for the remaining runs, see if they fit */
+ for (i = 0, si = 0; i < n; i++) {
+
+ /* if the score is less than the gap penalty, it never helps */
+ if (!opt_prob && (v[i]->score < cgap) ){ continue; }
+
+ lstart = v[i]->start;
+
+ /* put the run in the group */
+ sarr[si].vp = v[i];
+ sarr[si].score = v[i]->score;
+ sarr[si].next = NULL;
+ sarr[si].prev = NULL;
+ sarr[si].tat = NULL;
+
+ if(opt_prob) {
+ sarr[si].tatprob =
+ calc_tatusov(NULL, &sarr[si], aa0, n0, aa1, n1,
+ ppst->pam2[0],ppst->nsq, f_str,
+ ppst->pseudocts, opt_prob,ppst->zsflag);
+ sarr[si].tat = sarr[si].newtat; /* presumably calc_tatusov() leaves a freshly allocated struct in ->newtat - verify against its definition */
+ }
+
+ /* if it fits, then increase the score */
+ for (sl = start; sl != NULL; sl = sl->next) {
+ plstop = sl->vp->stop;
+ /* if end < start or start > end, add score */
+ if (plstop < lstart ) {
+ if(!opt_prob) {
+ sarr[si].score = sl->score + v[i]->score;
+ sarr[si].prev = sl;
+ /*
+ fprintf(stderr,"sconn %d added %d/%d getting %d; si: %d, tat: %g\n",
+ i,v[i]->start, v[i]->score,sarr[si].score,si, 2.0);
+ */
+ break;
+ } else {
+ tatprob =
+ calc_tatusov(sl, &sarr[si], aa0, n0, aa1, n1,
+ ppst->pam2[0], ppst->nsq, f_str,
+ ppst->pseudocts, opt_prob, ppst->zsflag);
+ /* if our tatprob gets worse when we add this, forget it */
+ if(tatprob > sarr[si].tatprob) {
+ free(sarr[si].newtat->probs); /* get rid of new tat struct */
+ free(sarr[si].newtat);
+ continue;
+ } else {
+ sarr[si].tatprob = tatprob;
+ free(sarr[si].tat->probs); /* get rid of old tat struct */
+ free(sarr[si].tat);
+ sarr[si].tat = sarr[si].newtat;
+ sarr[si].prev = sl;
+ sarr[si].score = sl->score + v[i]->score;
+ /*
+ fprintf(stderr,"sconn TAT %d added %d/%d getting %d; si: %d, tat: %g\n",
+ i,v[i]->start, v[i]->score,sarr[si].score,si, tatprob);
+ */
+ break;
+ }
+ }
+ }
+ }
+
+ /* now recalculate where the score fits - resort the scores */
+ /* NOTE(review): in both insertion loops below, an entry that does
+ not beat any entry already in the list is never linked in, so
+ it cannot be chained onto by later runs - confirm intended */
+ if (start == NULL) {
+ start = &sarr[si];
+ } else {
+ if(!opt_prob) { /* sort by scores */
+ for (sj = start, so = NULL; sj != NULL; sj = sj->next) {
+ if (sarr[si].score > sj->score) { /* if new score > best score */
+ sarr[si].next = sj; /* previous best linked to best */
+ if (so != NULL)
+ so->next = &sarr[si]; /* old best points to new best */
+ else
+ start = &sarr[si];
+ break;
+ }
+ so = sj; /* old-best saved in so */
+ }
+ } else { /* sort by tatprobs */
+ for (sj = start, so = NULL; sj != NULL; sj = sj->next) {
+ if ( sarr[si].tatprob < sj->tatprob ||
+ ((sarr[si].tatprob == sj->tatprob) && sarr[si].score > sj->score) ) {
+ sarr[si].next = sj;
+ if (so != NULL)
+ so->next = &sarr[si];
+ else
+ start = &sarr[si];
+ break;
+ }
+ so = sj;
+ }
+ }
+ }
+ si++;
+ }
+
+ if(opt_prob) { /* release the tat struct kept for every accepted run */
+ for (i = 0 ; i < si ; i++) {
+ free(sarr[i].tat->probs);
+ free(sarr[i].tat);
+ }
+ }
+
+ if (start != NULL) {
+
+ if(opt_prob)
+ rst->escore = start->tatprob;
+ else
+ rst->escore = 2.0;
+
+ rst->segnum = rst->seglen = 0;
+ for(sj = start ; sj != NULL; sj = sj->prev) { /* walk the chain to count segments and residues */
+ rst->segnum++;
+ rst->seglen += sj->vp->stop - sj->vp->start + 1;
+ }
+ return (start->score);
+ } else {
+
+ if(opt_prob)
+ rst->escore = 1.0;
+ else
+ rst->escore = 2.0;
+
+ rst->segnum = rst->seglen = 0;
+ return (0);
+ }
+}
+
+/* kssort - Shell sort of v[0..n-1] into order of decreasing ->score.
+   Only the pointers in v[] are rearranged. */
+void
+kssort (struct savestr **v, int n)
+{
+  int step, hi, lo;
+  struct savestr *held;
+
+  for (step = n / 2; step > 0; step /= 2) {
+    for (hi = step; hi < n; hi++) {
+      /* sift the new element towards the front while it outranks
+         the element one step before it */
+      lo = hi - step;
+      while (lo >= 0 && v[lo]->score < v[lo + step]->score) {
+        held = v[lo];
+        v[lo] = v[lo + step];
+        v[lo + step] = held;
+        lo -= step;
+      }
+    }
+  }
+}
+/* kpsort - Shell sort of v[0..n-1] into order of increasing ->start
+   (left to right in the library).  Only the pointers in v[] are
+   rearranged.
+
+   Defined with a prototype-style parameter list, like kssort();
+   old-style (K&R) definitions are no longer valid in current C. */
+void
+kpsort (struct savestr **v, int n)
+{
+  int gap, i, j;
+  struct savestr *tmp;
+
+  for (gap = n / 2; gap > 0; gap /= 2)
+    for (i = gap; i < n; i++)
+      for (j = i - gap; j >= 0; j -= gap)
+        {
+          /* already in order - this element is placed */
+          if (v[j]->start <= v[j + gap]->start)
+            break;
+          tmp = v[j];
+          v[j] = v[j + gap];
+          v[j + gap] = tmp;
+        }
+}
+
+/* sorts alignments from right to left (back to front) based on stop
+
+   Shell sort of v[0..n-1] into order of decreasing ->stop; entries
+   with equal ->stop are exchanged as well (the test is a strict >).
+   Only the pointers in v[] are rearranged.
+
+   Defined with a prototype-style parameter list, like kssort();
+   old-style (K&R) definitions are no longer valid in current C. */
+
+void
+krsort (struct savestr **v, int n)
+{
+  int gap, i, j;
+  struct savestr *tmp;
+
+  for (gap = n / 2; gap > 0; gap /= 2)
+    for (i = gap; i < n; i++)
+      for (j = i - gap; j >= 0; j -= gap)
+        {
+          /* strictly larger stop already in front - placed */
+          if (v[j]->stop > v[j + gap]->stop)
+            break;
+          tmp = v[j];
+          v[j] = v[j + gap];
+          v[j + gap] = tmp;
+        }
+}
+
+/* do_walign - rescore one library sequence and build its alignment.
+
+   Allocates and returns a struct a_res_str (NULL if that allocation
+   fails); *have_ares is set to 0x2 to tell the caller the result is
+   a local copy.  The query is taken from f_str->aa0; the aa0 and
+   repeat_thresh arguments are not used here.  With TFAST defined the
+   library sequence is first translated into f_str->aa1x.
+
+   Steps: do_fastf() leaves the saved runs in f_str->vptr[0..nsave-1];
+   the best (at most nm0) runs are re-scored with ma_spam(), which
+   also fills f_str->aa0t; sconn_a() then chains them and writes the
+   alignment operations to f_str->res.
+ */
+struct a_res_str *
+do_walign (const unsigned char *aa0, int n0,
+           const unsigned char *aa1, int n1,
+           int frame, int repeat_thresh,
+           struct pstruct *ppst,
+           struct f_struct *f_str,
+           int *have_ares)
+{
+  struct a_res_str *a_res;
+  int hoff, n10;
+  int ib;
+  unsigned char *aa0t;
+  const unsigned char *aa1p;
+
+  *have_ares = 0x2;     /* set 0x2 bit to indicate local copy */
+
+  if ((a_res = (struct a_res_str *)calloc(1, sizeof(struct a_res_str)))==NULL) {
+    fprintf(stderr," [do_walign] Cannot allocate a_res");
+    return NULL;
+  }
+
+#ifdef TFAST
+  f_str->n10 = n10 = aatran(aa1,f_str->aa1x,n1,frame);
+  aa1p = f_str->aa1x;
+#else
+  n10 = n1;
+  aa1p = aa1;
+#endif
+
+  do_fastf(f_str->aa0, n0, aa1p, n10, ppst, f_str, &a_res->rst, &hoff, 1);
+
+  /* the alignment portion takes advantage of the information left
+     over in f_str after do_fastf is done.  in particular, it is
+     easy to run a modified sconn() to produce the alignments.
+
+     unfortunately, the alignment display routine wants to have
+     things encoded as with bd_align and sw_align, so we need to do that.
+  */
+
+  if ((aa0t = (unsigned char *)calloc(n0+1,sizeof(unsigned char)))==NULL) {
+    fprintf(stderr," cannot allocate aa0t %d\n",n0+1);
+    exit(1);
+  }
+
+  kssort (f_str->vptr, f_str->nsave);
+  f_str->aa0ix = 0;
+  if (f_str->nsave > f_str->nm0) f_str->nsave = f_str->nm0;
+
+  /* Only vptr[0..nsave-1] were filled in for this sequence.  The
+     loop used to run to nm0, which reads entries at or beyond nsave
+     whenever fewer than nm0 runs were saved; those slots are stale
+     or unset.  nsave <= nm0 holds after the clamp above, so nsave
+     is the correct bound. */
+  for (ib=0; ib < f_str->nsave; ib++) {
+    if (f_str->vptr[ib]->score > 0) {
+      f_str->vptr[ib]->score =
+        ma_spam (f_str->aa0, n0, aa1p, f_str->vptr[ib], ppst, f_str);
+    }
+  }
+
+  /* after ma_spam is over, we need to reset aa0 */
+  for (ib = 0; ib < n0; ib++) {
+    if (f_str->aa0[ib] >= 32) f_str->aa0[ib] -= 32;
+  }
+
+  kssort(f_str->vptr,f_str->nsave);
+
+  /* drop trailing runs whose new score is not positive */
+  for ( ; f_str->nsave > 0; f_str->nsave--)
+    if (f_str->vptr[f_str->nsave-1]->score >0) break;
+
+  a_res->nres = sconn_a (aa0t,n0, ppst->param_u.fa.cgap, f_str,a_res);
+  free(aa0t);
+
+  a_res->res = f_str->res;
+  a_res->sw_score = a_res->rst.score[0];
+  return a_res;
+}
+
+/* this version of sconn is modified to provide alignment information
+
+   aa0, n0 - scratch buffer that receives the query residues in
+             alignment order; its first n0 bytes are copied back
+             into f_str->aa0t at the end
+   cgap    - runs scoring below cgap are skipped
+   f_str   - supplies vptr[0..nsave-1], aa0t, noff and the res[] array
+             that receives the alignment operations
+   a_res   - receives min0/max0 (query) and min1/max1 (library)
+
+   Returns the alignment length (aligned positions plus the library
+   residues skipped between blocks), or 0 when no run was accepted.
+ */
+
+int sconn_a (unsigned char *aa0, int n0, int cgap,
+ struct f_struct *f_str,
+ struct a_res_str *a_res)
+{
+ int i, si, cmpp (), n; /* cmpp is declared but not used in this function */
+ unsigned char *aa0p;
+ int sx, dx, doff;
+
+ struct savestr **v;
+ struct slink { /* local definition, used only inside this function */
+ int score;
+ struct savestr *vp;
+ struct slink *snext;
+ struct slink *aprev;
+ } *start, *sl, *sj, *so, sarr[MAXSAV];
+ int lstop, plstart;
+ int *res, nres, tres;
+
+/* sort the runs right to left in lib pos (descending ->stop) */
+
+ v = f_str->vptr;
+ n = f_str->nsave;
+
+ krsort (v, n); /* sort from right to left in library (descending stop) */
+
+ start = NULL;
+
+/* for each alignment, see if it fits */
+
+ for (i = 0, si = 0; i < n; i++) {
+
+/* if the score is less than the join threshold, skip it */
+ if (v[i]->score < cgap) continue;
+
+ lstop = v[i]->stop; /* have right-most lstart */
+
+/* put the alignment in the group */
+
+ sarr[si].vp = v[i];
+ sarr[si].score = v[i]->score;
+ sarr[si].snext = NULL;
+ sarr[si].aprev = NULL;
+
+/* if it fits, then increase the score */
+/* start points to a sorted (by total score) list of candidate
+ overlaps */
+
+ for (sl = start; sl != NULL; sl = sl->snext) {
+ plstart = sl->vp->start;
+ if (plstart > lstop ) { /* this run ends before the listed chain begins */
+ sarr[si].score = sl->score + v[i]->score;
+ sarr[si].aprev = sl;
+ break; /* quit as soon as the alignment has been added */
+ }
+ }
+
+/* now recalculate the list of best scores */
+/* NOTE(review): an entry that does not beat any entry already in the
+ list is never linked in - confirm intended */
+ if (start == NULL)
+ start = &sarr[si]; /* put the first one in the list */
+ else
+ for (sj = start, so = NULL; sj != NULL; sj = sj->snext) {
+ if (sarr[si].score > sj->score) { /* new score better than old */
+ sarr[si].snext = sj; /* snext best after new score */
+ if (so != NULL)
+ so->snext = &sarr[si]; /* prev_best->snext points to best */
+ else start = &sarr[si]; /* start points to best */
+ break; /* stop looking */
+ }
+ so = sj; /* previous candidate best */
+ }
+ si++; /* increment to snext alignment */
+ }
+
+ /* we have the best set of alignments, write them to *res */
+ if (start != NULL) {
+ res = f_str->res; /* set a destination for the alignment ops */
+ tres = nres = 0; /* alignment op length = 0 */
+ aa0p = aa0; /* point into query (needed for calcons later) */
+ a_res->min1 = start->vp->start; /* start in library */
+ a_res->min0 = 0; /* start in query */
+ for (sj = start; sj != NULL; sj = sj->aprev ) { /* follow the chain left to right in the library */
+ doff = (int)(aa0p-aa0) - (sj->vp->start-sj->vp->dp+f_str->noff);
+ /*
+ fprintf(stderr,"doff: %3d\n",doff);
+ */
+ for (dx=sj->vp->start,sx=sj->vp->start-sj->vp->dp+f_str->noff;
+ dx <= sj->vp->stop; dx++) {
+ *aa0p++ = f_str->aa0t[sx++]; /* copy residue into aa0 */
+ tres++; /* bump alignment counter */
+ res[nres++] = 0; /* put 0-op in res */
+ }
+ sj->vp->dp -= doff;
+ if (sj->aprev != NULL) {
+ if (sj->aprev->vp->start - sj->vp->stop - 1 > 0 )
+ /* put an insert op into res to get to next aligned block */
+ tres += res[nres++] = (sj->aprev->vp->start - sj->vp->stop - 1);
+ }
+ /*
+ fprintf(stderr,"t0: %3d, tx: %3d, l0: %3d, lx: %3d, dp: %3d noff: %3d, score: %3d\n",
+ sj->vp->start - sj->vp->dp + f_str->noff,
+ sj->vp->stop - sj->vp->dp + f_str->noff,
+ sj->vp->start,sj->vp->stop,sj->vp->dp,
+ f_str->noff,sj->vp->score);
+ fprintf(stderr,"%3d - %3d: %3d\n",
+ sj->vp->start,sj->vp->stop,sj->vp->score);
+ */
+ a_res->max1 = sj->vp->stop;
+ a_res->max0 = a_res->max1 - sj->vp->dp + f_str->noff;
+ }
+
+ /*
+ fprintf(stderr,"(%3d - %3d):(%3d - %3d)\n",
+ a_res->min0,a_res->max0,a_res->min1,a_res->max1);
+ */
+
+ /* now replace f_str->aa0t with aa0 */
+ for (i=0; i<n0; i++) f_str->aa0t[i] = aa0[i];
+
+ return tres;
+ }
+ else return (0);
+}
+
+/* shscore - score of the sequence aligned against itself: the sum of
+   the diagonal entries pam2[r][r] over aa0[0..n0-1].  Residues with
+   code 0 or with a code above nsq contribute nothing. */
+int
+shscore(unsigned char *aa0, int n0, int **pam2, int nsq)
+{
+  int total = 0;
+  int pos = 0;
+  unsigned char res;
+
+  while (pos < n0) {
+    res = aa0[pos++];
+    if (res == 0 || res > nsq)
+      continue;
+    total += pam2[res][res];
+  }
+  return total;
+}
+
+/* pre_cons - with TFAST defined, translate the library sequence into
+   f_str->aa1x and store the length returned by aatran() in
+   f_str->n10.  Without TFAST this function does nothing. */
+void
+pre_cons(const unsigned char *aa1, int n1, int frame, struct f_struct *f_str)
+{
+#ifdef TFAST
+  int xlen;
+
+  xlen = aatran(aa1, f_str->aa1x, n1, frame);
+  f_str->n10 = xlen;
+#endif
+}
+
+/* aln_func_vals - set up aln.qlfact, qlrev, llfact, llmult, frame, llrev */
+/* call from calcons, calc_id, calc_code */
+/* Every field named above is assigned on every call.  The TFAST
+   branch used to set llrev only when frame > 3, leaving whatever
+   value the structure already held for the other frames. */
+void
+aln_func_vals(int frame, struct a_struct *aln) {
+
+#ifdef TFAST
+  aln->qlrev = 0;
+  aln->qlfact = 1;
+  aln->llfact = aln->llmult = 3;
+  aln->frame = 0;
+  /* frames above 3 are flagged as reversed; all others are not */
+  aln->llrev = (frame > 3) ? 1 : 0;
+#else /* FASTF */
+  aln->llfact = aln->qlfact = aln->llmult = 1;
+  aln->llrev = aln->qlrev = 0;
+  aln->frame = 0;
+#endif
+}
+
+/* aa0shuffle - reverse the column order of the query fragments.
+
+   aa0 is treated as f_str->nm0 rows of f_str->nmoff bytes; columns
+   0 .. nmoff-2 are reversed in every row and the last byte of each
+   row is left where it is.  n0 is not used. */
+void aa0shuffle(unsigned char *aa0, int n0, struct f_struct *f_str) {
+
+  int left, right, seg;
+  unsigned char *row;
+  unsigned char held;
+
+  /* walk inwards from both ends, exchanging whole columns */
+  for (left = 0, right = f_str->nmoff - 2; left < right; left++, right--) {
+    for (seg = 0; seg < f_str->nm0; seg++) {
+      row = aa0 + seg * f_str->nmoff;
+      held = row[right];
+      row[right] = row[left];
+      row[left] = held;
+    }
+  }
+}
diff --git a/src/dropfs2.c b/src/dropfs2.c
new file mode 100644
index 0000000..516037e
--- /dev/null
+++ b/src/dropfs2.c
@@ -0,0 +1,1681 @@
+/* $Id: dropfs2.c 1254 2014-01-29 16:03:40Z wrp $ */
+/* $Revision: 1254 $ */
+
+/* copyright (c) 1998, 1999, 2014 by William R. Pearson and The Rector &
+ Visitors of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+/* changed to return 2.0, rather than -1.0, for failure */
+
+/* Feb 4, 2005 - modifications to allow searches with ktup=2 for very
+ long queries. This is a temporary solution to savemax(), spam()
+ which do not preserve exact matches
+
+ do_fasts() has been modified to allow higher maxsav for do_walign
+ than for do_work (2*nsegs, 6*nsegs)
+ */
+
+/* this code implements the "fasts" algorithm, which compares a set of
+ protein fragments to a protein sequence. Commas are used to separate
+ the sequence fragments, which need not be the same length.
+
+ The expected input is:
+
+ >mgstm1
+ MGDAPDFD,
+ MILGYW,
+ MLLEYTDS
+
+ The fragments do not need to be in the correct order (which is
+ presumably unknown from the peptide sequencing).
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <math.h>
+
+#include "defs.h"
+#include "param.h"
+#include "tatstats.h"
+
+#define EOSEQ 0 /* code init_work() stores in place of each fragment separator */
+#define ESS 59 /* code init_work() recognises as a fragment separator in the query */
+#define NMAP_X 21 /* for 'X' - changed for NCBI */
+#define NMAP_Z 25 /* for '*' - changed for NCBI */
+#define MAXHASH 32
+#define NMAP MAXHASH+1 /* NOTE(review): expansion is not parenthesized; fine in the == and < comparisons in init_work(), but would misparse next to a higher-precedence operator */
+
+static char *verstr="4.32 Feb 2007";
+
+#define DROP_INTERN
+#include "drop_func.h"
+
+int shscore(const unsigned char *aa0, const int n0, int **pam2, int nsq);
+extern void aancpy(char *to, char *from, int count, struct pstruct *ppst);
+
+#ifdef TFAST
+extern int aatran(const unsigned char *ntseq, unsigned char *aaseq, const int maxs, const int frame);
+#endif
+
+void savemax(struct dstruct *, struct f_struct *, int maxsav, int exact,int t_end);
+
+int spam(const unsigned char *, const unsigned char *, int, struct savestr *, int **, struct f_struct *);
+int sconn(struct savestr **v,
+ int nsave,
+ struct f_struct *,
+ struct rstruct *,
+ const struct pstruct *,
+ const unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1,
+ int opt_prob);
+
+void kpsort(struct savestr **, int);
+void kssort(struct savestr **, int); /* sort by score */
+int sconn_a(unsigned char *, int,
+ const unsigned char *, int,
+ struct f_struct *,
+ struct a_res_str *,
+ struct pstruct *);
+void kpsort(struct savestr **, int); /* repeats the kpsort declaration a few lines above */
+
+/* initialize for fasts/fastm
+
+   init_work - allocate and fill the per-query work structure.
+
+   aa0, n0 - the encoded query: peptide fragments separated by ESS
+             (or 0); every separator is rewritten to EOSEQ
+   ppst    - search parameters; param_u.fa.pgap, param_u.fa.ktup and
+             hsq[NMAP_X] / hsq[NMAP_Z] may be modified
+   f_arg   - receives the newly allocated struct f_struct
+
+   The function builds the ktup hash table (harr/link/pamh2), the
+   fragment offset table (nmoff), the per-position tables aa0b, aa0e,
+   aa0i, aa0l and aa0s, the saved-run arrays, and - for zsflag 1..3
+   (after removing offsets of 10/20) with at most 10 fragments - the
+   pre-computed Tatusov probability tables.  Any allocation failure
+   prints a message and calls exit(1).
+ */
+
+void
+init_work (unsigned char *aa0, const int n0,
+           struct pstruct *ppst,
+           struct f_struct **f_arg
+           )
+{
+  int mhv, phv;
+  int hmax, nsegs;
+  int i0, ib, hv, old_hv;
+  int pamfact;
+  struct f_struct *f_str;
+  /* these used to be globals, but do not need to be */
+  int ktup, fact, kt1;
+
+  int maxn0;
+  int stmp;     /* temporary score */
+  int tmp_zsflag;
+  int i, j, q;
+  int tat_size;
+  int *res;
+
+  unsigned char *query;
+  int k, l, m, n, N, length, index;
+
+  double *tatprobptr;
+
+  /* every other allocation in this function is checked; this one
+     was dereferenced without a check */
+  if ((f_str = (struct f_struct *)calloc(1,sizeof(struct f_struct))) == NULL) {
+    fprintf (stderr, " cannot allocate f_str\n");
+    exit (1);
+  }
+
+  ppst->param_u.fa.pgap = ppst->gdelval + ppst->ggapval;
+  ktup = ppst->param_u.fa.ktup;
+  if ( ktup > ppst->param_u.fa.bktup ) {
+    ktup = ppst->param_u.fa.ktup = ppst->param_u.fa.bktup;
+  }
+  fact = ppst->param_u.fa.scfact;
+
+  /* fasts3 cannot work with lowercase symbols as low complexity;
+     thus, NMAP must be disabled; this depends on aascii['X']  */
+  if (ppst->hsq[NMAP_X] == NMAP ) {ppst->hsq[NMAP_X]=1;}
+  if (ppst->hsq[NMAP_Z] == NMAP ) {ppst->hsq[NMAP_Z]=1;}
+  /* this does not work in a threaded environment */
+  /*  else {fprintf(stderr," cannot find 'X'==NMAP\n");} */
+
+  /* largest hash code in use decides the bits per residue */
+  for (i0 = 1, mhv = -1; i0 < ppst->nsq; i0++)
+    if (ppst->hsq[i0] < NMAP && ppst->hsq[i0] > mhv) mhv = ppst->hsq[i0];
+
+  if (mhv <= 0) {
+    fprintf (stderr, " maximum hsq <=0 %d\n", mhv);
+    exit (1);
+  }
+
+  for (f_str->kshft = 0; mhv > 0; mhv /= 2) f_str->kshft++;
+
+/*      kshft = 2;      */
+  kt1 = ktup-1;
+  hv = 1;
+  for (i0 = 0; i0 < ktup; i0++) hv = hv << f_str->kshft;
+  hmax = hv;
+  f_str->hmask = (hmax >> f_str->kshft) - 1;
+
+  if ((f_str->aa0t = (unsigned char *) calloc(n0+1, sizeof(char))) == NULL) {
+    fprintf (stderr, " cannot allocate f_str0->aa0t array; %d\n",n0+1);
+    exit (1);
+  }
+
+  if ((f_str->aa0ti = (int *) calloc(n0+1, sizeof(int))) == NULL) {
+    fprintf (stderr, " cannot allocate f_str0->aa0ti array; %d\n",n0+1);
+    exit (1);
+  }
+
+  if ((f_str->aa0b = (int *) calloc(n0+1, sizeof(int))) == NULL) {
+    fprintf (stderr, " cannot allocate f_str0->aa0b array; %d\n",n0+1);
+    exit (1);
+  }
+
+  if ((f_str->aa0e = (int *) calloc(n0+1, sizeof(int))) == NULL) {
+    fprintf (stderr, " cannot allocate f_str0->aa0e array; %d\n",n0+1);
+    exit (1);
+  }
+
+  if ((f_str->aa0i = (int *) calloc(n0+1, sizeof(int))) == NULL) {
+    fprintf (stderr, " cannot allocate f_str0->aa0i array; %d\n",n0+1);
+    exit (1);
+  }
+
+  if ((f_str->aa0s = (int *) calloc(n0+1, sizeof(int))) == NULL) {
+    fprintf (stderr, " cannot allocate f_str0->aa0s array; %d\n",n0+1);
+    exit (1);
+  }
+
+  if ((f_str->aa0l = (int *) calloc(n0+1, sizeof(int))) == NULL) {
+    fprintf (stderr, " cannot allocate f_str0->aa0l array; %d\n",n0+1);
+    exit (1);
+  }
+
+  if ((f_str->harr = (int *) calloc (hmax, sizeof (int))) == NULL) {
+    fprintf (stderr, " cannot allocate hash array: hmax: %d hmask: %d\n",
+             hmax, f_str->hmask);
+    exit (1);
+  }
+  if ((f_str->pamh1 = (int *) calloc (ppst->nsq+1, sizeof (int))) == NULL) {
+    fprintf (stderr, " cannot allocate pamh1 array\n");
+    exit (1);
+  }
+  if ((f_str->pamh2 = (int *) calloc (hmax, sizeof (int))) == NULL) {
+    fprintf (stderr, " cannot allocate pamh2 array\n");
+    exit (1);
+  }
+
+  if ((f_str->link = (int *) calloc (n0, sizeof (int))) == NULL) {
+    fprintf (stderr, " cannot allocate hash link array");
+    exit (1);
+  }
+
+  /* for FASTS/FASTM, we want to know when we get to the end of a peptide,
+     so we can ensure that we set the end and restart */
+
+  if ((f_str->l_end = (int *) calloc (n0, sizeof (int))) == NULL) {
+    fprintf (stderr, " cannot allocate link end array");
+    exit (1);
+  }
+
+  for (i0 = 0; i0 < hmax; i0++) f_str->harr[i0] = -1;
+  for (i0 = 0; i0 < n0; i0++) f_str->link[i0] = -1;
+  for (i0 = 0; i0 < n0; i0++) f_str->l_end[i0] = 0;
+
+  /* count the number of peptides */
+  nsegs = 1;
+  for (i0 = 0; i0 < n0; i0++) {
+    if (aa0[i0] == ESS || aa0[i0] == 0) nsegs++;
+  }
+
+  /* allocate space for peptides offsets, nm_u */
+  if ((f_str->nmoff = (int *)calloc(nsegs+1, sizeof(int)))==NULL) {
+    fprintf(stderr, " cannot allocat nmoff array: %d\n", nsegs);
+    exit(1);
+  }
+
+  if ((f_str->nm_u = (int *)calloc(nsegs+1, sizeof(int)))==NULL) {
+    fprintf(stderr, " cannot allocat nm_u array: %d\n", nsegs);
+    exit(1);
+  }
+
+  phv = hv = 0;
+  f_str->nmoff[0] = 0;
+  f_str->nm0 = 1;
+
+  /* encode the aa0 array */
+  if (kt1 > 0) {
+    hv = ppst->hsq[aa0[0]];
+    phv = ppst->pam2[0][aa0[0]][aa0[0]];
+  }
+
+  for (i0=kt1 ; i0 < n0; i0++) {
+    if (aa0[i0] == ESS || aa0[i0] == 0) {
+      /*      fprintf(stderr," converted %d to 0\n",aa0[i0]); */
+      aa0[i0] = EOSEQ;  /* set ESS to 0 */
+      f_str->nmoff[f_str->nm0++] = i0+1;
+      /* a separator at position 0 has no preceding residue; without
+         this guard the store would land on l_end[-1] */
+      if (i0 > 0) f_str->l_end[i0-1] = 1;
+      phv = hv = 0;
+      if (kt1 > 0) {
+        /* prime the hash with the first residue of the next peptide */
+        i0++;
+        hv = ppst->hsq[aa0[i0]];
+        phv = ppst->pam2[0][aa0[i0]][aa0[i0]];
+      }
+      continue;
+    }
+
+    hv = ((hv & f_str->hmask) << f_str->kshft) + ppst->hsq[aa0[i0]];
+    f_str->link[i0] = f_str->harr[hv];
+    f_str->harr[hv] = i0;
+    f_str->pamh2[hv] = (phv += ppst->pam2[0][aa0[i0]][aa0[i0]]);
+    phv -= ppst->pam2[0][aa0[i0 - kt1]][aa0[i0 - kt1]];
+  }
+  f_str->l_end[n0-1] = 1;
+
+  f_str->nmoff[f_str->nm0] = n0+1;
+
+  /*
+#ifdef DEBUG
+  fprintf(stderr, ">>%s\n",qtitle);
+  for (j=0; j<f_str->nm0; j++) {
+    for (i=f_str->nmoff[j]; i < f_str->nmoff[j+1]-1; i++) {
+      fprintf(stderr,"%c",ppst->sq[aa0[i]]);
+    }
+    fprintf(stderr," %d\n",aa0[i]);
+  }
+
+  for (j=1; j<=ppst->nsq; j++) {
+    fprintf(stderr, "%c %d\n", ppst->sq[j], f_str->harr[j]);
+  }
+
+  for (j=0; j<=n0; j++) {
+    fprintf(stderr, "%c %d\n", ppst->sq[aa0[j]], f_str->link[j]);
+  }
+
+#endif
+  */
+
+  /* build an integer array of the max score that can be achieved
+     from that position - use in savemax to mark some segments as
+     fixed */
+
+  /* setup aa0b[], aa0e[], which specify the begining and end of each
+     segment */
+
+  stmp = 0;
+  q = -1;
+  for (ib = i0 = 0; i0 < n0; i0++) {
+    f_str->aa0l[i0] = i0 - q;
+    if (aa0[i0]==EOSEQ) {
+      f_str->aa0b[i0] = -1;
+      f_str->aa0e[i0] = -1;
+      f_str->aa0i[i0] = -1;
+      f_str->aa0l[i0] = -1;
+      q = i0;
+      if (i0 > 0)f_str->aa0s[i0-1] = stmp;
+      stmp = 0;
+      ib++;
+    }
+    else {
+      stmp += ppst->pam2[0][aa0[i0]][aa0[i0]];
+    }
+
+    /* note: these three stores also overwrite the -1 values written
+       just above for a separator position */
+    f_str->aa0b[i0] = f_str->nmoff[ib];
+    f_str->aa0e[i0] = f_str->nmoff[ib+1]-2;
+    f_str->aa0i[i0] = ib;
+
+    /*
+    fprintf(stderr,"%2d %c: %2d %2d %2d\n",i0,ppst->sq[aa0[i0]],
+            f_str->aa0b[i0],f_str->aa0e[i0],f_str->aa0i[i0]);
+    */
+  }
+  f_str->aa0s[n0-1]=stmp;       /* save last best possible score */
+
+  /* maxsav - maximum number of peptide alignments saved in search */
+  /* maxsav_w - maximum number of peptide alignments saved in
+     alignment */
+
+  f_str->maxsav = max(MAXSAV,2*f_str->nm0);
+  f_str->maxsav_w = max(MAXSAV,6*f_str->nm0);
+
+  if ((f_str->vmax = (struct savestr *)
+       calloc(f_str->maxsav_w,sizeof(struct savestr)))==NULL) {
+    fprintf(stderr, "Couldn't allocate vmax[%d].\n",f_str->maxsav_w);
+    exit(1);
+  }
+
+  if ((f_str->vptr = (struct savestr **)
+       calloc(f_str->maxsav_w,sizeof(struct savestr *)))==NULL) {
+    fprintf(stderr, "Couldn't allocate vptr[%d].\n",f_str->maxsav_w);
+    exit(1);
+  }
+
+  if ((f_str->sarr = (struct slink *)
+       calloc(f_str->maxsav_w,sizeof(struct slink)))==NULL) {
+    fprintf(stderr, "Couldn't allocate sarr[%d].\n",f_str->maxsav_w);
+    exit(1);
+  }
+
+  /* Tatusov Statistics Setup */
+
+  /* initialize priors array. */
+  if((f_str->priors = (double *)calloc(ppst->nsq+1, sizeof(double))) == NULL) {
+    fprintf(stderr, "Couldn't allocate priors array.\n");
+    exit(1);
+  }
+
+  calc_priors(f_str->priors, ppst, f_str, NULL, 0, ppst->pseudocts);
+
+  /* pre-calculate the Tatusov probability array for each full segment */
+
+  tmp_zsflag = ppst->zsflag;
+  if (tmp_zsflag > 20) tmp_zsflag -= 20;
+  if (tmp_zsflag > 10) tmp_zsflag -= 10;
+  if (tmp_zsflag >= 1 && tmp_zsflag <= 3 && f_str->nm0 <= 10) {
+
+    /* one table per non-empty subset of the nm0 fragments */
+    tat_size = (1<<f_str->nm0) -1;
+    f_str->dotat = 1;
+    f_str->tatprobs = (struct tat_str **) malloc((size_t)tat_size*sizeof(struct tat_str *));
+    if (f_str->tatprobs == NULL) {
+      /* cast so the argument really is the long that %ld expects */
+      fprintf (stderr, " cannot allocate tatprobs array: %ld\n",
+               (long) tat_size * (long) sizeof(struct tat_str *));
+      exit (1);
+    }
+
+    f_str->intprobs = (double **) malloc((size_t)tat_size * sizeof(double *));
+    if(f_str->intprobs == NULL) {
+      fprintf(stderr, "Couldn't allocate intprobs array.\n");
+      exit(1);
+    }
+
+    for(k = 0, l = f_str->nm0 ; k < l ; k++) {
+      query = &(aa0[f_str->nmoff[k]]);
+      length = f_str->nmoff[k+1] - f_str->nmoff[k] - 1;
+
+      /* this segment alone */
+      index = (1 << k) - 1;
+      generate_tatprobs(query, 0, length - 1, f_str->priors, ppst->pam2[0], ppst->nsq, &(f_str->tatprobs[index]), NULL);
+
+      /* integrate the probabilities */
+      N = f_str->tatprobs[index]->highscore - f_str->tatprobs[index]->lowscore;
+      tatprobptr = (double *) calloc(N+1, sizeof(double));
+      if(tatprobptr == NULL) {
+        fprintf(stderr, "Couldn't calloc tatprobptr.\n");
+        exit(1);
+      }
+      f_str->intprobs[index] = tatprobptr;
+
+      /* intprobs[i] = sum of probs[i..N] */
+      for (i = 0; i <= N ; i++ ) {
+        tatprobptr[i] = f_str->tatprobs[index]->probs[i];
+        for (j = i + 1 ; j <= N ; j++ ) {
+          tatprobptr[i] += f_str->tatprobs[index]->probs[j];
+        }
+      }
+
+      /* this segment built on top of all other subcombinations */
+      for(i = 0, j = (1 << k) - 1 ; i < j ; i++) {
+        index = (1 << k) + i;
+        generate_tatprobs(query, 0, length - 1, f_str->priors, ppst->pam2[0], ppst->nsq, &(f_str->tatprobs[index]), f_str->tatprobs[i]);
+
+        /* integrate the probabilities */
+        N = f_str->tatprobs[index]->highscore - f_str->tatprobs[index]->lowscore;
+        tatprobptr = (double *) calloc(N+1, sizeof(double));
+        if(tatprobptr == NULL) {
+          fprintf(stderr, "Couldn't calloc tatprobptr.\n");
+          exit(1);
+        }
+        f_str->intprobs[index] = tatprobptr;
+
+        for (m = 0; m <= N ; m++ ) {
+          tatprobptr[m] = f_str->tatprobs[index]->probs[m];
+          for (n = m + 1 ; n <= N ; n++ ) {
+            tatprobptr[m] += f_str->tatprobs[index]->probs[n];
+          }
+        }
+      }
+    }
+  } else {
+    f_str->dotat = 0;
+    f_str->shuff_cnt = ppst->shuff_node;
+  }
+
+  /* End of Tatusov Statistics Setup */
+
+  /*
+  for (i0=1; i0<=ppst->nsq; i0++) {
+    fprintf(stderr," %c: %2d ",ppst->sq[i0],f_str->harr[i0]);
+    hv = f_str->harr[i0];
+    while (hv >= 0) {
+      fprintf(stderr," %2d",f_str->link[hv]);
+      hv = f_str->link[hv];
+    }
+    fprintf(stderr,"\n");
+  }
+  */
+
+/* this has been modified from 0..<ppst->nsq to 1..<=ppst->nsq because the
+   pam2[0][0] is now undefined for consistency with blast
+*/
+  for (i0 = 1; i0 <= ppst->nsq; i0++)
+    f_str->pamh1[i0] = ppst->pam2[0][i0][i0];
+
+  f_str->ndo = 0;
+  f_str->noff = n0-1;
+  if (f_str->diag==NULL)
+    f_str->diag = (struct dstruct *) calloc ((size_t)MAXDIAG,
+                                             sizeof (struct dstruct));
+  if (f_str->diag == NULL) {
+    fprintf (stderr, " cannot allocate diagonal arrays: %ld\n",
+             (long) MAXDIAG * (long) (sizeof (struct dstruct)));
+    exit (1);
+  }
+
+#ifdef TFAST
+  if ((f_str->aa1x =(unsigned char *)calloc((size_t)ppst->maxlen+2,
+                                            sizeof(unsigned char)))
+      == NULL) {
+    fprintf (stderr, "cannot allocate aa1x array %d\n", ppst->maxlen+2);
+    exit (1);
+  }
+  f_str->aa1x++;        /* leaves one spare byte in front of the sequence */
+#endif
+
+  maxn0 = max(3*n0/2,MIN_RES);
+  if ((res = (int *)calloc((size_t)maxn0,sizeof(int)))==NULL) {
+    fprintf(stderr,"cannot allocate alignment results array %d\n",maxn0);
+    exit(1);
+  }
+  f_str->res = res;
+  f_str->max_res = maxn0;
+
+  *f_arg = f_str;
+}
+
+
+/* get_param() - report the program name and scoring parameters as text.
+   pstring1[0] receives "<program> (<version>)", with " init1" appended
+   when ppstr->param_u.fa.iniflag is set; pstring1[1] receives the matrix
+   name, its (pam_h:pam_l) pair and ktup.  The original note describes
+   pstring1 as a message to the manager, "currently 512".
+   pstring2, when it is not NULL, receives the same information as
+   "; pg_...:" lines (described in the original as a markx==10 format).
+   s_info is not referenced here. */
+void
+get_param (const struct pstruct *ppstr,
+           char **pstring1, char *pstring2,
+           struct score_count_s *s_info)
+{
+  /* the program label is fixed at compile time */
+#ifdef FASTS
+#ifdef TFAST
+  char *pg_str = "TFASTS";
+#else
+  char *pg_str = "FASTS";
+#endif
+#endif
+
+#ifdef FASTM
+#ifdef TFAST
+  char *pg_str = "TFASTM";
+#else
+  char *pg_str = "FASTM";
+#endif
+#endif
+
+  sprintf (pstring1[0], "%s (%s)", pg_str, verstr);
+  sprintf (pstring1[1], "%s matrix (%d:%d), ktup=%d",
+           ppstr->pam_name, ppstr->pam_h, ppstr->pam_l,
+           ppstr->param_u.fa.ktup);
+
+  if (ppstr->param_u.fa.iniflag) {
+    strcat (pstring1[0], " init1");
+  }
+
+  /* the second report is optional */
+  if (pstring2 == NULL) return;
+
+  sprintf (pstring2,
+           "; pg_name_alg: %s\n; pg_ver_rel: %s\n; pg_matrix: %s (%d:%d)\n"
+           "; pg_gap-pen: %d %d\n; pg_ktup: %d\n",
+           pg_str, verstr, ppstr->pam_name, ppstr->pam_h, ppstr->pam_l,
+           ppstr->gdelval, ppstr->ggapval, ppstr->param_u.fa.ktup);
+}
+
+/* close_work() - release every buffer owned by *f_arg, then the
+   structure itself, and set *f_arg to NULL.  Nothing happens when
+   *f_arg is already NULL.  aa0, n0 and ppst are not used. */
+void
+close_work (const unsigned char *aa0, const int n0,
+            struct pstruct *ppst,
+            struct f_struct **f_arg)
+{
+  struct f_struct *fs = *f_arg;
+  int t, n_tat;
+
+  if (fs == NULL) return;
+
+  free(fs->res);
+#ifdef TFAST
+  /* aa1x was advanced by one when it was allocated; step back to the
+     address calloc() returned */
+  free(fs->aa1x - 1);
+#endif
+  free(fs->diag);
+  free(fs->l_end);
+  free(fs->link);
+  free(fs->pamh2);
+  free(fs->pamh1);
+  free(fs->harr);
+  free(fs->vmax);
+  free(fs->vptr);
+  free(fs->sarr);
+  free(fs->aa0l);
+  free(fs->aa0s);
+  free(fs->aa0i);
+  free(fs->aa0e);
+  free(fs->aa0b);
+  free(fs->aa0ti);
+  free(fs->aa0t);
+  free(fs->nmoff);
+  free(fs->nm_u);
+
+  if (fs->dotat) {
+    /* one tatprobs/intprobs entry per non-empty subset of the nm0
+       query segments: (2^nm0)-1 entries */
+    n_tat = (1 << fs->nm0) - 1;
+    for (t = 0; t < n_tat; t++) {
+      free(fs->tatprobs[t]->probs);
+      free(fs->tatprobs[t]);
+      free(fs->intprobs[t]);
+    }
+    free(fs->tatprobs);
+    free(fs->intprobs);
+  }
+
+  free(fs->priors);
+  free(fs);
+  *f_arg = NULL;
+}
+ /* do_fasts() - score library sequence aa1[0..n1-1] against the query
+    lookup tables held in f_str and report the result in *rst.
+
+    Early exits: if n1 < ktup the three scores are 0 and escore is 1.0;
+    if n0+n1+1 >= MAXDIAG a message is printed, the three scores are -1
+    and escore is 2.0.
+
+    Otherwise:
+     1. the diagonals from f_str->ndo up to n0+n1 and the first maxsav
+        entries of f_str->vmax[] are cleared;
+     2. aa1 is scanned; each hash value is looked up through
+        f_str->harr[] / f_str->link[] and the run on the corresponding
+        diagonal is extended or restarted, with candidate runs handed
+        to savemax();
+     3. every saved run with a positive score is rescored by spam();
+        runs still positive are collected in f_str->vptr[] and sorted
+        by score with kssort();
+     4. only the first (best-scoring) run for each query segment index
+        (f_str->aa0i[]) is kept, the others get score -1 and are
+        dropped from the count;
+     5. sconn() chains the remaining runs; rst->score[1] is the best
+        single run, rst->score[0] and rst->score[2] are the larger of
+        that and the sconn() result.
+
+    f_str->nsave is left as min(number of kept runs, f_str->nm0).
+    *hoff is never written by this function.
+ */
+void do_fasts (const unsigned char *aa0, const int n0,
+ const unsigned char *aa1, const int n1,
+ const struct pstruct *ppst, struct f_struct *f_str,
+ struct rstruct *rst, int *hoff, int opt_prob,
+ int maxsav)
+{
+ int nd; /* diagonal array size */
+ int lhval;
+ int kfact;
+ register struct dstruct *dptr;
+ register int tscor;
+ register struct dstruct *diagp;
+ struct dstruct *dpmax;
+ register int lpos;
+ int tpos;
+ struct savestr *vmptr, *vmaxmax;
+ int scor, tmp; /* tmp is not used in this function */
+ int im, ib, nsave, i; /* im is unused; i appears only in commented-out code */
+ int cmps (); /* comparison routine for ksort */
+ int ktup;
+ int doffset;
+
+
+ vmaxmax = &f_str->vmax[maxsav]; /* one past the last vmax[] slot in use */
+
+ ktup = ppst->param_u.fa.ktup;
+
+ if (n1 < ktup) {
+ rst->score[0] = rst->score[1] = rst->score[2] = 0;
+ rst->escore = 1.0;
+ rst->segnum = 0;
+ rst->seglen = 0;
+ return;
+ }
+
+ if (n0+n1+1 >= MAXDIAG) {
+ fprintf(stderr,"n0,n1 too large: %d, %d\n",n0,n1);
+ rst->score[0] = rst->score[1] = rst->score[2] = -1;
+ rst->escore = 2.0;
+ rst->segnum = 0;
+ rst->seglen = 0;
+ return;
+ }
+
+ nd = n0 + n1;
+ /* diag[0..ndo-1] was reset at the end of the previous call (see the
+    flush loop below), so only diag[ndo..nd-1] needs clearing here */
+ dpmax = &f_str->diag[nd];
+ for (dptr = &f_str->diag[f_str->ndo]; dptr < dpmax;)
+ {
+ dptr->stop = -1;
+ dptr->dmax = NULL;
+ dptr++->score = 0;
+ }
+
+ for (vmptr = f_str->vmax; vmptr < vmaxmax; vmptr++) {
+ vmptr->score = 0;
+ vmptr->exact = 0;
+ }
+ f_str->lowmax = f_str->vmax;
+ f_str->lowscor = 0;
+
+ /* start hashing */
+ diagp = &f_str->diag[f_str->noff];
+ for (lhval=lpos=0; lpos < n1; lpos++, diagp++) {
+ if (ppst->hsq[aa1[lpos]]>=NMAP) { /* skip residue */
+ lpos++ ; diagp++;
+ while (lpos < n1 && ppst->hsq[aa1[lpos]]>=NMAP) {lpos++; diagp++;}
+ if (lpos >= n1) break;
+ lhval = 0;
+ }
+ /* shift the previous (masked) hash value and add the current residue */
+ lhval = ((lhval & f_str->hmask) << f_str->kshft) + ppst->hsq[aa1[lpos]];
+ /* each tpos on the harr[]/link[] chain indexes aa0[]; diagp[-tpos]
+    is the diagonal entry for the pair (tpos, lpos) */
+ for (tpos = f_str->harr[lhval]; tpos >= 0; tpos = f_str->link[tpos]) {
+
+ dptr = &diagp[-tpos];
+
+ if (f_str->l_end[tpos]) {
+ if (dptr->score + f_str->pamh1[aa0[tpos]] == f_str->aa0s[tpos]) {
+ dptr->stop = lpos;
+ dptr->score = f_str->aa0s[tpos];
+ savemax(dptr, f_str, maxsav, 1, tpos);
+ dptr->dmax = NULL;
+ }
+
+ else if (dptr->score + f_str->pamh1[aa0[tpos]] > f_str->aa0s[tpos]) {
+ /*
+ fprintf(stderr,"exact match score too high: %d:%d %d < %d + %d - %d:%d - %d > %d\n",
+ tpos, lpos, f_str->aa0s[tpos],dptr->score, f_str->pamh1[aa0[tpos]],
+ dptr->start, dptr->stop,
+ dptr->stop - dptr->start, f_str->aa0l[tpos]);
+ */
+ dptr->stop = lpos;
+ dptr->start = lpos - f_str->aa0l[tpos];
+ dptr->score = f_str->aa0s[tpos];
+ savemax(dptr, f_str, maxsav, 1, tpos);
+ dptr->dmax = NULL;
+ }
+ }
+ else if ((tscor = dptr->stop) >= 0) {
+ tscor++; /* tscor is stop of current, increment it */
+ if ((tscor -= lpos) <= 0) { /* tscor, the end of the current
+ match, is before lpos, so there
+ is a mismatch - this is also the
+ mismatch cost */
+ tscor *= 2;
+ scor = dptr->score; /* save the run score on the diag */
+ if ((tscor += (kfact = f_str->pamh2[lhval])) < 0
+ && f_str->lowscor < scor) {
+ /* if what we will get (tscor + kfact) is < 0 and the
+ score is better than the worst savemax() score, save
+ it */
+ savemax (dptr, f_str, maxsav,0,-1);
+ }
+
+ /* if extending is better than starting over, extend */
+ if ((tscor += scor) >= kfact) {
+ dptr->score = tscor;
+ dptr->stop = lpos;
+ if (f_str->l_end[tpos]) { /* NOTE(review): this chain is the else of "if (f_str->l_end[tpos])" above, so this test looks always false here - confirm */
+ if (dptr->score == f_str->aa0s[tpos]) {
+ savemax(dptr, f_str, maxsav,1,tpos);
+ dptr->dmax = NULL;
+ }
+ else if (dptr->score > f_str->lowscor)
+ savemax(dptr, f_str, maxsav,0,tpos);
+ }
+ }
+ else { /* otherwise, start new */
+ dptr->score = kfact;
+ dptr->start = dptr->stop = lpos;
+ }
+ }
+ else { /* tscor is after lpos, so extend one residue */
+ dptr->score += f_str->pamh1[aa0[tpos]];
+ dptr->stop = lpos;
+ if (f_str->l_end[tpos]) {
+ if (dptr->score == f_str->aa0s[tpos]) {
+ savemax(dptr, f_str, maxsav,1,tpos);
+ dptr->dmax = NULL;
+ }
+ else if (dptr->score > f_str->lowscor)
+ savemax(dptr, f_str, maxsav,0,tpos);
+ }
+ }
+ }
+ else { /* start new */
+ dptr->score = f_str->pamh2[lhval];
+ dptr->start = dptr->stop = lpos;
+ }
+ } /* end tpos */
+ } /* end lpos */
+ /* flush: save any run still better than the current worst saved
+    score, and reset every diagonal used by this pair */
+ for (dptr = f_str->diag; dptr < dpmax;) {
+ if (dptr->score > f_str->lowscor) savemax (dptr, f_str, maxsav,0,-1);
+ dptr->stop = -1;
+ dptr->dmax = NULL;
+ dptr++->score = 0;
+ }
+ f_str->ndo = nd;
+
+/*
+ at this point all of the elements of aa1[lpos]
+ have been searched for elements of aa0[tpos]
+ with the results in diag[dpos]
+*/
+
+ for (nsave=0, vmptr=f_str->vmax; vmptr< vmaxmax; vmptr++) {
+ if (vmptr->score > 0) {
+ /*
+
+ fprintf(stderr,"%c 0: %4d-%4d 1: %4d-%4d dp: %d score: %d",
+ (vmptr->exact ? 'x' : ' '),
+ f_str->noff+vmptr->start-vmptr->dp,
+ f_str->noff+vmptr->stop-vmptr->dp,
+ vmptr->start,vmptr->stop,
+ vmptr->dp,vmptr->score);
+ */
+ vmptr->score = spam (aa0, aa1, n1, vmptr, ppst->pam2[0], f_str);
+ /*
+ fprintf(stderr," sscore: %d %d-%d\n",vmptr->score,vmptr->start,vmptr->stop);
+ */
+ if (vmptr->score > 0) f_str->vptr[nsave++] = vmptr;
+ }
+ }
+
+ if (nsave <= 0) {
+ rst->score[0] = rst->score[1] = rst->score[2] = 0;
+ rst->escore = 1.0;
+ rst->segnum = 0;
+ rst->seglen = 0;
+ f_str->nsave = 0;
+ return;
+ }
+
+ /*
+ fprintf(stderr,"n0: %d; n1: %d; noff: %d\n",n0,n1,f_str->noff);
+ for (ib=0; ib<nsave; ib++) {
+ fprintf(stderr,"%c 0: %4d-%4d 1: %4d-%4d dp: %d score: %d\n",
+ f_str->vptr[ib]->exact ? 'x' : ' ',
+ f_str->noff+f_str->vptr[ib]->start-f_str->vptr[ib]->dp,
+ f_str->noff+f_str->vptr[ib]->stop-f_str->vptr[ib]->dp,
+ f_str->vptr[ib]->start,f_str->vptr[ib]->stop,
+ f_str->vptr[ib]->dp,f_str->vptr[ib]->score);
+ }
+
+ fprintf(stderr,"---\n");
+ */
+ kssort(f_str->vptr,nsave);
+
+ /* make certain each seg is used only once: vptr[] is in decreasing
+    score order, so the first run seen for a segment index is kept and
+    later runs for the same index are marked with score -1 */
+
+ for (ib=0; ib<f_str->nm0; ib++) f_str->nm_u[ib]=0;
+ for (ib=0; ib < nsave; ib++) {
+ doffset = f_str->vptr[ib]->dp - f_str->noff;
+ tpos=f_str->aa0i[f_str->vptr[ib]->start - doffset];
+ if (f_str->nm_u[tpos] == 0) {
+ f_str->nm_u[tpos]=1;
+ } else {
+ f_str->vptr[ib]->score = -1;
+ }
+ }
+ /* re-sort so the -1 entries fall to the end, then trim them off */
+ kssort(f_str->vptr,nsave);
+ for (ib = nsave-1; ib >= 0; ib--) {
+ if (f_str->vptr[ib]->score > -1) break;
+ }
+ nsave = ib+1;
+
+#ifdef DEBUG
+ /*
+ for (ib = 0; ib < nsave; ib++) {
+ if (f_str->vptr[ib]->score > 1000) {
+ fprintf(stderr," score[%d] too high: %d\n",ib, f_str->vptr[ib]->score);
+ for (i=0; i< 10; i++) {
+ fprintf(stderr, "%c:%d ",ppst->sq[aa1[i]],aa1[i]);
+ }
+ fprintf(stderr,"\n");
+
+ f_str->vptr[ib]->score = 0;
+ }
+ }
+ */
+#endif
+
+ scor = sconn (f_str->vptr, nsave,
+ f_str, rst, ppst, aa0, n0, aa1, n1,
+ opt_prob);
+
+ if (rst->escore < 0.0) rst->escore = 2.0;
+ kssort(f_str->vptr,nsave); /* sconn() re-orders vptr[] by start; restore score order */
+
+ /* here we should use an nsave that is consistent with sconn and nm0 */
+
+ f_str->nsave = nsave;
+ if (nsave > f_str->nm0) f_str->nsave = f_str->nm0;
+
+ rst->score[1] = f_str->vptr[0]->score;
+ rst->score[0] = rst->score[2] = max(scor, f_str->vptr[0]->score);
+
+}
+
+/* do_work() - score one library sequence and fill in *rst.
+   A shuffled-query request (qr_flg == 1) made after f_str->shuff_cnt
+   has run out is answered with valid_stat 0, escore 2.0 and scores -1.
+   Otherwise the call is counted in s_info, opt_prob is chosen, and
+   do_fasts() does the scoring (on the translated sequence when TFAST
+   is defined).  Sequences shorter than ktup get scores -1 and escore
+   2.0.  shuff_flg is not used. */
+void do_work (const unsigned char *aa0, const int n0,
+              const unsigned char *aa1, const int n1,
+              int frame,
+              const struct pstruct *ppst, struct f_struct *f_str,
+              int qr_flg, int shuff_flg, struct rstruct *rst,
+              struct score_count_s *s_info)
+{
+  int opt_prob;
+  int hoff, n10, i;
+
+  if (qr_flg==1 && f_str->shuff_cnt <= 0) {
+    rst->valid_stat = 0;
+    rst->escore = 2.0;
+    rst->score[0]=rst->score[1]=rst->score[2]= -1;
+    return;
+  }
+  rst->valid_stat = 1;
+
+  s_info->s_cnt[ppst->score_ix]++;
+  s_info->tot_scores++;
+
+  /* a shuffled-query call always uses probabilities and consumes one
+     shuffle; otherwise zsflag 2/12 switches them off, and dotat or
+     zsflag 4/14 switches them on */
+  if (qr_flg) {
+    opt_prob = 1;
+    f_str->shuff_cnt--;
+  }
+  else if (ppst->zsflag == 2 || ppst->zsflag == 12) {
+    opt_prob = 0;
+  }
+  else {
+    opt_prob = (f_str->dotat || ppst->zsflag == 4 || ppst->zsflag == 14) ? 1 : 0;
+  }
+
+  if (n1 < ppst->param_u.fa.ktup) {
+    rst->score[0] = rst->score[1] = rst->score[2] = -1;
+    rst->escore = 2.0;
+    return;
+  }
+
+#ifdef TFAST
+  n10 = aatran(aa1,f_str->aa1x,n1,frame);
+  if (ppst->debug_lib) {
+    /* report a translated residue code above nsq and cut the
+       sequence short at that point */
+    for (i = 0; i < n10; i++) {
+      if (f_str->aa1x[i] > ppst->nsq) {
+        fprintf(stderr,
+                "residue[%d/%d] %d range (%d)\n",i,n1,
+                f_str->aa1x[i],ppst->nsq);
+        f_str->aa1x[i] = 0;
+        n10 = i-1;
+      }
+    }
+  }
+
+  do_fasts (aa0, n0, f_str->aa1x, n10, ppst, f_str, rst, &hoff, opt_prob, f_str->maxsav);
+#else /* FASTA */
+  do_fasts (aa0, n0, aa1, n1, ppst, f_str, rst, &hoff, opt_prob, f_str->maxsav);
+#endif
+
+  rst->comp = rst->H = -1.0;
+}
+
+/* do_opt() - rescore one library sequence with do_fasts(), always with
+   opt_prob set to 1 and with f_str->maxsav saved runs.  When TFAST is
+   defined the library sequence is translated into f_str->aa1x first. */
+void do_opt (const unsigned char *aa0, const int n0,
+             const unsigned char *aa1, const int n1,
+             int frame,
+             struct pstruct *ppst, struct f_struct *f_str,
+             struct rstruct *rst)
+{
+  int hoff;
+#ifdef TFAST
+  int n10 = aatran(aa1,f_str->aa1x,n1,frame);
+
+  do_fasts (aa0, n0, f_str->aa1x, n10, ppst, f_str, rst, &hoff, 1, f_str->maxsav);
+#else /* FASTA */
+  do_fasts (aa0, n0, aa1, n1, ppst, f_str, rst, &hoff, 1, f_str->maxsav);
+#endif
+}
+
+
+/* modify savemax() so that full length 100% matches are marked
+ so that they cannot be removed - if we have a 100% match, mark "exact"
+
+ modify savemax() to split alignments that include a comma
+*/
+
+/* savemax(dptr, f_str, maxsav) takes a current diagonal run (saved in dptr),
+ and places it in the set of runs to be saved (in f_str->vmax[])
+
+ exact is stored in the saved entry's ->exact field; entries with
+ ->exact set are skipped when the worst entry is searched for, so
+ they are not chosen as f_str->lowmax by that search.  tpos is not
+ used in the body.
+
+ Two cases:
+  - continuation: dptr->dmax points at a saved entry on the same
+    diagonal with the same start.  Its stop is always updated; score
+    and exact are updated only if the run's score is now higher.
+  - otherwise the entry at f_str->lowmax is overwritten with this run
+    and dptr->dmax is pointed at it.
+ In both cases, when the entry that changed is (or was) lowmax, the
+ first maxsav entries are rescanned and f_str->lowmax / f_str->lowscor
+ are refreshed.
+*/
+
+void
+savemax (struct dstruct *dptr, struct f_struct *f_str, int maxsav,
+ int exact, int tpos)
+{
+ register int dpos; /* position along the diagonal, -n0 .. n1 */
+ int i, j, lowj; /* j and lowj are not used */
+ register struct savestr *vmptr;
+ struct savestr *vmaxmax;
+
+ vmaxmax = &f_str->vmax[maxsav]; /* assigned but not read again; the rescan below recomputes the bound */
+
+ dpos = (int) (dptr - f_str->diag); /* current diagonal */
+
+/* check to see if this is the continuation of a run that is already saved */
+/* if we are at the end of the query, save it regardless */
+
+/* if (t_end > 0 && t_end < dptr->stop - dptr->start) {return;} */
+
+ if ((vmptr = dptr->dmax) != NULL /* have an active run */
+ && vmptr->dp == dpos && /* on the correct diagonal */
+ vmptr->start == dptr->start) { /* and it starts at the same place */
+ vmptr->stop = dptr->stop; /* update the end of the match in vmax[] */
+
+ if (exact == 1) {
+ /*
+ fprintf(stderr,"have cont exact match: %d - %d:%d %d:%d = %d\n",
+ dptr->score, dptr->start, dptr->stop,
+ vmptr->start, vmptr->stop, dptr->stop - dptr->start+1);
+ */
+ exact = 1; /* no-op: only the disabled trace above made this branch useful */
+ }
+
+
+/* if the score is worse, don't update, return - if the score gets bad
+ enough, it will restart in the diagonal scan */
+ if ((i = dptr->score) <= vmptr->score) { return;}
+
+/* score is better, update */
+ vmptr->score = i;
+
+ vmptr->exact = exact;
+/* if the score is not the worst, return */
+ if (vmptr != f_str->lowmax) { return;}
+ }
+ else { /* not a continuation */
+ /* save in the lowest place */
+ /*
+ fprintf(stderr," Replacing: %d - %d:%d => %d - %d:%d",
+ f_str->lowmax->score, f_str->lowmax->start, f_str->lowmax->stop,
+ dptr->score, dptr->start, dptr->stop);
+ */
+
+ vmptr = f_str->lowmax;
+
+ /*
+ if (exact == 1) {
+ fprintf(stderr,"have new exact match: %d - %d:%d = %d\n",
+ dptr->score, dptr->start, dptr->stop, dptr->stop - dptr->start+1);
+ }
+ */
+ vmptr->exact = exact;
+
+ i = vmptr->score = dptr->score; /* 'i' is used as a bound */
+ vmptr->dp = dpos;
+ vmptr->start = dptr->start;
+ vmptr->stop = dptr->stop;
+ dptr->dmax = vmptr;
+ }
+
+ /* rescan the list for the worst score; i starts at the score just
+    stored, so lowmax only moves if a non-exact entry scores lower */
+ for (vmptr = f_str->vmax; vmptr < &f_str->vmax[maxsav] ; vmptr++) {
+ if (vmptr->score < i && !vmptr->exact) {
+ i = vmptr->score;
+ f_str->lowmax = vmptr;
+ }
+ }
+
+ f_str->lowscor = i;
+}
+
+/* this version of spam scans the diagonal to find the best local score,
+ then resets the boundaries for a global alignment and re-scans */
+
+/* NOOVERHANG allows one to score any overhanging alignment as zero.
+ Useful for SAGE alignments. Normally, one allows overhangs because
+ of the possibility of partial sequences.
+*/
+
+#undef NOOVERHANG
+
+/*
+ May, 2005 - spam() has an interesting bug that occurs when two
+ peptides match in order, separated by one position (the comma). In
+ this case, spam() splits the match, and only returns the better of
+ the two matches. So, if spam splits an alignment at a comma, it
+ needs the ability to insert the missing match.
+
+*/
+
+/* spam() - rescore one saved diagonal run and snap it to a whole
+   query segment.
+
+   dmax describes a run on diagonal dmax->dp covering library positions
+   dmax->start..dmax->stop (inclusive); the query position paired with
+   library position lpos is lpos - (dmax->dp - f_str->noff).
+
+   Pass 1 walks the run summing pam2[][] and keeps the best-scoring
+   sub-run; the running total restarts whenever it drops below zero.
+   If no sub-run scores above zero, 0 is returned and dmax is left
+   unchanged.
+
+   Pass 2 replaces the boundaries with those of the query segment that
+   contains the end of the best sub-run (f_str->aa0b[] and
+   f_str->aa0e[] specify the residues that start and end the segment),
+   clips them to the library sequence, and rescores that whole span.
+   dmax->start and dmax->stop are updated and the new score, which may
+   be zero or negative, is returned.
+
+   Because the stop position is inclusive, the last usable value is
+   n1-1.  A projected stop equal to n1 is therefore clipped too; the
+   earlier test (stop > n1) let that case through and scored one
+   position past the end of aa1.
+
+   With NOOVERHANG defined, any segment that would have to be clipped
+   is scored as 0 instead.
+*/
+int spam (const unsigned char *aa0, const unsigned char *aa1,int n1,
+          struct savestr *dmax, int **pam2,
+          struct f_struct *f_str)
+{
+  int lpos, doffset;
+  int tot;
+  struct {
+    int start, stop, score;
+  } curv, maxv;
+  register const unsigned char *aa0p, *aa1p;
+
+  curv.start = dmax->start;
+  aa1p = &aa1[dmax->start];
+  doffset = dmax->dp - f_str->noff;
+  aa0p = &aa0[dmax->start - doffset];
+
+  /* pass 1: best local sub-run along the diagonal */
+  tot = curv.score = maxv.score = 0;
+  for (lpos = dmax->start; lpos <= dmax->stop; lpos++) {
+    tot += pam2[*aa0p++][*aa1p++];
+    if (tot > curv.score) {
+      curv.stop = lpos;	/* here, curv.stop is actually curv.max */
+      curv.score = tot;
+    }
+    else if (tot < 0) {
+      if (curv.score > maxv.score) {
+        maxv.start = curv.start;
+        maxv.stop = curv.stop;
+        maxv.score = curv.score;
+      }
+      tot = curv.score = 0;
+      curv.start = lpos+1;
+    }
+  }
+
+  if (curv.score > maxv.score) {
+    maxv.start = curv.start;
+    maxv.stop = curv.stop;
+    maxv.score = curv.score;
+  }
+
+  if (maxv.score <= 0) return 0;
+
+  /* pass 2: reset the boundaries of the alignment using aa0b[] and
+     aa0e[], both looked up at the (old) stop position before either
+     boundary is overwritten */
+
+  maxv.start = f_str->aa0b[maxv.stop-doffset] + doffset;
+  if (maxv.start < 0) {
+    maxv.start = 0;
+#ifdef NOOVERHANG
+    return 0;
+#endif
+  }
+
+  maxv.stop = f_str->aa0e[maxv.stop-doffset] + doffset;
+  if (maxv.stop >= n1) {	/* inclusive stop: n1 itself is already past the end */
+    maxv.stop = n1-1;
+#ifdef NOOVERHANG
+    return 0;
+#endif
+  }
+  aa1p = &aa1[lpos = maxv.start];
+  aa0p = &aa0[lpos - doffset];
+
+  for (tot=0; lpos <= maxv.stop; lpos++) {
+    tot += pam2[*aa0p++][*aa1p++];
+  }
+
+  maxv.score = tot;
+
+/*  if (maxv.start != dmax->start || maxv.stop != dmax->stop)
+    printf(" new region: %3d %3d %3d %3d\n",maxv.start,
+    dmax->start,maxv.stop,dmax->stop);
+*/
+  dmax->start = maxv.start;
+  dmax->stop = maxv.stop;
+
+  return maxv.score;
+}
+ /* sconn() - chain the n saved runs in v[] into the best-scoring
+    consistent set and report it through *rst.
+
+    v[] is first sorted by library start (kpsort).  Each run in turn is
+    placed in f_str->sarr[]; the list headed by `start` (linked through
+    ->next) is searched for an earlier chain it can be appended to,
+    and ->prev records the chain it was joined to.
+
+    Without opt_prob the first compatible chain is taken and the
+    scores are added.  With opt_prob, calc_tatusov() supplies a
+    probability for the run alone and for each candidate join; a join
+    is accepted only if it does not make the probability larger.
+
+    Each new entry is then inserted into the ->next list ahead of the
+    first entry it beats (higher score, or with opt_prob lower tatprob
+    with score as tie-break).  An entry that beats none is left out of
+    the list.
+
+    On return rst->escore is start->tatprob (opt_prob) or 2.0,
+    rst->segnum / rst->seglen count the runs and residues on the best
+    chain, and the function value is the best chain's score.  If there
+    were no runs, escore is 1.0 and 0 is returned.
+ */
+int sconn (struct savestr **v, int n,
+ struct f_struct *f_str,
+ struct rstruct *rst, const struct pstruct *ppst,
+ const unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1, int opt_prob)
+{
+ int i, si, cmpp (); /* cmpp is declared but not called here */
+ struct slink *start, *sl, *sj, *so, *sarr;
+ int lstart, ltmp, tstart, plstop, ptstop, ptstart, tstop;
+ double tatprob;
+ int dotat; /* not used */
+
+ sarr = f_str->sarr;
+
+ /* sort the score left to right in lib pos */
+ kpsort (v, n);
+
+ start = NULL;
+ rst->score[0] = 0;
+ rst->escore = 2.0;
+
+/* for the remaining runs, see if they fit */
+/* lstart/lstop -> start/stop in library sequence
+ tstart/tstop -> start/stop in query sequence
+ plstop -> library stop of a chain already in the list
+ ptstart/ptstop -> query start/stop of that chain's last run
+*/
+
+ for (i = 0, si = 0; i < n; i++) {
+
+ /* the segment is worth adding; find out where? */
+ lstart = v[i]->start;
+ ltmp = v[i]->stop;
+ tstart = lstart - v[i]->dp + f_str->noff;
+ tstop = ltmp - v[i]->dp + f_str->noff;
+
+ /* put the run in the group */
+ sarr[si].vp = v[i];
+ sarr[si].score = v[i]->score;
+ sarr[si].next = NULL;
+ sarr[si].prev = NULL;
+ sarr[si].tat = NULL;
+
+/*
+ opt_prob for FASTS only has to do with using aa1 for priors,
+ i.e. we always calculate tatprobs for segments in FASTS (unlike
+ FASTF)
+*/
+ if(opt_prob) {
+ sarr[si].tatprob =
+ calc_tatusov(NULL, &sarr[si], aa0, n0, aa1, n1,
+ ppst->pam2[0], ppst->nsq, f_str,
+ ppst->pseudocts, opt_prob, ppst->zsflag);
+ if (sarr[si].tatprob < 0.0) {
+ fprintf(stderr," negative tatprob: %lg\n",sarr[si].tatprob);
+ sarr[si].tatprob = 1.0;
+ }
+ sarr[si].tat = sarr[si].newtat; /* presumably calc_tatusov() fills ->newtat - confirm against its definition */
+ }
+
+/* if it fits, then increase the score
+
+ start points to the highest scoring run
+ -> next is the second highest, etc.
+ put the segment into the highest scoring run that it fits into
+*/
+ for (sl = start; sl != NULL; sl = sl->next) {
+ ltmp = sl->vp->start;
+ /* plstop -> previous lstop */
+ plstop = sl->vp->stop;
+ /* ptstart -> previous t(query) start */
+ ptstart = ltmp - sl->vp->dp + f_str->noff;
+ /* ptstop -> previous t(query) stop */
+ ptstop = plstop - sl->vp->dp + f_str->noff;
+#ifndef FASTM
+ /* if the previous library stop is before the current library start
+ and the two query ranges do not overlap (in either order) */
+ if (plstop < lstart && ( ptstop < tstart || ptstart > tstop))
+#else
+ /* if the previous library stop is before the current library start
+ and the previous query stop is before the current query start */
+ if (plstop < lstart && ptstop < tstart)
+#endif
+ {
+ if(!opt_prob) {
+ sarr[si].score = sl->score + v[i]->score;
+ sarr[si].prev = sl;
+ break;
+ } else {
+ tatprob = calc_tatusov(sl, &sarr[si], aa0, n0, aa1, n1,
+ ppst->pam2[0], ppst->nsq, f_str,
+ ppst->pseudocts, opt_prob, ppst->zsflag);
+ /* if our tatprob gets worse when we add this, forget it */
+ if(tatprob > sarr[si].tatprob) {
+ free(sarr[si].newtat->probs); /* get rid of new tat struct */
+ free(sarr[si].newtat);
+ continue; /* reuse this sarr[si] */
+ } else {
+ sarr[si].tatprob = tatprob;
+ free(sarr[si].tat->probs); /* get rid of old tat struct */
+ free(sarr[si].tat);
+ sarr[si].tat = sarr[si].newtat;
+ sarr[si].prev = sl;
+ sarr[si].score = sl->score + v[i]->score;
+ /*
+ fprintf(stderr,"sconn %d added %d:%d getting %d; si: %d, tat: %g\n",
+ i,v[i]->start, v[i]->score,sarr[si].score,si, tatprob);
+ */
+ break;
+ }
+ }
+ }
+ }
+
+ /* now recalculate where the score fits; an entry that beats nothing
+    already in the list falls out of both loops without being linked */
+ if (start == NULL) start = &sarr[si];
+ else {
+ if(!opt_prob) {
+ for (sj = start, so = NULL; sj != NULL; sj = sj->next) {
+ if (sarr[si].score > sj->score) {
+ sarr[si].next = sj;
+ if (so != NULL)
+ so->next = &sarr[si];
+ else
+ start = &sarr[si];
+ break;
+ }
+ so = sj;
+ }
+ } else {
+ for (sj = start, so = NULL; sj != NULL; sj = sj->next) {
+ if ( sarr[si].tatprob < sj->tatprob ||
+ ((sarr[si].tatprob == sj->tatprob) && sarr[si].score > sj->score) ) {
+ sarr[si].next = sj;
+ if (so != NULL)
+ so->next = &sarr[si];
+ else
+ start = &sarr[si];
+ break;
+ }
+ so = sj;
+ }
+ }
+ }
+
+ si++;
+ }
+ /* every sarr[] entry still owns one tat struct at this point */
+ if(opt_prob) {
+ for (i = 0 ; i < si ; i++) {
+ free(sarr[i].tat->probs);
+ free(sarr[i].tat);
+ }
+ }
+
+ if (start != NULL) {
+ if(opt_prob) {
+ rst->escore = start->tatprob;
+ } else {
+ rst->escore = 2.0;
+ }
+
+ rst->segnum = rst->seglen = 0;
+ for(sj = start ; sj != NULL; sj = sj->prev) {
+ rst->segnum++;
+ rst->seglen += sj->vp->stop - sj->vp->start + 1;
+ }
+ return (start->score);
+ } else {
+ rst->escore = 1.0;
+ }
+
+ rst->segnum = rst->seglen = 0;
+ return (0);
+}
+
+/* kssort() - Shell sort of the n pointers in v[] into decreasing
+   ->score order.  A pair with equal scores is never exchanged. */
+void
+kssort (v, n)
+struct savestr *v[];
+int n;
+{
+  int gap, hi, lo;
+  struct savestr *held;
+
+  gap = n / 2;
+  while (gap > 0) {
+    for (hi = gap; hi < n; hi++) {
+      /* move the larger score toward the front in steps of gap */
+      lo = hi - gap;
+      while (lo >= 0 && v[lo]->score < v[lo + gap]->score) {
+        held = v[lo];
+        v[lo] = v[lo + gap];
+        v[lo + gap] = held;
+        lo -= gap;
+      }
+    }
+    gap /= 2;
+  }
+}
+
+/* kpsort() - Shell sort of the n pointers in v[] into increasing
+   ->start order.  A pair with equal starts is never exchanged. */
+void
+kpsort (v, n)
+struct savestr *v[];
+int n;
+{
+  int gap, hi, lo;
+  struct savestr *held;
+
+  gap = n / 2;
+  while (gap > 0) {
+    for (hi = gap; hi < n; hi++) {
+      /* move the smaller start toward the front in steps of gap */
+      lo = hi - gap;
+      while (lo >= 0 && v[lo]->start > v[lo + gap]->start) {
+        held = v[lo];
+        v[lo] = v[lo + gap];
+        v[lo + gap] = held;
+        lo -= gap;
+      }
+    }
+    gap /= 2;
+  }
+}
+
+/* shscore() - score of aa0[0..n0-1] aligned with itself (the 100%
+   identical score): the sum of pam2[r][r] over the residues r, leaving
+   out EOSEQ markers and any code greater than nsq. */
+int
+shscore(const unsigned char *aa0, const int n0, int **pam2, int nsq)
+{
+  int pos = 0, total = 0;
+  unsigned char r;
+
+  while (pos < n0) {
+    r = aa0[pos++];
+    if (r == EOSEQ || r > nsq) continue;	/* not a scorable residue */
+    total += pam2[r][r];
+  }
+  return total;
+}
+
+/* krsort() - Shell sort of the n pointers in v[] into decreasing
+   ->stop order, i.e. alignments run from right to left (back to
+   front).  A pair with equal stops IS exchanged when compared. */
+
+void
+krsort (v, n)
+struct savestr *v[];
+int n;
+{
+  int gap, hi, lo;
+  struct savestr *held;
+
+  gap = n / 2;
+  while (gap > 0) {
+    for (hi = gap; hi < n; hi++) {
+      /* move the larger stop toward the front in steps of gap */
+      lo = hi - gap;
+      while (lo >= 0 && v[lo]->stop <= v[lo + gap]->stop) {
+        held = v[lo];
+        v[lo] = v[lo + gap];
+        v[lo + gap] = held;
+        lo -= gap;
+      }
+    }
+    gap /= 2;
+  }
+}
+
+/* do_walign() - produce an alignment result for one library sequence.
+
+   A fresh a_res_str is allocated on every call and *have_ares is set
+   to 0x02 (the 0x2 bit indicates a local copy).  The sequence is
+   scored with do_fasts() using f_str->maxsav_w saved runs and
+   opt_prob = 1; the resulting rstruct is copied into a_res->rst and
+   rst.score[0] into a_res->sw_score.
+
+   If do_fasts() left no saved runs, a_res is returned with res ==
+   NULL and all four min/max coordinates 0.  Otherwise trailing runs
+   with score <= 0 are dropped from f_str->nsave, sconn_a() builds the
+   alignment encoding in f_str->res, and a_res->res points at that
+   buffer (owned by f_str, not by a_res).
+
+   Returns NULL (after printing a message) if a_res cannot be
+   allocated; exits if the temporary query copy cannot be allocated.
+   repeat_thresh is not used.
+*/
+struct a_res_str *
+do_walign (const unsigned char *aa0, int n0,
+           const unsigned char *aa1, int n1,
+           int frame, int repeat_thresh,
+           struct pstruct *ppst,
+           struct f_struct *f_str,
+           int *have_ares)
+{
+  struct a_res_str *a_res;
+  int hoff, n10;
+  struct rstruct rst;
+  int i;
+  unsigned char *aa0t;
+  const unsigned char *aa1p;
+
+  /* the a_res for this function must always be re-calculated */
+
+  *have_ares = 0x02;	/* set 0x2 bit to indicate local copy */
+
+  if ((a_res = (struct a_res_str *)calloc(1, sizeof(struct a_res_str)))==NULL) {
+    /* message now ends with a newline, like the other diagnostics */
+    fprintf(stderr," [do_walign] Cannot allocate a_res\n");
+    return NULL;
+  }
+
+#ifdef TFAST
+  f_str->n10 = n10 = aatran(aa1,f_str->aa1x,n1,frame);
+  aa1p = f_str->aa1x;
+#else
+  n10 = n1;
+  aa1p = aa1;
+#endif
+
+  do_fasts(aa0, n0, aa1p, n10, ppst, f_str, &rst, &hoff, 1, f_str->maxsav_w);
+  a_res->sw_score = rst.score[0];
+  memcpy(&a_res->rst, &rst, sizeof(rst));
+
+  /* the alignment portion takes advantage of the information left
+     over in f_str after do_fasts is done.  in particular, it is
+     easy to run a modified sconn() to produce the alignments.
+
+     unfortunately, the alignment display routine wants to have
+     things encoded as with bd_align and sw_align, so we need to do that.
+  */
+
+  /* unnecessary; do_fasts just did this */
+  /*  kssort(f_str->vptr,f_str->nsave); */
+
+  /* at some point, we want one best score for each of the segments */
+
+  a_res->min0 = a_res->min1 = a_res->max0 = a_res->max1 = 0;
+  if (f_str->nsave <=0) {
+    a_res->res = NULL;
+    return a_res;
+  }
+
+  /* drop trailing runs whose score is not positive */
+  for ( ; f_str->nsave > 0; f_str->nsave--)
+    if (f_str->vptr[f_str->nsave-1]->score >0) break;
+
+  if ((aa0t = (unsigned char *)calloc(n0+1,sizeof(unsigned char)))==NULL) {
+    fprintf(stderr," cannot allocate aa0t %d\n",n0+1);
+    exit(1);
+  }
+
+  /* copy aa0[] into f_str->aa0t[] and into the scratch copy that
+     sconn_a() is allowed to overwrite */
+  for (i=0; i<n0; i++) f_str->aa0t[i] = aa0t[i] = aa0[i];
+  f_str->aa0t[i] = aa0t[i] = '\0';
+
+  a_res->nres = sconn_a (aa0t,n0,aa1p,n10,f_str, a_res, ppst);
+
+  free(aa0t);
+
+  a_res->res = f_str->res;
+  return a_res;
+}
+
+/* this version of sconn is modified to provide alignment information */
+/* in addition, it needs to know whether a segment has been used before */
+
+/* sconn_a fills in the res[nres] array, but this is passed implicitly
+ through f_str->res[f_str->nres]
+
+ The runs in f_str->vptr[0..f_str->nsave-1] are sorted with krsort()
+ (decreasing stop) and chained as in sconn(), always using
+ calc_tatusov() probabilities, except that here a run is joined to a
+ chain lying to its right in the library.  The best chain is then
+ walked through ->prev; for each run the query residues and their
+ segment indices are copied into aa0[] and f_str->aa0ti[], one 0 is
+ written to res[] per aligned position, and a positive gap length is
+ written between consecutive runs.  a_res->min0/min1/max0/max1 are
+ set from the chain.  aa0 is overwritten with the re-ordered query and
+ f_str->aa0t is then reloaded from it.  Each run's ->dp on the chain
+ is also adjusted in place.
+
+ Returns the number of aligned positions plus gap lengths (tres), or 0
+ when there is nothing to align.
+*/
+
+int sconn_a (unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1,
+ struct f_struct *f_str,
+ struct a_res_str *a_res,
+ struct pstruct *ppst)
+{
+ int i, si, cmpp (), n;
+ unsigned char *aa0p;
+ int sx, dx, doff, *aa0tip;
+
+ struct savestr **v;
+ struct slink *start, *sl, *sj, *so, *sarr;
+ int lstart, lstop, ltmp, plstart, tstart, plstop, ptstop, ptstart, tstop;
+
+ int *res, nres, tres;
+
+ double tatprob;
+
+/* sort the score left to right in lib pos */
+
+ v = f_str->vptr;
+ n = f_str->nsave;
+ sarr = f_str->sarr;
+
+ /* set things up in case nothing fits */
+ if (n <=0 || v[0]->score <= 0) return 0;
+
+ if (v[0]->score < 0) { /* NOTE(review): unreachable - score <= 0 already returned above; si would also be unset on this path */
+ sarr[0].vp = v[0];
+ sarr[0].score = v[0]->score;
+ sarr[0].next = NULL;
+ sarr[0].prev = NULL;
+ start = &sarr[0];
+ }
+ else {
+
+ krsort (v, n); /* krsort() orders by decreasing stop, i.e. right to left in library */
+
+ start = NULL;
+
+ /* for each alignment, see if it fits */
+
+
+ for (i = 0, si = 0; i < n; i++) {
+ /* if the score is less than the join threshold, skip it */
+
+ if (v[i]->score < 0) continue;
+
+ lstart = v[i]->start;
+ lstop = v[i]->stop;
+ tstart = lstart - v[i]->dp + f_str->noff;
+ tstop = lstop - v[i]->dp + f_str->noff;
+
+ /* put the alignment in the group */
+
+ sarr[si].vp = v[i];
+ sarr[si].score = v[i]->score;
+ sarr[si].next = NULL;
+ sarr[si].prev = NULL;
+ sarr[si].tat = NULL;
+
+ sarr[si].tatprob =
+ calc_tatusov(NULL, &sarr[si], aa0, n0, aa1, n1,
+ ppst->pam2[0], ppst->nsq, f_str,
+ ppst->pseudocts, 1, ppst->zsflag);
+ sarr[si].tat = sarr[si].newtat;
+
+
+ /* if it fits, then increase the score */
+ /* start points to a sorted (by total score) list of candidate
+ overlaps */
+
+ for (sl = start; sl != NULL; sl = sl->next) {
+ plstart = sl->vp->start;
+ plstop = sl->vp->stop;
+ ptstart = plstart - sl->vp->dp + f_str->noff;
+ ptstop = plstop - sl->vp->dp + f_str->noff;
+#ifndef FASTM
+ if (plstart > lstop && (ptstop < tstart || ptstart > tstop)) {
+#else
+ if (plstop > lstart && ptstart > tstop) {
+#endif
+ /* alignment always uses probabilistic scoring ... */
+ /* sarr[si].score = sl->score + v[i]->score;
+ sarr[si].prev = sl;
+ break; */ /* quit as soon as the alignment has been added */
+
+ tatprob = calc_tatusov(sl, &sarr[si], aa0, n0, aa1, n1,
+ ppst->pam2[0], ppst->nsq, f_str,
+ ppst->pseudocts, 1, ppst->zsflag);
+ /* if our tatprob gets worse when we add this, forget it */
+ if(tatprob > sarr[si].tatprob) {
+ free(sarr[si].newtat->probs); /* get rid of new tat struct */
+ free(sarr[si].newtat);
+ continue; /* reuse this sarr[si] */
+ } else {
+ sarr[si].tatprob = tatprob;
+ free(sarr[si].tat->probs); /* get rid of old tat struct */
+ free(sarr[si].tat);
+ sarr[si].tat = sarr[si].newtat;
+ sarr[si].prev = sl;
+ sarr[si].score = sl->score + v[i]->score;
+ /*
+ fprintf(stderr,"sconn %d added %d/%d getting %d; si: %d, tat: %g\n",
+ i,v[i]->start, v[i]->score,sarr[si].score,si, tatprob);
+ */
+ break;
+ }
+ }
+ }
+
+ /* now recalculate the list of best scores */
+ if (start == NULL)
+ start = &sarr[si]; /* put the first one in the list */
+ else
+ for (sj = start, so = NULL; sj != NULL; sj = sj->next) {
+ /* if (sarr[si].score > sj->score) { */ /* new score better than old */
+ if ( sarr[si].tatprob < sj->tatprob ||
+ ((sarr[si].tatprob == sj->tatprob) && sarr[si].score > sj->score) ) {
+ sarr[si].next = sj; /* next best after new score */
+ if (so != NULL)
+ so->next = &sarr[si]; /* prev_best->next points to best */
+ else start = &sarr[si]; /* start points to best */
+ break; /* stop looking */
+ }
+ so = sj; /* previous candidate best */
+ }
+ si++; /* increment to next alignment */
+ }
+ }
+
+ for (i = 0 ; i < si ; i++) {
+ free(sarr[i].tat->probs);
+ free(sarr[i].tat);
+ }
+
+ res = f_str->res;
+ tres = nres = 0;
+ aa0p = aa0;
+ aa0tip = f_str->aa0ti; /* point to temporary index */
+ a_res->min1 = start->vp->start;
+ a_res->min0 = 0;
+
+ sx=start->vp->start-start->vp->dp+f_str->noff;
+ f_str->aa0t_off = sx - f_str->aa0b[sx];
+
+ for (sj = start; sj != NULL; sj = sj->prev ) {
+ doff = (int)(aa0p-aa0) - (sj->vp->start-sj->vp->dp+f_str->noff); /* new query position minus original query position of this run's start */
+
+ /* fprintf(stderr,"doff: %3d\n",doff); */
+
+ for (dx=sj->vp->start,sx=sj->vp->start-sj->vp->dp+f_str->noff;
+ dx <= sj->vp->stop; dx++) {
+ *aa0tip++ = f_str->aa0i[sx]; /* save index */
+ *aa0p++ = f_str->aa0t[sx++]; /* save sequence at index */
+ tres++;
+ res[nres++] = 0;
+ }
+ sj->vp->dp -= doff; /* shift the run's diagonal to match its new place in aa0[] */
+ if (sj->prev != NULL) {
+ if (sj->prev->vp->start - sj->vp->stop - 1 > 0 )
+ tres += res[nres++] = (sj->prev->vp->start - sj->vp->stop - 1);
+ }
+
+ /*
+ fprintf(stderr,"t0: %3d, tx: %3d, l0: %3d, lx: %3d, dp: %3d noff: %3d, score: %3d\n",
+ sj->vp->start - sj->vp->dp + f_str->noff,
+ sj->vp->stop - sj->vp->dp + f_str->noff,
+ sj->vp->start,sj->vp->stop,sj->vp->dp,
+ f_str->noff,sj->vp->score);
+
+ fprintf(stderr,"%3d - %3d: %3d\n",
+ sj->vp->start,sj->vp->stop,sj->vp->score);
+ */
+ a_res->max1 = sj->vp->stop+1;
+ a_res->max0 = a_res->max1 - sj->vp->dp + f_str->noff;
+ }
+ *aa0p = '\0'; /* be sure to terminate the string */
+
+ /*
+ fprintf(stderr,"(%3d - %3d):(%3d - %3d)\n",
+ a_res->min0,a_res->max0,a_res->min1,a_res->max1);
+ */
+
+ /* now replace f_str->aa0t with aa0
+ (f_str->aa0t is permanent, aa0 is not)*/
+ for (i=0; i<n0; i++) f_str->aa0t[i] = aa0[i];
+
+ return tres;
+}
+
+/* pre_cons() - prepare f_str before the consensus/alignment display
+   code runs.  For fasts (and fastf) the only work needed is the
+   translation: with TFAST defined, aa1 is translated in the given
+   frame into f_str->aa1x and the translated length is stored in
+   f_str->n10.  Without TFAST this function does nothing. */
+
+void
+pre_cons(const unsigned char *aa1, int n1, int frame, struct f_struct *f_str) {
+
+#ifdef TFAST
+  int n_translated = aatran(aa1,f_str->aa1x,n1,frame);
+
+  f_str->n10 = n_translated;
+#endif
+
+}
+
+/* aln_func_vals - set up aln.qlfact, qlrev, llfact, llmult, frame, llrev */
+/* call from calcons, calc_id, calc_code */
+/* The query factors are the same in both builds (qlfact 1, qlrev 0,
+   frame 0).  With TFAST the library factors are 3 and llrev is set
+   for frames above 3; otherwise the library factors are 1 and llrev
+   is 0. */
+void
+aln_func_vals(int frame, struct a_struct *aln) {
+
+  /* query side: identical for TFAST and FASTS */
+  aln->qlrev = 0;
+  aln->qlfact = 1;
+  aln->frame = 0;
+
+  /* library side */
+#ifdef TFAST
+  aln->llmult = 3;
+  aln->llfact = 3;
+  aln->llrev = (frame > 3) ? 1 : 0;
+#else	/* FASTS */
+  aln->llmult = 1;
+  aln->llfact = 1;
+  aln->llrev = 0;
+#endif
+}
+
+/* aaptrshuffle() - despite the name, this reverses res[0..n-1] in
+   place (the random shuffle is disabled in the original source).
+   Nothing is changed when n is less than 2. */
+void aaptrshuffle(unsigned char *res, int n) {
+
+  int front = 0;
+  int back = n - 1;
+  unsigned char held;
+
+  /* exchange the ends and walk inward until the indices meet */
+  while (front < back) {
+    held = res[back];
+    res[back] = res[front];
+    res[front] = held;
+    front++;
+    back--;
+  }
+}
+
+/* aa0shuffle() - apply aaptrshuffle() to each of the f_str->nm0 query
+   fragments in turn.  Fragment k begins at aa0[f_str->nmoff[k]]; the
+   length handed down is nmoff[k+1] - nmoff[k] - 1, so the last
+   position before the next fragment's start is left where it is.
+   n0 is not used. */
+void aa0shuffle(unsigned char *aa0, int n0, struct f_struct *f_str) {
+
+  int frag = 0;
+
+  while (frag < f_str->nm0) {	/* for each fragment */
+    unsigned char *seg = &(aa0[f_str->nmoff[frag]]);
+
+    aaptrshuffle(seg, f_str->nmoff[frag+1] - f_str->nmoff[frag] - 1);
+    frag++;
+  }
+
+}
diff --git a/src/dropfx.c b/src/dropfx.c
new file mode 100644
index 0000000..fcd9799
--- /dev/null
+++ b/src/dropfx.c
@@ -0,0 +1,4072 @@
+/* $Id: dropfx.c 1280 2014-08-21 00:47:55Z wrp $ */
+
+/* copyright (c) 1998, 1999, 2014 by William R. Pearson and The Rector &
+ Visitors of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+/* implements the fastx algorithm, see:
+
+ W. R. Pearson, T. Wood, Z. Zhang, A W. Miller (1997) "Comparison of
+ DNA sequences with protein sequences" Genomics 46:24-36
+
+ see dropnfa.c for better variable descriptions and comments
+*/
+
+/* 17-Sept-2008 - modified for multiple non-overlapping alignments */
+
+/* 18-Sept-2006 - remove global variables used for alignment */
+
+/* 22-June-2006 - correct incorrect alignment coordinates generated
+ after pro_dna() on projected DNA region.
+*/
+
+/* 9-May-2003 -> 3.46 changed lx_band to use projected protein
+ boundary end. this fixes some addressing issues on MacOSX, and
+ speeds up alignment on very long proteins
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <math.h>
+
+#include "defs.h"
+#include "param.h"
+#define XTERNAL
+#include "upam.h"
+
+/* this must be consistent with upam.h */
+#define MAXHASH 32
+#define NMAP MAXHASH+1
+
+/* globals for fasta */
+#define MAXWINDOW 64
+
+#ifndef MAXSAV
+#define MAXSAV 10
+#endif
+
+#ifndef ALLOCN0
+static char *verstr="3.8 June 2014";
+#else
+static char *verstr="3.8an0 June 2014";
+#endif
+
+struct dstruct /* diagonal structure for saving current run */
+{
+ int score; /* hash score of current match */
+ int start; /* start of current match (library position; set to stop - (ktup-1) when a run begins) */
+ int stop; /* end of current match (library position); do_fastx() resets it to -1 and treats stop < 0 as "no run on this diagonal yet" */
+ struct savestr *dmax; /* location in vmax[] where best score data saved */
+};
+
+struct savestr /* one saved run; do_fastx() keeps MAXSAV of these in vmax[] */
+{
+ int score; /* pam score with segment optimization */
+ int score0; /* pam score of best single segment */
+ int gscore; /* score from global match */
+ int dp; /* diagonal of match; spam() and sconn() turn a lib position p into a query position with p - dp + f_str->noff */
+ int start; /* start of match in lib seq */
+ int stop; /* end of match in lib seq */
+};
+
+struct swstr { int H, E;};
+/* struct bdstr { int CC, DD, CP, DP;}; */
+
+#define SGW1 100
+#define SGW2 300
+struct smgl_str { /* fixed-size integer work arrays, embedded in struct f_struct as smgl_s */
+ int C[SGW1+1][SGW2+1]; /* (SGW1+1) x (SGW2+1) table */
+ int st[SGW1+1][SGW2+1]; /* same dimensions as C */
+ int D[SGW2+7], I[SGW2+1]; /* NOTE(review): D is 6 entries longer than I; the code that uses these arrays is not in this part of the file - confirm why */
+};
+
+struct update_code_str { /* state returned by init_update_data() and passed to sprintf_code(), update_code() and close_update_data() */
+ int p_op_idx; /* presumably the previous operation index - TODO confirm where update_code() is defined */
+ int p_op_cnt; /* presumably the run length of that operation - TODO confirm */
+ int show_code; /* init_update_data() takes a show_code argument; presumably stored here */
+ int cigar_order;
+ int show_ext;
+ char *op_map; /* presumably points at ori_code or cigar_code - TODO confirm */
+};
+
+#ifndef TFAST
+static char *ori_code = "-x/=\\+*"; /* FASTX */
+static char *cigar_code = "DXFMRI*";
+#else
+static char *ori_code = "+x/=\\-*"; /* TFASTX */
+static char *cigar_code = "IXFMRD*";
+#endif
+
+static struct update_code_str *
+init_update_data(int show_code);
+
+static void
+sprintf_code(char *tmp_str, struct update_code_str *, int op_idx, int op_cnt);
+
+static void
+update_code(char *al_str, int al_str_max,
+ struct update_code_str *update_data, int op,
+ int sim_code, unsigned char sp0, unsigned char sp1);
+
+static void
+close_update_data(char *al_str, int al_str_max,
+ struct update_code_str *update_data);
+
+void kpsort (struct savestr **v, int n);
+extern void *init_stack(int, int);
+extern void push_stack(void *, void *);
+extern void *pop_stack(void *);
+extern void *free_stack(void *);
+
+struct sx_s {int C1, C2, C3, I1, I2, I3, flag; }; /* one cell of the f_str->cur band row used by lx_band(); init_row() zeroes every field */
+
+struct f_struct { /* per-query work area: allocated by init_work(), released by close_work() */
+ struct dstruct *diag; /* per-diagonal run state; MAXDIAG entries, or n0 entries when ALLOCN0 is defined */
+ int ndo; /* first diag[] entry do_fastx() resets on entry; 0 after init_work() */
+ int noff; /* set to n0 - 1 by do_fastx(); added when converting a diagonal to a query position */
+ int hmask; /* hash constants */
+ int *pamh1; /* pam based array */
+ int *pamh2; /* pam based kfact array */
+ int *link, *harr; /* hash arrays */
+ int kshft; /* shift width */
+ int nsav; /* number of saved runs, worst saved run */
+#ifndef TFAST
+ unsigned char *aa0x; /* contains translated codons 111222333*/
+ unsigned char *aa0y; /* contains translated codons 123123123*/
+#else
+ unsigned char *aa1x; /* contains translated codons 111222333 */
+ unsigned char *aa1y; /* contains translated codons 123123123 */
+ int have_yaa; /* flag if translation is done */
+#endif
+ struct sx_s *cur; /* band row for lx_band(); allocated there on first use and reallocated when the size changes */
+ int cur_sp_size; /* number of sx_s cells currently allocated in cur */
+ int *waa0; /* table built in init_work(): pam2[ip][i][aa0[j]] stored at index i*n0 + j */
+ int *waa1; /* same layout as waa0, built from pam2[0] */
+ struct smgl_str smgl_s;
+ int *res; /* alignment results array, max_res ints */
+ int max_res; /* size of res: max(2*n0,MIN_RES), or max(4*n0,MIN_RES) with TFAST */
+};
+
+#define DROP_INTERN
+#include "drop_func.h"
+
+int shscore(unsigned char *aa0, int n0, int **pam2);
+int saatran(const unsigned char *ntseq, unsigned char *aaseq, int maxs, int frame);
+extern int ELK_to_s(double E_join, int n0, int n1, double Lambda, double K, double H);
+
+int savemax (struct dstruct *dptr, int dpos,
+ struct savestr *vmax, struct savestr **lowmax);
+
+int spam (const unsigned char *aa0, const unsigned char *aa1,
+ struct savestr *dmax, int **pam2,
+ struct f_struct *f_str);
+int sconn (struct savestr **v, int n,int cgap, int pgap, struct f_struct *f_str);
+int lx_band(const unsigned char *prot_seq, int len_prot,
+ const unsigned char *dna_prot_seq, int len_dna_prot,
+ int **pam_matrix, int gopen, int gext,
+ int gshift, int start_diag, int width, struct f_struct *f_str);
+
+void fx_walign (const unsigned char *aa0, int n0,
+ const unsigned char *xaa, int n1, unsigned char *yaa,
+ int frame, int max_res,
+ struct pstruct *ppst,
+ struct f_struct *f_str,
+ struct a_res_str *a_res,
+ int score_thresh);
+
+struct a_res_str *
+merge_ares_chains(struct a_res_str *cur_ares,
+ struct a_res_str *tmpl_ares,
+ int score_ix, const char *msg);
+
+extern void w_abort (char *p, char *p1);
+
+/* initialize for fasta
+
+ init_work() allocates the per-query struct f_struct, fills it, and
+ returns it through *f_arg:
+
+ - unless ppst->param_u.fa.use_E_thresholds is set, derives
+ param_u.fa.cgap (and optcut, if not set by the user) from the
+ query length; param_u.fa.pgap and the final cgap are always set,
+ so *ppst is modified
+ - builds the ktup hash tables (harr, link, pamh1, pamh2) from the
+ query; without TFAST (FASTX) the query is first run through
+ saatran() for frames 0..2 into aa0x, and an interleaved copy is
+ stored in aa0y
+ - allocates the diagonal array, the waa0/waa1 score tables and the
+ alignment results array
+
+ Any allocation failure prints a message to stderr and calls exit(1).
+*/
+
+void
+init_work (unsigned char *aa0, int n0,
+ struct pstruct *ppst,
+ struct f_struct **f_arg)
+{
+ int mhv, phv;
+ int hmax;
+ int i0, hv;
+ int pamfact;
+ int btemp;
+ struct f_struct *f_str;
+ int ktup; /* word size examined */
+ int fact; /* factor used to scale ktup match value */
+ int kt1; /* ktup-1 */
+ int lkt; /* last ktup - initially kt1, but can be increased
+ for hsq >= NMAP */
+
+ int maxn0;
+ int *pwaa;
+ int i, j;
+ int *waa;
+ int *res;
+ int nsq, ip, *hsq;
+#ifndef TFAST
+ int last_n0, itemp;
+ unsigned char *fd, *fs, *aa0x, *aa0y;
+ int n0x;
+#endif
+
+ /* NOTE(review): both branches take nsq from ppst->nsqx; only ip and
+ hsq differ - confirm that this is intended */
+ if (ppst->ext_sq_set) {
+ nsq = ppst->nsqx; ip = 1;
+ hsq = ppst->hsqx;
+ }
+ else {
+ nsq = ppst->nsqx; ip = 0;
+ hsq = ppst->hsq;
+ }
+
+ /* f_str is dereferenced immediately below, so fail here like every
+ other allocation in this function rather than crash on NULL */
+ if ((f_str = (struct f_struct *)calloc(1,sizeof(struct f_struct))) == NULL) {
+ fprintf (stderr, " cannot allocate f_struct\n");
+ exit (1);
+ }
+
+ if (!ppst->param_u.fa.use_E_thresholds) {
+ btemp = 2 * ppst->param_u.fa.bestoff / 3 +
+ n0 / ppst->param_u.fa.bestscale +
+ ppst->param_u.fa.bkfact *
+ (ppst->param_u.fa.bktup - ppst->param_u.fa.ktup);
+ btemp = min (btemp, ppst->param_u.fa.bestmax);
+ if (btemp > 3 * n0) btemp = 3 * shscore(aa0,n0,ppst->pam2[0]) / 5;
+
+ ppst->param_u.fa.cgap = btemp + ppst->param_u.fa.bestoff / 3;
+ if (ppst->param_u.fa.optcut_set != 1) {
+#ifndef TFAST
+ ppst->param_u.fa.optcut = (btemp*5)/4;
+#else
+ ppst->param_u.fa.optcut = (btemp*4)/3;
+#endif
+ }
+ }
+
+#ifdef OLD_FASTA_GAP
+ ppst->param_u.fa.pgap = ppst->gdelval + ppst->ggapval;
+#else
+ ppst->param_u.fa.pgap = ppst->gdelval + 2*ppst->ggapval;
+#endif
+
+ ppst->param_u.fa.cgap = max(ppst->param_u.fa.cgap, -ppst->param_u.fa.pgap);
+
+ pamfact = ppst->param_u.fa.pamfact;
+ ktup = ppst->param_u.fa.ktup;
+ fact = ppst->param_u.fa.scfact * ktup;
+
+ /* map the special pamfact values -1 / -2 to off / on */
+ if (pamfact == -1)
+ pamfact = 0;
+ else if (pamfact == -2)
+ pamfact = 1;
+
+ /* largest hash code below NMAP decides the number of bits per residue */
+ for (i0 = 1, mhv = -1; i0 <=ppst->nsq; i0++)
+ if (hsq[i0] < NMAP && hsq[i0] > mhv) mhv = hsq[i0];
+
+ if (mhv <= 0) {
+ fprintf (stderr, " maximum hsq <=0 %d\n", mhv);
+ exit (1);
+ }
+
+ for (f_str->kshft = 0; mhv > 0; mhv /= 2)
+ f_str->kshft++;
+
+/* kshft = 2; */
+ kt1 = ktup - 1;
+ hv = 1;
+ for (i0 = 0; i0 < ktup; i0++) {
+ hv = hv << f_str->kshft;
+ }
+ hmax = hv; /* 2^(kshft*ktup): size of harr and pamh2 */
+ f_str->hmask = (hmax >> f_str->kshft) - 1;
+
+ if ((f_str->harr = (int *) calloc (hmax, sizeof (int))) == NULL) {
+ fprintf (stderr, " cannot allocate hash array\n");
+ exit (1);
+ }
+ if ((f_str->pamh1 = (int *) calloc (ppst->nsq+1, sizeof (int))) == NULL) {
+ fprintf (stderr, " cannot allocate pamh1 array\n");
+ exit (1);
+ }
+ if ((f_str->pamh2 = (int *) calloc (hmax, sizeof (int))) == NULL) {
+ fprintf (stderr, " cannot allocate pamh2 array\n");
+ exit (1);
+ }
+ if ((f_str->link = (int *) calloc (n0, sizeof (int))) == NULL) {
+ fprintf (stderr, " cannot allocate hash link array\n");
+ exit (1);
+ }
+
+#ifdef TFAST
+ /* both buffers are advanced by one byte after allocation;
+ close_work() steps back before calling free() */
+ if ((f_str->aa1x =(unsigned char *)calloc((size_t)ppst->maxlen+2,
+ sizeof(unsigned char)))
+ == NULL) {
+ fprintf (stderr, "cannot allocate aa1x array %d\n", ppst->maxlen+2);
+ exit (1);
+ }
+ f_str->aa1x++;
+
+ if ((f_str->aa1y =(unsigned char *)calloc((size_t)ppst->maxlen+2,
+ sizeof(unsigned char)))
+ == NULL) {
+ fprintf (stderr, "cannot allocate aa1y array %d\n", ppst->maxlen+2);
+ exit (1);
+ }
+ f_str->aa1y++;
+#else /* FASTX */
+ /* both buffers are advanced by one byte after allocation;
+ close_work() steps back before calling free() */
+ maxn0 = n0 + 2;
+ if ((aa0x =(unsigned char *)calloc((size_t)maxn0,sizeof(unsigned char)))
+ == NULL) {
+ fprintf (stderr, "cannot allocate aa0x array %d\n", maxn0);
+ exit (1);
+ }
+ aa0x++;
+ f_str->aa0x = aa0x;
+
+ if ((aa0y =(unsigned char *)calloc((size_t)maxn0,sizeof(unsigned char)))
+ == NULL) {
+ fprintf (stderr, "cannot allocate aa0y array %d\n", maxn0);
+ exit (1);
+ }
+ aa0y++;
+ f_str->aa0y = aa0y;
+
+ /* frames 0..2 are written back to back into aa0x; last_n0 skips one
+ extra byte after each frame */
+ last_n0 = 0;
+ for (itemp=0; itemp<3; itemp++) {
+ n0x = saatran(aa0,&aa0x[last_n0],n0,itemp);
+
+ /*
+ for (i=0; i<n0x; i++) {
+ fprintf(stderr,"%c",aa[aa0x[last_n0+i]]);
+ if ((i%60)==59) fprintf(stderr,"\n");
+ }
+ fprintf(stderr,"\n");
+ */
+ last_n0 += n0x+1;
+ }
+ /*
+ fprintf(stderr,"\n");
+ */
+ /* interleave the three frames into aa0y (123123...); each frame is
+ read up to its EOSEQ terminator */
+ for (itemp=0, fs=aa0x; itemp <3; itemp++,fs++) {
+ for (fd = &aa0y[itemp]; *fs!=EOSEQ; fd += 3, fs++) *fd = *fs;
+ *fd=EOSEQ;
+ }
+
+ /* now switch aa0 and aa0x for hashing functions */
+ /* this seems dangerous in threaded code, but only the pointer is changed,
+ not the data itself */
+
+ fs = aa0;
+ aa0 = aa0x;
+ aa0x = fs;
+
+#endif
+
+ for (i0 = 0; i0 < hmax; i0++)
+ f_str->harr[i0] = -1;
+ for (i0 = 0; i0 < n0; i0++)
+ f_str->link[i0] = -1;
+
+ /* encode the aa0 array */
+
+ phv = hv = 0;
+ lkt = kt1;
+ /* prime hv/phv with the first ktup-1 residues; a residue whose code
+ is >= NMAP restarts the word */
+ for (i0 = 0; i0 < min(lkt,n0); i0++) {
+ if (hsq[aa0[i0]] >= NMAP) {hv=phv=0; lkt=i0+ktup; continue;}
+ hv = (hv << f_str->kshft) + hsq[aa0[i0]];
+ phv += ppst->pam2[ip][aa0[i0]][aa0[i0]] * ktup;
+ }
+
+ for (; i0 < n0; i0++) {
+ if (hsq[aa0[i0]] >= NMAP) {
+ hv=phv=0;
+ lkt = i0+ktup;
+ /* restart hv, phv calculation */
+ for (; (i0 < lkt || hsq[aa0[i0]]>=NMAP) && i0<n0; i0++) {
+ if (hsq[aa0[i0]] >= NMAP) {hv=phv=0; lkt = i0+ktup; continue;}
+ hv = (hv << f_str->kshft) + hsq[aa0[i0]];
+ phv += ppst->pam2[ip][aa0[i0]][aa0[i0]] * ktup;
+ }
+ }
+ if (i0 >= n0) break;
+ hv = ((hv & f_str->hmask) << f_str->kshft) + hsq[aa0[i0]];
+ /* chain query positions that share a hash value: harr[hv] is the
+ most recent position, link[] leads to the earlier ones */
+ f_str->link[i0] = f_str->harr[hv];
+ f_str->harr[hv] = i0;
+ if (pamfact) {
+ f_str->pamh2[hv] = (phv += ppst->pam2[ip][aa0[i0]][aa0[i0]] * ktup);
+ /* this check should always be true, but just in case */
+ if (hsq[aa0[i0-kt1]]<NMAP)
+ phv -= ppst->pam2[ip][aa0[i0 - kt1]][aa0[i0 - kt1]] * ktup;
+ }
+ else f_str->pamh2[hv] = fact * ktup;
+ }
+
+#ifndef TFAST
+ /* done hashing, now switch aa0, aa0x back */
+ fs = aa0;
+ aa0 = aa0x;
+ aa0x = fs;
+#endif
+
+ if (pamfact)
+ for (i0 = 1; i0 < ppst->nsq; i0++)
+ f_str->pamh1[i0] = ppst->pam2[ip][i0][i0] * ktup;
+ else
+ for (i0 = 1; i0 < ppst->nsq; i0++)
+ f_str->pamh1[i0] = fact;
+
+ f_str->ndo = 0; /* used to save time on diagonals with long queries */
+
+ /* the byte count in the messages below is cast as a whole so that it
+ matches the %ld conversion */
+#ifndef ALLOCN0
+ if ((f_str->diag = (struct dstruct *) calloc ((size_t)MAXDIAG,
+ sizeof (struct dstruct)))==NULL) {
+ fprintf (stderr," cannot allocate diagonal arrays: %ld\n",
+ (long)(MAXDIAG *sizeof (struct dstruct)));
+ exit (1);
+ };
+#else
+ if ((f_str->diag = (struct dstruct *) calloc ((size_t)n0,
+ sizeof (struct dstruct)))==NULL) {
+ fprintf (stderr," cannot allocate diagonal arrays: %ld\n",
+ (long)(n0*sizeof (struct dstruct)));
+ exit (1);
+ };
+#endif
+
+
+ /* waa0: scores from pam2[ip], one row of n0 entries per residue code */
+ if ((waa= (int *)malloc (sizeof(int)*(nsq+1)*n0)) == NULL) {
+ fprintf(stderr,"cannot allocate waa struct %3d\n",nsq*n0);
+ exit(1);
+ }
+
+ pwaa = waa;
+ for (i=0; i<nsq; i++) {
+ for (j=0;j<n0; j++) {
+ *pwaa = ppst->pam2[ip][i][aa0[j]];
+ pwaa++;
+ }
+ }
+ f_str->waa0 = waa;
+
+ /* waa1: the same table built from pam2[0] */
+ if ((waa= (int *)malloc (sizeof(int)*(nsq+1)*n0)) == NULL) {
+ fprintf(stderr,"cannot allocate waa struct %3d\n",nsq*n0);
+ exit(1);
+ }
+
+ pwaa = waa;
+ for (i=0; i<nsq; i++) {
+ for (j=0;j<n0; j++) {
+ *pwaa = ppst->pam2[0][i][aa0[j]];
+ pwaa++;
+ }
+ }
+ f_str->waa1 = waa;
+
+#ifndef TFAST
+ maxn0 = max(2*n0,MIN_RES);
+#else
+ /* maxn0 needs to be large enough to accommodate introns
+ for TFASTX. For all other functions, it will be
+ more reasonable. */
+ maxn0 = max(4*n0,MIN_RES);
+#endif
+ if ((res = (int *)calloc((size_t)maxn0,sizeof(int)))==NULL) {
+ fprintf(stderr,"cannot allocate alignment results array %d\n",maxn0);
+ exit(1);
+ }
+ f_str->res = res;
+ f_str->max_res = maxn0;
+
+ *f_arg = f_str;
+}
+
+
+/* pstring1 is a message to the manager, currently 512 */
+/* pstring2 is the same information, but in a markx==10 format */
+void
+get_param (const struct pstruct *ppst,
+           char **pstring1, char *pstring2,
+           struct score_count_s *s_info)
+{
+  /* Report formats; only the gap wording depends on OLD_FASTA_GAP. */
+#ifdef OLD_FASTA_GAP
+  const char *summary_fmt =
+    "%s matrix (%d:%d)%s, gap-pen: %d/%d, shift: %d\n ktup: %d, %s, width: %3d";
+  const char *markx10_fmt =
+    "; pg_name_alg: %s\n; pg_ver_rel: %s\n; pg_matrix: %s (%d:%d)%s\n"
+    "; pg_gap-pen: %d %d\n; pg_ktup: %d\n; %s\n";
+#else
+  const char *summary_fmt =
+    "%s matrix (%d:%d)%s, open/ext: %d/%d, shift: %d\n ktup: %d, %s, width: %3d";
+  const char *markx10_fmt =
+    "; pg_name_alg: %s\n; pg_ver_rel: %s\n; pg_matrix: %s (%d:%d)%s\n"
+    "; pg_open_ext: %d %d\n; pg_ktup: %d\n; %s\n";
+#endif
+#ifndef TFAST
+  const char *prog_name = "FASTX";
+#else
+  const char *prog_name = "TFASTX";
+#endif
+  const char *ext_tag = (ppst->ext_sq_set) ? "xS" : "\0";
+  char thresh_txt[128];         /* thresholds for the pstring1[1] summary */
+  char thresh_m10[128];         /* the same thresholds for pstring2 */
+  double join_frac, opt_frac;   /* s_cnt[0] and s_cnt[2] over tot_scores */
+
+  join_frac = (double)s_info->s_cnt[0]/(double)s_info->tot_scores;
+  opt_frac = (double)s_info->s_cnt[2]/(double)s_info->tot_scores;
+
+  if (ppst->param_u.fa.use_E_thresholds) {
+    sprintf(thresh_txt,"E-join: %.2g (%.3g), E-opt: %.2g (%.3g)",
+            ppst->param_u.fa.E_join, join_frac,
+            ppst->param_u.fa.E_band_opt, opt_frac);
+    sprintf(thresh_m10,"pg_join_E(): %.2g (%.3g)\n; pg_optcut_E(): %.2g (%.3g)",
+            ppst->param_u.fa.E_join, join_frac,
+            ppst->param_u.fa.E_band_opt, opt_frac);
+  }
+  else {
+    sprintf(thresh_txt,"join: %d (%.3g), opt: %d (%.3g)",
+            ppst->param_u.fa.cgap, join_frac,
+            ppst->param_u.fa.optcut, opt_frac);
+    sprintf(thresh_m10,"pg_join: %d (%.3g)\n; pg_optcut: %d (%.3g)",
+            ppst->param_u.fa.cgap, join_frac,
+            ppst->param_u.fa.optcut, opt_frac);
+  }
+
+  /* pstring1[0]: program name and version */
+  if (ppst->param_u.fa.optflag) {
+    sprintf (pstring1[0], "%s (%s) [optimized]",prog_name,verstr);
+  }
+  else {
+    sprintf (pstring1[0], "%s (%s)",prog_name,verstr);
+  }
+
+  /* pstring1[1]: matrix, gap penalties, ktup and thresholds */
+  sprintf (pstring1[1], summary_fmt,
+           ppst->pam_name, ppst->pam_h, ppst->pam_l, ext_tag,
+           ppst->gdelval, ppst->ggapval, ppst->gshift,
+           ppst->param_u.fa.ktup, thresh_txt, ppst->param_u.fa.optwid);
+
+  if (ppst->param_u.fa.iniflag) strcat(pstring1[0]," init1");
+
+  /* pstring2 is optional */
+  if (pstring2 != NULL) {
+    sprintf (pstring2, markx10_fmt,
+             prog_name, verstr, ppst->pam_name, ppst->pam_h, ppst->pam_l,
+             ext_tag, ppst->gdelval, ppst->ggapval,
+             ppst->param_u.fa.ktup, thresh_m10);
+  }
+}
+
+void
+close_work (const unsigned char *aa0, int n0,
+            struct pstruct *ppst,
+            struct f_struct **f_arg)
+{
+  /* Release everything init_work() (and lx_band(), for ->cur) allocated
+     and clear the caller's pointer.  aa0, n0 and ppst are not used. */
+  struct f_struct *work = *f_arg;
+
+  if (work == NULL) return;     /* nothing to release */
+
+  if (work->cur != NULL) free(work->cur);
+
+  /* the translation buffers were advanced by one byte after they were
+     allocated, so hand free() the address one byte earlier */
+#ifndef TFAST
+  free(work->aa0y - 1);
+  free(work->aa0x - 1);
+#else
+  free(work->aa1y - 1);
+  free(work->aa1x - 1);
+#endif
+
+  free(work->res);
+  free(work->waa1);
+  free(work->waa0);
+  free(work->diag);
+  free(work->link);
+  free(work->pamh2);
+  free(work->pamh1);
+  free(work->harr);
+  free(work);
+
+  *f_arg = NULL;
+}
+
+/* do_fastx() always compares a (possibly translated) protein query
+ sequence to another protein sequence.
+
+ #ifndef TFAST (e.g. FASTX),
+ then the hash table was built from the translated (amino-acid)
+ version of the query.
+
+ #ifdef TFAST, then aa0 is already a protein sequence
+
+ Args:
+ aa0, n0 query sequence
+ aa1, n1 library sequence
+ yaa translated DNA sequence (from either aa0 or aa1)
+ *ppst -> param struct
+ *f_str -> function structure set in init_work()
+ *rst -> scores (results struct)
+ *hoff -> offset of query in library sequence
+ shuff_flg only referenced in commented-out code in this function
+ *s_info -> counters: tot_scores, s_cnt[0] (score[0] >= c_gap),
+ s_cnt[2] (band optimization was run)
+
+ Steps:
+ 1. choose c_gap/opt_cut (from E-value thresholds via ELK_to_s(), or
+ from ppst->param_u.fa.cgap/optcut)
+ 2. reset the diagonal array and the vmax[] save area
+ 3. scan aa1 with the ktup hash, extending runs on each diagonal
+ and saving the best ones with savemax()
+ 4. rescore each saved run with spam()
+ 5. adjust dp (FASTX) or start/stop/dp (TFASTX) for the three
+ concatenated translation phases
+ 6. join runs with sconn(); score[1] = best single run,
+ score[0] = score[2] = max(joined score, best single run)
+ 7. if optflag is set and score[0] > opt_cut, replace score[2] with
+ the lx_band() score
+
+ Early returns: all three scores are 0 when n1 < ktup or no run was
+ saved, and -1 when n0+n1+1 >= MAXDIAG.
+ */
+void do_fastx (const unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1,
+ unsigned char *yaa, /* translated 123123... */
+ const struct pstruct *ppst, struct f_struct *f_str,
+ struct rstruct *rst, int *hoff, int shuff_flg,
+ struct score_count_s *s_info)
+{
+ int nd; /* diagonal array size */
+ int lhval;
+ int kfact;
+ int i;
+ int my_hoff;
+ int c_gap, opt_cut;
+ const unsigned char *aa_prot, *aa_trans_prot;
+ int n_aap, n_taap;
+ register struct dstruct *dptr;
+ struct savestr vmax[MAXSAV]; /* best matches saved for one sequence */
+ struct savestr *vptr[MAXSAV];
+ struct savestr *lowmax;
+ int lowscor;
+ register int tscor;
+
+#ifndef ALLOCN0
+ register struct dstruct *diagp;
+#else
+ register int dpos;
+ int lposn0;
+#endif
+ struct dstruct *dpmax;
+ register int lpos;
+ int tpos;
+ struct savestr *vmptr;
+ int scor, tmp;
+ int im, ib, nsave;
+ int ktup, kt1, ip, lkt, ktup_sq;
+ const int *hsq;
+ int n0_eff;
+#ifndef TFAST
+ int n0x31, n0x32;
+ n0x31 = (n0-2)/3; /* thresholds used below to correct dp for runs in the 2nd/3rd phase of the query */
+ n0x32 = n0x31+1+(n0-n0x31-1)/2;
+#else
+ const unsigned char *fs;
+ unsigned char *fd;
+ int n1x31, n1x32, itemp;
+ n1x31 = (n1-2)/3; /* same thresholds, for the translated library sequence */
+ n1x32 = n1x31+1+(n1-n1x31-1)/2;
+#endif
+
+ if (ppst->ext_sq_set) {
+ ip = 1;
+ hsq = ppst->hsqx;
+ }
+ else {
+ ip = 0;
+ hsq = ppst->hsq;
+ }
+
+ ktup = ppst->param_u.fa.ktup;
+ kt1 = ktup-1;
+ if (ktup <= 3) {
+ ktup_sq = ktup*ktup;
+ }
+ else {
+ ktup_sq = ktup;
+ }
+ if (ktup == 1) ktup_sq *= 2;
+
+ if (n1 < ktup) { /* library sequence shorter than one word */
+ rst->score[0] = rst->score[1] = rst->score[2] = 0;
+ return;
+ }
+
+ if (n0+n1+1 >= MAXDIAG) {
+ fprintf(stderr,"n0,n1 too large: %d, %d\n",n0,n1);
+ rst->score[0] = rst->score[1] = rst->score[2] = -1;
+ return;
+ }
+
+ if (ppst->param_u.fa.use_E_thresholds) {
+ rst->valid_stat = 0; /* set back to 1 below only if lx_band() is run */
+ n0_eff = n0;
+ if (n0 > 120) n0_eff = (n0+2)/3;
+ c_gap = ELK_to_s(ppst->param_u.fa.E_join*ktup_sq, n0_eff, n1, ppst->pLambda, ppst->pK, ppst->pH);
+ opt_cut = ELK_to_s(ppst->param_u.fa.E_band_opt*ktup_sq, n0_eff, n1, ppst->pLambda, ppst->pK, ppst->pH);
+ }
+ else {
+ c_gap = ppst->param_u.fa.cgap;
+ opt_cut = ppst->param_u.fa.optcut;
+ rst->valid_stat = 1;
+ }
+ /* if (shuff_flg) rst->valid_stat = 1; */
+
+ f_str->noff = n0 - 1;
+
+#ifdef ALLOCN0
+ nd = n0;
+#endif
+
+#ifndef ALLOCN0
+ nd = n0 + n1;
+#endif
+
+ dpmax = &f_str->diag[nd];
+ for (dptr = &f_str->diag[f_str->ndo]; dptr < dpmax;) /* entries below ndo were already reset at the end of the previous call */
+ {
+ dptr->stop = -1;
+ dptr->dmax = NULL;
+ dptr++->score = 0;
+ }
+
+ for (vmptr = vmax; vmptr < &vmax[MAXSAV]; vmptr++)
+ vmptr->score = 0;
+ lowmax = vmax;
+ lowscor = 0;
+
+ /* start hashing */
+ lhval = 0;
+ lkt = kt1;
+ for (lpos = 0; (lpos < lkt || hsq[aa1[lpos]]>=NMAP) && lpos<n1; lpos++) { /* NOTE(review): aa1[lpos] is read before lpos<n1 is tested, so aa1[n1] can be read */
+ if (hsq[aa1[lpos]]>=NMAP) {
+ lhval = 0; lkt=lpos+ktup; continue; /* NOTE(review): the ALLOCN0 lines below follow this continue and are never executed */
+#ifdef ALLOCN0 /* reinitialize dptr */
+ dptr = &f_str->diag[lpos % nd];
+ dptr->stop = -1;
+ dptr->dmax = NULL;
+ dptr->score = 0;
+#endif
+ }
+ lhval = ((lhval & f_str->hmask) << f_str->kshft) + hsq[aa1[lpos]];
+ }
+
+#ifndef ALLOCN0
+ diagp = &f_str->diag[f_str->noff + lkt];
+ for (; lpos < n1; lpos++, diagp++) {
+ /* if (hsq[aa1[lpos]]>=NMAP) {lhval = 0; continue;} */
+ if (hsq[aa1[lpos]]>=NMAP) {
+ lpos++ ; diagp++;
+ while (lpos < n1 && hsq[aa1[lpos]]>=NMAP) {lpos++; diagp++;}
+ if (lpos >= n1) break;
+ lhval = 0;
+ }
+ lhval = ((lhval & f_str->hmask) << f_str->kshft) + hsq[aa1[lpos]];
+ for (tpos = f_str->harr[lhval]; tpos >= 0; tpos = f_str->link[tpos]) { /* every query position with this hash value */
+ if ((tscor = (dptr = &diagp[-tpos])->stop) >= 0) {
+#else
+ lposn0 = f_str->noff + lpos;
+ for (; lpos < n1; lpos++, lposn0++) {
+ if (hsq[aa1[lpos]]>=NMAP) {lhval = 0; goto loopl;}
+ lhval = ((lhval & f_str->hmask) << f_str->kshft) + hsq[aa1[lpos]];
+ for (tpos = f_str->harr[lhval]; tpos >= 0; tpos = f_str->link[tpos]) {
+ dpos = lposn0 - tpos;
+ if ((tscor = (dptr = &f_str->diag[dpos % nd])->stop) >= 0) {
+#endif
+ tscor += ktup; /* tscor = previous stop + ktup - lpos after the next line; <= 0 means this word does not overlap the previous run */
+ if ((tscor -= lpos) <= 0) { /* better to start over */
+ scor = dptr->score;
+ if ((tscor += (kfact = f_str->pamh2[lhval])) < 0 && lowscor < scor) {
+#ifdef ALLOCN0
+ lowscor = savemax (dptr, dpos, vmax, &lowmax);
+#else
+ lowscor = savemax (dptr, dptr - f_str->diag, vmax, &lowmax);
+#endif
+ }
+ if ((tscor += scor) >= kfact) { /* extending the old run scores at least as much as a fresh start */
+ dptr->score = tscor;
+ dptr->stop = lpos;
+ }
+ else {
+ dptr->score = kfact;
+ dptr->start = (dptr->stop = lpos) - kt1;
+ }
+ } /* continue current run in diagonal */
+ else {
+ dptr->score += f_str->pamh1[aa0[tpos]];
+ dptr->stop = lpos;
+ }
+ }
+ else { /* first word seen on this diagonal */
+ dptr->score = f_str->pamh2[lhval];
+ dptr->start = (dptr->stop = lpos) - kt1;
+ }
+ } /* end tpos */
+
+#ifdef ALLOCN0
+ /* reinitialize diag structure */
+ loopl:
+ if ((dptr = &f_str->diag[lpos % nd])->score > lowscor) {
+ lowscor = savemax (dptr, lpos, vmax, &lowmax);
+ }
+ dptr->stop = -1;
+ dptr->dmax = NULL;
+ dptr->score = 0;
+#endif
+ } /* end lpos */
+
+#ifdef ALLOCN0
+ for (tpos = 0, dpos = f_str->noff + n1 - 1; tpos < n0; tpos++, dpos--) {
+ if ((dptr = &f_str->diag[dpos % nd])->score > lowscor) {
+ lowscor = savemax (dptr, dpos, vmax, &lowmax);
+ }
+ }
+#else
+ for (dptr = f_str->diag; dptr < dpmax;) { /* save remaining runs and leave diag[0..nd) reset for the next call */
+ if (dptr->score > lowscor) {
+ lowscor = savemax (dptr, dptr - f_str->diag, vmax, &lowmax);
+ }
+ dptr->stop = -1;
+ dptr->dmax = NULL;
+ dptr++->score = 0;
+ }
+ f_str->ndo = nd;
+#endif
+
+/*
+ at this point all of the elements of aa1[lpos]
+ have been searched for elements of aa0[tpos]
+ with the results in diag[dpos]
+*/
+
+ for (nsave = 0, vmptr = vmax; vmptr < &vmax[MAXSAV]; vmptr++)
+ {
+ /*
+ fprintf(stderr,"0: %4d-%4d 1: %4d-%4d dp: %d score: %d\n",
+ f_str->noff+vmptr->start-vmptr->dp,
+ f_str->noff+vmptr->stop-vmptr->dp,
+ vmptr->start,vmptr->stop,
+ vmptr->dp,vmptr->score);
+ */
+ if (vmptr->score > 0) {
+ vmptr->score = spam (aa0, aa1, vmptr, ppst->pam2[ip], f_str); /* rescore with the pam matrix; spam() also rewrites start/stop */
+ vptr[nsave++] = vmptr;
+ }
+ }
+
+ if (nsave <= 0) {
+ rst->score[0] = rst->score[1] = rst->score[2] = 0;
+ return;
+ }
+
+#ifndef TFAST
+ /* FASTX code here to modify the start, stop points for
+ the three phases of the translated protein sequence
+ */
+ /*
+ fprintf(stderr,"n0x: %d; n0x31:%d; n0x32: %d\n",n0,n0x31,n0x32);
+ for (ib=0; ib<nsave; ib++) {
+ fprintf(stderr,"0: %4d-%4d 1: %4d-%4d dp: %d score: %d\n",
+ f_str->noff+vptr[ib]->start-vptr[ib]->dp,
+ f_str->noff+vptr[ib]->stop-vptr[ib]->dp,
+ vptr[ib]->start,vptr[ib]->stop,
+ vptr[ib]->dp,vptr[ib]->score);
+ }
+
+ fprintf(stderr,"---\n");
+ */
+ for (ib=0; ib<nsave; ib++) {
+ if (f_str->noff-vptr[ib]->dp+vptr[ib]->start >= n0x32)
+ vptr[ib]->dp += n0x32;
+ if (f_str->noff-vptr[ib]->dp +vptr[ib]->start >= n0x31) /* evaluated with the dp possibly updated by the test above */
+ vptr[ib]->dp += n0x31;
+ }
+#else
+ /* TFASTX code here to modify the start, stop points for
+ the three phases of the translated protein sequence
+ TFASTX modifies library start points, rather than
+ query start points
+ */
+
+ for (ib=0; ib<nsave; ib++) {
+ if (vptr[ib]->start >= n1x32) {
+ vptr[ib]->start -= n1x32;
+ vptr[ib]->stop -= n1x32;
+ vptr[ib]->dp -= n1x32;
+ }
+ if (vptr[ib]->start >= n1x31) {
+ vptr[ib]->start -= n1x31;
+ vptr[ib]->stop -= n1x31;
+ vptr[ib]->dp -= n1x31;
+ }
+ }
+
+#endif /* TFASTX */
+
+ scor = sconn (vptr, nsave, c_gap,
+ ppst->param_u.fa.pgap, f_str);
+
+ for (vmptr=vptr[0],ib=1; ib<nsave; ib++) /* vmptr = highest-scoring single run */
+ if (vptr[ib]->score > vmptr->score) vmptr=vptr[ib];
+
+/* kssort (vptr, nsave); */
+
+ rst->score[1] = vmptr->score; /* best single score - init1*/
+ rst->score[0] = max (scor, vmptr->score); /* initn */
+ rst->score[2] = rst->score[0]; /* initn */
+
+#ifndef TFAST /* FASTX */
+ *hoff = my_hoff=f_str->noff - vmptr->dp;
+#else
+ *hoff = my_hoff = vmptr->dp-f_str->noff;
+#endif
+
+ /*
+ if (n1 > 5000) {
+ fprintf(stderr," Long n1: %d\n",n1);
+ }
+ */
+
+ s_info->tot_scores++;
+ if (rst->score[0] >= c_gap) {s_info->s_cnt[0]++;}
+ if (ppst->param_u.fa.optflag) {
+#ifdef TFAST
+ if ( /* shuff_flg || */ rst->score[0] > opt_cut) {
+/* generate f_str->aa1y only if it is not there */
+ if ( !f_str->have_yaa ) { /* NOTE(review): have_yaa is not set after yaa is filled here */
+ for (fs=aa1,itemp=0; itemp <3; itemp++,fs++) {
+ for (fd= yaa+itemp; *fs!=EOSEQ; fd += 3, fs++) {*fd = *fs;}
+ *fd=EOSEQ;
+ }
+ }
+ }
+ aa_prot = aa0;
+ n_aap = n0;
+ aa_trans_prot= yaa;
+ n_taap = n1;
+#else
+ aa_prot = aa1;
+ n_aap = n1;
+ aa_trans_prot= yaa;
+ n_taap = n0;
+#endif
+ if ( /* shuff_flg || */ rst->score[0] > opt_cut) {
+ s_info->s_cnt[2]++;
+ rst->valid_stat = 1;
+ rst->score[2] = lx_band(aa_prot,n_aap,aa_trans_prot,n_taap, /* band of optwid diagonals centred on my_hoff; penalties are passed negated */
+ ppst->pam2[ip],
+ -ppst->gdelval,
+ -ppst->ggapval,-ppst->gshift,
+ my_hoff-ppst->param_u.fa.optwid/2,ppst->param_u.fa.optwid,
+ f_str);
+ }
+ }
+}
+
+/* returns rst.score[0] - initn
+ rst.score[1] - init1
+ rst.score[2] - opt
+*/
+
+void do_work (const unsigned char *aa0, int n0,
+              const unsigned char *aa1, int n1,
+              int frame,
+              const struct pstruct *ppst, struct f_struct *f_str,
+              int qr_flg, int shuff_flg, struct rstruct *rst,
+              struct score_count_s *s_info)
+{
+  int hoff;
+#ifdef TFAST
+  /* aa0 has a protein sequence */
+  /* aa1 has a raw DNA sequence */
+  int phase, filled, n10;
+
+  /* run saatran() for the three frames frame*3 .. frame*3+2, writing
+     them back to back into f_str->aa1x with one extra byte skipped
+     after each */
+  filled = 0;
+  for (phase = 0; phase < 3; phase++) {
+    n10 = saatran(aa1, f_str->aa1x + filled, n1, frame*3 + phase);
+    filled += n10 + 1;
+  }
+  n10 = filled - 1;             /* total length handed to do_fastx() */
+  f_str->have_yaa = 0;
+#endif
+
+  /* default results, in case do_fastx() returns early */
+  rst->score[0] = 0;
+  rst->score[1] = 0;
+  rst->score[2] = 0;
+  rst->escore = 1.0;
+  rst->segnum = rst->seglen = 1;
+
+#ifndef TFAST
+  do_fastx (f_str->aa0x, n0, aa1, n1, f_str->aa0y,
+            ppst, f_str, rst, &hoff, shuff_flg, s_info);
+#else /* tfastx */
+  do_fastx (aa0, n0, f_str->aa1x, n10, f_str->aa1y,
+            ppst, f_str, rst, &hoff, shuff_flg, s_info);
+#endif
+
+  rst->comp = rst->H = -1.0;
+}
+
+/* do_opt() - rerun do_fastx() on one library sequence with
+   ppst->param_u.fa.optflag forced to 1, so that rst->score[2] gets the
+   lx_band() score whenever score[0] exceeds the optimization
+   threshold.  The caller's optflag value is restored before returning.
+   The score counters do_fastx() updates go to a local, discarded
+   struct.
+*/
+void do_opt (const unsigned char *aa0, int n0,
+             const unsigned char *aa1, int n1,
+             int frame,
+             struct pstruct *ppst,
+             struct f_struct *f_str,
+             struct rstruct *rst)
+{
+  int optflag, hoff;
+  struct score_count_s s_info;
+#ifdef TFAST
+  int last_n1, itx, n10;
+  unsigned char *xaa;
+#endif
+
+  /* do_fastx() increments s_info.tot_scores and s_info.s_cnt[]; start
+     them at zero instead of incrementing indeterminate stack values */
+  memset(&s_info, 0, sizeof(s_info));
+
+#ifdef TFAST
+  /* aa0 has a protein sequence */
+  /* aa1 has a raw DNA sequence */
+
+  last_n1 = 0;
+  xaa = f_str->aa1x;
+  for (itx = frame*3; itx < frame*3+3; itx++) {
+    n10 = saatran(aa1,&xaa[last_n1],n1,itx);
+    last_n1 += n10+1;
+  }
+  n10 = last_n1-1;
+  f_str->have_yaa = 0;
+#endif
+
+  optflag = ppst->param_u.fa.optflag;
+  ppst->param_u.fa.optflag = 1;
+
+#ifndef TFAST
+  do_fastx (f_str->aa0x, n0, aa1, n1, f_str->aa0y, ppst, f_str, rst, &hoff, 0, &s_info);
+#else /* TFASTX */
+  do_fastx (aa0, n0, xaa, n10, f_str->aa1y, ppst, f_str, rst, &hoff, 0, &s_info);
+#endif
+
+  ppst->param_u.fa.optflag = optflag;
+}
+
+int
+savemax (struct dstruct *dptr, int dpos,
+         struct savestr *vmax, struct savestr **lowmax)
+{
+  /* Record the run in *dptr (on diagonal dpos) in the vmax[] save
+     area and return the lowest score now held there; *lowmax is kept
+     pointing at the entry with that lowest score. */
+  struct savestr *saved = dptr->dmax;
+  struct savestr *slot;
+  int lowest, k;
+
+  if (saved != NULL && saved->dp == dpos && saved->start == dptr->start) {
+    /* continuation of a run that is already saved: extend it */
+    saved->stop = dptr->stop;
+    if (dptr->score <= saved->score) return (*lowmax)->score;
+    saved->score = dptr->score;
+    /* the minimum can only have moved if the updated entry was it */
+    if (saved != *lowmax) return (*lowmax)->score;
+  }
+  else {
+    /* new run: overwrite the current lowest-scoring entry */
+    slot = *lowmax;
+    slot->score = dptr->score;
+    slot->dp = dpos;
+    slot->start = dptr->start;
+    slot->stop = dptr->stop;
+    dptr->dmax = slot;
+  }
+
+  /* find the new lowest-scoring entry, starting from the score just stored */
+  lowest = dptr->score;
+  for (k = 0; k < MAXSAV; k++) {
+    if (vmax[k].score < lowest) {
+      lowest = vmax[k].score;
+      *lowmax = &vmax[k];
+    }
+  }
+  return lowest;
+}
+
+/* spam() - rescan the saved run *dmax with the scoring matrix pam2
+   and find its best-scoring contiguous segment.
+
+   aa1 is indexed by library position (dmax->start .. dmax->stop); the
+   matching aa0 position is lib_pos - dmax->dp + f_str->noff.
+
+   On return dmax->start/dmax->stop delimit the best segment and its
+   score is returned.  If no segment scores above 0, the return value
+   is 0 and dmax->start/dmax->stop are left as they were.
+*/
+int spam (const unsigned char *aa0, const unsigned char *aa1,
+          struct savestr *dmax, int **pam2,
+          struct f_struct *f_str)
+{
+  int lpos;
+  int tot;
+  struct {
+    int start, stop, score;
+  } curv, maxv;
+  const unsigned char *aa0p, *aa1p;
+
+  aa1p = &aa1[lpos = dmax->start];
+  aa0p = &aa0[lpos - dmax->dp + f_str->noff];
+
+  /* Give start/stop defined values up front: when nothing in the run
+     scores above 0 they are never assigned in the loop, and copying
+     them back below would store indeterminate values in *dmax. */
+  curv.start = maxv.start = lpos;
+  curv.stop = maxv.stop = dmax->stop;
+
+  tot = curv.score = maxv.score = 0;
+  for (; lpos <= dmax->stop; lpos++) {
+    tot += pam2[*aa0p++][*aa1p++];
+    if (tot > curv.score) {     /* new best end for the current segment */
+      curv.stop = lpos;
+      curv.score = tot;
+    }
+    else if (tot < 0) {         /* segment ended: keep it if best, restart */
+      if (curv.score > maxv.score) {
+        maxv.start = curv.start;
+        maxv.stop = curv.stop;
+        maxv.score = curv.score;
+      }
+      tot = curv.score = 0;
+      curv.start = lpos+1;
+    }
+  }
+
+  /* the last segment may still be open */
+  if (curv.score > maxv.score) {
+    maxv.start = curv.start;
+    maxv.stop = curv.stop;
+    maxv.score = curv.score;
+  }
+
+/* if (maxv.start != dmax->start || maxv.stop != dmax->stop)
+   printf(" new region: %3d %3d %3d %3d\n",maxv.start,
+   dmax->start,maxv.stop,dmax->stop);
+*/
+  dmax->start = maxv.start;
+  dmax->stop = maxv.stop;
+
+  return maxv.score;
+}
+
+#define XFACT 10
+
+int sconn (struct savestr **v, int n,
+           int cgap, int pgap, struct f_struct *f_str)
+{
+  /* Join compatible runs and return the best joined score (0 if no
+     run reaches cgap).  v[] is sorted by library start as a side
+     effect.  Joined runs are kept in a list ordered by decreasing
+     score. */
+  struct slink {
+    int score;
+    struct savestr *vp;
+    struct slink *next;
+  } *head, *prev, *cur, *node, nodes[MAXSAV];
+  int i, used;
+  int lib_start, qry_start;     /* start of the run being added */
+  int lib_stop, qry_stop;       /* end of a run already in the list */
+
+/* sort the score left to right in lib pos */
+  kpsort (v, n);
+
+  head = NULL;
+  used = 0;
+
+  for (i = 0; i < n; i++) {
+
+/* if the score is less than the gap penalty, it never helps */
+    if (v[i]->score < cgap) continue;
+
+    lib_start = v[i]->start;
+    qry_start = lib_start - v[i]->dp + f_str->noff;
+
+/* put the run in the group */
+    node = &nodes[used++];
+    node->vp = v[i];
+    node->score = v[i]->score;
+    node->next = NULL;
+
+/* if it fits, then increase the score: the first (highest-scoring)
+   list entry that ends before this run starts, allowing XFACT
+   positions of overlap in both sequences, is extended */
+    for (cur = head; cur != NULL; cur = cur->next) {
+      lib_stop = cur->vp->stop;
+      qry_stop = lib_stop - cur->vp->dp + f_str->noff;
+      if (lib_stop < lib_start+XFACT && qry_stop < qry_start+XFACT) {
+        node->score = cur->score + v[i]->score + pgap;
+        break;
+      }
+    }
+
+/* now recalculate where the score fits */
+    if (head == NULL) {
+      head = node;
+      continue;
+    }
+
+    /* insert in front of the first entry with a lower score.
+       NOTE(review): a node that is not higher than any entry is not
+       linked into the list at all - confirm that this is intended */
+    prev = NULL;
+    for (cur = head; cur != NULL; cur = cur->next) {
+      if (node->score > cur->score) {
+        node->next = cur;
+        if (prev != NULL)
+          prev->next = node;
+        else
+          head = node;
+        break;
+      }
+      prev = cur;
+    }
+  }
+
+  return (head != NULL) ? head->score : 0;
+}
+
+/* kssort() - Shell sort of the n pointers in v[] by decreasing
+   ->score.  Written as a prototype-style definition, like the other
+   functions in this file, instead of the old-style (K&R) parameter
+   list. */
+void
+kssort (struct savestr **v, int n)
+{
+  int gap, i, j;
+  struct savestr *tmp;
+
+  for (gap = n / 2; gap > 0; gap /= 2)
+    for (i = gap; i < n; i++)
+      for (j = i - gap; j >= 0; j -= gap)
+        {
+          /* stop as soon as the pair is already in descending order */
+          if (v[j]->score >= v[j + gap]->score)
+            break;
+          tmp = v[j];
+          v[j] = v[j + gap];
+          v[j + gap] = tmp;
+        }
+}
+
+void
+kpsort (struct savestr **v, int n) {
+  /* Shell sort of the n pointers in v[] by increasing ->start, using
+     the fixed gap sequence 21, 7, 3, 1. */
+  static const int gaps[4] = { 21, 7, 3, 1 };
+  int g, step, pos, hole;
+  struct savestr *moving;
+
+  for (g = 0; g < 4; g++) {
+    step = gaps[g];
+    /* gapped insertion sort */
+    for (pos = step; pos < n; pos++) {
+      moving = v[pos];
+      for (hole = pos;
+           hole >= step && v[hole - step]->start > moving->start;
+           hole -= step) {
+        v[hole] = v[hole - step];
+      }
+      v[hole] = moving;
+    }
+  }
+}
+
+static void
+init_row(struct sx_s *row, int sp) {
+  /* zero every field of the first sp cells of row; nothing is done
+     when sp <= 0 */
+  int left;
+
+  for (left = sp; left > 0; left--, row++) {
+    row->C1 = 0;  row->I1 = 0;
+    row->C2 = 0;  row->I2 = 0;
+    row->C3 = 0;  row->I3 = 0;
+    row->flag = 0;
+  }
+}
+
+int /* lx_band -- score-only banded comparison of prot_seq against
+ dna_prot_seq. Each cell of the working row f_str->cur carries three
+ C/I score pairs (C1/I1, C2/I2, C3/I3) and the inner loop consumes three
+ dna_prot_seq entries per cell. Returns best+gopen+gext, where best is the
+ largest value of (sc - gopen - gext) recorded while filling the band. */
+lx_band(const unsigned char *prot_seq, /* array with protein sequence numbers*/
+ int len_prot, /* length of prot. seq */
+ const unsigned char *dna_prot_seq, /* translated DNA sequence numbers*/
+ int len_dna_prot, /* length trans. seq. */
+ int **pam_matrix, /* scoring matrix */
+ int gopen, int gext, /* gap open, gap extend penalties */
+ int gshift, /* frame-shift penalty */
+ int start_diag, /* start diagonal of band */
+ int width, /* width for band alignment */
+ struct f_struct *f_str)
+{
+ void *ckalloc(); /* local old-style declaration of the allocator */
+ int i, j, bd, bd1, x1, sp, p1=0, p2=0, end_prot; /* NOTE(review): j, p1, p2 are not used below */
+ int sc, del, best = 0, cd,ci, e1, e2, e3, cd1, cd2, cd3, f, gg; /* NOTE(review): cd and f are not used below */
+ register int *wt;
+ const unsigned char *dp;
+ register struct sx_s *ap, *aq;
+
+ sp = width+7; /* number of cells in the working row */
+ gg = gopen+gext; /* cost charged when a gap is opened */
+ /* sp = sp/3; */
+ if (f_str->cur == NULL) { /* first call: allocate the working row */
+ f_str->cur = (struct sx_s *) ckalloc(sizeof(struct sx_s)*sp);
+ f_str->cur_sp_size = sp;
+ }
+ else if (f_str->cur_sp_size != sp) { /* row size changed: replace the row */
+ free(f_str->cur);
+ f_str->cur = (struct sx_s *) ckalloc(sizeof(struct sx_s)*sp);
+ f_str->cur_sp_size = sp;
+ }
+
+ init_row(f_str->cur, sp); /* start from an all-zero row */
+
+ /*
+ if (start_diag %3 !=0) start_diag = start_diag/3-1;
+ else start_diag = start_diag/3;
+ */
+
+ /*
+ if (width % 3 != 0) width = width/3+1;
+ else width = width /3;
+ */
+
+ /* currently, this code assumes that the DNA sequence is longer than the
+ protein sequence. This is not always true. len_prot in the loop below
+ should be decreased to the projection of the DNA on the protein */
+
+ x1 = start_diag; /* x1 = lower bound of DNA */
+
+
+ end_prot = max(0,-width-start_diag) + (len_dna_prot+5)/3 + width; /* last protein row the band can reach */
+ end_prot = min(end_prot,len_prot); /* ... but never past the protein end */
+
+ /* i counts through protein sequence, x1 through DNAp */
+
+ for (i = max(0, -width-start_diag), x1+=i; i < end_prot; i++, x1++) {
+ bd = min(x1+width, len_dna_prot/3); /* upper bound of band */
+ bd1 = max(0,x1); /* lower bound of band */
+ wt = pam_matrix[prot_seq[i]]; /* scoring row for this protein residue */
+ del = 1-x1; /*adjustment*/
+ bd += del; /* convert both bounds to indices into f_str->cur */
+ bd1 +=del;
+
+ ap = &f_str->cur[bd1]; /* ap: cell being written */
+ aq = ap+1; /* aq: the following cell, read for I1..I3 and C1 */
+ e1 = f_str->cur[bd1-1].C3; /* C3 of the cell just below the band start */
+ e2 = ap->C1;
+ cd1 = cd2= cd3= 0; /* running gap scores along the row, one per slot */
+
+ for (dp = &dna_prot_seq[(bd1-del)*3]; ap < &f_str->cur[bd]; ap++) {
+ sc = max(max(e1, (e3=ap->C2))-gshift, e2)+wt[*dp++]; /* slot 1: neighbours e1/e3 pay gshift, e2 does not */
+ if (cd1 > sc) sc = cd1;
+ cd1 -= gext;
+ if ((ci = aq->I1) > 0) {
+ if (sc < ci) { ap->C1 = ci; ap->I1 = ci-gext;}
+ else {
+ ap->C1 = sc;
+ sc -= gg;
+ if (sc > 0) {
+ if (sc > best) best =sc;
+ if (cd1 < sc) cd1 = sc;
+ ap->I1 = max(ci-gext, sc);
+ } else ap->I1 = ci-gext;
+ }
+ } else {
+ if (sc <= 0) {
+ ap->I1 = ap->C1 = 0;
+ } else {
+ ap->C1 = sc; sc-=gg;
+ if (sc >0) {
+ if (sc > best) best =sc;
+ if (cd1 < sc) cd1 = sc;
+ ap->I1 = sc;
+ } else ap->I1 = 0;
+ }
+ }
+ sc = max(max(e2, (e1=ap->C3))-gshift, e3)+wt[*dp++]; /* slot 2: same recurrence with the roles rotated */
+ if (cd2 > sc) sc = cd2;
+ cd2 -= gext;
+ if ((ci = aq->I2) > 0) {
+ if (sc < ci) { ap->C2 = ci; ap->I2 = ci-gext;}
+ else {
+ ap->C2 = sc;
+ sc -= gg;
+ if (sc > 0) {
+ if (sc > best) best =sc;
+ if (cd2 < sc) cd2 = sc;
+ ap->I2 = max(ci-gext, sc);
+ } /* NOTE(review): unlike the I1 case there is no else that sets ap->I2 = ci-gext; confirm intended */
+ }
+ } else {
+ if (sc <= 0) {
+ ap->I2 = ap->C2 = 0;
+ } else {
+ ap->C2 = sc; sc-=gg;
+ if (sc >0) {
+ if (sc > best) best =sc;
+ if (cd2 < sc) cd2 = sc;
+ ap->I2 = sc;
+ } else ap->I2 = 0;
+ }
+ }
+ sc = max(max(e3, (e2=aq->C1))-gshift, e1)+wt[*dp++]; /* slot 3: reads C1 of the next cell */
+ if (cd3 > sc) sc = cd3;
+ cd3 -= gext;
+ if ((ci = aq++->I3) > 0) { /* aq advances here, once per cell */
+ if (sc < ci) { ap->C3 = ci; ap->I3 = ci-gext;}
+ else {
+ ap->C3 = sc;
+ sc -= gg;
+ if (sc > 0) {
+ if (sc > best) best =sc;
+ if (cd3 < sc) cd3 = sc;
+ ap->I3 = max(ci-gext, sc);
+ } /* NOTE(review): as for I2, no else branch updates ap->I3 here; confirm intended */
+ }
+ } else {
+ if (sc <= 0) {
+ ap->I3 = ap->C3 = 0;
+ } else {
+ ap->C3 = sc; sc-=gg;
+ if (sc >0) {
+ if (sc > best) best =sc;
+ if (cd3 < sc) cd3 = sc;
+ ap->I3 = sc;
+ } else ap->I3 = 0;
+ }
+ }
+ }
+ }
+ /* printf("The best score is %d\n", best); */
+ return best+gopen+gext; /* best was recorded after subtracting gg; add it back */
+}
+
+/* ckalloc - malloc() wrapper; calls w_abort() when the request fails */
+void *ckalloc(size_t amount)
+{
+  void *mem = (void *)malloc(amount);
+
+  if (mem == NULL) {
+    w_abort("Ran out of memory.","");
+  }
+  return mem;
+}
+
+/* shscore - score of aa0 aligned with itself (the 100% identical score):
+   the sum of pam2[r][r] over its n0 residues */
+int
+shscore(unsigned char *aa0, int n0, int **pam2)
+{
+  const unsigned char *res = aa0;
+  int total = 0;
+
+  while (n0-- > 0) {
+    total += pam2[*res][*res];
+    res++;
+  }
+  return total;
+}
+
+#define WIDTH 60 /* characters per output line in display_alig() */
+
+/* code above is to convert sequence into numbers */
+
+typedef struct mat *match_ptr;
+
+typedef struct mat { /* one edge of an alignment path, kept in a linked list */
+ int i, j, l; /* i, j: end position of the edge; l: edge type code */
+ match_ptr next;
+} match_node;
+
+typedef struct { /* a position, used by local_align() to remember where an alignment started */
+ int i,j;
+} state;
+
+typedef state *state_ptr;
+
+typedef struct st_s { int C, I, D;} *st_ptr; /* one score-row cell: C, I and D scores */
+
+/* static st_ptr up=NULL, down, tp; */
+/* static int *st_up; */
+/* static int gop, gext, shift; */
+
+void *ckalloc(size_t);
+static match_ptr small_global(), global(); /* old-style forward declarations (no parameter lists) */
+static int local_align(), find_best();
+static void init_row2(), init_ROW();
+
+/*
+ * pro_dna -- align a protein with a translated DNA sequence.
+ *
+ * local_align() locates the best local alignment and its score; global()
+ * then rebuilds the path between the reported start and end points.  The
+ * edge codes of that path are copied into a_res->res.
+ *
+ *   a_res->min0/max0   start/end in the protein
+ *   a_res->min1/max1   start/end in the translated DNA
+ *   a_res->res         receives at most max_res edge codes
+ *   a_res->nres        number of codes actually stored in a_res->res
+ *
+ * Returns the score reported by local_align().
+ */
+int
+pro_dna(const unsigned char *prot_seq, /* array with prot. seq. numbers*/
+        int len_prot, /* length of prot. seq */
+        const unsigned char *dna_prot_seq, /* trans. DNA seq. numbers*/
+        int len_dna_prot, /* length trans. seq. */
+        int **pam_matrix, /* scoring matrix */
+        int gopen, int gex, /* gap open, gap extend penalties */
+        int gshift, /* frame-shift penalty */
+        struct smgl_str *smgl_sp,
+        int max_res,
+        struct a_res_str *a_res) /* alignment info */
+{
+  match_ptr align, ap, aq;
+  int x, y, ex, ey, i, score;
+  int *alignment;
+  st_ptr up, down, tp;
+
+  /* for fastx (but not tfastx), these could be moved into init_work(),
+     and done only once */
+  up = (st_ptr) ckalloc(sizeof(struct st_s)*(len_dna_prot+10));
+  down = (st_ptr) ckalloc(sizeof(struct st_s)*(len_dna_prot+10));
+  tp = (st_ptr) ckalloc(sizeof(struct st_s)*(len_dna_prot+10));
+
+  /* find the best local alignment: x (prot) and y (DNA) are its start,
+     ex (prot) and ey (DNA) its end */
+  score = local_align(&x, &y, &ex, &ey, pam_matrix,
+                      gopen, gex, gshift,
+                      dna_prot_seq, len_dna_prot,
+                      prot_seq, len_prot, up, down);
+
+  /* global_down() stores into row entries as low as index -3, so the
+     rows handed to global() are advanced by three cells */
+  up += 3; down += 3; tp += 3;
+
+  /* x, y - start in prot, dna_prot */
+  a_res->min0 = x;  /* prot */
+  a_res->max0 = ex; /* prot */
+
+  a_res->min1 = y;  /* DNA-prot */
+  a_res->max1 = ey; /* DNA-prot */
+
+  align = global(x, y, ex, ey, pam_matrix, gopen, gex, gshift,
+                 dna_prot_seq, prot_seq, 0, 0, &up, &down, &tp,
+                 smgl_sp);
+
+  alignment = a_res->res;
+
+  /* copy the edge codes, releasing every list node; nodes beyond max_res
+     are still counted (for the warning) but not stored */
+  for (ap = align, i = 0; ap != NULL; i++) {
+    if (i < max_res) {
+      alignment[i] = ap->l;
+    }
+    aq = ap->next;
+    free(ap);
+    ap = aq;
+  }
+
+  /* i == max_res is an exact fit; only i > max_res loses codes */
+  if (i > max_res) {
+    fprintf(stderr," alignment truncated: %d/%d\n", max_res,i);
+  }
+
+  /* undo the offset applied above before releasing the rows */
+  up = &up[-3]; down = &down[-3]; tp = &tp[-3];
+  free(up); free(tp); free(down);
+
+  /* never report more codes than a_res->res actually holds */
+  a_res->nres = (i > max_res) ? max_res : i;
+  return score;
+}
+
+/* swap -- exchange the two pointers stored at *a and *b */
+static void
+swap(void **a, void **b) {
+  void *held = *b;
+
+  *b = *a;
+  *a = held;
+}
+
+/*
+ * local_align -- find the best-scoring local alignment between the protein
+ * pro[0..lp-1] and the translated DNA dnap[0..ld-1].
+ *
+ *   x, y     (out) start of the best alignment (protein, DNA)
+ *   ex, ey   (out) end of the best alignment (protein, DNA)
+ *   wgts     scoring matrix, indexed wgts[protein residue][DNA residue]
+ *   gop, gext, shift   gap-open, gap-extend and frame-shift penalties
+ *   up, down caller-supplied score rows; entries 0..ld+2 are zeroed here
+ *
+ * Returns the best score.  When no cell scores above 0 the return value
+ * is 0 and all four coordinates are reported as 0 (they were previously
+ * left uninitialised in that case).
+ *
+ * Only two score rows are kept (cur/last).  cur_st/last_st record, for
+ * every cell, where the alignment ending in that cell started; cur_i_st
+ * does the same for the insertion state, and cur_d_st records how many
+ * DNA positions back the current deletion began.
+ */
+static int
+local_align(int *x, int *y, int *ex, int *ey,
+            int **wgts, int gop, int gext, int shift,
+            const unsigned char *dnap, int ld,
+            const unsigned char *pro, int lp,
+            st_ptr up, st_ptr down) {
+
+  int i, j, score, e1, e2 = 0, e3,
+    sc, del, e, best = 0, *wt, cd, ci;
+  /* start (x1,x2) and end (x3,x4) of the best cell seen so far */
+  int x1 = 0, x2 = 0, x3 = 0, x4 = 0;
+  state_ptr cur_st, last_st, cur_i_st;
+  st_ptr cur, last;
+  const unsigned char *dp;
+  int *st_up, *cur_d_st;
+
+  /* for fastx (but not tfastx), this could be moved into init_work(),
+     and done only once */
+  st_up = (int *) ckalloc(sizeof(int)*(ld+10));
+  init_row2(st_up, ld+5);
+
+  ld += 2;                      /* columns 0 and 1 are padding */
+  init_ROW(up, ld+1);           /* set to zero */
+  init_ROW(down, ld+1);         /* set to zero */
+
+  cur = up+1;
+  last = down+1;
+
+  /* for fastx (but not tfastx), these could be moved into init_work(),
+     and done only once */
+  cur_st = (state_ptr) ckalloc(sizeof(state)*(ld+1));
+  last_st = (state_ptr) ckalloc(sizeof(state)*(ld+1));
+  cur_i_st = (state_ptr) ckalloc(sizeof(state)*(ld+1));
+
+  cur_d_st = st_up;
+
+  dp = dnap-2;                  /* so that dp[j] is dnap[j-2] for j >= 2 */
+  for (i = 0; i < lp; i++) {
+    wt = &wgts[pro[i]][0];
+    for (j = 0; j < 2; j++) {
+      cur_st[j].i = i+1;
+      cur_st[j].j = j+1;
+    }
+    for (j = 2; j < ld; j++) {
+      score = wt[dp[j]];
+      del = -1;                 /* -1: an alignment starts in this cell */
+      if (j >= 3) {
+        /* best predecessor in the previous row: 2 or 4 positions back
+           costs a frame shift, 3 positions back is a plain codon step */
+        sc = -score;
+        e3 = e2-shift; e2 = last[j-3].C;
+        e1 = last[j-2].C-shift;
+        if (e1 > sc) {sc = e1; del = 2;}
+        if (e2 > sc) {sc = e2; del = 3;}
+        if (e3 > sc) {sc = e3; del = 4;}
+      } else {
+        sc = e2 = 0;
+        if (sc < -score) sc=-score;
+        else del = 3;
+      }
+      sc += score;
+      if (sc < (ci=last[j].I)) {
+        sc = ci; del = 0;       /* arrive through the insertion state */
+      }
+      if (sc < (cd=cur[j].D)) {
+        sc = cd; del = 5;       /* arrive through the deletion state */
+      }
+      cur[j].C = sc;
+      e = sc - gop;
+      /* propagate the deletion score three DNA positions ahead */
+      if (e > cd) {
+        cur[j+3].D = e-gext;
+        cur_d_st[j+3] = 3;
+      } else {
+        cur[j+3].D = cd-gext;
+        cur_d_st[j+3] = cur_d_st[j]+3;
+      }
+      /* record where the alignment ending in this cell started */
+      switch(del) {
+      case 5:
+        e1 = cur_d_st[j];
+        cur_st[j].i = cur_st[j-e1].i;
+        cur_st[j].j = cur_st[j-e1].j;
+        break;
+      case 0:
+        cur_st[j].i = cur_i_st[j].i;
+        cur_st[j].j = cur_i_st[j].j;
+        break;
+      case 2:
+      case 3:
+      case 4:
+        if (i) {
+          if (j-del >= 0) {
+            cur_st[j].i = last_st[j-del].i;
+            cur_st[j].j = last_st[j-del].j;
+          } else {
+            cur_st[j].i = i;
+            cur_st[j].j = 0;
+          }
+        } else {
+          cur_st[j].i = 0;
+          cur_st[j].j = max(0, j-del+1);
+        }
+        break;
+      case -1:
+        cur_st[j].i = i+1;
+        cur_st[j].j = j+1;
+        break;
+      }
+      if (e > ci) {
+        cur[j].I = e -gext;
+        cur_i_st[j].i = cur_st[j].i;
+        cur_i_st[j].j = cur_st[j].j;
+      } else {
+        cur[j].I = ci- gext;
+      }
+      if (sc > best) {
+        x1 = cur_st[j].i;
+        x2 = cur_st[j].j;
+        best =sc;
+        x3 = i;
+        x4 = j;
+      }
+    }
+    swap((void **)&last, (void **)&cur);
+    swap((void **)&cur_st, (void **)&last_st);
+  }
+  /* printf("The best score is %d\n", best); */
+  *x = x1; *y = x2; *ex = x3; *ey = x4;
+  free(cur_st); free(last_st); free(cur_i_st);
+  free(st_up);
+  return best;
+}
+
+/*
+ Both global_up and global_down do linear-space, score-only global
+ alignments on the subsequences pro[x]...pro[ex] and dna[y]...dna[ey].
+ global_up runs the algorithm upwards, from row x towards row ex.
+ global_down runs the algorithm downwards, from row ex towards row x.
+*/
+
+static void /* global_up -- score-only pass over protein rows x..ex; on return
+ *row1 points at the row computed last (row ex) */
+global_up(st_ptr *row1, st_ptr *row2,
+ int x, int y, int ex, int ey,
+ int **wgts, int gop, int gext, int shift,
+ unsigned char *dnap,
+ unsigned char *pro,
+ int N) {
+ int i, j, k, sc, e, e1, e2, e3, t, ci, cd, score, *wt; /* NOTE(review): k and e are not used below */
+ st_ptr cur, last;
+
+ cur = *row1; last = *row2;
+
+ sc = -gop-gext;
+
+ for (j = 1; j <= ey-y+1; j++) { /* boundary row: only every third column is reachable */
+ if (j % 3 == 0) {last[j].C = sc; sc -= gext; last[j].I = sc-gop;}
+ else { last[j].I = last[j].C = -10000;}
+ cur[j].I = -10000;
+ }
+
+ last[0].C = 0; cur[0].D = cur[1].D = cur[2].D = -10000;
+ last[0].D = last[1].D = last[2].D = -10000;
+
+ if (N) last[0].I = -gext; /* N set: no gap-open charged at the corner */
+ else last[0].I = -gop-gext;
+
+ for (i = 1; i <= ex-x+1; i++) {
+ wt = &wgts[pro[i+x-1]][0]; e2 = last[0].C; e1 = -10000;
+ for (j = 0; j <= ey-y+1; j++) {
+ t = j+y;
+ sc = -10000;
+ if (t < 3) score = -10000;
+ else score = wt[dnap[t-3]];
+ if (j < 4) {
+ if (j == 3) sc = e2;
+ else if (j == 2) sc = e2-shift;
+ }
+ else {
+ e3 = e2; e2 = e1; /* slide the window of previous-row C values */
+ e1 = last[j-2].C;
+ sc = max(max(e1, e3)-shift, e2);
+ }
+ sc += score;
+ sc = max(sc, max(ci=last[j].I, cd = cur[j].D));
+ cur[j].C = sc;
+ cur[j+3].D = max(cd, sc-gop)-gext; /* deletion score for the cell three columns on */
+ cur[j].I = max(ci, sc-gop)-gext;
+ }
+ swap((void **)&last, (void **)&cur); /* last now names the row just filled */
+ }
+ for (i = 0; i <= ey-y+1; i++) last[i].I = cur[i].I; /* overwrite I in the final row with the other row's values */
+ if (*row1 != last) swap((void **)row1, (void **)row2); /* make *row1 the final row */
+}
+
+static void /* global_down -- score-only pass over protein rows ex down to x; on
+ return *row1 points at the row computed last (row x). The rows are
+ written at indices as low as -3 (cur[j-3] with j == 0), so the caller
+ must pass row pointers with at least three cells of room below them. */
+global_down(st_ptr *row1, st_ptr *row2,
+ int x, int y, int ex, int ey,
+ int **wgts, int gop, int gext, int shift,
+ unsigned char *dnap, unsigned char *pro,
+ int N) {
+ int i, j, k, sc, del, *tmp, e, t, e1,e2,e3, ci,cd, s1, s2, s3, *wt; /* NOTE(review): k, tmp and e are unused; del is set but never read */
+ st_ptr cur, last;
+
+ cur = (*row1); last = *row2;
+
+ sc = -gop-gext;
+
+ for (j = ey-y; j >= 0; j--) { /* boundary row, filled from the right-hand end */
+ if ((ey-y+1-j) % 3) {last[j].C = sc; sc-=gext; last[j].I = sc-gop;}
+ else last[j].I = last[j].C = -10000;
+ }
+
+ last[ey-y+1].C = 0;
+ cur[ey-y+1].D = cur[ey-y].D = cur[ey-y-1].D = -10000;
+ last[ey-y+1].D = last[ey-y].D = last[ey-y-1].D = -10000;
+
+ if (N) last[ey-y+1].I = -gext; /* N set: no gap-open charged at the corner */
+ else last[ey-y+1].I = -gop-gext;
+
+ for (i = ex-x; i >= 0; i--) {
+ wt = &wgts[pro[i+x]][0]; e2 = last[ey-y+1].C;
+ e1 = s2 = s3 = -10000;
+ for (j = ey-y+1; j >= 0; j--) {
+ t = j+y;
+ s1 = wt[dnap[t-1]]; /* NOTE(review): reads dnap[-1] when j == 0 and y == 0 */
+ sc = -10000;
+ if (t+3 > ey) {
+ if (t+2==ey) sc = e2+s2;
+ else if (t+1==ey) sc = e2-shift+s1;
+ } else {
+ e3 = e2; e2 = e1; /* slide the window of previous-row C values */
+ e1 = last[j+2].C;
+ sc = max(max(e1+s1, e3+s3)-shift, e2+s2);
+ }
+ if (sc < (cd= cur[j].D)) {
+ sc = cd;
+ cur[j-3].D = cd-gext;
+ } else cur[j-3].D =max(cd, sc-gop)-gext;
+ if (sc < (ci= last[j].I)) {
+ sc = ci; del = 0;
+ cur[j].I = ci - gext;
+ } else cur[j].I = max(sc-gop,ci)-gext;
+ cur[j].C = sc;
+ s3 = s2; s2 = s1; /* keep the two previous match scores */
+ }
+ swap((void **)&last, (void **)&cur); /* last now names the row just filled */
+ }
+ for (i = 0; i <= ey-y+1; i++) last[i].I = cur[i].I; /* overwrite I in the final row with the other row's values */
+ if (*row1 != last) swap((void **)row1, (void **)row2); /* make *row1 the final row */
+}
+
+/* init_row2 -- set the first ld ints of row to zero */
+static void
+init_row2(int *row, int ld) {
+  int *slot = row;
+
+  while (ld-- > 0) {
+    *slot++ = 0;
+  }
+}
+
+/* init_ROW -- zero the C, D and I scores of the first ld cells of row */
+static void
+init_ROW(st_ptr row, int ld) {
+  st_ptr cell = row;
+
+  while (ld-- > 0) {
+    cell->C = 0;
+    cell->D = 0;
+    cell->I = 0;
+    cell++;
+  }
+}
+
+/*
+ * combine -- append the list x2 to the end of the list x1 and return the
+ * head of the joined list.
+ *
+ * When x1 is empty, x2 is returned untouched (st is ignored).  Otherwise,
+ * if st is non-zero, the j field of each leading node of x2 is
+ * incremented up to and including the first node whose code l is 3 or 4,
+ * and that node's l is then decremented.  If x2 holds no such node (or is
+ * empty) there is nothing to decrement; the original dereferenced NULL
+ * in that case.
+ */
+static match_ptr
+combine(match_ptr x1, match_ptr x2, int st) {
+  match_ptr x;
+
+  if (x1 == NULL) return x2;
+
+  /* find the tail of x1 and hook x2 onto it */
+  for (x = x1; x->next; x = x->next);
+  x->next = x2;
+
+  if (st) {
+    for (x = x2; x; x = x->next) {
+      x->j++;
+      if (x->l == 3 || x->l == 4) break;
+    }
+    if (x != NULL) x->l--;
+  }
+  return x1;
+}
+
+/*
+ global() uses the two score-only, linear-space passes (global_up and
+ global_down) to build the alignment recursively: it splits the
+ protein range at its middle row, finds where the best path crosses
+ that row, and aligns the two halves separately.
+
+ Returns a linked list of match_node edges (NULL for an empty path).
+*/
+
+match_ptr
+global(int x, int y, int ex, int ey,
+ int **wgts, int gop, int gext, int shift,
+ unsigned char *dnap,
+ unsigned char *pro,
+ int N1, int N2,
+ st_ptr *up_stp, st_ptr *dn_stp, st_ptr *tp_stp,
+ struct smgl_str *smgl_sp
+ )
+{
+ int m;
+ int m1, m2;
+ match_ptr x1, x2, mm1, mm2;
+ /*printf("%d %d %d %d\n", x,y, ex, ey);*/
+ /*
+ if the space required is limited, we can do a quadratic space
+ algorithm to find the alignment.
+ */
+ if (ex <= x) { /* at most one protein row: the path is built directly */
+ mm1 = NULL; mm2= NULL;
+ for (m = y+3; m <= ey; m+=3) { /* one code-5 node per three DNA positions */
+ x1 = (match_ptr) ckalloc(sizeof(match_node));
+ x1->l = 5; x1->next = mm1;
+ if (mm1== NULL) mm2 = x1; /* mm2 remembers the last node of the list */
+ mm1 = x1;
+ }
+ if (ex == x) {
+ if ((ey-y) % 3 != 0) { /* leftover DNA positions: append a node with code remainder+1 */
+ x1 = (match_ptr) ckalloc(sizeof(match_node));
+ x1->l = ((ey-y) % 3) +1; x1->next = NULL;
+ if (mm2) mm2->next = x1;
+ else mm1 = x1;
+ } else {
+ if (mm2) mm2->l = 4; /* no remainder: the last node gets code 4 instead */
+ }
+ }
+ return mm1;
+ }
+ if (ey <= y) { /* no DNA left: one code-0 node per protein row x..ex */
+ mm1 = NULL;
+ for (m = x; m <= ex; m++) {
+ x1 = (match_ptr) ckalloc(sizeof(match_node));
+ x1->l = 0; x1->next = mm1; mm1 = x1;
+ }
+ return mm1;
+ }
+ if (ex -x < SGW1-1 && ey-y < SGW2-1) /* small enough for the full-table routine */
+ return small_global(x,y,ex,ey,
+ wgts, gop, gext, shift,
+ dnap, pro, N1, N2, smgl_sp);
+ m = (x+ex)/2; /* middle protein row */
+ /*
+ Do the score only global alignment from row x to row m, m is
+ the middle row of x and ex. Store the information of row m in
+ upC, upD, and upI.
+ */
+ global_up(up_stp, tp_stp, x, y, m, ey,
+ wgts, gop, gext, shift,
+ dnap, pro, N1);
+
+ /*
+ Do the score only global alignment downwards from row ex
+ to row m+1, store information of row m+1 in downC downI and downD
+ */
+ global_down(dn_stp, tp_stp, m+1, y, ex, ey,
+ wgts, gop, gext, shift,
+ dnap, pro, N2);
+
+ /*
+ Use these information of row m and m+1, to find the crossing
+ point of the best alignment with the middle row. The crossing
+ point is given by m1 and m2. Then we recursively call global
+ itself to compute alignments in two smaller regions found by
+ the crossing point and combine the two alignments to form a
+ whole alignment. Return that alignment.
+ */
+ if (find_best(*up_stp, *dn_stp, &m1, &m2, ey-y+1, y, gop)) { /* non-zero: best crossing found from the C scores */
+ x1 = global(x, y, m, m1, wgts, gop, gext, shift, dnap, pro, N1, 0,
+ up_stp, dn_stp, tp_stp, smgl_sp);
+ x2 = global(m+1, m2, ex, ey, wgts, gop, gext, shift, dnap, pro, 0, N2,
+ up_stp, dn_stp, tp_stp, smgl_sp);
+ if (m1 == m2) x1 = combine(x1,x2,1);
+ else x1 = combine(x1, x2,0);
+ } else { /* zero: best crossing found from the I scores; rows m and m+1 become two code-0 nodes */
+ x1 = global(x, y, m-1, m1, wgts, gop, gext, shift, dnap, pro, N1, 1,
+ up_stp, dn_stp, tp_stp, smgl_sp);
+ x2 = global(m+2, m2, ex, ey, wgts, gop, gext, shift, dnap, pro, 1, N2,
+ up_stp, dn_stp, tp_stp, smgl_sp);
+ mm1 = (match_ptr) ckalloc(sizeof(match_node));
+ mm1->i = m; mm1->l = 0; mm1->j = m1;
+ mm2 = (match_ptr) ckalloc(sizeof(match_node));
+ mm2->i = m+1; mm2->l = 0; mm2->j = m1;
+ mm1->next = mm2; mm2->next = x2;
+ x1 = combine(x1, mm1, 0);
+ }
+ return x1;
+}
+
+/*
+ * find_best -- choose the column at which the best path crosses between
+ * the row held in up and the row held in down.
+ *
+ * For every i in 1..ld-1 two candidates are compared:
+ *   up[i].C + down[i].C          crossing in the C state
+ *   up[i].I + down[i].I + gop    crossing in the I state
+ *
+ *   m1, m2  (out) set to j-1+y and j+y for the winning column j
+ *
+ * Returns 1 when the winner is a C-state crossing and 0 when it is an
+ * I-state crossing.  If no candidate beats the initial -100000 (for
+ * example when ld <= 1), 1 is returned with j == 0; the original returned
+ * an uninitialised value in that case.
+ */
+static int
+find_best(st_ptr up, st_ptr down,
+          int *m1, int *m2,
+          int ld, int y, int gop) {
+  int i, best = -100000, j = 0, s2, s4, st = 1;
+
+  up++;                         /* up[i-1] below is entry i of the caller's row */
+  for (i = 1; i < ld; i++) {
+    s2 = up[i-1].C + down[i].C;
+    s4 = up[i-1].I + down[i].I + gop;
+    if (best < s2) {
+      best = s2; j = i; st = 1;
+    }
+    if (best < s4) {
+      best = s4; j = i; st = 0;
+    }
+  }
+  *m1 = j-1+y;
+  *m2 = j+y;
+  /*printf("find best score =%d\n", best);*/
+  return st;
+}
+
+/*
+ small_global() fills the score table smgl_sp->C and the traceback table
+ smgl_sp->st for the sub-problem pro[x..ex] / dna[y..ey], then walks the
+ traceback from the far corner to build the alignment.
+
+ An alignment is represented as a linked list whose element
+ is of type match_node. Each element represents an edge in the
+ path of the alignment graph. The fields of match_node are
+ l --- gives the type of the edge.
+ i, j --- give the end position.
+*/
+
+static match_ptr
+small_global(int x, int y, int ex, int ey,
+ int **wgts, int gop, int gext, int shift,
+ unsigned char *dnap, unsigned char *pro,
+ int N1, int N2, struct smgl_str *smgl_sp) {
+
+ /* int C[SGW1+1][SGW2+1], st[SGW1+1][SGW2+1], D[SGW2+7], I[SGW2+1]; */
+
+ int i, j, e, sc, score, del, k, t, *wt, ci, cd;
+ int *cI, *cD, *cC, *lC, *cst, e2, e3, e4; /* NOTE(review): e3 is not used below */
+ match_ptr mp, first;
+
+ /*printf("small_global %d %d %d %d\n", x, y, ex, ey);*/
+ sc = -gop-gext; smgl_sp->C[0][0] = 0;
+
+ cI = smgl_sp->I;
+ if (N1) cI[0] = -gext; else cI[0] = sc; /* N1 set: no gap-open charged at the corner */
+ for (j = 1; j <= ey-y+1; j++) { /* boundary row: only every third column is reachable */
+ if (j % 3== 0) {
+ smgl_sp->C[0][j] = sc;
+ sc -= gext;
+ cI[j] = sc-gop;
+ }
+ else {cI[j] = smgl_sp->C[0][j] = -10000;}
+ smgl_sp->st[0][j] = 5; /* row 0 is traced back as code 5 */
+ }
+
+ lC = &smgl_sp->C[0][0]; /* lC: previous row of C */
+ cD = smgl_sp->D; cD[0] = cD[1] = cD[2] = -10000;
+
+ for (i = 1; i <= ex-x+1; i++) {
+ cC = &smgl_sp->C[i][0]; /* cC: row of C being filled */
+ wt = &wgts[pro[i+x-1]][0]; cst = &smgl_sp->st[i][0];
+ for (j = 0; j <=ey-y+1; j++) {
+ sc = -10000; del = 0;
+ ci = cI[j];
+ cd= cD[j];
+ t = j+y;
+ if (t < 3) score = -10000;
+ else score = wt[dnap[t-3]];
+ if (j >= 4) {
+ e2 = lC[j-2]-shift; sc = lC[j-3]; e4 = lC[j-4]-shift; /* steps of 2 and 4 pay the frame-shift penalty */
+ del = 3;
+ if (e2 > sc) { sc = e2; del = 2;}
+ if (e4 >= sc) { sc = e4; del = 4;}
+ } else {
+ if (j ==3) {sc= lC[0]; del = 3;}
+ else if (j == 2) {sc = lC[0]-shift; del = 2;}
+ }
+ sc = sc+score;
+ if (sc < ci) {
+ sc = ci; del = 0;
+ }
+ if (sc <= cd) {
+ sc = cd;
+ del = 5;
+ }
+ cC[j] = sc;
+ sc -= gop;
+ if (sc < cd) {
+ del += 10; /* +10 marks that the D score was extended rather than opened */
+ cD[j+3] = cd - gext;
+ } else cD[j+3] = sc -gext;
+ if (sc < ci) {
+ del += 20; /* +20 marks that the I score was extended rather than opened */
+ cI[j] = ci-gext;
+ } else cI[j] = sc-gext;
+ *(cst++) = del; /* store edge code plus the two markers */
+ }
+ lC = cC;
+ }
+ if (N2 && ci +gop > cC[ey-y+1]) { /* ci and cC still hold the values from the final cell/row */
+ smgl_sp->st[ex-x+1][ey-y+1] = 0;
+ /*printf("small score = %d\n", ci+gop);*/
+ } /*else printf("small score =%d\n", cC[ey-y+1]);*/
+ first = NULL; e = 1;
+ for (i = ex+1, j = ey+1; i > x || j > y; i--) { /* traceback; each new node is pushed on the front */
+ mp = (match_ptr) ckalloc(sizeof(match_node));
+ mp->i = i-1;
+ k = (t=smgl_sp->st[i-x][j-y])%10; /* k: edge code without the markers */
+ mp->j = j-1;
+ if (e == 5 && (t/10)%2 == 1) k = 5; /* previous step was code 5 and D was extended here */
+ if (e == 0 && (t/20)== 1) k = 0; /* previous step was code 0 and I was extended here */
+ if (k == 5) { j -= 3; i++; e=5;} /* code 5: i++ cancels the loop's i--, so only j moves */
+ else {j -= k;if (k==0) e= 0; else e = 1;}
+ mp->l = k;
+ mp->next = first;
+ first = mp;
+ }
+
+ /* for (i = 0; i <= ex-x; i++) {
+ for (j = 0; j <= ey-y; j++)
+ printf("%d ", C[i][j]);
+ printf("\n");
+ }
+ */
+ return first;
+}
+
+#define XTERNAL
+#include "upam.h"
+
+/*
+ * display_alig -- print an alignment in blocks of WIDTH columns: a ruler,
+ * the DNA line, a line of marker characters and the protein line.
+ *
+ *   a       a[0], a[1] give the start in the protein and DNA; a[2] up to
+ *           a[length-1] are the edge codes (0, 2, 3, 4, 5)
+ *   dna     encoded DNA, ld entries, decoded through NCBIstdaa
+ *   pro     encoded protein, decoded through NCBIstdaa
+ *   length  number of entries in a
+ *   ld      number of entries in dna
+ *
+ * Changes from the earlier version: the decoded DNA copy is freed before
+ * returning; the unprinted tail of each line buffer is shifted with a
+ * forward copy loop instead of strncpy() on overlapping memory; a[j+1] is
+ * only examined when it lies inside the array; and the scratch buffer is
+ * blank-filled explicitly.
+ *
+ * NOTE(review): an edge code of 1 would write tmp[-1]; the documented
+ * codes do not include 1.
+ */
+extern void
+display_alig(int *a, unsigned char *dna, unsigned char * pro, int length, int ld)
+{
+  int len = 0, i, j, x, y, lines, k;
+  char line1[100], line2[100], line3[100], tmp[10];
+  unsigned char *dna1, c1, c2, c3, *st;
+
+  /* scratch buffer starts as nine blanks plus a terminator */
+  for (k = 0; k < 9; k++) tmp[k] = ' ';
+  tmp[9] = '\0';
+
+  dna1 = ckalloc((size_t)ld);
+  for (st = dna, i = 0; i < ld; i++, st++) dna1[i] = NCBIstdaa[*st];
+  line1[0] = line2[0] = line3[0] = '\0'; x= a[0]; y = a[1]-1;
+
+  for (len = 0, j = 2, lines = 0; j < length; j++) {
+    i = a[j];
+    /* codes 2..4: the residue letter goes in column i-2 of the group */
+    if (i > 0 && i < 5) tmp[i-2] = NCBIstdaa[pro[x++]];
+    if (i == 5) {
+      i = 3; tmp[0] = tmp[1] = tmp[2] = '-';
+      if (j+1 < length && a[j+1] == 2) tmp[2] = ' ';
+    }
+    if (i > 0) {
+      strncpy(&line1[len], (const char *)&dna1[y], i); y+=i;
+    } else {line1[len] = '-'; i = 1; tmp[0] = NCBIstdaa[pro[x++]];}
+    strncpy(&line2[len], tmp, i);
+    /* turn tmp into the marker line: / | \ under columns 0, 1, 2 */
+    for (k = 0; k < i; k++) {
+      if (tmp[k] != ' ' && tmp[k] != '-') {
+        if (k == 2) tmp[k] = '\\';
+        else if (k == 1) tmp[k] = '|';
+        else tmp[k] = '/';
+      } else tmp[k] = ' ';
+    }
+    if (i == 1) tmp[0] = ' ';
+    strncpy(&line3[len], tmp, i);
+    tmp[0] = tmp[1] = tmp[2] = ' ';
+    len += i;
+    line1[len] = line2[len] =line3[len] = '\0';
+    if (len >= WIDTH) {
+      printf("\n%5d", WIDTH*lines++);
+      for (k = 10; k <= WIDTH; k+=10)
+        printf(" . :");
+      if (k-5 < WIDTH) printf(" .");
+      /* print exactly WIDTH columns, then restore the cut characters */
+      c1 = line1[WIDTH]; c2 = line2[WIDTH]; c3 = line3[WIDTH];
+      line1[WIDTH] = line2[WIDTH] = line3[WIDTH] = '\0';
+      printf("\n %s\n %s\n %s\n", line1, line3, line2);
+      line1[WIDTH] = c1; line2[WIDTH] = c2; line3[WIDTH] = c3;
+      /* move the unprinted tail, including its terminator, to the front;
+         copying forward is safe because the destination is below the
+         source */
+      for (k = 0; k <= len - WIDTH; k++) {
+        line1[k] = line1[WIDTH + k];
+        line2[k] = line2[WIDTH + k];
+        line3[k] = line3[WIDTH + k];
+      }
+      len = len - WIDTH;
+    }
+  }
+  printf("\n%5d", WIDTH*lines);
+  for (k = 10; k < len; k+=10)
+    printf(" . :");
+  if (k-5 < len) printf(" .");
+  printf("\n %s\n %s\n %s\n", line1, line3, line2);
+
+  free(dna1);
+}
+
+
+/* The alignment array stores the operations that align the protein and DNA
+ sequences. The codes used in the array are as follows:
+ 0: deletion of an amino acid.
+ 2: frame shift, 2 nucleotides match with an amino acid
+ 3: match an amino acid with a codon
+ 4: the other type of frame shift
+ 5: deletion of a codon
+
+
+ Also, the first two elements of the array store the starting point
+ in the protein and DNA sequences in the local alignment.
+
+ Display looks like where WIDTH is assumed to be divisible by 10.
+
+ 0 . : . : . : . : . : . :
+ CCTATGATACTGGGATACTGGAACGTCCGCGGACTGACACACCCGATCCGCATGCTCCTG
+ P M I L G Y W N V R G L T H P I R M L L
+
+ 60 . : . : . : . : . : . :
+ GAATACACAGACTCAAGCTATGATGAGAAGAGATACACCATGGGTGACGCTCCCGACTTT
+ E Y T D S S Y D E K R Y T M G D A P D F
+*/
+
+
+/* fatal - print msg followed by a newline on stderr, then exit with
+   status 1.  Defined with an ANSI prototype instead of the obsolescent
+   K&R parameter list. */
+void fatal(char *msg)
+{
+  fprintf(stderr, "%s\n", msg);
+  exit(1);
+}
+
+void /* fx_walign -- run do_fastx() to score the pair and obtain the offset
+ hoff; if the score exceeds score_thresh, cut out the region around hoff
+ and align it with pro_dna(), storing the result in *a_res. */
+fx_walign (const unsigned char *aa0, int n0,
+ const unsigned char *xaa, int n1, unsigned char *yaa,
+ int frame, int max_res,
+ struct pstruct *ppst,
+ struct f_struct *f_str,
+ struct a_res_str *a_res,
+ int score_thresh
+ )
+{
+ unsigned char *local_xaa, *local_yaa;
+ int score; /* NOTE(review): score, i, last_n1, itemp, n10 are not used below */
+ int i, last_n1, itemp, n10;
+ int hoff, l_min, l_max, n_nt, n_aa, w_fact;
+ int score_ix, window;
+ int aa1_min_s, aa1_max_s; /* only referenced from commented-out code */
+ unsigned char *fs, *fd;
+ struct score_count_s s_info;
+ int itx;
+
+ memset(&s_info,0,sizeof(s_info));
+
+ score_ix = ppst->score_ix;
+
+ /* check for large differences in sequence length - if there is a
+ large difference, use do_fastx() to get the offset. */
+
+#ifndef TFAST /* FASTX */
+ n_nt = n0;
+ n_aa = n1;
+#else /* TFASTX */
+ n_nt = n1;
+ n_aa = n0;
+#endif
+
+ do_fastx(aa0, n0, xaa, n1, yaa, ppst, f_str, &a_res->rst, &hoff,1, &s_info); /* fills a_res->rst and hoff */
+
+ if (a_res->rst.score[score_ix] <= score_thresh) { /* not good enough: report no alignment */
+ a_res->sw_score = 0;
+ a_res->n1 = n1;
+ return;
+ }
+
+ /* now we will do an alignment, but we need to be certain to do the
+ alignment in the region mapped by hoff to include the
+ high-scoring region */
+
+ /* if initn > 2 * init1, use wider window */
+ if (a_res->rst.score[0] > 2 * a_res->rst.score[1]) {w_fact = 4;}
+ else w_fact = 2; /* NOTE(review): w_fact is set here but not read afterwards */
+
+ /* Here we need to use different strategies depending on whether we
+ have DNA or protein. For a DNA query (protein library, FASTX), the
+ strategy is simple -- NULL bound the library protein sequence and
+ do the alignment. For a protein query (TFASTX), things are more complex.
+ Moreover, the mapping must be calculated differently in each case.
+ */
+
+
+#ifndef TFAST /* map onto the protein (aa1) sequence */
+ window = min(n1, ppst->param_u.fa.optwid);
+ l_min = max(0, -window - hoff);
+ l_max = min(n1, n0-hoff+window);
+
+ local_yaa = yaa;
+ local_xaa = (unsigned char *)xaa;
+ if (l_min > 0 || l_max < n1 -1 ) { /* sub-range needed: work on a zero-padded private copy */
+ local_xaa = (unsigned char *)calloc(l_max - l_min+2,sizeof(char)); /* NOTE(review): result is not checked for NULL */
+ local_xaa++; /* leave one zero byte in front of the copy */
+ memcpy(local_xaa, xaa+l_min, l_max - l_min);
+ }
+/*
+ if (l_min > 0) {
+ aa1_min_s = xaa[l_min-1];
+ local_xaa[l_min-1] = '\0';
+ }
+ if (l_max < n1 - 1) {
+ aa1_max_s = xaa[l_max];
+ xaa[l_max] = '\0';
+ }
+*/
+#else
+ window = min(n0, ppst->param_u.fa.optwid);
+ l_min = max(0,(hoff-window)*3);
+ l_max = min((hoff+window+n0)*3,n_nt);
+ local_xaa = (unsigned char *)xaa;
+ local_yaa = yaa;
+ if (l_min > 0 || l_max <n_nt -1) { /* sub-range needed: work on a zero-padded private copy */
+ local_yaa = (unsigned char*)calloc(l_max - l_min + 2,sizeof(unsigned char)); /* NOTE(review): result is not checked for NULL */
+ local_yaa++; /* leave one zero byte in front of the copy */
+ memcpy(local_yaa, yaa+l_min, l_max - l_min);
+ }
+
+ /*
+ if (l_min > 0)
+ aa1_min_s = yaa[l_min-1];
+ yaa[l_min-1] = '\0';
+ }
+ if (l_max < n_nt-1) {
+
+ aa1_max_s = yaa[l_max];
+ yaa[l_max] = '\0';
+ }
+ */
+#endif
+
+ if (a_res->rst.score[ppst->score_ix] <= score_thresh) { /* NOTE(review): same test as after do_fastx(); if taken, the copy made above is not freed */
+ a_res->sw_score = 0;
+ a_res->n1 = n1;
+ return;
+ }
+
+ /* pro_dna always compares protein to DNA, and returns protein
+ coordinates in a_res->min0,max0 */
+
+ a_res->sw_score =
+ pro_dna(
+#ifndef TFAST /* FASTX */
+ local_xaa, l_max - l_min, /* true protein is in aa1/xaa */
+ yaa, n_nt,
+#else /* TFASTX */
+ aa0, n0, /* true protein is in aa0 */
+ local_yaa, l_max - l_min,
+#endif
+ ppst->pam2[0],
+#ifdef OLD_FASTA_GAP
+ -(ppst->gdelval - ppst->ggapval),
+#else
+ -ppst->gdelval,
+#endif
+ -ppst->ggapval,
+ -ppst->gshift,
+ &f_str->smgl_s,
+ max_res, a_res);
+
+ /*
+ if (a_res->rst.score[0] < a_res->sw_score) {
+ a_res->rst.score[0] = a_res->sw_score;
+ a_res->rst.score[ppst->score_ix] = a_res->sw_score;
+ }
+ */
+
+#ifndef TFAST
+ if (l_min > 0 || l_max < n1-1) free(--local_xaa); /* same condition as the allocation; step back over the pad byte */
+/*
+ if (l_min > 0) {
+ xaa[l_min-1] = aa1_min_s;
+ }
+ if (l_max < n1 - 1) {
+ xaa[l_max] = aa1_max_s;
+ }
+*/
+ a_res->min0 += l_min; /* shift protein coordinates back to the full sequence */
+ a_res->max0 += l_min;
+#else
+ if (l_min > 0 || l_max < n1-1) free(--local_yaa); /* tests n1 where the allocation tested n_nt; n_nt was set to n1 above */
+ /*
+ if (l_min > 0) {
+ yaa[l_min-1] = aa1_min_s;
+ }
+ if (l_max < n1 - 1) {
+ yaa[l_max] = aa1_max_s;
+ }
+ */
+ a_res->n1 = n1;
+ a_res->min1 += l_min; /* shift DNA coordinates back to the full sequence */
+ a_res->max1 += l_min;
+#endif
+
+}
+
+/*
+ fx_malign is a recursive interface to fx_walign() that is called
+ from do_walign(). fx_malign() first does an alignment, then checks
+ to see if the score is greater than the threshold. If so, it tries
+ doing a left and right alignment.
+
+ In this implementation, the translation required for f_str->aa1x and
+ f_str->aa1y is done at each recursive level. A better implementation
+ would do the translation once, and then be more sophisticated about
+ the boundaries on f_str->aa1x,y. This is challenging, however,
+ because there is no easy way to subset aa1x [111112222233333],
+ though it is possible to subset aa1y cleanly. The current solution
+ is to re-generate xaa from yaa.
+
+ 21-Nov-2010 -- like do_walign(), fx_malign() uses a const xaa, to
+ ensure that threads do not interfere with each other. If a
+ sub-range is needed, a new sequence is produced.
+
+ */
+struct a_res_str * /* returns the head of the chain of alignments (cur_ares, possibly merged with left/right results) */
+fx_malign (const unsigned char *aa0, int n0,
+ const unsigned char *xaa, int n1, unsigned char *yaa,
+ int frame,
+ int score_thresh, int max_res,
+ struct pstruct *ppst,
+ struct f_struct *f_str,
+ struct a_res_str *cur_ares,
+ int first_align)
+{
+ struct a_res_str *tmpl_ares, *tmpr_ares, *this_ares;
+ struct a_res_str *mtmpl_ares, *mtmpr_ares, *mt_next; /* NOTE(review): not used below */
+ unsigned char *my_xaa;
+ unsigned char *local_xaa, *local_yaa;
+ int nxyaa; /* length of the left or right flanking region */
+ int hoff, score_ix;
+ int min_alen; /* shortest flank worth aligning */
+ struct rstruct rst;
+ /* char save_res; */
+ int iphase, i;
+ unsigned char *fd;
+ int max_sub_score = -1; /* best score among accepted left/right results */
+
+ score_ix = ppst->score_ix;
+
+ /* now we need alignment storage - get it */
+ if ((cur_ares->res = (int *)calloc((size_t)max_res,sizeof(int)))==NULL) {
+ fprintf(stderr," *** cannot allocate alignment results array %d\n",max_res);
+ exit(1);
+ }
+
+ cur_ares->next = NULL;
+
+#ifdef TFAST
+ min_alen = min(n0,MIN_LOCAL_LEN)*3; /* n0 in aa, min_alen in nt */
+#else
+ min_alen = min(n0/3,MIN_LOCAL_LEN); /* no in nt, min_alen in aa */
+#endif
+
+#ifdef TFAST
+ /* convert yaa to xaa -- cannot use *fs to stop because subset
+ does not have '\0' in all three frames */
+ my_xaa = (unsigned char *)calloc(n1+2,sizeof(unsigned char)); /* NOTE(review): result is not checked for NULL */
+ my_xaa++; /* leave one zero byte in front */
+ for (fd=my_xaa, iphase = 0; iphase < 3; iphase++) { /* phase 0 entries first, then phase 1, then phase 2 */
+ for (i=iphase; i<n1; i+=3,fd++) *fd = yaa[i];
+ }
+ *fd=EOSEQ;
+#else
+ my_xaa = (unsigned char *)xaa;
+#endif
+
+ fx_walign(aa0, n0, my_xaa, n1, yaa, frame, max_res,
+ ppst, f_str, cur_ares,(first_align ? 1 : score_thresh)); /* the first alignment uses a threshold of 1 */
+
+ /* in cur_ares, min0,max0 are always protein, min1,max1 are always
+ DNA, but n0 could be protein or DNA, depending on
+ FASTX/TFASTX */
+
+ if (!ppst->do_rep || cur_ares->rst.score[ppst->score_ix] <= score_thresh) { /* no sub-alignments wanted, or score too low */
+#ifdef TFAST
+ free(--my_xaa);
+#endif
+ return cur_ares;
+ }
+
+ /* we are going to do a recursive edit, so we need a local copy of
+ xaa (fastx) or yaa (tfastx) */
+
+#ifdef TFAST /* TFASTX, n1 is nt */
+ nxyaa = cur_ares->min1;
+#else /* FASTX n1 is aa */
+ nxyaa = cur_ares->min0;
+#endif
+
+ if (nxyaa >= min_alen) { /* try the left */
+ /* allocate a_res */
+ tmpl_ares = (struct a_res_str *)calloc(1, sizeof(struct a_res_str)); /* NOTE(review): result is not checked for NULL */
+
+#ifdef TFAST /* TFASTX, no xaa */
+ local_xaa = my_xaa; /* my_xaa is calloc'ed for TFAST */
+ local_yaa = (unsigned char *)calloc(nxyaa+2, sizeof(unsigned char));
+ local_yaa++; /* skip the initial zero */
+ memcpy(local_yaa, yaa, nxyaa);
+/*
+ save_res = yaa[cur_ares->min1];
+ yaa[cur_ares->min1] = '\0';
+*/
+#else
+ local_yaa = yaa;
+ local_xaa = (unsigned char *)calloc(nxyaa+2, sizeof(unsigned char));
+ local_xaa++; /* skip the initial zero */
+ memcpy(local_xaa, xaa, nxyaa);
+/*
+ save_res = xaa[cur_ares->min0];
+ xaa[cur_ares->min0] = '\0';
+*/
+#endif
+ tmpl_ares = fx_malign(aa0, n0, local_xaa, nxyaa,
+ local_yaa,
+ frame, score_thresh, max_res,
+ ppst, f_str, tmpl_ares, 0);
+
+#ifdef TFAST
+ free(--local_yaa); /* local_yaa, allocated above */
+#else
+ free(--local_xaa); /* FASTX - local_xaa allocated above */
+#endif
+
+ if (tmpl_ares->rst.score[ppst->score_ix] > score_thresh) {
+ max_sub_score = tmpl_ares->rst.score[ppst->score_ix];
+ }
+ else { /* left result rejected: release it */
+ if (tmpl_ares->res) free(tmpl_ares->res);
+ free(tmpl_ares);
+ tmpl_ares = NULL;
+ }
+ }
+ else {tmpl_ares = NULL;}
+
+ /* do the right */
+#ifdef TFAST /* TFASTX - n0 is aa, n1 nt */
+ nxyaa = n1 - cur_ares->max1 - 1;
+#else /* FASTX - n1 is aa, n0 nt */
+ /* this is counter-intuitive, because n1 is the length of the DNA
+ sequence in both cases */
+ nxyaa = n1 - cur_ares->max0 - 1;
+#endif
+
+ if (nxyaa >= min_alen) { /* try the right */
+ /* allocate a_res */
+ tmpr_ares = (struct a_res_str *)calloc(1, sizeof(struct a_res_str)); /* NOTE(review): result is not checked for NULL */
+
+ /* find boundaries */
+#ifdef TFAST /* TFASTX, no xaa */
+ local_xaa = my_xaa;
+ local_yaa = (unsigned char *)calloc(nxyaa+2, sizeof(unsigned char));
+ local_yaa++; /* skip the initial zero */
+ memcpy(local_yaa, yaa+cur_ares->max1+1,nxyaa);
+/*
+ save_res = yaa[cur_ares->max1];
+ yaa[cur_ares->max1] = '\0';
+*/
+#else
+ local_yaa = yaa;
+ local_xaa = (unsigned char *)calloc(nxyaa+2, sizeof(unsigned char));
+ local_xaa++; /* skip the initial zero */
+ memcpy(local_xaa, xaa+cur_ares->max0+1,nxyaa);
+/*
+ save_res = xaa[cur_ares->max0];
+ xaa[cur_ares->max0] = '\0';
+*/
+#endif
+ tmpr_ares = fx_malign(aa0, n0,
+ local_xaa, nxyaa, local_yaa,
+ frame,
+ score_thresh, max_res,
+ ppst, f_str, tmpr_ares,0);
+#ifdef TFAST /* TFASTX, no xaa */
+ free(--local_yaa);
+#else
+ free(--local_xaa);
+#endif
+/* yaa[cur_ares->max1] = save_res;*/
+
+ if (tmpr_ares->rst.score[ppst->score_ix] > score_thresh) {
+ /* adjust the left boundary */
+ for (this_ares = tmpr_ares; this_ares; this_ares = this_ares->next) { /* coordinates were relative to the right-hand copy */
+#ifdef TFAST
+ this_ares->min1 += cur_ares->max1+1;
+ this_ares->max1 += cur_ares->max1+1;
+#else
+ this_ares->min0 += cur_ares->max0+1;
+ this_ares->max0 += cur_ares->max0+1;
+#endif
+ }
+
+ if (tmpr_ares->rst.score[ppst->score_ix] > max_sub_score) {
+ max_sub_score = tmpr_ares->rst.score[ppst->score_ix];
+ }
+ }
+ else { /* right result rejected: release it */
+ if (tmpr_ares->res) free(tmpr_ares->res);
+ free(tmpr_ares);
+ tmpr_ares = NULL;
+ }
+ }
+ else {tmpr_ares = NULL;}
+
+#ifdef TFAST
+ free(--my_xaa);
+#endif
+
+ if (max_sub_score <= score_thresh) { /* neither side produced an accepted alignment */
+ if (tmpl_ares) {
+ if (tmpl_ares->res) free(tmpl_ares->res);
+ free(tmpl_ares);
+ }
+ if (tmpr_ares) {
+ if (tmpr_ares->res) free(tmpr_ares->res);
+ free(tmpr_ares);
+ }
+ return cur_ares;
+ }
+
+ cur_ares = merge_ares_chains(cur_ares, tmpl_ares, score_ix, "left");
+ cur_ares = merge_ares_chains(cur_ares, tmpr_ares, score_ix, "right");
+
+ return cur_ares;
+}
+
+/* do_walign() can be called with aa0,n0 as nt (FASTX) or
+ aa0,n0 as aa (TFASTX). If aa0 is nt, then f_str->aa0x,y have the
+ translations already. If aa0 is aa, then f_str->aa1x,y must be
+ generated.
+
+ This is the last time that aa0 can be nt or aa; in all lower
+ functions (fx_malign, do_fastx, fx_walign), both aa0, n0 and aa1,
+ n1 are amino acids; though one or the other may be translated.
+
+ In the lower functions, yaa can be aa0y (FASTX) or aa1y (TFASTX).
+ If it is aa1y, there may be no translation available.
+
+ 21-Nov-2010 With fasta-36.3.1, do_walign() uses const aa0, aa1. If aa1 needs
+ modification for recursive alignment, a copy is made.
+*/
+
+/* do_walign() - build the alignment chain for one sequence pair.
+
+   Four ppst->param_u.fa settings (use_E_thresholds, optflag, optcut,
+   optwid) are saved, overridden for the fx_malign() call
+   (use_E_thresholds=0, optflag=1, optcut=0, optwid doubled unless
+   optwid_set) and restored before returning.
+
+   aa0,n0 / aa1,n1 - the two sequences; FASTX hands f_str->aa0x/aa0y
+                     to fx_malign(), TFASTX first translates aa1 into
+                     f_str->aa1x/aa1y with saatran()
+   frame           - TFASTX translates frames 3*frame .. 3*frame+2
+   repeat_thresh   - passed through to fx_malign()
+   have_ares       - set to 0x3 (0x2 bit indicates a local copy)
+
+   Returns the chain returned by fx_malign(), with ->index numbered
+   0,1,2,... along ->next, or NULL if the first a_res cannot be
+   allocated (the ppst settings are untouched in that case).
+*/
+struct a_res_str *
+do_walign (const unsigned char *aa0, int n0,
+           const unsigned char *aa1, int n1,
+           int frame, int repeat_thresh,
+           struct pstruct *ppst,
+           struct f_struct *f_str,
+           int *have_ares)
+{
+  int use_E_thresholds_s, optflag_s, optcut_s, optwid_s;
+  struct a_res_str *a_res, *tmp_a_res;
+  int a_res_index;
+#ifdef TFAST
+  /* only the TFASTX path translates aa1, so only it needs these */
+  int last_n1, itx, n10, iphase;
+  unsigned char *xaa, *fs, *fd;
+#endif
+#ifdef DEBUG
+  unsigned long adler32_crc;
+#endif
+
+  *have_ares = 0x3;	/* set 0x2 bit to indicate local copy */
+
+  if ((a_res = (struct a_res_str *)calloc(1, sizeof(struct a_res_str)))==NULL) {
+    fprintf(stderr," [do_walign] Cannot allocate a_res\n");
+    return NULL;
+  }
+
+#ifdef DEBUG
+  adler32_crc = adler32(1L,aa1,n1);
+#endif
+
+  /* save the search parameters that are overridden for the alignment */
+  use_E_thresholds_s = ppst->param_u.fa.use_E_thresholds;
+  optflag_s = ppst->param_u.fa.optflag;
+  optcut_s = ppst->param_u.fa.optcut;
+  optwid_s = ppst->param_u.fa.optwid;
+  ppst->param_u.fa.use_E_thresholds = 0;
+  ppst->param_u.fa.optflag = 1;
+  ppst->param_u.fa.optcut = 0;
+  if (!ppst->param_u.fa.optwid_set) {
+    ppst->param_u.fa.optwid *= 2;
+  }
+
+#ifndef TFAST	/* FASTX */
+  a_res = fx_malign(f_str->aa0x, n0, aa1, n1, f_str->aa0y, frame,
+                    repeat_thresh, f_str->max_res,
+                    ppst, f_str, a_res, 1);
+#else		/* TFASTX */
+  /* aa0 has a protein sequence */
+  /* aa1 has a raw DNA sequence */
+
+  last_n1 = 0;
+  xaa = f_str->aa1x;
+  for (itx = frame*3; itx < frame*3+3; itx++) {
+    n10 = saatran(aa1,&xaa[last_n1],n1,itx);
+    last_n1 += n10+1;
+  }
+  n10 = last_n1-1;
+  /* create aa1y from xaa: the three EOSEQ-terminated runs in xaa are
+     interleaved, run 'iphase' going to aa1y[iphase], [iphase+3], ... */
+  for (fs=xaa,iphase=0; iphase <3; iphase++,fs++) {
+    for (fd= &f_str->aa1y[iphase]; *fs!=EOSEQ; fd += 3, fs++) *fd = *fs;
+    *fd=EOSEQ;
+  }
+  f_str->have_yaa = 1;
+
+  a_res = fx_malign(aa0, n0, xaa, n10, f_str->aa1y, frame,
+                    repeat_thresh, f_str->max_res,
+                    ppst, f_str, a_res, 1);
+#endif
+  /*
+  if (a_res->res[0] != 3) {
+    fprintf(stderr, "*** alignment does not start with match: %d\n",a_res->res[0]);
+  }
+  */
+
+#ifdef DEBUG
+  if (adler32(1L,aa1,n1) != adler32_crc) {
+    fprintf(stderr,"[dropfx.c/do_walign] adler32_crc mismatch n1: %d\n",n1);
+  }
+#endif
+
+  /* number the alignments in chain order */
+  a_res_index = 0;
+  for (tmp_a_res=a_res; tmp_a_res; tmp_a_res = tmp_a_res->next) {
+    tmp_a_res->index = a_res_index++;
+  }
+
+  /* put the caller's search parameters back */
+  ppst->param_u.fa.use_E_thresholds = use_E_thresholds_s;
+  ppst->param_u.fa.optflag = optflag_s;
+  ppst->param_u.fa.optcut = optcut_s;
+  ppst->param_u.fa.optwid = optwid_s;
+  return a_res;
+}
+
+/* aln_func_vals - fill in aln->qlfact, qlrev, llfact, llmult, llrev and
+   aln->frame.  The factor of 3 and the frame-dependent reverse flag go
+   to the ql* fields for FASTX and to the ll* fields for TFASTX. */
+/* called from calcons, calc_id, calc_code */
+void
+aln_func_vals(int frame, struct a_struct *aln) {
+  int rev_flag;
+
+  /* frame > 0 turns the reverse flag on */
+  rev_flag = (frame > 0) ? 1 : 0;
+
+  aln->frame = frame;
+  aln->llmult = 1;
+
+#ifdef TFAST	/* TFASTX */
+  aln->qlfact = 1;
+  aln->llfact = 3;
+  aln->qlrev = 0;
+  aln->llrev = rev_flag;
+#else		/* FASTX */
+  aln->qlfact = 3;
+  aln->llfact = 1;
+  aln->qlrev = rev_flag;
+  aln->llrev = 0;
+#endif
+}
+
+/* pre_cons - needed by programs such as tfastx/y/s, which translate a
+   DNA sequence and keep the result in f_str->aa1x / f_str->aa1y.
+   Without TFAST the function does nothing.
+*/
+
+void
+pre_cons(const unsigned char *aa1, int n1, int frame, struct f_struct *f_str) {
+#ifdef TFAST
+  int next_off, tr_frame, tr_len, phase;
+  unsigned char *src, *dst;
+
+  /* translate frames 3*frame .. 3*frame+2 one after another into aa1x;
+     each translation starts one position past the end of the previous */
+  next_off = 0;
+  tr_frame = 3*frame;
+  while (tr_frame < 3*frame + 3) {
+    tr_len = saatran(aa1, &f_str->aa1x[next_off], n1, tr_frame);
+    next_off += tr_len + 1;
+    tr_frame++;
+  }
+
+  /* create aa1y from aa1x: the EOSEQ-terminated run for each phase is
+     spread over aa1y[phase], aa1y[phase+3], ... and closed with EOSEQ */
+  src = f_str->aa1x;
+  for (phase = 0; phase < 3; phase++) {
+    dst = &f_str->aa1y[phase];
+    while (*src != EOSEQ) {
+      *dst = *src;
+      dst += 3;
+      src++;
+    }
+    *dst = EOSEQ;
+    src++;	/* step over the EOSEQ that ended this run */
+  }
+  f_str->have_yaa = 1;
+#endif
+}
+
+/*
+ Alignment: stores the operations that align the protein and DNA sequences.
+ The codes stored in the array are as follows:
+ 0: deletion of an amino acid.
+ 2: frame shift, 2 nucleotides match with an amino acid
+ 3: match an amino acid with a codon
+ 4: the other type of frame shift
+ 5: deletion of a codon
+
+ The first two elements of the array store the starting points
+ in the protein and DNA sequences in the local alignment.
+*/
+
+#include "a_mark.h"
+/* Helpers used by calc_cons_a() and calc_code() but not defined in this part of the file. */
+extern int align_type(int score, char sp0, char sp1, int nt_align, struct a_struct *aln, int pam_x_id_sim);
+/* process_annot_match(): called ahead of the alignment loop for annotation entries labelled '-' that begin before, and end at or after, the first aligned position. */
+extern void
+process_annot_match(int *itmp, int *pam2aa0v,
+ long ip, long ia, char *sp1, char *sp1a, const unsigned char *sq,
+ struct annot_entry *annot_arr_p, char **ann_comment,
+ void *annot_stack, int *have_push_features, int *v_delta,
+ int *d_score_p, int *d_ident_p, int *d_alen_p, struct domfeat_link **left_domain_p,
+ long *left_end_p, int init_score);
+/* next_annot_match(): called when the current position equals the next annotation's pos (or the pending left end); callers store its return value back into their annotation index. */
+extern int
+next_annot_match(int *itmp, int *pam2aa0v,
+ long ip, long ia, char *sp1, char *sp1a, const unsigned char *sq,
+ int i_annot, int n_annot, struct annot_entry **annot_arr, char **ann_comment,
+ void *annot_stack, int *have_push_features, int *v_delta,
+ int *d_score_p, int *d_ident_p, int *d_alen_p, struct domfeat_link **left_domain,
+ long *left_domain_end, int init_score);
+/* close_annot_match(): called after the alignment loop (with ia == -1) when a left end is still pending. */
+extern void
+close_annot_match (int ia, void *annot_stack, int *have_push_features,
+ int *d_score_p, int *d_ident_p, int *d_alen_p,
+ struct domfeat_link **left_domain_p,
+ long *left_end_p, int init_score);
+/* comment_var(): called when next_annot_match() has replaced the displayed residue, i.e. when sq[...] no longer equals the character in sp1. */
+extern void
+comment_var(long i0, char sp0, long i1, char sp1, char o_sp1, char sim_char,
+ const char *ann_comment, struct dyn_string_str *annot_var_dyn,
+ int target, int d_type);
+/* display_push_features(): called whenever have_push_features is set; the callers clear the flag afterwards. */
+void
+display_push_features(void *annot_stack, struct dyn_string_str *annot_var_dyn,
+ long i0_pos, char sp0, long i1_pos, char sp1, char sym,
+ int score, double comp, int n0, int n1,
+ void *pstat_void, int d_type);
+/* DP_FULL_FMT is passed as the d_type argument of display_push_features(); Q_TARGET/L_TARGET presumably name the 'target' argument of comment_var() - TODO confirm, they are not used in this part of the file. */
+#define DP_FULL_FMT 1 /* Region: score: bits: id: ... */
+#define Q_TARGET 0
+#define L_TARGET 1
+/* seq_pos() is always called here as seq_pos(index, aln->qlrev or aln->llrev, 0). */
+int seq_pos(int pos, int rev, int off);
+/* calc_cons_a - expand the encoded alignment in a_res->res into
+   displayable strings.
+
+   The main loop consumes a_res->nres op codes (0: aa insertion, 2: -1
+   frameshift, 3: codon/aa match, 4: +1 frameshift, 5: codon insertion)
+   and, for every output column, writes
+     seqc0/seqc1     - the aligned sequence characters (through sp0/sp1;
+                       which buffer each pointer uses depends on TFAST)
+     seqca           - M_DEL or the code returned by align_type()
+     cumm_seq_score  - a per-column score, only when non-NULL
+     seqc0a/seqc1a   - annotation characters, only when have_ann
+   Variant/region annotation text is collected in annot_var_dyn.
+
+   On return aln->amin0/1, smin0/1, amax0/1, ngap_q, ngap_l, nfs and
+   calc_last_set are set, *score_delta holds v_delta and *nc holds lenc.
+   The return value is lenc + not_c; not_c counts the extra column that
+   each frameshift (op 2 or 4) adds.
+*/
+int
+calc_cons_a(const unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1,
+ int *nc,
+ struct a_struct *aln,
+ struct a_res_str *a_res,
+ struct pstruct *ppst,
+ char *seqc0, char *seqc1, char *seqca, int *cumm_seq_score,
+ const unsigned char *ann_arr,
+ const unsigned char *aa0a, const struct annot_str *annot0_p, char *seqc0a,
+ const unsigned char *aa1a, const struct annot_str *annot1_p, char *seqc1a,
+ int *score_delta,
+ struct dyn_string_str *annot_var_dyn,
+ struct f_struct *f_str,
+ void *pstat_void)
+{
+ int i0, i1, i, j; /* i0 indexes ap0, i1 indexes ap1; i and j are not used */
+ int lenc, not_c, itmp, ngap_p, ngap_d, nfs;
+ int *i_spa;
+ char *sp0, *sp0a, *sp1, *sp1a, *spa, t_spa;
+ const unsigned char *sq;
+
+ const unsigned char *ap0, *ap1;
+ const unsigned char *ap1a; /* ap1 always points to protein, and
+ only protein has annotations */
+ int *rp, *rpmax;
+ int have_ann = 0;
+
+ /* variables for variant changes/region scores */
+ char tmp_str[MAX_LSTR];
+ void *annot_stack;
+ int have_push_features, prev_match;
+ char *sim_sym = aln_map_sym[MX_ACC];
+ struct annot_entry **s_annot1_arr_p;
+ int i1_annot, v_delta, v_tmp;
+ long i0_offset, i1_offset;
+
+ long i1_left_end;
+ int d1_score, d1_ident, d1_alen;
+ struct domfeat_link *left_domain_list1;
+
+ char *ann_comment;
+
+ *score_delta = 0;
+ d1_score = d1_ident = d1_alen = 0;
+ i1_left_end = -1;
+ left_domain_list1 = NULL;
+
+ NULL_dyn_string(annot_var_dyn);
+/* pick the residue-to-character table: ppst->sqx when ext_sq_set, else ppst->sq */
+ if (ppst->ext_sq_set) {sq = ppst->sqx;}
+ else {sq = ppst->sq;}
+
+#ifndef TFAST /* FASTX */
+ aln->amin1 = aln->smin1 = a_res->min0; /* prot */
+ aln->amin0 = aln->smin0 = a_res->min1; /* DNA */
+
+ i0_offset = aln->q_offset;
+ i1_offset = aln->l_offset;
+
+ ap0 = f_str->aa0y; /* translated DNA */
+ ap1 = aa1; /* protein */
+
+ sp0 = seqc0; /* translated DNA */
+ sp1 = seqc1; /* protein */
+
+ have_ann = (seqc0a !=NULL && aa1a != NULL);
+ ap1a = aa1a;
+ sp1a = seqc1a; /* protein library can have annotation */
+ sp0a = seqc0a; /* sp0a is always ' ' - no translated
+ annotation */
+#else /* TFASTX */
+ aln->amin0 = aln->smin0 = a_res->min0; /* DNA */
+ aln->amin1 = aln->smin1 = a_res->min1; /* prot */
+
+ i1_offset = aln->q_offset;
+ i0_offset = aln->l_offset;
+
+ ap1 = aa0; /* aa0 is protein */
+ /* with fx_malign(), there is no guarantee that we have a valid f_str->aa1y, so make one */
+ pre_cons(aa1,n1,aln->frame, f_str);
+ ap0 = f_str->aa1y; /* aa1 is DNA */
+ sp1 = seqc0; /* sp1 points to protein query */
+ sp0 = seqc1; /* sp0 points to DNA */
+
+ have_ann = (seqc0a !=NULL && aa0a != NULL);
+ ap1a = aa0a;
+ sp1a = seqc0a; /* protein query can have annotation */
+ sp0a = seqc1a; /* sp0a is always ' ' - no translated
+ annotation */
+#endif
+ spa = seqca;
+ if (cumm_seq_score) i_spa = cumm_seq_score; /* i_spa stays unset otherwise; every later use is guarded by cumm_seq_score */
+/* rp walks the op codes, rpmax is one past the last of the nres codes */
+ rp = a_res->res;
+ rpmax = &a_res->res[a_res->nres];
+
+ lenc = not_c = aln->nident = aln->nmismatch = aln->nsim = aln->npos = ngap_p = ngap_d = nfs= 0;
+ i0 = a_res->min1;
+ i1 = a_res->min0;
+
+ v_delta = 0;
+ i1_annot = 0;
+ annot_stack = NULL;
+ s_annot1_arr_p = NULL;
+ have_push_features = prev_match = 0;
+ if (have_ann) {
+ if (annot1_p && annot1_p->n_annot > 0) annot_stack = init_stack(64,64);
+ if (annot1_p && annot1_p->n_annot > 0) {
+ s_annot1_arr_p = annot1_p->s_annot_arr_p;
+/* skip annotations that end before the alignment starts; hand '-' entries that overlap the start to process_annot_match() */
+ while (i1_annot < annot1_p->n_annot) {
+ if (s_annot1_arr_p[i1_annot]->pos >= i1+i1_offset) {break;}
+ if (s_annot1_arr_p[i1_annot]->end < i1+i1_offset) {i1_annot++; continue;}
+
+ if (s_annot1_arr_p[i1_annot]->label == '-') {
+ process_annot_match(&itmp, NULL, i1_offset+seq_pos(i1,aln->llrev,0), i0_offset + seq_pos(i0,aln->qlrev,0),
+ sp1, sp1a, sq, s_annot1_arr_p[i1_annot], &ann_comment,
+ annot_stack, &have_push_features, &v_delta,
+ &d1_score, &d1_ident, &d1_alen, &left_domain_list1, &i1_left_end, 0);
+ }
+ i1_annot++;
+ }
+ }
+ }
+/* NOTE(review): inside the loop s_annot1_arr_p[i1_annot] is read without checking i1_annot < n_annot - presumably the array has a sentinel entry; confirm */
+ while (rp < rpmax) {
+ /* fprintf(stderr,"%d %d %d (%c) %d (%c)\n"
+ ,(int)(rp-res),*rp,i0,sq[ap0[i0]],i1,sq[ap1[i1]]);
+ */
+ switch (*rp++) {
+ case 0: /* aa insertion */
+ *sp0++ = '-';
+ *sp1++ = sq[ap1[i1]];
+ *spa++ = M_DEL;
+ if (cumm_seq_score) {
+ if (prev_match) *i_spa = ppst->gdelval;
+ *i_spa++ += ppst->ggapval; /* NOTE(review): when prev_match is 0 this adds to a slot that was not assigned here; case 5 uses '=' instead of '+=' */
+ }
+
+ if (have_ann) {
+ *sp0a = ' ';
+ *sp1a = ann_arr[ap1a[i1]];
+ if (s_annot1_arr_p) {
+ if (i1+i1_offset == s_annot1_arr_p[i1_annot]->pos || i1+i1_offset == i1_left_end) {
+
+ i1_annot = next_annot_match(&itmp, ppst->pam2[0][ap0[i0]], i1_offset+seq_pos(i1,aln->llrev,0),
+ i0_offset+seq_pos(i0,aln->qlrev,0), sp1, sp1a, sq,
+ i1_annot, annot1_p->n_annot, s_annot1_arr_p,
+ &ann_comment, annot_stack, &have_push_features, &v_delta,
+ &d1_score, &d1_ident, &d1_alen, &left_domain_list1, &i1_left_end,
+ ppst->ggapval+ppst->gdelval
+ );
+ }
+
+ if (prev_match) d1_score += ppst->gdelval;
+ d1_score += ppst->ggapval;
+ d1_alen++;
+ prev_match = 0;
+ }
+ sp0a++; sp1a++;
+ }
+/* NOTE(review): sp0, sp1 and spa were already advanced above, so *sp0, *sp1 and *spa below refer to the next, not yet written, column */
+ if (have_ann && have_push_features) {
+ display_push_features(annot_stack, annot_var_dyn,
+ i0_offset+seq_pos(i0,aln->qlrev,0), *sp0,
+ i1_offset+seq_pos(i1,aln->llrev,0), *sp1,
+ sim_sym[*spa],
+ a_res->rst.score[ppst->score_ix], a_res->rst.comp, n0, n1, pstat_void, DP_FULL_FMT);
+ have_push_features = 0;
+ }
+
+ i1++;
+ lenc++;
+ ngap_d++;
+ break;
+ case 2: /* -1 frameshift, which is treated as an insertion/match for annotations */
+ nfs++;
+ /* frameshifts produce a two-character alignment string */
+ /* first annotate the frameshift (first character) */
+ *sp0++ = '/';
+ i0 -= 1;
+ *sp1++ = '-';
+ *spa++ = M_DEL;
+
+ if (have_ann) {*sp0a++ = *sp1a++ = ' ';}
+ not_c++;
+
+ /* then annotate the match after the frameshift */
+
+ itmp=ppst->pam2[0][ap0[i0]][ap1[i1]];
+ *sp0 = sq[ap0[i0]];
+ *sp1 = sq[ap1[i1]];
+
+ if (cumm_seq_score) *i_spa++ = ppst->gshift;
+
+ if (have_ann) {
+ have_push_features = 0;
+ /* this simple strategy works because the coordinate system
+ for the alignment is reversed appropriately */
+ *sp1a = ann_arr[ap1a[i1]];
+ *sp0a = ' ';
+ if (s_annot1_arr_p) {
+ /* coordinates are much more complex for next_annot_match,
+ and comment_var, because they may need to be reversed */
+
+ if (i1+i1_offset == s_annot1_arr_p[i1_annot]->pos || i1+i1_offset == i1_left_end) {
+ i1_annot = next_annot_match(&itmp, ppst->pam2[0][ap0[i0]], i1_offset+seq_pos(i1,aln->llrev,0),
+ i0_offset+seq_pos(i0,aln->qlrev,0), sp1, sp1a, sq,
+ i1_annot, annot1_p->n_annot, s_annot1_arr_p,
+ &ann_comment, annot_stack, &have_push_features, &v_delta,
+ &d1_score, &d1_ident, &d1_alen, &left_domain_list1, &i1_left_end,0);
+
+ if (sq[ap1[i1]] != *sp1) {
+ t_spa = align_type(itmp, *sp0, *sp1, 0, NULL, ppst->pam_x_id_sim);
+
+ comment_var(i0_offset+seq_pos(i0,aln->qlrev,0), *sp0,
+ i1_offset+seq_pos(i1,aln->llrev,0), *sp1,
+ sq[ap1[i1]], sim_sym[t_spa], ann_comment,
+ annot_var_dyn,1,1);
+ }
+ }
+ d1_score += ppst->gshift;
+ d1_score += itmp;
+ prev_match = 1;
+ }
+ sp0a++; sp1a++;
+ }
+
+ *spa = align_type(itmp, *sp0, *sp1, 0, aln, ppst->pam_x_id_sim);
+
+ d1_alen++;
+ if (*spa == M_IDENT) {d1_ident++;}
+
+ if (have_ann && have_push_features) {
+ display_push_features(annot_stack, annot_var_dyn,
+ i0_offset+seq_pos(i0,aln->qlrev,0), *sp0,
+ i1_offset+seq_pos(i1,aln->llrev,0), *sp1,
+ sim_sym[*spa],
+ a_res->rst.score[ppst->score_ix], a_res->rst.comp, n0, n1, pstat_void, DP_FULL_FMT);
+ have_push_features = 0;
+ }
+
+ if (cumm_seq_score) *i_spa++ = itmp;
+ i0 += 3;
+ i1++;
+
+ sp0++; sp1++; spa++;
+ lenc++;
+ break;
+ case 3: /* codon/aa match */
+ itmp=ppst->pam2[0][ap0[i0]][ap1[i1]];
+ *sp0 = sq[ap0[i0]];
+ *sp1 = sq[ap1[i1]];
+
+ if (have_ann) {
+ *sp1a = ann_arr[ap1a[i1]];
+ *sp0a = ' ';
+ if (s_annot1_arr_p) {
+ if (i1+i1_offset == s_annot1_arr_p[i1_annot]->pos || i1+i1_offset == i1_left_end) {
+ i1_annot = next_annot_match(&itmp, ppst->pam2[0][ap0[i0]], i1_offset+seq_pos(i1,aln->llrev,0),
+ i0_offset+seq_pos(i0,aln->qlrev,0), sp1, sp1a, sq,
+ i1_annot, annot1_p->n_annot, s_annot1_arr_p,
+ &ann_comment, annot_stack, &have_push_features, &v_delta,
+ &d1_score, &d1_ident, &d1_alen, &left_domain_list1, &i1_left_end,0);
+
+ if (sq[ap1[i1]] != *sp1) {
+ t_spa = align_type(itmp, *sp0, *sp1, 0, NULL, ppst->pam_x_id_sim);
+
+ comment_var(i0_offset+seq_pos(i0,aln->qlrev,0), *sp0,
+ i1_offset+seq_pos(i1,aln->llrev,0), *sp1,
+ sq[ap1[i1]], sim_sym[t_spa], ann_comment,
+ annot_var_dyn, 1, 1);
+ }
+ }
+ prev_match = 1;
+ d1_score += itmp;
+ }
+ sp0a++; sp1a++;
+ }
+
+ *spa = align_type(itmp, *sp0, *sp1, 0, aln, ppst->pam_x_id_sim);
+ d1_alen++;
+ if (*spa == M_IDENT) {d1_ident++;}
+
+ if (cumm_seq_score) *i_spa++ = itmp;
+
+ if (have_ann && have_push_features) {
+ display_push_features(annot_stack, annot_var_dyn,
+ i0_offset+seq_pos(i0,aln->qlrev,0), *sp0,
+ i1_offset+seq_pos(i1,aln->llrev,0), *sp1,
+ sim_sym[*spa],
+ a_res->rst.score[ppst->score_ix], a_res->rst.comp, n0, n1, pstat_void, DP_FULL_FMT);
+ have_push_features = 0;
+ }
+
+ i0 += 3;
+ i1++;
+
+ sp0++; sp1++; spa++;
+ lenc++;
+ break;
+ case 4: /* +1 frameshift */
+ nfs++;
+ /* frameshift produces two alignment characters */
+ /* first frameshift */
+ *sp0++ = '\\';
+ i0 += 1;
+ *sp1++ = '-';
+ *spa++ = M_DEL;
+ if (cumm_seq_score) *i_spa++ = ppst->gshift;
+
+ if (have_ann) {*sp1a++ = *sp0a++ = ' ';}
+ not_c++;
+
+ /* then alignment */
+ itmp=ppst->pam2[0][ap0[i0]][ap1[i1]];
+ *sp0 = sq[ap0[i0]];
+ *sp1 = sq[ap1[i1]];
+
+ if (have_ann) {
+ *sp1a = ann_arr[ap1a[i1]];
+ *sp0a = ' ';
+ if (s_annot1_arr_p) {
+ if (i1+i1_offset == s_annot1_arr_p[i1_annot]->pos || i1+i1_offset == i1_left_end) {
+ i1_annot = next_annot_match(&itmp, ppst->pam2[0][ap0[i0]], i1_offset+seq_pos(i1,aln->llrev,0),
+ i0_offset+seq_pos(i0,aln->qlrev,0), sp1, sp1a, sq,
+ i1_annot, annot1_p->n_annot, s_annot1_arr_p,
+ &ann_comment, annot_stack, &have_push_features, &v_delta,
+ &d1_score, &d1_ident, &d1_alen, &left_domain_list1, &i1_left_end,0);
+
+ if (sq[ap1[i1]] != *sp1) {
+ t_spa = align_type(itmp, *sp0, *sp1, 0, NULL, ppst->pam_x_id_sim);
+
+ comment_var(i0_offset+seq_pos(i0,aln->qlrev,0), *sp0,
+ i1_offset+seq_pos(i1,aln->llrev,0), *sp1,
+ sq[ap1[i1]], sim_sym[t_spa], ann_comment,
+ annot_var_dyn, 1, DP_FULL_FMT);
+ }
+ }
+ d1_score += ppst->gshift;
+ d1_score += itmp;
+ prev_match = 1;
+ }
+ sp0a++; sp1a++;
+ }
+
+ *spa = align_type(itmp, *sp0, *sp1, 0, aln, ppst->pam_x_id_sim);
+ d1_alen++;
+ if (*spa == M_IDENT) {d1_ident++;}
+
+ if (cumm_seq_score) *i_spa++ = itmp;
+
+ /* now we have done all the ?modified identity checks, display
+ potential site annotations */
+ if (have_ann && have_push_features) {
+ display_push_features(annot_stack, annot_var_dyn,
+ i0_offset+seq_pos(i0,aln->qlrev,0), *sp0,
+ i1_offset+seq_pos(i1,aln->llrev,0), *sp1,
+ sim_sym[*spa],
+ a_res->rst.score[ppst->score_ix], a_res->rst.comp, n0, n1, pstat_void, DP_FULL_FMT);
+ have_push_features = 0;
+ }
+
+ i0 += 3;
+ i1++;
+
+ sp0++; sp1++; spa++;
+ lenc++;
+ break;
+ case 5: /* codon insertion */
+ if (have_ann) {
+ *sp1a++ = *sp0a++ = ' ';
+ }
+
+ if (cumm_seq_score) {
+ if (prev_match) *i_spa = ppst->gdelval;
+ *i_spa++ = ppst->ggapval; /* NOTE(review): this '=' overwrites the gdelval stored on the line above; case 0 uses '+=' */
+ }
+/* unlike case 0, the d1_score/prev_match update here does not depend on s_annot1_arr_p */
+ if (prev_match) d1_score += ppst->gdelval;
+ d1_score += ppst->ggapval;
+
+ prev_match = 0;
+
+ *sp0++ = sq[ap0[i0]];
+ i0 += 3;
+ *sp1++ = '-';
+ *spa++ = M_DEL;
+ lenc++;
+ ngap_p++;
+ break;
+ }
+ }
+
+ /* done with alignment loop */
+
+ if (have_ann) {
+ *sp0a = *sp1a = '\0';
+ if (s_annot1_arr_p) {
+ have_push_features = 0;
+
+ if (s_annot1_arr_p && i1_left_end > 0) {
+ close_annot_match(-1, annot_stack, &have_push_features,
+ &d1_score, &d1_ident, &d1_alen, &left_domain_list1, &i1_left_end,
+ 0);
+ }
+
+ if (have_push_features) {
+ display_push_features(annot_stack, annot_var_dyn,
+ i0_offset+seq_pos(i0,aln->qlrev,0), *sp0,
+ i1_offset+seq_pos(i1,aln->llrev,0), *sp1,
+ sim_sym[*spa],
+ a_res->rst.score[ppst->score_ix], a_res->rst.comp, n0, n1, pstat_void, DP_FULL_FMT);
+ have_push_features = 0;
+ }
+ }
+ }
+ *spa = '\0';
+/* record where the alignment ended and the gap counts; the 0/1 and q/l assignments swap between FASTX and TFASTX */
+#ifndef TFAST
+ aln->amax0 = i0;
+ aln->amax1 = i1;
+ aln->ngap_q = ngap_d;
+ aln->ngap_l = ngap_p;
+#else
+ aln->amax1 = i0;
+ aln->amax0 = i1;
+ aln->ngap_q = ngap_p;
+ aln->ngap_l = ngap_d;
+#endif
+ aln->calc_last_set = 1;
+
+ aln->nfs = nfs;
+
+ *score_delta = v_delta;
+/* NOTE(review): this block repeats the close_annot_match()/display_push_features() step performed right after the loop, this time using amax0-1/amax1-1 as positions */
+ if (have_ann) {
+ *sp0a = *sp1a = '\0';
+ have_push_features = 0;
+ /* check for left ends after alignment */
+ if (annot1_p && i1_left_end > 0) {
+ close_annot_match(-1, annot_stack, &have_push_features,
+ &d1_score, &d1_ident, &d1_alen, &left_domain_list1, &i1_left_end,
+ 0);
+ }
+
+ if (have_push_features) {
+ display_push_features(annot_stack, annot_var_dyn,
+ i0_offset + seq_pos(aln->amax0-1,aln->qlrev,0), *sp0,
+ i1_offset + seq_pos(aln->amax1-1,aln->llrev,0), *sp1,
+ sim_sym[*spa],
+ a_res->rst.score[ppst->score_ix], a_res->rst.comp, n0, n1, pstat_void, DP_FULL_FMT);
+ }
+/* NOTE(review): annot_stack is still NULL here when annot1_p has no annotations - presumably free_stack() accepts NULL; confirm */
+ free_stack(annot_stack);
+ }
+
+ if (lenc < 0) lenc = 1;
+ *nc = lenc;
+/* now we have the middle, get the right end */
+ return lenc+not_c;
+}
+
+/* calc_astruct - copy the alignment boundaries (min0/max0, min1/max1)
+   from a_res_p into aln_p and clear aln_p->calc_last_set.  For FASTX
+   the 0 and 1 indices are exchanged on the way; for TFASTX they are
+   copied straight across.  f_str is not used. */
+void
+calc_astruct(struct a_struct *aln_p, struct a_res_str *a_res_p, struct f_struct *f_str) {
+
+#ifdef TFAST	/* TFASTX: same index on both sides */
+  aln_p->amin0 = a_res_p->min0;
+  aln_p->amax0 = a_res_p->max0;
+  aln_p->amin1 = a_res_p->min1;
+  aln_p->amax1 = a_res_p->max1;
+#else		/* FASTX: a_res index 0 goes to aln index 1 and vice versa */
+  aln_p->amin0 = a_res_p->min1;
+  aln_p->amax0 = a_res_p->max1;
+  aln_p->amin1 = a_res_p->min0;
+  aln_p->amax1 = a_res_p->max0;
+#endif
+
+  aln_p->calc_last_set = 0;
+}
+
+/* build an array of match/ins/del - length strings */
+
+/* modified 10-June-2014 to distinguish matches from mismatches, op=1
+ (previously unused) indicates an aligned non-identity */
+
+/* op_codes are: 0 - aa insertion
+ 1 - (now) aligned non-identity
+ 2 - -1 frameshift
+ 3 - aligned identity
+ 4 - +1 frameshift
+ 5 - codon insertion
+*/
+
+/* init_update_data - allocate the state used while an encoded
+   alignment string is built with update_code().
+
+   show_code - display flags: (show_code & SHOW_CODE_MASK) ==
+               SHOW_CODE_CIGAR selects cigar_code and count-first
+               output (cigar_order = 1), anything else selects ori_code;
+               the SHOW_CODE_EXT bit turns on show_ext.
+
+   Returns a calloc()'ed update_code_str (close_update_data() frees
+   it), or NULL, after a message on stderr, when calloc() fails.
+*/
+static struct update_code_str *
+init_update_data(int show_code) {	/* the parameter type was missing (implicit int), which C99 no longer allows */
+
+  struct update_code_str *update_data_p;
+
+  if ((update_data_p = (struct update_code_str *)calloc(1,sizeof(struct update_code_str)))==NULL) {
+    fprintf(stderr,"*** error [%s:%d] - init_update_data(): cannot allocate update_code_str\n",
+	    __FILE__, __LINE__);
+    return NULL;
+  }
+
+  /* no run is pending yet */
+  update_data_p->p_op_cnt = 0;
+  update_data_p->show_code = show_code;
+
+  if ((show_code & SHOW_CODE_MASK) == SHOW_CODE_CIGAR) {
+    update_data_p->op_map = cigar_code;
+    update_data_p->cigar_order = 1;
+  }
+  else {
+    update_data_p->op_map = ori_code;
+    update_data_p->cigar_order = 0;
+  }
+
+  if ((show_code & SHOW_CODE_EXT) == SHOW_CODE_EXT) {
+    update_data_p->show_ext = 1;
+  }
+  else {
+    update_data_p->show_ext = 0;
+  }
+
+  return update_data_p;
+}
+
+/* close_update_data - append the run that is still pending in up_dp
+   (p_op_idx, p_op_cnt) to al_str and free up_dp.
+
+   al_str     - NUL-terminated string that is being built
+   al_str_max - passed on as the strncat() limit; presumably the space
+                left in al_str including the terminating NUL, as in the
+                update_code() calls - TODO confirm against the caller
+   up_dp      - state from init_update_data(); NULL is accepted and
+                makes the call a no-op
+*/
+static void
+close_update_data(char *al_str, int al_str_max,
+		  struct update_code_str *up_dp) {
+  char tmp_cnt[MAX_SSTR];
+
+  if (!up_dp) return;
+
+  /* sprintf_code() writes nothing for an empty run, so start from an
+     empty string rather than appending uninitialised bytes */
+  tmp_cnt[0] = '\0';
+  sprintf_code(tmp_cnt,up_dp, up_dp->p_op_idx, up_dp->p_op_cnt);
+
+  /* strncat() copies up to n characters and then adds a NUL, so one
+     byte of the limit has to be kept for the terminator */
+  if (al_str_max > 1) {
+    strncat(al_str,tmp_cnt,(size_t)(al_str_max-1));
+  }
+
+  free(up_dp);
+}
+
+/* update_indel_code() has been modified to work more correctly with
+ ggsearch/glsearch, which, because alignments can start with either
+ insertions or deletions, can produce an initial code of "0=". When
+ that happens, it is ignored and no code is added.
+
+ *al_str - alignment string [al_str_max] - not dynamic
+ op -- encoded operation, currently 0=match, 1-delete, 2-insert, 3-term-match, 4-mismatch
+ op_cnt -- length of run
+ show_code -- SHOW_CODE_CIGAR uses cigar_code, otherwise legacy
+*/
+
+/* update_indel_code() is called for insertions and deletions
+ update_match_code() is called for every match
+*/
+
+/* sprintf_code - format one run of an alignment operation.
+
+   tmp_str - output buffer; always left holding a valid string
+   up_dp   - supplies op_map[] (operation characters) and cigar_order
+   op_idx  - index into up_dp->op_map
+   op_cnt  - run length; 0 yields the empty string
+
+   With cigar_order set the count precedes the operation character
+   ("%d%c"), otherwise the character comes first ("%c%d").
+*/
+static void
+sprintf_code(char *tmp_str, struct update_code_str *up_dp, int op_idx, int op_cnt) {
+
+  /* callers strncat() tmp_str unconditionally, so it must be a valid
+     (empty) string even when there is nothing to report */
+  tmp_str[0] = '\0';
+  if (op_cnt == 0) return;
+
+  if (up_dp->cigar_order) {
+    sprintf(tmp_str,"%d%c",op_cnt,up_dp->op_map[op_idx]);
+  }
+  else {
+    sprintf(tmp_str,"%c%d",up_dp->op_map[op_idx],op_cnt);
+  }
+}
+
+/* update_code_cat - append code to al_str without writing more than
+   room bytes (terminating NUL included); returns the room that is left */
+static int
+update_code_cat(char *al_str, int room, const char *code) {
+  int n_code = (int)strlen(code);
+
+  if (room <= 1 || n_code == 0) return room;
+  if (n_code > room - 1) n_code = room - 1;
+  strncat(al_str, code, (size_t)n_code);
+  return room - n_code;
+}
+
+/* update_code - record one alignment operation in the encoded
+   alignment string.
+
+   al_str     - NUL-terminated string that is being built
+   al_str_max - space left in al_str, including the terminating NUL
+                (calc_code() passes al_str_n - strlen(al_str))
+   up_dp      - run state from init_update_data(); NULL makes the call
+                a no-op
+   op         - operation code:
+                2, 4 (frameshifts) are one-time: the pending run is
+                     written, then the frameshift with count 1
+                0, 5 (insertions) extend the pending run when it has
+                     the same code, otherwise the pending run is
+                     written and a new one is started
+                1, 3 (aligned residues) behave like 0/5 after the code
+                     has been adjusted: with show_ext a non-identical
+                     pair becomes 1; a pair containing '*' becomes 6
+                     unless cigar_order is set
+   sim_code   - align_type() result for the pair, compared with M_IDENT
+   sp0, sp1   - the two aligned characters
+*/
+static void
+update_code(char *al_str, int al_str_max,
+	    struct update_code_str *up_dp, int op,
+	    int sim_code, unsigned char sp0, unsigned char sp1)
+{
+  char tmp_cnt[MAX_SSTR];
+
+  /* init_update_data() returns NULL when it cannot allocate */
+  if (!up_dp) return;
+
+  /* sprintf_code() may leave the buffer untouched for an empty run */
+  tmp_cnt[0] = '\0';
+
+  /* there are two kinds of "op's", one time and accumulating */
+  /* op == 2, 4 are one-time: */
+
+  switch (op) {
+  case 2:
+  case 4:
+    sprintf_code(tmp_cnt,up_dp, up_dp->p_op_idx,up_dp->p_op_cnt);
+    al_str_max = update_code_cat(al_str, al_str_max, tmp_cnt);
+    tmp_cnt[0] = '\0';
+    sprintf_code(tmp_cnt,up_dp, op, 1);
+    /* al_str grew above, so the second append uses the reduced room */
+    al_str_max = update_code_cat(al_str, al_str_max, tmp_cnt);
+    up_dp->p_op_cnt = 0;
+    break;
+  case 0:
+  case 5:
+    if (op == up_dp->p_op_idx) {
+      up_dp->p_op_cnt++;
+    }
+    else {
+      sprintf_code(tmp_cnt,up_dp, up_dp->p_op_idx,up_dp->p_op_cnt);
+      al_str_max = update_code_cat(al_str, al_str_max, tmp_cnt);
+      up_dp->p_op_idx = op;
+      up_dp->p_op_cnt = 1;
+    }
+    break;
+  case 1:
+  case 3:
+    if (sp0 != '*' && sp1 != '*') {	/* default case, not termination */
+      if (up_dp->show_ext && sim_code != M_IDENT) { op = 1;}
+    }
+    else {	/* have a termination codon, output for !SHOW_CODE_CIGAR */
+      if (!up_dp->cigar_order) { op = 6;}
+      else if (up_dp->show_ext && (sp0 != sp1)) { op = 1;}
+    }
+
+    if (up_dp->p_op_cnt == 0) {
+      up_dp->p_op_idx = op;
+      up_dp->p_op_cnt = 1;
+    }
+    else if (op != up_dp->p_op_idx) {
+      sprintf_code(tmp_cnt,up_dp, up_dp->p_op_idx,up_dp->p_op_cnt);
+      al_str_max = update_code_cat(al_str, al_str_max, tmp_cnt);
+      up_dp->p_op_idx = op;
+      up_dp->p_op_cnt = 1;
+    }
+    else {
+      up_dp->p_op_cnt++;
+    }
+    break;
+  }
+  return;
+}
+
+int calc_code(const unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1,
+ struct a_struct *aln,
+ struct a_res_str *a_res,
+ struct pstruct *ppst,
+ char *al_str, int al_str_n,
+ const unsigned char *ann_arr,
+ const unsigned char *aa0a,
+ const struct annot_str *annot0_p,
+ const unsigned char *aa1a,
+ const struct annot_str *annot1_p,
+ struct dyn_string_str *annot_code_dyn,
+ int *score_delta,
+ struct f_struct *f_str,
+ void *pstat_void,
+ int display_code)
+{
+ int i0, i1, i, j;
+ int lenc, not_c, itmp, ngap_p, ngap_d, nfs;
+ char op_char[10], ann_ch0, ann_ch1;
+ char sp0, sp1;
+ struct update_code_str *update_data_p;
+ unsigned char *sq;
+ const unsigned char *ap0, *ap1, *ap1a;
+ int *rp, *rpmax;
+
+ int have_ann = 0;
+ char tmp_astr[MAX_STR];
+ int sim_code, t_spa;
+ int show_code, annot_fmt;
+ char *sim_sym= aln_map_sym[MX_ACC];
+ int aa1c;
+ /* variables for variant changes */
+ char tmp_str[MAX_SSTR];
+
+ /* variables for variant changes, regions */
+ void *annot_stack;
+ struct annot_entry **s_annot1_arr_p;
+ int i1_annot, v_delta, v_tmp;
+ long i0_offset, i1_offset;
+ int have_push_features, prev_match;
+
+ long i1_left_end;
+ int d1_score, d1_ident, d1_alen;
+ struct domfeat_link *left_domain_list1;
+
+ *score_delta = 0;
+
+ show_code = (display_code & (SHOW_CODE_MASK + SHOW_CODE_EXT));
+ annot_fmt = 2;
+ if (display_code & SHOW_ANNOT_FULL) {
+ annot_fmt = 1;
+ }
+
+ if (ppst->ext_sq_set) {sq = ppst->sqx;}
+ else {sq = ppst->sq;}
+
+#ifndef TFAST /* FASTX */
+ i0_offset = aln->q_offset;
+ i1_offset = aln->l_offset;
+
+ aln->amin1 = a_res->min0; /* prot, i1 */
+ aln->amin0 = a_res->min1; /* DNA, i0 */
+
+ ap0 = f_str->aa0y;
+ ap1 = aa1;
+ have_ann = (ann_arr[0] != '\0' && aa1a != NULL);
+ ap1a = aa1a;
+#else /* TFASTX */
+ i1_offset = aln->q_offset;
+ i0_offset = aln->l_offset;
+
+ aln->amin0 = a_res->min0; /* DNA, i1 */
+ aln->amin1 = a_res->min1; /* prot, i0 */
+
+ ap1 = aa0;
+ /* with fx_malign(), there is no guarantee that we have a valid f_str->aa1y, so make one */
+ pre_cons(aa1,n1,aln->frame, f_str);
+ ap0 = f_str->aa1y;
+
+ have_ann = (ann_arr[0] != '\0' && aa0a != NULL);
+ ap1a = aa0a;
+#endif
+
+ rp = a_res->res;
+ rpmax = &a_res->res[a_res->nres];
+
+ lenc = not_c = aln->nident = aln->nmismatch = aln->nsim = aln->npos = ngap_p = ngap_d = nfs = 0;
+
+ update_data_p = init_update_data(show_code);
+
+ i0 = a_res->min1;
+ i1 = a_res->min0;
+
+ v_delta = 0;
+ i1_annot = 0;
+ s_annot1_arr_p = NULL;
+ have_push_features = prev_match = 0;
+
+ i1_left_end = 0;
+ left_domain_list1 = NULL;
+ d1_score = d1_ident = d1_alen = 0;
+
+ if (have_ann) {
+
+ if (annot0_p || annot1_p) annot_stack = init_stack(64,64);
+ if (annot1_p && annot1_p->n_annot > 0) {
+ s_annot1_arr_p = annot1_p->s_annot_arr_p;
+ while (i1_annot < annot1_p->n_annot) {
+ if (s_annot1_arr_p[i1_annot]->pos >= i1+i1_offset) {break;}
+ if (s_annot1_arr_p[i1_annot]->end < i1+i1_offset) {i1_annot++; continue;}
+
+ if (s_annot1_arr_p[i1_annot]->label == '-') {
+ process_annot_match(&itmp, NULL, i1_offset+seq_pos(i1,aln->llrev,0), i0_offset + seq_pos(i0,aln->qlrev,0),
+ &sp1, NULL, sq, s_annot1_arr_p[i1_annot], NULL,
+ annot_stack, &have_push_features, &v_delta,
+ &d1_score, &d1_ident, &d1_alen, &left_domain_list1, &i1_left_end, 0);
+ }
+ i1_annot++;
+ }
+ }
+ }
+
+ while (rp < rpmax) {
+ switch (*rp++) {
+ case 0: /* aa insertion */
+ sim_code = 5; /* indel code */
+ update_code(al_str, al_str_n-strlen(al_str), update_data_p, 0, sim_code,'-','-');
+
+ if (s_annot1_arr_p) {
+ if (i1+i1_offset == s_annot1_arr_p[i1_annot]->pos) {
+ i1_annot = next_annot_match(&itmp, ppst->pam2[0][ap0[i0]], i1_offset+seq_pos(i1,aln->llrev,0),
+ i0_offset+seq_pos(i0,aln->qlrev,0), &sp1, NULL, sq,
+ i1_annot, annot1_p->n_annot, s_annot1_arr_p,
+ NULL, annot_stack, &have_push_features, &v_delta,
+ &d1_score, &d1_ident, &d1_alen, &left_domain_list1, &i1_left_end,0);
+ }
+
+ if (prev_match) d1_score += ppst->gdelval;
+ d1_score += ppst->ggapval;
+ d1_alen++;
+ prev_match = 0;
+ }
+
+ /* check for an annotation */
+ if (have_ann && !(ann_arr[ap1a[i1]] == ' ' || ann_arr[ap1a[i1]]=='[' || ann_arr[ap1a[i1]]==']')) {
+#ifndef TFAST
+ sprintf(tmp_astr, "|%ld:%ld:X%c:%c%c%c",
+ i0_offset+seq_pos(i0,aln->qlrev,0)+1,i1_offset+seq_pos(i1,aln->llrev,0)+1,ann_arr[ap1a[i1]],sim_sym[sim_code],sp0,sp1);
+#else
+ sprintf(tmp_astr, "|%ld:%ld:%cX:%c%c%c",
+ i0_offset+seq_pos(i1,aln->llrev,0)+1,i1_offset+seq_pos(i0,aln->qlrev,0)+1,ann_arr[ap1a[i1]],sim_sym[sim_code],sp0,sp1);
+#endif
+ /* SAFE_STRNCAT(annot_code_s, tmp_astr, n_annot_code_s); */
+ dyn_strcat(annot_code_dyn, tmp_astr);
+ }
+
+ if (s_annot1_arr_p && have_push_features) {
+ display_push_features(annot_stack, annot_code_dyn,
+ i0_offset+seq_pos(i0,aln->qlrev,0), sp0,
+ i1_offset+seq_pos(i1,aln->llrev,0), sp1,
+ sim_sym[sim_code],
+ a_res->rst.score[ppst->score_ix], a_res->rst.comp, n0, n1, pstat_void, annot_fmt);
+ have_push_features = 0;
+ }
+
+ i1++;
+ lenc++;
+ ngap_d++;
+ break;
+ case 2: /* -1 frameshift */
+ /* close previous run */
+ update_code(al_str, al_str_n-strlen(al_str), update_data_p, 2, sim_code,'-','-');
+
+ nfs++;
+ i0 -= 1;
+ not_c++;
+
+ aa1c = ap1[i1];
+ itmp = ppst->pam2[0][ap0[i0]][aa1c];
+ sp0 = sq[aa0[i0]];
+ sp1 = sq[aa1c];
+
+ /* variant annot_p annotations can cause substitution */
+ if (s_annot1_arr_p) {
+ if (i1+i1_offset == s_annot1_arr_p[i1_annot]->pos || i1+i1_offset == i1_left_end) {
+ i1_annot = next_annot_match(&itmp, ppst->pam2[0][ap0[i0]], i1_offset+seq_pos(i1,aln->llrev,0),
+ i0_offset+seq_pos(i0,aln->qlrev,0), &sp1, NULL, sq,
+ i1_annot, annot1_p->n_annot, s_annot1_arr_p,
+ NULL, annot_stack, &have_push_features, &v_delta,
+ &d1_score, &d1_ident, &d1_alen, &left_domain_list1, &i1_left_end,0);
+ }
+
+ if (sq[aa1c] != sp1) {
+ t_spa = align_type(itmp, sp0, sp1, 0, NULL, ppst->pam_x_id_sim);
+
+ comment_var(i0_offset+seq_pos(i0,aln->qlrev,0), sp0,
+ i1_offset+seq_pos(i1,aln->llrev,0), sp1,
+ sq[aa1c], sim_sym[t_spa], NULL, annot_code_dyn,
+ 1,annot_fmt);
+ }
+
+ d1_score += ppst->gshift;
+ d1_score += itmp;
+ prev_match = 1;
+ }
+
+ sim_code = align_type(itmp, sp0, sp1, 0, aln, ppst->pam_x_id_sim);
+
+ d1_alen++;
+ if (sim_code == M_IDENT) {d1_ident++;}
+
+ /* check for an annotation */
+ if (have_ann && !(ann_arr[ap1a[i1]] == ' ' || ann_arr[ap1a[i1]]=='[' || ann_arr[ap1a[i1]]==']')) {
+#ifndef TFAST
+ sprintf(tmp_astr, "|X%c:%ld%c%c%ld%c",
+ ann_arr[ap1a[i1]],i0_offset+seq_pos(i0,aln->qlrev,0)+1,sp0,sim_sym[sim_code],i1_offset+seq_pos(i1,aln->llrev,0)+1,sp1);
+#else
+ sprintf(tmp_astr, "|%cX:%ld%c%c%ld%c",
+ ann_arr[ap1a[i1]],i0_offset+seq_pos(i1,aln->llrev,0)+1,sp0,sim_sym[sim_code],i1_offset+seq_pos(i0,aln->qlrev,0)+1,sp1);
+
+#endif
+ /* SAFE_STRNCAT(annot_code_s, tmp_astr, n_annot_code_s); */
+ dyn_strcat(annot_code_dyn, tmp_astr);
+ }
+
+ if (s_annot1_arr_p && have_push_features) {
+ display_push_features(annot_stack, annot_code_dyn,
+ i0_offset+seq_pos(i0,aln->qlrev,0), sp0,
+ i1_offset+seq_pos(i1,aln->llrev,0), sp1,
+ sim_sym[sim_code],
+ a_res->rst.score[ppst->score_ix], a_res->rst.comp, n0, n1, pstat_void, annot_fmt);
+ have_push_features = 0;
+ }
+
+ update_code(al_str, al_str_n-strlen(al_str), update_data_p, 3, sim_code,sp0,sp1);
+
+ if (sim_code == M_IDENT) {
+ aln->nident++;
+ }
+ else {
+ aln->nmismatch++;
+ }
+
+ i0 += 3;
+ i1++;
+ lenc++;
+ break;
+ case 3: /* codon/aa match */
+ aa1c = ap1[i1];
+ itmp = ppst->pam2[0][ap0[i0]][aa1c];
+ sp0 = sq[ap0[i0]];
+ sp1 = sq[aa1c];
+
+ /* variant annot1_p annotations can cause substitution */
+ if (s_annot1_arr_p) {
+ if (i1+i1_offset == s_annot1_arr_p[i1_annot]->pos || i1+i1_offset == i1_left_end) {
+ i1_annot = next_annot_match(&itmp, ppst->pam2[0][ap0[i0]], i1_offset+seq_pos(i1,aln->llrev,0),
+ i0_offset+seq_pos(i0,aln->qlrev,0), &sp1, NULL, sq,
+ i1_annot, annot1_p->n_annot, s_annot1_arr_p,
+ NULL, annot_stack, &have_push_features, &v_delta,
+ &d1_score, &d1_ident, &d1_alen, &left_domain_list1, &i1_left_end,0);
+ }
+
+ if (sq[aa1c] != sp1) {
+ t_spa = align_type(itmp, sp0, sp1, 0, NULL, ppst->pam_x_id_sim);
+
+ comment_var(i0_offset+seq_pos(i0,aln->qlrev,0), sp0,
+ i1_offset+seq_pos(i1,aln->llrev,0), sp1, sq[aa1c],
+ sim_sym[t_spa], NULL, annot_code_dyn,1,annot_fmt);
+ }
+
+ d1_score += itmp;
+ prev_match = 1;
+ }
+
+ sim_code = align_type(itmp, sp0, sp1, 0, aln, ppst->pam_x_id_sim);
+ d1_alen++;
+ if (sim_code == M_IDENT) {d1_ident++;}
+
+ /* check for an annotation */
+ if (have_ann && !(ann_arr[ap1a[i1]] == ' ' || ann_arr[ap1a[i1]]=='[' || ann_arr[ap1a[i1]]==']')) {
+#ifndef TFAST
+ sprintf(tmp_astr, "|X%c:%ld%c%c%ld%c",
+ ann_arr[ap1a[i1]],i0_offset+seq_pos(i0,aln->qlrev,0)+1,sp0,sim_sym[sim_code],i1_offset+seq_pos(i1,aln->llrev,0)+1,sp1);
+#else
+ sprintf(tmp_astr, "|%cX:%ld%c%c%ld%c",
+ ann_arr[ap1a[i1]],i0_offset+seq_pos(i1,aln->llrev,0)+1,sp0,sim_sym[sim_code],i1_offset+seq_pos(i0,aln->qlrev,0)+1,sp1);
+
+#endif
+ /* SAFE_STRNCAT(annot_code_s, tmp_astr, n_annot_code_s); */
+ dyn_strcat(annot_code_dyn, tmp_astr);
+ }
+
+ if (s_annot1_arr_p && have_push_features) {
+ display_push_features(annot_stack, annot_code_dyn,
+ i0_offset+seq_pos(i0,aln->qlrev,0), sp0,
+ i1_offset+seq_pos(i1,aln->llrev,0), sp1,
+ sim_sym[sim_code],
+ a_res->rst.score[ppst->score_ix], a_res->rst.comp, n0, n1, pstat_void, annot_fmt);
+ have_push_features = 0;
+ }
+
+ update_code(al_str, al_str_n-strlen(al_str), update_data_p, 3, sim_code,sp0,sp1);
+
+ i0 += 3;
+ i1++;
+ lenc++;
+ break;
+ case 4: /* +1 frameshift */
+ /* finish previous run */
+ update_code(al_str, al_str_n-strlen(al_str), update_data_p, 4, sim_code,'-','-');
+ /* mark frameshift */
+
+ nfs++;
+ i0 += 1;
+ not_c++;
+
+ aa1c = ap1[i1];
+ itmp = ppst->pam2[0][ap0[i0]][aa1c];
+ sp0 = sq[ap0[i0]];
+ sp1 = sq[aa1c];
+
+ /* variant annot1_p annotations can cause substitution */
+ if (s_annot1_arr_p) {
+ if (i1+i1_offset == s_annot1_arr_p[i1_annot]->pos || i1+i1_offset == i1_left_end) {
+ i1_annot = next_annot_match(&itmp, ppst->pam2[0][ap0[i0]], i1_offset+seq_pos(i1,aln->llrev,0),
+ i0_offset+seq_pos(i0,aln->qlrev,0), &sp1, NULL, sq,
+ i1_annot, annot1_p->n_annot, s_annot1_arr_p,
+ NULL, annot_stack, &have_push_features, &v_delta,
+ &d1_score, &d1_ident, &d1_alen, &left_domain_list1, &i1_left_end,0);
+ }
+
+ if (sq[aa1c] != sp1) {
+ t_spa = align_type(itmp, sp0, sp1, 0, NULL, ppst->pam_x_id_sim);
+
+ comment_var(i0_offset+seq_pos(i0,aln->qlrev,0), sp0,
+ i1_offset+seq_pos(i1,aln->llrev,0), sp1,
+ sq[aa1c], sim_sym[t_spa], NULL,
+ annot_code_dyn,1,annot_fmt);
+ }
+
+ d1_score += itmp;
+ prev_match = 1;
+ }
+
+ sim_code = align_type(itmp, sp0, sp1, 0, aln, ppst->pam_x_id_sim);
+ d1_alen++;
+ if (sim_code == M_IDENT) {d1_ident++;}
+
+ /* check for an annotation */
+ if (have_ann && !(ann_arr[ap1a[i1]] == ' ' || ann_arr[ap1a[i1]]=='[' || ann_arr[ap1a[i1]]==']')) {
+#ifndef TFAST
+ sprintf(tmp_astr, "|%ld:%ld:X%c:%c%c%c",
+ i0_offset+seq_pos(i0,aln->qlrev,0)+1,i1_offset+seq_pos(i1,aln->llrev,0)+1,
+ ann_arr[ap1a[i1]],sim_sym[sim_code],sp0,sp1);
+#else
+ sprintf(tmp_astr, "|%ld:%ld:%cX:%c%c%c",
+ i0_offset+seq_pos(i1,aln->llrev,0)+1,i1_offset+seq_pos(i0,aln->qlrev,0)+1,
+ ann_arr[ap1a[i1]],sim_sym[sim_code],sp0,sp1);
+
+#endif
+ /* SAFE_STRNCAT(annot_code_s, tmp_astr, n_annot_code_s); */
+ dyn_strcat(annot_code_dyn, tmp_astr);
+ }
+
+ if (s_annot1_arr_p && have_push_features) {
+ display_push_features(annot_stack, annot_code_dyn,
+ i0_offset+seq_pos(i0,aln->qlrev,0), sp0,
+ i1_offset+seq_pos(i1,aln->llrev,0), sp1,
+ sim_sym[sim_code],
+ a_res->rst.score[ppst->score_ix], a_res->rst.comp, n0, n1, pstat_void, annot_fmt);
+ have_push_features = 0;
+ }
+
+ update_code(al_str, al_str_n-strlen(al_str), update_data_p, 3, sim_code,sp0,sp1);
+
+ /* start next match/mismatch run */
+ if (sim_code == M_IDENT) {
+ aln->nident++;
+ }
+ else {
+ aln->nmismatch++;
+ }
+
+ i0 += 3;
+ i1++;
+ lenc++;
+ break;
+ case 5: /* codon insertion */
+ sim_code = 5;
+ update_code(al_str, al_str_n-strlen(al_str), update_data_p, 5, sim_code,'-','-');
+
+ if (prev_match) d1_score += ppst->gdelval;
+ d1_score += ppst->ggapval;
+ d1_alen++;
+ prev_match = 0;
+
+ i0 += 3;
+ lenc++;
+ ngap_p++;
+ break;
+ }
+ }
+
+ close_update_data(al_str, al_str_n-strlen(al_str), update_data_p);
+
+#ifndef TFAST
+ aln->amax0 = i0;
+ aln->amax1 = i1;
+ aln->ngap_q = ngap_d;
+ aln->ngap_l = ngap_p;
+#else
+ aln->amax1 = i0;
+ aln->amax0 = i1;
+ aln->ngap_q = ngap_p;
+ aln->ngap_l = ngap_d;
+#endif
+ aln->calc_last_set = 1;
+
+ aln->nfs = nfs;
+
+ if (lenc < 0) lenc = 1;
+
+ if (have_ann) {
+ have_push_features = 0;
+ if (s_annot1_arr_p) {
+ /* also check for regions after alignment */
+ while (i1_annot < annot1_p->n_annot && s_annot1_arr_p[i1_annot]->pos < i1_offset+n1) {
+ if (s_annot1_arr_p[i1_annot]->label == '[') break;
+ if (s_annot1_arr_p[i1_annot]->label == ']') {
+ push_stack(annot_stack, s_annot1_arr_p[i1_annot]);
+ have_push_features = 1;
+ }
+ i1_annot++;
+ }
+ }
+
+ if (have_push_features) {
+ display_push_features(annot_stack, annot_code_dyn,
+ i0_offset+a_res->max0-1, sp0,
+ i1_offset+a_res->max1-1, sp1,
+ sim_sym[sim_code],
+ a_res->rst.score[ppst->score_ix], a_res->rst.comp, n0, n1, pstat_void, annot_fmt);
+ }
+ }
+
+ if (annot0_p || s_annot1_arr_p) free_stack(annot_stack);
+ *score_delta = v_delta;
+
+ return lenc;
+}
+
+/* calc_id() - walk the edit script in a_res->res[] without building an
+   alignment string: sets the alignment bounds, gap and frameshift counts
+   in *aln, appends variant substitutions to annot_var_dyn, stores the
+   variant score change in *score_delta, and returns the op count (lenc). */
+int calc_id(const unsigned char *aa0, int n0,
+            const unsigned char *aa1, int n1,
+            struct a_struct *aln,
+            struct a_res_str *a_res,
+            struct pstruct *ppst,
+            const struct annot_str *annot0_p,
+            const struct annot_str *annot1_p,
+            int *score_delta,
+            struct dyn_string_str *annot_var_dyn,
+            struct f_struct *f_str)
+{
+  int i0, i1;
+  int lenc, not_c, ngap_p, ngap_d, nfs;
+  char sp0, sp1;
+  char tmp_str[MAX_SSTR];
+  unsigned char *sq;
+  const unsigned char *ap0, *ap1;
+  int *rp, *rpmax;
+
+  int aa1c;
+  /* variables for variant changes */
+  struct annot_entry **s_annot1_arr_p;
+  int itmp, i1_annot, v_delta;
+  long i0_offset, i1_offset;
+
+  long i1_left_end;
+  int d1_score = 0, d1_ident = 0, d1_alen = 0; /* only handed to next_annot_match(); never left indeterminate */
+  struct domfeat_link *left_domain_list1;
+
+  *score_delta = 0;
+  i1_left_end = -1;
+  left_domain_list1 = NULL;
+
+  NULL_dyn_string(annot_var_dyn);
+
+  if (ppst->ext_sq_set) {sq = ppst->sqx;}
+  else {sq = ppst->sq;}
+
+#ifndef TFAST /* FASTX */
+  aln->amin1 = a_res->min0; /* prot */
+  aln->amin0 = a_res->min1; /* DNA */
+
+  i0_offset = aln->q_offset;
+  i1_offset = aln->l_offset;
+
+  ap0 = f_str->aa0y;
+  ap1 = aa1;
+#else /* TFASTX */
+  aln->amin0 = a_res->min0; /* DNA */
+  aln->amin1 = a_res->min1; /* prot */
+
+  i1_offset = aln->q_offset;
+  i0_offset = aln->l_offset;
+
+  ap1 = aa0;
+  /* with fx_malign(), there is no guarantee that we have a valid f_str->aa1y, so make one */
+  pre_cons(aa1,n1,aln->frame, f_str);
+  ap0 = f_str->aa1y;
+#endif
+
+  rp = a_res->res;
+  rpmax = &a_res->res[a_res->nres];
+
+  lenc = not_c = aln->nident = aln->nmismatch = aln->nsim = aln->npos = ngap_p = ngap_d = nfs = 0;
+  i0 = a_res->min1;
+  i1 = a_res->min0;
+
+  v_delta = 0;
+  i1_annot = 0;
+
+  s_annot1_arr_p = NULL;
+  if (annot1_p && annot1_p->n_annot > 0) s_annot1_arr_p = annot1_p->s_annot_arr_p;
+
+  while (rp < rpmax) {
+    /* op codes handled below: 0 = aa insertion, 2 = -1 frameshift,
+       3 = codon/aa match, 4 = +1 frameshift, 5 = codon insertion;
+       any other value falls through the switch without advancing i0/i1 */
+    switch (*rp++) {
+    case 0: /* aa insertion */
+      i1++;
+      lenc++;
+      ngap_d++;
+      break;
+    case 2: /* -1 frameshift */
+      nfs++;
+      i0 -= 1;
+      not_c++;
+
+      /* then check for v_delta after the frameshift */
+
+      itmp = ppst->pam2[0][ap0[i0]][ap1[i1]];
+
+      sp0 = sq[ap0[i0]];
+      sp1 = sq[ap1[i1]];
+
+      if (s_annot1_arr_p && (i1+i1_offset == s_annot1_arr_p[i1_annot]->pos || i1+i1_offset == i1_left_end)) {
+        i1_annot = next_annot_match(&itmp, ppst->pam2[0][ap0[i0]], i1_offset+seq_pos(i1,aln->llrev,0),
+                                    i0_offset+seq_pos(i0,aln->qlrev,0), &sp1, NULL, sq,
+                                    i1_annot, annot1_p->n_annot, s_annot1_arr_p,
+                                    NULL, NULL, NULL, &v_delta,
+                                    &d1_score, &d1_ident, &d1_alen, &left_domain_list1, &i1_left_end,0);
+
+        if (sq[ap1[i1]] != sp1) {
+          sprintf(tmp_str,"%c%d%c;",sq[ap1[i1]],i1+1,sp1);
+          dyn_strcat(annot_var_dyn, tmp_str);
+        }
+      }
+
+      align_type(itmp, sp0, sp1, 0, aln, ppst->pam_x_id_sim);
+
+      i0 += 3;
+      i1++;
+
+      lenc++;
+      break;
+    case 3: /* codon/aa match */
+
+      aa1c = ap1[i1];
+      itmp = ppst->pam2[0][ap0[i0]][aa1c];
+      sp0 = sq[ap0[i0]];
+      sp1 = sq[ap1[i1]];
+
+      if (s_annot1_arr_p && (i1+i1_offset == s_annot1_arr_p[i1_annot]->pos || i1+i1_offset == i1_left_end)) {
+        i1_annot = next_annot_match(&itmp, ppst->pam2[0][ap0[i0]], i1_offset+seq_pos(i1,aln->llrev,0),
+                                    i0_offset+seq_pos(i0,aln->qlrev,0), &sp1, NULL, sq,
+                                    i1_annot, annot1_p->n_annot, s_annot1_arr_p,
+                                    NULL, NULL, NULL, &v_delta,
+                                    &d1_score, &d1_ident, &d1_alen, &left_domain_list1, &i1_left_end,0);
+
+        if (sq[ap1[i1]] != sp1) {
+          sprintf(tmp_str,"%c%d%c;",sq[ap1[i1]],i1+1,sp1);
+          dyn_strcat(annot_var_dyn, tmp_str);
+        }
+      }
+
+      align_type(itmp, sp0, sp1, 0, aln, ppst->pam_x_id_sim);
+
+      i0 += 3;
+      i1++;
+
+      lenc++;
+      break;
+    case 4: /* +1 frameshift */
+      nfs++;
+      i0 += 1;
+      not_c++;
+
+      itmp = ppst->pam2[0][ap0[i0]][ap1[i1]];
+
+      sp0 = sq[ap0[i0]];
+      sp1 = sq[ap1[i1]];
+
+      if (s_annot1_arr_p && (i1+i1_offset == s_annot1_arr_p[i1_annot]->pos || i1+i1_offset == i1_left_end)) {
+        i1_annot = next_annot_match(&itmp, ppst->pam2[0][ap0[i0]], i1_offset+seq_pos(i1,aln->llrev,0),
+                                    i0_offset+seq_pos(i0,aln->qlrev,0), &sp1, NULL, sq,
+                                    i1_annot, annot1_p->n_annot, s_annot1_arr_p,
+                                    NULL, NULL, NULL, &v_delta,
+                                    &d1_score, &d1_ident, &d1_alen, &left_domain_list1, &i1_left_end,0);
+
+        if (sq[ap1[i1]] != sp1) {
+          sprintf(tmp_str,"%c%d%c;",sq[ap1[i1]],i1+1,sp1);
+          dyn_strcat(annot_var_dyn, tmp_str);
+        }
+      }
+
+      align_type(itmp, sp0, sp1, 0, aln, ppst->pam_x_id_sim);
+
+      i0 += 3;
+      i1++;
+      lenc++;
+
+      break;
+    case 5: /* codon insertion */
+      i0 += 3;
+      lenc++;
+      ngap_p++;
+      break;
+    }
+  }
+
+#ifndef TFAST
+  aln->amax0 = i0;
+  aln->amax1 = i1;
+  aln->ngap_q = ngap_d;
+  aln->ngap_l = ngap_p;
+#else
+  aln->amax1 = i0;
+  aln->amax0 = i1;
+  aln->ngap_q = ngap_p;
+  aln->ngap_l = ngap_d;
+#endif
+  aln->calc_last_set = 1;
+
+  aln->nfs = nfs;
+
+  *score_delta = v_delta;
+
+  if (lenc < 0) lenc = 1;
+  /* lenc was incremented once for every handled op, gaps included */
+  return lenc;
+}
diff --git a/src/dropfx2.c b/src/dropfx2.c
new file mode 100644
index 0000000..f7e4566
--- /dev/null
+++ b/src/dropfx2.c
@@ -0,0 +1,3892 @@
+/* $Id: dropfx.c 1280 2014-08-21 00:47:55Z wrp $ */
+/* $Revision: 1280 $ */
+
+/* copyright (c) 1998, 1999, 2014 by William R. Pearson and The Rector &
+ Visitors of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+/* implements the fastx algorithm, see:
+
+ W. R. Pearson, T. Wood, Z. Zhang, A W. Miller (1997) "Comparison of
+ DNA sequences with protein sequences" Genomics 46:24-36
+
+ see dropnfa.c for better variable descriptions and comments
+*/
+
+/* 17-Sept-2008 - modified for multiple non-overlapping alignments */
+
+/* 18-Sept-2006 - remove global variables used for alignment */
+
+/* 22-June-2006 - correct incorrect alignment coordinates generated
+ after pro_dna() on projected DNA region.
+*/
+
+/* 9-May-2003 -> 3.46 changed lx_band to use projected protein
+ boundary end. this fixes some addressing issues on MacOSX, and
+ speeds up alignment on very long proteins
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <math.h>
+
+#include "defs.h"
+#include "param.h"
+#define XTERNAL
+#include "upam.h"
+
+/* this must be consistent with upam.h */
+#define MAXHASH 32
+#define NMAP MAXHASH+1 /* NOTE(review): expands unparenthesized; the uses visible in this file are plain < / >= comparisons, where that is harmless */
+
+/* globals for fasta */
+#define MAXWINDOW 64
+
+#ifndef MAXSAV
+#define MAXSAV 10
+#endif
+
+#ifndef ALLOCN0
+static char *verstr="3.8 June 2014";
+#else
+static char *verstr="3.8an0 June 2014";
+#endif
+
+struct dstruct /* diagonal structure for saving current run; f_struct.diag points at an array of these */
+{
+ int score; /* hash score of current match */
+ int start; /* start of current match */
+ int stop; /* end of current match; do_fastx() resets it to -1 when clearing a diagonal */
+ struct savestr *dmax; /* location in vmax[] where best score data saved */
+};
+
+struct savestr /* one saved run; do_fastx() keeps MAXSAV of these in its local vmax[] */
+{
+ int score; /* pam score with segment optimization */
+ int score0; /* pam score of best single segment */
+ int gscore; /* score from global match */
+ int dp; /* diagonal of match */
+ int start; /* start of match in lib seq */
+ int stop; /* end of match in lib seq */
+};
+
+struct swstr { int H, E;};
+/* struct bdstr { int CC, DD, CP, DP;}; */
+
+#define SGW1 100 /* first dimension (minus 1) of the smgl_str C[][] and st[][] tables */
+#define SGW2 300 /* second dimension (minus 1) of C[][] and st[][]; also sizes D[] and I[] */
+struct smgl_str { /* embedded by value in struct f_struct as smgl_s */
+ int C[SGW1+1][SGW2+1];
+ int st[SGW1+1][SGW2+1];
+ int D[SGW2+7], I[SGW2+1]; /* note D[] is six entries longer than I[] */
+};
+
+struct update_code_str { /* returned by init_update_data(); passed to sprintf_code(), update_code() and close_update_data() */
+ int p_op_idx;
+ int p_op_cnt;
+ int btop_enc;
+ int show_code;
+ int cigar_order;
+ int show_ext;
+ char *op_map; /* presumably points at ori_code or cigar_code -- confirm in init_update_data() */
+};
+
+#ifdef TFAST
+static char *ori_code = "-x/=\\+*"; /* TFASTX */
+static char *cigar_code = "DXFMRI*";
+#else
+static char *ori_code = "+x/=\\-*"; /* FASTX */
+static char *cigar_code = "IXFMRD*";
+#endif
+
+unsigned long adler32(unsigned long, const unsigned char *, unsigned int);
+
+void kpsort (struct savestr **v, int n);
+extern void *init_stack(int, int);
+extern void push_stack(void *, void *);
+extern void *pop_stack(void *);
+extern void *free_stack(void *);
+extern struct domfeat_data * init_domfeat_data(const struct annot_str *annot_p);
+
+struct sx_s {int C1, C2, C3, I1, I2, I3, flag; };
+
+struct f_struct { /* per-query work area: allocated by init_work(), released by close_work() */
+ struct dstruct *diag; /* diagonal array: MAXDIAG entries, or n0 entries when ALLOCN0 is defined */
+ int ndo; /* set to 0 by init_work(); do_fastx() starts clearing diag[] at this index */
+ int noff; /* set to n0-1 by do_fastx() */
+ int hmask; /* hash constants */
+ int *pamh1; /* pam based array */
+ int *pamh2; /* pam based kfact array */
+ int *link, *harr; /* hash arrays */
+ int kshft; /* shift width */
+ int nsav; /* number of saved runs, worst saved run */
+#ifndef TFAST
+ unsigned char *aa0x; /* contains translated codons 111222333*/
+ unsigned char *aa0y; /* contains translated codons 123123123*/
+#else
+ unsigned char *aa1x; /* contains translated codons 111222333 */
+ unsigned char *aa1y; /* contains translated codons 123123123 */
+ int have_yaa; /* flag if translation is done */
+#endif
+ struct sx_s *cur; /* left NULL by init_work() (calloc); freed by close_work() if set */
+ int cur_sp_size;
+ int *waa0; /* nsq x n0 table filled from ppst->pam2[ip] in init_work() */
+ int *waa1; /* nsq x n0 table filled from ppst->pam2[0] in init_work() */
+ struct smgl_str smgl_s;
+ int *res; /* alignment results array, max_res ints */
+ int max_res; /* max(2*n0,MIN_RES) for FASTX, max(4*n0,MIN_RES) for TFASTX */
+};
+
+#define DROP_INTERN
+#include "drop_func.h"
+
+int shscore(unsigned char *aa0, int n0, int **pam2);
+int saatran(const unsigned char *ntseq, unsigned char *aaseq, int maxs, int frame);
+extern int ELK_to_s(double E_join, int n0, int n1, double Lambda, double K, double H);
+
+int savemax (struct dstruct *dptr, int dpos,
+ struct savestr *vmax, struct savestr **lowmax);
+
+int spam (const unsigned char *aa0, const unsigned char *aa1,
+ struct savestr *dmax, int **pam2,
+ struct f_struct *f_str);
+int sconn (struct savestr **v, int n,int cgap, int pgap, struct f_struct *f_str);
+int lx_band(const unsigned char *prot_seq, int len_prot,
+ const unsigned char *dna_prot_seq, int len_dna_prot,
+ int **pam_matrix, int gopen, int gext,
+ int gshift, int start_diag, int width, struct f_struct *f_str);
+
+void fx_walign (const unsigned char *aa0, int n0,
+ const unsigned char *xaa, int n1, unsigned char *yaa,
+ int frame, int max_res,
+ struct pstruct *ppst,
+ struct f_struct *f_str,
+ struct a_res_str *a_res,
+ int score_thresh);
+
+struct a_res_str *
+merge_ares_chains(struct a_res_str *cur_ares,
+ struct a_res_str *tmpl_ares,
+ int score_ix, const char *msg);
+
+extern void w_abort (char *p, char *p1);
+
+static struct update_code_str *
+init_update_data(int show_code);
+
+static void
+sprintf_code(char *tmp_str, struct update_code_str *, int op_idx, int op_cnt);
+
+static void
+update_code(struct dyn_string_str *align_code_dyn,
+ struct update_code_str *update_data, int op,
+ int sim_code, unsigned char sp0, unsigned char sp1);
+
+static void
+close_update_data(struct dyn_string_str *align_code_dyn,
+ struct update_code_str *update_data);
+
+/* init_work() - build the per-query work area returned in *f_arg: the ktup
+   hash (harr/link/pamh1/pamh2), the diagonal array, the waa0/waa1 score
+   tables and the res[] buffer; for FASTX (!TFAST) also the translated query
+   in aa0x/aa0y.  Updates cgap/optcut/pgap in *ppst; exit(1) if out of memory. */
+void
+init_work (unsigned char *aa0, int n0,
+           struct pstruct *ppst,
+           struct f_struct **f_arg)
+{
+  int mhv, phv;
+  int hmax;
+  int i0, hv;
+  int pamfact;
+  int btemp;
+  struct f_struct *f_str;
+  int ktup; /* word size examined */
+  int fact; /* factor used to scale ktup match value */
+  int kt1; /* ktup-1 */
+  int lkt; /* last ktup - initially kt1, but can be increased
+              for hsq >= NMAP */
+
+  int maxn0;
+  int *pwaa;
+  int i, j;
+  int *waa;
+  int *res;
+  int nsq, ip, *hsq;
+#ifndef TFAST
+  int last_n0, itemp;
+  unsigned char *fd, *fs, *aa0x, *aa0y;
+  int n0x;
+#endif
+
+  if (ppst->ext_sq_set) {
+    nsq = ppst->nsqx; ip = 1;
+    hsq = ppst->hsqx;
+  }
+  else {
+    nsq = ppst->nsqx; ip = 0;
+    hsq = ppst->hsq;
+  }
+
+  if ((f_str = (struct f_struct *)calloc(1,sizeof(struct f_struct))) == NULL) {
+    fprintf (stderr, " cannot allocate f_struct\n");
+    exit (1);
+  }
+
+  if (!ppst->param_u.fa.use_E_thresholds) {
+    btemp = 2 * ppst->param_u.fa.bestoff / 3 +
+      n0 / ppst->param_u.fa.bestscale +
+      ppst->param_u.fa.bkfact *
+      (ppst->param_u.fa.bktup - ppst->param_u.fa.ktup);
+    btemp = min (btemp, ppst->param_u.fa.bestmax);
+    if (btemp > 3 * n0) btemp = 3 * shscore(aa0,n0,ppst->pam2[0]) / 5;
+
+    ppst->param_u.fa.cgap = btemp + ppst->param_u.fa.bestoff / 3;
+    if (ppst->param_u.fa.optcut_set != 1) {
+#ifndef TFAST
+      ppst->param_u.fa.optcut = (btemp*5)/4;
+#else
+      ppst->param_u.fa.optcut = (btemp*4)/3;
+#endif
+    }
+  }
+
+#ifdef OLD_FASTA_GAP
+  ppst->param_u.fa.pgap = ppst->gdelval + ppst->ggapval;
+#else
+  ppst->param_u.fa.pgap = ppst->gdelval + 2*ppst->ggapval;
+#endif
+
+  ppst->param_u.fa.cgap = max(ppst->param_u.fa.cgap, -ppst->param_u.fa.pgap);
+
+  pamfact = ppst->param_u.fa.pamfact;
+  ktup = ppst->param_u.fa.ktup;
+  fact = ppst->param_u.fa.scfact * ktup;
+
+  if (pamfact == -1)
+    pamfact = 0;
+  else if (pamfact == -2)
+    pamfact = 1;
+
+  for (i0 = 1, mhv = -1; i0 <=ppst->nsq; i0++)
+    if (hsq[i0] < NMAP && hsq[i0] > mhv) mhv = hsq[i0];
+
+  if (mhv <= 0) {
+    fprintf (stderr, " maximum hsq <=0 %d\n", mhv);
+    exit (1);
+  }
+
+  for (f_str->kshft = 0; mhv > 0; mhv /= 2)
+    f_str->kshft++;
+
+  kt1 = ktup - 1;
+  hv = 1;
+  for (i0 = 0; i0 < ktup; i0++) {
+    hv = hv << f_str->kshft;
+  }
+  hmax = hv;
+  f_str->hmask = (hmax >> f_str->kshft) - 1;
+
+  if ((f_str->harr = (int *) calloc (hmax, sizeof (int))) == NULL) {
+    fprintf (stderr, " cannot allocate hash array\n");
+    exit (1);
+  }
+  if ((f_str->pamh1 = (int *) calloc (ppst->nsq+1, sizeof (int))) == NULL) {
+    fprintf (stderr, " cannot allocate pamh1 array\n");
+    exit (1);
+  }
+  if ((f_str->pamh2 = (int *) calloc (hmax, sizeof (int))) == NULL) {
+    fprintf (stderr, " cannot allocate pamh2 array\n");
+    exit (1);
+  }
+  if ((f_str->link = (int *) calloc (n0, sizeof (int))) == NULL) {
+    fprintf (stderr, " cannot allocate hash link array\n");
+    exit (1);
+  }
+
+#ifdef TFAST
+  if ((f_str->aa1x =(unsigned char *)calloc((size_t)ppst->maxlen+2,
+                                            sizeof(unsigned char)))
+      == NULL) {
+    fprintf (stderr, "cannot allocate aa1x array %d\n", ppst->maxlen+2);
+    exit (1);
+  }
+  f_str->aa1x++;
+
+  if ((f_str->aa1y =(unsigned char *)calloc((size_t)ppst->maxlen+2,
+                                            sizeof(unsigned char)))
+      == NULL) {
+    fprintf (stderr, "cannot allocate aa1y array %d\n", ppst->maxlen+2);
+    exit (1);
+  }
+  f_str->aa1y++;
+#else /* FASTX */
+  maxn0 = n0 + 2;
+  if ((aa0x =(unsigned char *)calloc((size_t)maxn0,sizeof(unsigned char)))
+      == NULL) {
+    fprintf (stderr, "cannot allocate aa0x array %d\n", maxn0);
+    exit (1);
+  }
+  aa0x++;
+  f_str->aa0x = aa0x;
+
+  if ((aa0y =(unsigned char *)calloc((size_t)maxn0,sizeof(unsigned char)))
+      == NULL) {
+    fprintf (stderr, "cannot allocate aa0y array %d\n", maxn0);
+    exit (1);
+  }
+  aa0y++;
+  f_str->aa0y = aa0y;
+
+  last_n0 = 0;
+  for (itemp=0; itemp<3; itemp++) {
+    n0x = saatran(aa0,&aa0x[last_n0],n0,itemp);
+
+    /*
+      for (i=0; i<n0x; i++) {
+      fprintf(stderr,"%c",aa[aa0x[last_n0+i]]);
+      if ((i%60)==59) fprintf(stderr,"\n");
+      }
+      fprintf(stderr,"\n");
+    */
+    last_n0 += n0x+1;
+  }
+  /* fprintf(stderr,"\n"); */
+  for (itemp=0, fs=aa0x; itemp <3; itemp++,fs++) {
+    for (fd = &aa0y[itemp]; *fs!=EOSEQ; fd += 3, fs++) *fd = *fs;
+    *fd=EOSEQ;
+  }
+
+  /* now switch aa0 and aa0x for hashing functions */
+  /* this seems dangerous in threaded code, but only the pointer is changed,
+     not the data itself */
+
+  fs = aa0;
+  aa0 = aa0x;
+  aa0x = fs;
+
+#endif
+
+  for (i0 = 0; i0 < hmax; i0++)
+    f_str->harr[i0] = -1;
+  for (i0 = 0; i0 < n0; i0++)
+    f_str->link[i0] = -1;
+
+  /* encode the aa0 array */
+
+  phv = hv = 0;
+  lkt = kt1;
+  for (i0 = 0; i0 < min(lkt,n0); i0++) {
+    if (hsq[aa0[i0]] >= NMAP) {hv=phv=0; lkt=i0+ktup; continue;}
+    hv = (hv << f_str->kshft) + hsq[aa0[i0]];
+    phv += ppst->pam2[ip][aa0[i0]][aa0[i0]] * ktup;
+  }
+
+  for (; i0 < n0; i0++) {
+    if (hsq[aa0[i0]] >= NMAP) {
+      hv=phv=0;
+      lkt = i0+ktup;
+      /* restart hv, phv calculation; test i0<n0 first so aa0[i0] is never read past n0-1 */
+      for (; i0 < n0 && (i0 < lkt || hsq[aa0[i0]]>=NMAP); i0++) {
+        if (hsq[aa0[i0]] >= NMAP) {hv=phv=0; lkt = i0+ktup; continue;}
+        hv = (hv << f_str->kshft) + hsq[aa0[i0]];
+        phv += ppst->pam2[ip][aa0[i0]][aa0[i0]] * ktup;
+      }
+    }
+    if (i0 >= n0) break;
+    hv = ((hv & f_str->hmask) << f_str->kshft) + hsq[aa0[i0]];
+    f_str->link[i0] = f_str->harr[hv];
+    f_str->harr[hv] = i0;
+    if (pamfact) {
+      f_str->pamh2[hv] = (phv += ppst->pam2[ip][aa0[i0]][aa0[i0]] * ktup);
+      /* this check should always be true, but just in case */
+      if (hsq[aa0[i0-kt1]]<NMAP)
+        phv -= ppst->pam2[ip][aa0[i0 - kt1]][aa0[i0 - kt1]] * ktup;
+    }
+    else f_str->pamh2[hv] = fact * ktup;
+  }
+
+#ifndef TFAST
+  /* done hashing, now switch aa0, aa0x back */
+  fs = aa0;
+  aa0 = aa0x;
+  aa0x = fs;
+#endif
+
+  if (pamfact)
+    for (i0 = 1; i0 < ppst->nsq; i0++)
+      f_str->pamh1[i0] = ppst->pam2[ip][i0][i0] * ktup;
+  else
+    for (i0 = 1; i0 < ppst->nsq; i0++)
+      f_str->pamh1[i0] = fact;
+
+  f_str->ndo = 0; /* used to save time on diagonals with long queries */
+
+#ifndef ALLOCN0
+  if ((f_str->diag = (struct dstruct *) calloc ((size_t)MAXDIAG,
+                                                sizeof (struct dstruct)))==NULL) {
+    fprintf (stderr," cannot allocate diagonal arrays: %ld\n",
+             (long) (MAXDIAG *sizeof (struct dstruct)));
+    exit (1);
+  };
+#else
+  if ((f_str->diag = (struct dstruct *) calloc ((size_t)n0,
+                                                sizeof (struct dstruct)))==NULL) {
+    fprintf (stderr," cannot allocate diagonal arrays: %ld\n",
+             (long)(n0*sizeof (struct dstruct)));
+    exit (1);
+  };
+#endif
+
+  if ((waa= (int *)malloc (sizeof(int)*(nsq+1)*n0)) == NULL) {
+    fprintf(stderr,"cannot allocate waa struct %3d\n",nsq*n0);
+    exit(1);
+  }
+
+  pwaa = waa;
+  for (i=0; i<nsq; i++) {
+    for (j=0;j<n0; j++) {
+      *pwaa = ppst->pam2[ip][i][aa0[j]];
+      pwaa++;
+    }
+  }
+  f_str->waa0 = waa;
+
+  if ((waa= (int *)malloc (sizeof(int)*(nsq+1)*n0)) == NULL) {
+    fprintf(stderr,"cannot allocate waa struct %3d\n",nsq*n0);
+    exit(1);
+  }
+
+  pwaa = waa;
+  for (i=0; i<nsq; i++) {
+    for (j=0;j<n0; j++) {
+      *pwaa = ppst->pam2[0][i][aa0[j]];
+      pwaa++;
+    }
+  }
+  f_str->waa1 = waa;
+
+#ifndef TFAST
+  maxn0 = max(2*n0,MIN_RES);
+#else
+  /* maxn0 needs to be large enough to accommodate introns
+     for TFASTX. For all other functions, it will be
+     more reasonable. */
+  maxn0 = max(4*n0,MIN_RES);
+#endif
+  if ((res = (int *)calloc((size_t)maxn0,sizeof(int)))==NULL) {
+    fprintf(stderr,"cannot allocate alignment results array %d\n",maxn0);
+    exit(1);
+  }
+  f_str->res = res;
+  f_str->max_res = maxn0;
+
+  *f_arg = f_str;
+}
+
+
+/* pstring1 is a message to the manager, currently 512 (presumably the buffer size -- confirm); [0] gets the program/version line, [1] the matrix/gap/ktup summary */
+/* pstring2 is the same information, but in a markx==10 format ("; pg_..." lines); it may be NULL, in which case it is skipped */
+void
+get_param (const struct pstruct *ppst,
+ char **pstring1, char *pstring2,
+ struct score_count_s *s_info)
+{
+ char options_str1[128]; /* join/opt thresholds in readable form, spliced into pstring1[1] */
+ char options_str2[128]; /* the same thresholds as "pg_..." fields, spliced into pstring2 */
+#ifndef TFAST
+ char *pg_str="FASTX";
+#else
+ char *pg_str="TFASTX";
+#endif
+
+ if (!ppst->param_u.fa.use_E_thresholds) { /* score thresholds: cgap and optcut */
+ sprintf(options_str1,"join: %d (%.3g), opt: %d (%.3g)",
+ ppst->param_u.fa.cgap, (double)s_info->s_cnt[0]/(double)s_info->tot_scores,
+ ppst->param_u.fa.optcut, (double)s_info->s_cnt[2]/(double)s_info->tot_scores);
+ sprintf(options_str2,"pg_join: %d (%.3g)\n; pg_optcut: %d (%.3g)",
+ ppst->param_u.fa.cgap, (double)s_info->s_cnt[0]/(double)s_info->tot_scores,
+ ppst->param_u.fa.optcut, (double)s_info->s_cnt[2]/(double)s_info->tot_scores);
+ }
+ else { /* E-value thresholds: E_join and E_band_opt */
+ sprintf(options_str1,"E-join: %.2g (%.3g), E-opt: %.2g (%.3g)",
+ ppst->param_u.fa.E_join, (double)s_info->s_cnt[0]/(double)s_info->tot_scores,
+ ppst->param_u.fa.E_band_opt, (double)s_info->s_cnt[2]/(double)s_info->tot_scores);
+ sprintf(options_str2,"pg_join_E(): %.2g (%.3g)\n; pg_optcut_E(): %.2g (%.3g)",
+ ppst->param_u.fa.E_join, (double)s_info->s_cnt[0]/(double)s_info->tot_scores,
+ ppst->param_u.fa.E_band_opt, (double)s_info->s_cnt[2]/(double)s_info->tot_scores);
+ }
+
+ if (!ppst->param_u.fa.optflag) {
+ sprintf (pstring1[0], "%s (%s)",pg_str,verstr);
+ }
+ else {
+ sprintf (pstring1[0], "%s (%s) [optimized]",pg_str,verstr);
+ }
+
+#ifdef OLD_FASTA_GAP
+ sprintf (pstring1[1], "%s matrix (%d:%d)%s, gap-pen: %d/%d, shift: %d\n ktup: %d, %s, width: %3d",
+#else
+ sprintf (pstring1[1], "%s matrix (%d:%d)%s, open/ext: %d/%d, shift: %d\n ktup: %d, %s, width: %3d",
+#endif
+ ppst->pam_name, ppst->pam_h,ppst->pam_l,
+ (ppst->ext_sq_set) ? "xS":"\0",
+ ppst->gdelval, ppst->ggapval, ppst->gshift,
+ ppst->param_u.fa.ktup, options_str1, ppst->param_u.fa.optwid);
+
+ if (ppst->param_u.fa.iniflag) strcat(pstring1[0]," init1"); /* appended after the version text written above */
+
+ if (pstring2 != NULL) { /* the two sprintf heads below differ only in the gap label; they share one argument list */
+#ifdef OLD_FASTA_GAP
+ sprintf (pstring2, "; pg_name_alg: %s\n; pg_ver_rel: %s\n; pg_matrix: %s (%d:%d)%s\n\
+; pg_gap-pen: %d %d\n; pg_ktup: %d\n; %s\n",
+#else
+ sprintf (pstring2, "; pg_name_alg: %s\n; pg_ver_rel: %s\n; pg_matrix: %s (%d:%d)%s\n\
+; pg_open_ext: %d %d\n; pg_ktup: %d\n; %s\n",
+#endif
+ pg_str,verstr,ppst->pam_name, ppst->pam_h,ppst->pam_l,
+ (ppst->ext_sq_set) ? "xS":"\0", ppst->gdelval,
+ ppst->ggapval,ppst->param_u.fa.ktup,options_str2);
+ }
+}
+
+/* close_work() - release every buffer hanging off *f_arg, then the
+   f_struct itself, and leave *f_arg NULL; a NULL *f_arg is a no-op. */
+void
+close_work (const unsigned char *aa0, int n0,
+            struct pstruct *ppst,
+            struct f_struct **f_arg)
+{
+  struct f_struct *work = *f_arg;
+
+  if (work == NULL) return;
+
+  if (work->cur != NULL) free(work->cur);
+#ifndef TFAST
+  /* aa0y/aa0x were advanced one element past their allocation in init_work() */
+  free(work->aa0y - 1);
+  free(work->aa0x - 1);
+#else
+  /* aa1y/aa1x were advanced one element past their allocation in init_work() */
+  free(work->aa1y - 1);
+  free(work->aa1x - 1);
+#endif
+  free(work->res);
+  free(work->waa1);
+  free(work->waa0);
+  free(work->diag);
+  free(work->link);
+  free(work->pamh2);
+  free(work->pamh1);
+  free(work->harr);
+  free(work);
+
+  /* tell the caller the work area is gone */
+  *f_arg = NULL;
+}
+
+/* do_fastx() always compares a (possibly translated) protein query
+ sequence to another protein sequence.
+
+ #ifndef TFAST (e.g. FASTX),
+ then the hash table was built from the translated (amino-acid)
+ version of the query.
+
+ #ifdef TFAST, then aa0 is already a protein sequence
+
+ Args:
+ aa0, n0 query sequence
+ aa1, n1 library sequence
+ yaa translated DNA sequence (from either aa0 or aa1)
+ *ppst -> param struct
+ *f_str -> function structure set in init_work()
+ *rst -> scores (results struct)
+ *hoff -> offset of query in library sequence
+ */
+void do_fastx (const unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1,
+ unsigned char *yaa, /* translated 123123... */
+ const struct pstruct *ppst, struct f_struct *f_str,
+ struct rstruct *rst, int *hoff, int shuff_flg,
+ struct score_count_s *s_info)
+{
+ int nd; /* diagonal array size */
+ int lhval;
+ int kfact;
+ int i;
+ int my_hoff;
+ int c_gap, opt_cut;
+ const unsigned char *aa_prot, *aa_trans_prot;
+ int n_aap, n_taap;
+ register struct dstruct *dptr;
+ struct savestr vmax[MAXSAV]; /* best matches saved for one sequence */
+ struct savestr *vptr[MAXSAV];
+ struct savestr *lowmax;
+ int lowscor;
+ register int tscor;
+
+#ifndef ALLOCN0
+ register struct dstruct *diagp;
+#else
+ register int dpos;
+ int lposn0;
+#endif
+ struct dstruct *dpmax;
+ register int lpos;
+ int tpos;
+ struct savestr *vmptr;
+ int scor, tmp;
+ int im, ib, nsave;
+ int ktup, kt1, ip, lkt, ktup_sq;
+ const int *hsq;
+ int n0_eff;
+#ifndef TFAST
+ int n0x31, n0x32;
+ n0x31 = (n0-2)/3;
+ n0x32 = n0x31+1+(n0-n0x31-1)/2;
+#else
+ const unsigned char *fs;
+ unsigned char *fd;
+ int n1x31, n1x32, itemp;
+ n1x31 = (n1-2)/3;
+ n1x32 = n1x31+1+(n1-n1x31-1)/2;
+#endif
+
+ if (ppst->ext_sq_set) {
+ ip = 1;
+ hsq = ppst->hsqx;
+ }
+ else {
+ ip = 0;
+ hsq = ppst->hsq;
+ }
+
+ ktup = ppst->param_u.fa.ktup;
+ kt1 = ktup-1;
+ if (ktup <= 3) {
+ ktup_sq = ktup*ktup;
+ }
+ else {
+ ktup_sq = ktup;
+ }
+ if (ktup == 1) ktup_sq *= 2;
+
+ if (n1 < ktup) {
+ rst->score[0] = rst->score[1] = rst->score[2] = 0;
+ return;
+ }
+
+ if (n0+n1+1 >= MAXDIAG) {
+ fprintf(stderr,"n0,n1 too large: %d, %d\n",n0,n1);
+ rst->score[0] = rst->score[1] = rst->score[2] = -1;
+ return;
+ }
+
+ if (ppst->param_u.fa.use_E_thresholds) {
+ rst->valid_stat = 0;
+ n0_eff = n0;
+ if (n0 > 120) n0_eff = (n0+2)/3;
+ c_gap = ELK_to_s(ppst->param_u.fa.E_join*ktup_sq, n0_eff, n1, ppst->pLambda, ppst->pK, ppst->pH);
+ opt_cut = ELK_to_s(ppst->param_u.fa.E_band_opt*ktup_sq, n0_eff, n1, ppst->pLambda, ppst->pK, ppst->pH);
+ }
+ else {
+ c_gap = ppst->param_u.fa.cgap;
+ opt_cut = ppst->param_u.fa.optcut;
+ rst->valid_stat = 1;
+ }
+ /* if (shuff_flg) rst->valid_stat = 1; */
+
+ f_str->noff = n0 - 1;
+
+#ifdef ALLOCN0
+ nd = n0;
+#endif
+
+#ifndef ALLOCN0
+ nd = n0 + n1;
+#endif
+
+ dpmax = &f_str->diag[nd];
+ for (dptr = &f_str->diag[f_str->ndo]; dptr < dpmax;)
+ {
+ dptr->stop = -1;
+ dptr->dmax = NULL;
+ dptr++->score = 0;
+ }
+
+ for (vmptr = vmax; vmptr < &vmax[MAXSAV]; vmptr++)
+ vmptr->score = 0;
+ lowmax = vmax;
+ lowscor = 0;
+
+ /* start hashing */
+ lhval = 0;
+ lkt = kt1;
+ for (lpos = 0; (lpos < lkt || hsq[aa1[lpos]]>=NMAP) && lpos<n1; lpos++) {
+ if (hsq[aa1[lpos]]>=NMAP) {
+ lhval = 0; lkt=lpos+ktup; continue;
+#ifdef ALLOCN0 /* reinitialize dptr */
+ dptr = &f_str->diag[lpos % nd];
+ dptr->stop = -1;
+ dptr->dmax = NULL;
+ dptr->score = 0;
+#endif
+ }
+ lhval = ((lhval & f_str->hmask) << f_str->kshft) + hsq[aa1[lpos]];
+ }
+
+#ifndef ALLOCN0
+ diagp = &f_str->diag[f_str->noff + lkt];
+ for (; lpos < n1; lpos++, diagp++) {
+ /* if (hsq[aa1[lpos]]>=NMAP) {lhval = 0; continue;} */
+ if (hsq[aa1[lpos]]>=NMAP) {
+ lpos++ ; diagp++;
+ while (lpos < n1 && hsq[aa1[lpos]]>=NMAP) {lpos++; diagp++;}
+ if (lpos >= n1) break;
+ lhval = 0;
+ }
+ lhval = ((lhval & f_str->hmask) << f_str->kshft) + hsq[aa1[lpos]];
+ for (tpos = f_str->harr[lhval]; tpos >= 0; tpos = f_str->link[tpos]) {
+ if ((tscor = (dptr = &diagp[-tpos])->stop) >= 0) {
+#else
+ lposn0 = f_str->noff + lpos;
+ for (; lpos < n1; lpos++, lposn0++) {
+ if (hsq[aa1[lpos]]>=NMAP) {lhval = 0; goto loopl;}
+ lhval = ((lhval & f_str->hmask) << f_str->kshft) + hsq[aa1[lpos]];
+ for (tpos = f_str->harr[lhval]; tpos >= 0; tpos = f_str->link[tpos]) {
+ dpos = lposn0 - tpos;
+ if ((tscor = (dptr = &f_str->diag[dpos % nd])->stop) >= 0) {
+#endif
+ tscor += ktup;
+ if ((tscor -= lpos) <= 0) { /* better to start over */
+ scor = dptr->score;
+ if ((tscor += (kfact = f_str->pamh2[lhval])) < 0 && lowscor < scor) {
+#ifdef ALLOCN0
+ lowscor = savemax (dptr, dpos, vmax, &lowmax);
+#else
+ lowscor = savemax (dptr, dptr - f_str->diag, vmax, &lowmax);
+#endif
+ }
+ if ((tscor += scor) >= kfact) {
+ dptr->score = tscor;
+ dptr->stop = lpos;
+ }
+ else {
+ dptr->score = kfact;
+ dptr->start = (dptr->stop = lpos) - kt1;
+ }
+ } /* continue current run in diagonal */
+ else {
+ dptr->score += f_str->pamh1[aa0[tpos]];
+ dptr->stop = lpos;
+ }
+ }
+ else {
+ dptr->score = f_str->pamh2[lhval];
+ dptr->start = (dptr->stop = lpos) - kt1;
+ }
+ } /* end tpos */
+
+#ifdef ALLOCN0
+ /* reinitialize diag structure */
+ loopl:
+ if ((dptr = &f_str->diag[lpos % nd])->score > lowscor) {
+ lowscor = savemax (dptr, lpos, vmax, &lowmax);
+ }
+ dptr->stop = -1;
+ dptr->dmax = NULL;
+ dptr->score = 0;
+#endif
+ } /* end lpos */
+
+#ifdef ALLOCN0
+ for (tpos = 0, dpos = f_str->noff + n1 - 1; tpos < n0; tpos++, dpos--) {
+ if ((dptr = &f_str->diag[dpos % nd])->score > lowscor) {
+ lowscor = savemax (dptr, dpos, vmax, &lowmax);
+ }
+ }
+#else
+ for (dptr = f_str->diag; dptr < dpmax;) {
+ if (dptr->score > lowscor) {
+ lowscor = savemax (dptr, dptr - f_str->diag, vmax, &lowmax);
+ }
+ dptr->stop = -1;
+ dptr->dmax = NULL;
+ dptr++->score = 0;
+ }
+ f_str->ndo = nd;
+#endif
+
+/*
+ at this point all of the elements of aa1[lpos]
+ have been searched for elements of aa0[tpos]
+ with the results in diag[dpos]
+*/
+
+ for (nsave = 0, vmptr = vmax; vmptr < &vmax[MAXSAV]; vmptr++)
+ {
+ /*
+ fprintf(stderr,"0: %4d-%4d 1: %4d-%4d dp: %d score: %d\n",
+ f_str->noff+vmptr->start-vmptr->dp,
+ f_str->noff+vmptr->stop-vmptr->dp,
+ vmptr->start,vmptr->stop,
+ vmptr->dp,vmptr->score);
+ */
+ if (vmptr->score > 0) {
+ vmptr->score = spam (aa0, aa1, vmptr, ppst->pam2[ip], f_str);
+ vptr[nsave++] = vmptr;
+ }
+ }
+
+ if (nsave <= 0) {
+ rst->score[0] = rst->score[1] = rst->score[2] = 0;
+ return;
+ }
+
+#ifndef TFAST
+ /* FASTX code here to modify the start, stop points for
+ the three phases of the translated protein sequence
+ */
+ /*
+ fprintf(stderr,"n0x: %d; n0x31:%d; n0x32: %d\n",n0,n0x31,n0x32);
+ for (ib=0; ib<nsave; ib++) {
+ fprintf(stderr,"0: %4d-%4d 1: %4d-%4d dp: %d score: %d\n",
+ f_str->noff+vptr[ib]->start-vptr[ib]->dp,
+ f_str->noff+vptr[ib]->stop-vptr[ib]->dp,
+ vptr[ib]->start,vptr[ib]->stop,
+ vptr[ib]->dp,vptr[ib]->score);
+ }
+
+ fprintf(stderr,"---\n");
+ */
+ for (ib=0; ib<nsave; ib++) {
+ if (f_str->noff-vptr[ib]->dp+vptr[ib]->start >= n0x32)
+ vptr[ib]->dp += n0x32;
+ if (f_str->noff-vptr[ib]->dp +vptr[ib]->start >= n0x31)
+ vptr[ib]->dp += n0x31;
+ }
+#else
+ /* TFASTX code here to modify the start, stop points for
+ the three phases of the translated protein sequence
+ TFASTX modifies library start points, rather than
+ query start points
+ */
+
+ for (ib=0; ib<nsave; ib++) {
+ if (vptr[ib]->start >= n1x32) {
+ vptr[ib]->start -= n1x32;
+ vptr[ib]->stop -= n1x32;
+ vptr[ib]->dp -= n1x32;
+ }
+ if (vptr[ib]->start >= n1x31) {
+ vptr[ib]->start -= n1x31;
+ vptr[ib]->stop -= n1x31;
+ vptr[ib]->dp -= n1x31;
+ }
+ }
+
+#endif /* TFASTX */
+
+ scor = sconn (vptr, nsave, c_gap,
+ ppst->param_u.fa.pgap, f_str);
+
+ for (vmptr=vptr[0],ib=1; ib<nsave; ib++)
+ if (vptr[ib]->score > vmptr->score) vmptr=vptr[ib];
+
+/* kssort (vptr, nsave); */
+
+ rst->score[1] = vmptr->score; /* best single score - init1*/
+ rst->score[0] = max (scor, vmptr->score); /* initn */
+ rst->score[2] = rst->score[0]; /* initn */
+
+#ifndef TFAST /* FASTX */
+ *hoff = my_hoff=f_str->noff - vmptr->dp;
+#else
+ *hoff = my_hoff = vmptr->dp-f_str->noff;
+#endif
+
+ /*
+ if (n1 > 5000) {
+ fprintf(stderr," Long n1: %d\n",n1);
+ }
+ */
+
+ s_info->tot_scores++;
+ if (rst->score[0] >= c_gap) {s_info->s_cnt[0]++;}
+ if (ppst->param_u.fa.optflag) {
+#ifdef TFAST
+ if ( /* shuff_flg || */ rst->score[0] > opt_cut) {
+/* generate f_str->aa1y only if it is not there */
+ if ( !f_str->have_yaa ) {
+ for (fs=aa1,itemp=0; itemp <3; itemp++,fs++) {
+ for (fd= yaa+itemp; *fs!=EOSEQ; fd += 3, fs++) {*fd = *fs;}
+ *fd=EOSEQ;
+ }
+ }
+ }
+ aa_prot = aa0;
+ n_aap = n0;
+ aa_trans_prot= yaa;
+ n_taap = n1;
+#else
+ aa_prot = aa1;
+ n_aap = n1;
+ aa_trans_prot= yaa;
+ n_taap = n0;
+#endif
+ if ( /* shuff_flg || */ rst->score[0] > opt_cut) {
+ s_info->s_cnt[2]++;
+ rst->valid_stat = 1;
+ rst->score[2] = lx_band(aa_prot,n_aap,aa_trans_prot,n_taap,
+ ppst->pam2[ip],
+ -ppst->gdelval,
+ -ppst->ggapval,-ppst->gshift,
+ my_hoff-ppst->param_u.fa.optwid/2,ppst->param_u.fa.optwid,
+ f_str);
+ }
+ }
+}
+
+/* returns rst.score[0] - initn
+ rst.score[1] - init1
+ rst.score[2] - opt
+*/
+
+ void do_work (const unsigned char *aa0, int n0,
+               const unsigned char *aa1, int n1,
+               int frame,
+               const struct pstruct *ppst, struct f_struct *f_str,
+               int qr_flg, int shuff_flg, struct rstruct *rst,
+               struct score_count_s *s_info)
+{
+  /* do_work - score one query/library pair.
+   *
+   * Resets the result record, runs do_fastx() and leaves the scores
+   * in rst->score[]:
+   *   rst->score[0] - initn
+   *   rst->score[1] - init1
+   *   rst->score[2] - opt
+   *
+   * TFAST builds: aa0 has a protein sequence and aa1 a raw DNA
+   * sequence; aa1 is translated with saatran() for the three frame
+   * indices frame*3 .. frame*3+2 into f_str->aa1x before scoring.
+   * Other builds score f_str->aa0x/aa0y against aa1 directly.
+   * qr_flg is not used in this function.
+   */
+  int hoff;   /* offset reported by do_fastx(); not used afterwards */
+#ifdef TFAST
+  unsigned char *xlat = f_str->aa1x;
+  int filled = 0;   /* residues (plus one spare slot per frame) written so far */
+  int n10 = 0;
+  int fr;
+
+  for (fr = frame*3; fr < frame*3 + 3; fr++) {
+    n10 = saatran(aa1, &xlat[filled], n1, fr);
+    /* skip one extra slot after each translated frame */
+    filled += n10 + 1;
+  }
+  n10 = filled - 1;
+  /* the interleaved copy (aa1y) must be rebuilt for this translation */
+  f_str->have_yaa = 0;
+#endif
+
+  rst->score[2] = 0;
+  rst->score[1] = 0;
+  rst->score[0] = 0;
+  rst->escore = 1.0;
+  rst->seglen = 1;
+  rst->segnum = 1;
+
+#ifdef TFAST
+  do_fastx (aa0, n0, f_str->aa1x, n10, f_str->aa1y, ppst, f_str, rst, &hoff, shuff_flg, s_info);
+#else
+  do_fastx (f_str->aa0x, n0, aa1, n1, f_str->aa0y, ppst, f_str, rst, &hoff, shuff_flg, s_info);
+#endif
+
+  rst->H = -1.0;
+  rst->comp = -1.0;
+}
+
+ void do_opt (const unsigned char *aa0, int n0,
+              const unsigned char *aa1, int n1,
+              int frame,
+              struct pstruct *ppst,
+              struct f_struct *f_str,
+              struct rstruct *rst)
+{
+  /* do_opt - re-score one pair with the optimized (banded) score
+   * forced on.
+   *
+   * ppst->param_u.fa.optflag is temporarily set to 1 around the call
+   * to do_fastx() and restored afterwards.  The score counters that
+   * do_fastx() updates are collected in a local record and discarded.
+   */
+  int optflag, hoff;
+  /* do_fastx() increments members of s_info, so it must start out
+     zeroed rather than holding indeterminate stack contents */
+  struct score_count_s s_info = {0};
+
+#ifdef TFAST
+  int last_n1, itx, itt, n10;
+  unsigned char *xaa;
+
+  /* aa0 has a protein sequence */
+  /* aa1 has a raw DNA sequence */
+
+  itt = frame;
+  last_n1 = 0;
+  xaa = f_str->aa1x;
+  for (itx= itt*3; itx< itt*3+3; itx++) {
+    n10 = saatran(aa1,&xaa[last_n1],n1,itx);
+    last_n1 += n10+1;
+  }
+  n10 = last_n1-1;
+  f_str->have_yaa = 0;
+#endif
+
+  /* force the optimized score for this call only */
+  optflag = ppst->param_u.fa.optflag;
+  ppst->param_u.fa.optflag = 1;
+
+#ifndef TFAST
+  do_fastx (f_str->aa0x, n0, aa1, n1, f_str->aa0y, ppst, f_str, rst, &hoff, 0, &s_info);
+#else /* TFASTX */
+  do_fastx (aa0, n0, xaa, n10, f_str->aa1y, ppst, f_str, rst, &hoff, 0, &s_info);
+#endif
+
+  ppst->param_u.fa.optflag = optflag;
+}
+
+ int
+ savemax (struct dstruct *dptr, int dpos,
+          struct savestr *vmax, struct savestr **lowmax)
+{
+  /* savemax - record the run held in diagonal dptr in the table of
+   * saved runs vmax[0..MAXSAV-1].
+   *
+   * *lowmax points at the table entry that is currently the cheapest
+   * to overwrite.  Returns the score of that entry after the update;
+   * callers use it as the threshold for the next save.
+   */
+  struct savestr *known = dptr->dmax;
+  struct savestr *victim = *lowmax;
+  struct savestr *scan;
+  int lowest;
+
+  if (known != NULL && known->dp == dpos && known->start == dptr->start) {
+    /* continuation of a run that is already in the table */
+    known->stop = dptr->stop;
+    lowest = dptr->score;
+    if (lowest <= known->score) {
+      return victim->score;
+    }
+    known->score = lowest;
+    if (known != victim) {
+      return victim->score;
+    }
+    /* the entry we just raised was the lowest one: rescan below */
+  }
+  else {
+    /* new run: overwrite the current lowest entry */
+    victim->score = dptr->score;
+    lowest = victim->score;
+    victim->dp = dpos;
+    victim->start = dptr->start;
+    victim->stop = dptr->stop;
+    dptr->dmax = victim;
+  }
+
+  /* find the new lowest-scoring entry */
+  scan = vmax;
+  while (scan < vmax+MAXSAV) {
+    if (scan->score < lowest) {
+      lowest = scan->score;
+      *lowmax = scan;
+    }
+    scan++;
+  }
+  return lowest;
+}
+
+ int spam (const unsigned char *aa0, const unsigned char *aa1,
+           struct savestr *dmax, int **pam2,
+           struct f_struct *f_str)
+{
+  /* spam - rescan the saved run dmax along its diagonal with the
+   * full scoring matrix pam2 and trim it to its best-scoring
+   * sub-segment.
+   *
+   * dmax->start / dmax->stop are positions in aa1; the matching aa0
+   * position is start - dmax->dp + f_str->noff.  On return
+   * dmax->start/stop bound the best segment and its score is
+   * returned.  If no prefix of the run ever scores above zero the
+   * bounds are left unchanged and 0 is returned (previously the
+   * bounds were copied from uninitialized locals in that case).
+   */
+  int lpos;
+  int tot;
+  struct {
+    int start, stop, score;
+  } curv, maxv;
+  const unsigned char *aa0p, *aa1p;
+
+  lpos = dmax->start;
+  aa1p = &aa1[lpos];
+  aa0p = &aa0[lpos - dmax->dp + f_str->noff];
+
+  curv.start = curv.stop = lpos;
+  curv.score = 0;
+
+  /* default result: the region as it was passed in */
+  maxv.start = dmax->start;
+  maxv.stop = dmax->stop;
+  maxv.score = 0;
+
+  tot = 0;
+  for (; lpos <= dmax->stop; lpos++) {
+    tot += pam2[*aa0p++][*aa1p++];
+    if (tot > curv.score) {
+      /* new best end point for the current segment */
+      curv.stop = lpos;
+      curv.score = tot;
+    }
+    else if (tot < 0) {
+      /* running total went negative: close the current segment
+         and start a fresh one at the next position */
+      if (curv.score > maxv.score) {
+        maxv.start = curv.start;
+        maxv.stop = curv.stop;
+        maxv.score = curv.score;
+      }
+      tot = curv.score = 0;
+      curv.start = lpos+1;
+    }
+  }
+
+  if (curv.score > maxv.score) {
+    maxv.start = curv.start;
+    maxv.stop = curv.stop;
+    maxv.score = curv.score;
+  }
+
+  dmax->start = maxv.start;
+  dmax->stop = maxv.stop;
+
+  return maxv.score;
+}
+
+#define XFACT 10
+
+ int sconn (struct savestr **v, int n,
+            int cgap, int pgap, struct f_struct *f_str)
+{
+  /* sconn - join compatible saved runs and return the best joined
+   * score (0 when no run reaches cgap).
+   *
+   * The runs are first sorted by start position with kpsort().  Each
+   * run scoring at least cgap is then joined to the first run on the
+   * score-ordered list that ends (within XFACT residues) before it in
+   * both sequences, adding pgap to the combined score, and is linked
+   * into the list in front of the first entry it out-scores.  A run
+   * that out-scores nothing already on the list is not linked in.
+   */
+  struct slink {
+    int score;
+    struct savestr *vp;
+    struct slink *next;
+  };
+  struct slink sarr[MAXSAV];
+  struct slink *head = NULL;
+  struct slink *node, *probe, *prev;
+  struct savestr *run;
+  int idx, used;
+  int lib_start, qry_start, lib_stop, qry_stop;
+
+  /* sort the runs left to right in library position */
+  kpsort (v, n);
+
+  used = 0;
+  for (idx = 0; idx < n; idx++) {
+    run = v[idx];
+
+    /* a run scoring less than the gap threshold never helps */
+    if (run->score < cgap) {
+      continue;
+    }
+
+    lib_start = run->start;
+    qry_start = lib_start - run->dp + f_str->noff;
+
+    /* start a list node for this run */
+    node = &sarr[used];
+    node->vp = run;
+    node->score = run->score;
+    node->next = NULL;
+
+    /* join it to the first listed run that ends before it */
+    probe = head;
+    while (probe != NULL) {
+      lib_stop = probe->vp->stop;
+      qry_stop = lib_stop - probe->vp->dp + f_str->noff;
+      if (lib_stop < lib_start+XFACT && qry_stop < qry_start+XFACT) {
+        node->score = probe->score + run->score + pgap;
+        break;
+      }
+      probe = probe->next;
+    }
+
+    /* link the node in front of the first entry it beats */
+    if (head == NULL) {
+      head = node;
+    }
+    else {
+      prev = NULL;
+      probe = head;
+      while (probe != NULL) {
+        if (node->score > probe->score) {
+          node->next = probe;
+          if (prev == NULL) {
+            head = node;
+          }
+          else {
+            prev->next = node;
+          }
+          break;
+        }
+        prev = probe;
+        probe = probe->next;
+      }
+    }
+    used++;
+  }
+
+  if (head == NULL) {
+    return (0);
+  }
+  return (head->score);
+}
+
+ void
+ kssort (struct savestr *v[], int n)
+{
+  /* kssort - Shell sort of the n run pointers in v[] into
+   * descending order of score.
+   *
+   * Written with a prototype-style parameter list (like kpsort)
+   * instead of the old identifier-list form, which is obsolescent
+   * in ISO C.
+   */
+  int gap, i, j;
+  struct savestr *tmp;
+
+  for (gap = n / 2; gap > 0; gap /= 2)
+    for (i = gap; i < n; i++)
+      for (j = i - gap; j >= 0; j -= gap)
+        {
+          /* already in order at this gap: stop sifting */
+          if (v[j]->score >= v[j + gap]->score)
+            break;
+          tmp = v[j];
+          v[j] = v[j + gap];
+          v[j + gap] = tmp;
+        }
+}
+
+ void
+ kpsort (struct savestr **v, int n) {
+  /* kpsort - Shell sort (insertion passes at gaps 21, 7, 3, 1) of
+   * the n run pointers in v[] into ascending order of ->start.
+   */
+  int incs[4] = { 21, 7, 3, 1 };
+  int pass, step, pos, hole;
+  int key;
+  struct savestr *moving;
+
+  for (pass = 0; pass < 4; pass++) {
+    step = incs[pass];
+    for (pos = step; pos < n; pos++) {
+      moving = v[pos];
+      key = moving->start;
+      /* shift larger entries up by one step until the slot is found */
+      for (hole = pos; hole >= step && v[hole - step]->start > key; hole -= step) {
+        v[hole] = v[hole - step];
+      }
+      v[hole] = moving;
+    }
+  }
+}
+
+ static void
+ init_row(struct sx_s *row, int sp) {
+  /* clear the C1..C3 / I1..I3 scores and the flag of the first sp
+     cells of row */
+  struct sx_s *cell = row;
+  int left;
+
+  for (left = sp; left > 0; left--, cell++) {
+    cell->I1 = 0; cell->C1 = 0;
+    cell->I2 = 0; cell->C2 = 0;
+    cell->I3 = 0; cell->C3 = 0;
+    cell->flag = 0;
+  }
+}
+
+ int
+ lx_band(const unsigned char *prot_seq, /* array with protein sequence numbers*/
+ int len_prot, /* length of prot. seq */
+ const unsigned char *dna_prot_seq, /* translated DNA sequence numbers*/
+ int len_dna_prot, /* length trans. seq. */
+ int **pam_matrix, /* scoring matrix */
+ int gopen, int gext, /* gap open, gap extend penalties */
+ int gshift, /* frame-shift penalty */
+ int start_diag, /* start diagonal of band */
+ int width, /* width for band alignment */
+ struct f_struct *f_str)
+{ /* lx_band - banded, score-only alignment of prot_seq against
+     dna_prot_seq.  Every band cell consumes three consecutive
+     entries of dna_prot_seq, one for each of the C1/C2/C3 states
+     (with matching I1/I2/I3 scores in the neighbouring cell).
+     gopen, gext and gshift are subtracted as penalties.  Cell
+     scores are never left below zero, and the largest value of
+     (cell score - gopen - gext) seen is returned with gopen+gext
+     added back.  The work row f_str->cur holds width+7 cells; it
+     is allocated on first use and re-allocated when that size
+     changes. */
+ void *ckalloc();
+ int i, j, bd, bd1, x1, sp, p1=0, p2=0, end_prot;
+ int sc, del, best = 0, cd,ci, e1, e2, e3, cd1, cd2, cd3, f, gg;
+ register int *wt;
+ const unsigned char *dp;
+ register struct sx_s *ap, *aq;
+
+ sp = width+7;
+ gg = gopen+gext; /* cost of opening a gap and extending it once */
+ /* sp = sp/3; */
+ if (f_str->cur == NULL) {
+ f_str->cur = (struct sx_s *) ckalloc(sizeof(struct sx_s)*sp);
+ f_str->cur_sp_size = sp;
+ }
+ else if (f_str->cur_sp_size != sp) {
+ free(f_str->cur);
+ f_str->cur = (struct sx_s *) ckalloc(sizeof(struct sx_s)*sp);
+ f_str->cur_sp_size = sp;
+ }
+
+ init_row(f_str->cur, sp);
+
+ /*
+ if (start_diag %3 !=0) start_diag = start_diag/3-1;
+ else start_diag = start_diag/3;
+ */
+
+ /*
+ if (width % 3 != 0) width = width/3+1;
+ else width = width /3;
+ */
+
+ /* currently, this code assumes that the DNA sequence is longer than the
+ protein sequence. This is not always true. len_prot in the loop below
+ should be decreased to the projection of the DNA on the protein */
+
+ x1 = start_diag; /* x1 = lower bound of DNA */
+
+
+ end_prot = max(0,-width-start_diag) + (len_dna_prot+5)/3 + width;
+ end_prot = min(end_prot,len_prot);
+
+ /* i counts through protein sequence, x1 through DNAp */
+
+ for (i = max(0, -width-start_diag), x1+=i; i < end_prot; i++, x1++) {
+ bd = min(x1+width, len_dna_prot/3); /* upper bound of band */
+ bd1 = max(0,x1); /* lower bound of band */
+ wt = pam_matrix[prot_seq[i]];
+ del = 1-x1; /*adjustment*/
+ bd += del;
+ bd1 +=del; /* after the adjustment bd1 >= 1, so cur[bd1-1] below
+              is a valid cell */
+
+ ap = &f_str->cur[bd1];
+ aq = ap+1;
+ e1 = f_str->cur[bd1-1].C3;
+ e2 = ap->C1;
+ cd1 = cd2= cd3= 0;
+
+ for (dp = &dna_prot_seq[(bd1-del)*3]; ap < &f_str->cur[bd]; ap++) {
+ sc = max(max(e1, (e3=ap->C2))-gshift, e2)+wt[*dp++]; /* state 1 */
+ if (cd1 > sc) sc = cd1;
+ cd1 -= gext;
+ if ((ci = aq->I1) > 0) {
+ if (sc < ci) { ap->C1 = ci; ap->I1 = ci-gext;}
+ else {
+ ap->C1 = sc;
+ sc -= gg;
+ if (sc > 0) {
+ if (sc > best) best =sc;
+ if (cd1 < sc) cd1 = sc;
+ ap->I1 = max(ci-gext, sc);
+ } else ap->I1 = ci-gext;
+ }
+ } else {
+ if (sc <= 0) {
+ ap->I1 = ap->C1 = 0;
+ } else {
+ ap->C1 = sc; sc-=gg;
+ if (sc >0) {
+ if (sc > best) best =sc;
+ if (cd1 < sc) cd1 = sc;
+ ap->I1 = sc;
+ } else ap->I1 = 0;
+ }
+ }
+ sc = max(max(e2, (e1=ap->C3))-gshift, e3)+wt[*dp++]; /* state 2 */
+ if (cd2 > sc) sc = cd2;
+ cd2 -= gext;
+ if ((ci = aq->I2) > 0) {
+ if (sc < ci) { ap->C2 = ci; ap->I2 = ci-gext;}
+ else {
+ ap->C2 = sc;
+ sc -= gg;
+ if (sc > 0) {
+ if (sc > best) best =sc;
+ if (cd2 < sc) cd2 = sc;
+ ap->I2 = max(ci-gext, sc);
+ } /* NOTE(review): the state-1 code above has
+      `else ap->I1 = ci-gext;` at this point; here I2 is left
+      unchanged when sc <= 0 -- confirm this is intended */
+ }
+ } else {
+ if (sc <= 0) {
+ ap->I2 = ap->C2 = 0;
+ } else {
+ ap->C2 = sc; sc-=gg;
+ if (sc >0) {
+ if (sc > best) best =sc;
+ if (cd2 < sc) cd2 = sc;
+ ap->I2 = sc;
+ } else ap->I2 = 0;
+ }
+ }
+ sc = max(max(e3, (e2=aq->C1))-gshift, e1)+wt[*dp++]; /* state 3 */
+ if (cd3 > sc) sc = cd3;
+ cd3 -= gext;
+ if ((ci = aq++->I3) > 0) {
+ if (sc < ci) { ap->C3 = ci; ap->I3 = ci-gext;}
+ else {
+ ap->C3 = sc;
+ sc -= gg;
+ if (sc > 0) {
+ if (sc > best) best =sc;
+ if (cd3 < sc) cd3 = sc;
+ ap->I3 = max(ci-gext, sc);
+ } /* NOTE(review): as for I2, no `else ap->I3 = ci-gext;`
+      here -- confirm */
+ }
+ } else {
+ if (sc <= 0) {
+ ap->I3 = ap->C3 = 0;
+ } else {
+ ap->C3 = sc; sc-=gg;
+ if (sc >0) {
+ if (sc > best) best =sc;
+ if (cd3 < sc) cd3 = sc;
+ ap->I3 = sc;
+ } else ap->I3 = 0;
+ }
+ }
+ }
+ }
+ /* printf("The best score is %d\n", best); */
+ return best+gopen+gext; /* best was recorded after subtracting gg */
+}
+
+/* ckalloc - allocate space; check for success */
+ void *ckalloc(size_t amount)
+{
+  /* malloc() wrapper: reports failure through w_abort() */
+  void *mem = malloc(amount);
+
+  if (mem != NULL) {
+    return mem;
+  }
+  w_abort("Ran out of memory.","");
+  return mem;
+}
+
+/* calculate the 100% identical score */
+ int
+ shscore(unsigned char *aa0, int n0, int **pam2)
+{
+  /* score of aa0[0..n0-1] aligned against itself: the sum of the
+     diagonal pam2 entries of its residues */
+  int pos = 0;
+  int total = 0;
+  unsigned char res;
+
+  while (pos < n0) {
+    res = aa0[pos];
+    total += pam2[res][res];
+    pos++;
+  }
+  return total;
+}
+
+#define WIDTH 60
+
+/* code above is to convert sequence into numbers */
+
+typedef struct mat *match_ptr; /* link in an alignment edge list */
+
+typedef struct mat { /* one edge of an alignment path */
+ int i, j, l; /* i, j: end position of the edge; l: edge type */
+ match_ptr next; /* following edge, NULL at the end of the list */
+} match_node;
+
+typedef struct { /* a (row, column) pair; local_align() uses it to
+                    remember where the alignment ending in a cell
+                    started */
+ int i,j;
+} state;
+
+typedef state *state_ptr;
+
+typedef struct st_s { int C, I, D;} *st_ptr; /* the three scores kept
+   for one cell of a score row (see the row description in
+   local_align()) */
+
+/* static st_ptr up=NULL, down, tp; */
+/* static int *st_up; */
+/* static int gop, gext, shift; */
+
+ void *ckalloc(size_t);
+ static match_ptr small_global(), global();
+ static int local_align(), find_best();
+ static void init_row2(), init_ROW();
+
+ int
+ pro_dna(const unsigned char *prot_seq, /* array with prot. seq. numbers*/
+         int len_prot, /* length of prot. seq */
+         const unsigned char *dna_prot_seq, /* trans. DNA seq. numbers*/
+         int len_dna_prot, /* length trans. seq. */
+         int **pam_matrix, /* scoring matrix */
+         int gopen, int gex, /* gap open, gap extend penalties */
+         int gshift, /* frame-shift penalty */
+         struct smgl_str *smgl_sp,
+         int max_res,
+         struct a_res_str *a_res) /* alignment info */
+{
+  /* pro_dna - find the best local alignment of prot_seq against
+   * dna_prot_seq and store its edit codes in a_res->res.
+   *
+   * local_align() supplies the score and the end points, which are
+   * copied to a_res->min0/max0 (protein) and min1/max1 (DNA);
+   * global() then rebuilds the path between those end points as a
+   * linked list whose ->l codes are copied, up to max_res of them,
+   * into a_res->res.  a_res->nres receives the full length of the
+   * list.  Returns the local alignment score.
+   */
+  match_ptr align, ap, aq;
+  int x, y, ex, ey, i, score;
+  int *alignment;
+  st_ptr up, down, tp;
+
+  /* these globals removed */
+  /* gext = gex; gop = gopen; shift = gshift; */
+
+  /* for fastx (but not tfastx), these could be moved into init_work(),
+     and done only once */
+
+  up = (st_ptr) ckalloc(sizeof(struct st_s)*(len_dna_prot+10));
+  down = (st_ptr) ckalloc(sizeof(struct st_s)*(len_dna_prot+10));
+  tp = (st_ptr) ckalloc(sizeof(struct st_s)*(len_dna_prot+10));
+
+  /* local alignment: find the best local alignment; x (prot) and
+     y (DNA) is the starting position of the best local alignment
+     and ex (prot) ey (DNA) is the ending position */
+  score= local_align(&x, &y, &ex, &ey, pam_matrix,
+                     gopen, gex, gshift,
+                     dna_prot_seq, len_dna_prot,
+                     prot_seq, len_prot, up, down);
+
+  /* the global_* routines index their rows from -3 upwards
+     (global_down() writes cur[j-3]), so hand them pointers that
+     are three cells into each buffer */
+  up += 3; down += 3; tp += 3;
+
+  /* x, y - start in prot, dna_prot */
+  a_res->min0 = x; /* prot */
+  a_res->max0 = ex; /* prot */
+
+  a_res->min1 = y; /* DNA-prot */
+  a_res->max1 = ey; /* DNA-prot */
+
+  align = global(x, y, ex, ey, pam_matrix, gopen, gex, gshift,
+                 dna_prot_seq, prot_seq, 0, 0, &up, &down, &tp,
+                 smgl_sp);
+
+  alignment = a_res->res;
+
+  /* from earlier version */
+  /* alignment[0] = x; */ /* start of alignment in prot */
+  /* alignment[1] = y; */ /* start of alignment in DNA */
+
+  /* copy what fits and release every node, counting all of them */
+  for (ap = align, i= 0; ap; i++) {
+    if (i < max_res) {alignment[i] = ap->l;}
+    aq = ap->next; free(ap); ap = aq;
+  }
+
+  /* only codes beyond max_res are dropped; a list of exactly
+     max_res codes fits and must not be reported as truncated */
+  if (i > max_res) {
+    fprintf(stderr," alignment truncated: %d/%d\n", max_res,i);
+  }
+
+  /* global() may have exchanged the row pointers among themselves;
+     undo the +3 offset on each before freeing */
+  up = &up[-3]; down = &down[-3]; tp = &tp[-3];
+  free(up); free(tp); free(down);
+  /* free(st_up); */ /* moved into local align */
+
+  /* NOTE(review): nres is the untruncated length and can exceed
+     max_res -- confirm that readers of a_res->res allow for that */
+  a_res->nres = i; /* i has the length of the alignment */
+  return score;
+}
+
+ static void
+ swap(void **a, void **b) {
+  /* exchange the two pointers that a and b refer to */
+  void *held = *b;
+
+  *b = *a;
+  *a = held;
+}
+
+/*
+ local alignment find the best local alignment x and y
+ is the starting position of the best local alignment
+ and ex ey is the ending position
+*/
+ static int
+ local_align(int *x, int *y, int *ex, int *ey,
+             int **wgts, int gop, int gext, int shift,
+             const unsigned char *dnap, int ld,
+             const unsigned char *pro, int lp,
+             st_ptr up, st_ptr down) {
+
+  /* local_align - score-only local alignment of the protein pro
+   * (lp residues) against the translated sequence dnap (ld entries).
+   *
+   * Returns the best score; *x, *y receive the start and *ex, *ey
+   * the end of the best alignment (protein, DNA).  up and down are
+   * caller-supplied score rows; they are cleared here and used
+   * alternately as the current and previous row.  When no cell
+   * scores above zero the four positions are returned as 0 (they
+   * used to be copied from uninitialized locals).
+   */
+  int i, j, score, e1, e2 = 0, e3,
+    sc, del, e, best = 0, *wt, cd, ci;
+  int x1 = 0, x2 = 0, x3 = 0, x4 = 0; /* start (x1,x2) / end (x3,x4) of best */
+  state_ptr cur_st, last_st, cur_i_st;
+  st_ptr cur, last;
+  const unsigned char *dp;
+  int *st_up, *cur_d_st;
+
+/*
+   Array rowiC store the best scores of alignment ending at a position
+   Arrays rowiD, and rowiI store the best scores of alignment ending
+   at a position with a deletion or insertion
+   Arrays sti stores the starting position of the best alignment whose
+   score stored in the corresponding row array.
+   The program stores two rows to complete the computation, same is
+   for the global alignment routine.
+*/
+
+  /* for fastx (but not tfastx), this could be moved into init_work(),
+     and done only once */
+  st_up = (int *) ckalloc(sizeof(int)*(ld+10));
+  init_row2(st_up, ld+5);
+
+  /* columns 0 and 1 are padding; DNA entry k is handled at column k+2 */
+  ld += 2;
+  init_ROW(up, ld+1); /* set to zero */
+  init_ROW(down, ld+1); /* set to zero */
+
+
+  cur = up+1;
+  last = down+1;
+
+  /* for fastx (but not tfastx), these could be moved into init_work(),
+     and done only once */
+  cur_st = (state_ptr) ckalloc(sizeof(state)*(ld+1));
+  last_st = (state_ptr) ckalloc(sizeof(state)*(ld+1));
+  cur_i_st = (state_ptr) ckalloc(sizeof(state)*(ld+1));
+
+  cur_d_st = st_up;
+
+  /* dp[j] is only read for j >= 2, i.e. dnap[0] onwards */
+  dp = dnap-2;
+  for (i = 0; i < lp; i++) {
+    wt = &wgts[pro[i]][0];
+    for (j = 0; j < 2; j++) {
+      cur_st[j].i = i+1;
+      cur_st[j].j = j+1;
+    }
+    for (j = 2; j < ld; j++) {
+      score = wt[dp[j]];
+      /* del records which predecessor produced sc:
+         -1 fresh start, 2/3/4 diagonal step of that many columns,
+         0 from the I score, 5 from the D score */
+      del = -1;
+      if (j >= 3) {
+        sc = -score;
+        e3 = e2-shift; e2 = last[j-3].C;
+        e1 = last[j-2].C-shift;
+        if (e1 > sc) {sc = e1; del = 2;}
+        if (e2 > sc) {sc = e2; del = 3;}
+        if (e3 > sc) {sc = e3; del = 4;}
+      } else {
+        sc = e2 = 0;
+        if (sc < -score) sc=-score;
+        else del = 3;
+      }
+      sc += score;
+      if (sc < (ci=last[j].I)) {
+        sc = ci; del = 0;
+      }
+      if (sc < (cd=cur[j].D)) {
+        sc = cd; del = 5;
+      }
+      cur[j].C = sc;
+      e = sc - gop;
+      /* propagate the D score three columns ahead, tracking in
+         cur_d_st how many columns back the gap was opened */
+      if (e > cd) {
+        cur[j+3].D = e-gext;
+        cur_d_st[j+3] = 3;
+      } else {
+        cur[j+3].D = cd-gext;
+        cur_d_st[j+3] = cur_d_st[j]+3;
+      }
+      /* carry the alignment start position along with the score */
+      switch(del) {
+      case 5:
+        e1 = cur_d_st[j];
+        cur_st[j].i = cur_st[j-e1].i;
+        cur_st[j].j = cur_st[j-e1].j;
+        break;
+      case 0:
+        cur_st[j].i = cur_i_st[j].i;
+        cur_st[j].j = cur_i_st[j].j;
+        break;
+      case 2:
+      case 3:
+      case 4:
+        if (i) {
+          if (j-del >= 0) {
+            cur_st[j].i = last_st[j-del].i;
+            cur_st[j].j = last_st[j-del].j;
+          } else {
+            cur_st[j].i = i;
+            cur_st[j].j = 0;
+          }
+        } else {
+          cur_st[j].i = 0;
+          cur_st[j].j = max(0, j-del+1);
+        }
+        break;
+      case -1:
+        cur_st[j].i = i+1;
+        cur_st[j].j = j+1;
+        break;
+      }
+      if (e > ci) {
+        cur[j].I = e -gext;
+        cur_i_st[j].i = cur_st[j].i;
+        cur_i_st[j].j = cur_st[j].j;
+      } else {
+        cur[j].I = ci- gext;
+      }
+      if (sc > best) {
+        x1 = cur_st[j].i;
+        x2 = cur_st[j].j;
+        best =sc;
+        x3 = i;
+        x4 = j;
+      }
+    }
+    swap((void **)&last, (void **)&cur);
+    swap((void **)&cur_st, (void **)&last_st);
+  }
+  /* printf("The best score is %d\n", best); */
+  *x = x1; *y = x2; *ex = x3; *ey = x4;
+  free(cur_st); free(last_st); free(cur_i_st);
+  free(st_up);
+  return best;
+}
+
+/*
+ Both global_up and global_down perform linear-space, score-only global
+ alignments on the subsequences pro[x]...pro[ex] and dna[y]...dna[ey].
+ global_up runs the algorithm upwards, from row x towards row ex.
+ global_down runs the algorithm downwards, from row ex towards row x.
+*/
+
+ static void
+ global_up(st_ptr *row1, st_ptr *row2,
+ int x, int y, int ex, int ey,
+ int **wgts, int gop, int gext, int shift,
+ unsigned char *dnap,
+ unsigned char *pro,
+ int N) { /* Score-only global pass over protein rows x..ex and
+   columns 0..ey-y+1, using *row1 and *row2 alternately as current
+   and previous row.  On return *row1 is the row computed last
+   (the two pointers are exchanged if necessary).  A non-zero N
+   makes the I score of column 0 start at -gext instead of
+   -gop-gext.  -10000 is stored in cells that have no valid
+   predecessor. */
+ int i, j, k, sc, e, e1, e2, e3, t, ci, cd, score, *wt;
+ st_ptr cur, last;
+
+ cur = *row1; last = *row2;
+
+ sc = -gop-gext;
+
+ for (j = 1; j <= ey-y+1; j++) { /* row 0: only whole codons (j % 3 == 0) get a score */
+ if (j % 3 == 0) {last[j].C = sc; sc -= gext; last[j].I = sc-gop;}
+ else { last[j].I = last[j].C = -10000;}
+ cur[j].I = -10000;
+ }
+
+ last[0].C = 0; cur[0].D = cur[1].D = cur[2].D = -10000;
+ last[0].D = last[1].D = last[2].D = -10000;
+
+ if (N) last[0].I = -gext;
+ else last[0].I = -gop-gext;
+
+ for (i = 1; i <= ex-x+1; i++) {
+ wt = &wgts[pro[i+x-1]][0]; e2 = last[0].C; e1 = -10000;
+ for (j = 0; j <= ey-y+1; j++) {
+ t = j+y;
+ sc = -10000;
+ if (t < 3) score = -10000;
+ else score = wt[dnap[t-3]];
+ if (j < 4) {
+ if (j == 3) sc = e2;
+ else if (j == 2) sc = e2-shift;
+ }
+ else { /* e1, e2, e3 hold last[j-2].C, last[j-3].C, last[j-4].C */
+ e3 = e2; e2 = e1;
+ e1 = last[j-2].C;
+ sc = max(max(e1, e3)-shift, e2);
+ }
+ sc += score;
+ sc = max(sc, max(ci=last[j].I, cd = cur[j].D));
+ cur[j].C = sc;
+ cur[j+3].D = max(cd, sc-gop)-gext; /* writes up to column ey-y+4;
+   the rows need at least that much room */
+ cur[j].I = max(ci, sc-gop)-gext;
+ }
+ swap((void **)&last, (void **)&cur);
+ }
+ for (i = 0; i <= ey-y+1; i++) last[i].I = cur[i].I; /* after the final
+   swap `last` is the newest row and `cur` the one before it */
+ if (*row1 != last) swap((void **)row1, (void **)row2);
+}
+
+ static void
+ global_down(st_ptr *row1, st_ptr *row2,
+ int x, int y, int ex, int ey,
+ int **wgts, int gop, int gext, int shift,
+ unsigned char *dnap, unsigned char *pro,
+ int N) { /* Score-only global pass in the reverse direction: rows
+   run from ex down to x and columns from ey-y+1 down to 0, using
+   *row1 and *row2 alternately.  On return *row1 is the row
+   computed last.  A non-zero N makes the I score of the last
+   column start at -gext instead of -gop-gext.  The rows are
+   indexed down to -3 (see cur[j-3] below), so the caller must
+   pass pointers that have at least three cells in front of
+   them. */
+ int i, j, k, sc, del, *tmp, e, t, e1,e2,e3, ci,cd, s1, s2, s3, *wt;
+ st_ptr cur, last;
+
+ cur = (*row1); last = *row2;
+
+ sc = -gop-gext;
+
+ for (j = ey-y; j >= 0; j--) { /* NOTE(review): this test is true
+   when the distance from the end is NOT a multiple of 3, the
+   opposite sense of the `j % 3 == 0` test in global_up() --
+   confirm */
+ if ((ey-y+1-j) % 3) {last[j].C = sc; sc-=gext; last[j].I = sc-gop;}
+ else last[j].I = last[j].C = -10000;
+ }
+
+ last[ey-y+1].C = 0;
+ cur[ey-y+1].D = cur[ey-y].D = cur[ey-y-1].D = -10000;
+ last[ey-y+1].D = last[ey-y].D = last[ey-y-1].D = -10000;
+
+ if (N) last[ey-y+1].I = -gext;
+ else last[ey-y+1].I = -gop-gext;
+
+ for (i = ex-x; i >= 0; i--) {
+ wt = &wgts[pro[i+x]][0]; e2 = last[ey-y+1].C;
+ e1 = s2 = s3 = -10000;
+ for (j = ey-y+1; j >= 0; j--) {
+ t = j+y;
+ s1 = wt[dnap[t-1]]; /* NOTE(review): reads dnap[-1] when t == 0 --
+   confirm callers never reach that */
+ sc = -10000;
+ if (t+3 > ey) {
+ if (t+2==ey) sc = e2+s2;
+ else if (t+1==ey) sc = e2-shift+s1;
+ } else { /* s1, s2, s3 are the weights for columns t-1, t, t+1 */
+ e3 = e2; e2 = e1;
+ e1 = last[j+2].C;
+ sc = max(max(e1+s1, e3+s3)-shift, e2+s2);
+ }
+ if (sc < (cd= cur[j].D)) {
+ sc = cd;
+ cur[j-3].D = cd-gext;
+ } else cur[j-3].D =max(cd, sc-gop)-gext;
+ if (sc < (ci= last[j].I)) {
+ sc = ci; del = 0;
+ cur[j].I = ci - gext;
+ } else cur[j].I = max(sc-gop,ci)-gext;
+ cur[j].C = sc;
+ s3 = s2; s2 = s1;
+ }
+ swap((void **)&last, (void **)&cur);
+ }
+ for (i = 0; i <= ey-y+1; i++) last[i].I = cur[i].I; /* after the final
+   swap `last` is the newest row and `cur` the one before it */
+ if (*row1 != last) swap((void **)row1, (void **)row2);
+}
+
+ static void
+ init_row2(int *row, int ld) {
+  /* zero the first ld entries of row */
+  int left = ld;
+
+  while (left > 0) {
+    *row++ = 0;
+    left--;
+  }
+}
+
+ static void
+ init_ROW(st_ptr row, int ld) {
+  /* zero the C, D and I scores of the first ld cells of row */
+  st_ptr cell = row;
+  int left;
+
+  for (left = ld; left > 0; left--, cell++) {
+    cell->C = 0;
+    cell->D = 0;
+    cell->I = 0;
+  }
+}
+
+ static match_ptr
+ combine(match_ptr x1, match_ptr x2, int st) {
+  /* combine - append the edge list x2 to the end of x1 and return
+   * the head of the joined list (x2 itself when x1 is empty; no
+   * adjustment is made in that case).
+   *
+   * When st is non-zero, ->j is incremented on the edges of x2 up
+   * to and including the first edge of type 3 or 4, and that edge's
+   * type is reduced by one.  If x2 holds no such edge (or is empty)
+   * only the ->j increments are applied; the type decrement used to
+   * dereference a NULL pointer in that case.
+   */
+  match_ptr x;
+
+  if (x1 == NULL) return x2;
+  for (x = x1; x->next; x = x->next);
+  x->next = x2;
+  if (st) {
+    for (x = x2; x; x = x->next) {
+      x->j++;
+      if (x->l == 3 || x->l == 4) break;
+    }
+    /* x is NULL when the scan ran off the end of the list */
+    if (x != NULL) x->l--;
+  }
+  return x1;
+}
+
+/*
+ global uses the upward and downward score-only, linear-space
+ global alignment subroutines to build the alignment
+ recursively.
+*/
+
+match_ptr
+global(int x, int y, int ex, int ey,
+ int **wgts, int gop, int gext, int shift,
+ unsigned char *dnap,
+ unsigned char *pro,
+ int N1, int N2,
+ st_ptr *up_stp, st_ptr *dn_stp, st_ptr *tp_stp,
+ struct smgl_str *smgl_sp
+ )
+{ /* global - build the edge list aligning pro[x..ex] with
+     dnap[y..ey].  Degenerate and small regions are handled
+     directly; larger ones are split at the middle protein row and
+     solved recursively.  N1 and N2 are passed on as the N argument
+     of global_up() / global_down() and to small_global().  The
+     three row buffers are passed by address because global_up()
+     and global_down() may exchange them.  The returned list is
+     allocated with ckalloc(); pro_dna() frees each node with
+     free(). */
+ int m;
+ int m1, m2;
+ match_ptr x1, x2, mm1, mm2;
+ /*printf("%d %d %d %d\n", x,y, ex, ey);*/
+ /*
+ if the space required is limited, we can do a quadratic space
+ algorithm to find the alignment.
+ */
+ if (ex <= x) { /* at most one protein row: emit one type-5 edge per
+   whole codon.  Only ->l and ->next are set on these nodes. */
+ mm1 = NULL; mm2= NULL;
+ for (m = y+3; m <= ey; m+=3) {
+ x1 = (match_ptr) ckalloc(sizeof(match_node));
+ x1->l = 5; x1->next = mm1;
+ if (mm1== NULL) mm2 = x1;
+ mm1 = x1;
+ }
+ if (ex == x) { /* exactly one row: its residue takes the leftover
+   columns, or the last codon edge is relabelled as type 4 */
+ if ((ey-y) % 3 != 0) {
+ x1 = (match_ptr) ckalloc(sizeof(match_node));
+ x1->l = ((ey-y) % 3) +1; x1->next = NULL;
+ if (mm2) mm2->next = x1;
+ else mm1 = x1;
+ } else {
+ if (mm2) mm2->l = 4;
+ }
+ }
+ return mm1;
+ }
+ if (ey <= y) { /* no columns left: one type-0 edge per protein row */
+ mm1 = NULL;
+ for (m = x; m <= ex; m++) {
+ x1 = (match_ptr) ckalloc(sizeof(match_node));
+ x1->l = 0; x1->next = mm1; mm1 = x1;
+ }
+ return mm1;
+ }
+ if (ex -x < SGW1-1 && ey-y < SGW2-1) /* fits the small_global() tables */
+ return small_global(x,y,ex,ey,
+ wgts, gop, gext, shift,
+ dnap, pro, N1, N2, smgl_sp);
+ m = (x+ex)/2;
+ /*
+ Do the score only global alignment from row x to row m, m is
+ the middle row of x and ex. Store the information of row m in
+ upC, upD, and upI.
+ */
+ global_up(up_stp, tp_stp, x, y, m, ey,
+ wgts, gop, gext, shift,
+ dnap, pro, N1);
+
+ /*
+ Do the score only global alignment downwards from row ex
+ to row m+1, store information of row m+1 in downC downI and downD
+ */
+ global_down(dn_stp, tp_stp, m+1, y, ex, ey,
+ wgts, gop, gext, shift,
+ dnap, pro, N2);
+
+ /*
+ Use these information of row m and m+1, to find the crossing
+ point of the best alignment with the middle row. The crossing
+ point is given by m1 and m2. Then we recursively call global
+ itself to compute alignments in two smaller regions found by
+ the crossing point and combine the two alignments to form a
+ whole alignment. Return that alignment.
+ */
+ if (find_best(*up_stp, *dn_stp, &m1, &m2, ey-y+1, y, gop)) {
+ x1 = global(x, y, m, m1, wgts, gop, gext, shift, dnap, pro, N1, 0,
+ up_stp, dn_stp, tp_stp, smgl_sp);
+ x2 = global(m+1, m2, ex, ey, wgts, gop, gext, shift, dnap, pro, 0, N2,
+ up_stp, dn_stp, tp_stp, smgl_sp);
+ if (m1 == m2) x1 = combine(x1,x2,1);
+ else x1 = combine(x1, x2,0);
+ } else { /* find_best() returned 0: rows m and m+1 are emitted as
+   two explicit type-0 edges between the two sub-alignments */
+ x1 = global(x, y, m-1, m1, wgts, gop, gext, shift, dnap, pro, N1, 1,
+ up_stp, dn_stp, tp_stp, smgl_sp);
+ x2 = global(m+2, m2, ex, ey, wgts, gop, gext, shift, dnap, pro, 1, N2,
+ up_stp, dn_stp, tp_stp, smgl_sp);
+ mm1 = (match_ptr) ckalloc(sizeof(match_node));
+ mm1->i = m; mm1->l = 0; mm1->j = m1;
+ mm2 = (match_ptr) ckalloc(sizeof(match_node));
+ mm2->i = m+1; mm2->l = 0; mm2->j = m1;
+ mm1->next = mm2; mm2->next = x2;
+ x1 = combine(x1, mm1, 0);
+ }
+ return x1;
+}
+
+ static int
+ find_best(st_ptr up, st_ptr down,
+           int *m1, int *m2,
+           int ld, int y, int gop) {
+  /* find_best - choose the column at which the best alignment
+   * crosses between the `up` row and the `down` row.
+   *
+   * For each column i in 1..ld-1 the C scores (up[i] + down[i]) and
+   * the I scores (up[i] + down[i] + gop) are compared with the
+   * best so far.  *m1 and *m2 receive the crossing columns, offset
+   * by y.  Returns 1 when the best crossing uses the C scores and
+   * 0 when it uses the I scores.  If no column is accepted the
+   * result is 1 with column 0 (st used to be returned
+   * uninitialized in that case).
+   */
+  int i, best = -100000, j = 0, st = 1;
+  int via_c, via_i;
+
+  /* shift so that up[i-1] below is the caller's up[i] */
+  up++;
+  for (i = 1; i < ld; i++) {
+    via_c = up[i-1].C + down[i].C;
+    via_i = up[i-1].I + down[i].I + gop;
+    if (best < via_c) {
+      best = via_c; j = i; st = 1;
+    }
+    if (best < via_i) {
+      best = via_i; j = i; st = 0;
+    }
+  }
+  *m1 = j-1+y;
+  *m2 = j+y;
+  /*printf("find best score =%d\n", best);*/
+  return st;
+}
+
+/*
+ An alignment is represented as a linked list whose element
+ is of type match_node. Each element represent an edge in the
+ path of the alignment graph. The fields of match_node are
+ l --- gives the type of the edge.
+ i, j --- give the end position.
+*/
+
+ static match_ptr
+ small_global(int x, int y, int ex, int ey,
+ int **wgts, int gop, int gext, int shift,
+ unsigned char *dnap, unsigned char *pro,
+ int N1, int N2, struct smgl_str *smgl_sp) { /* Full-table global
+   alignment of pro[x..ex] with dnap[y..ey] using the score table
+   smgl_sp->C, the traceback table smgl_sp->st and the rows
+   smgl_sp->I / smgl_sp->D.  global() only calls this when the
+   region is smaller than SGW1 x SGW2.  Returns the alignment as
+   a ckalloc()ed edge list in start-to-end order. */
+
+ /* int C[SGW1+1][SGW2+1], st[SGW1+1][SGW2+1], D[SGW2+7], I[SGW2+1]; */
+
+ int i, j, e, sc, score, del, k, t, *wt, ci, cd;
+ int *cI, *cD, *cC, *lC, *cst, e2, e3, e4;
+ match_ptr mp, first;
+
+ /*printf("small_global %d %d %d %d\n", x, y, ex, ey);*/
+ sc = -gop-gext; smgl_sp->C[0][0] = 0;
+
+ cI = smgl_sp->I;
+ if (N1) cI[0] = -gext; else cI[0] = sc;
+ for (j = 1; j <= ey-y+1; j++) { /* row 0: only whole codons get a score */
+ if (j % 3== 0) {
+ smgl_sp->C[0][j] = sc;
+ sc -= gext;
+ cI[j] = sc-gop;
+ }
+ else {cI[j] = smgl_sp->C[0][j] = -10000;}
+ smgl_sp->st[0][j] = 5;
+ }
+
+ lC = &smgl_sp->C[0][0];
+ cD = smgl_sp->D; cD[0] = cD[1] = cD[2] = -10000;
+
+ for (i = 1; i <= ex-x+1; i++) {
+ cC = &smgl_sp->C[i][0];
+ wt = &wgts[pro[i+x-1]][0]; cst = &smgl_sp->st[i][0];
+ for (j = 0; j <=ey-y+1; j++) {
+ sc = -10000; del = 0;
+ ci = cI[j];
+ cd= cD[j];
+ t = j+y;
+ if (t < 3) score = -10000;
+ else score = wt[dnap[t-3]];
+ if (j >= 4) {
+ e2 = lC[j-2]-shift; sc = lC[j-3]; e4 = lC[j-4]-shift;
+ del = 3;
+ if (e2 > sc) { sc = e2; del = 2;}
+ if (e4 >= sc) { sc = e4; del = 4;}
+ } else {
+ if (j ==3) {sc= lC[0]; del = 3;}
+ else if (j == 2) {sc = lC[0]-shift; del = 2;}
+ }
+ sc = sc+score;
+ if (sc < ci) {
+ sc = ci; del = 0;
+ }
+ if (sc <= cd) {
+ sc = cd;
+ del = 5;
+ }
+ cC[j] = sc;
+ sc -= gop;
+ if (sc < cd) { /* +10: the D score continues an existing gap */
+ del += 10;
+ cD[j+3] = cd - gext;
+ } else cD[j+3] = sc -gext;
+ if (sc < ci) { /* +20: the I score continues an existing gap */
+ del += 20;
+ cI[j] = ci-gext;
+ } else cI[j] = sc-gext;
+ *(cst++) = del;
+ }
+ lC = cC;
+ }
+ if (N2 && ci +gop > cC[ey-y+1]) { /* ci and cC still hold the values
+   from the last cell of the last row computed above */
+ smgl_sp->st[ex-x+1][ey-y+1] = 0;
+ /*printf("small score = %d\n", ci+gop);*/
+ } /*else printf("small score =%d\n", cC[ey-y+1]);*/
+ first = NULL; e = 1;
+ for (i = ex+1, j = ey+1; i > x || j > y; i--) { /* trace back from
+   the end cell, pushing each edge on the front of the list */
+ mp = (match_ptr) ckalloc(sizeof(match_node));
+ mp->i = i-1;
+ k = (t=smgl_sp->st[i-x][j-y])%10;
+ mp->j = j-1;
+ if (e == 5 && (t/10)%2 == 1) k = 5;
+ if (e == 0 && (t/20)== 1) k = 0;
+ if (k == 5) { j -= 3; i++; e=5;} /* i++ cancels the loop's i--: a
+   type-5 edge stays in the same row */
+ else {j -= k;if (k==0) e= 0; else e = 1;}
+ mp->l = k;
+ mp->next = first;
+ first = mp;
+ }
+
+ /* for (i = 0; i <= ex-x; i++) {
+ for (j = 0; j <= ey-y; j++)
+ printf("%d ", C[i][j]);
+ printf("\n");
+ }
+ */
+ return first;
+}
+
+#define XTERNAL
+#include "upam.h"
+
+ extern void
+ display_alig(int *a, unsigned char *dna, unsigned char * pro, int length, int ld)
+{
+  /* display_alig - print an alignment to stdout in rows of WIDTH
+   * columns: the DNA line, a marker line and the protein line.
+   *
+   * a[0] and a[1] give the start positions in the protein and the
+   * DNA; a[2..length-1] are the edit codes.  dna (ld entries) and
+   * pro are residue numbers, converted for printing through
+   * NCBIstdaa[].
+   *
+   * Fixes relative to the earlier version: the converted DNA copy
+   * is freed; the leftover text is shifted with memmove() instead
+   * of an overlapping strncpy(); tmp is padded with spaces so that
+   * a 4-column step does not copy a NUL into the output lines;
+   * a[j+1] is only inspected when it exists; and codes below 2 no
+   * longer index in front of tmp.
+   */
+  int len = 0, i, j, x, y, lines, k;
+  char line1[100], line2[100], line3[100], tmp[10];
+  unsigned char *dna1, c1, c2, c3, *st;
+
+  memset(tmp, ' ', sizeof(tmp)-1);
+  tmp[sizeof(tmp)-1] = '\0';
+
+  dna1 = ckalloc((size_t)ld);
+  for (st = dna, i = 0; i < ld; i++, st++) dna1[i] = NCBIstdaa[*st];
+  line1[0] = line2[0] = line3[0] = '\0'; x= a[0]; y = a[1]-1;
+
+  for (len = 0, j = 2, lines = 0; j < length; j++) {
+    i = a[j];
+    /*printf("%d %d %d\n", i, len, b->j);*/
+    /* codes 2..4: the residue goes in the last column of the step */
+    if (i >= 2 && i < 5) tmp[i-2] = NCBIstdaa[pro[x++]];
+    if (i == 5) {
+      i = 3; tmp[0] = tmp[1] = tmp[2] = '-';
+      if (j+1 < length && a[j+1] == 2) tmp[2] = ' ';
+    }
+    if (i > 0) {
+      strncpy(&line1[len], (const char *)&dna1[y], i); y+=i;
+    } else {line1[len] = '-'; i = 1; tmp[0] = NCBIstdaa[pro[x++]];}
+    strncpy(&line2[len], tmp, i);
+    /* turn tmp into the marker line for this step */
+    for (k = 0; k < i; k++) {
+      if (tmp[k] != ' ' && tmp[k] != '-') {
+        if (k == 2) tmp[k] = '\\';
+        else if (k == 1) tmp[k] = '|';
+        else tmp[k] = '/';
+      } else tmp[k] = ' ';
+    }
+    if (i == 1) tmp[0] = ' ';
+    strncpy(&line3[len], tmp, i);
+    tmp[0] = tmp[1] = tmp[2] = ' ';
+    len += i;
+    line1[len] = line2[len] =line3[len] = '\0';
+    if (len >= WIDTH) {
+      printf("\n%5d", WIDTH*lines++);
+      for (k = 10; k <= WIDTH; k+=10)
+        printf(" . :");
+      if (k-5 < WIDTH) printf(" .");
+      /* print exactly WIDTH columns, then restore the cut point */
+      c1 = line1[WIDTH]; c2 = line2[WIDTH]; c3 = line3[WIDTH];
+      line1[WIDTH] = line2[WIDTH] = line3[WIDTH] = '\0';
+      printf("\n %s\n %s\n %s\n", line1, line3, line2);
+      line1[WIDTH] = c1; line2[WIDTH] = c2; line3[WIDTH] = c3;
+      /* move the remainder (including its terminator) to the front;
+         source and destination overlap, hence memmove() */
+      memmove(line1, &line1[WIDTH], (size_t)(len - WIDTH + 1));
+      memmove(line2, &line2[WIDTH], (size_t)(len - WIDTH + 1));
+      memmove(line3, &line3[WIDTH], (size_t)(len - WIDTH + 1));
+      len = len - WIDTH;
+    }
+  }
+  printf("\n%5d", WIDTH*lines);
+  for (k = 10; k < len; k+=10)
+    printf(" . :");
+  if (k-5 < len) printf(" .");
+  printf("\n %s\n %s\n %s\n", line1, line3, line2);
+
+  free(dna1);
+}
+
+
+/* The alignment array stores the operations that align the protein and DNA sequences.
+ The codes stored in the array are as follows:
+ 0: deletion of an amino acid.
+ 2: frame shift, 2 nucleotides match with an amino acid
+ 3: match an amino acid with a codon
+ 4: the other type of frame shift
+ 5: deletion of a codon
+
+
+ Also, the first two elements of the array store the starting points
+ in the protein and DNA sequences of the local alignment.
+
+ Display looks like where WIDTH is assumed to be divisible by 10.
+
+ 0 . : . : . : . : . : . :
+ CCTATGATACTGGGATACTGGAACGTCCGCGGACTGACACACCCGATCCGCATGCTCCTG
+ P M I L G Y W N V R G L T H P I R M L L
+
+ 60 . : . : . : . : . : . :
+ GAATACACAGACTCAAGCTATGATGAGAAGAGATACACCATGGGTGACGCTCCCGACTTT
+ E Y T D S S Y D E K R Y T M G D A P D F
+*/
+
+
+/* fatal - write msg and a newline to stderr, then terminate the process
+   with exit status 1.  Does not return. */
+void fatal(char *msg)
+{
+  fprintf(stderr, "%s\n", msg);
+  exit(1);
+}
+
+/*
+ * fx_walign - produce one protein/DNA alignment for the region found by
+ * do_fastx().
+ *
+ * do_fastx() is run first to fill a_res->rst and the offset hoff.  If
+ * a_res->rst.score[ppst->score_ix] does not exceed score_thresh, the
+ * function sets a_res->sw_score = 0 and a_res->n1 = n1 and returns without
+ * aligning.  Otherwise a window [l_min, l_max) is derived from hoff and
+ * ppst->param_u.fa.optwid, the library-side sequence is restricted to that
+ * window (copied into a zero-bounded buffer when the window is smaller
+ * than the sequence), pro_dna() is called, and the coordinates it returns
+ * are shifted back by l_min.
+ *
+ *   FASTX  (TFAST undefined): xaa is restricted; min0/max0 are shifted.
+ *   TFASTX (TFAST defined)  : yaa is restricted; min1/max1 are shifted
+ *                             and a_res->n1 is set to n1.
+ *
+ * The frame argument is accepted but not used here.
+ * Exits with status 1 if the window copy cannot be allocated.
+ */
+void
+fx_walign (const unsigned char *aa0, int n0,
+           const unsigned char *xaa, int n1, unsigned char *yaa,
+           int frame, int max_res,
+           struct pstruct *ppst,
+           struct f_struct *f_str,
+           struct a_res_str *a_res,
+           int score_thresh
+           )
+{
+  const unsigned char *src;     /* sequence that gets windowed */
+  unsigned char *local_seq;     /* src itself, or a windowed copy of it */
+  int need_copy;
+  int hoff, l_min, l_max, n_nt;
+  int score_ix, window;
+  struct score_count_s s_info;
+
+  memset(&s_info,0,sizeof(s_info));
+
+  score_ix = ppst->score_ix;
+
+  /* n_nt is the length handed to pro_dna() as the DNA-side length (FASTX)
+     or used to bound the DNA window (TFASTX) */
+#ifndef TFAST   /* FASTX */
+  n_nt = n0;
+#else           /* TFASTX */
+  n_nt = n1;
+#endif
+
+  do_fastx(aa0, n0, xaa, n1, yaa, ppst, f_str, &a_res->rst, &hoff,1, &s_info);
+
+  if (a_res->rst.score[score_ix] <= score_thresh) {
+    a_res->sw_score = 0;
+    a_res->n1 = n1;
+    return;
+  }
+
+  /* now we will do an alignment, but we need to be certain to do the
+     alignment in the region mapped by hoff to include the
+     high-scoring region */
+
+  /* NOTE(review): earlier code computed w_fact = 4 when
+     rst.score[0] > 2 * rst.score[1] (else 2) with the comment "use wider
+     window", but never applied it to the window.  The dead computation is
+     dropped here; confirm whether the window was meant to be scaled. */
+
+  /* Here we need to use different strategies depending on whether we
+     have DNA or protein.  For a DNA query (protein library, FASTX), the
+     library protein sequence is bounded; for a protein query (TFASTX)
+     the DNA-side yaa is bounded, and the mapping is in nucleotides. */
+#ifndef TFAST   /* map onto the protein (aa1) sequence */
+  window = min(n1, ppst->param_u.fa.optwid);
+  l_min = max(0, -window - hoff);
+  l_max = min(n1, n0-hoff+window);
+  src = xaa;
+#else
+  window = min(n0, ppst->param_u.fa.optwid);
+  l_min = max(0,(hoff-window)*3);
+  l_max = min((hoff+window+n0)*3,n_nt);
+  src = yaa;
+#endif
+
+  /* a private copy is only needed when the window is a proper sub-range */
+  need_copy = (l_min > 0 || l_max < n1 - 1);
+  local_seq = (unsigned char *)src;
+  if (need_copy) {
+    /* +2: one zero byte before and one after the copied residues */
+    local_seq = (unsigned char *)calloc(l_max - l_min + 2, sizeof(unsigned char));
+    if (local_seq == NULL) {
+      fprintf(stderr," *** cannot allocate local sequence copy %d\n",
+              l_max - l_min + 2);
+      exit(1);
+    }
+    local_seq++;        /* skip the leading zero */
+    memcpy(local_seq, src+l_min, l_max - l_min);
+  }
+
+  /* The score was already tested against score_thresh right after
+     do_fastx() and has not changed since, so no second test is needed. */
+
+  /* pro_dna always compares protein to DNA, and returns protein
+     coordinates in a_res->min0,max0 */
+
+  a_res->sw_score =
+    pro_dna(
+#ifndef TFAST   /* FASTX */
+            local_seq, l_max - l_min,   /* true protein is in aa1/xaa */
+            yaa, n_nt,
+#else           /* TFASTX */
+            aa0, n0,                    /* true protein is in aa0 */
+            local_seq, l_max - l_min,
+#endif
+            ppst->pam2[0],
+#ifdef OLD_FASTA_GAP
+            -(ppst->gdelval - ppst->ggapval),
+#else
+            -ppst->gdelval,
+#endif
+            -ppst->ggapval,
+            -ppst->gshift,
+            &f_str->smgl_s,
+            max_res, a_res);
+
+  /* release the windowed copy; the allocation starts one byte earlier */
+  if (need_copy) free(--local_seq);
+
+  /* translate window-relative coordinates back to full-sequence ones */
+#ifndef TFAST
+  a_res->min0 += l_min;
+  a_res->max0 += l_min;
+#else
+  a_res->n1 = n1;
+  a_res->min1 += l_min;
+  a_res->max1 += l_min;
+#endif
+
+}
+
+/*
+ fx_malign is a recursive interface to fx_walign() that is called
+ from do_walign(). fx_malign() first does an alignment, then checks
+ to see if the score is greater than the threshold. If so, it tries
+ doing a left and right alignment.
+
+ In this implementation, the translation required for f_str->aa1x and
+ f_str->aa1y is done at each recursive level. A better implementation
+ would do the translation once, and then be more sophisticated about
+ the boundaries on f_str->aa1x,y. This is challenging, however,
+ because there is no easy way to subset aa1x [111112222233333],
+ though it is possible to subset aa1y cleanly. The current solution
+ is to re-generate xaa from yaa.
+
+ 21-Nov-2010 -- like do_walign(), fx_malign() uses a const xaa, to
+ ensure that threads do not interfere with each other. If a
+ sub-range is needed, a new sequence is produced.
+
+ */
+/* fx_malign_subseq - return a private, zero-bounded copy of src[0..len-1].
+   The buffer holds len+2 zeroed bytes and the returned pointer is one past
+   the leading zero, so it must be released with free(ptr - 1).
+   Exits with status 1 if the buffer cannot be allocated. */
+static unsigned char *
+fx_malign_subseq(const unsigned char *src, int len)
+{
+  unsigned char *buf;
+
+  buf = (unsigned char *)calloc((size_t)len+2, sizeof(unsigned char));
+  if (buf == NULL) {
+    fprintf(stderr," *** cannot allocate sequence copy %d\n", len+2);
+    exit(1);
+  }
+  buf++;                        /* skip the initial zero */
+  memcpy(buf, src, (size_t)len);
+  return buf;
+}
+
+/*
+ * fx_malign - align with fx_walign(), then recursively align the regions
+ * left and right of that alignment.
+ *
+ * cur_ares->res is allocated here (max_res ints) and cur_ares->next is
+ * cleared before fx_walign() fills cur_ares.  When ppst->do_rep is set and
+ * the score exceeds score_thresh, the sequence to the left of the
+ * alignment and the sequence to its right are each copied and passed to a
+ * recursive call, provided they are at least min_alen long.  Sub-results
+ * scoring above score_thresh are merged into the returned chain with
+ * merge_ares_chains(); the others are freed.
+ *
+ * first_align selects the threshold handed to fx_walign(): 1 for the
+ * first call, score_thresh for recursive calls.
+ *
+ * Returns the head of the a_res_str chain (cur_ares, or whatever
+ * merge_ares_chains() returns).  Exits with status 1 on allocation failure.
+ */
+struct a_res_str *
+fx_malign (const unsigned char *aa0, int n0,
+           const unsigned char *xaa, int n1, unsigned char *yaa,
+           int frame,
+           int score_thresh, int max_res,
+           struct pstruct *ppst,
+           struct f_struct *f_str,
+           struct a_res_str *cur_ares,
+           int first_align)
+{
+  struct a_res_str *tmpl_ares, *tmpr_ares, *this_ares;
+  unsigned char *my_xaa;
+  unsigned char *local_xaa, *local_yaa;
+  int nxyaa;
+  int score_ix;
+  int min_alen;
+#ifdef TFAST
+  int iphase, i;
+  unsigned char *fd;
+#endif
+  int max_sub_score = -1;
+
+  score_ix = ppst->score_ix;
+
+  /* now we need alignment storage - get it */
+  if ((cur_ares->res = (int *)calloc((size_t)max_res,sizeof(int)))==NULL) {
+    fprintf(stderr," *** cannot allocate alignment results array %d\n",max_res);
+    exit(1);
+  }
+
+  cur_ares->next = NULL;
+
+#ifdef TFAST
+  min_alen = min(n0,MIN_LOCAL_LEN)*3;   /* n0 in aa, min_alen in nt */
+
+  /* convert yaa to xaa -- cannot use *fs to stop because subset
+     does not have '\0' in all three frames */
+  my_xaa = (unsigned char *)calloc(n1+2,sizeof(unsigned char));
+  if (my_xaa == NULL) {
+    fprintf(stderr," *** cannot allocate sequence copy %d\n", n1+2);
+    exit(1);
+  }
+  my_xaa++;
+  for (fd=my_xaa, iphase = 0; iphase < 3; iphase++) {
+    for (i=iphase; i<n1; i+=3,fd++) *fd = yaa[i];
+  }
+  *fd=EOSEQ;
+#else
+  min_alen = min(n0/3,MIN_LOCAL_LEN);   /* n0 in nt, min_alen in aa */
+  my_xaa = (unsigned char *)xaa;
+#endif
+
+  fx_walign(aa0, n0, my_xaa, n1, yaa, frame, max_res,
+            ppst, f_str, cur_ares,(first_align ? 1 : score_thresh));
+
+  /* in cur_ares, min0,max0 are always protein, min1,max1 are always
+     DNA, but n0 could be protein or DNA, depending on
+     FASTX/TFASTX */
+
+  if (!ppst->do_rep || cur_ares->rst.score[ppst->score_ix] <= score_thresh) {
+#ifdef TFAST
+    free(--my_xaa);
+#endif
+    return cur_ares;
+  }
+
+  /* we are going to do a recursive edit, so we need a local copy of
+     xaa (fastx) or yaa (tfastx) */
+
+#ifdef TFAST    /* TFASTX, n1 is nt */
+  nxyaa = cur_ares->min1;
+#else           /* FASTX n1 is aa */
+  nxyaa = cur_ares->min0;
+#endif
+
+  if (nxyaa >= min_alen) {      /* try the left */
+    /* allocate a_res */
+    tmpl_ares = (struct a_res_str *)calloc(1, sizeof(struct a_res_str));
+    if (tmpl_ares == NULL) {
+      fprintf(stderr," *** cannot allocate a_res_str for sub-alignment\n");
+      exit(1);
+    }
+
+#ifdef TFAST    /* TFASTX, no xaa */
+    local_xaa = my_xaa;         /* my_xaa is calloc'ed for TFAST */
+    local_yaa = fx_malign_subseq(yaa, nxyaa);
+#else
+    local_yaa = yaa;
+    local_xaa = fx_malign_subseq(xaa, nxyaa);
+#endif
+    tmpl_ares = fx_malign(aa0, n0, local_xaa, nxyaa,
+                          local_yaa,
+                          frame, score_thresh, max_res,
+                          ppst, f_str, tmpl_ares, 0);
+
+#ifdef TFAST
+    free(--local_yaa);          /* local_yaa, allocated above */
+#else
+    free(--local_xaa);          /* FASTX - local_xaa allocated above */
+#endif
+
+    if (tmpl_ares->rst.score[ppst->score_ix] > score_thresh) {
+      max_sub_score = tmpl_ares->rst.score[ppst->score_ix];
+    }
+    else {
+      if (tmpl_ares->res) free(tmpl_ares->res);
+      free(tmpl_ares);
+      tmpl_ares = NULL;
+    }
+  }
+  else {tmpl_ares = NULL;}
+
+  /* do the right */
+#ifdef TFAST    /* TFASTX - n0 is aa, n1 nt */
+  nxyaa = n1 - cur_ares->max1 - 1;
+#else           /* FASTX - n1 is aa, n0 nt */
+  nxyaa = n1 - cur_ares->max0 - 1;
+#endif
+
+  if (nxyaa >= min_alen) {      /* try the right */
+    /* allocate a_res */
+    tmpr_ares = (struct a_res_str *)calloc(1, sizeof(struct a_res_str));
+    if (tmpr_ares == NULL) {
+      fprintf(stderr," *** cannot allocate a_res_str for sub-alignment\n");
+      exit(1);
+    }
+
+    /* find boundaries */
+#ifdef TFAST    /* TFASTX, no xaa */
+    local_xaa = my_xaa;
+    local_yaa = fx_malign_subseq(yaa+cur_ares->max1+1, nxyaa);
+#else
+    local_yaa = yaa;
+    local_xaa = fx_malign_subseq(xaa+cur_ares->max0+1, nxyaa);
+#endif
+    tmpr_ares = fx_malign(aa0, n0,
+                          local_xaa, nxyaa, local_yaa,
+                          frame,
+                          score_thresh, max_res,
+                          ppst, f_str, tmpr_ares,0);
+#ifdef TFAST    /* TFASTX, no xaa */
+    free(--local_yaa);
+#else
+    free(--local_xaa);
+#endif
+
+    if (tmpr_ares->rst.score[ppst->score_ix] > score_thresh) {
+      /* the right-hand results are relative to the copied sub-sequence;
+         shift every entry of the chain back to full-sequence coordinates */
+      for (this_ares = tmpr_ares; this_ares; this_ares = this_ares->next) {
+#ifdef TFAST
+        this_ares->min1 += cur_ares->max1+1;
+        this_ares->max1 += cur_ares->max1+1;
+#else
+        this_ares->min0 += cur_ares->max0+1;
+        this_ares->max0 += cur_ares->max0+1;
+#endif
+      }
+
+      if (tmpr_ares->rst.score[ppst->score_ix] > max_sub_score) {
+        max_sub_score = tmpr_ares->rst.score[ppst->score_ix];
+      }
+    }
+    else {
+      if (tmpr_ares->res) free(tmpr_ares->res);
+      free(tmpr_ares);
+      tmpr_ares = NULL;
+    }
+  }
+  else {tmpr_ares = NULL;}
+
+#ifdef TFAST
+  free(--my_xaa);
+#endif
+
+  /* nothing on either side beat the threshold: return the single result */
+  if (max_sub_score <= score_thresh) {
+    if (tmpl_ares) {
+      if (tmpl_ares->res) free(tmpl_ares->res);
+      free(tmpl_ares);
+    }
+    if (tmpr_ares) {
+      if (tmpr_ares->res) free(tmpr_ares->res);
+      free(tmpr_ares);
+    }
+    return cur_ares;
+  }
+
+  cur_ares = merge_ares_chains(cur_ares, tmpl_ares, score_ix, "left");
+  cur_ares = merge_ares_chains(cur_ares, tmpr_ares, score_ix, "right");
+
+  return cur_ares;
+}
+
+/* do_walign() can be called with aa0,n0 as nt (FASTX) or
+ aa0,n0 as aa (TFASTX). if aa0 is nt, then f_str->aa0x,y have the
+ translations already. if aa0 is aa, then f_str->aa1x,y must be
+ generated.
+
+ This is the last time that aa0 can be nt or aa; in all lower
+ functions (fx_malign, do_fastx, fx_walign), both aa0, n0 and aa1,
+ n1 are amino acids; though one or the other may be translated.
+
+ In the lower functions, yaa can be aa0y (FASTX) or aa1y (TFASTX).
+ If it is aa1y, there may be no translation available.
+
+ 21-Nov-2010 With fasta-36.3.1, do_walign() uses const aa0, aa1. If aa1 needs
+ modification for recursive alignment, a copy is made.
+*/
+
+/*
+ * do_walign - top-level alignment entry point.
+ *
+ * Sets *have_ares to 0x3, allocates the first a_res_str, temporarily
+ * overrides four ppst->param_u.fa settings (use_E_thresholds, optflag,
+ * optcut, and optwid unless optwid_set), runs fx_malign(), numbers the
+ * resulting chain through ->index starting at 0, restores the four
+ * settings and returns the chain.  Returns NULL if the first a_res_str
+ * cannot be allocated.
+ *
+ * With TFAST defined, aa1 is translated with saatran() into f_str->aa1x
+ * for the three frames 3*frame .. 3*frame+2, and f_str->aa1y is built from
+ * it, before fx_malign() is called.
+ */
+struct a_res_str *
+do_walign (const unsigned char *aa0, int n0,
+           const unsigned char *aa1, int n1,
+           int frame, int repeat_thresh,
+           struct pstruct *ppst,
+           struct f_struct *f_str,
+           int *have_ares)
+{
+  struct a_res_str *a_res, *ares_p;
+  int saved_use_E_thresholds, saved_optflag, saved_optcut, saved_optwid;
+  int next_index;
+#ifdef TFAST
+  int tx_end, tx_len, itx, n10, iphase;
+  unsigned char *xaa, *fs, *fd;
+#endif
+#ifdef DEBUG
+  unsigned long adler32_crc;
+#endif
+
+  *have_ares = 0x3;     /* set 0x2 bit to indicate local copy */
+
+  a_res = (struct a_res_str *)calloc(1, sizeof(struct a_res_str));
+  if (a_res == NULL) {
+    fprintf(stderr," [do_walign] Cannot allocate a_res");
+    return NULL;
+  }
+
+#ifdef DEBUG
+  /* checksum aa1 so a modification during alignment can be reported */
+  adler32_crc = adler32(1L,aa1,n1);
+#endif
+
+  /* remember the search settings that are overridden for alignment */
+  saved_use_E_thresholds = ppst->param_u.fa.use_E_thresholds;
+  saved_optflag = ppst->param_u.fa.optflag;
+  saved_optcut = ppst->param_u.fa.optcut;
+  saved_optwid = ppst->param_u.fa.optwid;
+
+  ppst->param_u.fa.use_E_thresholds = 0;
+  ppst->param_u.fa.optflag = 1;
+  ppst->param_u.fa.optcut = 0;
+  if (!ppst->param_u.fa.optwid_set) {
+    ppst->param_u.fa.optwid *= 2;
+  }
+
+#ifndef TFAST   /* FASTX */
+  a_res = fx_malign(f_str->aa0x, n0, aa1, n1, f_str->aa0y, frame,
+                    repeat_thresh, f_str->max_res,
+                    ppst, f_str, a_res, 1);
+#else           /* TFASTX */
+  /* aa0 has a protein sequence */
+  /* aa1 has a raw DNA sequence */
+
+  xaa = f_str->aa1x;
+  tx_end = 0;
+  for (itx = 3*frame; itx < 3*frame + 3; itx++) {
+    tx_len = saatran(aa1,&xaa[tx_end],n1,itx);
+    tx_end += tx_len + 1;       /* leave room for the separator */
+  }
+  n10 = tx_end - 1;
+
+  /* create aa1y from xaa: segment iphase of xaa is written to every
+     third position of aa1y, starting at offset iphase */
+  fs = xaa;
+  for (iphase = 0; iphase < 3; iphase++) {
+    fd = &f_str->aa1y[iphase];
+    while (*fs != EOSEQ) {
+      *fd = *fs;
+      fd += 3;
+      fs++;
+    }
+    *fd = EOSEQ;
+    fs++;                       /* step over the EOSEQ in xaa */
+  }
+  f_str->have_yaa = 1;
+
+  a_res = fx_malign(aa0, n0, xaa, n10, f_str->aa1y, frame,
+                    repeat_thresh, f_str->max_res,
+                    ppst, f_str, a_res, 1);
+#endif
+
+#ifdef DEBUG
+  if (adler32(1L,aa1,n1) != adler32_crc) {
+    fprintf(stderr,"[dropfx.c/do_walign] adler32_crc mismatch n1: %d\n",n1);
+  }
+#endif
+
+  /* number the alignments in chain order */
+  next_index = 0;
+  ares_p = a_res;
+  while (ares_p != NULL) {
+    ares_p->index = next_index++;
+    ares_p = ares_p->next;
+  }
+
+  /* put the caller's search settings back */
+  ppst->param_u.fa.use_E_thresholds = saved_use_E_thresholds;
+  ppst->param_u.fa.optflag = saved_optflag;
+  ppst->param_u.fa.optcut = saved_optcut;
+  ppst->param_u.fa.optwid = saved_optwid;
+  return a_res;
+}
+
+/* aln_func_vals - set up aln.qlfact, qlrev, llfact, llmult, frame, llrev */
+/* call from calcons, calc_id, calc_code */
+/* The translated side gets a factor of 3 and, when frame > 0, its reverse
+   flag set: that is the query (ql*) without TFAST and the library (ll*)
+   with TFAST.  The other side gets factor 1 and reverse flag 0. */
+void
+aln_func_vals(int frame, struct a_struct *aln) {
+  int is_rev = (frame > 0) ? 1 : 0;
+
+  aln->frame = frame;
+  aln->llmult = 1;
+
+#ifndef TFAST
+  aln->qlfact = 3;
+  aln->qlrev = is_rev;
+  aln->llfact = 1;
+  aln->llrev = 0;
+#else   /* TFASTX */
+  aln->qlfact = 1;
+  aln->qlrev = 0;
+  aln->llfact = 3;
+  aln->llrev = is_rev;
+#endif  /* TFASTX */
+}
+
+/* pre_cons - required for programs like tfastx/y/s that do translations
+   on DNA sequences and save them in f_str->aa1??
+
+   With TFAST defined, aa1 is translated with saatran() for the three
+   frames 3*frame .. 3*frame+2 into consecutive segments of f_str->aa1x,
+   f_str->aa1y is rebuilt from those segments, and f_str->have_yaa is set.
+   Without TFAST the function does nothing.
+*/
+
+void
+pre_cons(const unsigned char *aa1, int n1, int frame, struct f_struct *f_str) {
+#ifdef TFAST
+  int seg_start, seg_len, itx, phase;
+  unsigned char *src, *dst;
+
+  /* translate the three frames one after another into aa1x */
+  seg_start = 0;
+  for (itx = 3*frame; itx < 3*frame + 3; itx++) {
+    seg_len = saatran(aa1,&f_str->aa1x[seg_start],n1,itx);
+    seg_start += seg_len + 1;
+  }
+
+  /* create aa1y from aa1x: segment `phase` goes to every third position
+     of aa1y, starting at offset `phase` */
+  src = f_str->aa1x;
+  for (phase = 0; phase < 3; phase++) {
+    dst = &f_str->aa1y[phase];
+    while (*src != EOSEQ) {
+      *dst = *src;
+      dst += 3;
+      src++;
+    }
+    *dst = EOSEQ;
+    src++;                      /* step over the EOSEQ in aa1x */
+  }
+  f_str->have_yaa = 1;
+#endif
+}
+
+/*
+ Alignment: stores the operations that align the protein and DNA sequences.
+ The codes stored in the array are as follows:
+ 0: deletion of an amino acid.
+ 2: frame shift, 2 nucleotides match with an amino acid
+ 3: match an amino acid with a codon
+ 4: the other type of frame shift
+ 5: deletion of a codon
+
+ The first two elements of the array store the starting points
+ in the protein and DNA sequences of the local alignment.
+*/
+
+#include "a_mark.h"
+
+extern int align_type(int score, char sp0, char sp1, int nt_align, struct a_struct *aln, int pam_x_id_sim);
+
+extern void
+process_annot_match(int *itmp, int *pam2aa0v,
+ long ip, long ia, char *sp1, char *sp1a, const unsigned char *sq,
+ struct annot_entry *annot_arr_p, int n_annots, char **ann_comment,
+ void *annot_stack, int *have_push_features, int *v_delta,
+ int *d_score_p, int *d_ident_p, int *d_alen_p, int *d_gaplen_p,
+ struct domfeat_data **left_domain_head_p,
+ struct domfeat_data *left_domain_p,
+ long *left_end_p, int init_score);
+
+extern int
+next_annot_match(int *itmp, int *pam2aa0v,
+ long ip, long ia, char *sp1, char *sp1a, const unsigned char *sq,
+ int i_annot, int n_annot, struct annot_entry **annot_arr, char **ann_comment,
+ void *annot_stack, int *have_push_features, int *v_delta,
+ int *d_score_p, int *d_ident_p, int *d_alen_p, int *d_gaplen_p,
+ struct domfeat_data **left_domain_head_p,
+ struct domfeat_data *left_domain_p,
+ long *left_domain_end, int init_score);
+
+extern void
+close_annot_match (int ia, void *annot_stack, int *have_push_features,
+ int *d_score_p, int *d_ident_p, int *d_alen_p, int *d_gaplen_p,
+ struct domfeat_data **left_domain_p,
+ long *left_end_p, int init_score);
+
+extern void
+comment_var(long i0, char sp0, long i1, char sp1, char o_sp1, char sim_char,
+ const char *ann_comment, struct dyn_string_str *annot_var_dyn,
+ int target, int d_type);
+
+void
+display_push_features(void *annot_stack, struct dyn_string_str *annot_var_dyn,
+ long i0_pos, char sp0, long i1_pos, char sp1, char sym,
+ int score, double comp, int sw_score, int n0, int n1,
+ void *pstat_void, int d_type);
+
+#define DP_FULL_FMT 1 /* Region: score: bits: id: ... */
+#define Q_TARGET 0
+#define L_TARGET 1
+
+int seq_pos(int pos, int rev, int off);
+
+/* values of calc_func_mode */
+#define CALC_CONS 1
+#define CALC_CODE 2
+#define CALC_ID 3
+#define CALC_ID_DOM 4
+
+/* add_annot_code: appends one annotation code to ann_code_dyn.
+
+   Nothing is appended when have_ann is zero, when ann_aa1_i1 is ' ', or
+   when ann_aa1_i1 is '[' or ']'.  Otherwise the text
+   "|X<ann>:<q_off_pos+1><sp0><sim_sym_code><l_off_pos+1><sp1>" is built
+   and passed to dyn_strcat(). */
+void
+add_annot_code(int have_ann, char sp0, char sp1,
+               char ann_aa1_i1,
+               long q_off_pos, long l_off_pos, char sim_sym_code,
+               struct dyn_string_str *ann_code_dyn)
+{
+  char code_buf[MAX_STR];
+
+  /* no annotation at this position */
+  if (!have_ann || ann_aa1_i1 == ' ') return;
+
+  /* bracket annotations are not reported here */
+  if (ann_aa1_i1 == '[' || ann_aa1_i1 == ']') return;
+
+  sprintf(code_buf, "|%c%c:%ld%c%c%ld%c",
+          'X', ann_aa1_i1, q_off_pos+1, sp0,
+          sim_sym_code, l_off_pos+1, sp1);
+  dyn_strcat(ann_code_dyn, code_buf);
+}
+
+/* universal alignment code builder for calc_cons_a(), calc_code(), and calc_id() */
+/* see cal_cons2.c/calc_cons_u() for strategy */
+
+int
+calc_cons_u( /* inputs */
+ const unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1,
+ struct a_res_str *a_res, /* alignment encoding */
+ struct pstruct *ppst,
+ struct f_struct *f_str,
+ void *pstat_void,
+ /* annotation stuff */
+ const unsigned char *ann_arr,
+ const unsigned char *aa0a, const struct annot_str *annot0_p,
+ const unsigned char *aa1a, const struct annot_str *annot1_p,
+ int calc_func_mode, /* CALC_CONS, CALC_CODE, CALC_ID */
+ int display_code, /* used only by CALC_CODE */
+ /* outputs */
+ int *nc,
+ char *seqc0, char *seqc1, char *seqca, int *cumm_seq_score,
+ char *seqc0a, char *seqc1a,
+ struct a_struct *aln,
+ int *score_delta,
+ struct dyn_string_str *annot_var_dyn,
+ struct dyn_string_str *align_code_dyn)
+{
+ int i0, i1, i, j;
+ int lenc, not_c, itmp, ngap_p, ngap_d, nfs;
+ int *i_spa;
+ char *sp0_p, *sp0a_p, *sp1_p, *sp1a_p, *spa_p, t_spa;
+ char sp0_c, sp1_c, spa_c; /* used for CALC_ID, CALC_CODE */
+ char sp0a_c, sp1a_c; /* used for CALC_CODE */
+
+ struct update_code_str *update_data_p;
+
+ const unsigned char *sq;
+ const unsigned char *ap0, *ap1;
+ const unsigned char *ap1a; /* ap1 always points to protein, and
+ only protein has annotations */
+ const struct annot_str *annotp_p; /* protein annotations from annot_str */
+ int comment_target;
+
+ int *rp, *rpmax;
+ int have_ann;
+
+ /* variables for variant changes/region scores */
+ char tmp_str[MAX_LSTR];
+ void *annot_stack;
+ int have_push_features, prev_match, *have_push_features_p;
+
+ char *sim_sym = aln_map_sym[MX_ACC];
+ struct annot_entry **s_annotp_arr_p;
+ int i1_annot, v_delta, v_tmp;
+ long i0_offset, i1_offset;
+
+ long i1_left_end;
+ int show_code, annot_fmt, start_flag;
+
+ int d1_score, d1_ident, d1_alen, d1_gaplen;
+ struct domfeat_data *left_domain_list1, *left_domain_head1;
+
+ char *ann_comment;
+
+ *score_delta = 0;
+ d1_score = d1_ident = d1_alen = d1_gaplen = 0;
+ i1_left_end = -1;
+ left_domain_head1 = left_domain_list1 = NULL;
+
+ NULL_dyn_string(annot_var_dyn);
+
+ if (ppst->ext_sq_set) {sq = ppst->sqx;}
+ else {sq = ppst->sq;}
+
+#ifndef TFAST /* FASTX */
+ comment_target = 1;
+ aln->amin1 = aln->smin1 = a_res->min0; /* prot */
+ aln->amin0 = aln->smin0 = a_res->min1; /* DNA */
+
+ i0_offset = aln->q_offset;
+ i1_offset = aln->l_offset;
+
+ ap0 = f_str->aa0y; /* translated DNA */
+ ap1 = aa1; /* protein */
+
+ ap1a = aa1a;
+ annotp_p = annot1_p;
+
+ if (calc_func_mode == CALC_CONS) {
+ have_ann = (seqc0a !=NULL && aa1a != NULL);
+ sp0_p = seqc0; /* translated DNA */
+ sp1_p = seqc1; /* protein */
+ spa_p = seqca;
+ sp1a_p = seqc1a; /* protein library can have annotation */
+ sp0a_p = seqc0a; /* sp0a is always ' ' - no translated annotation */
+ annot_fmt = DP_FULL_FMT;
+ }
+ else if (calc_func_mode == CALC_ID || calc_func_mode == CALC_ID_DOM) {
+ /* does not require aa0a/aa1a, only for variants */
+ have_ann = ((annotp_p && annotp_p->n_annot > 0) || (annot0_p && annot0_p->n_annot > 0));
+ sp0_p = &sp0_c;
+ sp1_p = &sp1_c;
+ spa_p = &spa_c;
+ sp0a_p = &sp0a_c;
+ sp1a_p = &sp1a_c;
+ annot_fmt = 3;
+ }
+ else if (calc_func_mode == CALC_CODE) {
+ spa_p = &spa_c;
+ sp0_p = &sp0_c;
+ sp1_p = &sp1_c;
+ sp0a_p = &sp0a_c;
+ sp1a_p = &sp1a_c;
+
+ show_code = (display_code & (SHOW_CODE_MASK+SHOW_CODE_EXT)); /* see defs.h; SHOW_CODE_ALIGN=2,_CIGAR=3,_CIGAR_EXT=4 */
+ annot_fmt = 2;
+ if (display_code & SHOW_ANNOT_FULL) {
+ annot_fmt = 1;
+ }
+ /* have_ann encodes number of sequences annotated */
+ have_ann = 0;
+ if ((annotp_p && annotp_p->n_annot > 0) || (aa1a != NULL)) { have_ann |= 2;}
+ update_data_p = init_update_data(show_code);
+ }
+ else {
+ fprintf(stderr,"*** error [%s:%d] --- cal_cons_u() invalid calc_func_mode: %d\n",
+ __FILE__, __LINE__, calc_func_mode);
+ exit(1);
+ }
+
+#else /* TFASTX */
+ comment_target = 0;
+ aln->amin0 = aln->smin0 = a_res->min0; /* DNA */
+ aln->amin1 = aln->smin1 = a_res->min1; /* prot */
+
+ i1_offset = aln->q_offset;
+ i0_offset = aln->l_offset;
+
+ ap1 = aa0; /* aa0 is protein */
+ /* with fx_malign(), there is no guarantee that we have a valid f_str->aa1y, so make one */
+ pre_cons(aa1,n1,aln->frame, f_str);
+ ap0 = f_str->aa1y; /* aa1 is DNA */
+ ap1a = aa0a;
+ annotp_p = annot0_p;
+
+ have_ann = (seqc0a !=NULL && aa0a != NULL);
+ if (calc_func_mode == CALC_CONS) {
+ sp1_p = seqc0; /* sp1 points to protein query */
+ sp0_p = seqc1; /* sp0 points to DNA */
+ spa_p = seqca;
+ sp1a_p = seqc0a; /* protein query can have annotation */
+ sp0a_p = seqc1a; /* sp0a is always ' ' - no translated annotation */
+ annot_fmt = DP_FULL_FMT;
+ }
+ else if (calc_func_mode == CALC_ID || calc_func_mode == CALC_ID_DOM) {
+ have_ann = (annotp_p && annotp_p->n_annot > 0);
+ spa_p = &spa_c;
+ sp0_p = &sp0_c;
+ sp1_p = &sp1_c;
+
+ sp0a_p = &sp0a_c;
+ sp1a_p = &sp1a_c;
+ annot_fmt = 3;
+
+ /* does not require aa0a/aa1a, only for variants */
+ }
+ else if (calc_func_mode == CALC_CODE) {
+ spa_p = &spa_c;
+ sp0_p = &sp0_c;
+ sp1_p = &sp1_c;
+
+ sp0a_p = &sp0a_c;
+ sp1a_p = &sp1a_c;
+
+ show_code = (display_code & (SHOW_CODE_MASK+SHOW_CODE_EXT)); /* see defs.h; SHOW_CODE_ALIGN=2,_CIGAR=3,_CIGAR_EXT=4 */
+ annot_fmt = 2;
+ if (display_code & SHOW_ANNOT_FULL) {
+ annot_fmt = 1;
+ }
+
+ /* have_ann encodes number of sequences annotated */
+ if ((annotp_p && annotp_p->n_annot > 0) || (ap1a != NULL)) { have_ann |= 1;}
+
+ update_data_p = init_update_data(show_code);
+ }
+ else {
+ fprintf(stderr,"*** error [%s:%d] --- cal_cons_u() invalid calc_func_mode: %d\n",
+ __FILE__, __LINE__, calc_func_mode);
+ exit(1);
+ }
+#endif
+ if (cumm_seq_score) i_spa = cumm_seq_score;
+
+ rp = a_res->res;
+ rpmax = &a_res->res[a_res->nres];
+
+ lenc = not_c = aln->nident = aln->nmismatch = aln->nsim = aln->npos = ngap_p = ngap_d = nfs= 0;
+ i0 = a_res->min1;
+ i1 = a_res->min0;
+
+ v_delta = 0;
+ i1_annot = 0;
+ annot_stack = NULL;
+ s_annotp_arr_p = NULL;
+ have_push_features = prev_match = 0;
+
+ if (have_ann) {
+ have_push_features_p = &have_push_features;
+
+ if (annotp_p && annotp_p->n_annot > 0) {
+ annot_stack = init_stack(64,64);
+ left_domain_list1=init_domfeat_data(annotp_p);
+ s_annotp_arr_p = annotp_p->s_annot_arr_p;
+
+ while (i1_annot < annotp_p->n_annot) {
+ if (s_annotp_arr_p[i1_annot]->pos >= i1+i1_offset) {break;}
+ if (s_annotp_arr_p[i1_annot]->end <= i1+i1_offset) {i1_annot++; continue;}
+
+ if (s_annotp_arr_p[i1_annot]->label == '-') {
+ process_annot_match(&itmp, NULL,
+#ifndef TFAST
+ i1_offset+seq_pos(i1,aln->llrev,0),
+ i0_offset+seq_pos(i0,aln->qlrev,0),
+#else
+ i1_offset+seq_pos(i1,aln->qlrev,0),
+ i0_offset+seq_pos(i0,aln->llrev,0),
+#endif
+ sp1_p, sp1a_p, sq, s_annotp_arr_p[i1_annot], annotp_p->n_annot,
+ &ann_comment, annot_stack, have_push_features_p, &v_delta,
+ &d1_score, &d1_ident, &d1_alen, &d1_gaplen,
+ &left_domain_head1, &left_domain_list1[i1_annot], &i1_left_end, 0);
+ }
+ i1_annot++;
+ }
+ }
+ }
+
+ while (rp < rpmax) {
+ /* fprintf(stderr,"%d %d %d (%c) %d (%c)\n"
+ ,(int)(rp-res),*rp,i0,sq[ap0[i0]],i1,sq[ap1[i1]]);
+ */
+ switch (*rp++) {
+ case 0: /* aa insertion */
+ *sp0_p = '-';
+ *sp1_p = sq[ap1[i1]];
+ *spa_p = M_DEL;
+
+ if (calc_func_mode == CALC_CODE) {
+ *spa_p = 5; /* indel code */
+ update_code(align_code_dyn, update_data_p, 0, *spa_p,*sp0_p,*sp1_p);
+ }
+
+ if (calc_func_mode == CALC_CONS) {sp0_p++; sp1_p++; spa_p++;}
+
+ if (cumm_seq_score) {
+ if (prev_match) *i_spa = ppst->gdelval;
+ *i_spa++ += ppst->ggapval;
+ }
+
+ if (have_ann) {
+ if (calc_func_mode != CALC_ID && calc_func_mode != CALC_ID_DOM) {
+ *sp0a_p = ' ';
+ *sp1a_p = ann_arr[ap1a[i1]];
+ }
+ if (s_annotp_arr_p) {
+ if (i1+i1_offset == s_annotp_arr_p[i1_annot]->pos || i1+i1_offset == i1_left_end) {
+
+ i1_annot = next_annot_match(&itmp, ppst->pam2[0][ap0[i0]],
+#ifndef TFAST
+ i1_offset+seq_pos(i1,aln->llrev,0), /* annotated target (prot) coordinate */
+ i0_offset+seq_pos(i0,aln->qlrev,0),
+#else
+ i1_offset+seq_pos(i1,aln->qlrev,0),
+ i0_offset+seq_pos(i0,aln->llrev,0),
+#endif
+ sp1_p, sp1a_p, sq,
+ i1_annot, annotp_p->n_annot, s_annotp_arr_p,
+ &ann_comment, annot_stack, have_push_features_p, &v_delta,
+ &d1_score, &d1_ident, &d1_alen, &d1_gaplen,
+ &left_domain_head1, left_domain_list1, &i1_left_end,
+ ppst->ggapval+ppst->gdelval);
+ }
+
+ if (prev_match) d1_score += ppst->gdelval;
+ d1_score += ppst->ggapval;
+ d1_alen++;
+ d1_gaplen++;
+ prev_match = 0;
+ }
+ if (calc_func_mode == CALC_CONS) {sp0a_p++; sp1a_p++;}
+ }
+
+ if (have_ann && have_push_features && calc_func_mode != CALC_ID) {
+ display_push_features(annot_stack, annot_var_dyn,
+#ifndef TFAST
+ i0_offset+seq_pos(i0,aln->qlrev,0), *sp0_p,
+ i1_offset+seq_pos(i1,aln->llrev,0), *sp1_p,
+#else
+ i1_offset+seq_pos(i1,aln->qlrev,0), *sp0_p,
+ i0_offset+seq_pos(i0,aln->llrev,0), *sp1_p,
+#endif
+ sim_sym[*spa_p],
+ a_res->rst.score[ppst->score_ix], a_res->rst.comp, a_res->sw_score,
+ n0, n1, pstat_void, annot_fmt);
+ have_push_features = 0;
+ }
+
+ i1++;
+ lenc++;
+ ngap_d++;
+ break;
+ case 2: /* -1 frameshift, which is treatead as an insertion/match for annotations */
+ nfs++;
+ /* frameshifts produce a two-character alignment string */
+ /* first annotate the frameshift (first character) */
+ *sp0_p = '/';
+ i0 -= 1;
+ *sp1_p = '-';
+ *spa_p = M_DEL;
+
+ if (calc_func_mode == CALC_CODE) {
+ update_code(align_code_dyn, update_data_p, 2, *spa_p,*sp0_p,*sp1_p);
+ }
+
+ if (calc_func_mode == CALC_CONS) {
+ sp0_p++; sp1_p++; spa_p++;
+ if (have_ann) {*sp0a_p++ = *sp1a_p++ = ' ';}
+ }
+
+ not_c++;
+
+ /* then annotate the match after the frameshift */
+
+ itmp=ppst->pam2[0][ap0[i0]][ap1[i1]];
+ *sp0_p = sq[ap0[i0]];
+ *sp1_p = sq[ap1[i1]];
+
+ if (cumm_seq_score) *i_spa++ = ppst->gshift;
+
+ if (have_ann) {
+ have_push_features = 0;
+ /* this simple strategy works because the coordinate system
+ for the alignment is reversed appropriately */
+ if (calc_func_mode != CALC_ID && calc_func_mode != CALC_ID_DOM) {
+ *sp1a_p = ann_arr[ap1a[i1]];
+ *sp0a_p = ' ';
+ }
+ if (s_annotp_arr_p) {
+ /* coordiates are much more complex for next_annot_match,
+ and comment_var, because they may need to be reversed */
+
+ if (i1+i1_offset == s_annotp_arr_p[i1_annot]->pos || i1+i1_offset == i1_left_end) {
+ i1_annot = next_annot_match(&itmp, ppst->pam2[0][ap0[i0]],
+#ifndef TFAST
+ i1_offset+seq_pos(i1,aln->llrev,0),
+ i0_offset+seq_pos(i0,aln->qlrev,0),
+#else
+ i1_offset+seq_pos(i1,aln->qlrev,0),
+ i0_offset+seq_pos(i0,aln->llrev,0),
+#endif
+ sp1_p, sp1a_p, sq,
+ i1_annot, annotp_p->n_annot, s_annotp_arr_p,
+ &ann_comment, annot_stack, have_push_features_p, &v_delta,
+ &d1_score, &d1_ident, &d1_alen, &d1_gaplen,
+ &left_domain_head1, left_domain_list1, &i1_left_end,0);
+
+ if (sq[ap1[i1]] != *sp1_p) {
+ t_spa = align_type(itmp, *sp0_p, *sp1_p, 0, NULL, ppst->pam_x_id_sim);
+
+ if (calc_func_mode != CALC_ID && calc_func_mode != CALC_ID_DOM) {
+ comment_var(
+#ifndef TFAST
+ i0_offset+seq_pos(i0,aln->qlrev,0), *sp0_p,
+ i1_offset+seq_pos(i1,aln->llrev,0), *sp1_p,
+#else
+ i1_offset+seq_pos(i1,aln->qlrev,0), *sp0_p,
+ i0_offset+seq_pos(i0,aln->llrev,0), *sp1_p,
+#endif
+ sq[ap1[i1]], sim_sym[t_spa], ann_comment,
+ annot_var_dyn,comment_target,annot_fmt);
+ }
+ else {
+ sprintf(tmp_str,"%c%d%c;",sq[ap1[i1]],i1+1,*sp1_p);
+ /* SAFE_STRNCAT(annot_var_s,tmp_str,n_annot_var_s); */
+ dyn_strcat(annot_var_dyn, tmp_str);
+ }
+ }
+ }
+ d1_score += ppst->gshift;
+ d1_score += itmp;
+ prev_match = 1;
+ }
+ if (calc_func_mode == CALC_CONS) {sp0a_p++; sp1a_p++;}
+ }
+
+ *spa_p = align_type(itmp, *sp0_p, *sp1_p, 0, aln, ppst->pam_x_id_sim);
+
+ if (calc_func_mode == CALC_CODE) {
+ update_code(align_code_dyn, update_data_p, 3, *spa_p,*sp0_p,*sp1_p);
+ }
+
+ d1_alen++;
+ if (*spa_p == M_IDENT) {d1_ident++;}
+
+ if (have_ann && calc_func_mode == CALC_CODE) {
+ add_annot_code(have_ann, *sp0_p, *sp1_p, *sp1a_p,
+#ifndef TFAST
+ i0_offset+seq_pos(i0,aln->qlrev,0),
+ i1_offset+seq_pos(i1,aln->llrev,0),
+#else
+ i1_offset+seq_pos(i1,aln->qlrev,0),
+ i0_offset+seq_pos(i0,aln->llrev,0),
+#endif
+ sim_sym[*spa_p], annot_var_dyn);
+ }
+
+ if (have_ann && have_push_features && calc_func_mode != CALC_ID) {
+ display_push_features(annot_stack, annot_var_dyn,
+#ifndef TFAST
+ i0_offset+seq_pos(i0,aln->qlrev,0), *sp0_p,
+ i1_offset+seq_pos(i1,aln->llrev,0), *sp1_p,
+#else
+ i1_offset+seq_pos(i1,aln->qlrev,0), *sp0_p,
+ i0_offset+seq_pos(i0,aln->llrev,0), *sp1_p,
+#endif
+ sim_sym[*spa_p],
+ a_res->rst.score[ppst->score_ix], a_res->rst.comp, a_res->sw_score,
+ n0, n1, pstat_void, annot_fmt);
+ have_push_features = 0;
+ }
+
+ if (cumm_seq_score) *i_spa++ = itmp;
+ i0 += 3;
+ i1++;
+
+ if (calc_func_mode == CALC_CONS) {sp0_p++; sp1_p++; spa_p++;}
+ lenc++;
+ break;
+ case 3: /* codon/aa match */
+ itmp=ppst->pam2[0][ap0[i0]][ap1[i1]];
+ *sp0_p = sq[ap0[i0]];
+ *sp1_p = sq[ap1[i1]];
+
+ if (have_ann) {
+ if (calc_func_mode != CALC_ID && calc_func_mode != CALC_ID_DOM) {
+ *sp1a_p = ann_arr[ap1a[i1]];
+ *sp0a_p = ' ';
+ }
+ if (s_annotp_arr_p) {
+ if (i1+i1_offset == s_annotp_arr_p[i1_annot]->pos || i1+i1_offset == i1_left_end) {
+ i1_annot = next_annot_match(&itmp, ppst->pam2[0][ap0[i0]],
+#ifndef TFAST
+ i1_offset+seq_pos(i1,aln->llrev,0),
+ i0_offset+seq_pos(i0,aln->qlrev,0),
+#else
+ i1_offset+seq_pos(i1,aln->qlrev,0),
+ i0_offset+seq_pos(i0,aln->llrev,0),
+#endif
+ sp1_p, sp1a_p, sq,
+ i1_annot, annotp_p->n_annot, s_annotp_arr_p,
+ &ann_comment, annot_stack, have_push_features_p, &v_delta,
+ &d1_score, &d1_ident, &d1_alen, &d1_gaplen,
+ &left_domain_head1, left_domain_list1, &i1_left_end,0);
+
+ if (sq[ap1[i1]] != *sp1_p) {
+ t_spa = align_type(itmp, *sp0_p, *sp1_p, 0, NULL, ppst->pam_x_id_sim);
+
+ if (calc_func_mode != CALC_ID && calc_func_mode != CALC_ID_DOM) {
+
+ comment_var(
+#ifndef TFAST
+ i0_offset+seq_pos(i0,aln->qlrev,0), *sp0_p,
+ i1_offset+seq_pos(i1,aln->llrev,0), *sp1_p,
+#else
+ i1_offset+seq_pos(i1,aln->qlrev,0), *sp0_p,
+ i0_offset+seq_pos(i0,aln->llrev,0), *sp1_p,
+#endif
+ sq[ap1[i1]], sim_sym[t_spa], ann_comment,
+ annot_var_dyn, comment_target, annot_fmt);
+ }
+ else {
+ sprintf(tmp_str,"%c%d%c;",sq[ap1[i1]],i1+1,*sp1_p);
+ /* SAFE_STRNCAT(annot_var_s,tmp_str,n_annot_var_s); */
+ dyn_strcat(annot_var_dyn, tmp_str);
+ }
+ }
+ }
+ prev_match = 1;
+ d1_score += itmp;
+ }
+ if (calc_func_mode == CALC_CONS) {sp0a_p++; sp1a_p++;}
+ }
+
+ *spa_p = align_type(itmp, *sp0_p, *sp1_p, 0, aln, ppst->pam_x_id_sim);
+ d1_alen++;
+ if (*spa_p == M_IDENT) {d1_ident++;}
+
+ if (cumm_seq_score) *i_spa++ = itmp;
+
+ if (calc_func_mode == CALC_CODE) {
+ update_code(align_code_dyn, update_data_p, 3, *spa_p, *sp0_p, *sp1_p);
+
+ if (have_push_features) {
+ add_annot_code(have_ann, *sp0_p, *sp1_p, *sp1a_p,
+#ifndef TFAST
+ i0_offset+seq_pos(i0,aln->qlrev,0),
+ i1_offset+seq_pos(i1,aln->llrev,0),
+#else
+ i1_offset+seq_pos(i1,aln->qlrev,0),
+ i0_offset+seq_pos(i0,aln->llrev,0),
+#endif
+ sim_sym[*spa_p], annot_var_dyn);
+ }
+ }
+
+ if (have_ann && have_push_features && calc_func_mode != CALC_ID) {
+ display_push_features(annot_stack, annot_var_dyn,
+#ifndef TFAST
+ i0_offset+seq_pos(i0,aln->qlrev,0), *sp0_p,
+ i1_offset+seq_pos(i1,aln->llrev,0), *sp1_p,
+#else
+ i1_offset+seq_pos(i1,aln->qlrev,0), *sp0_p,
+ i0_offset+seq_pos(i0,aln->llrev,0), *sp1_p,
+#endif
+ sim_sym[*spa_p],
+ a_res->rst.score[ppst->score_ix], a_res->rst.comp, a_res->sw_score,
+ n0, n1, pstat_void, annot_fmt);
+ have_push_features = 0;
+ }
+
+ i0 += 3;
+ i1++;
+
+ if (calc_func_mode == CALC_CONS) {sp0_p++; sp1_p++; spa_p++;}
+ lenc++;
+ break;
+ case 4: /* +1 frameshift */
+ nfs++;
+ /* frameshift produces two alignment characters */
+ /* first frameshift */
+ *sp0_p = '\\';
+ i0 += 1;
+ *sp1_p = '-';
+ *spa_p = M_DEL;
+
+ if (calc_func_mode == CALC_CODE) {
+ update_code(align_code_dyn, update_data_p, 4, *spa_p, *sp0_p, *sp1_p);
+ }
+
+ if (calc_func_mode == CALC_CONS) {sp0_p++; sp1_p++; spa_p++;}
+
+ if (cumm_seq_score) *i_spa++ = ppst->gshift;
+
+ if (have_ann && calc_func_mode == CALC_CONS) {*sp1a_p++ = *sp0a_p++ = ' ';}
+ not_c++;
+
+ /* then alignment */
+ itmp=ppst->pam2[0][ap0[i0]][ap1[i1]];
+ *sp0_p = sq[ap0[i0]];
+ *sp1_p = sq[ap1[i1]];
+
+ if (have_ann) {
+ if (calc_func_mode != CALC_ID && calc_func_mode != CALC_ID_DOM) {
+ *sp1a_p = ann_arr[ap1a[i1]];
+ *sp0a_p = ' ';
+ }
+ if (s_annotp_arr_p) {
+ if (i1+i1_offset == s_annotp_arr_p[i1_annot]->pos || i1+i1_offset == i1_left_end) {
+ i1_annot = next_annot_match(&itmp, ppst->pam2[0][ap0[i0]],
+#ifndef TFAST
+ i1_offset+seq_pos(i1,aln->llrev,0),
+ i0_offset+seq_pos(i0,aln->qlrev,0),
+#else
+ i1_offset+seq_pos(i1,aln->qlrev,0),
+ i0_offset+seq_pos(i0,aln->llrev,0),
+#endif
+ sp1_p, sp1a_p, sq,
+ i1_annot, annotp_p->n_annot, s_annotp_arr_p,
+ &ann_comment, annot_stack, have_push_features_p, &v_delta,
+ &d1_score, &d1_ident, &d1_alen, &d1_gaplen,
+ &left_domain_head1, left_domain_list1, &i1_left_end,0);
+
+ if (sq[ap1[i1]] != *sp1_p) {
+ t_spa = align_type(itmp, *sp0_p, *sp1_p, 0, NULL, ppst->pam_x_id_sim);
+ if (calc_func_mode != CALC_ID && calc_func_mode != CALC_ID_DOM) {
+ comment_var(
+#ifndef TFAST
+ i0_offset+seq_pos(i0,aln->qlrev,0), *sp0_p,
+ i1_offset+seq_pos(i1,aln->llrev,0), *sp1_p,
+#else
+ i1_offset+seq_pos(i1,aln->qlrev,0), *sp0_p,
+ i0_offset+seq_pos(i0,aln->llrev,0), *sp1_p,
+#endif
+ sq[ap1[i1]], sim_sym[t_spa], ann_comment,
+ annot_var_dyn, comment_target, annot_fmt);
+ }
+ else {
+ sprintf(tmp_str,"%c%d%c;",sq[ap1[i1]],i1+1,*sp1_p);
+ /* SAFE_STRNCAT(annot_var_s,tmp_str,n_annot_var_s); */
+ dyn_strcat(annot_var_dyn, tmp_str);
+ }
+ }
+ }
+ d1_score += ppst->gshift;
+ d1_score += itmp;
+ prev_match = 1;
+ }
+ if (calc_func_mode == CALC_CONS) {sp0a_p++; sp1a_p++;}
+ }
+
+ *spa_p = align_type(itmp, *sp0_p, *sp1_p, 0, aln, ppst->pam_x_id_sim);
+ d1_alen++;
+ if (*spa_p == M_IDENT) {d1_ident++;}
+
+ if (calc_func_mode == CALC_CODE) {
+ update_code(align_code_dyn, update_data_p, 3, *spa_p,*sp0_p,*sp1_p);
+ }
+
+ if (cumm_seq_score) *i_spa++ = itmp;
+
+ /* now we have done all the ?modified identity checks, display
+ potential site annotations */
+ if (have_ann && have_push_features && calc_func_mode != CALC_ID) {
+ display_push_features(annot_stack, annot_var_dyn,
+#ifndef TFAST
+ i0_offset+seq_pos(i0,aln->qlrev,0), *sp0_p,
+ i1_offset+seq_pos(i1,aln->llrev,0), *sp1_p,
+#else
+ i1_offset+seq_pos(i1,aln->qlrev,0), *sp0_p,
+ i0_offset+seq_pos(i0,aln->llrev,0), *sp1_p,
+#endif
+ sim_sym[*spa_p],
+ a_res->rst.score[ppst->score_ix], a_res->rst.comp, a_res->sw_score,
+ n0, n1, pstat_void, annot_fmt);
+ have_push_features = 0;
+ }
+
+ i0 += 3;
+ i1++;
+
+ if (calc_func_mode == CALC_CONS) {sp0_p++; sp1_p++; spa_p++;}
+ lenc++;
+ break;
+ case 5: /* codon insertion */
+ if (have_ann && calc_func_mode == CALC_CONS) {
+ *sp1a_p++ = *sp0a_p++ = ' ';
+ }
+
+ if (cumm_seq_score) {
+ if (prev_match) *i_spa = ppst->gdelval;
+ *i_spa++ = ppst->ggapval;
+ }
+
+ if (prev_match) d1_score += ppst->gdelval;
+ d1_score += ppst->ggapval;
+
+ prev_match = 0;
+
+ *sp0_p = sq[ap0[i0]];
+ *sp1_p = '-';
+ *spa_p = M_DEL;
+
+ if (calc_func_mode == CALC_CODE) {
+ *spa_p = 5;
+ update_code(align_code_dyn, update_data_p, 5, *spa_p,*sp0_p,*sp1_p);
+ }
+
+ if (calc_func_mode == CALC_CONS) {sp0_p++; sp1_p++; spa_p++;}
+ i0 += 3;
+
+ lenc++;
+ ngap_p++;
+ break;
+ }
+ }
+
+ /* done with alignment loop */
+
+ if (calc_func_mode == CALC_CODE) {
+ close_update_data(align_code_dyn, update_data_p);
+ }
+
+ if (have_ann) {
+ if (calc_func_mode != CALC_ID && calc_func_mode != CALC_ID_DOM) {*sp0a_p = *sp1a_p = '\0';}
+ if (s_annotp_arr_p) {
+ have_push_features = 0;
+
+ if (s_annotp_arr_p && i1_left_end > 0) {
+ close_annot_match(-1, annot_stack, have_push_features_p,
+ &d1_score, &d1_ident, &d1_alen, &d1_gaplen,
+ &left_domain_head1, &i1_left_end,
+ 0);
+ }
+
+ if (have_push_features && calc_func_mode != CALC_ID) {
+ display_push_features(annot_stack, annot_var_dyn,
+#ifndef TFAST
+ i0_offset+seq_pos(i0-1,aln->qlrev,0), *sp0_p,
+ i1_offset+seq_pos(i1-1,aln->llrev,0), *sp1_p,
+#else
+ i1_offset+seq_pos(i1-1,aln->qlrev,0), *sp0_p,
+ i0_offset+seq_pos(i0-1,aln->llrev,0), *sp1_p,
+#endif
+ sim_sym[*spa_p],
+ a_res->rst.score[ppst->score_ix], a_res->rst.comp, a_res->sw_score,
+ n0, n1, pstat_void, annot_fmt);
+ have_push_features = 0;
+ }
+ }
+ if (left_domain_list1) free(left_domain_list1);
+ free_stack(annot_stack);
+ }
+ *spa_p = '\0';
+
+#ifndef TFAST
+ aln->amax0 = i0;
+ aln->amax1 = i1;
+ aln->ngap_q = ngap_d;
+ aln->ngap_l = ngap_p;
+#else
+ aln->amax1 = i0;
+ aln->amax0 = i1;
+ aln->ngap_q = ngap_p;
+ aln->ngap_l = ngap_d;
+#endif
+ aln->calc_last_set = 1;
+
+ aln->nfs = nfs;
+
+ *score_delta = v_delta;
+
+ if (lenc < 0) lenc = 1;
+ *nc = lenc;
+/* now we have the middle, get the right end */
+ return lenc+not_c;
+}
+
+int
+calc_cons_a(const unsigned char *aa0, int n0,
+            const unsigned char *aa1, int n1,
+            int *nc,
+            struct a_struct *aln,
+            struct a_res_str *a_res,
+            struct pstruct *ppst,
+            char *seqc0, char *seqc1, char *seqca, int *cumm_seq_score,
+            const unsigned char *ann_arr,
+            const unsigned char *aa0a, const struct annot_str *annot0_p, char *seqc0a,
+            const unsigned char *aa1a, const struct annot_str *annot1_p, char *seqc1a,
+            int *score_delta,
+            struct dyn_string_str *annot_var_dyn,
+            struct f_struct *f_str,
+            void *pstat_void)
+{
+  /* consensus entry point: run calc_cons_u() in CALC_CONS mode with
+     display_code 0 and no alignment-code string (last argument NULL) */
+  int n_cols = calc_cons_u(aa0, n0, aa1, n1, a_res, ppst, f_str, pstat_void,
+                           ann_arr, aa0a, annot0_p, aa1a, annot1_p, CALC_CONS, 0,
+                           nc, seqc0, seqc1, seqca, cumm_seq_score,
+                           seqc0a, seqc1a, aln, score_delta, annot_var_dyn, NULL);
+  return n_cols;
+}
+
+void
+calc_astruct(struct a_struct *aln_p, struct a_res_str *a_res_p, struct f_struct *f_str) {
+  /* copy the alignment boundaries out of a_res_p; f_str is not used here */
+#ifdef TFAST /* TFASTX: a_res index 0 is DNA (aln 0), index 1 is prot (aln 1) */
+  aln_p->amin0 = a_res_p->min0;
+  aln_p->amax0 = a_res_p->max0;
+  aln_p->amin1 = a_res_p->min1;
+  aln_p->amax1 = a_res_p->max1;
+#else /* FASTX: a_res index 0 is prot (aln 1), index 1 is DNA (aln 0) */
+  aln_p->amin1 = a_res_p->min0;
+  aln_p->amax1 = a_res_p->max0;
+  aln_p->amin0 = a_res_p->min1;
+  aln_p->amax0 = a_res_p->max1;
+#endif
+  /* cleared here; the tail of calc_cons_u() sets this flag to 1 */
+  aln_p->calc_last_set = 0;
+}
+
+/* build an array of match/ins/del - length strings */
+
+/* modified 10-June-2014 to distinguish matches from mismatches, op=1
+ (previously unused) indicates an aligned non-identity */
+
+/* op_codes are: 0 - aa insertion
+ 1 - (now) aligned non-identity
+ 2 - -1 frameshift
+ 3 - aligned identity
+ 4 - +1 frameshift
+ 5 - codon insertion
+*/
+
+static struct update_code_str *
+init_update_data(int show_code) {
+  /* allocate and initialize the run-length encoder state; returns NULL if calloc() fails */
+  struct update_code_str *update_data_p;
+  /* show_code is declared int explicitly: an untyped K&R parameter is not valid C99 */
+  if ((update_data_p = (struct update_code_str *)calloc(1,sizeof(struct update_code_str)))==NULL) {
+    fprintf(stderr,"*** error [%s:%d] - init_update_data(): cannot allocate update_code_str\n",
+            __FILE__, __LINE__);
+    return NULL;
+  }
+  /* no run is pending yet: index -1 with a zero count */
+  update_data_p->p_op_idx = -1;
+  update_data_p->p_op_cnt = 0;
+  update_data_p->show_code = show_code;
+  update_data_p->btop_enc = 0;
+  /* CIGAR is tested before BTOP; anything else uses ori_code in symbol-count order */
+  if ((show_code & SHOW_CODE_CIGAR) == SHOW_CODE_CIGAR) {
+    update_data_p->op_map = cigar_code;
+    update_data_p->cigar_order = 1;
+  }
+  else if ((show_code & SHOW_CODE_BTOP) == SHOW_CODE_BTOP) {
+    update_data_p->op_map = ori_code;
+    update_data_p->cigar_order = 0;
+    update_data_p->btop_enc = 1;
+  }
+  else {
+    update_data_p->op_map = ori_code;
+    update_data_p->cigar_order = 0;
+  }
+  /* show_ext is read by update_code() to split identities from non-identities */
+  if ((show_code & SHOW_CODE_EXT) == SHOW_CODE_EXT) {
+    update_data_p->show_ext = 1;
+  }
+  else {
+    update_data_p->show_ext = 0;
+  }
+
+  return update_data_p;
+}
+
+static void
+close_update_data(struct dyn_string_str *align_code_dyn,
+                  struct update_code_str *up_dp) {
+  char tail[MAX_SSTR]; /* text for the final, still-pending run */
+
+  if (up_dp == NULL) { return; }
+
+  /* flush the pending run, if any, before the state is released */
+  if (up_dp->p_op_cnt != 0) {
+    if (!up_dp->btop_enc) {
+      sprintf_code(tail, up_dp, up_dp->p_op_idx, up_dp->p_op_cnt);
+    }
+    else { /* BTOP: the pending count is written as a bare number */
+      sprintf(tail, "%d", up_dp->p_op_cnt);
+      up_dp->p_op_cnt = 0;
+    }
+    dyn_strcat(align_code_dyn, tail);
+  }
+  free(up_dp);
+}
+
+/* sprintf_code() generates the short alignment code string (max
+ length MAX_SSTR=32) which is later added on to the dynamic
+ alignment code string
+
+ tmp_str[MAX_SSTR=32] -- alignment encoding output
+ up_dp -- used to determine cigar_order and mapping
+ op_idx -- code type
+ op_cnt -- run length; when 0, nothing is written to tmp_str
+
+*/
+
+static void
+sprintf_code(char *tmp_str, struct update_code_str *up_dp, int op_idx, int op_cnt) {
+  char op_char;
+
+  /* an empty run writes nothing; tmp_str is left untouched */
+  if (op_cnt != 0) {
+    op_char = up_dp->op_map[op_idx];
+    /* cigar order is count-then-symbol, otherwise symbol-then-count */
+    if (up_dp->cigar_order) { sprintf(tmp_str, "%d%c", op_cnt, op_char); }
+    else { sprintf(tmp_str, "%c%d", op_char, op_cnt); }
+  }
+}
+
+/* only called for btop alignment encoding, for identity, update
+ count, otherwise, print previous count and current difference.
+ assumes that up_dp->p_op_cnt only tracks identity
+
+ for fx/fz, op=0,
+*/
+
+static void
+sprintf_btop(char *tmp_str,
+             struct update_code_str *up_dp,
+             int op, int sim_code,
+             unsigned char sp0, unsigned char sp1)
+{
+  char run_len[MAX_SSTR]; /* decimal length of the identity run, or "" */
+
+  /* an aligned identity only extends the current run; nothing is
+     written to tmp_str */
+  if (op == 3 && sim_code == M_IDENT) {
+    up_dp->p_op_cnt += 1;
+    return;
+  }
+
+  /* any other column ends the run: emit its length (when non-zero)
+     followed by the two aligned residues, then restart the count */
+  run_len[0] = '\0';
+  if (up_dp->p_op_cnt > 0) { sprintf(run_len, "%d", up_dp->p_op_cnt); }
+  up_dp->p_op_cnt = 0;
+  sprintf(tmp_str, "%s%c%c", run_len, sp0, sp1);
+}
+
+/* update_code() has been modified to work more correctly with
+ ggsearch/glsearch, which, because alignments can start with either
+ insertions or deletions, can produce an initial code of "0=". When
+ that happens, it is ignored and no code is added.
+
+ *align_code_dyn - alignment string (dynamic); each completed run is appended to it
+ up_dp -- run state: pending op (p_op_idx), its length (p_op_cnt), and the output mode flags
+ op -- 0: aa insertion, 1: aligned non-identity, 2/4: frameshifts, 3: aligned identity, 5: codon insertion
+ sim_code, sp0, sp1 -- similarity code and the two aligned residues; a '*' residue marks termination
+*/
+
+static void
+update_code(struct dyn_string_str *align_code_dyn,
+ struct update_code_str *up_dp, int op,
+ int sim_code, unsigned char sp0, unsigned char sp1)
+{
+ char tmp_cnt[MAX_SSTR];
+ tmp_cnt[0]='\0'; /* stays "" when sprintf_code() is given a zero count */
+
+ if (up_dp->btop_enc) { /* BTOP output is produced entirely by sprintf_btop() */
+ sprintf_btop(tmp_cnt, up_dp, op, sim_code, sp0, sp1);
+ dyn_strcat(align_code_dyn,tmp_cnt);
+ return;
+ }
+
+ /* there are two kinds of "op's", one time and accumulating */
+ /* op == 2, 4 -- frameshifts -- are one-time: */
+
+ switch (op) {
+ case 2: /* frameshifts */
+ case 4:
+ sprintf_code(tmp_cnt,up_dp, up_dp->p_op_idx,up_dp->p_op_cnt); /* always flush the previous run */
+ dyn_strcat(align_code_dyn,tmp_cnt);
+
+ up_dp->p_op_idx = op;
+ up_dp->p_op_cnt = 1; /* never incremented here, so consecutive frameshifts are emitted separately */
+ break;
+ case 0: /* aa insertion */
+ case 5: /* codon insertion (aa deletion) */
+ if (op == up_dp->p_op_idx) { /* same op as the pending run: extend it */
+ up_dp->p_op_cnt++;
+ }
+ else { /* different op: flush the pending run and start a new one */
+ sprintf_code(tmp_cnt,up_dp, up_dp->p_op_idx,up_dp->p_op_cnt);
+ dyn_strcat(align_code_dyn,tmp_cnt);
+ up_dp->p_op_idx = op;
+ up_dp->p_op_cnt = 1;
+ }
+ break;
+ case 1: /* mismatch (non-id match) */
+ case 3: /* identical match */
+ if (sp0 != '*' && sp1 != '*') { /* default case, not termination */
+ if (up_dp->show_ext) { /* extended output: a non-identity is recoded as op 1 */
+ if (sim_code != M_IDENT) { op = 1;}
+ }
+ }
+ else { /* have a termination codon, output for !SHOW_CODE_CIGAR */
+ if (!up_dp->cigar_order) {
+ if (sp0 == '*' || sp1 == '*') { op = 6;} /* condition is always true in this else-branch */
+ }
+ else if (up_dp->show_ext && (sp0 != sp1)) { op = 1;}
+ }
+
+ if (up_dp->p_op_cnt == 0) { /* no pending run: start one without flushing */
+ up_dp->p_op_idx = op;
+ up_dp->p_op_cnt = 1;
+ }
+ else if (op != up_dp->p_op_idx) { /* op changed: flush the pending run, then restart */
+ sprintf_code(tmp_cnt,up_dp, up_dp->p_op_idx,up_dp->p_op_cnt);
+ dyn_strcat(align_code_dyn,tmp_cnt);
+ up_dp->p_op_idx = op;
+ up_dp->p_op_cnt = 1;
+ }
+ else {
+ up_dp->p_op_cnt++;
+ }
+ break;
+ }
+ return;
+}
+
+int calc_code(const unsigned char *aa0, int n0,
+              const unsigned char *aa1, int n1,
+              struct a_struct *aln,
+              struct a_res_str *a_res,
+              struct pstruct *ppst,
+              struct dyn_string_str *align_code_dyn,
+              const unsigned char *ann_arr,
+              const unsigned char *aa0a,
+              const struct annot_str *annot0_p,
+              const unsigned char *aa1a,
+              const struct annot_str *annot1_p,
+              struct dyn_string_str *annot_code_dyn,
+              int *score_delta,
+              struct f_struct *f_str,
+              void *pstat_void,
+              int display_code)
+{
+  int nc;       /* calc_cons_u() needs somewhere to store the column count */
+  int n_total;
+
+  /* CALC_CODE mode: the six consensus/annotation output buffers are passed as NULL */
+  n_total = calc_cons_u(aa0, n0, aa1, n1,
+                        a_res, ppst, f_str, pstat_void,
+                        ann_arr, aa0a, annot0_p, aa1a, annot1_p,
+                        CALC_CODE, display_code,
+                        &nc, NULL, NULL, NULL, NULL, NULL, NULL,
+                        aln, score_delta, annot_code_dyn, align_code_dyn);
+  return n_total;
+}
+
+/* calc_id never looks at domains or features, only variation */
+
+int calc_id(const unsigned char *aa0, int n0,
+            const unsigned char *aa1, int n1,
+            struct a_struct *aln,
+            struct a_res_str *a_res,
+            struct pstruct *ppst,
+            const struct annot_str *annot0_p,
+            const struct annot_str *annot1_p,
+            int *score_delta,
+            struct dyn_string_str *annot_var_dyn,
+            struct f_struct *f_str)
+{
+  int nc;       /* column count from calc_cons_u(); not reported to the caller */
+  int n_total;
+
+  /* CALC_ID mode: pstat, annotation arrays and all output buffers are NULL */
+  n_total = calc_cons_u(aa0, n0, aa1, n1,
+                        a_res, ppst, f_str, NULL,
+                        NULL, NULL, annot0_p, NULL, annot1_p, CALC_ID, 0,
+                        &nc, NULL, NULL, NULL, NULL, NULL, NULL,
+                        aln, score_delta, annot_var_dyn, NULL);
+  return n_total;
+}
+
+/* calc_idd also looks at domains */
+
+int calc_idd(const unsigned char *aa0, int n0,
+             const unsigned char *aa1, int n1,
+             struct a_struct *aln,
+             struct a_res_str *a_res,
+             struct pstruct *ppst,
+             const struct annot_str *annot0_p,
+             const struct annot_str *annot1_p,
+             int *score_delta,
+             struct dyn_string_str *annot_var_dyn,
+             struct f_struct *f_str)
+{
+  int nc;       /* column count from calc_cons_u(); not reported to the caller */
+  int n_total;
+
+  /* same call as calc_id() except that the mode is CALC_ID_DOM */
+  n_total = calc_cons_u(aa0, n0, aa1, n1,
+                        a_res, ppst, f_str, NULL,
+                        NULL, NULL, annot0_p, NULL, annot1_p, CALC_ID_DOM, 0,
+                        &nc, NULL, NULL, NULL, NULL, NULL, NULL,
+                        aln, score_delta, annot_var_dyn, NULL);
+  return n_total;
+}
+
diff --git a/src/dropfz2.c b/src/dropfz2.c
new file mode 100644
index 0000000..0838ae2
--- /dev/null
+++ b/src/dropfz2.c
@@ -0,0 +1,3969 @@
+/* $Id: dropfz2.c 1280 2014-08-21 00:47:55Z wrp $ */
+/* $Revision: 1280 $ */
+
+/* copyright (c) 1998, 1999, 2014 by William R. Pearson and The Rector &
+ Visitors of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+/* 18-Sept-2006 - removed static global variables for alignment */
+
+/* 2002/06/23 finally correctly implement fix to translate 'N' to 'X' */
+
+/* 1999/11/29 modification by Z. Zhang to translate DNA 'N' as 'X' */
+
+/* implements an improved version of the fasty algorithm, see:
+
+ W. R. Pearson, T. Wood, Z. Zhang, A W. Miller (1997) "Comparison of
+ DNA sequences with protein sequences" Genomics 46:24-36
+
+*/
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <ctype.h>
+
+#include "defs.h"
+#include "param.h"
+#define XTERNAL
+#include "upam.h"
+#include "uascii.h"
+
+#define NT_N 16
+
+/* globals for fasta */
+#define MAXWINDOW 64
+
+#ifndef MAXSAV
+#define MAXSAV 10
+#endif
+
+#ifndef ALLOCN0
+static char *verstr="3.8 June 2014";
+#else
+static char *verstr="3.8an0 Jul 2014";
+#endif
+
+struct dstruct /* diagonal structure for saving current run */
+{
+ int score; /* hash score of current match */
+ int start; /* start of current match */
+ int stop; /* end of current match */
+ struct savestr *dmax; /* location in vmax[] where best score data saved */
+};
+
+struct savestr
+{
+ int score; /* pam score with segment optimization */
+ int score0; /* pam score of best single segment */
+ int gscore; /* score from global match */
+ int dp; /* diagonal of match */
+ int start; /* start of match in lib seq */
+ int stop; /* end of match in lib seq */
+};
+
+struct update_code_str {
+ int p_op_idx;
+ int p_op_cnt;
+ int btop_enc;
+ int show_code;
+ int cigar_order;
+ int show_ext;
+ char *op_map;
+};
+
+#ifndef TFAST
+static char *ori_code = "-x/=\\+*"; /* FASTX */
+static char *cigar_code = "DXFMRI*";
+#else
+static char *ori_code = "+x/=\\-*"; /* TFASTX */
+static char *cigar_code = "IXFMRD*";
+#endif
+
+static struct update_code_str *
+init_update_data(int show_code);
+
+static void
+sprintf_code(char *tmp_str, struct update_code_str *, int op_idx, int op_cnt);
+
+static void
+update_code(char *al_str, int al_str_max,
+ struct update_code_str *update_data, int op,
+ int sim_code, unsigned char sp0, unsigned char sp1);
+
+static void
+close_update_data(char *al_str, int al_str_max,
+ struct update_code_str *update_data);
+
+void kpsort (struct savestr **v, int n);
+extern void *init_stack(int, int);
+extern void push_stack(void *, void *);
+extern void *pop_stack(void *);
+extern void *free_stack(void *);
+
+#define SGW1 100
+#define SGW2 300
+struct smgl_str {
+ int C[SGW1+1][SGW2+1];
+ int st[SGW1+1][SGW2+1];
+ int D[SGW2+7], I[SGW2+1];
+};
+
+struct sx_s {int C1, C2, C3, I1, I2, I3, flag; };
+
+struct wgt { int iii, ii, iv;};
+struct wgtc {char c2, c3, c4, c5;};
+
+typedef struct st_s { int C, I, D;} *st_ptr;
+
+struct f_struct {
+ struct dstruct *diag;
+ int frame;
+ int ndo;
+ int noff;
+ int hmask; /* hash constants */
+ int *pamh1; /* pam based array */
+ int *pamh2; /* pam based kfact array */
+ int *link, *harr; /* hash arrays */
+ int kshft; /* shift width */
+ int nsav, lowscor; /* number of saved runs, worst saved run */
+#ifndef TFAST
+ unsigned char *aa0x, *aa0v; /* aa0x - 111122223333 */
+#else
+ unsigned char *aa1x, *aa1v; /* aa1x - 111122223333 */
+#endif /* aa1v - computed codons */
+ struct sx_s *cur;
+ int cur_sp_size;
+ struct wgt **weight0;
+ struct wgt **weight1;
+ struct wgtc **weight_c;
+ int *waa;
+ int *res;
+ int max_res;
+ st_ptr up, down, tp;
+ struct smgl_str smgl_s;
+};
+
+#define DROP_INTERN
+#include "drop_func.h"
+
+static int dmatchz(const unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1,
+ const unsigned char *aa1v,
+ int hoff, int window,
+ int **pam2, int gdelval, int ggapval, int gshift,
+ struct f_struct *f_str);
+
+int shscore(unsigned char *aa0, int n0, int **pam2);
+int saatran(const unsigned char *ntseq, unsigned char *aaseq, int maxs, int frame);
+extern int ELK_to_s(double E_join, int n0, int n1, double Lambda, double K, double H);
+
+int savemax (struct dstruct *, int,
+ struct savestr *vmax, struct savestr **lowmax);
+
+int spam (const unsigned char *aa0, const unsigned char *aa1,
+ struct savestr *dmax, int **pam2,
+ struct f_struct *f_str);
+int sconn (struct savestr **v, int n,int cgap, int pgap, struct f_struct *f_str);
+int lx_band(const unsigned char *prot_seq, int len_prot,
+ const unsigned char *dna_prot_seq, int len_dna_prot,
+ int **pam_matrix, int gopen, int gext,
+ int gshift, int start_diag, int width, struct f_struct *f_str);
+
+struct a_res_str *
+merge_ares_chains(struct a_res_str *cur_ares,
+ struct a_res_str *tmpl_ares,
+ int score_ix, const char *msg);
+
+extern void w_abort (char *p, char *p1);
+extern void aagetmap(char *to, int n);
+
+/* initialize for fasta */
+/* modified 30-August-1999 by Zheng Zhang to work with an extended alphabet */
+/* Assume naa=47, and wgts[47][23] matches both upper and lower case
+amino acids with another amino acid. Also assume the DNA letters
+do not have an upper/lower case difference. If you also allow DNA
+sequences to contain upper/lower case letters, more needs to be changed,
+not only here but also in the alignment code: the way a codon is packed
+into a number between 0-63 needs to be changed. */
+
+/* modified so that if **weightci==NULL, do not fiddle with characters */
+
+/* modified 3-Aug-2010 for NCBIstdaa alphabet, which requires MAXUC
+ 28, MAXLC 56, so we must have 58, not 47, entries */
+
+void
+init_weights(struct wgt ***weighti, struct wgtc ***weightci,
+             int **wgts, int gshift, int gsubs, int naa)
+{
+  int i, j, do_wgtc=0;
+  int aa, b, a, x, y, z;
+  int *wwt, e;
+  struct wgt **weight;
+  struct wgtc **weightc;
+  char aacmap[64];
+  int temp[MAXLC+1][64]; /* best score of residue aa against codon i; rows aa < naa are filled */
+  char le[MAXLC+1][64];  /* index of the codon that produced temp[aa][i] */
+  /* builds *weighti (and *weightci when weightci != NULL): naa+1 rows of 256 entries each */
+  if (naa > MAXLC) {
+    fprintf(stderr,"*** dropfz2.c compilation problem naa(%d) > MAXLC(%d) ***\n",
+            naa, MAXLC);
+    exit(1); /* temp[] and le[] are fixed-size stack arrays; continuing could write past them */
+  }
+  if ((*weighti=(struct wgt **)calloc((size_t)(naa+1),sizeof(struct wgt *)))
+      ==NULL) {
+    fprintf(stderr," cannot allocate weights array: %d\n",naa);
+    exit(1);
+  }
+
+  weight = *weighti;
+  /* allocate weight[aa 0..naa] */
+  for (aa=0; aa <= naa; aa++) {
+    if ((weight[aa]=(struct wgt *)calloc((size_t)256,sizeof(struct wgt)))
+        ==NULL) {
+      fprintf(stderr," cannot allocate weight[]: %d/%d\n",aa,naa);
+      exit(1);
+    }
+  }
+
+  /* allocate weightc[aa 0..naa], only when the caller asked for the character table */
+  if (weightci !=NULL) {
+    if ((*weightci=(struct wgtc **)calloc((size_t)(naa+1),
+                                          sizeof(struct wgtc *)))==NULL) {
+      fprintf(stderr," cannot allocate weight_c array: %d\n",naa);
+      exit(1);
+    }
+    weightc = *weightci;
+
+    for (aa=0; aa <= naa; aa++) {
+      if ((weightc[aa]=(struct wgtc *)calloc((size_t)256,sizeof(struct wgtc)))
+          ==NULL) {
+        fprintf(stderr," cannot allocate weightc[]: %d/%d\n",aa,naa);
+        exit(1);
+      }
+    }
+    do_wgtc = 1;
+  }
+  else do_wgtc = 0;
+
+  aagetmap(aacmap,64);
+  /* pass 1: for each residue and codon i, keep the best-scoring codon j */
+  for (aa = 0; aa < naa; aa++) {
+    wwt = wgts[aa];                     /* pam matrix */
+    for (i = 0; i < 64; i++) {          /* i iterates through the codons */
+      x = -10000;                       /* large negative */
+      y = i;
+      for (j = 0; j < 64; j++) {        /* j iterates through the codons */
+        z = ((~i & j) | (i & ~j));      /* i XOR j: non-zero 2-bit fields are differing bases */
+        b = 0;                          /* score = 0 */
+        if (z % 4) b-= gsubs;
+        if (z /16) b-= gsubs;
+        if ((z /4) % 4) b -= gsubs;
+        b += wwt[aascii[aacmap[j]]];    /* add the match score for char j*/
+        if (b > x) {
+          x = b;                        /* x has the score */
+          y = j;                        /* y has the character (codon index)*/
+        }
+      }
+#ifdef DEBUG
+      if (y < 0 || y > 63) printf("%d %d %d %d ",aa, i, x, y);
+#endif
+      temp[aa][i] = x;
+      le[aa][i] = y;
+    }
+    /* printf("\n"); */
+  }
+  /* pass 2: expand to all 256 four-base byte values */
+  for (aa= 0; aa < naa; aa++) {
+    wwt = temp[aa];
+    for (i = 0; i < 256; i++) {
+      for (x=-100,b = 0; b < 4; b++) {  /* .iv: best score after dropping one of the four bases */
+        z = (i/ (1 << ((b+1)*2)))*(1<<(b*2))+(i%(1<<(b*2)));
+        if (x < (e=wwt[z])) {
+          x = e;
+          if (do_wgtc) weightc[aa][i].c4 = aacmap[le[aa][z]];
+        }
+      }
+      weight[aa][i].iv=x-gshift;
+      weight[aa][i].iii = wwt[i%64];    /* .iii: score of the low three bases as a codon */
+
+      if (do_wgtc) {
+        weightc[aa][i].c5 = aacmap[le[aa][i%64]];
+        weightc[aa][i].c3 = aacmap[i%64];
+      }
+      x = i %16;                        /* .ii: best score after inserting one base into the low two */
+      for (y = -100, b = 0; b < 3; b++) {
+        z = ((x >> (b*2)) << (b*2+2)) + (x % (1 << (b*2)));
+        for (a = 0; a < 4; a++) {
+          if ((e =wwt[z+(a<<(b*2))]) > y) {
+            y = e;
+            if (do_wgtc)
+              weightc[aa][i].c2 = aacmap[le[aa][z+(a<<(b*2))]];
+          }
+        }
+      }
+      weight[aa][i].ii = y-gshift;
+    }
+  }
+  /*106=CGGG*/
+  for (aa = 0; aa < naa; aa++) {
+    weight[aa][106].iii = wgts[aa][23]; /* is 23 the code for 'X'?*/
+    weight[aa][106].iv = weight[aa][106].ii = weight[aa][106].iii-gshift;
+    if (do_wgtc) {
+      weightc[aa][106].c5 = weightc[aa][106].c4 = weightc[aa][106].c3
+        = weightc[aa][106].c2 = 'X';
+    }
+  }
+}
+
+void
+free_weights(struct wgt ***weighti0, struct wgt ***weighti1,
+             struct wgtc ***weightci, int naa)
+{
+  int aa;
+  struct wgt **weight0;
+  struct wgt **weight1;
+  struct wgtc **weightc;
+  /* release the three tables built by init_weights(); naa must be the value used there */
+  weight0 = *weighti0;
+  weight1 = *weighti1;
+  weightc = *weightci;
+  /* init_weights() allocates rows 0..naa inclusive, so row naa must be freed as well */
+  for (aa=0; aa <= naa; aa++) {free(weight0[aa]);}
+  for (aa=0; aa <= naa; aa++) {free(weight1[aa]);}
+  for (aa=0; aa <= naa; aa++) {free(weightc[aa]);}
+  /* then the arrays of row pointers themselves */
+  free(weight0);
+  free(weight1);
+  free(weightc);
+}
+
+static void
+pre_com(const unsigned char *aa0, int n0, unsigned char *aa0v) {
+  int pos, packed; /* packed: rolling byte built from the hnt[] codes */
+  packed = (hnt[aa0[0]]<<2) + hnt[aa0[1]];
+  for (pos = 2; pos < n0; pos++) {
+    packed = ((packed<<2) + hnt[aa0[pos]]) & 255;
+    if (aa0[pos] == NT_N || aa0[pos-1] == NT_N || aa0[pos-2] == NT_N) {
+      aa0v[pos-2] = 106; /* one of the last three bases is N: store the marker value */
+      continue;
+    }
+    /* 106 (CGGG) doubles as the N marker, so a real 106 is stored as 42 (AGGG) */
+    if (packed == 106) { packed = 42; }
+    aa0v[pos-2] = packed;
+  }
+}
+
+static void
+pre_com_r(const unsigned char *aa0, int n0, unsigned char *aa0v) {
+  int out, src, packed; /* out walks aa0v forward while src walks aa0 backward */
+  packed = ((3-hnt[aa0[n0-1]])<<2) + 3-hnt[aa0[n0-2]];
+  for (out = 0, src = n0-3; out+2 < n0; out++, src--) {
+    packed = ((packed<<2) + 3-hnt[aa0[src]]) & 255;
+    if (aa0[src] == NT_N || aa0[src+1] == NT_N || aa0[src+2] == NT_N) {
+      aa0v[out] = 106; /* one of the three bases is N: store the marker value */
+      continue;
+    }
+    /* 106 is kept for windows that contain an N, so a real 106 is stored as 42 */
+    if (packed == 106) { packed = 42; }
+    aa0v[out] = packed;
+  }
+}
+
+void
+init_work (unsigned char *aa0, int n0,
+ struct pstruct *ppst,
+ struct f_struct **f_arg)
+{
+ int mhv, phv;
+ int hmax;
+ int i0, hv;
+ int pamfact;
+ int btemp;
+ struct f_struct *f_str;
+ struct bdstr *bss;
+ /* these used to be globals, but do not need to be */
+ int ktup, fact, kt1, lkt;
+
+ int maxn0;
+ int *pwaa;
+ int i, j, q;
+ struct swstr *ss, *r_ss;
+ int *waa;
+ int *res;
+ int nsq, ip, *hsq, naat;
+#ifndef TFAST
+ int last_n0, itemp, dnav;
+ unsigned char *fd, *fs, *aa0x, *aa0v;
+ int n0x, n0x3;
+#endif
+
+ if (nt[NT_N] != 'N') {
+ fprintf(stderr," nt[NT_N] (%d) != 'X' (%c) - recompile\n",NT_N,nt[NT_N]);
+ exit(1);
+ }
+
+ if (ppst->ext_sq_set) {
+ nsq = ppst->nsqx; ip = 1;
+ hsq = ppst->hsqx;
+ }
+ else {
+ nsq = ppst->nsqx; ip = 0;
+ hsq = ppst->hsq;
+ }
+
+ f_str = (struct f_struct *)calloc(1,sizeof(struct f_struct));
+
+ if (!ppst->param_u.fa.use_E_thresholds) {
+ btemp = 2 * ppst->param_u.fa.bestoff / 3 +
+ n0 / ppst->param_u.fa.bestscale +
+ ppst->param_u.fa.bkfact *
+ (ppst->param_u.fa.bktup - ppst->param_u.fa.ktup);
+ btemp = min (btemp, ppst->param_u.fa.bestmax);
+ if (btemp > 3 * n0) btemp = 3 * shscore(aa0,n0,ppst->pam2[0]) / 5;
+
+ ppst->param_u.fa.cgap = btemp + ppst->param_u.fa.bestoff / 3;
+ if (ppst->param_u.fa.optcut_set != 1) {
+#ifndef TFAST
+ ppst->param_u.fa.optcut = (btemp*5)/4;
+#else
+ ppst->param_u.fa.optcut = (btemp*4)/3;
+#endif
+ }
+ }
+
+#ifdef OLD_FASTA_GAP
+ ppst->param_u.fa.pgap = ppst->gdelval + ppst->ggapval;
+#else
+ ppst->param_u.fa.pgap = ppst->gdelval + 2*ppst->ggapval;
+#endif
+ pamfact = ppst->param_u.fa.pamfact;
+ ktup = ppst->param_u.fa.ktup;
+ fact = ppst->param_u.fa.scfact * ktup;
+
+#ifndef TFAST
+ /* before hashing, we must set up some space and translate the sequence */
+
+ maxn0 = n0 + 2;
+ if ((aa0x =(unsigned char *)calloc((size_t)maxn0,
+ sizeof(unsigned char)))
+ == NULL) {
+ fprintf (stderr, "cannot allocate aa0x array %d\n", maxn0);
+ exit (1);
+ }
+ aa0x++;
+ f_str->aa0x = aa0x;
+
+
+ if ((aa0v =(unsigned char *)calloc((size_t)maxn0,
+ sizeof(unsigned char)))
+ == NULL) {
+ fprintf (stderr, "cannot allocate aa0v array %d\n", maxn0);
+ exit (1);
+ }
+ aa0v++;
+ f_str->aa0v = aa0v;
+
+ /* make a precomputed codon number series */
+ pre_com(aa0, n0, aa0v);
+
+ last_n0 = 0;
+ for (itemp=0; itemp<3; itemp++) {
+ n0x=saatran(aa0,&aa0x[last_n0],n0,itemp);
+ /* for (i=0; i<n0x; i++) {
+ fprintf(stderr,"%c",aa[aa0x[last_n0+i]]);
+ if ((i%60)==59) fprintf(stderr,"\n");
+ }
+ fprintf(stderr,"\n");
+ */
+ last_n0 += n0x+1;
+ }
+
+ /* fprintf(stderr,"\n"); */
+ n0x = n0;
+ n0x3 = n0x/3;
+
+ /* now switch aa0 and aa0x for hashing functions */
+ fs = aa0;
+ aa0 = aa0x;
+ aa0x = fs;
+#endif
+
+ /* naat must always be MAXLC because library can have LC aa residues */
+ /*
+ if (ppst->ext_sq_set) naat = MAXLC;
+ else naat = MAXUC;
+ */
+ naat = MAXLC;
+
+ init_weights(&f_str->weight0, NULL,
+ ppst->pam2[ip],-ppst->gshift,-ppst->gsubs,naat);
+ init_weights(&f_str->weight1, &f_str->weight_c,
+ ppst->pam2[0],-ppst->gshift,-ppst->gsubs,naat);
+
+ if (pamfact == -1)
+ pamfact = 0;
+ else if (pamfact == -2)
+ pamfact = 1;
+
+ for (i0 = 1, mhv = -1; i0 < ppst->nsq; i0++)
+ if (hsq[i0] < NMAP && hsq[i0] > mhv)
+ mhv = ppst->hsq[i0];
+
+ if (mhv <= 0)
+ {
+ fprintf (stderr, " maximum hsq <=0 %d\n", mhv);
+ exit (1);
+ }
+
+ for (f_str->kshft = 0; mhv > 0; mhv /= 2) f_str->kshft++;
+
+/* kshft = 2; */
+ kt1 = ktup - 1;
+ hv = 1;
+ for (i0 = 0; i0 < ktup; i0++)
+ hv = hv << f_str->kshft;
+ hmax = hv;
+ f_str->hmask = (hmax >> f_str->kshft) - 1;
+
+ if ((f_str->harr = (int *) calloc (hmax, sizeof (int))) == NULL) {
+ fprintf (stderr, " cannot allocate hash array\n");
+ exit (1);
+ }
+ if ((f_str->pamh1 = (int *) calloc (ppst->nsq+1, sizeof (int))) == NULL) {
+ fprintf (stderr, " cannot allocate pamh1 array\n");
+ exit (1);
+ }
+ if ((f_str->pamh2 = (int *) calloc (hmax, sizeof (int))) == NULL) {
+ fprintf (stderr, " cannot allocate pamh2 array\n");
+ exit (1);
+ }
+ if ((f_str->link = (int *) calloc (n0, sizeof (int))) == NULL) {
+ fprintf (stderr, " cannot allocate hash link array");
+ exit (1);
+ }
+
+ for (i0 = 0; i0 < hmax; i0++)
+ f_str->harr[i0] = -1;
+ for (i0 = 0; i0 < n0; i0++)
+ f_str->link[i0] = -1;
+
+ /* encode the aa0 array */
+ phv = hv = 0;
+ lkt = kt1;
+ for (i0 = 0; i0 < min(n0,lkt); i0++) {
+ if (hsq[aa0[i0]] >= NMAP) {
+ hv=phv=0; lkt = i0+ktup; continue;
+ }
+ hv = (hv << f_str->kshft) + ppst->hsq[aa0[i0]];
+ phv += ppst->pam2[ip][aa0[i0]][aa0[i0]] * ktup;
+ }
+
+ for (; i0 < n0; i0++) {
+ if (hsq[aa0[i0]] >= NMAP) {
+ hv=phv=0;
+ /* restart hv, phv calculation */
+ for (lkt = i0+kt1; (i0 < lkt || hsq[aa0[i0]]>=NMAP) && i0<n0; i0++) {
+ if (hsq[aa0[i0]] >= NMAP) {
+ hv=phv=0;
+ lkt = i0+ktup;
+ continue;
+ }
+ hv = (hv << f_str->kshft) + hsq[aa0[i0]];
+ phv += ppst->pam2[ip][aa0[i0]][aa0[i0]]*ktup;
+ }
+ }
+ if (i0 >= n0) break;
+ hv = ((hv & f_str->hmask) << f_str->kshft) + ppst->hsq[aa0[i0]];
+ f_str->link[i0] = f_str->harr[hv];
+ f_str->harr[hv] = i0;
+ if (pamfact) {
+ f_str->pamh2[hv] = (phv += ppst->pam2[ip][aa0[i0]][aa0[i0]] * ktup);
+ if (hsq[aa0[i0-kt1]] < NMAP)
+ phv -= ppst->pam2[ip][aa0[i0 - kt1]][aa0[i0 - kt1]] * ktup;
+ }
+ else f_str->pamh2[hv] = fact * ktup;
+ }
+
+/* this has been modified from 0..<ppst->nsq to 1..<=ppst->nsq because the
+ pam2[0][0] is now undefined for consistency with blast
+*/
+
+ if (pamfact)
+ for (i0 = 1; i0 < ppst->nsq; i0++)
+ f_str->pamh1[i0] = ppst->pam2[ip][i0][i0] * ktup;
+ else
+ for (i0 = 1; i0 < ppst->nsq; i0++)
+ f_str->pamh1[i0] = fact;
+
+ f_str->ndo = 0; /* used to save time on diagonals with long queries */
+
+
+#ifndef ALLOCN0
+ if ((f_str->diag = (struct dstruct *) calloc ((size_t)MAXDIAG,
+ sizeof (struct dstruct)))==NULL) {
+ fprintf (stderr," cannot allocate diagonal arrays: %lu\n",
+ MAXDIAG *sizeof (struct dstruct));
+ exit (1);
+ };
+#else
+ if ((f_str->diag = (struct dstruct *) calloc ((size_t)n0,
+ sizeof (struct dstruct)))==NULL) {
+ fprintf (stderr," cannot allocate diagonal arrays: %ld\n",
+ (long)n0*sizeof (struct dstruct));
+ exit (1);
+ };
+#endif
+
+#ifndef TFAST
+ /* done hashing, now switch aa0, aa0x back */
+ fs = aa0;
+ aa0 = aa0x;
+ aa0x = fs;
+#else
+ if ((f_str->aa1x =(unsigned char *)calloc((size_t)ppst->maxlen+4,
+ sizeof(unsigned char)))
+ == NULL) {
+ fprintf (stderr, "cannot allocate aa1x array %d\n", ppst->maxlen+4);
+ exit (1);
+ }
+ f_str->aa1x++;
+
+ if ((f_str->aa1v =(unsigned char *)calloc((size_t)ppst->maxlen+4,
+ sizeof(unsigned char))) == NULL) {
+ fprintf (stderr, "cannot allocate aa1v array %d\n", ppst->maxlen+4);
+ exit (1);
+ }
+ f_str->aa1v++;
+
+#endif
+
+ if ((waa= (int *)malloc (sizeof(int)*(nsq+1)*n0)) == NULL) {
+ fprintf(stderr,"cannot allocate waa struct %3d\n",nsq*n0);
+ exit(1);
+ }
+
+ pwaa = waa;
+ for (i=0; i<nsq; i++) {
+ for (j=0;j<n0; j++) {
+ *pwaa = ppst->pam2[ip][i][aa0[j]];
+ pwaa++;
+ }
+ }
+ f_str->waa = waa;
+
+#ifndef TFAST
+ maxn0 = max(2*n0,MIN_RES);
+#else
+ maxn0 = max(4*n0,MIN_RES);
+#endif
+ if ((res = (int *)calloc((size_t)maxn0,sizeof(int)))==NULL) {
+ fprintf(stderr,"cannot allocate alignment results array %d\n",maxn0);
+ exit(1);
+ }
+ f_str->res = res;
+ f_str->max_res = maxn0;
+
+ *f_arg = f_str;
+}
+
+/* pstring1 is a message to the manager, currently 512 */
+/* pstring2 is the same information, but in a markx==10 format */
+void
+get_param (const struct pstruct *ppst,
+ char **pstring1, char *pstring2,
+ struct score_count_s *s_info)
+{ /* get_param - describe the current search parameters as text.
+ * pstring1[0] receives the program name and verstr (plus " [optimized]"
+ * and " init1" when the matching flags are set); pstring1[1] receives
+ * the matrix / gap / shift / subs / ktup / width summary.
+ * pstring2, when not NULL, receives the same data as "; pg_..." lines.
+ * s_info supplies s_cnt[] and tot_scores for the reported fractions;
+ * tot_scores is used as a divisor (in double arithmetic) without a
+ * zero check.
+ * All output goes through sprintf/strcat with no length checks, so the
+ * caller's buffers must be large enough.
+ */
+ char options_str1[128]; /* join/opt text embedded in pstring1[1] */
+ char options_str2[128]; /* join/opt text embedded in pstring2 */
+#ifndef TFAST
+ char *pg_str="FASTY";
+#else
+ char *pg_str="TFASTY";
+#endif
+
+ if (!ppst->param_u.fa.use_E_thresholds) { /* fixed score thresholds */
+ sprintf(options_str1,"join: %d (%.3g), opt: %d (%.3g)",
+ ppst->param_u.fa.cgap, (double)s_info->s_cnt[0]/(double)s_info->tot_scores,
+ ppst->param_u.fa.optcut, (double)s_info->s_cnt[2]/(double)s_info->tot_scores);
+ sprintf(options_str2,"pg_join: %d (%.3g)\n; pg_optcut: %d (%.3g)",
+ ppst->param_u.fa.cgap, (double)s_info->s_cnt[0]/(double)s_info->tot_scores,
+ ppst->param_u.fa.optcut, (double)s_info->s_cnt[2]/(double)s_info->tot_scores);
+ }
+ else { /* E()-value based thresholds */
+ sprintf(options_str1,"E-join: %.2g (%.3g), E-opt: %.2g (%.3g)",
+ ppst->param_u.fa.E_join, (double)s_info->s_cnt[0]/(double)s_info->tot_scores,
+ ppst->param_u.fa.E_band_opt, (double)s_info->s_cnt[2]/(double)s_info->tot_scores);
+ sprintf(options_str2,"pg_join_E(): %.2g (%.3g)\n; pg_optcut_E(): %.2g (%.3g)",
+ ppst->param_u.fa.E_join, (double)s_info->s_cnt[0]/(double)s_info->tot_scores,
+ ppst->param_u.fa.E_band_opt, (double)s_info->s_cnt[2]/(double)s_info->tot_scores);
+ }
+
+ if (!ppst->param_u.fa.optflag)
+ sprintf (pstring1[0], "%s (%s)",pg_str, verstr);
+ else
+ sprintf (pstring1[0], "%s (%s) [optimized]",pg_str, verstr);
+
+ sprintf (pstring1[1],
+#ifdef OLD_FASTA_GAP
+ "%s matrix (%d:%d)%s, gap-pen: %3d/%3d, shift: %3d, subs: %3d\n ktup: %d, %s, width: %3d",
+#else
+ "%s matrix (%d:%d)%s, open/ext: %3d/%3d, shift: %3d, subs: %3d\n ktup: %d, %s, width: %3d",
+#endif
+ ppst->pam_name, ppst->pam_h,ppst->pam_l,
+ (ppst->ext_sq_set) ? "xS":"\0",
+ ppst->gdelval, ppst->ggapval,
+ ppst->gshift,ppst->gsubs,
+ ppst->param_u.fa.ktup, options_str1, ppst->param_u.fa.optwid);
+
+ if (ppst->param_u.fa.iniflag) strcat(pstring1[0]," init1");
+
+ if (pstring2 != NULL) { /* pstring2 is optional */
+#ifdef OLD_FASTA_GAP
+ sprintf (pstring2, "; pg_name_alg: %s\n; pg_ver_rel: %s\n; pg_matrix: %s (%d:%d)%s\n\
+; pg_gap-pen: %d %d\n; pg_ktup: %d\n; %s\n",
+#else
+ sprintf (pstring2, "; pg_name_alg: %s\n; pg_ver_rel: %s\n; pg_matrix: %s (%d:%d)%s\n\
+; pg_open-ext: %d %d\n; pg_ktup: %d\n; %s\n",
+#endif
+ pg_str,verstr,ppst->pam_name, ppst->pam_h,ppst->pam_l,
+ (ppst->ext_sq_set) ? "xS":"\0", ppst->gdelval,
+ ppst->ggapval,ppst->param_u.fa.ktup,options_str2);
+ }
+ }
+
+void
+close_work (const unsigned char *aa0, int n0,
+ struct pstruct *ppst,
+ struct f_struct **f_arg)
+{
+ /* close_work - release every buffer hanging off *f_arg, free the
+ * f_struct itself and set *f_arg to NULL. Does nothing when *f_arg is
+ * already NULL. aa0, n0 and ppst are not used; they are kept so the
+ * signature stays unchanged for callers.
+ */
+ struct f_struct *f_str;
+ int naat;
+
+ f_str = *f_arg;
+
+ if (f_str != NULL) {
+ /* The set-up code in this file always calls init_weights() with
+ * naat = MAXLC ("naat must always be MAXLC because library can have
+ * LC aa residues"), so the tables must be released with the same
+ * count; using MAXUC here would not match the allocation.
+ */
+ naat = MAXLC;
+ free_weights(&f_str->weight0,&f_str->weight1,&f_str->weight_c,naat);
+ free(f_str->cur);
+#ifndef TFAST
+ /* these pointers were advanced by one after allocation; step back
+ * before freeing */
+ f_str->aa0v--;
+ free(f_str->aa0v);
+ f_str->aa0x--;
+ free(f_str->aa0x);
+#else /* TFAST */
+ f_str->aa1x--;
+ free(f_str->aa1x);
+ f_str->aa1v--;
+ free(f_str->aa1v);
+#endif
+ free(f_str->res);
+ free(f_str->waa);
+ free(f_str->diag);
+ free(f_str->link);
+ free(f_str->pamh2);
+ free(f_str->pamh1);
+ free(f_str->harr);
+ free(f_str);
+ *f_arg = NULL;
+ }
+}
+
+void do_fastz (const unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1,
+ const unsigned char *aa1v,
+ const struct pstruct *ppst, struct f_struct *f_str,
+ struct rstruct *rst, int *hoff, int shuff_flg,
+ struct score_count_s *s_info)
+{
+ /* do_fastz - score aa1 against the sequence hashed into f_str.
+ *
+ * Steps: (1) walk aa1, looking up each ktup word in f_str->harr/link and
+ * accumulating per-diagonal scores in f_str->diag; the best runs are
+ * kept in vmax[] through savemax(); (2) rescore each saved run with
+ * spam(); (3) shift run coordinates for the three translation frames;
+ * (4) join runs with sconn(); (5) when optflag is set and the joined
+ * score exceeds opt_cut, run the banded alignment dmatchz().
+ *
+ * Outputs: rst->score[1] is the best single run, rst->score[0] is
+ * max(joined score, best single run), rst->score[2] is score[0] or the
+ * dmatchz() score; rst->valid_stat, *hoff and the s_info counters are
+ * updated.
+ * Early exits: all scores 0 when n1 < ktup or no run was saved; all
+ * scores -1 (with a message on stderr) when n0+n1+1 >= MAXDIAG.
+ * shuff_flg is accepted but not used in this body.
+ */
+ int nd; /* diagonal array size */
+ int lhval;
+ int kfact;
+ int i;
+ register struct dstruct *dptr;
+ struct savestr vmax[MAXSAV]; /* best matches saved for one sequence */
+ struct savestr *vptr[MAXSAV];
+ struct savestr *lowmax;
+ int lowscor;
+ register int tscor;
+ int xdebug = 0;
+
+#ifndef ALLOCN0
+ register struct dstruct *diagp;
+#else
+ register int dpos;
+ int lposn0;
+#endif
+ struct dstruct *dpmax;
+ register int lpos;
+ int tpos;
+ struct savestr *vmptr;
+ int scor, tmp;
+ int im, ib, nsave;
+ int ktup, kt1, ip, lkt, ktup_sq;
+ const int *hsq;
+ int c_gap, opt_cut;
+#ifndef TFAST
+ int n0x31, n0x32;
+ n0x31 = (n0-2)/3;
+ n0x32 = n0x31+1+(n0-n0x31-1)/2;
+#else
+ unsigned char *fs, *fd;
+ int n1x31, n1x32, last_n1, itemp;
+ n1x31 = (n1-2)/3;
+ n1x32 = n1x31+1+(n1-n1x31-1)/2;
+#endif
+
+ if (ppst->ext_sq_set) {
+ ip = 1;
+ hsq = ppst->hsqx;
+ }
+ else {
+ ip = 0;
+ hsq = ppst->hsq;
+ }
+
+ ktup = ppst->param_u.fa.ktup;
+ ktup_sq = ktup*ktup;
+ if (ktup == 1) ktup_sq *= 2;
+
+ kt1 = ktup-1;
+
+ if (n1 < ktup) {
+ rst->score[0] = rst->score[1] = rst->score[2] = 0;
+ return;
+ }
+
+ if (n0+n1+1 >= MAXDIAG) {
+ fprintf(stderr,"n0,n1 too large: %d, %d\n",n0,n1);
+ rst->score[0] = rst->score[1] = rst->score[2] = -1;
+ return;
+ }
+
+ if (ppst->param_u.fa.use_E_thresholds) {
+ c_gap = ELK_to_s(ppst->param_u.fa.E_join*ktup_sq*2.5, n0, n1, ppst->pLambda, ppst->pK, ppst->pH);
+ opt_cut = ELK_to_s(ppst->param_u.fa.E_band_opt*ktup_sq*2.0, n0, n1, ppst->pLambda, ppst->pK, ppst->pH);
+ rst->valid_stat = 0;
+ }
+ else {
+ c_gap = ppst->param_u.fa.cgap;
+ opt_cut = ppst->param_u.fa.optcut;
+ rst->valid_stat = 1;
+ }
+
+ f_str->noff = n0 - 1;
+
+#ifdef ALLOCN0
+ nd = n0;
+#endif
+
+#ifndef ALLOCN0
+ nd = n0 + n1;
+#endif
+
+ /* clear the diagonals that the previous call did not already reset */
+ dpmax = &f_str->diag[nd];
+ for (dptr = &f_str->diag[f_str->ndo]; dptr < dpmax;)
+ {
+ dptr->stop = -1;
+ dptr->dmax = NULL;
+ dptr++->score = 0;
+ }
+
+ for (vmptr = vmax; vmptr < &vmax[MAXSAV]; vmptr++)
+ vmptr->score = 0;
+ lowmax = vmax;
+ lowscor = 0;
+
+ /* debugging probe; the length guard must cover the highest index read
+ * (aa1[1401]), not just 1000 */
+ if (n1 > 1401 && aa1[0]==23 && aa1[100]==23 &&
+ aa1[1400]==23 && aa1[1401]!=23) {
+ xdebug = 1;
+ }
+ else xdebug = 0;
+
+ /* start hashing */
+ lhval = 0;
+ lkt = kt1;
+ /* test lpos<n1 first so aa1[lpos] is never read past the sequence */
+ for (lpos = 0; lpos<n1 && (lpos < lkt || hsq[aa1[lpos]]>=NMAP); lpos++) {
+ /* restart lhval calculation */
+ if (hsq[aa1[lpos]]>=NMAP) {
+ lhval = 0; lkt=lpos+ktup;
+ continue;
+ }
+ lhval = ((lhval & f_str->hmask) << f_str->kshft) + ppst->hsq[aa1[lpos]];
+ }
+
+#ifndef ALLOCN0
+ diagp = &f_str->diag[f_str->noff + lkt];
+ for (; lpos < n1; lpos++, diagp++) {
+ if (hsq[aa1[lpos]]>=NMAP) {
+ lpos++ ; diagp++;
+ while (lpos < n1 && hsq[aa1[lpos]]>=NMAP) {lpos++; diagp++;}
+ if (lpos >= n1) break;
+ lhval = 0;
+ }
+ lhval = ((lhval & f_str->hmask) << f_str->kshft) + ppst->hsq[aa1[lpos]];
+ for (tpos = f_str->harr[lhval]; tpos >= 0; tpos = f_str->link[tpos]) {
+ if ((tscor = (dptr = &diagp[-tpos])->stop) >= 0) {
+#else
+ lposn0 = f_str->noff + lpos;
+ for (; lpos < n1; lpos++, lposn0++) {
+ if (hsq[aa1[lpos]]>=NMAP) {lhval = 0; goto loopl;}
+ lhval = ((lhval & f_str->hmask) << f_str->kshft) + ppst->hsq[aa1[lpos]];
+ for (tpos = f_str->harr[lhval]; tpos >= 0; tpos = f_str->link[tpos]) {
+ dpos = lposn0 - tpos;
+ if ((tscor = (dptr = &f_str->diag[dpos % nd])->stop) >= 0) {
+#endif
+ tscor += ktup;
+ if ((tscor -= lpos) <= 0) {
+ scor = dptr->score;
+ if ((tscor += (kfact = f_str->pamh2[lhval])) < 0 && lowscor < scor) {
+#ifdef ALLOCN0
+ lowscor = savemax (dptr, dpos, vmax, &lowmax);
+#else
+ lowscor = savemax (dptr, dptr- f_str->diag, vmax, &lowmax);
+#endif
+ }
+ if ((tscor += scor) >= kfact) {
+ dptr->score = tscor;
+ dptr->stop = lpos;
+ }
+ else {
+ dptr->score = kfact;
+ dptr->start = (dptr->stop = lpos) - kt1;
+ }
+ }
+ else {
+ dptr->score += f_str->pamh1[aa0[tpos]];
+ dptr->stop = lpos;
+ }
+ }
+ else {
+ dptr->score = f_str->pamh2[lhval];
+ dptr->start = (dptr->stop = lpos) - kt1;
+ }
+ } /* end tpos */
+
+#ifdef ALLOCN0
+ /* reinitialize diag structure */
+ loopl:
+ if ((dptr = &f_str->diag[lpos % nd])->score > lowscor)
+ lowscor = savemax (dptr, lpos, vmax, &lowmax);
+ dptr->stop = -1;
+ dptr->dmax = NULL;
+ dptr->score = 0;
+#endif
+ } /* end lpos */
+
+#ifdef ALLOCN0
+ for (tpos = 0, dpos = f_str->noff + n1 - 1; tpos < n0; tpos++, dpos--) {
+ if ((dptr = &f_str->diag[dpos % nd])->score > lowscor)
+ /* savemax() takes four arguments, as in the other calls above */
+ lowscor = savemax (dptr, dpos, vmax, &lowmax);
+ }
+#else
+ for (dptr = f_str->diag; dptr < dpmax;) {
+ if (dptr->score > lowscor) savemax (dptr, dptr - f_str->diag, vmax, &lowmax);
+ dptr->stop = -1;
+ dptr->dmax = NULL;
+ dptr++->score = 0;
+ }
+ f_str->ndo = nd;
+#endif
+
+ /* rescore every saved run and collect the ones that survive */
+ for (nsave = 0, vmptr = vmax; vmptr < &vmax[MAXSAV]; vmptr++) {
+ if (vmptr->score > 0) {
+ vmptr->score = spam (aa0, aa1, vmptr, ppst->pam2[ip], f_str);
+ vptr[nsave++] = vmptr;
+ }
+ }
+
+ if (nsave <= 0) {
+ rst->score[0] = rst->score[1] = rst->score[2] = 0;
+ return;
+ }
+
+#ifndef TFAST
+ /* FASTX code here to modify the start, stop points for
+ the three phases of the translated protein sequence
+ */
+
+ for (ib=0; ib<nsave; ib++) {
+ if (f_str->noff-vptr[ib]->dp+vptr[ib]->start >= n0x32)
+ vptr[ib]->dp += n0x32;
+ if (f_str->noff-vptr[ib]->dp +vptr[ib]->start >= n0x31)
+ vptr[ib]->dp += n0x31;
+ }
+#else
+ /* TFAST code here to modify the start, stop points for
+ the three phases of the translated protein sequence
+ TFAST modifies library start points, rather than
+ query start points
+ */
+
+ for (ib=0; ib<nsave; ib++) {
+ if (vptr[ib]->start >= n1x32) {
+ vptr[ib]->start -= n1x32;
+ vptr[ib]->stop -= n1x32;
+ vptr[ib]->dp -= n1x32;
+ }
+ if (vptr[ib]->start >= n1x31) {
+ vptr[ib]->start -= n1x31;
+ vptr[ib]->stop -= n1x31;
+ vptr[ib]->dp -= n1x31;
+ }
+ }
+#endif /* TFAST */
+
+ scor = sconn (vptr, nsave, c_gap,
+ ppst->param_u.fa.pgap, f_str);
+
+ /* find the single best-scoring run */
+ for (vmptr=vptr[0],ib=1; ib<nsave; ib++)
+ if (vptr[ib]->score > vmptr->score) vmptr=vptr[ib];
+
+/* kssort (vptr, nsave); */
+
+ rst->score[1] = vmptr->score;
+ rst->score[0] = max (scor, vmptr->score);
+ rst->score[2] = rst->score[0]; /* initn */
+
+ s_info->tot_scores++;
+ if (rst->score[0] > c_gap) { s_info->s_cnt[0]++;}
+#ifndef TFAST
+ *hoff=f_str->noff - vmptr->dp;
+#else /* TFAST */
+ *hoff=vmptr->dp-f_str->noff;
+#endif /* TFAST */
+ if (ppst->param_u.fa.optflag) {
+ if (/* shuff_flg || */ rst->score[0] > opt_cut) {
+ s_info->s_cnt[2]++;
+ rst->valid_stat = 1;
+ rst->score[2] = dmatchz(aa0, n0,aa1,n1, aa1v,
+ *hoff,ppst->param_u.fa.optwid,
+ ppst->pam2[ip],
+ ppst->gdelval,ppst->ggapval,ppst->gshift,
+ f_str);
+ }
+ }
+}
+
+void do_work (const unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1,
+ int frame,
+ const struct pstruct *ppst,
+ struct f_struct *f_str,
+ int qr_flg, int shuff_flg, struct rstruct *rst,
+ struct score_count_s *s_info)
+{ /* do_work - score one library sequence (aa1, n1) against the query.
+ * Resets rst (escore 1.0, segnum/seglen 1, valid_stat 0), returns zero
+ * scores when n1 < ktup, otherwise fills rst through do_fastz() and
+ * finally sets rst->comp and rst->H to -1.0.
+ * Without TFAST the stored f_str->aa0x / f_str->aa0v are passed to
+ * do_fastz() in place of aa0. With TFAST, aa1 is first turned into a
+ * codon series (pre_com, or pre_com_r when frame != 0) and translated
+ * with three saatran() calls into f_str->aa1x.
+ * qr_flg is not used in this body.
+ */
+ int hoff;
+ int last_n1, itx, dnav, n10, i, ir;
+ unsigned char *aa1x;
+
+ rst->escore = 1.0;
+ rst->segnum = rst->seglen = 1;
+ rst->valid_stat = 0;
+
+ if (n1 < ppst->param_u.fa.ktup) { /* too short to hold one ktup word */
+ rst->score[0] = rst->score[1] = rst->score[2] = 0;
+ return;
+ }
+
+#ifndef TFAST
+ do_fastz (f_str->aa0x, n0, aa1, n1, f_str->aa0v, ppst, f_str, rst, &hoff, shuff_flg, s_info);
+#else
+ /* make a precomputed codon number series */
+
+ if (frame == 0) {
+ pre_com(aa1, n1, f_str->aa1v);
+ }
+ else {
+ pre_com_r(aa1, n1, f_str->aa1v);
+ }
+
+ /* make translated sequence */
+ last_n1 = 0;
+ aa1x = f_str->aa1x;
+#ifdef DEBUG
+ if (frame > 1) {
+ fprintf(stderr, "*** fz_walign - frame: %d - out of range [0,1]\n",frame);
+ }
+#endif
+
+ for (itx= frame*3; itx< frame*3+3; itx++) { /* three translations, stored back to back */
+ n10 = saatran(aa1,&aa1x[last_n1],n1,itx);
+ last_n1 += n10+1; /* +1 leaves one separator slot between translations */
+ }
+ n10 = last_n1-1; /* total length without the final separator */
+
+ do_fastz (aa0, n0, f_str->aa1x, n10, f_str->aa1v, ppst, f_str, rst, &hoff, shuff_flg, s_info);
+#endif
+
+ rst->comp = rst->H = -1.0;
+}
+
+void do_opt (const unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1,
+ int frame,
+ struct pstruct *ppst,
+ struct f_struct *f_str,
+ struct rstruct *rst)
+{ /* do_opt - rerun do_fastz() with ppst->param_u.fa.optflag forced to 1.
+ * The caller's optflag is saved on entry and restored on exit, so ppst
+ * is modified for the duration of the call. Score counts go into a
+ * local, zero-initialized s_info that is discarded. With TFAST the
+ * library sequence is converted and translated first, as in do_work().
+ */
+ int optflag, tscore, hoff;
+ int last_n1, itx, n10, i, ir;
+ unsigned char *aa1x;
+ struct score_count_s s_info = {0, 0, 0, 0};
+
+ optflag = ppst->param_u.fa.optflag; /* remember the caller's setting */
+ ppst->param_u.fa.optflag = 1;
+
+#ifndef TFAST
+ do_fastz (f_str->aa0x, n0, aa1, n1, f_str->aa0v, ppst, f_str, rst, &hoff, 0, &s_info);
+#else
+ /* make a precomputed codon number series */
+
+ if (frame == 0) {
+ pre_com(aa1, n1, f_str->aa1v);
+ }
+ else {
+ pre_com_r(aa1, n1, f_str->aa1v);
+ }
+
+ /* make translated sequence */
+ last_n1 = 0;
+ aa1x = f_str->aa1x;
+ for (itx= frame*3; itx< frame*3+3; itx++) {
+ n10 = saatran(aa1,&aa1x[last_n1],n1,itx);
+ last_n1 += n10+1;
+ }
+ n10 = last_n1-1;
+
+ do_fastz (aa0, n0, f_str->aa1x, n10, f_str->aa1v, ppst, f_str, rst, &hoff, 0, &s_info );
+#endif
+
+ ppst->param_u.fa.optflag = optflag; /* restore the caller's setting */
+}
+
+int
+savemax (struct dstruct *dptr, int dpos,
+ struct savestr *vmax, struct savestr **lowmax)
+{ /* savemax - record the run on diagonal dptr in the vmax[] table.
+ * If dptr->dmax already points at an entry with the same dp and start,
+ * that entry's stop (and score, if higher) is updated; otherwise the
+ * entry at *lowmax is overwritten with this run.
+ * Returns the lowest score now held in vmax[0..MAXSAV) and leaves
+ * *lowmax pointing at that entry.
+ */
+ struct savestr *vmptr;
+ int i;
+
+/* check to see if this is the continuation of a run that is already saved */
+
+ if ((vmptr = dptr->dmax) != NULL && vmptr->dp == dpos &&
+ vmptr->start == dptr->start) {
+ vmptr->stop = dptr->stop;
+ if ((i = dptr->score) <= vmptr->score) return (*lowmax)->score; /* score did not improve */
+ vmptr->score = i;
+ if (vmptr != *lowmax) return (*lowmax)->score; /* lowest entry unchanged, no rescan needed */
+ }
+ else {
+ i = (*lowmax)->score = dptr->score;
+ (*lowmax)->dp = dpos;
+ (*lowmax)->start = dptr->start;
+ (*lowmax)->stop = dptr->stop;
+ dptr->dmax = *lowmax;
+ }
+
+ for (vmptr = vmax; vmptr < vmax+MAXSAV; vmptr++) { /* rescan for the new lowest entry */
+ if (vmptr->score < i) {
+ i = vmptr->score;
+ *lowmax = vmptr;
+ }
+ }
+ return i;
+}
+
+int spam (const unsigned char *aa0,
+ const unsigned char *aa1,
+ struct savestr *dmax, int **pam2,
+ struct f_struct *f_str)
+{
+ /* spam - rescore the run described by dmax with the pam2 matrix and
+ * trim it to its best-scoring sub-segment.
+ * Walks aa1[start..stop] and the matching aa0 positions (offset by
+ * f_str->noff - dmax->dp), keeping a running total that restarts
+ * whenever it drops below zero.
+ * On return dmax->start/stop bound the best segment and its score is
+ * returned. When no prefix scores above zero the function returns 0
+ * and leaves dmax->start/stop unchanged.
+ */
+ int lpos;
+ int tot;
+ struct {
+ int start, stop, score;
+ } curv, maxv;
+ const unsigned char *aa0p, *aa1p;
+
+ aa1p = &aa1[lpos = dmax->start];
+ aa0p = &aa0[lpos - dmax->dp + f_str->noff];
+ curv.start = lpos;
+ curv.stop = lpos;
+
+ /* start from the incoming bounds so that dmax is never overwritten
+ * with indeterminate values when no positive segment is found */
+ maxv.start = dmax->start;
+ maxv.stop = dmax->stop;
+
+ tot = curv.score = maxv.score = 0;
+ for (; lpos <= dmax->stop; lpos++) {
+ tot += pam2[*aa0p++][*aa1p++];
+ if (tot > curv.score) {
+ curv.stop = lpos;
+ curv.score = tot;
+ }
+ else if (tot < 0) {
+ /* close the current segment and start a new one after lpos */
+ if (curv.score > maxv.score) {
+ maxv.start = curv.start;
+ maxv.stop = curv.stop;
+ maxv.score = curv.score;
+ }
+ tot = curv.score = 0;
+ curv.start = lpos+1;
+ }
+ }
+
+ if (curv.score > maxv.score) {
+ maxv.start = curv.start;
+ maxv.stop = curv.stop;
+ maxv.score = curv.score;
+ }
+
+/* if (maxv.start != dmax->start || maxv.stop != dmax->stop)
+ printf(" new region: %3d %3d %3d %3d\n",maxv.start,
+ dmax->start,maxv.stop,dmax->stop);
+*/
+ dmax->start = maxv.start;
+ dmax->stop = maxv.stop;
+
+ return maxv.score;
+}
+
+#define XFACT 10
+
+int sconn (struct savestr **v, int n,
+ int cgap, int pgap, struct f_struct *f_str)
+{ /* sconn - join compatible runs and return the best joined score.
+ * v[0..n) is sorted by start with kpsort(). Each run scoring at least
+ * cgap is chained onto the first list entry that ends before it
+ * (within XFACT positions) in both coordinates, scoring
+ * previous + own + pgap. The list is kept with the highest score
+ * first; the head's score is returned, or 0 when no run qualifies.
+ * NOTE(review): a run whose score is not greater than any entry
+ * already in the list is never linked in (there is no append after the
+ * insertion loop), so later runs cannot chain onto it - confirm intended.
+ */
+ int i, si;
+ struct slink {
+ int score;
+ struct savestr *vp;
+ struct slink *next;
+ } *start, *sl, *sj, *so, sarr[MAXSAV];
+ int lstart, tstart, plstop, ptstop;
+
+/* sort the score left to right in lib pos */
+
+ kpsort (v, n);
+
+ start = NULL;
+
+/* for the remaining runs, see if they fit */
+
+ for (i = 0, si = 0; i < n; i++)
+ {
+
+/* if the score is less than the gap penalty, it never helps */
+ if (v[i]->score < cgap)
+ continue;
+ lstart = v[i]->start;
+ tstart = lstart - v[i]->dp + f_str->noff; /* start in the other sequence */
+
+/* put the run in the group */
+ sarr[si].vp = v[i];
+ sarr[si].score = v[i]->score;
+ sarr[si].next = NULL;
+
+/* if it fits, then increase the score */
+ for (sl = start; sl != NULL; sl = sl->next)
+ {
+ plstop = sl->vp->stop;
+ ptstop = plstop - sl->vp->dp + f_str->noff;
+ if (plstop < lstart+XFACT && ptstop < tstart+XFACT) {
+ sarr[si].score = sl->score + v[i]->score + pgap;
+ break;
+ }
+ }
+
+/* now recalculate where the score fits */
+ if (start == NULL)
+ start = &sarr[si];
+ else
+ for (sj = start, so = NULL; sj != NULL; sj = sj->next)
+ {
+ if (sarr[si].score > sj->score)
+ {
+ sarr[si].next = sj;
+ if (so != NULL)
+ so->next = &sarr[si];
+ else
+ start = &sarr[si];
+ break;
+ }
+ so = sj;
+ }
+ si++;
+ }
+
+ if (start != NULL)
+ return (start->score);
+ else
+ return (0);
+}
+
+void
+kssort (v, n)
+struct savestr *v[];
+int n;
+{ /* kssort - sort the n pointers in v[] in place by descending score,
+ * using gapped exchange passes with the gap halved each round.
+ */
+ int gap, i, j;
+ struct savestr *tmp;
+
+ for (gap = n / 2; gap > 0; gap /= 2)
+ for (i = gap; i < n; i++)
+ for (j = i - gap; j >= 0; j -= gap)
+ {
+ if (v[j]->score >= v[j + gap]->score) /* pair already in order */
+ break;
+ tmp = v[j];
+ v[j] = v[j + gap];
+ v[j + gap] = tmp;
+ }
+}
+
+void
+kpsort (struct savestr **v, int n) { /* kpsort - sort the n pointers in
+ * v[] in place by ascending start, using gapped insertion passes with
+ * the fixed increments 21, 7, 3, 1.
+ */
+ int gap, i, j, k;
+ int incs[4] = { 21, 7, 3, 1 };
+ struct savestr *tmp;
+ int v_start;
+
+ for ( k = 0; k < 4; k++) {
+ gap = incs[k];
+ for (i = gap; i < n; i++) {
+ tmp = v[i];
+ j = i;
+ v_start = v[i]->start;
+ while (j >= gap && v[j - gap]->start > v_start) { /* shift larger entries up by gap */
+ v[j] = v[j - gap];
+ j -= gap;
+ }
+ v[j] = tmp;
+ }
+ }
+}
+
+static int
+dmatchz(const unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1,
+ const unsigned char *aa1v,
+ int hoff, int window,
+ int **pam2, int gdelval, int ggapval, int gshift,
+ struct f_struct *f_str)
+{ /* dmatchz - run the banded alignment lx_band() around offset hoff.
+ * hoff is moved back by window/2 so that it becomes the start of a
+ * band of width window. Penalties are passed to lx_band() negated;
+ * with OLD_FASTA_GAP the open penalty is -(gdelval - ggapval).
+ * Without TFAST the call is lx_band(aa1, n1, f_str->aa0v, n0-2, ...),
+ * so the aa0 and aa1v arguments are not used; with TFAST it is
+ * lx_band(aa0, n0, aa1v, n1-2, ...).
+ * Returns the value returned by lx_band().
+ */
+
+ hoff -= window/2;
+
+#ifndef TFAST
+ return lx_band(aa1,n1,f_str->aa0v,n0-2,
+ pam2,
+#ifdef OLD_FASTA_GAP
+ -(gdelval - ggapval),
+#else
+ -gdelval,
+#endif
+ -ggapval,-gshift,
+ hoff,window,f_str);
+#else
+ return lx_band(aa0,n0,aa1v,n1-2,
+ pam2,
+#ifdef OLD_FASTA_GAP
+ -(gdelval - ggapval),
+#else
+ -gdelval,
+#endif
+ -ggapval,-gshift,
+ hoff,window,f_str);
+#endif
+}
+
+static void
+init_row(struct sx_s *row, int sp) { /* init_row - zero the C1..C3 and
+ * I1..I3 scores and the flag of the first sp cells of row.
+ */
+ int i;
+ for (i = 0; i < sp; i++) {
+ row[i].C1 = row[i].I1 = 0;
+ row[i].C2 = row[i].I2 = 0;
+ row[i].C3 = row[i].I3 = 0;
+ row[i].flag = 0;
+ }
+}
+
+int lx_band(const unsigned char *prot_seq, /* array with protein sequence numbers*/
+ int len_prot, /* length of prot. seq */
+ const unsigned char *dna_prot_seq, /* translated DNA sequence numbers*/
+ int len_dna_prot, /* length trans. seq. */
+ int **pam_matrix, /* scoring matrix */
+ int gopen, int gext, /* gap open, gap extend penalties */
+ int gshift, /* frame-shift penalty */
+ int start_diag, /* start diagonal of band */
+ int width, /* width for band alignment */
+ struct f_struct *f_str)
+{ /* lx_band - score-only local alignment of prot_seq against
+ * dna_prot_seq, restricted to a band of `width` cells per protein row
+ * starting at diagonal start_diag.
+ * A single row of width+7 sx_s cells is kept in f_str->cur (allocated
+ * here on first use and reallocated when the size changes). Each cell
+ * holds three score/insertion pairs (C1/I1, C2/I2, C3/I3), one per
+ * position of dna_prot_seq consumed in the inner loop.
+ * Substitution scores come from f_str->weight0[prot_seq[i]]; the
+ * pam_matrix and gshift arguments are not referenced in this body.
+ * Returns best + (gopen + gext): `best` is only updated after gg has
+ * been subtracted from a cell score, so gg is added back.
+ */
+ void *ckalloc();
+ int i, j, bd, bd1, x1, x2, sp, p1=0, p2=0, end_prot;
+ struct sx_s *last, *tmp;
+ int sc, del, best = 0, cd,ci, e1, e2, e3, cd1, cd2, cd3, f, gg;
+ const unsigned char *dp;
+ register struct sx_s *ap, *aq;
+ struct wgt *wt, *ww;
+ int aa, b, a,x,y,z;
+
+ sp = width+7; /* cells in one band row */
+ gg = gopen+gext; /* cost of opening a gap of length one */
+ /* sp = sp/3+1; */
+
+ if (f_str->cur == NULL ) {
+ f_str->cur_sp_size = sp;
+ f_str->cur = (struct sx_s *) ckalloc(sizeof(struct sx_s)*sp);
+ }
+ else if (f_str->cur_sp_size != sp) { /* band width changed since the last call */
+ free(f_str->cur);
+ f_str->cur = (struct sx_s *) ckalloc(sizeof(struct sx_s)*sp);
+ f_str->cur_sp_size = sp;
+ }
+
+ init_row(f_str->cur, sp);
+
+ /*
+ if (start_diag %3 !=0) start_diag = start_diag/3-1;
+ else start_diag = start_diag/3;
+ if (width % 3 != 0) width = width/3+1;
+ else width = width /3;
+ */
+
+ x1 = start_diag; /* x1 = lower bound of DNA */
+ x2 = 1; /* the amount of position shift from last row*/
+
+ end_prot = max(0,-width-start_diag) + (len_dna_prot+5)/3 + width;
+ end_prot = min(end_prot,len_prot); /* NOTE(review): end_prot is computed but the row loop below runs to len_prot - confirm intended */
+
+ /* i counts through protein sequence, x1 through DNAp */
+
+ for (i = max(0, -width-start_diag), x1+=i; i < len_prot; i++, x1++) {
+ bd = min(x1+width, (len_dna_prot+2)/3); /* upper bound of band */
+ bd1 = max(0,x1); /* lower bound of band */
+ wt = f_str->weight0[prot_seq[i]];
+ del = 1-x1; /*adjustment*/
+ bd += del;
+ bd1 +=del;
+
+ ap = &f_str->cur[bd1]; aq = ap+1; /* ap = cell being written, aq = its right neighbour */
+ e1 = f_str->cur[bd1-1].C3; e2 = ap->C1; cd1 = cd2= cd3= 0;
+ for (dp = &dna_prot_seq[(bd1-del)*3]; ap < &f_str->cur[bd]; ap++) {
+ ww = &wt[(unsigned char) *dp++]; /* first of the three positions in this cell: C1/I1 */
+ sc = max(max(e1+ww->iv, (e3=ap->C2)+ww->ii), e2+ww->iii);
+ if (cd1 > sc) sc = cd1;
+ cd1 -= gext;
+ if ((ci = aq->I1) > 0) {
+ if (sc < ci) { ap->C1 = ci; ap->I1 = ci-gext;}
+ else {
+ ap->C1 = sc;
+ sc -= gg;
+ if (sc > 0) {
+ if (sc > best) best =sc;
+ if (cd1 < sc) cd1 = sc;
+ ap->I1 = max(ci-gext, sc);
+ } else ap->I1 = ci-gext;
+ }
+ } else {
+ if (sc <= 0) {
+ ap->I1 = ap->C1 = 0;
+ } else {
+ ap->C1 = sc; sc-=gg;
+ if (sc >0) {
+ if (sc > best) best =sc;
+ if (cd1 < sc) cd1 = sc;
+ ap->I1 = sc;
+ } else ap->I1 = 0;
+ }
+ }
+ ww = &wt[(unsigned char) *dp++]; /* second position: C2/I2 */
+ sc = max(max(e2+ww->iv, (e1=ap->C3)+ww->ii), e3+ww->iii);
+ if (cd2 > sc) sc = cd2;
+ cd2 -= gext;
+ if ((ci = aq->I2) > 0) {
+ if (sc < ci) { ap->C2 = ci; ap->I2 = ci-gext;}
+ else {
+ ap->C2 = sc;
+ sc -= gg;
+ if (sc > 0) {
+ if (sc > best) best =sc;
+ if (cd2 < sc) cd2 = sc;
+ ap->I2 = max(ci-gext, sc);
+ } /* NOTE(review): unlike the C1/I1 case there is no "else ap->I2 = ci-gext" here, so I2 keeps its old value - confirm intended */
+ }
+ } else {
+ if (sc <= 0) {
+ ap->I2 = ap->C2 = 0;
+ } else {
+ ap->C2 = sc; sc-=gg;
+ if (sc >0) {
+ if (sc > best) best =sc;
+ if (cd2 < sc) cd2 = sc;
+ ap->I2 = sc;
+ } else ap->I2 = 0;
+ }
+ }
+ ww = &wt[(unsigned char)*dp++]; /* third position: C3/I3 */
+ sc = max(max(e3+ww->iv, (e2=aq->C1)+ww->ii), e1+ww->iii);
+ if (cd3 > sc) sc = cd3;
+ cd3 -= gext;
+ if ((ci = aq++->I3) > 0) {
+ if (sc < ci) { ap->C3 = ci; ap->I3 = ci-gext;}
+ else {
+ ap->C3 = sc;
+ sc -= gg;
+ if (sc > 0) {
+ if (sc > best) best =sc;
+ if (cd3 < sc) cd3 = sc;
+ ap->I3 = max(ci-gext, sc);
+ } /* NOTE(review): no "else ap->I3 = ci-gext" here either - confirm intended */
+ }
+ } else {
+ if (sc <= 0) {
+ ap->I3 = ap->C3 = 0;
+ } else {
+ ap->C3 = sc; sc-=gg;
+ if (sc >0) {
+ if (sc > best) best =sc;
+ if (cd3 < sc) cd3 = sc;
+ ap->I3 = sc;
+ } else ap->I3 = 0;
+ }
+ }
+ }
+ }
+ /* printf("The best score is %d\n", best); */
+ return best+gg;
+}
+
+/* ckalloc - allocate space; check for success */
+void *ckalloc(size_t amount)
+{ /* Returns malloc(amount). When malloc fails, w_abort() is called with
+ * an out-of-memory message; whether w_abort() returns is not visible
+ * here, so the result may still be NULL if it does.
+ */
+ void *p;
+
+ if ((p = (void *)malloc( (size_t)amount)) == NULL)
+ w_abort("Ran out of memory.","");
+ return(p);
+}
+
+/* calculate the 100% identical score */
+int
+shscore(unsigned char *aa0, int n0, int **pam2)
+{ /* shscore - sum pam2[r][r] over the n0 residues r of aa0, i.e. the
+ * score of aa0 aligned to itself. Returns 0 when n0 <= 0.
+ */
+ int i, sum;
+ for (i=0,sum=0; i<n0; i++)
+ sum += pam2[aa0[i]][aa0[i]];
+ return sum;
+}
+
+#define WIDTH 60
+
+typedef struct mat *match_ptr; /* pointer to a node of an alignment list */
+
+typedef struct mat { /* one node of a singly linked alignment list */
+ int i, j, l; /* pro_dna() copies l of each node into the alignment array */
+ match_ptr next; /* next node, NULL at the end of the list */
+} match_node;
+
+typedef struct { int i,j;} state; /* an (i, j) pair; local_align() uses it to record where an alignment starts */
+typedef state *state_ptr;
+
+
+void *ckalloc(); /* old-style declarations: no parameter lists are given */
+static match_ptr small_global(), global();
+static int local_align(), find_best();
+static void init_row2(), init_ROW();
+
+int
+pro_dna(const unsigned char *prot_seq, /* array with prot. seq. numbers*/
+ int len_prot, /* length of prot. seq */
+ const unsigned char *dna_prot_seq, /* trans. DNA seq. numbers*/
+ int len_dna_prot, /* length trans. seq. */
+ int **pam_matrix, /* scoring matrix */
+ int gopen, int gext, /* gap open, gap extend penalties */
+ int gshift, /* frame-shift penalty */
+ struct f_struct *f_str,
+ int max_res,
+ struct a_res_str *a_res) /* alignment info */
+{
+ /* pro_dna - align prot_seq against dna_prot_seq and record the result.
+ * local_align() finds the best local alignment and its end points,
+ * which are stored in a_res->min0/min1/max0/max1; global() then builds
+ * the alignment between those end points as a linked list, whose l
+ * fields are copied into a_res->res (at most max_res entries) while
+ * the list is freed.
+ * The three work rows f_str->up/down/tp are allocated here and freed
+ * before returning. gshift is not used in this body.
+ * Returns the local alignment score; a_res->nres is the number of list
+ * nodes produced.
+ */
+ match_ptr align, ap, aq;
+ int x, y, ex, ey, i, score;
+ int *alignment;
+
+ f_str->up = (st_ptr) ckalloc(sizeof(struct st_s)*(len_dna_prot+10));
+ f_str->down = (st_ptr) ckalloc(sizeof(struct st_s)*(len_dna_prot+10));
+ f_str->tp = (st_ptr) ckalloc(sizeof(struct st_s)*(len_dna_prot+10));
+
+ /*local alignment find the best local alignment x and y
+ is the starting position of the best local alignment
+ and ex ey is the ending position */
+
+ score= local_align(&x, &y, &ex, &ey,
+ pam_matrix, gopen, gext,
+ dna_prot_seq, len_dna_prot,
+ prot_seq, len_prot, f_str);
+
+ /* the global routines index a few cells before the row start, so the
+ * rows are advanced by 3 for them and stepped back when freed */
+ f_str->up += 3; f_str->down += 3; f_str->tp += 3;
+
+ /* x, y - start in prot, dna_prot */
+ a_res->min0 = x; /* prot */
+ a_res->min1 = y; /* DNA */
+ a_res->max0 = ex; /* prot */
+ a_res->max1 = ey; /* DNA */
+
+ align = global(x, y, ex, ey,
+ pam_matrix, gopen, gext,
+ dna_prot_seq, prot_seq,
+ 0, 0, f_str);
+
+ alignment = a_res->res;
+
+ for (ap = align, i= 0; ap; i++) {
+ if (i < max_res) alignment[i] = ap->l;
+ aq = ap->next; free(ap); ap = aq;
+ }
+ /* i == max_res means every node was stored; only i > max_res is a
+ * truncation */
+ if (i > max_res)
+ fprintf(stderr,"***alignment truncated: %d/%d***\n", max_res,i);
+
+ /* up = &up[-3]; down = &down[-3]; tp = &tp[-3]; */
+ free(&f_str->up[-3]); free(&f_str->tp[-3]); free(&f_str->down[-3]);
+
+ /* NOTE(review): nres is the full node count and can exceed max_res,
+ * the number of entries actually stored - confirm callers expect this */
+ a_res->nres = i;
+ return score;
+}
+
+static void
+swap(void **a, void **b)
+{ /* swap - exchange the two pointers stored at *a and *b. */
+ void *t = *a;
+ *a = *b; *b = t;
+}
+
+/*
+ local_align finds the best local alignment: (x, y) is the starting
+ position of the best local alignment and (ex, ey) is its ending
+ position.
+*/
+static int
+local_align(int *x, int *y, int *ex, int *ey,
+ int **wgts, int gop, int gext,
+ const unsigned char *dnap, int ld,
+ const unsigned char *pro, int lp,
+ struct f_struct *f_str)
+{
+ /* local_align - find the best-scoring local alignment of pro[0..lp)
+ * against dnap.
+ * Stores the start of that alignment in (*x, *y) and its end in
+ * (*ex, *ey), and returns its score. When no cell scores above zero
+ * the function returns 0 and all four outputs are 0.
+ * Substitution scores come from f_str->weight1; wgts is not used in
+ * this body. f_str->up and f_str->down serve as the two score rows.
+ */
+ int i, j, score, x1 = 0, x2 = 0, x3 = 0, x4 = 0, e1 = 0, e2 = 0, e3,
+ sc, del, e, best = 0, cd, ci, c;
+ struct wgt *wt, *ww;
+ state_ptr cur_st, last_st, cur_i_st;
+ st_ptr cur, last;
+ const unsigned char *dp;
+ int *cur_d_st, *st_up;
+
+ /*
+ Array rowiC stores the best scores of alignment ending at a position
+ Arrays rowiD and rowiI store the best scores of alignment ending
+ at a position with a deletion or insertion
+ Arrays sti stores the starting position of the best alignment whose
+ score stored in the corresponding row array.
+ The program stores two rows to complete the computation, same is
+ for the global alignment routine.
+ */
+
+
+ st_up = (int *) ckalloc(sizeof(int)*(ld+10));
+ init_row2(st_up, ld+5);
+
+ ld += 2;
+
+ init_ROW(f_str->up, ld+1);
+ init_ROW(f_str->down, ld+1);
+ cur = f_str->up+1;
+ last = f_str->down+1;
+
+ cur_st = (state_ptr) ckalloc(sizeof(state)*(ld+1));
+ last_st = (state_ptr) ckalloc(sizeof(state)*(ld+1));
+ cur_i_st = (state_ptr) ckalloc(sizeof(state)*(ld+1));
+ cur_d_st = st_up;
+ dp = dnap-2;
+ for (i = 0; i < lp; i++) {
+ wt = f_str->weight1[pro[i]]; e2 =0; e1 = last[0].C;
+ for (j = 0; j < 2; j++) {
+ cur_st[j].i = i+1;
+ cur_st[j].j = j+1;
+ }
+ for (j = 2; j < ld; j++) {
+ ww = &wt[(unsigned char) dp[j]];
+ /* del records which move produced sc: 2/3/4 = match step of that
+ * size, 0 = insertion, 5 = deletion, -1 = fresh start */
+ del = -1;
+ if (j >= 3) {
+ sc = 0;
+ e3 = e2; e2 = e1;
+ e1 = last[j-2].C;
+ if ((e=e2+ww->iii) > sc) {sc = e; del = 3;}
+ if ((e=e1+ww->ii) > sc) {sc = e; del = 2;}
+ if ((e = e3+ww->iv) > sc) {sc = e; del = 4;}
+ } else {
+ sc = e2 = 0;
+ if (ww->iii > 0) {sc = ww->iii; del = 3;}
+ }
+ if (sc < (ci=last[j].I)) {
+ sc = ci; del = 0;
+ }
+ if (sc < (cd=cur[j].D)) {
+ sc = cd; del = 5;
+ }
+ cur[j].C = sc;
+ e = sc - gop;
+ if (e > cd) {
+ cur[j+3].D = e-gext;
+ cur_d_st[j+3] = 3;
+ } else {
+ cur[j+3].D = cd-gext;
+ cur_d_st[j+3] = cur_d_st[j]+3;
+ }
+ /* carry the alignment start position along with the score */
+ switch(del) {
+ case 5:
+ c = cur_d_st[j];
+ cur_st[j].i = cur_st[j-c].i;
+ cur_st[j].j = cur_st[j-c].j;
+ break;
+ case 0:
+ cur_st[j].i = cur_i_st[j].i;
+ cur_st[j].j = cur_i_st[j].j;
+ break;
+ case 2:
+ case 3:
+ case 4:
+ if (i) {
+ if (j-del >= 0) {
+ cur_st[j].i = last_st[j-del].i;
+ cur_st[j].j = last_st[j-del].j;
+ } else {
+ cur_st[j].i = i;
+ cur_st[j].j = 0;
+ }
+ } else {
+ cur_st[j].i = 0;
+ cur_st[j].j = max(0, j-del+1);
+ }
+ break;
+ case -1:
+ cur_st[j].i = i+1;
+ cur_st[j].j = j+1;
+ break;
+ }
+ if (e > ci) {
+ cur[j].I = e -gext;
+ cur_i_st[j].i = cur_st[j].i;
+ cur_i_st[j].j = cur_st[j].j;
+ } else {
+ cur[j].I = ci- gext;
+ }
+ if (sc > best) {
+ x1 = cur_st[j].i;
+ x2 = cur_st[j].j;
+ best =sc;
+ x3 = i;
+ x4 = j;
+ }
+ }
+ swap((void *)&last, (void *)&cur);
+ swap((void *)&cur_st, (void *)&last_st);
+ }
+ /* printf("The best score is %d\n", best);*/
+ /* x1..x4 start at 0, so these are well defined even when best == 0 */
+ *x = x1; *y = x2; *ex = x3; *ey = x4;
+ free(cur_st); free(last_st); free(cur_i_st);
+ free(st_up);
+ return best;
+}
+
+/*
+ Both global_up and global_down compute linear-space, score-only global
+ alignments on the subsequences pro[x]...pro[ex] and dna[y]...dna[ey].
+ global_up runs the recurrence upwards, from row x towards row ex.
+ global_down runs the recurrence downwards, from row ex towards row x.
+*/
+
+static void
+global_up(st_ptr *row1, st_ptr *row2,
+ int x, int y, int ex, int ey,
+ int **wgts, int gop, int gext,
+ unsigned char *dnap, unsigned char *pro,
+ int N, struct f_struct *f_str)
+{ /* global_up - score-only pass over protein rows x..ex against DNA
+ * columns y..ey, using *row1 and *row2 as the two working rows.
+ * On return *row1 points at the last row computed (the rows are
+ * swapped if necessary). Substitution scores come from
+ * f_str->weight1; wgts is not used in this body. When N is non-zero
+ * the initial insertion score last[0].I is set to -gext.
+ * Writes cur[j+3].D, i.e. up to three cells past column ey-y+1.
+ */
+ int i, j, k, sc, e, e1, e2, e3, t, ci, cd, score;
+ struct wgt *wt, *ww;
+ st_ptr cur, last;
+
+ cur = *row1; last = *row2;
+ sc = -gop;
+ for (j = 0; j <= ey-y+1; j++) { /* boundary row: only every third column is reachable */
+ if (j % 3 == 0) {last[j].C = sc; sc -= gext; last[j].I = sc-gop;}
+ else { last[j].I = last[j].C = -10000;}
+ }
+ last[0].C = 0; cur[0].D = cur[1].D = cur[2].D = -10000;
+ last[0].D = last[1].D = last[2].D = -10000;
+ if (N) last[0].I = -gext;
+ for (i = 1; i <= ex-x+1; i++) {
+ wt = f_str->weight1[pro[i+x-1]]; e1 = -10000; e2 = last[0].C;
+ for (j = 0; j <= ey-y+1; j++) {
+ t = j+y;
+ sc = -10000;
+ ww = &wt[(unsigned char) dnap[t-3]];
+ if (j < 4) {
+ if (j == 3) {
+ sc = e2+ww->iii;
+ } else if (j == 2) {
+ sc = e2 + ww->ii;
+ }
+ } else {
+ e3 = e2; e2 = e1;
+ e1 = last[j-2].C;
+ sc = max(e2+ww->iii, max(e1+ww->ii, e3+ww->iv));
+ }
+ sc = max(sc, max(ci=last[j].I, cd = cur[j].D));
+ cur[j].C = sc;
+ cur[j+3].D = max(cd, sc-gop)-gext;
+ cur[j].I = max(ci, sc-gop)-gext;
+ }
+ swap((void *)&last, (void *)&cur);
+ }
+ /*printf("global up score =%d\n", last[ey-y+1].C);*/
+ for (i = 0; i <= ey-y+1; i++) last[i].I = cur[i].I;
+ if (*row1 != last) swap((void *)row1, (void *)row2); /* make *row1 the final row */
+}
+
+static void
+global_down(st_ptr *row1, st_ptr *row2,
+ int x, int y, int ex, int ey,
+ int **wgts, int gop, int gext,
+ unsigned char *dnap, unsigned char *pro,
+ int N, struct f_struct *f_str)
+{ /* global_down - score-only pass over protein rows ex down to x
+ * against DNA columns ey down to y, using *row1 and *row2 as the two
+ * working rows. On return *row1 points at the last row computed.
+ * Substitution scores come from f_str->weight1; wgts is not used in
+ * this body. When N is non-zero the initial insertion score
+ * last[ey-y+1].I is set to -gext.
+ * Writes cur[j-3].D, i.e. up to three cells before index 0 of the row.
+ */
+ int i, j, k, sc, del, *tmp, e, t, e1,e2,e3, ci,cd, score;
+ struct wgt *wt, *w1, *w2, *w3;
+ st_ptr cur, last;
+
+ cur = (*row1); last = *row2;
+ sc = -gop;
+ for (j = ey-y+1; j >= 0; j--) {
+ if ((ey-y+1-j) % 3) {last[j].C = sc; sc-=gext; last[j].I = sc-gop;} /* NOTE(review): true for a non-zero remainder, whereas global_up tests "% 3 == 0" - confirm intended */
+ else last[j].I = last[j].C = -10000;
+ cur[j].I = -10000;
+ }
+ last[ey-y+1].C = 0;
+ if (N) last[ey-y+1].I = -gext;
+ cur[ey-y+1].D = cur[ey-y].D = cur[ey-y-1].D = -10000;
+ last[ey-y+1].D = last[ey-y].D = last[ey-y-1].D = -10000;
+ for (i = ex-x; i >= 0; i--) {
+ wt = f_str->weight1[pro[i+x]]; e2 = last[ey-y+1].C;
+ e1 = -10000;
+ w3 = &wt[(unsigned char) dnap[ey]];
+ w2 = &wt[(unsigned char) dnap[ey-1]];
+ for (j = ey-y+1; j >= 0; j--) {
+ t = j+y;
+ w1 = &wt[(unsigned char) dnap[t-1]];
+ sc = -10000;
+ if (t+3 > ey) {
+ if (t+2 == ey) {
+ sc = e2+w2->iii;
+ } else if (t+1 == ey) {
+ sc = e2+w1->ii;
+ }
+ } else {
+ e3 = e2; e2 = e1;
+ e1 = last[j+2].C;
+ sc = max(e2+w2->iii, max(e1+w1->ii,e3+w3->iv)) ;
+ }
+ if (sc < (cd= cur[j].D)) {
+ sc = cd;
+ cur[j-3].D = cd-gext;
+ } else cur[j-3].D =max(cd, sc-gop)-gext;
+ if (sc < (ci= last[j].I)) {
+ sc = ci;
+ cur[j].I = ci - gext;
+ } else cur[j].I = max(sc-gop,ci)-gext;
+ cur[j].C = sc;
+ w3 = w2; w2 = w1; /* slide the three-position weight window one column down */
+ }
+ swap((void *)&last, (void *)&cur);
+ }
+ for (i = 0; i <= ey-y+1; i++) last[i].I = cur[i].I;
+ if (*row1 != last) swap((void *)row1, (void *)row2); /* make *row1 the final row */
+}
+
+/* init_row2 - set row[0] .. row[ld-1] to zero (nothing is written when ld <= 0) */
+static void
+init_row2(int *row, int ld) {
+  int k = 0;
+
+  while (k < ld) {
+    row[k] = 0;
+    k++;
+  }
+}
+
+/* init_ROW - zero the C, D and I fields of row[0] .. row[ld-1] */
+static void init_ROW(st_ptr row, int ld) {
+  int k;
+
+  for (k = ld - 1; k >= 0; k--) {
+    row[k].C = 0;
+    row[k].D = 0;
+    row[k].I = 0;
+  }
+}
+
+/*
+  combine - append list x2 to the end of list x1 and return the head of
+  the joined list.  When x1 is NULL, x2 is returned unchanged (st is
+  ignored in that case).
+
+  When st is non-zero, the nodes of x2 are walked from its head: j is
+  incremented on every node up to and including the first node whose l
+  is 3 or 4, and the l of that node is then decremented.
+*/
+static match_ptr
+combine(match_ptr x1, match_ptr x2, int st) {
+  match_ptr x;
+
+  if (x1 == NULL) return x2;
+
+  /* find the tail of x1 and hook x2 onto it */
+  for (x = x1; x->next; x = x->next);
+  x->next = x2;
+
+  if (st) {
+    for (x = x2; x; x = x->next) {
+      x->j++;
+      if (x->l == 3 || x->l == 4) break;
+    }
+    /* x is NULL when x2 is empty or has no node with l == 3 or 4;
+       there is no node to adjust then */
+    if (x) x->l--;
+  }
+  return x1;
+}
+
+/*
+ global() uses the two score-only, linear-space global alignment
+ subroutines (global_up and global_down) to recursively build the
+ alignment.
+*/
+
+match_ptr /* global(): build the alignment for rows x..ex (pro) against
+   columns y..ey (dnap) and return it as a linked list of match_node.
+   N1 is passed as the N flag of global_up(), N2 as the N flag of
+   global_down(); both are forwarded to small_global() for small regions. */
+global(int x, int y, int ex, int ey,
+ int **wgts, int gop, int gext,
+ unsigned char *dnap, unsigned char *pro, int N1, int N2,
+ struct f_struct *f_str)
+{
+ int m; /* loop counter in the base cases, then the middle row */
+ int m1, m2; /* crossing columns reported by find_best() */
+ match_ptr x1, x2, mm1, mm2;
+
+ /*printf("%d %d %d %d %d %d\n", x,y, ex, ey, N1, N2);*/
+ /*
+ if the space required is limited, we can do a quadratic space
+ algorithm to find the alignment.
+ */
+
+ if (ex <= x) { /* base case: at most one row left */
+ mm1 = NULL;
+ for (m = y+3; m <= ey; m+=3) { /* one l=5 node per step of 3 columns */
+ x1 = (match_ptr) ckalloc(sizeof(match_node));
+ x1->l = 5; x1->next = mm1; /* prepend; i and j of the node are not set here */
+ if (mm1== NULL) mm2 = x1; /* mm2 remembers the first node created, i.e. the list tail */
+ mm1 = x1;
+ }
+ if (ex == x) { /* exactly one row */
+ if ((ey-y) % 3 != 0) {
+ x1 = (match_ptr) ckalloc(sizeof(match_node));
+ x1->l = ((ey-y) % 3) +1; x1->next = NULL; /* l is 2 or 3 for a positive remainder */
+ if (mm1) mm2->next = x1; else mm1 = x1; /* append at the tail, or become the whole list */
+ } else mm2->l = 4; /* NOTE(review): mm2 is assigned only inside the loop above; when
+   that loop does not run (ey < y+3) and (ey-y) % 3 == 0 this dereferences
+   an uninitialized pointer -- confirm callers cannot reach this */
+ }
+ return mm1;
+ }
+ if (ey <= y) { /* base case: emit one l=0 node for each row x..ex */
+ mm1 = NULL;
+ for (m = x; m <= ex; m++) {
+ x1 = (match_ptr) ckalloc(sizeof(match_node));
+ x1->l = 0; x1->next = mm1; mm1 = x1;
+ }
+ return mm1;
+ }
+ if (ex -x < SGW1 && ey-y < SGW2) /* SGW1/SGW2 are defined elsewhere: size limits for small_global() */
+ return small_global(x,y,ex,ey,wgts, gop, gext, dnap, pro, N1, N2,f_str);
+ m = (x+ex)/2;
+ /*
+ Do the score only global alignment from row x to row m, m is
+ the middle row of x and ex. Store the information of row m in
+ upC, upD, and upI.
+ */
+ global_up(&f_str->up, &f_str->tp, x, y, m, ey,
+ wgts, gop, gext,
+ dnap, pro, N1, f_str);
+ /*
+ Do the score only global alignment downwards from row ex
+ to row m+1, store information of row m+1 in downC downI and downD
+ */
+ global_down(&f_str->down, &f_str->tp, m+1, y, ex, ey,
+ wgts, gop, gext,
+ dnap, pro, N2, f_str);
+
+ /*
+ Use this information for row m and m+1 to find the crossing
+ point of the best alignment with the middle row. The crossing
+ point is given by m1 and m2. Then we recursively call global
+ itself to compute alignments in two smaller regions found by
+ the crossing point and combine the two alignments to form a
+ whole alignment. Return that alignment.
+ */
+ if (find_best(f_str->up, f_str->down, &m1, &m2, ey-y+1, y, gop)) { /* non-zero: best crossing came from the C scores */
+ x1 = global(x, y, m, m1, wgts, gop, gext, dnap, pro, N1, 0, f_str);
+ x2 = global(m+1, m2, ex, ey, wgts, gop, gext, dnap, pro, 0, N2, f_str);
+ if (m1 == m2) x1 = combine(x1,x2,1); /* NOTE(review): find_best() as written sets m2 = m1+1, so this
+   branch looks unreachable -- TODO confirm */
+ else x1 = combine(x1, x2,0);
+ } else { /* zero: best crossing came from the I scores */
+ x1 = global(x, y, m-1, m1, wgts, gop, gext, dnap, pro, N1, 1, f_str);
+ x2 = global(m+2, m2, ex, ey, wgts, gop, gext, dnap, pro, 1, N2, f_str);
+ mm1 = (match_ptr) ckalloc(sizeof(match_node)); /* rows m and m+1 become two l=0 nodes, both at column m1 */
+ mm1->i = m; mm1->l = 0; mm1->j = m1;
+ mm2 = (match_ptr) ckalloc(sizeof(match_node));
+ mm2->i = m+1; mm2->l = 0; mm2->j = m1;
+ mm1->next = mm2; mm2->next = x2; /* splice: x1 -> mm1 -> mm2 -> x2 */
+ x1 = combine(x1, mm1, 0);
+ }
+ return x1;
+}
+
+/*
+  find_best - choose the column where the best alignment crosses between
+  the row scored by global_up() (up) and the row scored by global_down()
+  (down).
+
+  Columns 1 .. ld-1 are scanned.  Two candidates are considered per
+  column i:
+      up[i].C + down[i].C          -> return value 1
+      up[i].I + down[i].I + gop    -> return value 0
+  and the best-scoring one (first found on ties) wins.
+
+  On return *m1 = j-1+y and *m2 = j+y, where j is the winning column.
+  If no candidate scores above the starting floor of -1000, j stays 0
+  and 1 is returned.
+*/
+static int
+find_best(st_ptr up, st_ptr down, int *m1, int *m2, int ld, int y, int gop) {
+
+  int i, best = -1000, j = 0, s2, s4;
+  int st = 1;   /* initialized so the result is defined when nothing beats the floor */
+
+  for (i = 1; i < ld; i++) {
+    s2 = up[i].C + down[i].C;
+    s4 = up[i].I + down[i].I + gop;
+    if (best < s2) {
+      best = s2; j = i; st = 1;
+    }
+    if (best < s4) {
+      best = s4; j = i; st = 0;
+    }
+  }
+  *m1 = j-1+y;
+  *m2 = j+y;
+  /*printf("score=%d\n", best);*/
+  return st;
+}
+
+/*
+ An alignment is represented as a linked list whose elements
+ are of type match_node. Each element represents an edge in the
+ path of the alignment graph. The fields of match_node are
+ l --- gives the type of the edge.
+ i, j --- give the end position.
+*/
+
+static match_ptr /* small_global(): full-matrix alignment with traceback for a
+   small region, rows x..ex (pro) against columns y..ey (dnap).  Scores go
+   into f_str->smgl_s.C[i][j], the move taken into f_str->smgl_s.st[i][j];
+   the path is then walked back from the bottom-right corner and returned
+   as a linked list of match_node.  N1 selects the starting I score, N2
+   may force the final move to 0.  wgts is not referenced here. */
+small_global(int x, int y, int ex, int ey,
+ int **wgts, int gop, int gext,
+ unsigned char *dnap, unsigned char *pro,
+ int N1, int N2, struct f_struct *f_str) {
+
+ /* int C[SGW1+1][SGW2+1], st[SGW1+1][SGW2+1], D[SGW2+7], I[SGW2+1]; */
+ int i, j, e, sc, score, del, k, t, ci, cd; /* score is not used below */
+ int *cI, *cD, *cC, *lC, *cst, e2, e3, e4; /* e3 is not used below */
+ match_ptr mp, first;
+ struct wgt *wt, *ww;
+
+ /*printf("small_global %d %d %d %d\n", x, y, ex, ey);*/
+ sc = -gop-gext; f_str->smgl_s.C[0][0] = 0;
+
+ cI = f_str->smgl_s.I;
+ if (N1) cI[0] = -gext; else cI[0] = sc;
+
+ for (j = 1; j <= ey-y+1; j++) { /* row 0: only every third column is given a gap score */
+ if (j % 3== 0) {
+ f_str->smgl_s.C[0][j] = sc;
+ sc -= gext;
+ cI[j] = sc-gop;
+ }
+ else {
+ cI[j] = f_str->smgl_s.C[0][j] = -10000; /* -10000 is used throughout as a very low score */
+ }
+ f_str->smgl_s.st[0][j] = 5; /* row 0 always traces back with move 5 */
+ }
+
+ lC = &f_str->smgl_s.C[0][0]; /* lC = previous row of C, cC = current row */
+ cD = f_str->smgl_s.D; cD[0] = cD[1] = cD[2] = -10000;
+ for (i = 1; i <= ex-x+1; i++) {
+ cC = &f_str->smgl_s.C[i][0];
+ wt = f_str->weight1[pro[i+x-1]]; cst = &f_str->smgl_s.st[i][0]; /* wt: score table for this residue */
+ for (j = 0; j <=ey-y+1; j++) {
+ ci = cI[j];
+ cd= cD[j];
+ t = j+y;
+ ww = &wt[(unsigned char) dnap[t-3]]; /* evaluated for every j, so dnap[y-3] is read when j == 0 --
+   assumes dnap has readable slack before y, TODO confirm */
+ if (j >= 4) { /* pick the best of the iii, ii and iv transitions; del records which */
+ sc = lC[j-3]+ww->iii; e2 = lC[j-2]+ww->ii;
+ e4 = lC[j-4]+ww->iv; del = 3;
+ if (e2 > sc) { sc = e2; del = 2;}
+ if (e4 >= sc) { sc = e4; del = 4;}
+ } else { /* first columns: only lC[0] can be extended */
+ if (j == 3) {
+ sc = lC[0]+ww->iii; del =3;
+ } else if (j == 2) {
+ sc = lC[0]+ww->ii; del = 2;
+ } else {sc = -10000; del = 0;}
+ }
+ if (sc < ci) { /* I state is better: move code 0 */
+ sc = ci; del = 0;
+ }
+ if (sc <= cd) { /* D state is at least as good: move code 5 */
+ sc = cd;
+ del = 5;
+ }
+ cC[j] = sc;
+ sc -= gop;
+ if (sc <= cd) { /* D is extended rather than opened: flag with +10 */
+ del += 10;
+ cD[j+3] = cd - gext;
+ } else cD[j+3] = sc -gext;
+ if (sc < ci) { /* I is extended rather than opened: flag with +20 */
+ del += 20;
+ cI[j] = ci-gext;
+ } else cI[j] = sc-gext;
+ *(cst++) = del; /* stored code = move (0,2,3,4,5) + 10 and/or 20 flags */
+ }
+ lC = cC;
+ }
+ /*printf("small global score =%d\n", f_str->smgl_s.C[ex-x+1][ey-y+1]);*/
+ if (N2 && cC[ey-y+1] < ci+gop) f_str->smgl_s.st[ex-x+1][ey-y+1] =0; /* ci still holds the I value read for the last cell */
+ first = NULL; e = 1;
+ for (i = ex+1, j = ey+1; i > x || j > y; i--) { /* traceback from the bottom-right corner */
+ mp = (match_ptr) ckalloc(sizeof(match_node));
+ mp->i = i-1;
+ k = (t=f_str->smgl_s.st[i-x][j-y])%10; /* k = move stored for this cell */
+ mp->j = j-1;
+ if (e == 5 && (t/10)%2 == 1) k = 5; /* previous move was 5 and the +10 flag is set: stay on move 5 */
+ if (e == 0 && (t/20)== 1) k = 0; /* previous move was 0 and the code is in the 20..39 range: stay on move 0 */
+ if (k == 5) { j -= 3; i++; e=5;} /* i++ cancels the loop's i--, so only j moves */
+ else {j -= k;if (k==0) e= 0; else e = 1;}
+ mp->l = k;
+ mp->next = first; /* prepend, so the list ends up in forward order */
+ first = mp;
+ }
+
+ /* for (i = 0; i <= ex-x; i++) {
+ for (j = 0; j <= ey-y; j++)
+ printf("%d ", C[i][j]);
+ printf("\n");
+ }
+ */
+ return first;
+}
+
+#define XTERNAL
+#include "upam.h"
+
+/*
+  display_alig - print an alignment to stdout, WIDTH columns per chunk.
+
+  a[0], a[1]       start positions in pro and dna
+  a[2..length-1]   edit operations: 0, 2, 3, 4 or 5
+  dna, pro         the two sequences; dna values index f_str->weight_c
+  ld               not used
+
+  Three text rows are built: line1 (letters taken from f_str->weight_c,
+  plus the frame-shift marks '\' and '/'), line3 (the .c3 letter when it
+  differs from line1, and for frame shifts) and line2 (pro residues via
+  NCBIstdaa).  They are printed in the order line1, line3, line2.
+
+  The row buffers hold 100 characters, so this assumes WIDTH is well
+  below that -- WIDTH is defined elsewhere.
+*/
+void
+display_alig(int *a, unsigned char *dna, unsigned char *pro,
+             int length, int ld, struct f_struct *f_str)
+{
+  int len = 0, i, j, x, y, k, iaa;
+  static char line1[100], line2[100], line3[100];
+  char c1, c2, c3;
+
+  line1[0] = line2[0] = line3[0] = '\0'; x= a[0]; y = a[1]-3;
+
+  printf("\n%5d\n%5d", y+3, x);
+  for (len = 0, j = 2; j < length; j++) {
+    i = a[j];
+    line3[len] = ' ';
+    switch (i) {
+    case 3:
+      y += 3;
+      line2[len] = NCBIstdaa[iaa=pro[x++]];
+      line1[len] = f_str->weight_c[iaa][(unsigned char) dna[y]].c5;
+      if (line1[len] != f_str->weight_c[iaa][(unsigned char) dna[y]].c3)
+        line3[len] = f_str->weight_c[iaa][(unsigned char) dna[y]].c3;
+      break;
+    case 2:
+      y += 2;
+      line1[len] = '\\';
+      line2[len++] = ' ';
+      line2[len] = NCBIstdaa[iaa=pro[x++]];
+      line1[len] = f_str->weight_c[iaa][(unsigned char) dna[y]].c2;
+      line3[len] = f_str->weight_c[iaa][(unsigned char) dna[y]].c3;
+      break;
+    case 4:
+      y += 4;
+      line1[len] = '/';
+      line2[len++] = ' ';
+      line2[len] = NCBIstdaa[iaa=pro[x++]];
+      line1[len] = f_str->weight_c[iaa][(unsigned char) dna[y]].c4;
+      line3[len] = f_str->weight_c[iaa][(unsigned char) dna[y]].c3;
+      break;
+    case 5:
+      y += 3;
+      line1[len] = f_str->weight_c[0][(unsigned char) dna[y]].c3;
+      line2[len] = '-';
+      break;
+    case 0:
+      line1[len] = '-';
+      line2[len] = NCBIstdaa[pro[x++]];
+      break;
+    }
+    len++;
+    line1[len] = line2[len] = line3[len] = '\0';
+    if (len >= WIDTH) {
+      for (k = 10; k <= WIDTH; k+=10)
+        printf(" . :");
+      if (k-5 < WIDTH) printf(" .");
+      /* cut the rows at WIDTH for printing; len can be WIDTH+1 after a
+         frame shift, so the character at WIDTH must be saved */
+      c1 = line1[WIDTH]; c2 = line2[WIDTH]; c3 = line3[WIDTH];
+      line1[WIDTH] = line2[WIDTH] = line3[WIDTH] = '\0';
+      printf("\n %s\n %s\n %s\n", line1, line3, line2);
+      /* put all three saved characters back, line3 included */
+      line1[WIDTH] = c1; line2[WIDTH] = c2; line3[WIDTH] = c3;
+      /* move the unprinted tail and its '\0' to the front of each row;
+         memmove because source and destination overlap */
+      memmove(line1, &line1[WIDTH], (size_t)(len - WIDTH + 1));
+      memmove(line2, &line2[WIDTH], (size_t)(len - WIDTH + 1));
+      memmove(line3, &line3[WIDTH], (size_t)(len - WIDTH + 1));
+      len = len - WIDTH;
+      printf("\n%5d\n%5d", y+3, x);
+    }
+  }
+  /* ruler and rows for the final, partial chunk */
+  for (k = 10; k < len; k+=10)
+    printf(" . :");
+  if (k-5 < len) printf(" .");
+  printf("\n %s\n %s\n %s\n", line1, line3, line2);
+}
+
+
+/* The alignment array stores the operations that align the protein and DNA
+ sequences. The codes in the array are as follows:
+ 0: deletion of an amino acid.
+ 2: frame shift, 2 nucleotides match with an amino acid
+ 3: match an amino acid with a codon
+ 4: frame shift, 4 nucleotides match with an amino acid
+ 5: deletion of a codon
+
+
+ Also, the first two elements of the array store the starting points
+ in the protein and DNA sequences in the local alignment.
+
+ Display looks like where WIDTH is assumed to be divisible by 10.
+
+ 0 . : . : . : . : . : . :
+ AACE/N\PLK\G\HK\Y/LWA\S\C\E/P\PRIRZ/G\HK\Y/LWA\S\C\E/P\PRIRZ
+ I S G S V F N R Q L A G S V F N R Q L A
+ AACE P P-- G HK Y TWA A C E P P---- G HK Y TWA A C E P P----
+
+ 60 . : . : . : . : . : . :
+ /G\HK\Y/LWA\S\C\E/P\PRIRZ/G\HK\Y/LWA\S\C\E/P\PRIRZ/G\HK\Y/LW
+ G S V F N R Q L A G S V F N R Q L A G S V F
+ G HK Y TWA A C E P P---- G HK Y TWA A C E P P---- G HK Y TW
+
+For frame shifts, the middle row shows the letter in the original sequence,
+and the letter in the top row is the amino acid that is chosen by the
+alignment (translated codon chosen from 4 nucleotides, or 2+1).
+*/
+
+/* fatal - print msg followed by a newline on stderr, then exit(1) */
+void
+fatal(char *msg)
+{
+  fprintf(stderr, "%s\n", msg);
+  exit(1);
+}
+
+/* 10-Feb-2010 - fz_walign modified to ensure that the final alignment
+ overlaps the initial lz_band() region. In earlier versions, the
+ final alignment (using pam2p[0]) might have been outside the band
+ region */
+
+/*
+  fz_walign - produce one alignment and store it in a_res.
+
+  1. do_fastz() is run to obtain a_res->rst and the band offset hoff
+     (for TFAST the DNA in aa1 is first translated into f_str->aa1x and
+     the codon series f_str->aa1v is computed).
+  2. If rst.score[ppst->score_ix] < score_thresh, a_res->sw_score is set
+     to 0, a_res->n1 to n1, and the function returns.
+  3. A window l_min..l_max of aa1 around the band is chosen; when it is a
+     proper sub-range it is copied into a temporary buffer (earlier
+     versions instead wrote '\0' terminators into aa1 itself).
+  4. pro_dna() computes the alignment and its score (a_res->sw_score).
+  5. The coordinates in a_res are shifted back by the window offset.
+
+  max_res is not used; f_str->max_res is what is passed to pro_dna().
+  Exits with status 1 if the temporary buffer cannot be allocated.
+*/
+void
+fz_walign (const unsigned char *aa0, int n0,
+           const unsigned char *aa1, int n1,
+           int frame, int max_res,
+           struct pstruct *ppst,
+           struct f_struct *f_str,
+           struct a_res_str *a_res,
+           int score_thresh)
+{
+  int hoff;
+  int l_min, l_max, window;
+  unsigned char *local_aa1;
+#ifdef TFAST
+  int last_n1, n10;
+  int itx;
+  unsigned char *aa1x;
+#endif
+  struct score_count_s s_info = {0,0,0,0};
+
+#ifndef TFAST
+  do_fastz (f_str->aa0x, n0, aa1, n1, f_str->aa0v, ppst, f_str, &a_res->rst, &hoff, 1, &s_info);
+#else
+  /* make translated sequence */
+  last_n1 = 0;
+  aa1x = f_str->aa1x;
+  for (itx= frame*3; itx< frame*3+3; itx++) {
+    n10 = saatran(aa1,&aa1x[last_n1],n1,itx);
+    last_n1 += n10+1;
+  }
+  n10 = last_n1-1;
+
+  /* do_fastz (lz_band) also needs a pre-computed number series */
+  if (frame == 0) {
+    pre_com(aa1, n1, f_str->aa1v);
+  }
+  else {
+    pre_com_r(aa1, n1, f_str->aa1v);
+  }
+
+  do_fastz (aa0, n0, f_str->aa1x, n10, f_str->aa1v, ppst, f_str, &a_res->rst, &hoff, 1, &s_info);
+#endif
+
+  if (a_res->rst.score[ppst->score_ix] < score_thresh) {
+    a_res->sw_score = 0;
+    a_res->n1 = n1;
+    return;
+  }
+
+  /* choose the window of aa1 to align */
+#ifndef TFAST
+  window = min(n1, ppst->param_u.fa.optwid);
+  l_min = max(0, -window - hoff);
+  l_max = min(n1, n0-hoff+window);
+#else
+  window = min(n0, ppst->param_u.fa.optwid);
+  if (frame==0) {
+    l_min = max(0,(hoff-window)*3);
+    l_max = min((hoff+window+n0)*3,n1);
+  }
+  else {
+    /* things are more complicated here because the mapping of hoff is
+       with respect to the reversed aa1 */
+    l_max = n1 - max(0,(hoff-window)*3);
+    l_min = n1 - min((hoff+window+n0)*3,n1);
+  }
+#endif
+
+  /* copy the window when it is a proper sub-range of aa1; the copy has
+     one zero byte in front of it and at least one after it */
+  local_aa1 = (unsigned char *)aa1;
+  if (l_min > 0 || l_max < n1 -1 ) {
+    local_aa1 = (unsigned char *)calloc(l_max - l_min + 2,sizeof(unsigned char));
+    if (local_aa1 == NULL) {
+      fprintf(stderr," *** cannot allocate local sequence copy %d\n",l_max - l_min + 2);
+      exit(1);
+    }
+    local_aa1++;
+    memcpy(local_aa1,aa1+l_min, l_max - l_min);
+  }
+
+#ifdef TFAST
+  /* re-do precomputed codon number series for limited region */
+  if (frame==0) {
+    pre_com(local_aa1, l_max - l_min, f_str->aa1v);
+  }
+  else {
+    pre_com_r(local_aa1, l_max - l_min, f_str->aa1v);
+  }
+#endif
+
+  /* NOTE(review): without TFAST the copy made above is not what is
+     passed to pro_dna(); aa1+l_min is used directly */
+  a_res->sw_score =
+    pro_dna(
+#ifndef TFAST
+            aa1+l_min, l_max - l_min,
+            f_str->aa0v, n0-2,
+#else
+            aa0, n0,
+            f_str->aa1v, l_max - l_min-2,
+#endif
+            ppst->pam2[0],
+            -ppst->gdelval, -ppst->ggapval, -ppst->gshift,
+            f_str, f_str->max_res, a_res);
+
+  /* same condition as for the allocation; step back to the calloc'ed address */
+  if (l_min > 0 || l_max < n1 - 1) { free(--local_aa1); }
+
+  /* shift the reported coordinates from window to full-sequence positions */
+#ifndef TFAST
+  a_res->min0 += l_min;
+  a_res->max0 += l_min;
+#else
+  if (frame==1) {
+    a_res->min1 += n1 - l_max;
+    a_res->max1 += n1 - l_max;
+  }
+  else {
+    a_res->min1 += l_min;
+    a_res->max1 += l_min;
+  }
+#endif
+
+  /* display_alig(f_str->res,f_str->aa0v+2,aa1,*nres,n0-2,f_str); */
+}
+
+/*
+ fz_malign is a recursive interface to fz_walign() that is called
+ from do_walign(). fz_malign() first does an alignment, then checks
+ to see if the score is greater than the threshold. If so, it tries
+ doing a left and right alignment.
+
+ In this implementation, the DNA sequence is preserved as DNA for
+ TFAST, so that it can be sub-setted and translated correctly. Thus,
+ the translation required for f_str->aa1x and f_str->aa1v is done at
+ each recursive level (in fz_walign).
+ */
+
+/*
+  fz_malign - align aa0 against aa1 with fz_walign(), then, when
+  ppst->do_rep is set and the score reaches score_thresh, recursively
+  align the parts of aa1 to the left and to the right of the alignment
+  just found.
+
+  cur_ares     caller-allocated result node; its res array (max_res
+               ints) is allocated here
+  first_align  non-zero on the outermost call: fz_walign() is then given
+               a threshold of 1 instead of score_thresh
+
+  Returns the head of the result chain: cur_ares alone, or cur_ares
+  merged with the left/right chains by merge_ares_chains().  Sub-results
+  that do not score above score_thresh are freed.
+  Exits with status 1 if memory cannot be allocated.
+*/
+struct a_res_str *
+fz_malign (const unsigned char *aa0, int n0,
+           const unsigned char *aa1, int n1,
+           int frame,
+           int score_thresh, int max_res,
+           struct pstruct *ppst,
+           struct f_struct *f_str,
+           struct a_res_str *cur_ares,
+           int first_align)
+{
+  struct a_res_str *tmpl_ares, *tmpr_ares, *this_ares;
+  int sq_start, sq_end;
+  int score_ix;
+  int min_alen;
+  unsigned char *local_aa1;
+  int max_sub_score = -1;
+
+  score_ix = ppst->score_ix;
+
+#ifdef TFAST
+  min_alen = min(n0,MIN_LOCAL_LEN)*3;   /* n0 in aa, min_alen in nt */
+#else
+  min_alen = min(n0/3,MIN_LOCAL_LEN);   /* no in nt, min_alen in aa */
+#endif
+
+  /* now we need alignment storage - get it */
+  if ((cur_ares->res = (int *)calloc((size_t)max_res,sizeof(int)))==NULL) {
+    fprintf(stderr," *** cannot allocate alignment results array %d\n",max_res);
+    exit(1);
+  }
+
+  cur_ares->next = NULL;
+
+  fz_walign(aa0, n0, aa1, n1, frame, max_res, ppst, f_str, cur_ares, (first_align ? 1 : score_thresh));
+
+  /* in cur_ares, min0,max0 are always protein, min1,max1 are always
+     DNA, but n0 could be protein or DNA, depending on
+     FASTY/TFASTY */
+
+  if (!ppst->do_rep || cur_ares->rst.score[ppst->score_ix] < score_thresh) {
+    return cur_ares;
+  }
+
+  /* have a score >= threshold - try left and right */
+
+  /* in code below, cur_ares->min0/max0 always refers to aa
+     cur_ares->min1/max1 always refers to nt
+
+     however, things are more complex because if frame==1, then
+     offsets are from the end (n1), not the beginning.  There is no
+     frame==1 for fasty, only for TFASTY
+  */
+  cur_ares->v_start = sq_start = 0;
+#ifdef TFAST
+  if (frame == 0) {sq_end = cur_ares->min1-1;}  /* aa1[sq_start --> sq_end] */
+  else {sq_end = n1 - cur_ares->max1;}
+#else
+  sq_end = cur_ares->min0;
+#endif
+  cur_ares->v_len = sq_end - sq_start;
+
+  if (cur_ares->v_len >= min_alen) {    /* try the left */
+    /* allocate a_res */
+    tmpl_ares = (struct a_res_str *)calloc(1, sizeof(struct a_res_str));
+    if (tmpl_ares == NULL) {
+      fprintf(stderr," *** cannot allocate left a_res_str\n");
+      exit(1);
+    }
+    /* private copy of the left part, with a zero byte on either side */
+    local_aa1 = (unsigned char *)calloc(cur_ares->v_len+2,sizeof(unsigned char));
+    if (local_aa1 == NULL) {
+      fprintf(stderr," *** cannot allocate left sequence copy %d\n",cur_ares->v_len+2);
+      exit(1);
+    }
+    local_aa1++;
+    memcpy(local_aa1, aa1, cur_ares->v_len);
+
+    tmpl_ares = fz_malign(aa0, n0, local_aa1, cur_ares->v_len,
+                          frame, score_thresh, max_res,
+                          ppst, f_str, tmpl_ares,0);
+
+    free(--local_aa1);
+
+    if (tmpl_ares->rst.score[ppst->score_ix] > score_thresh) {
+      max_sub_score = tmpl_ares->rst.score[ppst->score_ix];
+#ifdef TFAST
+      if (frame == 1) {
+        for (this_ares = tmpl_ares; this_ares; this_ares = this_ares->next) {
+          this_ares->v_start += n1 - sq_end;
+          this_ares->min1 += n1 - sq_end;
+          this_ares->max1 += n1 - sq_end;
+        }
+      }
+#endif
+    }
+    else {
+      if (tmpl_ares->res) free(tmpl_ares->res);
+      free(tmpl_ares);
+      tmpl_ares=NULL;
+    }
+  }
+  else {tmpl_ares = NULL;}
+
+  /* now the right end */
+  /* for fasty -- max positions refer to the aa,codon, not the next
+     residue, so they must be incremented */
+
+  sq_end = n1;
+#ifdef TFAST
+  if (frame == 0) {sq_start = cur_ares->max1+1;}
+  else {sq_start = n1 - cur_ares->min1;}
+#else
+  sq_start = cur_ares->max0+1;
+#endif
+  cur_ares->v_len = sq_end - sq_start;
+
+  if (cur_ares->v_len >= min_alen) {    /* try the right */
+    /* allocate a_res */
+    tmpr_ares = (struct a_res_str *)calloc(1, sizeof(struct a_res_str));
+    if (tmpr_ares == NULL) {
+      fprintf(stderr," *** cannot allocate right a_res_str\n");
+      exit(1);
+    }
+
+    /* private copy of the right part, with a zero byte on either side */
+    local_aa1 = (unsigned char *)calloc(cur_ares->v_len+2,sizeof(unsigned char));
+    if (local_aa1 == NULL) {
+      fprintf(stderr," *** cannot allocate right sequence copy %d\n",cur_ares->v_len+2);
+      exit(1);
+    }
+    local_aa1++;
+    memcpy(local_aa1,aa1+sq_start,cur_ares->v_len);
+
+    tmpr_ares = fz_malign(aa0, n0,
+                          local_aa1, cur_ares->v_len,
+                          frame,
+                          score_thresh, max_res,
+                          ppst, f_str, tmpr_ares,0);
+    free(--local_aa1);
+
+    if (tmpr_ares->rst.score[ppst->score_ix] > score_thresh) {
+      /* adjust the left boundary */
+      for (this_ares = tmpr_ares; this_ares; this_ares = this_ares->next) {
+#ifndef TFAST
+        this_ares->min0 += sq_start;
+        this_ares->max0 += sq_start;
+#else
+        if (frame == 0) {
+          this_ares->v_start += sq_start;
+          this_ares->min1 += sq_start;
+          this_ares->max1 += sq_start;
+        }
+#endif
+      }
+      if (tmpr_ares->rst.score[ppst->score_ix] > max_sub_score) {
+        max_sub_score = tmpr_ares->rst.score[ppst->score_ix];
+      }
+    }
+    else {
+      if (tmpr_ares->res) free(tmpr_ares->res);
+      free(tmpr_ares);
+      tmpr_ares=NULL;
+    }
+  }
+  else {tmpr_ares = NULL;}
+
+  /* nothing kept on either side */
+  if (max_sub_score < score_thresh) return cur_ares;
+
+  cur_ares = merge_ares_chains(cur_ares, tmpl_ares, score_ix, "left");
+  cur_ares = merge_ares_chains(cur_ares, tmpr_ares, score_ix, "right");
+
+  return cur_ares;
+}
+
+/* do_walign() can be called with aa0,n0 as nt (FASTY) or
+ aa0,n0 as aa (TFASTY). if aa0 is nt, then f_str->aa0x,y have the
+ translations already. if aa0 is aa, then f_str->aa1x,y must be
+ generated.
+*/
+
+/*
+  do_walign - entry point that produces the alignment chain for one
+  query/library pair.
+
+  ppst->param_u.fa.use_E_thresholds, optflag and optwid are saved,
+  changed for the duration of the fz_malign() call (use_E_thresholds = 0,
+  optflag = 1, optwid doubled unless optwid_set) and restored afterwards.
+  NOTE(review): optcut is neither changed nor restored here.
+
+  *have_ares is set to 0x3.  f_str->frame is set to frame.
+  Each node of the returned chain gets its position in the chain as
+  ->index.  Returns NULL if the first result node cannot be allocated.
+*/
+struct a_res_str *
+do_walign (const unsigned char *aa0, int n0,
+           const unsigned char *aa1, int n1,
+           int frame, int repeat_thresh,
+           struct pstruct *ppst,
+           struct f_struct *f_str,
+           int *have_ares)
+{
+  struct a_res_str *a_res, *tmp_a_res;
+  int a_res_index;
+  int use_E_thresholds_s, optflag_s, optwid_s;
+#ifdef DEBUG
+  unsigned long adler32_crc;
+#endif
+
+  *have_ares = 0x3;     /* set 0x2 bit to indicate local copy */
+
+  if ((a_res = (struct a_res_str *)calloc(1, sizeof(struct a_res_str)))==NULL) {
+    fprintf(stderr," [do_walign] Cannot allocate a_res\n");
+    return NULL;
+  }
+
+#ifdef DEBUG
+  /* checksum of aa1, compared again after the alignment below */
+  adler32_crc = adler32(1L,aa1,n1);
+#endif
+
+  f_str->frame = frame; /* need frame for later pre_cons() in calcons() */
+
+  /* save the search parameters that are overridden for the alignment */
+  use_E_thresholds_s = ppst->param_u.fa.use_E_thresholds;
+  optflag_s = ppst->param_u.fa.optflag;
+  optwid_s = ppst->param_u.fa.optwid;
+  ppst->param_u.fa.use_E_thresholds = 0;
+  ppst->param_u.fa.optflag = 1;
+  if (!ppst->param_u.fa.optwid_set) {
+    ppst->param_u.fa.optwid *= 2;
+  }
+
+  a_res = fz_malign(aa0, n0, aa1, n1, frame,
+                    repeat_thresh, f_str->max_res,
+                    ppst, f_str, a_res, 1);
+
+#ifdef DEBUG
+  if (adler32(1L,aa1,n1) != adler32_crc) {
+    fprintf(stderr,"*** error [%s:%d] adler32_crc mismatch n1: %d\n",__FILE__, __LINE__, n1);
+  }
+#endif
+
+  /* number the results in chain order */
+  a_res_index = 0;
+  for (tmp_a_res=a_res; tmp_a_res; tmp_a_res = tmp_a_res->next) {
+    tmp_a_res->index = a_res_index++;
+  }
+
+  ppst->param_u.fa.use_E_thresholds = use_E_thresholds_s;
+  ppst->param_u.fa.optflag = optflag_s;
+  ppst->param_u.fa.optwid = optwid_s;
+  return a_res;
+}
+
+/* pre_cons - in TFAST builds, make the precomputed codon number series
+   in f_str->aa1v from aa1/n1: pre_com() for frame 0, pre_com_r() for any
+   other frame.  Does nothing in other builds. */
+void
+pre_cons(const unsigned char *aa1, int n1, int frame, struct f_struct *f_str) {
+#ifdef TFAST
+  if (frame != 0) {
+    /* must do things backwards */
+    pre_com_r(aa1, n1, f_str->aa1v);
+    return;
+  }
+  pre_com(aa1, n1, f_str->aa1v);
+#endif
+}
+
+/* aln_func_vals - fill in aln->qlfact, qlrev, llfact, llmult, frame, llrev */
+/* called from calcons, calc_id, calc_code */
+void
+aln_func_vals(int frame, struct a_struct *aln) {
+
+  /* the same in both builds */
+  aln->frame = frame;
+  aln->llmult = 1;
+
+#ifdef TFAST
+  /* TFASTX: factor 3 and the reverse flag go on the ll side */
+  aln->llfact = 3;
+  aln->qlfact = 1;
+  aln->qlrev = 0;
+  aln->llrev = (frame > 0) ? 1 : 0;
+#else
+  /* factor 3 and the reverse flag go on the ql side */
+  aln->llfact = 1;
+  aln->qlfact = 3;
+  aln->llrev = 0;
+  aln->qlrev = (frame > 0) ? 1 : 0;
+#endif
+}
+
+#include "structs.h"
+#include "a_mark.h"
+
+extern int align_type(int score, char sp0, char sp1, int nt_align, struct a_struct *aln, int pam_x_id_sim);
+
+extern int
+next_annot_match(int *itmp, int *pam2aa0v, long ip, long ia, char *sp1, char *sp1a, const unsigned char *sq,
+ int i_annot, int n_annot, struct annot_entry **annot_arr, char **ann_comment,
+ void *annot_stack, int *have_push_features, int *v_delta,
+ struct annot_entry **region_p, struct annot_entry *tmp_region_p, int init_score);
+
+extern void
+comment_var(long i0, char sp0, long i1, char sp1, char o_sp1, char sim_char,
+ const char *ann_comment, struct dyn_string_str *annot_var_dyn, int target, int d_type);
+
+void
+display_push_features(void *annot_stack, struct dyn_string_str *annot_var_dyn,
+ long i0, char sp0, long i1, char sp1, char sym,
+ struct annot_entry **region0_p,
+ struct annot_entry **region1_p,
+ int score, double comp, int n0, int n1,
+ void *pstat_void, int d_type);
+
+#define DP_FULL_FMT 1 /* Region: score: bits: id: ... */
+#define Q_TARGET 0
+#define L_TARGET 1
+
+int seq_pos(int pos, int rev,int off);
+
+int
+calc_cons_a(const unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1,
+ int *nc,
+ struct a_struct *aln,
+ struct a_res_str *a_res,
+ struct pstruct *ppst,
+ char *seqc0, char *seqc1, char *seqca, int *cumm_seq_score,
+ const unsigned char *ann_arr,
+ const unsigned char *aa0a, const struct annot_str *annot0_p, char *seqc0a,
+ const unsigned char *aa1a, const struct annot_str *annot1_p, char *seqc1a,
+ int *score_delta,
+ struct dyn_string_str *annot_var_dyn,
+ struct f_struct *f_str,
+ void *pstat_void
+ )
+{
+ int i0, i1;
+ int lenc, not_c, itmp, ngap_p, ngap_d, nfs;
+ char *sp0, *sp0a, *sp1, *sp1a, *spa, t_spa;
+ int *i_spa;
+ const unsigned char *sq;
+ unsigned char aap;
+
+ const unsigned char *ap0, *ap1;
+ const unsigned char *ap1a; /* ap1 always points to protein, and
+ only protein has annotations */
+ int *rp, *rpmax;
+ int have_ann = 0;
+
+ /* variables for variant changes */
+ char tmp_str[MAX_LSTR];
+ int *annot_stack, annot_stack_n, annot_top=0;
+ char *sim_sym = aln_map_sym[MX_ACC];
+ struct annot_entry **s_annot1_arr_p, *region1_p, pre_annot1;
+ struct annot_entry **s_annot0_arr_p, *region0_p, pre_annot0;
+ struct annot_entry *this_annot_p;
+ int i1_annot, v_delta, v_tmp;
+ int have_push_features, prev_match;
+ long i0_offset, i1_offset;
+
+ char *ann_comment;
+
+ *score_delta = 0;
+
+ if (ppst->ext_sq_set) {sq = ppst->sqx;}
+ else {sq = ppst->sq;}
+
+ /* res[0] has start of protein sequence */
+ /* res[1] has start of translated DNA sequence */
+
+#ifndef TFAST
+ aln->amin1 = aln->smin1 = a_res->min0; /* start in protein sequence */
+ aln->amin0 = aln->smin0 = a_res->min1; /* start in DNA/codon sequence */
+
+ i0_offset = aln->q_offset;
+ i1_offset = aln->l_offset;
+
+ ap0 = f_str->aa0v; /* computed codons -> ap0*/
+ ap1 = aa1; /* protein sequence -> ap1 */
+
+ sp0 = seqc0; /* translated DNA */
+ sp1 = seqc1; /* protein */
+
+ have_ann = (seqc0a != NULL && aa1a != NULL);
+ ap1a = aa1a;
+ sp1a = seqc1a; /* protein library can have annotation */
+ sp0a = seqc0a; /* sp0a is always ' ' - no translated
+ annotation */
+#else /* TFASTYZ */
+ if (aln->frame == 0) {
+ pre_com(aa1, n1, f_str->aa1v);
+ }
+ else {
+ pre_com_r(aa1, n1, f_str->aa1v);
+ }
+
+ aln->amin0 = aln->smin0 = a_res->min0; /* start in protein sequence */
+ aln->amin1 = aln->smin1 = a_res->min1; /* start in codon sequence */
+
+ i1_offset = aln->q_offset;
+ i0_offset = aln->l_offset;
+
+ ap1 = aa0; /* protein sequence */
+ ap0 = f_str->aa1v; /* computed codons -> ap0*/
+
+ sp0 = seqc1; /* protein */
+ sp1 = seqc0; /* translated DNA */
+
+ have_ann = (seqc0a != NULL && aa0a != NULL);
+ ap1a = aa0a;
+ sp1a = seqc0a; /* protein query can have annotation */
+ sp0a = seqc1a; /* sp0a is always ' ' - no translated
+ annotation */
+#endif
+ spa = seqca;
+ i_spa = cumm_seq_score;
+
+ rp = a_res->res; /* start of alignment info */
+ rpmax = &a_res->res[a_res->nres]; /* end of alignment info */
+
+ lenc = not_c = aln->nident = aln->nmismatch = aln->nsim = aln->npos = ngap_d = ngap_p = nfs = 0;
+ i0 = a_res->min1-3; /* start of codon sequence */
+ i1 = a_res->min0; /* start of protein sequence */
+
+ v_delta = 0;
+ i1_annot = 0;
+ region0_p = region1_p = NULL;
+ s_annot0_arr_p = s_annot1_arr_p = NULL;
+ annot_stack = NULL;
+ have_push_features = prev_match = 0;
+ if (have_ann) {
+ if (annot1_p && annot1_p->n_annot > 0) annot_stack = init_stack(64,64);
+ if (annot1_p && annot1_p->n_annot > 0) {
+ s_annot1_arr_p = annot1_p->s_annot_arr_p;
+ while (i1_annot < annot1_p->n_annot && s_annot1_arr_p[i1_annot]->pos < i1) {
+ if (s_annot1_arr_p[i1_annot]->label == '[') {
+ memcpy(&pre_annot1,s_annot1_arr_p[i1_annot], sizeof(struct annot_entry));
+ pre_annot1.pos = aln->amin1 + i1_offset;
+ pre_annot1.a_pos = aln->amin0 + i0_offset;
+ region1_p = &pre_annot1;
+ region1_p->score = region1_p->n_aln = region1_p->n_ident = 0;
+ }
+ else if (s_annot1_arr_p[i1_annot]->label == ']') {
+ region1_p = NULL;
+ }
+ i1_annot++;
+ }
+ }
+ }
+
+ while (rp < rpmax ) {
+ switch (*rp++) {
+ case 3: /* match */
+ i0 += 3;
+
+ *sp1 = sq[aap=ap1[i1]];
+ *sp0 = f_str->weight_c[aap][ap0[i0]].c5;
+ itmp = ppst->pam2[0][aap][pascii[*sp0]];
+
+
+ if (have_ann) {
+ *sp1a = ann_arr[ap1a[i1]];
+ *sp0a = ' ';
+ if (s_annot1_arr_p) {
+ if (i1+i1_offset == s_annot1_arr_p[i1_annot]->pos) {
+ i1_annot = next_annot_match(&itmp, ppst->pam2[0][pascii[*sp0]], i1_offset+seq_pos(i1,aln->llrev,0),
+ i0_offset+seq_pos(i0,aln->qlrev,0), sp1, sp1a, sq,
+ i1_annot, annot1_p->n_annot, s_annot1_arr_p,
+ &ann_comment, annot_stack, &have_push_features, &v_delta,
+ ®ion1_p, &pre_annot1, 0);
+
+ /* must be out of the loop to capture the last value */
+ if (ppst->sq[ap1[i1]] != *sp1) {
+ t_spa = align_type(itmp, *sp0, *sp1, 0, NULL, ppst->pam_x_id_sim);
+
+ comment_var(i0_offset+seq_pos(i0,aln->qlrev,0), *sp0,
+ i1_offset+seq_pos(i1,aln->llrev,0), *sp1,
+ sq[ap1[i1]], sim_sym[t_spa], ann_comment,
+ annot_var_dyn,1,1);
+ }
+ }
+ prev_match = 1;
+ if (region1_p) {region1_p->score += itmp;}
+ }
+ sp0a++; sp1a++;
+ }
+
+ *spa = align_type(itmp, *sp0, *sp1, 0, aln, ppst->pam_x_id_sim);
+
+ if (region1_p) {
+ region1_p->n_aln++;
+ if (*spa == M_IDENT) {region1_p->n_ident++;}
+ }
+
+ if (cumm_seq_score) *i_spa++ = itmp;
+
+ if (have_ann && have_push_features) {
+ display_push_features(annot_stack, annot_var_dyn,
+ i0_offset+seq_pos(i0,aln->qlrev,0), *sp0,
+ i1_offset+seq_pos(i1,aln->llrev,0), *sp1,
+ sim_sym[*spa], ®ion0_p, ®ion1_p,
+ a_res->rst.score[ppst->score_ix], a_res->rst.comp, n0, n1, pstat_void, DP_FULL_FMT);
+ have_push_features = 0;
+ }
+
+ i1++;
+ sp0++; sp1++; spa++;
+ lenc++;
+ break;
+ case 2: /* frame shift +2, then match */
+ nfs++;
+ i0 += 2;
+ *sp0++ = '/';
+ *sp1++ = '-';
+ *spa++ = M_DEL;
+ if (cumm_seq_score) *i_spa++ = ppst->gshift;
+
+ if (have_ann) {*sp0a++ = *sp1a++ = ' ';}
+ not_c++;
+
+ *sp1 = sq[aap=ap1[i1]];
+ *sp0 = f_str->weight_c[aap][ap0[i0]].c2;
+ itmp = ppst->pam2[0][pascii[*sp0]][aap];
+
+ if (have_ann) {
+ *sp1a = ann_arr[ap1a[i1]];
+ *sp0a = ' ';
+ if (s_annot1_arr_p) {
+ if (i1 + i1_offset == s_annot1_arr_p[i1_annot]->pos) {
+ i1_annot = next_annot_match(&itmp, ppst->pam2[0][pascii[*sp0]], i1_offset+seq_pos(i1,aln->llrev,0),
+ i0_offset+seq_pos(i0,aln->qlrev,0), sp1, sp1a, sq,
+ i1_annot, annot1_p->n_annot, s_annot1_arr_p,
+ &ann_comment, annot_stack, &have_push_features, &v_delta,
+ ®ion1_p, &pre_annot1, 0);
+
+ /* must be out of the loop to capture the last value */
+ if (ppst->sq[ap1[i1]] != *sp1) {
+ t_spa = align_type(itmp, *sp0, *sp1, 0, NULL, ppst->pam_x_id_sim);
+
+ comment_var(i0_offset+seq_pos(i0,aln->qlrev,0), *sp0,
+ i1_offset+seq_pos(i1,aln->llrev,0), *sp1,
+ sq[ap1[i1]], sim_sym[t_spa], ann_comment,
+ annot_var_dyn,1,DP_FULL_FMT);
+ }
+ }
+ if (region1_p) {
+ region1_p->score += ppst->gshift;
+ region1_p->score += itmp;
+ }
+ prev_match = 1;
+ }
+ sp0a++; sp1a++;
+ }
+
+ *spa = align_type(itmp, *sp0, *sp1, 0, aln, ppst->pam_x_id_sim);
+
+ if (region1_p) {
+ region1_p->n_aln++;
+ if (*spa == M_IDENT) {region1_p->n_ident++;}
+ }
+
+ if (have_ann && have_push_features) {
+ display_push_features(annot_stack, annot_var_dyn,
+ i0_offset+seq_pos(i0,aln->qlrev,0), *sp0,
+ i1_offset+seq_pos(i1,aln->llrev,0), *sp1,
+ sim_sym[*spa], ®ion0_p, ®ion1_p,
+ a_res->rst.score[ppst->score_ix], a_res->rst.comp, n0, n1, pstat_void, DP_FULL_FMT);
+ have_push_features = 0;
+ }
+
+ if (cumm_seq_score) *i_spa++ = itmp;
+
+ i1++;
+ sp0++; sp1++; spa++;
+ lenc++;
+ break;
+ case 4: /* frame shift, -1, then match */
+ nfs++;
+ i0 += 4;
+ if (have_ann) {
+ *sp1a++ = *sp0a++ = ' ';
+ }
+ *sp0++ = '\\';
+ *sp1++ = '-';
+ *spa++ = M_DEL;
+ if (cumm_seq_score) *i_spa++ = ppst->gshift;
+
+ not_c++;
+
+ *sp1 = sq[aap=ap1[i1]];
+ *sp0 = f_str->weight_c[aap][ap0[i0]].c4;
+ itmp = ppst->pam2[0][pascii[*sp0]][aap];
+
+ if (have_ann) {
+ *sp1a = ann_arr[ap1a[i1]];
+ *sp0a = ' ';
+ if (s_annot1_arr_p && i1+i1_offset == s_annot1_arr_p[i1_annot]->pos) {
+ i1_annot = next_annot_match(&itmp, ppst->pam2[0][pascii[*sp0]], i1_offset+seq_pos(i1,aln->llrev,0),
+ i0_offset+seq_pos(i0,aln->qlrev,0), sp1, sp1a, sq,
+ i1_annot, annot1_p->n_annot, s_annot1_arr_p, &ann_comment,
+ annot_stack, &have_push_features, &v_delta, ®ion1_p, &pre_annot1, 0);
+
+ /* must be out of the loop to capture the last value */
+ if (ppst->sq[ap1[i1]] != *sp1) {
+ t_spa = align_type(itmp, *sp0, *sp1, 0, NULL, ppst->pam_x_id_sim);
+ comment_var(i0_offset+seq_pos(i0,aln->qlrev,0), *sp0,
+ i1_offset+seq_pos(i1,aln->llrev,0), *sp1,
+ sq[ap1[i1]], sim_sym[t_spa], ann_comment,
+ annot_var_dyn,1,DP_FULL_FMT);
+ }
+ prev_match = 1;
+ if (region1_p) {region1_p->score += itmp;}
+ }
+ sp0a++; sp1a++;
+ }
+
+ *spa = align_type(itmp, *sp0, *sp1, 0, aln, ppst->pam_x_id_sim);
+ if (region1_p) {
+ region1_p->n_aln++;
+ if (*spa == M_IDENT) {region1_p->n_ident++;}
+ }
+
+ if (cumm_seq_score) *i_spa++ = itmp;
+
+ if (have_ann && have_push_features) {
+ display_push_features(annot_stack, annot_var_dyn,
+ i0_offset+seq_pos(i0,aln->qlrev,0), *sp0,
+ i1_offset+seq_pos(i1,aln->llrev,0), *sp1,
+ sim_sym[*spa], ®ion0_p, ®ion1_p,
+ a_res->rst.score[ppst->score_ix], a_res->rst.comp, n0, n1, pstat_void, DP_FULL_FMT);
+ have_push_features = 0;
+ }
+
+ i1++;
+ sp0++; sp1++; spa++;
+ lenc++;
+ break;
+ case 5: /* insertion in 0 */
+ if (have_ann) {
+ *sp1a++ = *sp0a++ = ' ';
+ }
+ i0 += 3;
+ *sp0++ = f_str->weight_c[0][ap0[i0]].c3;
+ *sp1++ = '-';
+ *spa++ = M_DEL;
+ lenc++;
+ ngap_p++;
+ if (cumm_seq_score) *i_spa++ = ppst->gdelval;
+ break;
+ case 0: /* insertion in 1 */
+ *sp0++ = '-';
+ *sp1++ = sq[ap1[i1]];
+ *spa++ = M_DEL;
+ if (cumm_seq_score) {
+ if (prev_match) *i_spa = ppst->gdelval;
+ *i_spa++ += ppst->gdelval;
+ }
+
+ if (have_ann) {
+ *sp0a = ' ';
+ *sp1a = ann_arr[ap1a[i1]];
+
+ if (s_annot1_arr_p) {
+ /* coordinates are much more complex for next_annot_match
+ and comment_var, because they may need to be reversed */
+
+ if (i1 + i1_offset == s_annot1_arr_p[i1_annot]->pos) {
+ i1_annot = next_annot_match(&itmp, ppst->pam2[0][ap0[i0]], i1_offset+seq_pos(i1,aln->llrev,0),
+ i0_offset+seq_pos(i0,aln->qlrev,0), sp1, sp1a, sq,
+ i1_annot, annot1_p->n_annot, s_annot1_arr_p,
+ &ann_comment, annot_stack, &have_push_features, &v_delta,
+ ®ion1_p, &pre_annot1, 0);
+ }
+
+ if (region1_p) {
+ if (prev_match) region1_p->score += ppst->gdelval;
+ region1_p->score += ppst->ggapval;
+ region1_p->n_aln++;
+ }
+ prev_match = 0;
+ }
+ sp0a++; sp1a++;
+ }
+
+ if (have_ann && have_push_features) {
+ display_push_features(annot_stack, annot_var_dyn,
+ i0_offset+seq_pos(i0,aln->qlrev,0), *sp0,
+ i1_offset+seq_pos(i1,aln->llrev,0), *sp1,
+ sim_sym[*spa], ®ion0_p, ®ion1_p,
+ a_res->rst.score[ppst->score_ix], a_res->rst.comp, n0, n1, pstat_void, DP_FULL_FMT);
+ have_push_features = 0;
+ }
+
+ i1++;
+ lenc++;
+ ngap_d++;
+ break;
+ }
+ }
+
+ if (have_ann) {
+ *sp0a = '\0';
+ if (s_annot1_arr_p) {
+ have_push_features = 0;
+ while (i1_annot < annot1_p->n_annot && s_annot1_arr_p[i1_annot]->pos < n1) {
+ if (s_annot1_arr_p[i1_annot]->label == '[') break;
+ if (s_annot1_arr_p[i1_annot]->label == ']') {
+ push_stack(annot_stack, s_annot1_arr_p[i1_annot]);
+ have_push_features = 1;
+ }
+ i1_annot++;
+ }
+ if (have_push_features) {
+ display_push_features(annot_stack, annot_var_dyn,
+ i0_offset+seq_pos(i0,aln->qlrev,0), *sp0,
+ i1_offset+seq_pos(i1,aln->llrev,0), *sp1,
+ sim_sym[*spa],®ion0_p, ®ion1_p,
+ a_res->rst.score[ppst->score_ix], a_res->rst.comp, n0, n1, pstat_void, DP_FULL_FMT);
+ have_push_features = 0;
+ }
+ }
+ }
+ *spa = '\0';
+
+#ifndef TFAST
+ aln->amax0 = i0+3; /* end of codon sequence */
+ aln->amax1 = i1; /* end of protein sequence */
+ aln->ngap_q = ngap_d;
+ aln->ngap_l = ngap_p;
+#else
+ aln->amax1 = i0+3; /* end of codon sequence */
+ aln->amax0 = i1; /* end of protein sequence */
+ aln->ngap_q = ngap_p;
+ aln->ngap_l = ngap_d;
+#endif
+ aln->calc_last_set = 1;
+ aln->nfs = nfs;
+ aln->amin0 = aln->smin0;
+ aln->amin1 = aln->smin1;
+
+ *score_delta = v_delta;
+
+ free_stack(annot_stack);
+
+ if (lenc < 0) lenc = 1;
+
+ *nc = lenc;
+/* now we have the middle, get the right end */
+
+ return lenc+not_c;
+}
+
+void
+calc_astruct(struct a_struct *aln_p, struct a_res_str *a_res_p, struct f_struct *f_str) {
+
+  /* Copy the alignment boundaries stored in a_res_p into aln_p.  Which
+     a_res_p index (0 or 1) feeds which aln_p index depends on whether
+     the file is built with TFAST defined; f_str is not used here. */
+
+#ifdef TFAST	/* TFASTX: indices map straight across */
+  aln_p->amin0 = a_res_p->min0;
+  aln_p->amax0 = a_res_p->max0;
+  aln_p->amin1 = a_res_p->min1;
+  aln_p->amax1 = a_res_p->max1;
+#else		/* FASTX: indices 0 and 1 are exchanged */
+  aln_p->amin0 = a_res_p->min1;
+  aln_p->amax0 = a_res_p->max1;
+  aln_p->amin1 = a_res_p->min0;
+  aln_p->amax1 = a_res_p->max0;
+#endif
+
+  /* the calc_cons/calc_code/calc_id routines set this flag to 1 when
+     they fill in the boundaries themselves */
+  aln_p->calc_last_set = 0;
+}
+
+/* build an array of match/ins/del - length strings */
+
+/* modified 10-June-2014 to distinguish matches from mismatches, op=1
+ (previously unused) indicates an aligned non-identity */
+
+/* op_codes are: 0 - aa insertion
+ 1 - (now) aligned non-identity
+ 2 - -1 frameshift
+ 3 - aligned identity
+ 4 - +1 frameshift
+ 5 - codon insertion
+*/
+
+/* Allocate and initialize the run-length state used while an alignment
+   encoding string is being built.
+
+   show_code -- display flags; (show_code & SHOW_CODE_MASK) selects the
+                CIGAR-style op map and "<count><op>" ordering, otherwise
+                the original op map with "<op><count>" ordering is used;
+                SHOW_CODE_EXT enables the extended (match/mismatch) codes.
+
+   Returns a structure owned by the caller (released by
+   close_update_data()), or NULL if the allocation fails.
+*/
+static struct update_code_str *
+init_update_data(int show_code) {
+
+  struct update_code_str *update_data_p;
+
+  /* calloc() zero-fills, so the pending-run index and count start at 0 */
+  update_data_p = (struct update_code_str *)calloc(1, sizeof(struct update_code_str));
+  if (update_data_p == NULL) {
+    fprintf(stderr,"*** error [%s:%d] - init_update_data(): cannot allocate update_code_str\n",
+	    __FILE__, __LINE__);
+    return NULL;
+  }
+
+  update_data_p->p_op_cnt = 0;
+  update_data_p->show_code = show_code;
+
+  if ((show_code & SHOW_CODE_MASK) == SHOW_CODE_CIGAR) {
+    update_data_p->op_map = cigar_code;
+    update_data_p->cigar_order = 1;
+  }
+  else {
+    update_data_p->op_map = ori_code;
+    update_data_p->cigar_order = 0;
+  }
+
+  update_data_p->show_ext = ((show_code & SHOW_CODE_EXT) == SHOW_CODE_EXT) ? 1 : 0;
+
+  return update_data_p;
+}
+
+/* Flush the last pending run to al_str and release the update state.
+
+   al_str     -- NUL-terminated encoding string being extended
+   al_str_max -- space left in al_str; callers in this file pass
+                 (buffer length - strlen(al_str)), so it is treated as
+                 including the terminating NUL
+   up_dp      -- state from init_update_data(); may be NULL (no-op)
+*/
+static void
+close_update_data(char *al_str, int al_str_max,
+		  struct update_code_str *up_dp) {
+  char tmp_cnt[MAX_SSTR];
+
+  if (!up_dp) return;
+
+  /* sprintf_code() writes nothing for an empty run, so start from an
+     empty string rather than appending uninitialized stack contents */
+  tmp_cnt[0] = '\0';
+  sprintf_code(tmp_cnt,up_dp, up_dp->p_op_idx, up_dp->p_op_cnt);
+
+  /* strncat() appends up to n characters PLUS a NUL, so leave room for
+     the terminator and never pass a zero/negative count */
+  if (al_str_max > 1) {
+    strncat(al_str,tmp_cnt,(size_t)(al_str_max - 1));
+  }
+
+  free(up_dp);
+}
+
+/* update_indel_code() has been modified to work more correctly with
+ ggsearch/glsearch, which, because alignments can start with either
+ insertions or deletions, can produce an initial code of "0=". When
+ that happens, it is ignored and no code is added.
+
+ *al_str - alignment string [al_str_max] - not dynamic
+ op -- encoded operation, currently 0=match, 1-delete, 2-insert, 3-term-match, 4-mismatch
+ op_cnt -- length of run
+ show_code -- SHOW_CODE_CIGAR uses cigar_code, otherwise legacy
+*/
+
+/* update_indel_code() is called for insertions and deletions
+ update_match_code() is called for every match
+*/
+
+/* Format one run of an alignment operation into tmp_str.
+
+   tmp_str -- output buffer supplied by the caller (callers in this file
+              pass a char[MAX_SSTR])
+   up_dp   -- supplies the op-code character map and the ordering flag
+   op_idx  -- index into up_dp->op_map
+   op_cnt  -- run length
+
+   With cigar_order set the result is "<count><op>", otherwise
+   "<op><count>".  A zero-length run yields an empty string, so callers
+   can always append tmp_str safely.
+*/
+static void
+sprintf_code(char *tmp_str, struct update_code_str *up_dp, int op_idx, int op_cnt) {
+
+  if (op_cnt == 0) {
+    /* previously tmp_str was left untouched here, and the callers then
+       strncat()'ed whatever happened to be in their stack buffer */
+    tmp_str[0] = '\0';
+    return;
+  }
+
+  if (up_dp->cigar_order) {
+    sprintf(tmp_str,"%d%c",op_cnt,up_dp->op_map[op_idx]);
+  }
+  else {
+    sprintf(tmp_str,"%c%d",up_dp->op_map[op_idx],op_cnt);
+  }
+}
+
+/* Append the encoding of one run (op_idx repeated op_cnt times) to
+   al_str and reduce *al_str_left by the number of characters added.
+   *al_str_left is the space remaining in al_str including the
+   terminating NUL; output is truncated rather than overrunning it.
+   Empty runs (op_cnt == 0) add nothing. */
+static void
+update_code_flush(char *al_str, int *al_str_left,
+		  struct update_code_str *up_dp, int op_idx, int op_cnt) {
+  char tmp_cnt[MAX_SSTR];
+  size_t n_code;
+
+  if (op_cnt == 0) return;
+
+  tmp_cnt[0] = '\0';
+  sprintf_code(tmp_cnt, up_dp, op_idx, op_cnt);
+
+  if (*al_str_left <= 1) return;	/* no room for anything but the NUL */
+
+  n_code = strlen(tmp_cnt);
+  if (n_code > (size_t)(*al_str_left - 1)) {
+    n_code = (size_t)(*al_str_left - 1);
+  }
+  strncat(al_str, tmp_cnt, n_code);
+  *al_str_left -= (int)n_code;
+}
+
+/* Record one alignment operation in the run-length encoded string.
+
+   al_str     -- NUL-terminated encoding string being extended
+   al_str_max -- space left in al_str (callers pass buffer length minus
+                 strlen(al_str))
+   up_dp      -- pending-run state from init_update_data(); NULL is ignored
+   op         -- operation code:
+                 2, 4  one-time codes: the pending run is flushed, the
+                       code is written with a count of 1, and no run is
+                       left pending
+                 0, 5  accumulating codes
+                 1, 3  aligned residues, also accumulating; may be
+                       remapped to 1 (when show_ext is set and the pair
+                       is not identical) or to 6 (a '*' is involved and
+                       the non-CIGAR op map is in use)
+   sim_code   -- similarity class of the aligned pair (used for op 1/3)
+   sp0, sp1   -- the aligned residue characters (used for op 1/3)
+*/
+static void
+update_code(char *al_str, int al_str_max,
+	    struct update_code_str *up_dp, int op,
+	    int sim_code, unsigned char sp0, unsigned char sp1)
+{
+  int al_str_left = al_str_max;
+
+  /* init_update_data() returns NULL when allocation fails */
+  if (!up_dp) return;
+
+  switch (op) {
+  case 2:
+  case 4:
+    /* one-time op: close the pending run, then emit this op once */
+    update_code_flush(al_str, &al_str_left, up_dp, up_dp->p_op_idx, up_dp->p_op_cnt);
+    update_code_flush(al_str, &al_str_left, up_dp, op, 1);
+    up_dp->p_op_cnt = 0;
+    return;
+  case 0:
+  case 5:
+    break;
+  case 1:
+  case 3:
+    if (sp0 != '*' && sp1 != '*') {	/* default case, not termination */
+      if (up_dp->show_ext && sim_code != M_IDENT) { op = 1;}
+    }
+    else {	/* have a termination codon, output for !SHOW_CODE_CIGAR */
+      if (!up_dp->cigar_order) { op = 6;}
+      else if (up_dp->show_ext && (sp0 != sp1)) { op = 1;}
+    }
+    break;
+  default:
+    return;
+  }
+
+  /* accumulating ops: extend the pending run if it is the same op,
+     otherwise flush it (a no-op when nothing is pending) and start a
+     new run */
+  if (up_dp->p_op_cnt > 0 && op == up_dp->p_op_idx) {
+    up_dp->p_op_cnt++;
+  }
+  else {
+    update_code_flush(al_str, &al_str_left, up_dp, up_dp->p_op_idx, up_dp->p_op_cnt);
+    up_dp->p_op_idx = op;
+    up_dp->p_op_cnt = 1;
+  }
+  return;
+}
+
+/* Walk the alignment operations in a_res->res[] and build
+   (1) the run-length encoded alignment string in al_str, and
+   (2) the annotation/feature description in annot_code_dyn.
+
+   The op codes read from a_res->res[] are handled as:
+     0 - residue of the protein (ap1) sequence opposite a gap
+     2 - -1 frameshift followed by an aligned residue
+     3 - aligned codon/residue
+     4 - +1 frameshift followed by an aligned residue
+     5 - codon opposite a gap
+
+   aa0/n0, aa1/n1     -- the two sequences; which one is the protein
+                         and which supplies the codons depends on TFAST
+   aln                -- receives alignment boundaries, gap and
+                         frameshift counts (identity counts are updated
+                         through align_type())
+   a_res              -- encoded alignment
+   al_str, al_str_n   -- output string and its length
+   ann_arr, aa0a/aa1a -- annotation symbols and per-residue annotation
+   annot0_p/annot1_p  -- feature annotations; only the annotation set
+                         passed as annot1_p is examined here
+   score_delta        -- receives the score change accumulated by
+                         next_annot_match() (v_delta)
+   display_code       -- SHOW_CODE_* / SHOW_ANNOT_FULL flags
+
+   Returns the number of alignment columns (lenc).
+*/
+int calc_code(const unsigned char *aa0, int n0,
+	      const unsigned char *aa1, int n1,
+	      struct a_struct *aln,
+	      struct a_res_str *a_res,
+	      struct pstruct *ppst,
+	      char *al_str, int al_str_n,
+	      const unsigned char *ann_arr,
+	      const unsigned char *aa0a,
+	      const struct annot_str *annot0_p,
+	      const unsigned char *aa1a,
+	      const struct annot_str *annot1_p,
+	      struct dyn_string_str *annot_code_dyn,
+	      int *score_delta,
+	      struct f_struct *f_str,
+	      void *pstat_void,
+	      int display_code)
+{
+  int i0, i1;
+  int lenc, not_c, ngap_d, ngap_p, nfs;
+  char sp0, sp1;
+  struct update_code_str *update_data_p;
+  unsigned char aap;
+  const unsigned char *ap0, *ap1, *ap1a;
+  int *rp, *rpmax;
+  const unsigned char *sq;
+  int have_ann = 0;
+  char tmp_astr[MAX_STR];
+  int sim_code, t_spa;
+  int show_code, annot_fmt;
+  char *sim_sym= aln_map_sym[MX_ACC];
+  /* variables for variant changes */
+  void *annot_stack;
+  struct annot_entry **s_annot1_arr_p, *region1_p, pre_annot1;
+  struct annot_entry *region0_p;
+  int itmp, i1_annot, v_delta;
+  int have_push_features, prev_match;
+  long i0_offset, i1_offset;
+
+  *score_delta = 0;
+
+  /* keep only the encoding-selection bits; the parentheses make the
+     (unchanged) precedence of '+' over '&' explicit */
+  show_code = (display_code & (SHOW_CODE_MASK + SHOW_CODE_EXT));
+  annot_fmt = 2;
+  if (display_code & SHOW_ANNOT_FULL) {
+    annot_fmt = 1;
+  }
+
+  if (ppst->ext_sq_set) {sq = ppst->sqx;}
+  else {sq = ppst->sq;}
+
+  /* don't fill in the ends */
+#ifndef TFAST
+  ap0 = f_str->aa0v;		/* computed codons -> ap0*/
+  ap1 = aa1;			/* protein sequence -> ap1 */
+  aln->smin1 = a_res->min0;	/* start in protein sequence */
+  aln->smin0 = a_res->min1;	/* start in DNA/codon sequence */
+
+  i0_offset = aln->q_offset;
+  i1_offset = aln->l_offset;
+
+  have_ann = (ann_arr[0] != '\0' && aa1a != NULL);
+  ap1a = aa1a;
+#else	/* TFASTYZ */
+  if (aln->frame == 0) { pre_com(aa1, n1, f_str->aa1v);}
+  else { pre_com_r(aa1, n1, f_str->aa1v);}
+
+  ap0 = f_str->aa1v;		/* computed codons -> ap0*/
+  ap1 = aa0;			/* protein sequence */
+  aln->smin0 = a_res->min0;	/* start in protein sequence */
+  aln->smin1 = a_res->min1;	/* start in codon sequence */
+
+  i1_offset = aln->q_offset;
+  i0_offset = aln->l_offset;
+
+  have_ann = (ann_arr[0] != '\0' && aa0a != NULL);
+  ap1a = aa0a;
+#endif
+
+  rp = a_res->res;			/* start of alignment info */
+  rpmax = &a_res->res[a_res->nres];	/* end of alignment info */
+
+/* now get the middle */
+
+  lenc = not_c = aln->nident = aln->nmismatch = aln->nsim = aln->npos = ngap_d = ngap_p = nfs = 0;
+  update_data_p = init_update_data(show_code);
+
+  i0 = a_res->min1-3;	/* start of codon sequence */
+  i1 = a_res->min0;	/* start of protein sequence */
+
+  /* These are read by the gap/frameshift cases and by the trailing
+     feature output before any aligned residue has set them (e.g. when
+     the alignment starts with op 0), so give them defined values.
+     NOTE(review): 'X' and the indel code 5 are placeholders chosen
+     only so that no indeterminate value is used as an array index --
+     confirm against the expected output for such alignments. */
+  sp0 = sp1 = 'X';
+  sim_code = 5;
+  itmp = 0;
+
+  v_delta = 0;
+  i1_annot = 0;
+  have_push_features = prev_match = 0;
+  region0_p = region1_p = NULL;
+  s_annot1_arr_p = NULL;
+  annot_stack = NULL;
+  if (have_ann && annot1_p && annot1_p->n_annot > 0) {
+    annot_stack = init_stack(64,64);
+    s_annot1_arr_p = annot1_p->s_annot_arr_p;
+    /* skip annotations that precede the alignment, remembering a
+       region that is still open when the alignment starts */
+    while (i1_annot < annot1_p->n_annot && s_annot1_arr_p[i1_annot]->pos < i1) {
+      if (s_annot1_arr_p[i1_annot]->label == '[') {
+	memcpy(&pre_annot1,s_annot1_arr_p[i1_annot], sizeof(struct annot_entry));
+	pre_annot1.pos = aln->amin1 + i1_offset;
+	pre_annot1.a_pos = aln->amin0 + i0_offset;
+	region1_p = &pre_annot1;
+	region1_p->score = region1_p->n_aln = region1_p->n_ident = 0;
+      }
+      else if (s_annot1_arr_p[i1_annot]->label == ']') {
+	region1_p = NULL;
+      }
+      i1_annot++;
+    }
+  }
+
+  while (rp < rpmax ) {
+    switch (*rp++) {
+    case 0: 	/* insert in 0 */
+      sim_code = 5;	/* indel code */
+      update_code(al_str, al_str_n-strlen(al_str), update_data_p, 0, sim_code,'-','-');
+
+      if (s_annot1_arr_p) {
+	/* the index test keeps us inside s_annot1_arr_p[] once every
+	   annotation has been consumed */
+	if (i1_annot < annot1_p->n_annot && i1+i1_offset == s_annot1_arr_p[i1_annot]->pos) {
+	  i1_annot = next_annot_match(&itmp, ppst->pam2[0][pascii[sp0]], i1_offset+seq_pos(i1,aln->llrev,0),
+				      i0_offset+seq_pos(i0,aln->qlrev,0), &sp1, NULL, sq,
+				      i1_annot, annot1_p->n_annot, s_annot1_arr_p, NULL,
+				      annot_stack, &have_push_features, &v_delta,
+				      &region1_p, &pre_annot1, 0);
+	}
+
+	if (region1_p) {
+	  if (prev_match) region1_p->score += ppst->gdelval;
+	  region1_p->score += ppst->ggapval;
+	  region1_p->n_aln++;
+	}
+	prev_match = 0;
+
+	if (have_push_features) {
+	  display_push_features(annot_stack, annot_code_dyn,
+				i0_offset+seq_pos(i0,aln->qlrev,0), sp0,
+				i1_offset+seq_pos(i1,aln->llrev,0), sp1,
+				sim_sym[sim_code], &region0_p, &region1_p,
+				a_res->rst.score[ppst->score_ix], a_res->rst.comp, n0, n1, pstat_void, annot_fmt);
+	  have_push_features = 0;
+	}
+      }
+
+      i1++;
+      lenc++;
+      ngap_d++;
+      break;
+
+    case 2:	/* -1 frame shift */
+      update_code(al_str, al_str_n-strlen(al_str), update_data_p, 2, sim_code,'-','-');
+
+      nfs++;
+      i0 += 2;
+      not_c++;
+
+      sp1 = ppst->sq[aap=ap1[i1]];
+      sp0 = f_str->weight_c[aap][ap0[i0]].c2;
+      itmp = ppst->pam2[0][pascii[sp0]][aap];
+
+      if (s_annot1_arr_p) {
+	if (i1_annot < annot1_p->n_annot && i1+i1_offset == s_annot1_arr_p[i1_annot]->pos) {
+	  i1_annot = next_annot_match(&itmp, ppst->pam2[0][pascii[sp0]], i1_offset+seq_pos(i1,aln->llrev,0),
+				      i0_offset+seq_pos(i0,aln->qlrev,0), &sp1, NULL, sq,
+				      i1_annot, annot1_p->n_annot, s_annot1_arr_p, NULL,
+				      annot_stack, &have_push_features, &v_delta,
+				      &region1_p, &pre_annot1, 0);
+	}
+
+	/* next_annot_match() may have substituted a variant residue */
+	if (sq[aap] != sp1) {
+	  t_spa = align_type(itmp, sp0, sp1, 0, NULL, ppst->pam_x_id_sim);
+
+	  comment_var(i0_offset+seq_pos(i0,aln->qlrev,0), sp0,
+		      i1_offset+seq_pos(i1,aln->llrev,0), sp1,
+		      sq[aap], sim_sym[t_spa], NULL,
+		      annot_code_dyn,1,annot_fmt);
+	}
+
+	if (region1_p) {
+	  region1_p->score += ppst->gshift;
+	  region1_p->score += itmp;
+	}
+	prev_match = 1;
+      }
+
+      sim_code = align_type(itmp, sp0, sp1, 0, aln, ppst->pam_x_id_sim);
+      if (region1_p) {
+	region1_p->n_aln++;
+	if (sim_code == M_IDENT) {region1_p->n_ident++;}
+      }
+
+      /* check for an annotation; ap1a (not aa1a) is the annotation
+	 array that parallels ap1 in both the FASTX and TFASTX builds */
+      if (have_ann && !(ann_arr[ap1a[i1]] == ' ' || ann_arr[ap1a[i1]] == '[' || ann_arr[ap1a[i1]] == ']')) {
+#ifndef TFAST
+	sprintf(tmp_astr, "|X%c:%ld%c%c%ld%c",
+		ann_arr[ap1a[i1]],i0_offset+seq_pos(i0,aln->qlrev,0)+1,sp0,sim_sym[sim_code],i1_offset+seq_pos(i1,aln->llrev,0)+1,sp1);
+#else
+	sprintf(tmp_astr, "|%cX:%ld%c%c%ld%c",
+		ann_arr[ap1a[i1]],i0_offset+seq_pos(i1,aln->llrev,0)+1,sp0,sim_sym[sim_code],i1_offset+seq_pos(i0,aln->qlrev,0)+1,sp1);
+
+#endif
+	/* SAFE_STRNCAT(annot_code_s, tmp_astr, n_annot_code_s); */
+	dyn_strcat(annot_code_dyn, tmp_astr);
+      }
+
+      if (s_annot1_arr_p && have_push_features) {
+	display_push_features(annot_stack, annot_code_dyn,
+			      i0_offset+seq_pos(i0,aln->qlrev,0), sp0,
+			      i1_offset+seq_pos(i1,aln->llrev,0), sp1,
+			      sim_sym[sim_code], &region0_p, &region1_p,
+			      a_res->rst.score[ppst->score_ix], a_res->rst.comp, n0, n1, pstat_void, annot_fmt);
+	have_push_features = 0;
+      }
+
+      i1++;
+      lenc++;
+      break;
+
+    case 3:	/* match */
+      i0 += 3;
+
+      sp1 = ppst->sq[aap=ap1[i1]];
+      sp0 = f_str->weight_c[aap][ap0[i0]].c5;
+      itmp = ppst->pam2[0][aap][pascii[sp0]];
+
+      if (s_annot1_arr_p) {
+	if (i1_annot < annot1_p->n_annot && i1+i1_offset == s_annot1_arr_p[i1_annot]->pos) {
+	  i1_annot = next_annot_match(&itmp, ppst->pam2[0][pascii[sp0]], i1_offset+seq_pos(i1,aln->llrev,0),
+				      i0_offset+seq_pos(i0,aln->qlrev,0), &sp1, NULL, sq,
+				      i1_annot, annot1_p->n_annot, s_annot1_arr_p, NULL,
+				      annot_stack, &have_push_features, &v_delta,
+				      &region1_p, &pre_annot1, 0);
+	}
+
+	if (sq[aap] != sp1) {
+	  t_spa = align_type(itmp, sp0, sp1, 0, NULL, ppst->pam_x_id_sim);
+
+	  comment_var(i0_offset+seq_pos(i0,aln->qlrev,0), sp0,
+		      i1_offset+seq_pos(i1,aln->llrev,0), sp1,
+		      sq[aap], sim_sym[t_spa], NULL, annot_code_dyn,
+		      1,annot_fmt);
+	}
+
+	if (region1_p) {region1_p->score += itmp;}
+	prev_match = 1;
+      }
+
+      sim_code = align_type(itmp, sp0, sp1, 0, aln, ppst->pam_x_id_sim);
+      if (region1_p) {
+	region1_p->n_aln++;
+	if (sim_code == M_IDENT) {region1_p->n_ident++;}
+      }
+
+      /* check for an annotation */
+      if (have_ann && !(ann_arr[ap1a[i1]] == ' ' || ann_arr[ap1a[i1]]=='[' || ann_arr[ap1a[i1]]==']')) {
+#ifndef TFAST
+	sprintf(tmp_astr, "|X%c:%ld%c%c%ld%c",
+		ann_arr[ap1a[i1]],i0_offset+seq_pos(i0,aln->qlrev,0)+1,sp0,sim_sym[sim_code],i1_offset+seq_pos(i1,aln->llrev,0)+1,sp1);
+#else
+	sprintf(tmp_astr, "|%cX:%ld%c%c%ld%c",
+		ann_arr[ap1a[i1]],i0_offset+seq_pos(i1,aln->llrev,0)+1,sp0,sim_sym[sim_code],i1_offset+seq_pos(i0,aln->qlrev,0)+1,sp1);
+
+#endif
+	/* SAFE_STRNCAT(annot_code_s, tmp_astr, n_annot_code_s); */
+	dyn_strcat(annot_code_dyn, tmp_astr);
+      }
+
+      if (s_annot1_arr_p && have_push_features) {
+	display_push_features(annot_stack, annot_code_dyn,
+			      i0_offset+seq_pos(i0,aln->qlrev,0), sp0,
+			      i1_offset+seq_pos(i1,aln->llrev,0), sp1,
+			      sim_sym[sim_code], &region0_p, &region1_p,
+			      a_res->rst.score[ppst->score_ix], a_res->rst.comp, n0, n1, pstat_void, annot_fmt);
+	have_push_features = 0;
+      }
+
+      update_code(al_str, al_str_n-strlen(al_str), update_data_p, 3, sim_code,sp0,sp1);
+
+      i1++;
+      lenc++;
+      break;
+
+    case 4:	/* +1 frame shift */
+      /* finish previous run */
+      update_code(al_str, al_str_n-strlen(al_str), update_data_p, 4, sim_code,'-','-');
+      /* mark frameshift */
+
+      nfs++;
+      i0 += 4;
+      not_c++;
+
+      sp1 = ppst->sq[aap=ap1[i1]];
+      /* .c4 is the residue used for this op by calc_id() and by the
+	 alignment-display code; this routine used .c2 here */
+      sp0 = f_str->weight_c[aap][ap0[i0]].c4;
+      itmp = ppst->pam2[0][aap][pascii[sp0]];
+
+      if (s_annot1_arr_p) {
+	if (i1_annot < annot1_p->n_annot && i1+i1_offset == s_annot1_arr_p[i1_annot]->pos) {
+	  i1_annot = next_annot_match(&itmp, ppst->pam2[0][pascii[sp0]], i1_offset+seq_pos(i1,aln->llrev,0),
+				      i0_offset+seq_pos(i0,aln->qlrev,0), &sp1, NULL, sq,
+				      i1_annot, annot1_p->n_annot, s_annot1_arr_p, NULL,
+				      annot_stack, &have_push_features, &v_delta, &region1_p, &pre_annot1, 0);
+	}
+
+	if (sq[aap] != sp1) {
+	  t_spa = align_type(itmp, sp0, sp1, 0, NULL, ppst->pam_x_id_sim);
+
+	  comment_var(i0_offset+seq_pos(i0,aln->qlrev,0), sp0,
+		      i1_offset+seq_pos(i1,aln->llrev,0), sp1,
+		      sq[aap], sim_sym[t_spa], NULL, annot_code_dyn,
+		      1,annot_fmt);
+	}
+
+	if (region1_p) {
+	  region1_p->score += ppst->gshift;
+	  region1_p->score += itmp;
+	}
+	prev_match = 1;
+      }
+
+      sim_code = align_type(itmp, sp0, sp1, 0, aln, ppst->pam_x_id_sim);
+      if (region1_p) {
+	region1_p->n_aln++;
+	if (sim_code == M_IDENT) {region1_p->n_ident++;}
+      }
+
+      if (have_ann && !(ann_arr[ap1a[i1]] == ' ' || ann_arr[ap1a[i1]]=='[' || ann_arr[ap1a[i1]]==']')) {
+#ifndef TFAST
+	sprintf(tmp_astr, "|X%c:%ld%c%c%ld%c",
+		ann_arr[ap1a[i1]],i0_offset+seq_pos(i0,aln->qlrev,0)+1,sp0,sim_sym[sim_code],i1_offset+seq_pos(i1,aln->llrev,0)+1,sp1);
+#else
+	sprintf(tmp_astr, "|%cX:%ld%c%c%ld%c",
+		ann_arr[ap1a[i1]],i0_offset+seq_pos(i1,aln->llrev,0)+1,sp0,sim_sym[sim_code],i1_offset+seq_pos(i0,aln->qlrev,0)+1,sp1);
+
+#endif
+	/* SAFE_STRNCAT(annot_code_s, tmp_astr, n_annot_code_s); */
+	dyn_strcat(annot_code_dyn, tmp_astr);
+      }
+
+      if (s_annot1_arr_p && have_push_features) {
+	display_push_features(annot_stack, annot_code_dyn,
+			      i0_offset+seq_pos(i0,aln->qlrev,0), sp0,
+			      i1_offset+seq_pos(i1,aln->llrev,0), sp1,
+			      sim_sym[sim_code], &region0_p, &region1_p,
+			      a_res->rst.score[ppst->score_ix], a_res->rst.comp, n0, n1, pstat_void, annot_fmt);
+	have_push_features = 0;
+      }
+
+      i1++;
+      lenc++;
+      break;
+
+    case 5:	/* insert in 1 */
+      sim_code = 5;
+      update_code(al_str, al_str_n-strlen(al_str), update_data_p, 5, sim_code,'-','-');
+
+      i0 += 3;
+      lenc++;
+      ngap_p++;
+      break;
+    }
+  }
+
+  /* writes the final pending run and frees update_data_p */
+  close_update_data(al_str, al_str_n-strlen(al_str), update_data_p);
+
+#ifndef TFAST
+  aln->amax0 = i0+3;	/* end of codon sequence */
+  aln->amax1 = i1;	/* end of protein sequence */
+  aln->ngap_q = ngap_d;
+  aln->ngap_l = ngap_p;
+#else
+  aln->amax1 = i0+3;	/* end of codon sequence */
+  aln->amax0 = i1;	/* end of protein sequence */
+  aln->ngap_q = ngap_p;
+  aln->ngap_l = ngap_d;
+#endif
+  aln->calc_last_set = 1;
+  aln->nfs = nfs;
+  aln->amin0 = aln->smin0;
+  aln->amin1 = aln->smin1;
+
+  *score_delta = v_delta;
+
+  if (have_ann) {
+    have_push_features = 0;
+    if (s_annot1_arr_p) {
+      /* also check for regions after alignment */
+      while (i1_annot < annot1_p->n_annot && s_annot1_arr_p[i1_annot]->pos < i1_offset+n1) {
+	if (s_annot1_arr_p[i1_annot]->label == '[') break;
+	if (s_annot1_arr_p[i1_annot]->label == ']') {
+	  push_stack(annot_stack, s_annot1_arr_p[i1_annot]);
+	  have_push_features = 1;
+	}
+	i1_annot++;
+      }
+    }
+
+    if (have_push_features) {
+      display_push_features(annot_stack, annot_code_dyn,
+			    i0_offset+a_res->max0-1, sp0,
+			    i1_offset+a_res->max1-1, sp1,
+			    sim_sym[sim_code], &region0_p, &region1_p,
+			    a_res->rst.score[ppst->score_ix], a_res->rst.comp, n0, n1, pstat_void, annot_fmt);
+    }
+  }
+
+  /* release the stack whenever one was created; the original test was
+     inverted (it only called free_stack() on a NULL stack) and so
+     leaked the stack on every annotated alignment */
+  if (annot_stack) free_stack(annot_stack);
+
+  if (lenc < 0) lenc = 1;
+
+/*	now we have the middle, get the right end */
+
+  return lenc;
+}
+
+/* Walk the alignment operations in a_res->res[] to count identities,
+   similarities, gaps and frameshifts without producing any alignment
+   text.
+
+   aln           -- receives boundaries, gap counts (ngap_q/ngap_l) and
+                    the frameshift count (nfs); the identity/similarity
+                    counters are reset here and updated by align_type()
+   annot1_p      -- optional annotations; when an annotation sits on an
+                    aligned residue, next_annot_match() may substitute a
+                    variant residue and adjust the score
+   score_delta   -- receives the score change accumulated from those
+                    substitutions
+   annot_var_dyn -- reset here, then receives one "<old><pos><new>;"
+                    entry for every substituted residue
+
+   Returns the number of alignment columns (lenc).
+*/
+int calc_id(const unsigned char *aa0, int n0,
+	    const unsigned char *aa1, int n1,
+	    struct a_struct *aln,
+	    struct a_res_str *a_res,
+	    struct pstruct *ppst,
+	    const struct annot_str *annot0_p,
+	    const struct annot_str *annot1_p,
+	    int *score_delta,
+	    struct dyn_string_str *annot_var_dyn,
+	    struct f_struct *f_str)
+{
+  int i0, i1;
+  int lenc, not_c, ngap_d, ngap_p, nfs;
+  char sp0, sp1;
+  unsigned char aap;
+  const unsigned char *ap0, *ap1;
+  int *rp, *rpmax;
+
+  /* variables for variant changes */
+  struct annot_entry **s_annot1_arr_p;
+  int itmp, i1_annot, v_delta;
+  long i0_offset, i1_offset;
+
+  char tmp_str[MAX_SSTR];
+  const unsigned char *sq;
+
+  *score_delta = 0;
+  NULL_dyn_string(annot_var_dyn);
+
+  if (ppst->ext_sq_set) {sq = ppst->sqx;}
+  else {sq = ppst->sq;}
+
+  /* don't fill in the ends */
+#ifndef TFAST	/* FASTYZ */
+  ap0 = f_str->aa0v;		/* computed codons -> ap0*/
+  ap1 = aa1;			/* protein sequence -> ap1 */
+  aln->smin1 = a_res->min0;	/* start in protein sequence */
+  aln->smin0 = a_res->min1;	/* start in DNA/codon sequence */
+
+  i0_offset = aln->q_offset;
+  i1_offset = aln->l_offset;
+#else	/* TFASTYZ */
+
+  if (aln->frame == 0) {
+    pre_com(aa1, n1, f_str->aa1v);
+  }
+  else {
+    pre_com_r(aa1, n1, f_str->aa1v);
+  }
+
+  ap0 = f_str->aa1v;		/* computed codons -> ap0*/
+  ap1 = aa0;			/* protein sequence */
+  aln->smin0 = a_res->min0;	/* start in protein sequence */
+  aln->smin1 = a_res->min1;	/* start in codon sequence */
+
+  i1_offset = aln->q_offset;
+  i0_offset = aln->l_offset;
+#endif
+
+  rp = a_res->res;			/* start of alignment info */
+  rpmax = &a_res->res[a_res->nres];	/* end of alignment info */
+
+/* now get the middle */
+
+  lenc = not_c = aln->nident = aln->nmismatch = aln->nsim = aln->npos = ngap_d = ngap_p = nfs = 0;
+  i0 = a_res->min1-3;	/* start of codon sequence */
+  i1 = a_res->min0;	/* start of protein sequence */
+
+  v_delta = 0;
+  i1_annot = 0;
+  s_annot1_arr_p = NULL;
+  if (annot1_p && annot1_p->n_annot > 0) s_annot1_arr_p = annot1_p->s_annot_arr_p;
+
+  while (rp < rpmax ) {
+    switch (*rp++) {
+    case 3:	/* match */
+      i0 += 3;
+      sp1 = ppst->sq[aap=ap1[i1]];
+      sp0 = f_str->weight_c[aap][ap0[i0]].c5;
+
+      itmp = ppst->pam2[0][pascii[sp0]][aap];
+
+      /* the index test keeps us inside s_annot1_arr_p[] once every
+	 annotation has been consumed */
+      if (s_annot1_arr_p && i1_annot < annot1_p->n_annot
+	  && i1+i1_offset == s_annot1_arr_p[i1_annot]->pos) {
+	i1_annot = next_annot_match(&itmp, ppst->pam2[0][pascii[sp0]], i1_offset+seq_pos(i1,aln->llrev,0), i0_offset+seq_pos(i0,aln->qlrev,0), &sp1, NULL, sq,
+				    i1_annot, annot1_p->n_annot, s_annot1_arr_p, NULL,
+				    NULL, NULL, &v_delta,NULL, NULL, 0);
+
+	/* record a residue replaced by a variant */
+	if (ppst->sq[aap] != sp1) {
+	  sprintf(tmp_str,"%c%d%c;",ppst->sq[aap],i1+1,sp1);
+	  /*   SAFE_STRNCAT(annot_var_s,tmp_str,n_annot_var_s); */
+	  dyn_strcat(annot_var_dyn, tmp_str);
+	}
+      }
+
+      align_type(itmp, sp0, sp1, 0, aln, ppst->pam_x_id_sim);
+
+      i1++;
+      lenc++;
+      break;
+    case 2:	/* -1 frame shift, then an aligned residue */
+      nfs++;
+      i0 += 2;
+      not_c++;
+      sp1 = ppst->sq[aap=ap1[i1]];
+      sp0 = f_str->weight_c[aap][ap0[i0]].c2;
+
+      itmp = ppst->pam2[0][aap][pascii[sp0]];
+
+      if (s_annot1_arr_p && i1_annot < annot1_p->n_annot
+	  && i1+i1_offset == s_annot1_arr_p[i1_annot]->pos) {
+	i1_annot = next_annot_match(&itmp, ppst->pam2[0][pascii[sp0]], i1_offset+seq_pos(i1,aln->llrev,0), i0_offset+seq_pos(i0,aln->qlrev,0), &sp1, NULL, sq,
+				    i1_annot, annot1_p->n_annot, s_annot1_arr_p, NULL,
+				    NULL, NULL, &v_delta,NULL, NULL, 0);
+
+	if (ppst->sq[aap] != sp1) {
+	  sprintf(tmp_str,"%c%d%c;",ppst->sq[aap],i1+1,sp1);
+	  /*   SAFE_STRNCAT(annot_var_s,tmp_str,n_annot_var_s); */
+	  dyn_strcat(annot_var_dyn, tmp_str);
+	}
+      }
+
+      align_type(itmp, sp0, sp1, 0, aln, ppst->pam_x_id_sim);
+
+      i1++;
+      lenc++;
+      break;
+    case 4:	/* +1 frame shift, then an aligned residue */
+      nfs++;
+      i0 += 4;
+      not_c++;
+      sp1 = ppst->sq[aap=ap1[i1]];
+      sp0 = f_str->weight_c[aap][ap0[i0]].c4;
+      itmp = ppst->pam2[0][pascii[sp0]][aap];
+
+      if (s_annot1_arr_p && i1_annot < annot1_p->n_annot
+	  && i1+i1_offset == s_annot1_arr_p[i1_annot]->pos) {
+	i1_annot = next_annot_match(&itmp, ppst->pam2[0][pascii[sp0]], i1_offset+seq_pos(i1,aln->llrev,0), i0_offset+seq_pos(i0,aln->qlrev,0), &sp1, NULL, sq,
+				    i1_annot, annot1_p->n_annot, s_annot1_arr_p, NULL,
+				    NULL, NULL, &v_delta,NULL, NULL, 0);
+
+	if (ppst->sq[aap] != sp1) {
+	  sprintf(tmp_str,"%c%d%c;",ppst->sq[aap],i1+1,sp1);
+	  /*   SAFE_STRNCAT(annot_var_s,tmp_str,n_annot_var_s); */
+	  dyn_strcat(annot_var_dyn, tmp_str);
+	}
+      }
+
+      align_type(itmp, sp0, sp1, 0, aln, ppst->pam_x_id_sim);
+
+      i1++;
+      lenc++;
+      break;
+    case 5:	/* codon opposite a gap */
+      i0 += 3;
+      lenc++;
+      ngap_p++;
+      break;
+    case 0:	/* protein residue opposite a gap */
+      i1++;
+      lenc++;
+      ngap_d++;
+      break;
+    }
+  }
+
+#ifndef TFAST
+  aln->amax0 = i0+3;	/* end of codon sequence */
+  aln->amax1 = i1;	/* end of protein sequence */
+  aln->ngap_q = ngap_d;
+  aln->ngap_l = ngap_p;
+#else
+  aln->amax1 = i0+3;	/* end of codon sequence */
+  aln->amax0 = i1;	/* end of protein sequence */
+  aln->ngap_q = ngap_p;
+  aln->ngap_l = ngap_d;
+#endif
+  aln->calc_last_set = 1;
+  aln->nfs = nfs;
+  aln->amin0 = aln->smin0;
+  aln->amin1 = aln->smin1;
+
+  /* report the variant-induced score change, as calc_code() does;
+     previously v_delta was accumulated and then discarded, so callers
+     always saw 0 */
+  *score_delta = v_delta;
+
+  if (lenc < 0) lenc = 1;
+
+/*	now we have the middle, get the right end */
+
+  return lenc;
+}
diff --git a/src/dropfz3.c b/src/dropfz3.c
new file mode 100644
index 0000000..551ce0a
--- /dev/null
+++ b/src/dropfz3.c
@@ -0,0 +1,3864 @@
+/* $Id: dropfz2.c 1280 2014-08-21 00:47:55Z wrp $ */
+
+/* copyright (c) 1998, 1999, 2014 by William R. Pearson and The Rector &
+ Visitors of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+/* 18-Sept-2006 - removed static global variables for alignment */
+
+/* 2002/06/23 finally correctly implement fix to translate 'N' to 'X' */
+
+/* 1999/11/29 modification by Z. Zhang to translate DNA 'N' as 'X' */
+
+/* implements an improved version of the fasty algorithm, see:
+
+ W. R. Pearson, T. Wood, Z. Zhang, A W. Miller (1997) "Comparison of
+ DNA sequences with protein sequences" Genomics 46:24-36
+
+*/
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <ctype.h>
+
+#include "defs.h"
+#include "param.h"
+#define XTERNAL
+#include "upam.h"
+#include "uascii.h"
+
+#define NT_N 16
+
+/* globals for fasta */
+#define MAXWINDOW 64
+
+#ifndef MAXSAV
+#define MAXSAV 10
+#endif
+
+#ifndef ALLOCN0
+static char *verstr="3.8 June 2014";
+#else
+static char *verstr="3.8an0 Jul 2014";
+#endif
+
+/* One entry per alignment diagonal. do_fastz() keeps the score and extent of
+ the run currently being extended on that diagonal here, and savemax()
+ records in dmax which vmax[] slot (if any) already holds this run. */
+struct dstruct /* diagonal structure for saving current run */
+{
+ int score; /* hash score of current match */
+ int start; /* start of current match */
+ int stop; /* end of current match */
+ struct savestr *dmax; /* location in vmax[] where best score data saved */
+};
+
+/* A saved high-scoring run. do_fastz() keeps MAXSAV of these in vmax[];
+ savemax() overwrites the lowest-scoring slot when a better run appears,
+ spam() later trims start/stop to the best-scoring sub-segment. */
+struct savestr
+{
+ int score; /* pam score with segment optimization */
+ int score0; /* pam score of best single segment */
+ int gscore; /* score from global match */
+ int dp; /* diagonal of match */
+ int start; /* start of match in lib seq */
+ int stop; /* end of match in lib seq */
+};
+
+/* State carried between update_code() calls while an alignment encoding
+ string is built. The uses of these fields are not visible in this part of
+ the file; the per-field notes are inferred from the names only -- TODO
+ confirm against init_update_data()/update_code(). */
+struct update_code_str {
+ int p_op_idx; /* presumably the index of the previous operation */
+ int p_op_cnt; /* presumably the repeat count of the previous operation */
+ int btop_enc; /* presumably non-zero for BTOP-style encoding */
+ int show_code; /* presumably the show_code value given to init_update_data() */
+ int cigar_order; /* presumably selects CIGAR-style (count, op) ordering */
+ int show_ext; /* presumably enables an extended encoding */
+ char *op_map; /* presumably points at ori_code or cigar_code */
+};
+
+/* Operation-code alphabets for encoded alignments. The two build variants
+ differ only in swapping the characters at positions 0 and 5
+ ('-'/'+' in ori_code, 'D'/'I' in cigar_code). */
+#ifdef TFAST
+static char *ori_code = "-x/=\\+*"; /* TFAST defined: translated-library build ("TFASTY") */
+static char *cigar_code = "DXFMRI*";
+#else
+static char *ori_code = "+x/=\\-*"; /* TFAST not defined: translated-query build ("FASTY") */
+static char *cigar_code = "IXFMRD*";
+#endif
+
+unsigned long adler32(unsigned long, const unsigned char *, unsigned int);
+void kpsort (struct savestr **v, int n);
+extern void *init_stack(int, int);
+extern void push_stack(void *, void *);
+extern void *pop_stack(void *);
+extern void *free_stack(void *);
+extern struct domfeat_data * init_domfeat_data(const struct annot_str *annot_p);
+
+/* Fixed-size integer work arrays embedded in struct f_struct (member
+ smgl_s). SGW1 and SGW2 bound the two dimensions of C[][] and st[][];
+ note that D[] is declared six entries longer than I[].
+ NOTE(review): presumably scratch space for a small banded
+ alignment later in this file -- confirm at the point of use. */
+#define SGW1 100
+#define SGW2 300
+struct smgl_str {
+ int C[SGW1+1][SGW2+1];
+ int st[SGW1+1][SGW2+1];
+ int D[SGW2+7], I[SGW2+1];
+};
+
+/* Element type of f_struct.cur. NOTE(review): C1..C3 / I1..I3 look like
+ three-phase match and insertion scores plus a flag -- confirm where cur[]
+ is filled. */
+struct sx_s {int C1, C2, C3, I1, I2, I3, flag; };
+
+/* Scores filled in by init_weights() for each [amino acid][8-bit index]
+ pair, where the index packs four 2-bit nucleotides:
+ iii - table score for the 3-nucleotide codon in the low 6 bits;
+ ii - best table score over codons formed by inserting one nucleotide
+ into the low 4 bits, minus the gshift argument;
+ iv - best table score over codons formed by dropping one of the four
+ nucleotides, minus the gshift argument. */
+struct wgt { int iii, ii, iv;};
+/* Residue characters recorded by init_weights() alongside struct wgt, for
+ each [amino acid][8-bit index] pair:
+ c3 - aacmap[] character of the codon in the low 6 bits;
+ c5 - character of the best-scoring (substitution-adjusted) codon for
+ those 6 bits;
+ c4 - character chosen for the best "drop one nucleotide" codon (wgt.iv);
+ c2 - character chosen for the best "insert one nucleotide" codon (wgt.ii). */
+struct wgtc {char c2, c3, c4, c5;};
+
+/* Three-integer cell type; st_ptr is used for f_struct.up/down/tp.
+ NOTE(review): C/I/D presumably hold match/insert/delete states of a
+ dynamic-programming row -- confirm at the point of use. */
+typedef struct st_s { int C, I, D;} *st_ptr;
+
+/* Per-query working state: allocated and filled by init_work(), used by
+ do_fastz() and released by close_work(). */
+struct f_struct {
+ struct dstruct *diag; /* per-diagonal run state, allocated in init_work() */
+ int frame;
+ int ndo; /* first diag[] entry do_fastz() still has to reset */
+ int noff; /* set to n0-1 by do_fastz(); offset used to index diagonals */
+ int hmask; /* hash constants */
+ int *pamh1; /* pam based array */
+ int *pamh2; /* pam based kfact array */
+ int *link, *harr; /* hash arrays */
+ int kshft; /* shift width */
+ int nsav, lowscor; /* number of saved runs, worst saved run */
+#ifndef TFAST
+ unsigned char *aa0x, *aa0v; /* aa0x - 111122223333 */
+#else
+ unsigned char *aa1x, *aa1v; /* aa1x - 111122223333 */
+#endif /* aa1v - computed codons */
+ struct sx_s *cur; /* freed in close_work(); allocated elsewhere */
+ int cur_sp_size;
+ struct wgt **weight0; /* init_weights() tables built from pam2[ip] */
+ struct wgt **weight1; /* init_weights() tables built from pam2[0] */
+ struct wgtc **weight_c; /* residue characters matching weight1 */
+ int *waa; /* pam2[ip][i][aa0[j]] laid out row by row, filled in init_work() */
+ int *res; /* alignment results buffer, max_res ints */
+ int max_res; /* number of ints allocated for res */
+ st_ptr up, down, tp;
+ struct smgl_str smgl_s;
+};
+
+#define DROP_INTERN
+#include "drop_func.h"
+
+static int dmatchz(const unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1,
+ const unsigned char *aa1v,
+ int hoff, int window,
+ int **pam2, int gdelval, int ggapval, int gshift,
+ struct f_struct *f_str);
+
+int shscore(unsigned char *aa0, int n0, int **pam2);
+int saatran(const unsigned char *ntseq, unsigned char *aaseq, int maxs, int frame);
+extern int ELK_to_s(double E_join, int n0, int n1, double Lambda, double K, double H);
+
+int savemax (struct dstruct *, int,
+ struct savestr *vmax, struct savestr **lowmax);
+
+int spam (const unsigned char *aa0, const unsigned char *aa1,
+ struct savestr *dmax, int **pam2,
+ struct f_struct *f_str);
+int sconn (struct savestr **v, int n,int cgap, int pgap, struct f_struct *f_str);
+int lx_band(const unsigned char *prot_seq, int len_prot,
+ const unsigned char *dna_prot_seq, int len_dna_prot,
+ int **pam_matrix, int gopen, int gext,
+ int gshift, int start_diag, int width, struct f_struct *f_str);
+
+struct a_res_str *
+merge_ares_chains(struct a_res_str *cur_ares,
+ struct a_res_str *tmpl_ares,
+ int score_ix, const char *msg);
+
+static struct update_code_str *
+init_update_data(int show_code);
+
+static void
+sprintf_code(char *tmp_str, struct update_code_str *, int op_idx, int op_cnt);
+
+static void
+update_code(struct dyn_string_str *align_code_dyn,
+ struct update_code_str *update_data, int op,
+ int sim_code, unsigned char sp0, unsigned char sp1);
+
+static void
+close_update_data(struct dyn_string_str *align_code_dyn,
+ struct update_code_str *update_data);
+
+extern void w_abort (char *p, char *p1);
+extern void aagetmap(char *to, int n);
+
+/* initialize for fasta */
+/* modified 30-August-1999 by Zheng Zhang to work with an extended alphabet */
+/* Assume naa=47, and wgts[47][23] matches both upper and lower case
+amino acids with another amino acid. Also assume the DNA letters
+do not have an upper/lower case difference. If you also allow DNA
+sequences to be upper/lower case letters, more needs to be changed: not
+only here, but also in the alignment code, the way a codon is packed
+into a number between 0-63 needs to be changed. */
+
+/* modified so that if **weightci==NULL, do not fiddle with characters */
+
+/* modified 3-Aug-2010 for NCBIstdaa alphabet, which requires MAXUC
+ 28, MAXLC 56, so we must have 58, not 47, entries */
+
+/* init_weights() - build the per-residue codon scoring tables.
+
+   weighti  - receives a newly allocated table of naa+1 rows, each with 256
+              struct wgt entries indexed by an 8-bit packed 4-nucleotide
+              value (see struct wgt for the iii/ii/iv members).
+   weightci - if not NULL, receives a matching table of struct wgtc
+              holding the residue character behind each score; if NULL no
+              character table is built.
+   wgts     - scoring matrix, indexed wgts[aa][aascii[residue char]].
+   gshift   - value subtracted from the insert/delete (ii, iv) scores.
+   gsubs    - value subtracted once per differing nucleotide position when
+              a codon is scored through a neighbouring codon.
+   naa      - number of residue rows to fill; must not exceed MAXLC because
+              the temp[]/le[] work arrays are sized MAXLC+1.
+
+   Exits the program on allocation failure or if naa is too large. */
+void
+init_weights(struct wgt ***weighti, struct wgtc ***weightci,
+             int **wgts, int gshift, int gsubs, int naa)
+{
+  int i, j, do_wgtc=0;
+  int aa, b, a, x, y, z;
+  int *wwt, e;
+  struct wgt **weight;
+  struct wgtc **weightc = NULL;
+  char aacmap[64];
+  int temp[MAXLC+1][64]; /*change*/
+  char le[MAXLC+1][64];
+
+  /* temp[] and le[] are fixed-size stack arrays; continuing with a larger
+     naa would write past them, so treat this as fatal rather than a warning */
+  if (naa > MAXLC) {
+    fprintf(stderr,"*** dropfz3.c compilation problem naa(%d) > MAXLC(%d) ***\n",
+            naa, MAXLC);
+    exit(1);
+  }
+
+  if ((*weighti=(struct wgt **)calloc((size_t)(naa+1),sizeof(struct wgt *)))
+      ==NULL) {
+    fprintf(stderr," cannot allocate weights array: %d\n",naa);
+    exit(1);
+  }
+
+  weight = *weighti;
+  /* allocate weight[aa 0..naa] - note naa+1 rows, free_weights() must match */
+  for (aa=0; aa <= naa; aa++) {
+    if ((weight[aa]=(struct wgt *)calloc((size_t)256,sizeof(struct wgt)))
+        ==NULL) {
+      fprintf(stderr," cannot allocate weight[]: %d/%d\n",aa,naa);
+      exit(1);
+    }
+  }
+
+  /* allocate weightci[aa 0..naa] */
+  if (weightci !=NULL) {
+    if ((*weightci=(struct wgtc **)calloc((size_t)(naa+1),
+                                          sizeof(struct wgtc *)))==NULL) {
+      fprintf(stderr," cannot allocate weight_c array: %d\n",naa);
+      exit(1);
+    }
+    weightc = *weightci;
+
+    for (aa=0; aa <= naa; aa++) {
+      if ((weightc[aa]=(struct wgtc *)calloc((size_t)256,sizeof(struct wgtc)))
+          ==NULL) {
+        fprintf(stderr," cannot allocate weightc[]: %d/%d\n",aa,naa);
+        exit(1);
+      }
+    }
+    do_wgtc = 1;
+  }
+  else do_wgtc = 0;
+
+  aagetmap(aacmap,64);
+
+  /* pass 1: for every residue aa and codon i, find the codon j (possibly i
+     itself) that maximises score(aa, j) minus gsubs for each nucleotide
+     position at which i and j differ */
+  for (aa = 0; aa < naa; aa++) { /* change*/
+    wwt = wgts[aa]; /* pam matrix */
+    for (i = 0; i < 64; i++) { /* i iterates through the codons */
+      x = -10000; /* large negative */
+      y = i;
+      for (j = 0; j < 64; j++) { /* j iterates through the codons */
+        z = ((~i & j) | (i & ~j)); /* i XOR j: non-zero 2-bit fields mark differences */
+        b = 0; /* score = 0 */
+        if (z % 4) b-= gsubs;
+        if (z /16) b-= gsubs;
+        if ((z /4) % 4) b -= gsubs;
+        b += wwt[aascii[aacmap[j]]]; /* add the match score for char j*/
+        if (b > x) {
+          x = b; /* x has the score */
+          y = j; /* y has the character (codon index)*/
+        }
+      }
+#ifdef DEBUG
+      if (y < 0 || y > 63) printf("%d %d %d %d ",aa, i, x, y);
+#endif
+      temp[aa][i] = x;
+      le[aa][i] = y;
+    }
+    /* printf("\n"); */
+  }
+
+  /* pass 2: expand the 64-codon table to the 256 packed 4-nucleotide
+     indices, adding the frameshift (2- and 4-nucleotide) alternatives */
+  for (aa= 0; aa < naa; aa++) {
+    wwt = temp[aa];
+    for (i = 0; i < 256; i++) {
+      /* iv: drop nucleotide b from the 4-nucleotide index, keep the best */
+      for (x=-100,b = 0; b < 4; b++) {
+        z = (i/ (1 << ((b+1)*2)))*(1<<(b*2))+(i%(1<<(b*2)));
+        if (x < (e=wwt[z])) {
+          x = e;
+          if (do_wgtc) weightc[aa][i].c4 = aacmap[le[aa][z]];
+        }
+      }
+      weight[aa][i].iv=x-gshift;
+      weight[aa][i].iii = wwt[i%64];
+
+      if (do_wgtc) {
+        weightc[aa][i].c5 = aacmap[le[aa][i%64]];
+        weightc[aa][i].c3 = aacmap[i%64];
+      }
+      /* ii: insert nucleotide a at position b of the low 2 nucleotides */
+      x = i %16;
+      for (y = -100, b = 0; b < 3; b++) {
+        z = ((x >> (b*2)) << (b*2+2)) + (x % (1 << (b*2)));
+        for (a = 0; a < 4; a++) {
+          if ((e =wwt[z+(a<<(b*2))]) > y) {
+            y = e;
+            if (do_wgtc)
+              weightc[aa][i].c2 = aacmap[le[aa][z+(a<<(b*2))]];
+          }
+        }
+      }
+      weight[aa][i].ii = y-gshift;
+    }
+  }
+  /* index 106 (CGGG) is reserved by pre_com()/pre_com_r() for codons that
+     contain an 'N'; score it as 'X' */
+  /*106=CGGG*/
+  for (aa = 0; aa < naa; aa++) {
+    weight[aa][106].iii = wgts[aa][23]; /* is 23 the code for 'X'?*/
+    weight[aa][106].iv = weight[aa][106].ii = weight[aa][106].iii-gshift;
+    if (do_wgtc) {
+      weightc[aa][106].c5 = weightc[aa][106].c4 = weightc[aa][106].c3
+        = weightc[aa][106].c2 = 'X';
+    }
+  }
+}
+
+/* free_weights() - release the tables built by init_weights().
+
+   weighti0, weighti1 - addresses of the two struct wgt tables.
+   weightci           - address of the struct wgtc table.
+   naa                - the naa value the tables were built with.
+
+   init_weights() allocates naa+1 rows (indices 0..naa) per table, so all
+   naa+1 rows are freed here.  A NULL table is skipped, and each caller
+   pointer is reset to NULL after its table is freed. */
+void
+free_weights(struct wgt ***weighti0, struct wgt ***weighti1,
+             struct wgtc ***weightci, int naa)
+{
+  int aa;
+  struct wgt **weight0;
+  struct wgt **weight1;
+  struct wgtc **weightc;
+
+  weight0 = *weighti0;
+  weight1 = *weighti1;
+  weightc = *weightci;
+
+  if (weight0 != NULL) {
+    for (aa=0; aa <= naa; aa++) {free(weight0[aa]);}
+    free(weight0);
+    *weighti0 = NULL;
+  }
+
+  if (weight1 != NULL) {
+    for (aa=0; aa <= naa; aa++) {free(weight1[aa]);}
+    free(weight1);
+    *weighti1 = NULL;
+  }
+
+  if (weightc != NULL) {
+    for (aa=0; aa <= naa; aa++) {free(weightc[aa]);}
+    free(weightc);
+    *weightci = NULL;
+  }
+}
+
+/* pre_com() - precompute the packed 4-nucleotide index series for the
+   forward strand.  For each position i >= 2 the running 8-bit window over
+   hnt[] codes is stored in aa0v[i-2].  Any window whose last three
+   nucleotides include NT_N is stored as 106; a genuine window value of 106
+   is stored as 42 so that 106 stays reserved for the 'N' case. */
+static void
+pre_com(const unsigned char *aa0, int n0, unsigned char *aa0v) {
+  int pos;
+  int window = (hnt[aa0[0]]<<2) + hnt[aa0[1]];
+  unsigned char *out = aa0v;
+
+  for (pos = 2; pos < n0; pos++, out++) {
+    window = ((window<<2) + hnt[aa0[pos]]) & 255;
+
+    if (aa0[pos] == NT_N || aa0[pos-1] == NT_N || aa0[pos-2] == NT_N) {
+      *out = 106;
+      continue;
+    }
+
+    if (window == 106/*CGGG*/) window = 42/*AGGG*/;
+    *out = window;
+  }
+}
+
+/* pre_com_r() - reverse-complement counterpart of pre_com().  The sequence
+   is walked from its last nucleotide towards the first, each hnt[] code is
+   complemented as 3-code, and the running 8-bit window is written to
+   aa0v[0..n0-3].  Windows touching NT_N are stored as 106; a genuine 106 is
+   stored as 42. */
+static void
+pre_com_r(const unsigned char *aa0, int n0, unsigned char *aa0v) {
+  int src;
+  int window = ((3-hnt[aa0[n0-1]])<<2) + 3-hnt[aa0[n0-2]];
+  unsigned char *out = aa0v;
+
+  for (src = n0-3; src >= 0; src--, out++) {
+    window = ((window<<2) + 3-hnt[aa0[src]]) & 255;
+
+    if (aa0[src] == NT_N || aa0[src+1] == NT_N || aa0[src+2] == NT_N) {
+      *out = 106;
+      continue;
+    }
+
+    if (window == 106) window = 42;
+    *out = window;
+  }
+}
+
+/* init_work() - allocate and fill the per-query state (*f_arg).
+
+   aa0, n0 - query sequence and its length.
+   ppst    - search parameters; cgap, optcut and pgap in ppst->param_u.fa are
+             updated here.
+   f_arg   - receives the newly allocated struct f_struct.
+
+   Steps: derive the join/opt thresholds, translate the query and precompute
+   its codon series (non-TFAST builds), build the codon weight tables, build
+   the ktup hash table (harr/link, pamh1/pamh2), and allocate the diagonal,
+   waa and alignment-result buffers.  Exits the program if any allocation
+   fails. */
+void
+init_work (unsigned char *aa0, int n0,
+           struct pstruct *ppst,
+           struct f_struct **f_arg)
+{
+  int mhv, phv;
+  int hmax;
+  int i0, hv;
+  int pamfact;
+  int btemp;
+  struct f_struct *f_str;
+  struct bdstr *bss;
+  /* these used to be globals, but do not need to be */
+  int ktup, fact, kt1, lkt;
+
+  int maxn0;
+  int *pwaa;
+  int i, j, q;
+  struct swstr *ss, *r_ss;
+  int *waa;
+  int *res;
+  int nsq, ip, *hsq, naat;
+#ifndef TFAST
+  int last_n0, itemp, dnav;
+  unsigned char *fd, *fs, *aa0x, *aa0v;
+  int n0x, n0x3;
+#endif
+
+  /* pre_com()/pre_com_r() compare against NT_N, so it must map to 'N' */
+  if (nt[NT_N] != 'N') {
+    fprintf(stderr," nt[NT_N] (%d) != 'N' (%c) - recompile\n",NT_N,nt[NT_N]);
+    exit(1);
+  }
+
+  if (ppst->ext_sq_set) {
+    nsq = ppst->nsqx; ip = 1;
+    hsq = ppst->hsqx;
+  }
+  else {
+    nsq = ppst->nsqx; ip = 0;
+    hsq = ppst->hsq;
+  }
+
+  f_str = (struct f_struct *)calloc(1,sizeof(struct f_struct));
+  if (f_str == NULL) {
+    /* every statement below dereferences f_str */
+    fprintf (stderr, " cannot allocate f_struct\n");
+    exit (1);
+  }
+
+  if (!ppst->param_u.fa.use_E_thresholds) {
+    btemp = 2 * ppst->param_u.fa.bestoff / 3 +
+      n0 / ppst->param_u.fa.bestscale +
+      ppst->param_u.fa.bkfact *
+      (ppst->param_u.fa.bktup - ppst->param_u.fa.ktup);
+    btemp = min (btemp, ppst->param_u.fa.bestmax);
+    if (btemp > 3 * n0) btemp = 3 * shscore(aa0,n0,ppst->pam2[0]) / 5;
+
+    ppst->param_u.fa.cgap = btemp + ppst->param_u.fa.bestoff / 3;
+    if (ppst->param_u.fa.optcut_set != 1) {
+#ifndef TFAST
+      ppst->param_u.fa.optcut = (btemp*5)/4;
+#else
+      ppst->param_u.fa.optcut = (btemp*4)/3;
+#endif
+    }
+  }
+
+#ifdef OLD_FASTA_GAP
+  ppst->param_u.fa.pgap = ppst->gdelval + ppst->ggapval;
+#else
+  ppst->param_u.fa.pgap = ppst->gdelval + 2*ppst->ggapval;
+#endif
+  pamfact = ppst->param_u.fa.pamfact;
+  ktup = ppst->param_u.fa.ktup;
+  fact = ppst->param_u.fa.scfact * ktup;
+
+#ifndef TFAST
+  /* before hashing, we must set up some space and translate the sequence */
+
+  maxn0 = n0 + 2;
+  if ((aa0x =(unsigned char *)calloc((size_t)maxn0,
+                                     sizeof(unsigned char)))
+      == NULL) {
+    fprintf (stderr, "cannot allocate aa0x array %d\n", maxn0);
+    exit (1);
+  }
+  aa0x++;  /* skip one leading byte; close_work() undoes this before free() */
+  f_str->aa0x = aa0x;
+
+
+  if ((aa0v =(unsigned char *)calloc((size_t)maxn0,
+                                     sizeof(unsigned char)))
+      == NULL) {
+    fprintf (stderr, "cannot allocate aa0v array %d\n", maxn0);
+    exit (1);
+  }
+  aa0v++;  /* same one-byte offset as aa0x */
+  f_str->aa0v = aa0v;
+
+  /* make a precomputed codon number series */
+  pre_com(aa0, n0, aa0v);
+
+  /* translate the three frames back to back, one separator slot after each */
+  last_n0 = 0;
+  for (itemp=0; itemp<3; itemp++) {
+    n0x=saatran(aa0,&aa0x[last_n0],n0,itemp);
+    /* for (i=0; i<n0x; i++) {
+       fprintf(stderr,"%c",aa[aa0x[last_n0+i]]);
+       if ((i%60)==59) fprintf(stderr,"\n");
+       }
+       fprintf(stderr,"\n");
+    */
+    last_n0 += n0x+1;
+  }
+
+  /* fprintf(stderr,"\n"); */
+  n0x = n0;
+  n0x3 = n0x/3;
+
+  /* now switch aa0 and aa0x for hashing functions */
+  fs = aa0;
+  aa0 = aa0x;
+  aa0x = fs;
+#endif
+
+  /* naat must always be MAXLC because library can have LC aa residues */
+  /*
+  if (ppst->ext_sq_set) naat = MAXLC;
+  else naat = MAXUC;
+  */
+  naat = MAXLC;
+
+  init_weights(&f_str->weight0, NULL,
+               ppst->pam2[ip],-ppst->gshift,-ppst->gsubs,naat);
+  init_weights(&f_str->weight1, &f_str->weight_c,
+               ppst->pam2[0],-ppst->gshift,-ppst->gsubs,naat);
+
+  if (pamfact == -1)
+    pamfact = 0;
+  else if (pamfact == -2)
+    pamfact = 1;
+
+  /* largest hash code determines how many bits each residue needs */
+  for (i0 = 1, mhv = -1; i0 < ppst->nsq; i0++)
+    if (hsq[i0] < NMAP && hsq[i0] > mhv)
+      mhv = ppst->hsq[i0];
+
+  if (mhv <= 0)
+    {
+      fprintf (stderr, " maximum hsq <=0 %d\n", mhv);
+      exit (1);
+    }
+
+  for (f_str->kshft = 0; mhv > 0; mhv /= 2) f_str->kshft++;
+
+/* kshft = 2; */
+  kt1 = ktup - 1;
+  hv = 1;
+  for (i0 = 0; i0 < ktup; i0++)
+    hv = hv << f_str->kshft;
+  hmax = hv;
+  f_str->hmask = (hmax >> f_str->kshft) - 1;
+
+  if ((f_str->harr = (int *) calloc (hmax, sizeof (int))) == NULL) {
+    fprintf (stderr, " cannot allocate hash array\n");
+    exit (1);
+  }
+  if ((f_str->pamh1 = (int *) calloc (ppst->nsq+1, sizeof (int))) == NULL) {
+    fprintf (stderr, " cannot allocate pamh1 array\n");
+    exit (1);
+  }
+  if ((f_str->pamh2 = (int *) calloc (hmax, sizeof (int))) == NULL) {
+    fprintf (stderr, " cannot allocate pamh2 array\n");
+    exit (1);
+  }
+  if ((f_str->link = (int *) calloc (n0, sizeof (int))) == NULL) {
+    fprintf (stderr, " cannot allocate hash link array");
+    exit (1);
+  }
+
+  /* -1 marks an empty hash bucket / end of a link chain */
+  for (i0 = 0; i0 < hmax; i0++)
+    f_str->harr[i0] = -1;
+  for (i0 = 0; i0 < n0; i0++)
+    f_str->link[i0] = -1;
+
+  /* encode the aa0 array */
+  phv = hv = 0;
+  lkt = kt1;
+  /* prime the hash with the first ktup-1 residues, restarting after any
+     residue whose hsq[] code is >= NMAP */
+  for (i0 = 0; i0 < min(n0,lkt); i0++) {
+    if (hsq[aa0[i0]] >= NMAP) {
+      hv=phv=0; lkt = i0+ktup; continue;
+    }
+    hv = (hv << f_str->kshft) + ppst->hsq[aa0[i0]];
+    phv += ppst->pam2[ip][aa0[i0]][aa0[i0]] * ktup;
+  }
+
+  for (; i0 < n0; i0++) {
+    if (hsq[aa0[i0]] >= NMAP) {
+      hv=phv=0;
+      /* restart hv, phv calculation */
+      for (lkt = i0+kt1; (i0 < lkt || hsq[aa0[i0]]>=NMAP) && i0<n0; i0++) {
+        if (hsq[aa0[i0]] >= NMAP) {
+          hv=phv=0;
+          lkt = i0+ktup;
+          continue;
+        }
+        hv = (hv << f_str->kshft) + hsq[aa0[i0]];
+        phv += ppst->pam2[ip][aa0[i0]][aa0[i0]]*ktup;
+      }
+    }
+    if (i0 >= n0) break;
+    /* push position i0 onto the chain for its ktup word */
+    hv = ((hv & f_str->hmask) << f_str->kshft) + ppst->hsq[aa0[i0]];
+    f_str->link[i0] = f_str->harr[hv];
+    f_str->harr[hv] = i0;
+    if (pamfact) {
+      f_str->pamh2[hv] = (phv += ppst->pam2[ip][aa0[i0]][aa0[i0]] * ktup);
+      if (hsq[aa0[i0-kt1]] < NMAP)
+        phv -= ppst->pam2[ip][aa0[i0 - kt1]][aa0[i0 - kt1]] * ktup;
+    }
+    else f_str->pamh2[hv] = fact * ktup;
+  }
+
+/* this has been modified from 0..<ppst->nsq to 1..<=ppst->nsq because the
+   pam2[0][0] is now undefined for consistency with blast
+*/
+
+  if (pamfact)
+    for (i0 = 1; i0 < ppst->nsq; i0++)
+      f_str->pamh1[i0] = ppst->pam2[ip][i0][i0] * ktup;
+  else
+    for (i0 = 1; i0 < ppst->nsq; i0++)
+      f_str->pamh1[i0] = fact;
+
+  f_str->ndo = 0; /* used to save time on diagonals with long queries */
+
+
+#ifndef ALLOCN0
+  if ((f_str->diag = (struct dstruct *) calloc ((size_t)MAXDIAG,
+                                                sizeof (struct dstruct)))==NULL) {
+    fprintf (stderr," cannot allocate diagonal arrays: %lu\n",
+             MAXDIAG *sizeof (struct dstruct));
+    exit (1);
+  };
+#else
+  if ((f_str->diag = (struct dstruct *) calloc ((size_t)n0,
+                                                sizeof (struct dstruct)))==NULL) {
+    fprintf (stderr," cannot allocate diagonal arrays: %ld\n",
+             (long)n0*sizeof (struct dstruct));
+    exit (1);
+  };
+#endif
+
+#ifndef TFAST
+  /* done hashing, now switch aa0, aa0x back */
+  fs = aa0;
+  aa0 = aa0x;
+  aa0x = fs;
+#else
+  if ((f_str->aa1x =(unsigned char *)calloc((size_t)ppst->maxlen+4,
+                                            sizeof(unsigned char)))
+      == NULL) {
+    fprintf (stderr, "cannot allocate aa1x array %d\n", ppst->maxlen+4);
+    exit (1);
+  }
+  f_str->aa1x++;  /* one-byte offset, undone in close_work() */
+
+  if ((f_str->aa1v =(unsigned char *)calloc((size_t)ppst->maxlen+4,
+                                            sizeof(unsigned char))) == NULL) {
+    fprintf (stderr, "cannot allocate aa1v array %d\n", ppst->maxlen+4);
+    exit (1);
+  }
+  f_str->aa1v++;  /* one-byte offset, undone in close_work() */
+
+#endif
+
+  if ((waa= (int *)malloc (sizeof(int)*(nsq+1)*n0)) == NULL) {
+    fprintf(stderr,"cannot allocate waa struct %3d\n",nsq*n0);
+    exit(1);
+  }
+
+  /* waa holds pam2[ip][i][aa0[j]], one row of n0 scores per residue i */
+  pwaa = waa;
+  for (i=0; i<nsq; i++) {
+    for (j=0;j<n0; j++) {
+      *pwaa = ppst->pam2[ip][i][aa0[j]];
+      pwaa++;
+    }
+  }
+  f_str->waa = waa;
+
+#ifndef TFAST
+  maxn0 = max(2*n0,MIN_RES);
+#else
+  maxn0 = max(4*n0,MIN_RES);
+#endif
+  if ((res = (int *)calloc((size_t)maxn0,sizeof(int)))==NULL) {
+    fprintf(stderr,"cannot allocate alignment results array %d\n",maxn0);
+    exit(1);
+  }
+  f_str->res = res;
+  f_str->max_res = maxn0;
+
+  *f_arg = f_str;
+}
+
+/* pstring1 is a message to the manager, currently 512 */
+/* pstring2 is the same information, but in a markx==10 format */
+/* get_param() - format the human-readable description of the search
+   parameters.
+
+   pstring1[0] gets the program name/version line, pstring1[1] the matrix,
+   gap, shift/subs, ktup and join/opt threshold line.  If pstring2 is not
+   NULL it gets the same information as "; pg_*" records.  The fractions in
+   parentheses are s_info->s_cnt[0] and s_info->s_cnt[2] divided by
+   s_info->tot_scores. */
+void
+get_param (const struct pstruct *ppst,
+           char **pstring1, char *pstring2,
+           struct score_count_s *s_info)
+{
+  char options_str1[128];
+  char options_str2[128];
+  double join_frac, opt_frac;
+  const char *ext_tag;
+#ifdef TFAST
+  char *pg_str="TFASTY";
+#else
+  char *pg_str="FASTY";
+#endif
+
+  join_frac = (double)s_info->s_cnt[0]/(double)s_info->tot_scores;
+  opt_frac = (double)s_info->s_cnt[2]/(double)s_info->tot_scores;
+  ext_tag = (ppst->ext_sq_set) ? "xS":"\0";
+
+  /* thresholds are reported either as E()-values or as raw scores */
+  if (ppst->param_u.fa.use_E_thresholds) {
+    sprintf(options_str1,"E-join: %.2g (%.3g), E-opt: %.2g (%.3g)",
+            ppst->param_u.fa.E_join, join_frac,
+            ppst->param_u.fa.E_band_opt, opt_frac);
+    sprintf(options_str2,"pg_join_E(): %.2g (%.3g)\n; pg_optcut_E(): %.2g (%.3g)",
+            ppst->param_u.fa.E_join, join_frac,
+            ppst->param_u.fa.E_band_opt, opt_frac);
+  }
+  else {
+    sprintf(options_str1,"join: %d (%.3g), opt: %d (%.3g)",
+            ppst->param_u.fa.cgap, join_frac,
+            ppst->param_u.fa.optcut, opt_frac);
+    sprintf(options_str2,"pg_join: %d (%.3g)\n; pg_optcut: %d (%.3g)",
+            ppst->param_u.fa.cgap, join_frac,
+            ppst->param_u.fa.optcut, opt_frac);
+  }
+
+  if (ppst->param_u.fa.optflag)
+    sprintf (pstring1[0], "%s (%s) [optimized]",pg_str, verstr);
+  else
+    sprintf (pstring1[0], "%s (%s)",pg_str, verstr);
+
+  sprintf (pstring1[1],
+#ifdef OLD_FASTA_GAP
+           "%s matrix (%d:%d)%s, gap-pen: %3d/%3d, shift: %3d, subs: %3d\n ktup: %d, %s, width: %3d",
+#else
+           "%s matrix (%d:%d)%s, open/ext: %3d/%3d, shift: %3d, subs: %3d\n ktup: %d, %s, width: %3d",
+#endif
+           ppst->pam_name, ppst->pam_h,ppst->pam_l,
+           ext_tag,
+           ppst->gdelval, ppst->ggapval,
+           ppst->gshift,ppst->gsubs,
+           ppst->param_u.fa.ktup, options_str1, ppst->param_u.fa.optwid);
+
+  if (ppst->param_u.fa.iniflag) strcat(pstring1[0]," init1");
+
+  if (pstring2 == NULL) return;
+
+  /* adjacent literals are concatenated into the same single format string */
+  sprintf (pstring2,
+           "; pg_name_alg: %s\n; pg_ver_rel: %s\n; pg_matrix: %s (%d:%d)%s\n"
+#ifdef OLD_FASTA_GAP
+           "; pg_gap-pen: %d %d\n; pg_ktup: %d\n; %s\n",
+#else
+           "; pg_open-ext: %d %d\n; pg_ktup: %d\n; %s\n",
+#endif
+           pg_str,verstr,ppst->pam_name, ppst->pam_h,ppst->pam_l,
+           ext_tag, ppst->gdelval,
+           ppst->ggapval,ppst->param_u.fa.ktup,options_str2);
+}
+
+/* close_work() - release everything init_work() attached to *f_arg and set
+   *f_arg to NULL.  Does nothing if *f_arg is already NULL.  aa0 and n0 are
+   not used. */
+void
+close_work (const unsigned char *aa0, int n0,
+            struct pstruct *ppst,
+            struct f_struct **f_arg)
+{
+  struct f_struct *f_str;
+  int naat;
+
+  f_str = *f_arg;
+
+  if (f_str != NULL) {
+    /* init_work() always builds the weight tables with naat = MAXLC,
+       regardless of ppst->ext_sq_set, so the same size must be used here or
+       the rows above MAXUC are never freed */
+    naat = MAXLC;
+    free_weights(&f_str->weight0,&f_str->weight1,&f_str->weight_c,naat);
+    free(f_str->cur);
+    /* the sequence buffers were advanced by one after allocation; step back
+       to the address calloc() returned before freeing */
+#ifndef TFAST
+    f_str->aa0v--;
+    free(f_str->aa0v);
+    f_str->aa0x--;
+    free(f_str->aa0x);
+#else /* TFAST */
+    f_str->aa1x--;
+    free(f_str->aa1x);
+    f_str->aa1v--;
+    free(f_str->aa1v);
+#endif
+    free(f_str->res);
+    free(f_str->waa);
+    free(f_str->diag);
+    free(f_str->link);
+    free(f_str->pamh2);
+    free(f_str->pamh1);
+    free(f_str->harr);
+    free(f_str);
+    *f_arg = NULL;
+  }
+}
+
+/* do_fastz() - hash-based scan of one library sequence against the query.
+
+   aa0, n0  - first sequence (hashed by init_work()) and its length.
+   aa1, n1  - sequence scanned against the hash table.
+   aa1v     - precomputed codon series handed on to dmatchz().
+   ppst     - search parameters.
+   f_str    - per-query state from init_work().
+   rst      - receives score[0] (initn), score[1] (best single run) and
+              score[2] (banded optimised score, or initn when no
+              optimisation is done), plus valid_stat.
+   hoff     - receives the diagonal offset of the best run.
+   shuff_flg - not used.
+   s_info   - tot_scores and s_cnt[0]/s_cnt[2] counters are incremented.
+
+   Phases: (1) scan aa1 with the ktup hash and extend runs per diagonal,
+   keeping the best MAXSAV in vmax[]; (2) rescore each saved run with
+   spam(); (3) shift runs found in the second and third translation frames
+   (the frames are stored back to back); (4) join compatible runs with
+   sconn(); (5) if optflag is set and initn exceeds opt_cut, run
+   dmatchz(). */
+void do_fastz (const unsigned char *aa0, int n0,
+               const unsigned char *aa1, int n1,
+               const unsigned char *aa1v,
+               const struct pstruct *ppst, struct f_struct *f_str,
+               struct rstruct *rst, int *hoff, int shuff_flg,
+               struct score_count_s *s_info)
+{
+  int nd; /* diagonal array size */
+  int lhval;
+  int kfact;
+  int i;
+  register struct dstruct *dptr;
+  struct savestr vmax[MAXSAV]; /* best matches saved for one sequence */
+  struct savestr *vptr[MAXSAV];
+  struct savestr *lowmax;
+  int lowscor;
+  register int tscor;
+
+#ifndef ALLOCN0
+  register struct dstruct *diagp;
+#else
+  register int dpos;
+  int lposn0;
+#endif
+  struct dstruct *dpmax;
+  register int lpos;
+  int tpos;
+  struct savestr *vmptr;
+  int scor, tmp;
+  int im, ib, nsave;
+  int ktup, kt1, ip, lkt, ktup_sq;
+  const int *hsq;
+  int c_gap, opt_cut;
+#ifndef TFAST
+  int n0x31, n0x32;
+  n0x31 = (n0-2)/3;
+  n0x32 = n0x31+1+(n0-n0x31-1)/2;
+#else
+  unsigned char *fs, *fd;
+  int n1x31, n1x32, last_n1, itemp;
+  n1x31 = (n1-2)/3;
+  n1x32 = n1x31+1+(n1-n1x31-1)/2;
+#endif
+
+  if (ppst->ext_sq_set) {
+    ip = 1;
+    hsq = ppst->hsqx;
+  }
+  else {
+    ip = 0;
+    hsq = ppst->hsq;
+  }
+
+  ktup = ppst->param_u.fa.ktup;
+  ktup_sq = ktup*ktup;
+  if (ktup == 1) ktup_sq *= 2;
+
+  kt1 = ktup-1;
+
+  if (n1 < ktup) {
+    rst->score[0] = rst->score[1] = rst->score[2] = 0;
+    return;
+  }
+
+  if (n0+n1+1 >= MAXDIAG) {
+    fprintf(stderr,"n0,n1 too large: %d, %d\n",n0,n1);
+    rst->score[0] = rst->score[1] = rst->score[2] = -1;
+    return;
+  }
+
+  if (ppst->param_u.fa.use_E_thresholds) {
+    c_gap = ELK_to_s(ppst->param_u.fa.E_join*ktup_sq*2.5, n0, n1, ppst->pLambda, ppst->pK, ppst->pH);
+    opt_cut = ELK_to_s(ppst->param_u.fa.E_band_opt*ktup_sq*2.0, n0, n1, ppst->pLambda, ppst->pK, ppst->pH);
+    rst->valid_stat = 0;
+  }
+  else {
+    c_gap = ppst->param_u.fa.cgap;
+    opt_cut = ppst->param_u.fa.optcut;
+    rst->valid_stat = 1;
+  }
+
+  f_str->noff = n0 - 1;
+
+#ifdef ALLOCN0
+  nd = n0;
+#endif
+
+#ifndef ALLOCN0
+  nd = n0 + n1;
+#endif
+
+  /* only entries from ndo upward need clearing; lower ones were reset at
+     the end of the previous call */
+  dpmax = &f_str->diag[nd];
+  for (dptr = &f_str->diag[f_str->ndo]; dptr < dpmax;)
+    {
+      dptr->stop = -1;
+      dptr->dmax = NULL;
+      dptr++->score = 0;
+    }
+
+  for (vmptr = vmax; vmptr < &vmax[MAXSAV]; vmptr++)
+    vmptr->score = 0;
+  lowmax = vmax;
+  lowscor = 0;
+
+  /* (a debugging probe that read aa1[1400]/aa1[1401] whenever n1 > 1000 was
+     removed: its result was never used and it could read past the end of
+     sequences shorter than 1402 residues) */
+
+  /* start hashing */
+  lhval = 0;
+  lkt = kt1;
+  for (lpos = 0; (lpos < lkt || hsq[aa1[lpos]]>=NMAP) && lpos<n1; lpos++) {
+    /* restart lhval calculation */
+    if (hsq[aa1[lpos]]>=NMAP) {
+      lhval = 0; lkt=lpos+ktup;
+      continue;
+    }
+    lhval = ((lhval & f_str->hmask) << f_str->kshft) + ppst->hsq[aa1[lpos]];
+  }
+
+  /* the two builds open the same pair of nested loops; the shared loop body
+     follows the #endif */
+#ifndef ALLOCN0
+  diagp = &f_str->diag[f_str->noff + lkt];
+  for (; lpos < n1; lpos++, diagp++) {
+    if (hsq[aa1[lpos]]>=NMAP) {
+      lpos++ ; diagp++;
+      while (lpos < n1 && hsq[aa1[lpos]]>=NMAP) {lpos++; diagp++;}
+      if (lpos >= n1) break;
+      lhval = 0;
+    }
+    lhval = ((lhval & f_str->hmask) << f_str->kshft) + ppst->hsq[aa1[lpos]];
+    for (tpos = f_str->harr[lhval]; tpos >= 0; tpos = f_str->link[tpos]) {
+      if ((tscor = (dptr = &diagp[-tpos])->stop) >= 0) {
+#else
+  lposn0 = f_str->noff + lpos;
+  for (; lpos < n1; lpos++, lposn0++) {
+    if (hsq[aa1[lpos]]>=NMAP) {lhval = 0; goto loopl;}
+    lhval = ((lhval & f_str->hmask) << f_str->kshft) + ppst->hsq[aa1[lpos]];
+    for (tpos = f_str->harr[lhval]; tpos >= 0; tpos = f_str->link[tpos]) {
+      dpos = lposn0 - tpos;
+      if ((tscor = (dptr = &f_str->diag[dpos % nd])->stop) >= 0) {
+#endif
+        /* a run already exists on this diagonal: either extend it, or save
+           it and start a new one if the gap since its end is too costly */
+        tscor += ktup;
+        if ((tscor -= lpos) <= 0) {
+          scor = dptr->score;
+          if ((tscor += (kfact = f_str->pamh2[lhval])) < 0 && lowscor < scor) {
+#ifdef ALLOCN0
+            lowscor = savemax (dptr, dpos, vmax, &lowmax);
+#else
+            lowscor = savemax (dptr, dptr- f_str->diag, vmax, &lowmax);
+#endif
+          }
+          if ((tscor += scor) >= kfact) {
+            dptr->score = tscor;
+            dptr->stop = lpos;
+          }
+          else {
+            dptr->score = kfact;
+            dptr->start = (dptr->stop = lpos) - kt1;
+          }
+        }
+        else {
+          dptr->score += f_str->pamh1[aa0[tpos]];
+          dptr->stop = lpos;
+        }
+      }
+      else {
+        /* first hit on this diagonal */
+        dptr->score = f_str->pamh2[lhval];
+        dptr->start = (dptr->stop = lpos) - kt1;
+      }
+    } /* end tpos */
+
+#ifdef ALLOCN0
+    /* reinitialize diag structure */
+  loopl:
+    if ((dptr = &f_str->diag[lpos % nd])->score > lowscor)
+      lowscor = savemax (dptr, lpos, vmax, &lowmax);
+    dptr->stop = -1;
+    dptr->dmax = NULL;
+    dptr->score = 0;
+#endif
+  } /* end lpos */
+
+  /* flush runs that were still open when the scan ended */
+#ifdef ALLOCN0
+  for (tpos = 0, dpos = f_str->noff + n1 - 1; tpos < n0; tpos++, dpos--) {
+    if ((dptr = &f_str->diag[dpos % nd])->score > lowscor)
+      /* savemax() takes four arguments (see its prototype) */
+      lowscor = savemax (dptr, dpos, vmax, &lowmax);
+  }
+#else
+  for (dptr = f_str->diag; dptr < dpmax;) {
+    if (dptr->score > lowscor) savemax (dptr, dptr - f_str->diag, vmax, &lowmax);
+    dptr->stop = -1;
+    dptr->dmax = NULL;
+    dptr++->score = 0;
+  }
+  f_str->ndo = nd;
+#endif
+
+  /* rescore the saved runs with the full matrix and collect the survivors */
+  for (nsave = 0, vmptr = vmax; vmptr < &vmax[MAXSAV]; vmptr++) {
+    if (vmptr->score > 0) {
+      vmptr->score = spam (aa0, aa1, vmptr, ppst->pam2[ip], f_str);
+      vptr[nsave++] = vmptr;
+    }
+  }
+
+  if (nsave <= 0) {
+    rst->score[0] = rst->score[1] = rst->score[2] = 0;
+    return;
+  }
+
+#ifndef TFAST
+  /* FASTX code here to modify the start, stop points for
+     the three phases of the translated protein sequence
+  */
+
+  for (ib=0; ib<nsave; ib++) {
+    if (f_str->noff-vptr[ib]->dp+vptr[ib]->start >= n0x32)
+      vptr[ib]->dp += n0x32;
+    if (f_str->noff-vptr[ib]->dp +vptr[ib]->start >= n0x31)
+      vptr[ib]->dp += n0x31;
+  }
+#else
+  /* TFAST code here to modify the start, stop points for
+     the three phases of the translated protein sequence
+     TFAST modifies library start points, rather than
+     query start points
+  */
+
+  for (ib=0; ib<nsave; ib++) {
+    if (vptr[ib]->start >= n1x32) {
+      vptr[ib]->start -= n1x32;
+      vptr[ib]->stop -= n1x32;
+      vptr[ib]->dp -= n1x32;
+    }
+    if (vptr[ib]->start >= n1x31) {
+      vptr[ib]->start -= n1x31;
+      vptr[ib]->stop -= n1x31;
+      vptr[ib]->dp -= n1x31;
+    }
+  }
+#endif /* TFAST */
+
+  scor = sconn (vptr, nsave, c_gap,
+                ppst->param_u.fa.pgap, f_str);
+
+  /* best single run */
+  for (vmptr=vptr[0],ib=1; ib<nsave; ib++)
+    if (vptr[ib]->score > vmptr->score) vmptr=vptr[ib];
+
+/* kssort (vptr, nsave); */
+
+  rst->score[1] = vmptr->score;
+  rst->score[0] = max (scor, vmptr->score);
+  rst->score[2] = rst->score[0]; /* initn */
+
+  s_info->tot_scores++;
+  if (rst->score[0] > c_gap) { s_info->s_cnt[0]++;}
+#ifndef TFAST
+  *hoff=f_str->noff - vmptr->dp;
+#else /* TFAST */
+  *hoff=vmptr->dp-f_str->noff;
+#endif /* TFAST */
+  if (ppst->param_u.fa.optflag) {
+    if (/* shuff_flg || */ rst->score[0] > opt_cut) {
+      s_info->s_cnt[2]++;
+      rst->valid_stat = 1;
+      rst->score[2] = dmatchz(aa0, n0,aa1,n1, aa1v,
+                              *hoff,ppst->param_u.fa.optwid,
+                              ppst->pam2[ip],
+                              ppst->gdelval,ppst->ggapval,ppst->gshift,
+                              f_str);
+    }
+  }
+}
+
+/* do_work() - score one library sequence against the query.
+
+   Initialises the rstruct bookkeeping fields, returns all-zero scores when
+   the library sequence is shorter than ktup, and otherwise calls
+   do_fastz().  In TFAST builds the library DNA is first converted to its
+   codon series (forward for frame 0, reverse-complement otherwise) and
+   translated in three frames into f_str->aa1x.  qr_flg is not used. */
+void do_work (const unsigned char *aa0, int n0,
+              const unsigned char *aa1, int n1,
+              int frame,
+              const struct pstruct *ppst,
+              struct f_struct *f_str,
+              int qr_flg, int shuff_flg, struct rstruct *rst,
+              struct score_count_s *s_info)
+{
+  int hoff;
+#ifdef TFAST
+  int xl_end, fr, fr_len;
+  unsigned char *xl_buf;
+#endif
+
+  rst->escore = 1.0;
+  rst->segnum = rst->seglen = 1;
+  rst->valid_stat = 0;
+
+  if (n1 < ppst->param_u.fa.ktup) {
+    rst->score[0] = rst->score[1] = rst->score[2] = 0;
+    return;
+  }
+
+#ifdef TFAST
+  /* make a precomputed codon number series */
+  if (frame != 0) {
+    pre_com_r(aa1, n1, f_str->aa1v);
+  }
+  else {
+    pre_com(aa1, n1, f_str->aa1v);
+  }
+
+  /* make translated sequence */
+  xl_end = 0;
+  xl_buf = f_str->aa1x;
+#ifdef DEBUG
+  if (frame > 1) {
+    fprintf(stderr, "*** fz_walign - frame: %d - out of range [0,1]\n",frame);
+  }
+#endif
+
+  /* three consecutive frames, each followed by one separator slot */
+  for (fr = 0; fr < 3; fr++) {
+    fr_len = saatran(aa1, &xl_buf[xl_end], n1, frame*3 + fr);
+    xl_end += fr_len + 1;
+  }
+
+  do_fastz (aa0, n0, f_str->aa1x, xl_end - 1, f_str->aa1v, ppst, f_str, rst, &hoff, shuff_flg, s_info);
+#else
+  do_fastz (f_str->aa0x, n0, aa1, n1, f_str->aa0v, ppst, f_str, rst, &hoff, shuff_flg, s_info);
+#endif
+
+  rst->comp = rst->H = -1.0;
+}
+
+/* do_opt() - rerun do_fastz() with optimisation forced on.
+
+   ppst->param_u.fa.optflag is set to 1 for the duration of the call and
+   restored afterwards.  Score counters go to a local, discarded
+   score_count_s.  In TFAST builds the library sequence is prepared exactly
+   as in do_work(). */
+void do_opt (const unsigned char *aa0, int n0,
+             const unsigned char *aa1, int n1,
+             int frame,
+             struct pstruct *ppst,
+             struct f_struct *f_str,
+             struct rstruct *rst)
+{
+  int saved_optflag, hoff;
+  struct score_count_s s_info = {0, 0, 0, 0};
+#ifdef TFAST
+  int xl_end, fr, fr_len;
+  unsigned char *xl_buf;
+#endif
+
+  saved_optflag = ppst->param_u.fa.optflag;
+  ppst->param_u.fa.optflag = 1;
+
+#ifdef TFAST
+  /* make a precomputed codon number series */
+  if (frame != 0) {
+    pre_com_r(aa1, n1, f_str->aa1v);
+  }
+  else {
+    pre_com(aa1, n1, f_str->aa1v);
+  }
+
+  /* make translated sequence */
+  xl_end = 0;
+  xl_buf = f_str->aa1x;
+  for (fr = 0; fr < 3; fr++) {
+    fr_len = saatran(aa1, &xl_buf[xl_end], n1, frame*3 + fr);
+    xl_end += fr_len + 1;
+  }
+
+  do_fastz (aa0, n0, f_str->aa1x, xl_end - 1, f_str->aa1v, ppst, f_str, rst, &hoff, 0, &s_info );
+#else
+  do_fastz (f_str->aa0x, n0, aa1, n1, f_str->aa0v, ppst, f_str, rst, &hoff, 0, &s_info);
+#endif
+
+  ppst->param_u.fa.optflag = saved_optflag;
+}
+
+/* savemax() - record the run on diagonal entry dptr in the vmax[] table.
+
+   dptr   - diagonal entry whose current run is to be saved.
+   dpos   - diagonal number stored with the run.
+   vmax   - table of MAXSAV saved runs.
+   lowmax - in/out: points at the lowest-scoring vmax[] slot.
+
+   If the run is a continuation of one already saved (same slot, diagonal
+   and start) that slot is updated in place; otherwise the run overwrites
+   the *lowmax slot.  Whenever the contents of the *lowmax slot change,
+   vmax[] is rescanned to find the new lowest slot.  Returns the score of
+   the lowest-scoring slot. */
+int
+savemax (struct dstruct *dptr, int dpos,
+         struct savestr *vmax, struct savestr **lowmax)
+{
+  struct savestr *saved = dptr->dmax;
+  struct savestr *slot;
+  int k, low_score;
+
+  if (saved != NULL && saved->dp == dpos && saved->start == dptr->start) {
+    /* continuation of a run that is already saved */
+    saved->stop = dptr->stop;
+    if (dptr->score <= saved->score) return (*lowmax)->score;
+    saved->score = dptr->score;
+    /* the lowest slot is unaffected unless it is the one just raised */
+    if (saved != *lowmax) return (*lowmax)->score;
+    low_score = dptr->score;
+  }
+  else {
+    /* new run: replace the current lowest-scoring slot */
+    slot = *lowmax;
+    slot->score = dptr->score;
+    slot->dp = dpos;
+    slot->start = dptr->start;
+    slot->stop = dptr->stop;
+    dptr->dmax = slot;
+    low_score = dptr->score;
+  }
+
+  /* find the new lowest-scoring slot */
+  for (k = 0; k < MAXSAV; k++) {
+    if (vmax[k].score < low_score) {
+      low_score = vmax[k].score;
+      *lowmax = &vmax[k];
+    }
+  }
+  return low_score;
+}
+
+/* spam() - rescan a saved run with the scoring matrix and trim it to its
+   best-scoring contiguous sub-segment.
+
+   aa0, aa1 - the two sequences; the run covers aa1[dmax->start..dmax->stop]
+              and the aa0 positions on diagonal dmax->dp.
+   dmax     - in/out: start/stop are replaced by the bounds of the best
+              sub-segment.
+   pam2     - scoring matrix, indexed pam2[aa0 residue][aa1 residue].
+   f_str    - supplies noff for the diagonal-to-aa0 conversion.
+
+   Returns the score of the best sub-segment (0 if no prefix of the run
+   ever scores above 0, in which case dmax->start/stop are left as they
+   were). */
+int spam (const unsigned char *aa0,
+          const unsigned char *aa1,
+          struct savestr *dmax, int **pam2,
+          struct f_struct *f_str)
+{
+  int lpos;
+  int tot;
+  struct {
+    int start, stop, score;
+  } curv, maxv;
+  const unsigned char *aa0p, *aa1p;
+
+  aa1p = &aa1[lpos = dmax->start];
+  aa0p = &aa0[lpos - dmax->dp + f_str->noff];
+  curv.start = lpos;
+  curv.stop = lpos;
+
+  /* start from the incoming bounds so that nothing indeterminate is written
+     back when no positive-scoring segment is found */
+  maxv.start = dmax->start;
+  maxv.stop = dmax->stop;
+
+  tot = curv.score = maxv.score = 0;
+  for (; lpos <= dmax->stop; lpos++) {
+    tot += pam2[*aa0p++][*aa1p++];
+    if (tot > curv.score) {
+      /* new best end point for the current segment */
+      curv.stop = lpos;
+      curv.score = tot;
+    }
+    else if (tot < 0) {
+      /* running total went negative: close this segment, start afresh */
+      if (curv.score > maxv.score) {
+        maxv.start = curv.start;
+        maxv.stop = curv.stop;
+        maxv.score = curv.score;
+      }
+      tot = curv.score = 0;
+      curv.start = lpos+1;
+    }
+  }
+
+  if (curv.score > maxv.score) {
+    maxv.start = curv.start;
+    maxv.stop = curv.stop;
+    maxv.score = curv.score;
+  }
+
+/* if (maxv.start != dmax->start || maxv.stop != dmax->stop)
+   printf(" new region: %3d %3d %3d %3d\n",maxv.start,
+   dmax->start,maxv.stop,dmax->stop);
+*/
+  dmax->start = maxv.start;
+  dmax->stop = maxv.stop;
+
+  return maxv.score;
+}
+
+#define XFACT 10 /* slack added to a run's start when testing whether an earlier run ends before it (see sconn) */
+
+int sconn (struct savestr **v, int n,
+ int cgap, int pgap, struct f_struct *f_str)
+{ /* sconn - compute the best score obtainable by chaining saved runs.
+   *
+   * v, n   array of n pointers to saved runs; re-ordered in place by kpsort()
+   * cgap   runs scoring less than this are ignored
+   * pgap   value added to the sum of two scores when a run is joined to an
+   *        earlier one
+   * f_str  supplies noff, used to convert a library position and diagonal
+   *        into a position in the other sequence
+   *
+   * Returns the score at the head of the score-ordered list, or 0 when no
+   * run reaches cgap.
+   */
+ int i, si;
+ struct slink {
+ int score;
+ struct savestr *vp;
+ struct slink *next;
+ } *start, *sl, *sj, *so, sarr[MAXSAV]; /* sarr supplies the list nodes; start heads the list */
+ int lstart, tstart, plstop, ptstop;
+
+/* sort the score left to right in lib pos */
+
+ kpsort (v, n);
+
+ start = NULL;
+
+/* for the remaining runs, see if they fit */
+
+ for (i = 0, si = 0; i < n; i++)
+ {
+
+/* if the score is less than the gap penalty, it never helps */
+ if (v[i]->score < cgap)
+ continue;
+ lstart = v[i]->start;
+ tstart = lstart - v[i]->dp + f_str->noff; /* same start, expressed in the other sequence */
+
+/* put the run in the group */
+ sarr[si].vp = v[i];
+ sarr[si].score = v[i]->score;
+ sarr[si].next = NULL;
+
+/* if it fits, then increase the score */
+ for (sl = start; sl != NULL; sl = sl->next) /* list is walked from its head; the first fitting entry wins */
+ {
+ plstop = sl->vp->stop;
+ ptstop = plstop - sl->vp->dp + f_str->noff;
+ if (plstop < lstart+XFACT && ptstop < tstart+XFACT) {
+ sarr[si].score = sl->score + v[i]->score + pgap;
+ break;
+ }
+ }
+
+/* now recalculate where the score fits */
+ if (start == NULL)
+ start = &sarr[si];
+ else
+ for (sj = start, so = NULL; sj != NULL; sj = sj->next)
+ {
+ if (sarr[si].score > sj->score) /* insert in front of the first strictly lower score */
+ {
+ sarr[si].next = sj;
+ if (so != NULL)
+ so->next = &sarr[si];
+ else
+ start = &sarr[si];
+ break;
+ }
+ so = sj;
+ } /* NOTE(review): if no entry scores lower, the loop ends without linking sarr[si] into the list -- confirm this is intended */
+ si++;
+ }
+
+ if (start != NULL)
+ return (start->score);
+ else
+ return (0);
+}
+
+/* kssort - Shell sort of the n run pointers in v[] into descending
+ * order of score; the array is rearranged in place.
+ */
+void
+kssort (v, n)
+struct savestr *v[];
+int n;
+{
+  int step, hi, lo;
+  struct savestr *held;
+
+  for (step = n / 2; step > 0; step /= 2) {
+    for (hi = step; hi < n; hi++) {
+      /* bubble the element at hi towards the front while it scores
+         higher than the element one step before it */
+      lo = hi - step;
+      while (lo >= 0 && v[lo]->score < v[lo + step]->score) {
+        held = v[lo];
+        v[lo] = v[lo + step];
+        v[lo + step] = held;
+        lo -= step;
+      }
+    }
+  }
+}
+
+/* kpsort - Shell sort (fixed increments 21, 7, 3, 1) of the n run
+ * pointers in v[] into ascending order of their start position; the
+ * array is rearranged in place.
+ */
+void
+kpsort (struct savestr **v, int n) {
+  int steps[4] = { 21, 7, 3, 1 };
+  struct savestr *moving;
+  int pass, step, pos, hole;
+  int moving_start;
+
+  for (pass = 0; pass < 4; pass++) {
+    step = steps[pass];
+    for (pos = step; pos < n; pos++) {
+      moving = v[pos];
+      moving_start = moving->start;
+      /* shift later-starting entries up by one step to open a hole */
+      for (hole = pos;
+           hole >= step && v[hole - step]->start > moving_start;
+           hole -= step) {
+        v[hole] = v[hole - step];
+      }
+      v[hole] = moving;
+    }
+  }
+}
+
+/* dmatchz - banded-score wrapper around lx_band().
+ *
+ * The band is centred by moving hoff back by window/2.  The penalties are
+ * negated before being handed to lx_band(); with OLD_FASTA_GAP the
+ * gap-open value passed is -(gdelval - ggapval), otherwise -gdelval.
+ * Without TFAST the call is lx_band(aa1, n1, f_str->aa0v, n0-2, ...);
+ * with TFAST it is lx_band(aa0, n0, aa1v, n1-2, ...).
+ *
+ * Returns whatever lx_band() returns.
+ */
+static int
+dmatchz(const unsigned char *aa0, int n0,
+        const unsigned char *aa1, int n1,
+        const unsigned char *aa1v,
+        int hoff, int window,
+        int **pam2, int gdelval, int ggapval, int gshift,
+        struct f_struct *f_str)
+{
+  int band_start = hoff - window/2;
+  int open_pen;
+
+#ifdef OLD_FASTA_GAP
+  open_pen = -(gdelval - ggapval);
+#else
+  open_pen = -gdelval;
+#endif
+
+#ifndef TFAST
+  return lx_band(aa1, n1, f_str->aa0v, n0-2,
+                 pam2,
+                 open_pen, -ggapval, -gshift,
+                 band_start, window, f_str);
+#else
+  return lx_band(aa0, n0, aa1v, n1-2,
+                 pam2,
+                 open_pen, -ggapval, -gshift,
+                 band_start, window, f_str);
+#endif
+}
+
+/* init_row - zero the C1..C3 and I1..I3 scores and the flag of the
+ * first sp cells of a band row.
+ */
+static void
+init_row(struct sx_s *row, int sp) {
+  struct sx_s *cell;
+  int left;
+
+  for (cell = row, left = sp; left > 0; left--, cell++) {
+    cell->C1 = cell->I1 = 0;
+    cell->C2 = cell->I2 = 0;
+    cell->C3 = cell->I3 = 0;
+    cell->flag = 0;
+  }
+}
+
+int lx_band(const unsigned char *prot_seq, /* array with protein sequence numbers*/
+ int len_prot, /* length of prot. seq */
+ const unsigned char *dna_prot_seq, /* translated DNA sequence numbers*/
+ int len_dna_prot, /* length trans. seq. */
+ int **pam_matrix, /* scoring matrix */
+ int gopen, int gext, /* gap open, gap extend penalties */
+ int gshift, /* frame-shift penalty */
+ int start_diag, /* start diagonal of band */
+ int width, /* width for band alignment */
+ struct f_struct *f_str)
+{ /* lx_band - score-only banded local alignment of prot_seq against
+   * dna_prot_seq; returns the best score seen.
+   *
+   * One row of width+7 cells (f_str->cur) is kept and reused between
+   * calls; it is reallocated only when the required size changes and is
+   * zeroed on every call.  Each cell carries three score pairs (C1/I1,
+   * C2/I2, C3/I3); the inner loop advances dp three times per cell, once
+   * per pair.  Scores are clamped at 0.  Substitution and frame-shift
+   * weights are read from f_str->weight0[residue][dna code] (fields iii,
+   * ii, iv); pam_matrix and gshift are not referenced in this body.
+   *
+   * `best` tracks the maximum of (cell score - gopen - gext); the return
+   * value adds gopen+gext back.
+   */
+ void *ckalloc();
+ int i, j, bd, bd1, x1, x2, sp, p1=0, p2=0, end_prot;
+ struct sx_s *last, *tmp;
+ int sc, del, best = 0, cd,ci, e1, e2, e3, cd1, cd2, cd3, f, gg;
+ const unsigned char *dp;
+ register struct sx_s *ap, *aq;
+ struct wgt *wt, *ww;
+ int aa, b, a,x,y,z;
+
+ sp = width+7;
+ gg = gopen+gext; /* cost of opening a gap and extending it by one */
+ /* sp = sp/3+1; */
+
+ if (f_str->cur == NULL ) {
+ f_str->cur_sp_size = sp;
+ f_str->cur = (struct sx_s *) ckalloc(sizeof(struct sx_s)*sp);
+ }
+ else if (f_str->cur_sp_size != sp) {
+ free(f_str->cur);
+ f_str->cur = (struct sx_s *) ckalloc(sizeof(struct sx_s)*sp);
+ f_str->cur_sp_size = sp;
+ }
+
+ init_row(f_str->cur, sp);
+
+ /*
+ if (start_diag %3 !=0) start_diag = start_diag/3-1;
+ else start_diag = start_diag/3;
+ if (width % 3 != 0) width = width/3+1;
+ else width = width /3;
+ */
+
+ x1 = start_diag; /* x1 = lower bound of DNA */
+ x2 = 1; /* the amount of position shift from last row*/
+
+ end_prot = max(0,-width-start_diag) + (len_dna_prot+5)/3 + width;
+ end_prot = min(end_prot,len_prot); /* computed but not used below: the row loop runs to len_prot */
+
+ /* i counts through protein sequence, x1 through DNAp */
+
+ for (i = max(0, -width-start_diag), x1+=i; i < len_prot; i++, x1++) {
+ bd = min(x1+width, (len_dna_prot+2)/3); /* upper bound of band */
+ bd1 = max(0,x1); /* lower bound of band */
+ wt = f_str->weight0[prot_seq[i]];
+ del = 1-x1; /*adjustment*/
+ bd += del;
+ bd1 +=del;
+
+ ap = &f_str->cur[bd1]; aq = ap+1; /* ap: cell being written; aq: its right-hand neighbour in the row */
+ e1 = f_str->cur[bd1-1].C3; e2 = ap->C1; cd1 = cd2= cd3= 0;
+ for (dp = &dna_prot_seq[(bd1-del)*3]; ap < &f_str->cur[bd]; ap++) {
+ ww = &wt[(unsigned char) *dp++]; /* --- first of the three score pairs (C1/I1) --- */
+ sc = max(max(e1+ww->iv, (e3=ap->C2)+ww->ii), e2+ww->iii);
+ if (cd1 > sc) sc = cd1;
+ cd1 -= gext;
+ if ((ci = aq->I1) > 0) {
+ if (sc < ci) { ap->C1 = ci; ap->I1 = ci-gext;}
+ else {
+ ap->C1 = sc;
+ sc -= gg;
+ if (sc > 0) {
+ if (sc > best) best =sc;
+ if (cd1 < sc) cd1 = sc;
+ ap->I1 = max(ci-gext, sc);
+ } else ap->I1 = ci-gext;
+ }
+ } else {
+ if (sc <= 0) {
+ ap->I1 = ap->C1 = 0;
+ } else {
+ ap->C1 = sc; sc-=gg;
+ if (sc >0) {
+ if (sc > best) best =sc;
+ if (cd1 < sc) cd1 = sc;
+ ap->I1 = sc;
+ } else ap->I1 = 0;
+ }
+ }
+ ww = &wt[(unsigned char) *dp++]; /* --- second score pair (C2/I2) --- */
+ sc = max(max(e2+ww->iv, (e1=ap->C3)+ww->ii), e3+ww->iii);
+ if (cd2 > sc) sc = cd2;
+ cd2 -= gext;
+ if ((ci = aq->I2) > 0) {
+ if (sc < ci) { ap->C2 = ci; ap->I2 = ci-gext;}
+ else {
+ ap->C2 = sc;
+ sc -= gg;
+ if (sc > 0) {
+ if (sc > best) best =sc;
+ if (cd2 < sc) cd2 = sc;
+ ap->I2 = max(ci-gext, sc);
+ } /* NOTE(review): unlike the C1/I1 case there is no else here, so ap->I2 keeps its old value when sc-gg <= 0 -- confirm intended */
+ }
+ } else {
+ if (sc <= 0) {
+ ap->I2 = ap->C2 = 0;
+ } else {
+ ap->C2 = sc; sc-=gg;
+ if (sc >0) {
+ if (sc > best) best =sc;
+ if (cd2 < sc) cd2 = sc;
+ ap->I2 = sc;
+ } else ap->I2 = 0;
+ }
+ }
+ ww = &wt[(unsigned char)*dp++]; /* --- third score pair (C3/I3); aq advances here --- */
+ sc = max(max(e3+ww->iv, (e2=aq->C1)+ww->ii), e1+ww->iii);
+ if (cd3 > sc) sc = cd3;
+ cd3 -= gext;
+ if ((ci = aq++->I3) > 0) {
+ if (sc < ci) { ap->C3 = ci; ap->I3 = ci-gext;}
+ else {
+ ap->C3 = sc;
+ sc -= gg;
+ if (sc > 0) {
+ if (sc > best) best =sc;
+ if (cd3 < sc) cd3 = sc;
+ ap->I3 = max(ci-gext, sc);
+ } /* NOTE(review): same missing else as for I2 -- ap->I3 is left unchanged when sc-gg <= 0 */
+ }
+ } else {
+ if (sc <= 0) {
+ ap->I3 = ap->C3 = 0;
+ } else {
+ ap->C3 = sc; sc-=gg;
+ if (sc >0) {
+ if (sc > best) best =sc;
+ if (cd3 < sc) cd3 = sc;
+ ap->I3 = sc;
+ } else ap->I3 = 0;
+ }
+ }
+ }
+ }
+ /* printf("The best score is %d\n", best); */
+ return best+gg;
+}
+
+/* ckalloc - malloc() wrapper: returns `amount` bytes of uninitialised
+   storage, or calls w_abort("Ran out of memory.","") if malloc()
+   returns NULL */
+void *ckalloc(size_t amount)
+{
+  void *block = malloc(amount);
+
+  if (block == NULL) {
+    w_abort("Ran out of memory.","");
+  }
+  return block;
+}
+
+/* shscore - score of a sequence aligned against itself (the 100%
+   identical score): the sum of pam2[r][r] over the n0 residues of aa0 */
+int
+shscore(unsigned char *aa0, int n0, int **pam2)
+{
+  const unsigned char *res = aa0;
+  int total = 0;
+  int left;
+
+  for (left = n0; left > 0; left--, res++) {
+    total += pam2[*res][*res];
+  }
+  return total;
+}
+
+#define WIDTH 60 /* number of alignment columns printed per output line by display_alig() */
+
+typedef struct mat *match_ptr;
+
+typedef struct mat { /* one edge of an alignment path, kept in a singly linked list */
+ int i, j, l; /* i, j: end position of the edge; l: edge type */
+ match_ptr next;
+} match_node;
+
+typedef struct { int i,j;} state; /* a pair of coordinates; local_align() uses it to remember where an alignment started */
+typedef state *state_ptr;
+
+
+void *ckalloc(); /* old-style (unprototyped) forward declarations of routines defined later in this file */
+static match_ptr small_global(), global();
+static int local_align(), find_best();
+static void init_row2(), init_ROW();
+
+/* pro_dna - align a protein against a translated DNA series and store
+ * the alignment path.
+ *
+ * local_align() first finds the best-scoring local region; global() then
+ * rebuilds the alignment inside that region.  The edge codes (match_node.l)
+ * of the resulting list are copied into a_res->res[], which must have room
+ * for max_res ints, and the list is freed.
+ *
+ * On return a_res->min0/max0 (protein) and min1/max1 (DNA) hold the region
+ * bounds and a_res->nres the number of codes stored.  If the alignment has
+ * more than max_res edges a warning is printed and only the first max_res
+ * codes are kept; nres is clamped accordingly so that consumers never read
+ * past the end of a_res->res.
+ *
+ * Returns the local alignment score.
+ */
+int
+pro_dna(const unsigned char *prot_seq, /* array with prot. seq. numbers*/
+        int len_prot, /* length of prot. seq */
+        const unsigned char *dna_prot_seq, /* trans. DNA seq. numbers*/
+        int len_dna_prot, /* length trans. seq. */
+        int **pam_matrix, /* scoring matrix */
+        int gopen, int gext, /* gap open, gap extend penalties */
+        int gshift, /* frame-shift penalty */
+        struct f_struct *f_str,
+        int max_res,
+        struct a_res_str *a_res) /* alignment info */
+{
+  match_ptr align, ap, aq;
+  int x, y, ex, ey, i, score;
+  int *alignment;
+
+  /* scratch rows shared by local_align() and global() */
+  f_str->up = (st_ptr) ckalloc(sizeof(struct st_s)*(len_dna_prot+10));
+  f_str->down = (st_ptr) ckalloc(sizeof(struct st_s)*(len_dna_prot+10));
+  f_str->tp = (st_ptr) ckalloc(sizeof(struct st_s)*(len_dna_prot+10));
+
+  /*local alignment find the best local alignment x and y
+    is the starting position of the best local alignment
+    and ex ey is the ending position */
+
+  score= local_align(&x, &y, &ex, &ey,
+                     pam_matrix, gopen, gext,
+                     dna_prot_seq, len_dna_prot,
+                     prot_seq, len_prot, f_str);
+
+  /* shift the rows by 3 so that indices -3..-1 are valid for the
+     global routines (global_down() writes cur[j-3]) */
+  f_str->up += 3; f_str->down += 3; f_str->tp += 3;
+
+  /* x, y - start in prot, dna_prot */
+  a_res->min0 = x; /* prot */
+  a_res->min1 = y; /* DNA */
+  a_res->max0 = ex; /* prot */
+  a_res->max1 = ey; /* DNA */
+
+  align = global(x, y, ex, ey,
+                 pam_matrix, gopen, gext,
+                 dna_prot_seq, prot_seq,
+                 0, 0, f_str);
+
+  alignment = a_res->res;
+
+  /* copy the edge codes out and release the list; i counts every edge,
+     including those that do not fit */
+  for (ap = align, i= 0; ap; i++) {
+    if (i < max_res) alignment[i] = ap->l;
+    aq = ap->next; free(ap); ap = aq;
+  }
+  if (i > max_res) {
+    fprintf(stderr,"***alignment truncated: %d/%d***\n", max_res,i);
+    i = max_res; /* only max_res codes were actually stored */
+  }
+
+  /* undo the +3 shift before freeing */
+  free(&f_str->up[-3]); free(&f_str->tp[-3]); free(&f_str->down[-3]);
+
+  a_res->nres = i;
+  return score;
+}
+
+/* swap - exchange the two pointers stored at *a and *b */
+static void
+swap(void **a, void **b)
+{
+  void *held_b = *b;
+
+  *b = *a;
+  *a = held_b;
+}
+
+/*
+ local alignment find the best local alignment x and y
+ is the starting position of the best local alignment
+ and ex ey is the ending position
+*/
+static int
+local_align(int *x, int *y, int *ex, int *ey,
+ int **wgts, int gop, int gext,
+ const unsigned char *dnap, int ld,
+ const unsigned char *pro, int lp,
+ struct f_struct *f_str)
+{ /* Returns the best local alignment score of pro[0..lp-1] against dnap.
+   *
+   * x, y    out: start of the best alignment (values taken from cur_st[].i / .j)
+   * ex, ey  out: row i and column j at which the best score was reached
+   * gop, gext  gap open / gap extend penalties, subtracted from scores
+   * f_str   supplies the weight1 table and the up/down row buffers
+   *
+   * wgts is not referenced in this body.  The out-parameters are copied
+   * from x1..x4, which are assigned only when some cell scores above 0.
+   * NOTE(review): they are indeterminate if no positive score is found --
+   * callers presumably only use them when the return value is > 0; confirm.
+   */
+ int i, j, score, x1,x2,x3,x4, e1 = 0, e2 = 0, e3,
+ sc, del, e, best = 0, cd, ci, c;
+ struct wgt *wt, *ww;
+ state_ptr cur_st, last_st, cur_i_st;
+ st_ptr cur, last;
+ const unsigned char *dp;
+ int *cur_d_st, *st_up;
+
+ /*
+ Array rowiC stores the best scores of alignment ending at a position
+ Arrays rowiD and rowiI store the best scores of alignment ending
+ at a position with a deletion or insrtion
+ Arrays sti stores the starting position of the best alignment whose
+ score stored in the corresponding row array.
+ The program stores two rows to complete the computation, same is
+ for the global alignment routine.
+ */
+
+
+ st_up = (int *) ckalloc(sizeof(int)*(ld+10));
+ init_row2(st_up, ld+5);
+
+ ld += 2; /* columns run from 2 to ld-1; dp below is offset by -2 to match */
+
+ init_ROW(f_str->up, ld+1);
+ init_ROW(f_str->down, ld+1);
+ cur = f_str->up+1;
+ last = f_str->down+1;
+
+ cur_st = (state_ptr) ckalloc(sizeof(state)*(ld+1));
+ last_st = (state_ptr) ckalloc(sizeof(state)*(ld+1));
+ cur_i_st = (state_ptr) ckalloc(sizeof(state)*(ld+1));
+ cur_d_st = st_up;
+ dp = dnap-2;
+ for (i = 0; i < lp; i++) {
+ wt = f_str->weight1[pro[i]]; e2 =0; e1 = last[0].C;
+ for (j = 0; j < 2; j++) {
+ cur_st[j].i = i+1;
+ cur_st[j].j = j+1;
+ }
+ for (j = 2; j < ld; j++) {
+ ww = &wt[(unsigned char) dp[j]];
+ del = -1; /* del records which move produced sc: 2/3/4 diagonal moves, 0 from last[j].I, 5 from cur[j].D, -1 none (score 0) */
+ if (j >= 3) {
+ sc = 0;
+ e3 = e2; e2 = e1;
+ e1 = last[j-2].C;
+ if ((e=e2+ww->iii) > sc) {sc = e; del = 3;}
+ if ((e=e1+ww->ii) > sc) {sc = e; del = 2;}
+ if ((e = e3+ww->iv) > sc) {sc = e; del = 4;}
+ } else {
+ sc = e2 = 0;
+ if (ww->iii > 0) {sc = ww->iii; del = 3;}
+ }
+ if (sc < (ci=last[j].I)) {
+ sc = ci; del = 0;
+ }
+ if (sc < (cd=cur[j].D)) {
+ sc = cd; del = 5;
+ }
+ cur[j].C = sc;
+ e = sc - gop;
+ if (e > cd) { /* opening a new gap beats extending the existing one */
+ cur[j+3].D = e-gext;
+ cur_d_st[j+3] = 3;
+ } else {
+ cur[j+3].D = cd-gext;
+ cur_d_st[j+3] = cur_d_st[j]+3;
+ }
+ switch(del) { /* propagate the alignment start that belongs to the chosen move */
+ case 5:
+ c = cur_d_st[j];
+ cur_st[j].i = cur_st[j-c].i;
+ cur_st[j].j = cur_st[j-c].j;
+ break;
+ case 0:
+ cur_st[j].i = cur_i_st[j].i;
+ cur_st[j].j = cur_i_st[j].j;
+ break;
+ case 2:
+ case 3:
+ case 4:
+ if (i) {
+ if (j-del >= 0) {
+ cur_st[j].i = last_st[j-del].i;
+ cur_st[j].j = last_st[j-del].j;
+ } else {
+ cur_st[j].i = i;
+ cur_st[j].j = 0;
+ }
+ } else {
+ cur_st[j].i = 0;
+ cur_st[j].j = max(0, j-del+1);
+ }
+ break;
+ case -1:
+ cur_st[j].i = i+1;
+ cur_st[j].j = j+1;
+ break;
+ }
+ if (e > ci) {
+ cur[j].I = e -gext;
+ cur_i_st[j].i = cur_st[j].i;
+ cur_i_st[j].j = cur_st[j].j;
+ } else {
+ cur[j].I = ci- gext;
+ }
+ if (sc > best) { /* remember start (x1,x2) and end (x3,x4) of the best cell so far */
+ x1 = cur_st[j].i;
+ x2 = cur_st[j].j;
+ best =sc;
+ x3 = i;
+ x4 = j;
+ }
+ }
+ swap((void *)&last, (void *)&cur);
+ swap((void *)&cur_st, (void *)&last_st);
+ }
+ /* printf("The best score is %d\n", best);*/
+ *x = x1; *y = x2; *ex = x3; *ey = x4;
+ free(cur_st); free(last_st); free(cur_i_st);
+ free(st_up);
+ return best;
+}
+
+/*
+ Both global_up and global_down perform linear-space, score-only global
+ alignments on the subsequences pro[x]...pro[ex] and dna[y]...dna[ey].
+ global_up runs the algorithm upwards, from row x towards row ex.
+ global_down runs the algorithm downwards, from row ex towards row x.
+*/
+
+static void
+global_up(st_ptr *row1, st_ptr *row2,
+ int x, int y, int ex, int ey,
+ int **wgts, int gop, int gext,
+ unsigned char *dnap, unsigned char *pro,
+ int N, struct f_struct *f_str)
+{ /* Score-only global pass over rows pro[x]..pro[ex], columns 0..ey-y+1.
+   *
+   * row1, row2  two row buffers used alternately; on return *row1 points at
+   *             the last row computed (the pointers are swapped if needed)
+   * N           when non-zero, the first row's I score at column 0 is set to
+   *             -gext
+   *
+   * Weights come from f_str->weight1; wgts is not referenced in this body.
+   * -10000 is used as the "impossible" score.
+   */
+ int i, j, k, sc, e, e1, e2, e3, t, ci, cd, score;
+ struct wgt *wt, *ww;
+ st_ptr cur, last;
+
+ cur = *row1; last = *row2;
+ sc = -gop;
+ for (j = 0; j <= ey-y+1; j++) { /* boundary row: only columns that are a multiple of 3 get a gap score */
+ if (j % 3 == 0) {last[j].C = sc; sc -= gext; last[j].I = sc-gop;}
+ else { last[j].I = last[j].C = -10000;}
+ }
+ last[0].C = 0; cur[0].D = cur[1].D = cur[2].D = -10000;
+ last[0].D = last[1].D = last[2].D = -10000;
+ if (N) last[0].I = -gext;
+ for (i = 1; i <= ex-x+1; i++) {
+ wt = f_str->weight1[pro[i+x-1]]; e1 = -10000; e2 = last[0].C;
+ for (j = 0; j <= ey-y+1; j++) {
+ t = j+y;
+ sc = -10000;
+ ww = &wt[(unsigned char) dnap[t-3]]; /* NOTE(review): reads dnap[y-3] when j == 0 -- presumably dnap has leading padding; confirm */
+ if (j < 4) {
+ if (j == 3) {
+ sc = e2+ww->iii;
+ } else if (j == 2) {
+ sc = e2 + ww->ii;
+ }
+ } else {
+ e3 = e2; e2 = e1;
+ e1 = last[j-2].C;
+ sc = max(e2+ww->iii, max(e1+ww->ii, e3+ww->iv));
+ }
+ sc = max(sc, max(ci=last[j].I, cd = cur[j].D));
+ cur[j].C = sc;
+ cur[j+3].D = max(cd, sc-gop)-gext;
+ cur[j].I = max(ci, sc-gop)-gext;
+ }
+ swap((void *)&last, (void *)&cur);
+ }
+ /*printf("global up score =%d\n", last[ey-y+1].C);*/
+ for (i = 0; i <= ey-y+1; i++) last[i].I = cur[i].I; /* the final row's I values are overwritten with those of the other buffer */
+ if (*row1 != last) swap((void *)row1, (void *)row2);
+}
+
+static void
+global_down(st_ptr *row1, st_ptr *row2,
+ int x, int y, int ex, int ey,
+ int **wgts, int gop, int gext,
+ unsigned char *dnap, unsigned char *pro,
+ int N, struct f_struct *f_str)
+{ /* Score-only global pass run backwards: rows pro[ex] down to pro[x],
+   * columns ey-y+1 down to 0.
+   *
+   * row1, row2  two row buffers used alternately; on return *row1 points at
+   *             the last row computed (the pointers are swapped if needed)
+   * N           when non-zero, the boundary row's I score at column ey-y+1
+   *             is set to -gext
+   *
+   * Writes cur[j-3].D, so the row buffers must be addressable from index -3.
+   * Weights come from f_str->weight1; wgts is not referenced in this body.
+   */
+ int i, j, k, sc, del, *tmp, e, t, e1,e2,e3, ci,cd, score;
+ struct wgt *wt, *w1, *w2, *w3;
+ st_ptr cur, last;
+
+ cur = (*row1); last = *row2;
+ sc = -gop;
+ for (j = ey-y+1; j >= 0; j--) {
+ if ((ey-y+1-j) % 3) {last[j].C = sc; sc-=gext; last[j].I = sc-gop;} /* NOTE(review): gap score is given where the distance is NOT a multiple of 3, the opposite of the `j % 3 == 0` test in global_up -- confirm intended */
+ else last[j].I = last[j].C = -10000;
+ cur[j].I = -10000;
+ }
+ last[ey-y+1].C = 0;
+ if (N) last[ey-y+1].I = -gext;
+ cur[ey-y+1].D = cur[ey-y].D = cur[ey-y-1].D = -10000;
+ last[ey-y+1].D = last[ey-y].D = last[ey-y-1].D = -10000;
+ for (i = ex-x; i >= 0; i--) {
+ wt = f_str->weight1[pro[i+x]]; e2 = last[ey-y+1].C;
+ e1 = -10000;
+ w3 = &wt[(unsigned char) dnap[ey]];
+ w2 = &wt[(unsigned char) dnap[ey-1]];
+ for (j = ey-y+1; j >= 0; j--) { /* w1, w2, w3 slide along with j: weights for dnap[t-1], dnap[t], dnap[t+1] */
+ t = j+y;
+ w1 = &wt[(unsigned char) dnap[t-1]];
+ sc = -10000;
+ if (t+3 > ey) {
+ if (t+2 == ey) {
+ sc = e2+w2->iii;
+ } else if (t+1 == ey) {
+ sc = e2+w1->ii;
+ }
+ } else {
+ e3 = e2; e2 = e1;
+ e1 = last[j+2].C;
+ sc = max(e2+w2->iii, max(e1+w1->ii,e3+w3->iv)) ;
+ }
+ if (sc < (cd= cur[j].D)) {
+ sc = cd;
+ cur[j-3].D = cd-gext;
+ } else cur[j-3].D =max(cd, sc-gop)-gext;
+ if (sc < (ci= last[j].I)) {
+ sc = ci;
+ cur[j].I = ci - gext;
+ } else cur[j].I = max(sc-gop,ci)-gext;
+ cur[j].C = sc;
+ w3 = w2; w2 = w1;
+ }
+ swap((void *)&last, (void *)&cur);
+ }
+ for (i = 0; i <= ey-y+1; i++) last[i].I = cur[i].I; /* the final row's I values are overwritten with those of the other buffer */
+ if (*row1 != last) swap((void *)row1, (void *)row2);
+}
+
+/* init_row2 - set the first ld ints of row to 0 */
+static void
+init_row2(int *row, int ld) {
+  int *slot = row;
+  int left;
+
+  for (left = ld; left > 0; left--) {
+    *slot++ = 0;
+  }
+}
+
+/* init_ROW - zero the C, D and I scores of the first ld cells of row */
+static void init_ROW(st_ptr row, int ld) {
+  st_ptr cell = row;
+  int left;
+
+  for (left = ld; left > 0; left--, cell++) {
+    cell->C = 0;
+    cell->D = 0;
+    cell->I = 0;
+  }
+}
+
+/* combine - append the alignment list x2 to the end of x1 and return the
+ * head of the joined list.
+ *
+ * If x1 is NULL, x2 is returned unchanged.  When st is non-zero, the
+ * leading edges of x2 have j incremented, up to and including the first
+ * edge of type 3 or 4, and that edge's type is reduced by one.  If x2
+ * contains no such edge (or is empty) no type is changed; previously this
+ * case dereferenced a NULL pointer.
+ */
+static match_ptr
+combine(match_ptr x1, match_ptr x2, int st) {
+  match_ptr x;
+
+  if (x1 == NULL) return x2;
+
+  /* find the tail of x1 and hook x2 on */
+  for (x = x1; x->next; x = x->next);
+  x->next = x2;
+
+  if (st) {
+    for (x = x2; x; x = x->next) {
+      x->j++;
+      if (x->l == 3 || x->l == 4) break;
+    }
+    /* x is NULL when the scan ran off the end of the list */
+    if (x != NULL) x->l--;
+  }
+  return x1;
+}
+
+/*
+ global use the two upwards and downwards score only linear
+ space global alignment subroutine to recursively build the
+ alignment.
+*/
+
+match_ptr
+global(int x, int y, int ex, int ey,
+ int **wgts, int gop, int gext,
+ unsigned char *dnap, unsigned char *pro, int N1, int N2,
+ struct f_struct *f_str)
+{ /* Builds the alignment of pro[x..ex] with dnap[y..ey] as a linked list of
+   * match_node edges and returns its head (NULL is possible in the
+   * degenerate cases below).
+   *
+   * Degenerate regions are handled directly, regions smaller than
+   * SGW1 x SGW2 go to small_global(), and anything larger is split at the
+   * middle row m: global_up()/global_down() score the two halves,
+   * find_best() picks the crossing column, and the halves are aligned
+   * recursively and joined with combine().
+   *
+   * N1, N2 are forwarded to global_up()/global_down()/small_global(); the
+   * recursion passes 1 on the sides of a split that find_best() reported
+   * as st == 0.
+   */
+ int m;
+ int m1, m2;
+ match_ptr x1, x2, mm1, mm2;
+
+ /*printf("%d %d %d %d %d %d\n", x,y, ex, ey, N1, N2);*/
+ /*
+ if the space required is limited, we can do a quadratic space
+ algorithm to find the alignment.
+ */
+
+ if (ex <= x) { /* at most one protein row: emit type-5 edges, one per 3 columns */
+ mm1 = NULL;
+ for (m = y+3; m <= ey; m+=3) {
+ x1 = (match_ptr) ckalloc(sizeof(match_node));
+ x1->l = 5; x1->next = mm1;
+ if (mm1== NULL) mm2 = x1; /* mm2 remembers the first node allocated, i.e. the tail of the list */
+ mm1 = x1;
+ }
+ if (ex == x) {
+ if ((ey-y) % 3 != 0) {
+ x1 = (match_ptr) ckalloc(sizeof(match_node));
+ x1->l = ((ey-y) % 3) +1; x1->next = NULL;
+ if (mm1) mm2->next = x1; else mm1 = x1;
+ } else mm2->l = 4; /* NOTE(review): mm2 is unset here if the loop above allocated nothing (ey < y+3) -- confirm this cannot happen */
+ }
+ return mm1;
+ }
+ if (ey <= y) { /* no DNA columns left: one type-0 edge per protein row */
+ mm1 = NULL;
+ for (m = x; m <= ex; m++) {
+ x1 = (match_ptr) ckalloc(sizeof(match_node));
+ x1->l = 0; x1->next = mm1; mm1 = x1;
+ }
+ return mm1;
+ }
+ if (ex -x < SGW1 && ey-y < SGW2)
+ return small_global(x,y,ex,ey,wgts, gop, gext, dnap, pro, N1, N2,f_str);
+ m = (x+ex)/2;
+ /*
+ Do the score only global alignment from row x to row m, m is
+ the middle row of x and ex. Store the information of row m in
+ upC, upD, and upI.
+ */
+ global_up(&f_str->up, &f_str->tp, x, y, m, ey,
+ wgts, gop, gext,
+ dnap, pro, N1, f_str);
+ /*
+ Do the score only global alignment downwards from row ex
+ to row m+1, store information of row m+1 in downC downI and downD
+ */
+ global_down(&f_str->down, &f_str->tp, m+1, y, ex, ey,
+ wgts, gop, gext,
+ dnap, pro, N2, f_str);
+
+ /*
+ Use this information for row m and m+1 to find the crossing
+ point of the best alignment with the middle row. The crossing
+ point is given by m1 and m2. Then we recursively call global
+ itself to compute alignments in two smaller regions found by
+ the crossing point and combine the two alignments to form a
+ whole alignment. Return that alignment.
+ */
+ if (find_best(f_str->up, f_str->down, &m1, &m2, ey-y+1, y, gop)) {
+ x1 = global(x, y, m, m1, wgts, gop, gext, dnap, pro, N1, 0, f_str);
+ x2 = global(m+1, m2, ex, ey, wgts, gop, gext, dnap, pro, 0, N2, f_str);
+ if (m1 == m2) x1 = combine(x1,x2,1);
+ else x1 = combine(x1, x2,0);
+ } else { /* best crossing came from the I scores: rows m and m+1 become two type-0 edges between the halves */
+ x1 = global(x, y, m-1, m1, wgts, gop, gext, dnap, pro, N1, 1, f_str);
+ x2 = global(m+2, m2, ex, ey, wgts, gop, gext, dnap, pro, 1, N2, f_str);
+ mm1 = (match_ptr) ckalloc(sizeof(match_node));
+ mm1->i = m; mm1->l = 0; mm1->j = m1;
+ mm2 = (match_ptr) ckalloc(sizeof(match_node));
+ mm2->i = m+1; mm2->l = 0; mm2->j = m1;
+ mm1->next = mm2; mm2->next = x2;
+ x1 = combine(x1, mm1, 0);
+ }
+ return x1;
+}
+
+/* find_best - choose the column at which the best alignment crosses
+ * between the row scored upwards (up) and the row scored downwards (down).
+ *
+ * up, down  the two rows; columns 1..ld-1 are examined
+ * m1, m2    out: j-1+y and j+y for the chosen column j
+ * y         offset added to the column to give a DNA position
+ * gop       added back to the sum of the two I scores
+ *
+ * Returns 1 if the best crossing uses the C scores, 0 if it uses the
+ * I scores.  st now starts at 1 so that the return value is defined even
+ * when no column beats the initial best of -1000 (or ld <= 1); in that
+ * case j stays 0 and the C-score answer is reported.
+ */
+static int
+find_best(st_ptr up, st_ptr down, int *m1, int *m2, int ld, int y, int gop) {
+
+  int i, best = -1000, j = 0, s2, s4, st = 1;
+
+  for (i = 1; i < ld; i++) {
+    s2 = up[i].C + down[i].C;
+    s4 = up[i].I + down[i].I + gop;
+    if (best < s2) {
+      best = s2; j = i; st = 1;
+    }
+    if (best < s4) {
+      best = s4; j = i; st = 0;
+    }
+  }
+  *m1 = j-1+y;
+  *m2 = j+y;
+  /*printf("score=%d\n", best);*/
+  return st;
+}
+
+/*
+ An alignment is represented as a linked list whose element
+ is of type match_node. Each element represent an edge in the
+ path of the alignment graph. The fields of match_node are
+ l --- gives the type of the edge.
+ i, j --- give the end position.
+*/
+
+static match_ptr
+small_global(int x, int y, int ex, int ey,
+ int **wgts, int gop, int gext,
+ unsigned char *dnap, unsigned char *pro,
+ int N1, int N2, struct f_struct *f_str) {
+ /* Full-matrix global alignment of pro[x..ex] with dnap[y..ey]; returns the
+    alignment as a linked list of match_node edges (caller frees).
+    Scores are kept in f_str->smgl_s.C, the move taken for each cell in
+    f_str->smgl_s.st, and the running gap rows in smgl_s.I / smgl_s.D.
+    N1 selects the initial value of I[0]; N2 may force the final cell's
+    move to 0.  wgts is not referenced in this body (weights come from
+    f_str->weight1). */
+
+ /* int C[SGW1+1][SGW2+1], st[SGW1+1][SGW2+1], D[SGW2+7], I[SGW2+1]; */
+ int i, j, e, sc, score, del, k, t, ci, cd;
+ int *cI, *cD, *cC, *lC, *cst, e2, e3, e4;
+ match_ptr mp, first;
+ struct wgt *wt, *ww;
+
+ /*printf("small_global %d %d %d %d\n", x, y, ex, ey);*/
+ sc = -gop-gext; f_str->smgl_s.C[0][0] = 0;
+
+ cI = f_str->smgl_s.I;
+ if (N1) cI[0] = -gext; else cI[0] = sc;
+
+ for (j = 1; j <= ey-y+1; j++) { /* boundary row: only columns that are a multiple of 3 are reachable */
+ if (j % 3== 0) {
+ f_str->smgl_s.C[0][j] = sc;
+ sc -= gext;
+ cI[j] = sc-gop;
+ }
+ else {
+ cI[j] = f_str->smgl_s.C[0][j] = -10000;
+ }
+ f_str->smgl_s.st[0][j] = 5;
+ }
+
+ lC = &f_str->smgl_s.C[0][0];
+ cD = f_str->smgl_s.D; cD[0] = cD[1] = cD[2] = -10000;
+ for (i = 1; i <= ex-x+1; i++) {
+ cC = &f_str->smgl_s.C[i][0];
+ wt = f_str->weight1[pro[i+x-1]]; cst = &f_str->smgl_s.st[i][0];
+ for (j = 0; j <=ey-y+1; j++) {
+ ci = cI[j];
+ cd= cD[j];
+ t = j+y;
+ ww = &wt[(unsigned char) dnap[t-3]];
+ if (j >= 4) {
+ sc = lC[j-3]+ww->iii; e2 = lC[j-2]+ww->ii;
+ e4 = lC[j-4]+ww->iv; del = 3;
+ if (e2 > sc) { sc = e2; del = 2;}
+ if (e4 >= sc) { sc = e4; del = 4;}
+ } else {
+ if (j == 3) {
+ sc = lC[0]+ww->iii; del =3;
+ } else if (j == 2) {
+ sc = lC[0]+ww->ii; del = 2;
+ } else {sc = -10000; del = 0;}
+ }
+ if (sc < ci) {
+ sc = ci; del = 0;
+ }
+ if (sc <= cd) {
+ sc = cd;
+ del = 5;
+ }
+ cC[j] = sc;
+ sc -= gop;
+ if (sc <= cd) { /* +10 in the stored move marks "D gap was extended here" */
+ del += 10;
+ cD[j+3] = cd - gext;
+ } else cD[j+3] = sc -gext;
+ if (sc < ci) { /* +20 marks "I gap was extended here" */
+ del += 20;
+ cI[j] = ci-gext;
+ } else cI[j] = sc-gext;
+ *(cst++) = del;
+ }
+ lC = cC;
+ }
+ /*printf("small global score =%d\n", f_str->smgl_s.C[ex-x+1][ey-y+1]);*/
+ if (N2 && cC[ey-y+1] < ci+gop) f_str->smgl_s.st[ex-x+1][ey-y+1] =0;
+ first = NULL; e = 1;
+ for (i = ex+1, j = ey+1; i > x || j > y; i--) { /* trace back from the last cell; each node is pushed on the front so the list ends up in forward order */
+ mp = (match_ptr) ckalloc(sizeof(match_node));
+ mp->i = i-1;
+ k = (t=f_str->smgl_s.st[i-x][j-y])%10;
+ mp->j = j-1;
+ if (e == 5 && (t/10)%2 == 1) k = 5; /* stay in a D gap while the extension flag is set */
+ if (e == 0 && (t/20)== 1) k = 0; /* likewise for an I gap */
+ if (k == 5) { j -= 3; i++; e=5;} /* type 5 consumes 3 columns and no row: i++ cancels the loop's i-- */
+ else {j -= k;if (k==0) e= 0; else e = 1;}
+ mp->l = k;
+ mp->next = first;
+ first = mp;
+ }
+
+ /* for (i = 0; i <= ex-x; i++) {
+ for (j = 0; j <= ey-y; j++)
+ printf("%d ", C[i][j]);
+ printf("\n");
+ }
+ */
+ return first;
+}
+
+#define XTERNAL
+#include "upam.h"
+
+/* display_alig - print an alignment to stdout, WIDTH columns per block.
+ *
+ * a       alignment array: a[0] and a[1] are the protein and DNA start
+ *         positions, a[2..length-1] are edge codes (0, 2, 3, 4 or 5)
+ * dna     DNA code series, indexed by the running DNA position
+ * pro     protein residues, indexed by the running protein position
+ * length  number of entries in a[]
+ * ld      not used
+ * f_str   supplies the weight_c table of display characters
+ *
+ * Three text rows are built in parallel (line1, line3, line2, printed in
+ * that order).  A frame-shift edge (2 or 4) occupies two columns, so a
+ * block can overshoot WIDTH by one column; the overshoot is carried over
+ * to the next block.
+ *
+ * Fixes relative to the earlier version: the character saved from
+ * line3[WIDTH] is now restored before the carry-over (it used to be lost,
+ * leaving the next line3 empty), and the carry-over uses memmove() because
+ * source and destination overlap.
+ */
+void
+display_alig(int *a, unsigned char *dna, unsigned char *pro,
+             int length, int ld, struct f_struct *f_str)
+{
+  int len = 0, i, j, x, y, k, iaa;
+  static char line1[100], line2[100], line3[100];
+  char c1, c2, c3;
+
+  line1[0] = line2[0] = line3[0] = '\0'; x= a[0]; y = a[1]-3;
+
+  printf("\n%5d\n%5d", y+3, x);
+  for (len = 0, j = 2; j < length; j++) {
+    i = a[j];
+    line3[len] = ' ';
+    switch (i) {
+    case 3:
+      y += 3;
+      line2[len] = NCBIstdaa[iaa=pro[x++]];
+      line1[len] = f_str->weight_c[iaa][(unsigned char) dna[y]].c5;
+      if (line1[len] != f_str->weight_c[iaa][(unsigned char) dna[y]].c3)
+        line3[len] = f_str->weight_c[iaa][(unsigned char) dna[y]].c3;
+      break;
+    case 2:
+      y += 2;
+      line1[len] = '\\';
+      line2[len++] = ' ';
+      line2[len] = NCBIstdaa[iaa=pro[x++]];
+      line1[len] = f_str->weight_c[iaa][(unsigned char) dna[y]].c2;
+      line3[len] = f_str->weight_c[iaa][(unsigned char) dna[y]].c3;
+      break;
+    case 4:
+      y += 4;
+      line1[len] = '/';
+      line2[len++] = ' ';
+      line2[len] = NCBIstdaa[iaa=pro[x++]];
+      line1[len] = f_str->weight_c[iaa][(unsigned char) dna[y]].c4;
+      line3[len] = f_str->weight_c[iaa][(unsigned char) dna[y]].c3;
+      break;
+    case 5:
+      y += 3;
+      line1[len] = f_str->weight_c[0][(unsigned char) dna[y]].c3;
+      line2[len] = '-';
+      break;
+    case 0:
+      line1[len] = '-';
+      line2[len] = NCBIstdaa[pro[x++]];
+      break;
+    }
+    len++;
+    line1[len] = line2[len] = line3[len] = '\0';
+    if (len >= WIDTH) {
+      for (k = 10; k <= WIDTH; k+=10)
+        printf(" . :");
+      if (k-5 < WIDTH) printf(" .");
+      /* cut the rows at WIDTH for printing, then put the cut characters back */
+      c1 = line1[WIDTH]; c2 = line2[WIDTH]; c3 = line3[WIDTH];
+      line1[WIDTH] = line2[WIDTH] = line3[WIDTH] = '\0';
+      printf("\n %s\n %s\n %s\n", line1, line3, line2);
+      line1[WIDTH] = c1; line2[WIDTH] = c2; line3[WIDTH] = c3;
+      /* carry the overshoot (and its terminating NUL) to the row start */
+      len = len - WIDTH;
+      memmove(line1, &line1[WIDTH], (size_t)len + 1);
+      memmove(line2, &line2[WIDTH], (size_t)len + 1);
+      memmove(line3, &line3[WIDTH], (size_t)len + 1);
+      printf("\n%5d\n%5d", y+3, x);
+    }
+  }
+  for (k = 10; k < len; k+=10)
+    printf(" . :");
+  if (k-5 < len) printf(" .");
+  printf("\n %s\n %s\n %s\n", line1, line3, line2);
+}
+
+
+/* The alignment array stores the operations that align the protein and DNA sequences.
+ The codes in the array are as follows:
+ 0: deletion of an amino acid.
+ 2: frame shift, 2 nucleotides match an amino acid
+ 3: match of an amino acid with a codon
+ 4: the other type of frame shift (4 nucleotides match an amino acid)
+ 5: deletion of a codon
+
+
+ Also, the first two elements of the array store the starting point
+ in the protein and DNA sequences of the local alignment.
+
+ Display looks like where WIDTH is assumed to be divisible by 10.
+
+ 0 . : . : . : . : . : . :
+ AACE/N\PLK\G\HK\Y/LWA\S\C\E/P\PRIRZ/G\HK\Y/LWA\S\C\E/P\PRIRZ
+ I S G S V F N R Q L A G S V F N R Q L A
+ AACE P P-- G HK Y TWA A C E P P---- G HK Y TWA A C E P P----
+
+ 60 . : . : . : . : . : . :
+ /G\HK\Y/LWA\S\C\E/P\PRIRZ/G\HK\Y/LWA\S\C\E/P\PRIRZ/G\HK\Y/LW
+ G S V F N R Q L A G S V F N R Q L A G S V F
+ G HK Y TWA A C E P P---- G HK Y TWA A C E P P---- G HK Y TW
+
+For a frame shift, the middle row shows the letter in the original sequence,
+and the letter in the top row is the amino acid that is chosen by the
+alignment (translated codon chosen from 4 nucleotides, or 2+1).
+*/
+
+/* fatal - write msg and a newline to stderr, then exit with status 1 */
+void
+fatal(msg)
+     char *msg;
+{
+  fputs(msg, stderr);
+  fputc('\n', stderr);
+  exit(1);
+}
+
+/* 10-Feb-2010 - fz_walign modified to ensure that the final alignment
+ overlaps the initial lz_band() region. In earlier versions, the
+ final alignment (using pam2p[0]) might have been outside the band
+ region */
+
+void
+fz_walign (const unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1,
+ int frame, int max_res,
+ struct pstruct *ppst,
+ struct f_struct *f_str,
+ struct a_res_str *a_res,
+ int score_thresh)
+{ /* fz_walign - score one query/library pair with do_fastz() and, if the
+   * score exceeds score_thresh, build the alignment with pro_dna() inside
+   * a window around the offset (hoff) reported by do_fastz().
+   *
+   * Results are written to a_res: rst by do_fastz(); sw_score, res/nres and
+   * min0/max0/min1/max1 by pro_dna(), the latter shifted back from window
+   * coordinates to full-sequence coordinates at the end.  When the score
+   * is not above the threshold, sw_score is set to 0, a_res->n1 to n1, and
+   * nothing else is done.
+   *
+   * Without TFAST the window is cut from aa1 and min0/max0 are shifted;
+   * with TFAST aa1 is translated first (frame selects forward or reverse)
+   * and min1/max1 are shifted.
+   */
+ int score;
+ int i, last_n1, itemp, n10;
+ int hoff, nt_min, nt_max, n_nt, n_aa, w_fact;
+ int l_min, l_max, window;
+ unsigned char *fs, *fd;
+ /*
+ unsigned char *aa1_min_s, aa1_max_s;
+ */
+ unsigned char *local_aa1;
+ int optflag_s;
+ int itx;
+ unsigned char *aa1x;
+ struct score_count_s s_info = {0,0,0,0};;
+
+#ifndef TFAST
+ do_fastz (f_str->aa0x, n0, aa1, n1, f_str->aa0v, ppst, f_str, &a_res->rst, &hoff, 1, &s_info);
+#else
+ /* make translated sequence */
+ last_n1 = 0;
+ aa1x = f_str->aa1x;
+ for (itx= frame*3; itx< frame*3+3; itx++) {
+ n10 = saatran(aa1,&aa1x[last_n1],n1,itx);
+ last_n1 += n10+1;
+ }
+ n10 = last_n1-1;
+
+ /* do_fastz (lz_band) also needs a pre-computed number series */
+ if (frame == 0) {
+ pre_com(aa1, n1, f_str->aa1v);
+ }
+ else {
+ pre_com_r(aa1, n1, f_str->aa1v);
+ }
+
+ do_fastz (aa0, n0, f_str->aa1x, n10, f_str->aa1v, ppst, f_str, &a_res->rst, &hoff, 1, &s_info);
+#endif
+
+ if (a_res->rst.score[ppst->score_ix] <= score_thresh) {
+ a_res->sw_score = 0;
+ a_res->n1 = n1;
+ return;
+ }
+
+#ifndef TFAST
+ window = min(n1, ppst->param_u.fa.optwid);
+ l_min = max(0, -window - hoff);
+ l_max = min(n1, n0-hoff+window);
+ local_aa1 = (unsigned char *)aa1;
+ if (l_min > 0 || l_max < n1 -1 ) {
+ local_aa1 = (unsigned char *)calloc(l_max - l_min + 2,sizeof(unsigned char)); /* NOTE(review): calloc() result is not checked */
+ local_aa1++; /* leaves one zero byte in front of the copy; the +2 also leaves one behind it */
+ memcpy(local_aa1,aa1+l_min, l_max - l_min);
+ }
+
+ /*
+ if (l_min > 0) {
+ aa1_min_s = aa1[l_min-1];
+ aa1[l_min-1] = '\0';
+ }
+ if (l_max < n1 - 1) {
+ aa1_max_s = aa1[l_max];
+ aa1[l_max] = '\0';
+ }
+ */
+#else
+ window = min(n0, ppst->param_u.fa.optwid);
+ if (frame==0) {
+ l_min = max(0,(hoff-window)*3);
+ l_max = min((hoff+window+n0)*3,n1);
+
+ local_aa1 = (unsigned char *)aa1;
+ if (l_min > 0 || l_max < n1 -1 ) {
+ local_aa1 = (unsigned char *)calloc(l_max - l_min + 2,sizeof(unsigned char));
+ local_aa1++;
+ memcpy(local_aa1,aa1+l_min, l_max - l_min);
+ }
+
+ /*
+ if (l_min > 0) {
+ aa1_min_s = aa1[l_min-1];
+ aa1[l_min-1] = '\0';
+ }
+ if (l_max < n1-1) {
+ aa1_max_s = aa1[l_max];
+ aa1[l_max] = '\0';
+ }
+ */
+ /* re-do precomputed codon number series for limited region */
+ pre_com(local_aa1, l_max - l_min, f_str->aa1v);
+ }
+ else {
+ /* things are more complicated here because the mapping of hoff is
+ with respect to the reversed aa1 */
+
+ l_max = n1 - max(0,(hoff-window)*3);
+ l_min = n1 - min((hoff+window+n0)*3,n1);
+
+ local_aa1 = (unsigned char *)aa1;
+ if (l_min > 0 || l_max < n1 -1 ) {
+ local_aa1 = (unsigned char *)calloc(l_max - l_min + 2,sizeof(unsigned char));
+ local_aa1++;
+ memcpy(local_aa1,aa1+l_min, l_max - l_min);
+ }
+
+ /*
+ if (l_min > 0) {
+ aa1_min_s = aa1[l_min-1];
+ aa1[l_min-1] = '\0';
+ }
+ if (l_max < n1-1) {
+ aa1_max_s = aa1[l_max];
+ aa1[l_max] = '\0';
+ }
+ */
+
+ pre_com_r(local_aa1, l_max - l_min, f_str->aa1v);
+ }
+#endif
+
+ a_res->sw_score =
+ pro_dna(
+#ifndef TFAST
+ aa1+l_min, l_max - l_min, /* note: the original aa1 is passed here, not the local_aa1 copy made above */
+ f_str->aa0v, n0-2,
+#else
+ aa0, n0,
+ f_str->aa1v, l_max - l_min-2,
+#endif
+ ppst->pam2[0],
+ -ppst->gdelval, -ppst->ggapval, -ppst->gshift,
+ f_str, f_str->max_res, a_res); /* note: f_str->max_res is used, not the max_res parameter */
+
+ if (l_min > 0 || l_max < n1 - 1) { free(--local_aa1); } /* same condition as the allocation; --local_aa1 undoes the earlier ++ */
+ /*
+ if (l_min > 0) {
+ aa1[l_min-1] = aa1_min_s;
+ }
+ if (l_max < n1 - 1) {
+ aa1[l_max] = aa1_max_s;
+ }
+ */
+#ifndef TFAST
+ a_res->min0 += l_min;
+ a_res->max0 += l_min;
+#else
+ if (frame==1) {
+ a_res->min1 += n1 - l_max;
+ a_res->max1 += n1 - l_max;
+ }
+ else {
+ a_res->min1 += l_min;
+ a_res->max1 += l_min;
+ }
+#endif
+
+ /* display_alig(f_str->res,f_str->aa0v+2,aa1,*nres,n0-2,f_str); */
+}
+
+/*
+ fz_malign is a recursive interface to fz_walign() that is called
+ from do_walign(). fz_malign() first does an alignment, then checks
+ to see if the score is greater than the threshold. If so, it tries
+ doing a left and right alignment.
+
+ In this implementation, the DNA sequence is preserved as DNA for
+ TFAST, so that it can be sub-setted and translated correctly. Thus,
+ the translation required for f_str->aa1x and f_str->aa1v is done at
+ each recursive level (in fz_walign).
+ */
+
+struct a_res_str *
+fz_malign (const unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1,
+ int frame,
+ int score_thresh, int max_res,
+ struct pstruct *ppst,
+ struct f_struct *f_str,
+ struct a_res_str *cur_ares,
+ int first_align)
+{
+ struct a_res_str *tmpl_ares, *tmpr_ares, *this_ares;
+ struct a_res_str *mtmpl_ares, *mtmpr_ares, *mt_next;
+ int sq_start, sq_end, sq_save;
+ int hoff, score_ix;
+ int min_alen;
+ struct rstruct rst;
+ unsigned char *local_aa1;
+ /* char save_res; */
+ int iphase, i;
+ unsigned char *fd;
+ int max_sub_score = -1;
+
+ score_ix = ppst->score_ix;
+
+#ifdef TFAST
+ min_alen = min(n0,MIN_LOCAL_LEN)*3; /* n0 in aa, min_alen in nt */
+#else
+ min_alen = min(n0/3,MIN_LOCAL_LEN); /* no in nt, min_alen in aa */
+#endif
+
+ /* now we need alignment storage - get it */
+ if ((cur_ares->res = (int *)calloc((size_t)max_res,sizeof(int)))==NULL) {
+ fprintf(stderr," *** cannot allocate alignment results array %d\n",max_res);
+ exit(1);
+ }
+
+ cur_ares->next = NULL;
+
+ fz_walign(aa0, n0, aa1, n1, frame, max_res, ppst, f_str, cur_ares, (first_align ? 1 : score_thresh));
+
+ /* in cur_ares, min0,max0 are always protein, min1,max1 are always
+ DNA, but n0 could be protein or DNA, depending on
+ FASTY/TFASTY */
+
+ if (!ppst->do_rep || cur_ares->rst.score[ppst->score_ix] <= score_thresh) {
+ return cur_ares;
+ }
+
+ /* have a score >= threshold - try left and right */
+
+ /* in code below, cur_ares->min0/max0 always refers to aa
+ cur_ares->min1/max1 always refers to nt
+
+ however, things are more complex because if frame==1, then
+ offsets are from the end (n1), not the beginning. There is no
+ frame==1 for fasty, only for TFASTY
+ */
+ cur_ares->v_start = sq_start = 0;
+#ifdef TFAST
+ if (frame == 0) {sq_end = cur_ares->min1-1;} /* aa1[sq_start --> sq_end] */
+ else {sq_end = n1 - cur_ares->max1;}
+ sq_save = sq_end;
+#else
+ sq_save = sq_end = cur_ares->min0;
+#endif
+ cur_ares->v_len = sq_end - sq_start;
+
+ if (cur_ares->v_len >= min_alen) { /* try the left */
+ /* allocate a_res */
+ tmpl_ares = (struct a_res_str *)calloc(1, sizeof(struct a_res_str));
+ local_aa1 = (unsigned char *)calloc(cur_ares->v_len+2,sizeof(unsigned char));
+ local_aa1++;
+ memcpy(local_aa1, aa1, cur_ares->v_len);
+
+ /*
+ save_res = aa1[sq_save];
+ aa1[sq_save] = '\0';
+ */
+ tmpl_ares = fz_malign(aa0, n0, local_aa1, cur_ares->v_len,
+ frame, score_thresh, max_res,
+ ppst, f_str, tmpl_ares,0);
+
+ free(--local_aa1);
+ /* aa1[sq_save] = save_res; */
+
+ if (tmpl_ares->rst.score[ppst->score_ix] > score_thresh) {
+ max_sub_score = tmpl_ares->rst.score[ppst->score_ix];
+#ifdef TFAST
+ if (frame == 1) {
+ for (this_ares = tmpl_ares; this_ares; this_ares = this_ares->next) {
+ this_ares->v_start += n1 - sq_end;
+ this_ares->min1 += n1 - sq_end;
+ this_ares->max1 += n1 - sq_end;
+ }
+ }
+#endif
+ }
+ else {
+ if (tmpl_ares->res) free(tmpl_ares->res);
+ free(tmpl_ares);
+ tmpl_ares=NULL;
+ }
+ }
+ else {tmpl_ares = NULL;}
+
+ /* now the right end */
+ /* for fasty -- max positions refer to the aa,codon, not the next
+ residue, so they must be incremented */
+
+ sq_end = n1;
+#if TFAST
+ if (frame == 0) {sq_start = cur_ares->max1+1;}
+ else {sq_start = n1 - cur_ares->min1;}
+#else
+ sq_start = cur_ares->max0+1;
+#endif
+ sq_save = sq_start-1;
+ cur_ares->v_len = sq_end - sq_start;
+
+ if (cur_ares->v_len >= min_alen) { /* try the right */
+ /* allocate a_res */
+ tmpr_ares = (struct a_res_str *)calloc(1, sizeof(struct a_res_str));
+
+ /* find boundaries */
+ local_aa1 = (unsigned char *)calloc(cur_ares->v_len+2,sizeof(unsigned char));
+ local_aa1++;
+ memcpy(local_aa1,aa1+sq_start,cur_ares->v_len);
+ /*
+ save_res = aa1[sq_save];
+ aa1[sq_save] = '\0';
+ */
+
+ tmpr_ares = fz_malign(aa0, n0,
+ local_aa1, cur_ares->v_len,
+ frame,
+ score_thresh, max_res,
+ ppst, f_str, tmpr_ares,0);
+ free(--local_aa1);
+ /*
+ aa1[sq_save] = save_res;
+ */
+
+ if (tmpr_ares->rst.score[ppst->score_ix] > score_thresh) {
+ /* adjust the left boundary */
+ for (this_ares = tmpr_ares; this_ares; this_ares = this_ares->next) {
+#ifndef TFAST
+ this_ares->min0 += sq_start;
+ this_ares->max0 += sq_start;
+#else
+ if (frame == 0) {
+ this_ares->v_start += sq_start;
+ this_ares->min1 += sq_start;
+ this_ares->max1 += sq_start;
+ }
+#endif
+ }
+ if (tmpr_ares->rst.score[ppst->score_ix] > max_sub_score) {
+ max_sub_score = tmpr_ares->rst.score[ppst->score_ix];
+ }
+ }
+ else {
+ if (tmpr_ares->res) free(tmpr_ares->res);
+ free(tmpr_ares);
+ tmpr_ares=NULL;
+ }
+ }
+ else {tmpr_ares = NULL;}
+
+ if (max_sub_score <= score_thresh) return cur_ares;
+
+ cur_ares = merge_ares_chains(cur_ares, tmpl_ares, score_ix, "left");
+ cur_ares = merge_ares_chains(cur_ares, tmpr_ares, score_ix, "right");
+
+ return cur_ares;
+}
+
+/* do_walign() can be called with aa0,n0 as nt (FASTY) or
+   aa0,n0 as aa (TFASTY).  If aa0 is nt, then f_str->aa0x,y have the
+   translations already.  If aa0 is aa, then f_str->aa1x,y must be
+   generated.
+
+   Allocates the head a_res_str, runs fz_malign() with
+   use_E_thresholds forced off, optflag forced on and (unless
+   optwid_set) optwid doubled, then restores those three parameters.
+   The returned chain is numbered 0,1,2,... in ->index.
+
+   *have_ares is set to 0x3 before anything else is done.
+   Returns the chain produced by fz_malign(), or NULL if the head
+   a_res_str cannot be allocated.
+*/
+
+struct a_res_str *
+do_walign (const unsigned char *aa0, int n0,
+	   const unsigned char *aa1, int n1,
+	   int frame, int repeat_thresh,
+	   struct pstruct *ppst,
+	   struct f_struct *f_str,
+	   int *have_ares)
+{
+  struct a_res_str *a_res, *tmp_a_res;
+  int a_res_index;
+  int use_E_thresholds_s, optflag_s, optwid_s;	/* saved search parameters */
+#ifdef DEBUG
+  unsigned long adler32_crc;
+#endif
+
+  *have_ares = 0x3;	/* set 0x2 bit to indicate local copy */
+
+  if ((a_res = (struct a_res_str *)calloc(1, sizeof(struct a_res_str)))==NULL) {
+    fprintf(stderr," [do_walign] Cannot allocate a_res\n");
+    return NULL;
+  }
+
+#ifdef DEBUG
+  /* checksum aa1 so that any modification during alignment is reported */
+  adler32_crc = adler32(1L,aa1,n1);
+#endif
+
+  f_str->frame = frame;	/* need frame for later pre_cons() in calcons() */
+
+  /* save the parameters that are overridden for the alignment pass */
+  use_E_thresholds_s = ppst->param_u.fa.use_E_thresholds;
+  optflag_s = ppst->param_u.fa.optflag;
+  optwid_s = ppst->param_u.fa.optwid;
+
+  ppst->param_u.fa.use_E_thresholds = 0;
+  ppst->param_u.fa.optflag = 1;
+  if (!ppst->param_u.fa.optwid_set) {
+    ppst->param_u.fa.optwid *= 2;
+  }
+
+  a_res = fz_malign(aa0, n0, aa1, n1, frame,
+		    repeat_thresh, f_str->max_res,
+		    ppst, f_str, a_res, 1);
+
+#ifdef DEBUG
+  if (adler32(1L,aa1,n1) != adler32_crc) {
+    fprintf(stderr,"*** error [%s:%d] adler32_crc mismatch n1: %d\n",__FILE__, __LINE__, n1);
+  }
+#endif
+
+  /* number the alignments in chain order */
+  a_res_index = 0;
+  for (tmp_a_res=a_res; tmp_a_res; tmp_a_res = tmp_a_res->next) {
+    tmp_a_res->index = a_res_index++;
+  }
+
+  /* put the caller's parameters back */
+  ppst->param_u.fa.use_E_thresholds = use_E_thresholds_s;
+  ppst->param_u.fa.optflag = optflag_s;
+  ppst->param_u.fa.optwid = optwid_s;
+  return a_res;
+}
+
+/* pre_cons() - when built with TFAST, fill f_str->aa1v from aa1[0..n1)
+   using pre_com() for frame 0 and pre_com_r() for any other frame.
+   Without TFAST the function body is empty. */
+void
+pre_cons(const unsigned char *aa1, int n1, int frame, struct f_struct *f_str) {
+
+#ifdef TFAST
+  /* make a precomputed codon number series */
+  if (frame==0) {
+    pre_com(aa1, n1, f_str->aa1v);
+  }
+  else { /* must do things backwards */
+    pre_com_r(aa1, n1, f_str->aa1v);
+  }
+#endif
+}
+
+/* aln_func_vals - fill in the qlfact, qlrev, llfact, llmult, frame and
+   llrev members of *aln for the given frame */
+/* called from calcons, calc_id, calc_code */
+void
+aln_func_vals(int frame, struct a_struct *aln) {
+  /* a positive frame marks the reversed sequence */
+  int is_rev = (frame > 0) ? 1 : 0;
+
+  aln->frame = frame;
+  aln->llmult = 1;
+
+#ifdef TFAST	/* TFASTX: the library side carries the factor of 3 */
+  aln->qlfact = 1;
+  aln->llfact = 3;
+  aln->qlrev = 0;
+  aln->llrev = is_rev;
+#else		/* FASTX: the query side carries the factor of 3 */
+  aln->qlfact = 3;
+  aln->llfact = 1;
+  aln->qlrev = is_rev;
+  aln->llrev = 0;
+#endif
+}
+
+#include "structs.h"
+#include "a_mark.h"
+
+extern int align_type(int score, char sp0, char sp1, int nt_align, struct a_struct *aln, int pam_x_id_sim);
+
+extern void
+process_annot_match(int *itmp, int *pam2aa0v,
+ long ip, long ia, char *sp1, char *sp1a, const unsigned char *sq,
+ struct annot_entry *annot_arr_p, int n_annots, char **ann_comment,
+ void *annot_stack, int *have_push_features, int *v_delta,
+ int *d_score_p, int *d_ident_p, int *d_alen_p, int *d_gaplen_p,
+ struct domfeat_data **left_domain_head_p,
+ struct domfeat_data *left_domain_p,
+ long *left_end_p, int init_score);
+
+extern int
+next_annot_match(int *itmp, int *pam2aa0v,
+ long ip, long ia, char *sp1, char *sp1a, const unsigned char *sq,
+ int i_annot, int n_annot, struct annot_entry **annot_arr, char **ann_comment,
+ void *annot_stack, int *have_push_features, int *v_delta,
+ int *d_score_p, int *d_ident_p, int *d_alen_p, int *d_gaplen_p,
+ struct domfeat_data **left_domain_head_p,
+ struct domfeat_data *left_domain_p,
+ long *left_domain_end, int init_score);
+
+extern void
+close_annot_match (int ia, void *annot_stack, int *have_push_features,
+ int *d_score_p, int *d_ident_p, int *d_alen_p, int *d_gaplen_p,
+ struct domfeat_data **left_domain_p,
+ long *left_end_p, int init_score);
+
+extern void
+comment_var(long i0, char sp0, long i1, char sp1, char o_sp1, char sim_char,
+ const char *ann_comment, struct dyn_string_str *annot_var_dyn,
+ int target, int d_type);
+
+void
+display_push_features(void *annot_stack, struct dyn_string_str *annot_var_dyn,
+ long i0_pos, char sp0, long i1_pos, char sp1, char sym,
+ int score, double comp, int sw_score, int n0, int n1,
+ void *pstat_void, int d_type);
+
+#define DP_FULL_FMT 1 /* Region: score: bits: id: ... */
+#define Q_TARGET 0
+#define L_TARGET 1
+
+int seq_pos(int pos, int rev,int off);
+
+/* values of calc_func_mode */
+#define CALC_CONS 1
+#define CALC_CODE 2
+#define CALC_ID 3
+#define CALC_ID_DOM 4
+
+/* add_annot_code: append one annotation code to ann_code_dyn.
+
+   Nothing is appended unless have_ann is non-zero and ann_aa1_i1 is
+   not ' '; the '[' and ']' annotation characters are skipped as well.
+   Otherwise the appended text is
+     |X<ann_aa1_i1>:<q_off_pos+1><sp0><sim_sym_code><l_off_pos+1><sp1>
+*/
+void
+add_annot_code(int have_ann, char sp0, char sp1,
+	       char ann_aa1_i1,
+	       long q_off_pos, long l_off_pos, char sim_sym_code,
+	       struct dyn_string_str *ann_code_dyn)
+{
+  char code_buf[MAX_STR];
+
+  /* no annotation at this position */
+  if (!have_ann || ann_aa1_i1 == ' ') { return; }
+
+  /* bracket annotations produce no code */
+  if (ann_aa1_i1 == '[' || ann_aa1_i1 == ']') { return; }
+
+  sprintf(code_buf, "|%c%c:%ld%c%c%ld%c",
+	  'X', ann_aa1_i1, q_off_pos+1, sp0,
+	  sim_sym_code, l_off_pos+1, sp1);
+  dyn_strcat(ann_code_dyn, code_buf);
+}
+
+/* universal alignment code builder for calc_cons_a(), calc_code(), and calc_id()
+
+   Walks the op codes in a_res->res[0..a_res->nres):
+     3 - match; 2 and 4 - frameshift followed by a match;
+     5 - insertion in 0; 0 - insertion in 1.
+
+   calc_func_mode selects where the per-column characters go:
+     CALC_CONS            - into seqc0/seqc1/seqca (and seqc0a/seqc1a),
+                            advancing the pointers for every column;
+     CALC_ID, CALC_ID_DOM - into single-character scratch variables;
+     CALC_CODE            - into scratch variables, with every column
+                            also passed to update_code() to build
+                            align_code_dyn.
+   Any other mode prints an error and calls exit(1).
+
+   On return: aln->amin0/amin1/smin0/smin1, amax0/amax1, ngap_q,
+   ngap_l, nfs and calc_last_set are set; *score_delta = v_delta;
+   *nc = number of columns counted in lenc; the return value is
+   lenc + not_c (not_c counts the extra frameshift columns).
+*/
+/* see cal_cons2.c/calc_cons_u() for strategy */
+
+int
+calc_cons_u( /* inputs */
+ const unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1,
+ struct a_res_str *a_res, /* alignment encoding */
+ struct pstruct *ppst,
+ struct f_struct *f_str,
+ void *pstat_void,
+ /* annotation stuff */
+ const unsigned char *ann_arr,
+ const unsigned char *aa0a, const struct annot_str *annot0_p,
+ const unsigned char *aa1a, const struct annot_str *annot1_p,
+ int calc_func_mode, /* CALC_CONS, CALC_CODE, CALC_ID */
+ int display_code, /* used only by CALC_CODE */
+ /* outputs */
+ int *nc,
+ char *seqc0, char *seqc1, char *seqca, int *cumm_seq_score,
+ char *seqc0a, char *seqc1a,
+ struct a_struct *aln,
+ int *score_delta,
+ struct dyn_string_str *annot_var_dyn,
+ struct dyn_string_str *align_code_dyn)
+{
+ int i0, i1, i, j;
+ int lenc, not_c, itmp, ngap_p, ngap_d, nfs; /* lenc: columns (all op codes); not_c: extra frameshift columns (op 2,4); ngap_p: op 5 count; ngap_d: op 0 count; nfs: frameshift count */
+ int *i_spa;
+ char *sp0_p, *sp0a_p, *sp1_p, *sp1a_p, *spa_p, t_spa;
+ char sp0_c, sp1_c, spa_c; /* used for CALC_ID, CALC_CODE */
+ char sp0a_c, sp1a_c; /* used for CALC_CODE */
+
+ struct update_code_str *update_data_p;
+
+ const unsigned char *sq;
+ unsigned char aap;
+
+ const unsigned char *ap0, *ap1;
+ const unsigned char *ap1a; /* ap1 always points to protein, and
+ only protein has annotations */
+ const struct annot_str *annotp_p; /* protein annotations from annot_str */
+ int comment_target;
+
+ int *rp, *rpmax;
+ int have_ann;
+
+ /* variables for variant changes/region scores */
+ char tmp_str[MAX_LSTR];
+ void *annot_stack;
+ int have_push_features, prev_match, *have_push_features_p;
+
+ char *sim_sym = aln_map_sym[MX_ACC];
+ struct annot_entry **s_annotp_arr_p;
+ int i1_annot, v_delta, v_tmp;
+ long i0_offset, i1_offset;
+
+ long i1_left_end;
+ int show_code, annot_fmt, start_flag;
+
+ int d1_score, d1_ident, d1_alen, d1_gaplen;
+ struct domfeat_data *left_domain_list1, *left_domain_head1;
+
+ char *ann_comment;
+
+ *score_delta = 0;
+ d1_score = d1_ident = d1_alen = d1_gaplen = 0;
+ i1_left_end = -1;
+ left_domain_head1 = left_domain_list1 = NULL;
+
+ NULL_dyn_string(annot_var_dyn);
+
+ if (ppst->ext_sq_set) {sq = ppst->sqx;}
+ else {sq = ppst->sq;}
+
+#ifndef TFAST /* FASTX */
+ comment_target = 1;
+ aln->amin1 = aln->smin1 = a_res->min0; /* prot */
+ aln->amin0 = aln->smin0 = a_res->min1; /* DNA */
+
+ i0_offset = aln->q_offset;
+ i1_offset = aln->l_offset;
+
+ ap0 = f_str->aa0v; /* translated DNA */
+ ap1 = aa1; /* protein */
+
+ ap1a = aa1a;
+ annotp_p = annot1_p;
+
+ if (calc_func_mode == CALC_CONS) {
+ have_ann = (seqc0a !=NULL && aa1a != NULL);
+ sp0_p = seqc0; /* translated DNA */
+ sp1_p = seqc1; /* protein */
+ spa_p = seqca;
+ sp1a_p = seqc1a; /* protein library can have annotation */
+ sp0a_p = seqc0a; /* sp0a is always ' ' - no translated annotation */
+ annot_fmt = DP_FULL_FMT;
+ }
+ else if (calc_func_mode == CALC_ID || calc_func_mode == CALC_ID_DOM) {
+ /* does not require aa0a/aa1a, only for variants */
+ have_ann = ((annotp_p && annotp_p->n_annot > 0) || (annot0_p && annot0_p->n_annot > 0));
+ sp0_p = &sp0_c;
+ sp1_p = &sp1_c;
+ spa_p = &spa_c;
+ sp0a_p = &sp0a_c;
+ sp1a_p = &sp1a_c;
+ annot_fmt = 3;
+ }
+ else if (calc_func_mode == CALC_CODE) {
+ spa_p = &spa_c;
+ sp0_p = &sp0_c;
+ sp1_p = &sp1_c;
+ sp0a_p = &sp0a_c;
+ sp1a_p = &sp1a_c;
+
+ show_code = (display_code & (SHOW_CODE_MASK+SHOW_CODE_EXT)); /* see defs.h; SHOW_CODE_ALIGN=2,_CIGAR=3,_CIGAR_EXT=4 */
+ annot_fmt = 2;
+ if (display_code & SHOW_ANNOT_FULL) {
+ annot_fmt = 1;
+ }
+ /* have_ann encodes number of sequences annotated */
+ have_ann = 0;
+ if ((annotp_p && annotp_p->n_annot > 0) || (aa1a != NULL)) { have_ann |= 2;}
+ update_data_p = init_update_data(show_code); /* NOTE(review): init_update_data() returns NULL on allocation failure and the result is not checked here - confirm update_code() tolerates NULL */
+ }
+ else {
+ fprintf(stderr,"*** error [%s:%d] --- cal_cons_u() invalid calc_func_mode: %d\n",
+ __FILE__, __LINE__, calc_func_mode);
+ exit(1);
+ }
+
+#else /* TFASTX */
+ comment_target = 0;
+ aln->amin0 = aln->smin0 = a_res->min0; /* DNA */
+ aln->amin1 = aln->smin1 = a_res->min1; /* prot */
+
+ i1_offset = aln->q_offset;
+ i0_offset = aln->l_offset;
+
+ ap1 = aa0; /* aa0 is protein */
+ /* with fx_malign(), there is no guarantee that we have a valid f_str->aa1y, so make one */
+ pre_cons(aa1,n1,aln->frame, f_str);
+ ap0 = f_str->aa1v; /* aa1 is DNA */
+ ap1a = aa0a;
+ annotp_p = annot0_p;
+
+ have_ann = (seqc0a !=NULL && aa0a != NULL);
+ if (calc_func_mode == CALC_CONS) {
+ sp1_p = seqc0; /* sp1 points to protein query */
+ sp0_p = seqc1; /* sp0 points to DNA */
+ spa_p = seqca;
+ sp1a_p = seqc0a; /* protein query can have annotation */
+ sp0a_p = seqc1a; /* sp0a is always ' ' - no translated annotation */
+ annot_fmt = DP_FULL_FMT;
+ }
+ else if (calc_func_mode == CALC_ID || calc_func_mode == CALC_ID_DOM) {
+ have_ann = (annotp_p && annotp_p->n_annot > 0);
+ spa_p = &spa_c;
+ sp0_p = &sp0_c;
+ sp1_p = &sp1_c;
+
+ sp0a_p = &sp0a_c;
+ sp1a_p = &sp1a_c;
+ annot_fmt = 3;
+
+ /* does not require aa0a/aa1a, only for variants */
+ }
+ else if (calc_func_mode == CALC_CODE) {
+ spa_p = &spa_c;
+ sp0_p = &sp0_c;
+ sp1_p = &sp1_c;
+
+ sp0a_p = &sp0a_c;
+ sp1a_p = &sp1a_c;
+
+ show_code = (display_code & (SHOW_CODE_MASK+SHOW_CODE_EXT)); /* see defs.h; SHOW_CODE_ALIGN=2,_CIGAR=3,_CIGAR_EXT=4 */
+ annot_fmt = 2;
+ if (display_code & SHOW_ANNOT_FULL) {
+ annot_fmt = 1;
+ }
+
+ /* have_ann encodes number of sequences annotated */
+ if ((annotp_p && annotp_p->n_annot > 0) || (ap1a != NULL)) { have_ann |= 1;}
+
+ update_data_p = init_update_data(show_code); /* NOTE(review): init_update_data() returns NULL on allocation failure and the result is not checked here - confirm update_code() tolerates NULL */
+ }
+ else {
+ fprintf(stderr,"*** error [%s:%d] --- cal_cons_u() invalid calc_func_mode: %d\n",
+ __FILE__, __LINE__, calc_func_mode);
+ exit(1);
+ }
+#endif
+ if (cumm_seq_score) i_spa = cumm_seq_score; /* i_spa walks the optional per-column score array */
+
+ rp = a_res->res;
+ rpmax = &a_res->res[a_res->nres];
+
+ lenc = not_c = aln->nident = aln->nmismatch = aln->nsim = aln->npos = ngap_p = ngap_d = nfs= 0;
+ i0 = a_res->min1 - 3; /* i0 is advanced (by 3, 2 or 4) before ap0[i0] is read in the loop below */
+ i1 = a_res->min0;
+
+ v_delta = 0;
+ i1_annot = 0;
+ annot_stack = NULL;
+ s_annotp_arr_p = NULL;
+ have_push_features = prev_match = 0;
+ if (have_ann) {
+ have_push_features_p = &have_push_features;
+
+ if (annotp_p && annotp_p->n_annot > 0) {
+ annot_stack = init_stack(64,64);
+ left_domain_list1=init_domfeat_data(annotp_p);
+
+ s_annotp_arr_p = annotp_p->s_annot_arr_p;
+
+ while (i1_annot < annotp_p->n_annot) { /* skip annotations that end at or before the alignment start; '-' entries spanning the start are processed */
+ if (s_annotp_arr_p[i1_annot]->pos >= i1+i1_offset) {break;}
+ if (s_annotp_arr_p[i1_annot]->end <= i1+i1_offset) {i1_annot++; continue;}
+
+ if (s_annotp_arr_p[i1_annot]->label == '-') {
+ process_annot_match(&itmp, NULL, i1_offset+seq_pos(i1,aln->llrev,0), i0_offset + seq_pos(i0,aln->qlrev,0),
+ sp1_p, sp1a_p, sq, s_annotp_arr_p[i1_annot], annotp_p->n_annot, &ann_comment,
+ annot_stack, have_push_features_p, &v_delta,
+ &d1_score, &d1_ident, &d1_alen, &d1_gaplen,
+ &left_domain_head1, &left_domain_list1[i1_annot], &i1_left_end, 0);
+ }
+ i1_annot++;
+ }
+ }
+ }
+
+ while (rp < rpmax) {
+ /* fprintf(stderr,"%d %d %d (%c) %d (%c)\n"
+ ,(int)(rp-res),*rp,i0,sq[ap0[i0]],i1,sq[ap1[i1]]);
+ */
+
+ switch (*rp++) {
+ case 3: /* match */
+ i0 += 3;
+
+ *sp1_p = sq[aap=ap1[i1]];
+ *sp0_p = f_str->weight_c[aap][ap0[i0]].c5;
+ itmp = ppst->pam2[0][aap][pascii[*sp0_p]];
+
+
+ if (have_ann) {
+ if (calc_func_mode != CALC_ID && calc_func_mode != CALC_ID_DOM) {
+ *sp1a_p = ann_arr[ap1a[i1]];
+ *sp0a_p = ' ';
+ }
+ if (s_annotp_arr_p) {
+ if (i1+i1_offset == s_annotp_arr_p[i1_annot]->pos || i1+i1_offset == i1_left_end) { /* NOTE(review): i1_annot is not compared with annotp_p->n_annot before indexing - confirm the array is terminated or next_annot_match() keeps it in range */
+ i1_annot = next_annot_match(&itmp, ppst->pam2[0][pascii[*sp0_p]],
+#ifndef TFAST
+ i1_offset+seq_pos(i1,aln->llrev,0), /* annotated target (prot) coordinate */
+ i0_offset+seq_pos(i0,aln->qlrev,0),
+#else
+ i1_offset+seq_pos(i1,aln->qlrev,0),
+ i0_offset+seq_pos(i0,aln->llrev,0),
+#endif
+ sp1_p, sp1a_p, sq,
+ i1_annot, annotp_p->n_annot, s_annotp_arr_p,
+ &ann_comment, annot_stack, have_push_features_p, &v_delta,
+ &d1_score, &d1_ident, &d1_alen, &d1_gaplen,
+ &left_domain_head1, left_domain_list1, &i1_left_end,
+ 0);
+
+ /* must be out of the loop to capture the last value */
+ if (ppst->sq[ap1[i1]] != *sp1_p) {
+ t_spa = align_type(itmp, *sp0_p, *sp1_p, 0, NULL, ppst->pam_x_id_sim);
+
+ if (calc_func_mode != CALC_ID && calc_func_mode != CALC_ID_DOM) {
+ comment_var(
+#ifndef TFAST
+ i0_offset+seq_pos(i0,aln->qlrev,0), *sp0_p,
+ i1_offset+seq_pos(i1,aln->llrev,0), *sp1_p,
+#else
+ i1_offset+seq_pos(i1,aln->qlrev,0), *sp0_p,
+ i0_offset+seq_pos(i0,aln->llrev,0), *sp1_p,
+#endif
+ sq[ap1[i1]], sim_sym[t_spa], ann_comment,
+ annot_var_dyn,comment_target,annot_fmt);
+ }
+ else {
+ sprintf(tmp_str,"%c%d%c;",ppst->sq[ap1[i1]],i1+1,*sp1_p);
+ /* SAFE_STRNCAT(annot_var_s,tmp_str,n_annot_var_s); */
+ dyn_strcat(annot_var_dyn, tmp_str);
+ }
+ }
+ }
+ prev_match = 1;
+ d1_score += itmp;
+ }
+ if (calc_func_mode == CALC_CONS) {sp0a_p++; sp1a_p++;}
+ }
+
+ *spa_p = align_type(itmp, *sp0_p, *sp1_p, 0, aln, ppst->pam_x_id_sim);
+ d1_alen++;
+ if (*spa_p == M_IDENT) {d1_ident++;}
+
+ if (cumm_seq_score) *i_spa++ = itmp;
+
+ if (calc_func_mode == CALC_CODE) {
+ update_code(align_code_dyn, update_data_p, 3, *spa_p, *sp0_p, *sp1_p);
+
+ if (have_ann && have_push_features) {
+ add_annot_code(have_ann, *sp0_p, *sp1_p, *sp1a_p,
+#ifndef TFAST
+ i0_offset+seq_pos(i0,aln->qlrev,0),
+ i1_offset+seq_pos(i1,aln->llrev,0),
+#else
+ i1_offset+seq_pos(i1,aln->qlrev,0),
+ i0_offset+seq_pos(i0,aln->llrev,0),
+#endif
+ sim_sym[*spa_p], annot_var_dyn);
+ }
+ }
+
+ if (have_ann && have_push_features && calc_func_mode != CALC_ID) {
+ display_push_features(annot_stack, annot_var_dyn,
+#ifndef TFAST
+ i0_offset+seq_pos(i0,aln->qlrev,0), *sp0_p,
+ i1_offset+seq_pos(i1,aln->llrev,0), *sp1_p,
+#else
+ i1_offset+seq_pos(i1,aln->qlrev,0), *sp0_p,
+ i0_offset+seq_pos(i0,aln->llrev,0), *sp1_p,
+#endif
+ sim_sym[*spa_p],
+ a_res->rst.score[ppst->score_ix], a_res->rst.comp, a_res->sw_score,
+ n0, n1, pstat_void, annot_fmt);
+ have_push_features = 0;
+ }
+
+ i1++;
+ if (calc_func_mode == CALC_CONS) {sp0_p++; sp1_p++; spa_p++;}
+ lenc++;
+ break;
+ case 2: /* frame shift +2, then match */
+ nfs++;
+ i0 += 2;
+ *sp0_p = '/';
+ *sp1_p = '-';
+ *spa_p = M_DEL;
+
+ if (calc_func_mode == CALC_CODE) {
+ update_code(align_code_dyn, update_data_p, 2, *spa_p,*sp0_p,*sp1_p);
+ }
+
+ if (cumm_seq_score) *i_spa++ = ppst->gshift;
+
+ if (calc_func_mode == CALC_CONS) {
+ sp0_p++; sp1_p++; spa_p++;
+ if (have_ann) {*sp0a_p++ = *sp1a_p++ = ' ';}
+ }
+
+ not_c++;
+
+ *sp1_p = sq[aap=ap1[i1]];
+ *sp0_p = f_str->weight_c[aap][ap0[i0]].c2;
+ itmp = ppst->pam2[0][pascii[*sp0_p]][aap];
+
+ if (cumm_seq_score) *i_spa++ = ppst->gshift; /* NOTE(review): gshift is stored a second time here and itmp a third slot below, while case 4 stores gshift only once - confirm intended */
+
+ if (have_ann) {
+ have_push_features = 0;
+ if (calc_func_mode != CALC_ID && calc_func_mode != CALC_ID_DOM) {
+ *sp1a_p = ann_arr[ap1a[i1]];
+ *sp0a_p = ' ';
+ }
+ if (s_annotp_arr_p) {
+ if (i1 + i1_offset == s_annotp_arr_p[i1_annot]->pos || i1+i1_offset == i1_left_end) {
+ i1_annot = next_annot_match(&itmp, ppst->pam2[0][pascii[*sp0_p]],
+#ifndef TFAST
+ i1_offset+seq_pos(i1,aln->llrev,0), /* annotated target (prot) coordinate */
+ i0_offset+seq_pos(i0,aln->qlrev,0),
+#else
+ i1_offset+seq_pos(i1,aln->qlrev,0),
+ i0_offset+seq_pos(i0,aln->llrev,0),
+#endif
+ sp1_p, sp1a_p, sq,
+ i1_annot, annotp_p->n_annot, s_annotp_arr_p,
+ &ann_comment, annot_stack, have_push_features_p, &v_delta,
+ &d1_score, &d1_ident, &d1_alen, &d1_gaplen,
+ &left_domain_head1, left_domain_list1, &i1_left_end,0);
+
+ /* must be out of the loop to capture the last value */
+ if (ppst->sq[ap1[i1]] != *sp1_p) {
+ t_spa = align_type(itmp, *sp0_p, *sp1_p, 0, NULL, ppst->pam_x_id_sim);
+
+ if (calc_func_mode != CALC_ID && calc_func_mode != CALC_ID_DOM) {
+ comment_var(
+#ifndef TFAST
+ i0_offset+seq_pos(i0,aln->qlrev,0), *sp0_p,
+ i1_offset+seq_pos(i1,aln->llrev,0), *sp1_p,
+#else
+ i1_offset+seq_pos(i1,aln->qlrev,0), *sp0_p,
+ i0_offset+seq_pos(i0,aln->llrev,0), *sp1_p,
+#endif
+ sq[ap1[i1]], sim_sym[t_spa], ann_comment,
+ annot_var_dyn,comment_target,annot_fmt);
+ }
+ else {
+ sprintf(tmp_str,"%c%d%c;",sq[ap1[i1]],i1+1,*sp1_p);
+ /* SAFE_STRNCAT(annot_var_s,tmp_str,n_annot_var_s); */
+ dyn_strcat(annot_var_dyn, tmp_str);
+ }
+ }
+ }
+ d1_score += ppst->gshift;
+ d1_score += itmp;
+ prev_match = 1;
+ }
+ if (calc_func_mode == CALC_CONS) {sp0a_p++; sp1a_p++;}
+ }
+
+ *spa_p = align_type(itmp, *sp0_p, *sp1_p, 0, aln, ppst->pam_x_id_sim);
+
+ if (calc_func_mode == CALC_CODE) {
+ update_code(align_code_dyn, update_data_p, 3, *spa_p,*sp0_p,*sp1_p);
+ }
+
+ d1_alen++;
+ if (*spa_p == M_IDENT) {d1_ident++;}
+
+ if (have_ann && have_push_features && calc_func_mode != CALC_ID) {
+ if (calc_func_mode == CALC_CODE) {
+ add_annot_code(have_ann, *sp0_p, *sp1_p, *sp1a_p,
+#ifndef TFAST
+ i0_offset+seq_pos(i0,aln->qlrev,0),
+ i1_offset+seq_pos(i1,aln->llrev,0),
+#else
+ i1_offset+seq_pos(i1,aln->qlrev,0),
+ i0_offset+seq_pos(i0,aln->llrev,0),
+#endif
+ sim_sym[*spa_p], annot_var_dyn);
+ }
+ display_push_features(annot_stack, annot_var_dyn,
+#ifndef TFAST
+ i0_offset+seq_pos(i0,aln->qlrev,0), *sp0_p,
+ i1_offset+seq_pos(i1,aln->llrev,0), *sp1_p,
+#else
+ i1_offset+seq_pos(i1,aln->qlrev,0), *sp0_p,
+ i0_offset+seq_pos(i0,aln->llrev,0), *sp1_p,
+#endif
+ sim_sym[*spa_p],
+ a_res->rst.score[ppst->score_ix], a_res->rst.comp, a_res->sw_score,
+ n0, n1, pstat_void, annot_fmt);
+ have_push_features = 0;
+ }
+
+ if (cumm_seq_score) *i_spa++ = itmp;
+
+ i1++;
+ if (calc_func_mode == CALC_CONS) {sp0_p++; sp1_p++; spa_p++;}
+ lenc++;
+ break;
+ case 4: /* frame shift, -1, then match */
+ nfs++;
+ i0 += 4;
+
+ *sp0_p = '\\';
+ *sp1_p = '-';
+ *spa_p = M_DEL;
+
+ if (calc_func_mode == CALC_CODE) {
+ update_code(align_code_dyn, update_data_p, 4, *spa_p,*sp0_p,*sp1_p);
+ }
+
+ if (calc_func_mode == CALC_CONS) {sp0_p++; sp1_p++; spa_p++;}
+
+ if (cumm_seq_score) *i_spa++ = ppst->gshift;
+
+ if (have_ann && calc_func_mode == CALC_CONS) {*sp1a_p++ = *sp0a_p++ = ' ';}
+ not_c++;
+
+ *sp1_p = sq[aap=ap1[i1]];
+ *sp0_p = f_str->weight_c[aap][ap0[i0]].c4;
+ itmp = ppst->pam2[0][pascii[*sp0_p]][aap];
+
+ if (have_ann) {
+ if (calc_func_mode != CALC_ID && calc_func_mode != CALC_ID_DOM) {
+ *sp1a_p = ann_arr[ap1a[i1]];
+ *sp0a_p = ' ';
+ }
+ if (s_annotp_arr_p && (i1+i1_offset == s_annotp_arr_p[i1_annot]->pos || i1+i1_offset == i1_left_end)) {
+ i1_annot = next_annot_match(&itmp, ppst->pam2[0][pascii[*sp0_p]],
+#ifndef TFAST
+ i1_offset+seq_pos(i1,aln->llrev,0), /* annotated target (prot) coordinate */
+ i0_offset+seq_pos(i0,aln->qlrev,0),
+#else
+ i1_offset+seq_pos(i1,aln->qlrev,0),
+ i0_offset+seq_pos(i0,aln->llrev,0),
+#endif
+ sp1_p, sp1a_p, sq,
+ i1_annot, annotp_p->n_annot, s_annotp_arr_p, &ann_comment,
+ annot_stack, have_push_features_p, &v_delta,
+ &d1_score, &d1_ident, &d1_alen, &d1_gaplen,
+ &left_domain_head1, left_domain_list1, &i1_left_end,0);
+
+ /* must be out of the loop to capture the last value */
+ if (ppst->sq[ap1[i1]] != *sp1_p) {
+ t_spa = align_type(itmp, *sp0_p, *sp1_p, 0, NULL, ppst->pam_x_id_sim);
+ if (calc_func_mode != CALC_ID && calc_func_mode != CALC_ID_DOM) {
+ comment_var(
+#ifndef TFAST
+ i0_offset+seq_pos(i0,aln->qlrev,0), *sp0_p,
+ i1_offset+seq_pos(i1,aln->llrev,0), *sp1_p,
+#else
+ i1_offset+seq_pos(i1,aln->qlrev,0), *sp0_p,
+ i0_offset+seq_pos(i0,aln->llrev,0), *sp1_p,
+#endif
+ sq[ap1[i1]], sim_sym[t_spa], ann_comment,
+ annot_var_dyn, comment_target, annot_fmt);
+ }
+ else {
+ sprintf(tmp_str,"%c%d%c;",ppst->sq[ap1[i1]],i1+1,*sp1_p);
+ /* SAFE_STRNCAT(annot_var_s,tmp_str,n_annot_var_s); */
+ dyn_strcat(annot_var_dyn, tmp_str);
+ }
+ }
+ d1_score += ppst->gshift;
+ d1_score += itmp;
+ prev_match = 1;
+ }
+ if (calc_func_mode == CALC_CONS) {sp0a_p++; sp1a_p++;}
+ }
+
+ *spa_p = align_type(itmp, *sp0_p, *sp1_p, 0, aln, ppst->pam_x_id_sim);
+
+ if (calc_func_mode == CALC_CODE) {
+ update_code(align_code_dyn, update_data_p, 3, *spa_p,*sp0_p,*sp1_p);
+ }
+
+ d1_alen++;
+ if (*spa_p == M_IDENT) {d1_ident++;}
+ if (cumm_seq_score) *i_spa++ = itmp;
+
+ if (have_ann && have_push_features && calc_func_mode != CALC_ID) {
+ display_push_features(annot_stack, annot_var_dyn,
+#ifndef TFAST
+ i0_offset+seq_pos(i0,aln->qlrev,0), *sp0_p,
+ i1_offset+seq_pos(i1,aln->llrev,0), *sp1_p,
+#else
+ i1_offset+seq_pos(i1,aln->qlrev,0), *sp0_p,
+ i0_offset+seq_pos(i0,aln->llrev,0), *sp1_p,
+#endif
+ sim_sym[*spa_p],
+ a_res->rst.score[ppst->score_ix], a_res->rst.comp, a_res->sw_score,
+ n0, n1, pstat_void, annot_fmt);
+ have_push_features = 0;
+ }
+
+ i1++;
+ if (calc_func_mode == CALC_CONS) {sp0_p++; sp1_p++; spa_p++;}
+ lenc++;
+ break;
+ case 5: /* insertion in 0 */
+ if (have_ann && calc_func_mode == CALC_CONS) {
+ *sp1a_p++ = *sp0a_p++ = ' ';
+ }
+
+ if (cumm_seq_score) {
+ if (prev_match) *i_spa = ppst->gdelval;
+ *i_spa++ = ppst->ggapval; /* NOTE(review): plain '=' overwrites the gdelval stored on the previous line, while d1_score below adds both - confirm whether '+=' was intended */
+ }
+
+ if (prev_match) d1_score += ppst->gdelval;
+ d1_score += ppst->ggapval;
+
+ prev_match = 0;
+
+ i0 += 3;
+ *sp0_p = f_str->weight_c[0][ap0[i0]].c3;
+ *sp1_p = '-';
+ *spa_p = M_DEL;
+
+ if (calc_func_mode == CALC_CODE) {
+ *spa_p = 5;
+ update_code(align_code_dyn, update_data_p, 5, *spa_p,*sp0_p,*sp1_p);
+ }
+
+ lenc++;
+ ngap_p++;
+
+ if (calc_func_mode == CALC_CONS) {sp0_p++; sp1_p++; spa_p++;}
+ break;
+ case 0: /* insertion in 1 */
+
+ *sp0_p = '-';
+ *sp1_p = sq[ap1[i1]];
+ *spa_p = M_DEL;
+
+ if (calc_func_mode == CALC_CODE) {
+ *spa_p = 5; /* indel code */
+ update_code(align_code_dyn, update_data_p, 0, *spa_p,*sp0_p,*sp1_p);
+ }
+
+ if (cumm_seq_score) {
+ if (prev_match) *i_spa = ppst->gdelval;
+ *i_spa++ += ppst->gdelval; /* NOTE(review): adds gdelval (not ggapval as d1_score does below) and, when prev_match is 0, adds to a slot this function has not written - confirm */
+ }
+
+ if (calc_func_mode == CALC_CONS) {sp0_p++; sp1_p++; spa_p++;}
+
+ if (have_ann) {
+ if (calc_func_mode != CALC_ID && calc_func_mode != CALC_ID_DOM) {
+ *sp0a_p = ' ';
+ *sp1a_p = ann_arr[ap1a[i1]];
+ }
+
+ if (s_annotp_arr_p) {
+ /* coordiates are much more complex for next_annot_match,
+ and comment_var, because they may need to be reversed */
+
+ if (i1 + i1_offset == s_annotp_arr_p[i1_annot]->pos) {
+ i1_annot = next_annot_match(&itmp, ppst->pam2[0][ap0[i0]],
+#ifndef TFAST
+ i1_offset+seq_pos(i1,aln->llrev,0), /* annotated target (prot) coordinate */
+ i0_offset+seq_pos(i0,aln->qlrev,0),
+#else
+ i1_offset+seq_pos(i1,aln->qlrev,0),
+ i0_offset+seq_pos(i0,aln->llrev,0),
+#endif
+ sp1_p, sp1a_p, sq,
+ i1_annot, annotp_p->n_annot, s_annotp_arr_p,
+ &ann_comment, annot_stack, have_push_features_p, &v_delta,
+ &d1_score, &d1_ident, &d1_alen, &d1_gaplen,
+ &left_domain_head1, left_domain_list1, &i1_left_end,0);
+
+ }
+
+ if (prev_match) d1_score += ppst->gdelval;
+ d1_score += ppst->ggapval;
+ d1_alen++;
+ d1_gaplen++;
+ prev_match = 0;
+ }
+ if (calc_func_mode == CALC_CONS) {sp0a_p++; sp1a_p++;}
+ }
+
+ if (have_ann && have_push_features && calc_func_mode != CALC_ID) {
+ display_push_features(annot_stack, annot_var_dyn,
+#ifndef TFAST
+ i0_offset+seq_pos(i0,aln->qlrev,0), *sp0_p,
+ i1_offset+seq_pos(i1,aln->llrev,0), *sp1_p,
+#else
+ i1_offset+seq_pos(i1,aln->qlrev,0), *sp0_p,
+ i0_offset+seq_pos(i0,aln->llrev,0), *sp1_p,
+#endif
+ sim_sym[*spa_p],
+ a_res->rst.score[ppst->score_ix], a_res->rst.comp, a_res->sw_score,
+ n0, n1, pstat_void, annot_fmt);
+ have_push_features = 0;
+ }
+
+ i1++;
+ lenc++;
+ ngap_d++;
+ break;
+ }
+ }
+
+ /* done with alignment loop */
+
+ if (calc_func_mode == CALC_CODE) {
+ close_update_data(align_code_dyn, update_data_p);
+ }
+
+ if (have_ann) {
+ if (calc_func_mode != CALC_ID && calc_func_mode != CALC_ID_DOM) {*sp0a_p = *sp1a_p = '\0';}
+ if (s_annotp_arr_p) {
+ have_push_features = 0;
+
+ if (s_annotp_arr_p && i1_left_end > 0) {
+ close_annot_match(-1, annot_stack, have_push_features_p,
+ &d1_score, &d1_ident, &d1_alen, &d1_gaplen,
+ &left_domain_head1, &i1_left_end,
+ 0);
+ }
+
+ if (have_push_features && calc_func_mode != CALC_ID) {
+ display_push_features(annot_stack, annot_var_dyn,
+#ifndef TFAST
+ i0_offset+seq_pos(i0,aln->qlrev,0), *sp0_p,
+ i1_offset+seq_pos(i1,aln->llrev,0), *sp1_p,
+#else
+ i1_offset+seq_pos(i1,aln->qlrev,0), *sp0_p,
+ i0_offset+seq_pos(i0,aln->llrev,0), *sp1_p,
+#endif
+ sim_sym[*spa_p],
+ a_res->rst.score[ppst->score_ix], a_res->rst.comp, a_res->sw_score,
+ n0, n1, pstat_void, annot_fmt);
+ have_push_features = 0;
+ }
+ }
+ if (left_domain_list1) free(left_domain_list1);
+ free_stack(annot_stack); /* NOTE(review): annot_stack is still NULL here when have_ann is set but there are no annotations - confirm free_stack() accepts NULL */
+ }
+ *spa_p = '\0';
+
+#ifndef TFAST
+ aln->amax0 = i0;
+ aln->amax1 = i1;
+ aln->ngap_q = ngap_d;
+ aln->ngap_l = ngap_p;
+#else
+ aln->amax1 = i0;
+ aln->amax0 = i1;
+ aln->ngap_q = ngap_p;
+ aln->ngap_l = ngap_d;
+#endif
+ aln->calc_last_set = 1;
+
+ aln->nfs = nfs;
+
+ *score_delta = v_delta;
+
+ if (lenc < 0) lenc = 1; /* lenc starts at 0 and is only incremented above */
+ *nc = lenc;
+/* now we have the middle, get the right end */
+ return lenc+not_c;
+}
+
+/* calc_cons_a() - CALC_CONS front end to calc_cons_u().
+   Passes its arguments through in calc_cons_u()'s order, with
+   display_code 0 and no alignment-code string (NULL), and returns
+   calc_cons_u()'s result unchanged. */
+int
+calc_cons_a(const unsigned char *aa0, int n0,
+	    const unsigned char *aa1, int n1,
+	    int *nc,
+	    struct a_struct *aln,
+	    struct a_res_str *a_res,
+	    struct pstruct *ppst,
+	    char *seqc0, char *seqc1, char *seqca, int *cumm_seq_score,
+	    const unsigned char *ann_arr,
+	    const unsigned char *aa0a, const struct annot_str *annot0_p, char *seqc0a,
+	    const unsigned char *aa1a, const struct annot_str *annot1_p, char *seqc1a,
+	    int *score_delta,
+	    struct dyn_string_str *annot_var_dyn,
+	    struct f_struct *f_str,
+	    void *pstat_void)
+{
+  int n_cols;
+
+  n_cols = calc_cons_u(aa0, n0, aa1, n1,
+		       a_res, ppst, f_str, pstat_void,
+		       ann_arr, aa0a, annot0_p, aa1a, annot1_p,
+		       CALC_CONS, 0,
+		       nc, seqc0, seqc1, seqca, cumm_seq_score,
+		       seqc0a, seqc1a, aln, score_delta,
+		       annot_var_dyn, NULL);
+
+  return n_cols;
+}
+
+/* calc_astruct() - copy the alignment boundaries from *a_res_p into
+   *aln_p and clear aln_p->calc_last_set.  Without TFAST the 0/1
+   suffixes are swapped between the two structures; with TFAST they
+   map straight across.  f_str is not used. */
+void
+calc_astruct(struct a_struct *aln_p, struct a_res_str *a_res_p, struct f_struct *f_str) {
+
+  aln_p->calc_last_set = 0;
+
+#ifdef TFAST	/* TFASTX: same suffix on both sides */
+  aln_p->amin0 = a_res_p->min0;
+  aln_p->amax0 = a_res_p->max0;
+  aln_p->amin1 = a_res_p->min1;
+  aln_p->amax1 = a_res_p->max1;
+#else		/* FASTX: a_res 0 <-> aln 1 */
+  aln_p->amin0 = a_res_p->min1;
+  aln_p->amax0 = a_res_p->max1;
+  aln_p->amin1 = a_res_p->min0;
+  aln_p->amax1 = a_res_p->max0;
+#endif
+}
+
+/* build an array of match/ins/del - length strings */
+
+/* modified 10-June-2014 to distinguish matches from mismatches, op=1
+ (previously unused) indicates an aligned non-identity */
+
+/* op_codes are: 0 - aa insertion
+ 1 - (now) aligned non-identity
+ 2 - -1 frameshift
+ 3 - aligned identity
+ 4 - +1 frameshift
+ 5 - codon insertion
+*/
+
+/* init_update_data() - allocate and initialize the state used by
+   update_code()/close_update_data() while an alignment code string
+   is being built.
+
+   show_code selects the encoding:
+     SHOW_CODE_CIGAR bits set - cigar_code op map, count before op;
+     SHOW_CODE_BTOP bits set  - ori_code op map, btop encoding on;
+     otherwise                - ori_code op map, op before count.
+   SHOW_CODE_EXT bits set turns on show_ext.
+
+   Returns the new structure (released by close_update_data()), or
+   NULL after printing a message if it cannot be allocated.
+*/
+static struct update_code_str *
+init_update_data(int show_code) {
+
+  struct update_code_str *update_data_p;
+
+  if ((update_data_p = (struct update_code_str *)calloc(1,sizeof(struct update_code_str)))==NULL) {
+    fprintf(stderr,"*** error [%s:%d] - init_update_data(): cannot allocate update_code_str\n",
+	    __FILE__, __LINE__);
+    return NULL;
+  }
+
+  update_data_p->p_op_idx = -1;	/* no previous op yet */
+  update_data_p->p_op_cnt = 0;
+  update_data_p->show_code = show_code;
+  update_data_p->btop_enc = 0;
+
+  if ((show_code & SHOW_CODE_CIGAR) == SHOW_CODE_CIGAR) {
+    update_data_p->op_map = cigar_code;
+    update_data_p->cigar_order = 1;
+  }
+  else if ((show_code & SHOW_CODE_BTOP) == SHOW_CODE_BTOP) {
+    update_data_p->op_map = ori_code;
+    update_data_p->cigar_order = 0;
+    update_data_p->btop_enc = 1;
+  }
+  else {
+    update_data_p->op_map = ori_code;
+    update_data_p->cigar_order = 0;
+  }
+
+  if ((show_code & SHOW_CODE_EXT) == SHOW_CODE_EXT) {
+    update_data_p->show_ext = 1;
+  }
+  else {
+    update_data_p->show_ext = 0;
+  }
+
+  return update_data_p;
+}
+
+/* close_update_data() - flush the pending run held in *up_dp to
+   align_code_dyn and free up_dp.  Does nothing if up_dp is NULL.
+
+   In btop encoding the pending count is always written as a number;
+   otherwise sprintf_code() formats the previous op and its count.
+*/
+static void
+close_update_data(struct dyn_string_str *align_code_dyn,
+		  struct update_code_str *up_dp) {
+  char tmp_cnt[MAX_SSTR];
+
+  if (!up_dp) return;
+
+  /* sprintf_code() writes nothing when the count is 0, so start with
+     an empty string rather than passing indeterminate bytes on */
+  tmp_cnt[0] = '\0';
+
+  if (up_dp->btop_enc) {
+    sprintf(tmp_cnt,"%d",up_dp->p_op_cnt);
+    up_dp->p_op_cnt = 0;
+  }
+  else {
+    sprintf_code(tmp_cnt,up_dp, up_dp->p_op_idx, up_dp->p_op_cnt);
+  }
+  dyn_strcat(align_code_dyn, tmp_cnt);
+
+  free(up_dp);
+}
+
+/* update_indel_code() has been modified to work more correctly with
+ ggsearch/glsearch, which, because alignments can start with either
+ insertions or deletions, can produce an initial code of "0=". When
+ that happens, it is ignored and no code is added.
+
+ *align_code_dyn - alignment string (dynamic)
+ op -- encoded operation, currently 0=match, 1-delete, 2-insert, 3-term-match, 4-mismatch
+ op_cnt -- length of run
+ show_code -- SHOW_CODE_CIGAR uses cigar_code, otherwise legacy
+*/
+
+/* update_indel_code() is called for insertions and deletions
+ update_match_code() is called for every match
+*/
+
+/* sprintf_code() - format one run (operation character and length)
+   into tmp_str.
+
+   tmp_str -- output buffer; left untouched when op_cnt is 0
+   up_dp   -- supplies op_map[] and the cigar_order flag
+   op_idx  -- index into up_dp->op_map[]
+   op_cnt  -- run length
+
+   cigar_order writes "<count><op>", otherwise "<op><count>".
+*/
+static void
+sprintf_code(char *tmp_str, struct update_code_str *up_dp, int op_idx, int op_cnt) {
+  char op_char;
+
+  if (op_cnt != 0) {
+    op_char = up_dp->op_map[op_idx];
+
+    if (!up_dp->cigar_order) {
+      sprintf(tmp_str,"%c%d",op_char,op_cnt);
+    }
+    else {
+      sprintf(tmp_str,"%d%c",op_cnt,op_char);
+    }
+  }
+}
+
+/* sprintf_btop() - btop encoding of one aligned position.
+
+   An aligned identity (op==3 with sim_code==M_IDENT) only increments
+   the running count in up_dp->p_op_cnt and leaves tmp_str empty.
+   Anything else writes the pending count (when > 0) followed by the
+   two residue characters sp0, sp1, and resets the count.
+
+   assumes that up_dp->p_op_cnt only tracks identity
+
+   for fx/fz, op=0,
+*/
+
+static void
+sprintf_btop(char *tmp_str,
+             struct update_code_str *up_dp,
+             int op, int sim_code,
+             unsigned char sp0, unsigned char sp1)
+{
+  char run_len[MAX_SSTR];
+
+  tmp_str[0] = '\0';
+
+  /* only aligned identities update counts */
+  if (op==3 && sim_code == M_IDENT) {
+    up_dp->p_op_cnt++;
+    return;
+  }
+
+  /* a difference: emit the preceding identity run, then the pair */
+  run_len[0] = '\0';
+  if (up_dp->p_op_cnt > 0) {
+    sprintf(run_len,"%d",up_dp->p_op_cnt);
+  }
+  up_dp->p_op_cnt = 0;
+
+  sprintf(tmp_str,"%s%c%c",run_len,sp0,sp1);
+}
+
+/* update_code() - add one alignment operation to the encoding.
+
+   align_code_dyn -- dynamic string receiving finished runs
+   up_dp          -- run state (previous op, count, encoding flags)
+   op             -- operation code (see the op_codes table above)
+   sim_code       -- similarity code; compared against M_IDENT
+   sp0, sp1       -- the two aligned residue characters
+
+   btop encoding is delegated to sprintf_btop().  Otherwise ops 2 and
+   4 are written immediately with a count of 1, while ops 0, 5 and
+   1/3 accumulate into runs that are written when the op changes.
+   Any other op value is ignored.
+*/
+static void
+update_code(struct dyn_string_str *align_code_dyn,
+            struct update_code_str *up_dp, int op,
+            int sim_code, unsigned char sp0, unsigned char sp1)
+{
+  char code_buf[MAX_SSTR];
+  int has_term;
+
+  code_buf[0]='\0';
+
+  if (up_dp->btop_enc) {
+    sprintf_btop(code_buf, up_dp, op, sim_code, sp0, sp1);
+    dyn_strcat(align_code_dyn, code_buf);
+    return;
+  }
+
+  if (op == 2 || op == 4) {
+    /* one-time ops: flush the pending run, then write this op once */
+    sprintf_code(code_buf, up_dp, up_dp->p_op_idx, up_dp->p_op_cnt);
+    dyn_strcat(align_code_dyn, code_buf);
+    sprintf_code(code_buf, up_dp, op, 1);
+    dyn_strcat(align_code_dyn, code_buf);
+    up_dp->p_op_cnt = 0;
+  }
+  else if (op == 0 || op == 5) {
+    /* accumulating ops: extend the run or start a new one */
+    if (op != up_dp->p_op_idx) {
+      sprintf_code(code_buf, up_dp, up_dp->p_op_idx, up_dp->p_op_cnt);
+      dyn_strcat(align_code_dyn, code_buf);
+      up_dp->p_op_idx = op;
+      up_dp->p_op_cnt = 1;
+    }
+    else {
+      up_dp->p_op_cnt++;
+    }
+  }
+  else if (op == 1 || op == 3) {
+    has_term = (sp0 == '*' || sp1 == '*');
+
+    if (!has_term) {	/* default case, not termination */
+      if (up_dp->show_ext && sim_code != M_IDENT) { op = 1;}
+    }
+    else if (!up_dp->cigar_order) {
+      /* have a termination codon, output for !SHOW_CODE_CIGAR */
+      op = 6;
+    }
+    else if (up_dp->show_ext && (sp0 != sp1)) {
+      op = 1;
+    }
+
+    if (up_dp->p_op_cnt != 0 && op == up_dp->p_op_idx) {
+      up_dp->p_op_cnt++;
+    }
+    else {
+      /* a pending run of a different op is written before restarting */
+      if (up_dp->p_op_cnt != 0) {
+        sprintf_code(code_buf, up_dp, up_dp->p_op_idx, up_dp->p_op_cnt);
+        dyn_strcat(align_code_dyn, code_buf);
+      }
+      up_dp->p_op_idx = op;
+      up_dp->p_op_cnt = 1;
+    }
+  }
+}
+
+/* calc_code() - run calc_cons_u() in CALC_CODE mode, passing both the
+   alignment (align_code_dyn) and annotation (annot_code_dyn) encoding
+   strings; returns whatever calc_cons_u() returns */
+
+int calc_code(const unsigned char *aa0, int n0,
+              const unsigned char *aa1, int n1,
+              struct a_struct *aln,
+              struct a_res_str *a_res,
+              struct pstruct *ppst,
+              struct dyn_string_str *align_code_dyn,
+              const unsigned char *ann_arr,
+              const unsigned char *aa0a,
+              const struct annot_str *annot0_p,
+              const unsigned char *aa1a,
+              const struct annot_str *annot1_p,
+              struct dyn_string_str *annot_code_dyn,
+              int *score_delta,
+              struct f_struct *f_str,
+              void *pstat_void,
+              int display_code)
+{
+  int nc_unused;	/* output slot required by calc_cons_u(); not read here */
+  int result;
+
+  result = calc_cons_u(aa0, n0, aa1, n1,
+                       a_res, ppst, f_str, pstat_void,
+                       ann_arr, aa0a, annot0_p, aa1a, annot1_p,
+                       CALC_CODE, display_code,
+                       &nc_unused,
+                       NULL, NULL, NULL, NULL,
+                       NULL, NULL,
+                       aln, score_delta, annot_code_dyn,
+                       align_code_dyn);
+  return result;
+}
+
+/* calc_id() - run calc_cons_u() in CALC_ID mode.  It never looks at
+   domains or features, only variation: no annotation arrays and no
+   alignment-code string are passed. */
+
+int calc_id(const unsigned char *aa0, int n0,
+            const unsigned char *aa1, int n1,
+            struct a_struct *aln,
+            struct a_res_str *a_res,
+            struct pstruct *ppst,
+            const struct annot_str *annot0_p,
+            const struct annot_str *annot1_p,
+            int *score_delta,
+            struct dyn_string_str *annot_var_dyn,
+            struct f_struct *f_str)
+{
+  int nc_unused;	/* output slot required by calc_cons_u(); not read here */
+  int result;
+
+  result = calc_cons_u(aa0, n0, aa1, n1,
+                       a_res, ppst, f_str, NULL,
+                       NULL, NULL, annot0_p, NULL, annot1_p,
+                       CALC_ID, 0,
+                       &nc_unused,
+                       NULL, NULL, NULL, NULL,
+                       NULL, NULL,
+                       aln, score_delta, annot_var_dyn,
+                       NULL);
+  return result;
+}
+
+/* calc_idd() - run calc_cons_u() in CALC_ID_DOM mode; looks at
+   domains and variation.  Arguments are otherwise the same as for
+   calc_id(). */
+
+int calc_idd(const unsigned char *aa0, int n0,
+             const unsigned char *aa1, int n1,
+             struct a_struct *aln,
+             struct a_res_str *a_res,
+             struct pstruct *ppst,
+             const struct annot_str *annot0_p,
+             const struct annot_str *annot1_p,
+             int *score_delta,
+             struct dyn_string_str *annot_var_dyn,
+             struct f_struct *f_str)
+{
+  int nc_unused;	/* output slot required by calc_cons_u(); not read here */
+  int result;
+
+  result = calc_cons_u(aa0, n0, aa1, n1,
+                       a_res, ppst, f_str, NULL,
+                       NULL, NULL, annot0_p, NULL, annot1_p,
+                       CALC_ID_DOM, 0,
+                       &nc_unused,
+                       NULL, NULL, NULL, NULL,
+                       NULL, NULL,
+                       aln, score_delta, annot_var_dyn,
+                       NULL);
+  return result;
+}
diff --git a/src/dropgsw2.c b/src/dropgsw2.c
new file mode 100644
index 0000000..3184858
--- /dev/null
+++ b/src/dropgsw2.c
@@ -0,0 +1,1128 @@
+/* $Id: dropgsw2.c $ */
+
+/* copyright (c) 1996, 2014 by William R. Pearson and The Rector &
+ Visitors of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+/* 17-Aug-2006 - removed globals *sapp/last - alignment should be thread safe */
+
+/* 12-Oct-2005 - converted to use a_res and aln for alignment coordinates */
+
+/* 4-Nov-2004 - Diagonal Altivec Smith-Waterman included */
+
+/* 14-May-2003 - modified to return alignment start at 0, rather than
+ 1, for begin:end alignments
+
+ 25-Feb-2003 - modified to support Altivec parallel Smith-Waterman
+
+ 22-Sep-2003 - removed Altivec support at request of Sencel lawyers
+*/
+
+/* this code uses an implementation of the Smith-Waterman algorithm
+ designed by Phil Green, U. of Washington, that is 1.5 - 2X faster
+ than my Miller and Myers implementation. */
+
+/* the shortcuts used in this program prevent it from calculating scores
+ that are less than the gap penalty for the first residue in a gap. As
+ a result this code cannot be used with very large gap penalties, or
+ with very short sequences, and probably should not be used with prss3.
+*/
+
+/* version 3.2 fixes a subtle bug that was encountered while running
+ do_walign() interspersed with do_work(). This happens only with -m
+ 9 and pvcomplib. The fix was to more explicitly zero-out ss[] at
+ the beginning of do_work.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <math.h>
+
+#include "defs.h"
+#include "param.h"
+
+static char *verstr="7.2 Nov 2010";
+
+#include "dropgsw2.h"
+
+#define DROP_INTERN
+#include "drop_func.h"
+
+#ifdef SW_ALTIVEC
+#include "smith_waterman_altivec.h"
+#endif
+#ifdef SW_SSE2
+#include "smith_waterman_sse2.h"
+#endif
+
+/* one column of the score row used by FLOCAL_ALIGN():
+   H - the "diag" (match) score, E - the "up-gap" score, as those
+   terms are used in the FLOCAL_ALIGN() comments */
+struct swstr {int H, E;};
+
+extern void init_karlin(const unsigned char *aa0, int n0, struct pstruct *ppst,
+ double *aa0_f, double **kp);
+extern int do_karlin(const unsigned char *aa1, int n1,
+ int **pam2, const struct pstruct *ppst,
+ double *aa0_f, double *kar_p, double *lambda, double *H);
+
+extern int sw_walign (int **pam2p, int n0,
+ const unsigned char *aa1, int n1,
+ int q, int r,
+ struct swstr *ss,
+ struct a_res_str *a_res
+ );
+
+extern struct a_res_str *
+nsw_malign (int ***pam2p, int pam_ix, int n0,
+ const unsigned char *aa1, int n1,
+ int score_thresh, int max_res,
+ int gdelval, int ggapval,
+ struct swstr *ss,
+ struct a_res_str *cur_ares,
+ int (*fn_walign)
+ (
+ int **pam2p, int n0,
+ const unsigned char *aa1, int n1,
+ int q, int r,
+ struct swstr *ss,
+ struct a_res_str *a_res
+ ),
+ int do_rep
+ );
+
+void
+SIM(const unsigned char *A, /* seq1 indexed A[1..M] */
+ const unsigned char *B, /* seq2 indexed B[1..N] */
+ int M, int N, /* len seq1, seq2 */
+ struct pstruct *ppst, /* parameters */
+ int nseq, /* nseq - number of different sequences */
+ int mini_score, /* cut-off score */
+ int max_count, /* number of alignments */
+ struct a_res_str *a_res); /* alignment result structure */
+
+static int
+FLOCAL_ALIGN(const unsigned char *aa0, const unsigned char *aa1,
+ int n0, int n1, int low, int up,
+ int **W, int GG,int HH, int MW,
+ struct f_struct *f_str);
+
+extern void aancpy(char *to, char *from, int count, struct pstruct *ppst);
+
+int same_seq(const unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1);
+
+static int
+prof_score(const unsigned char *aa1, int n0, int *pwaa_s);
+
+/* init_work() - initialize for Smith-Waterman optimal score
+
+   Allocates the per-query state (struct f_struct) and fills the
+   query profiles that the scoring functions in this file read.
+
+   aa0, n0 -- encoded query sequence and its length
+   ppst    -- search parameters (nsqx, ext_sq_set, pam_pssm,
+              pam2[]/pam2p[], zsflag, maxlen)
+   f_arg   -- receives the new f_struct (released by close_work())
+
+   Any allocation failure is reported on stderr and ends the program
+   with exit(1).  In the SSE2 build a profile value that does not fit
+   in 8 bits is also fatal.
+*/
+
+void
+init_work (unsigned char *aa0, int n0,
+           struct pstruct *ppst,
+           struct f_struct **f_arg)
+{
+  int ip;
+  int *pwaa_s, *pwaa_a;
+  int e, f, i, j;
+  struct f_struct *f_str;
+  int **pam2p;
+  struct swstr *ss;
+  int nsq;
+
+#if defined(SW_ALTIVEC) || defined(SW_SSE2)
+  int l, data, bias;
+  unsigned char *  pc;
+  unsigned short * ps;
+  int  overflow;
+
+  int n_count;
+  int col_len;
+#endif
+
+  /* nsqx is used for both alphabets; only the matrix index differs */
+  nsq = ppst->nsqx;
+  if (ppst->ext_sq_set) {
+    ip = 1;
+  }
+  else {
+    /* set for lower-case for memory mapped DBs with lower case encoding */
+    ip = 0;
+  }
+
+  /* allocate space for function globals */
+  if ((f_str = (struct f_struct *)calloc(1,sizeof(struct f_struct))) == NULL) {
+    fprintf(stderr,"cannot allocate f_struct\n");
+    exit(1);
+  }
+
+  if((ppst->zsflag%10) == 6) {
+    f_str->kar_p = NULL;
+    init_karlin(aa0, n0, ppst, &f_str->aa0_f[0], &f_str->kar_p);
+  }
+
+  /* allocate space for the scoring arrays */
+  if ((ss = (struct swstr *) calloc (n0+2, sizeof (struct swstr)))
+      == NULL) {
+    fprintf (stderr, "cannot allocate ss array %3d\n", n0);
+    exit (1);
+  }
+  ss++;		/* close_work() steps back by one before free() */
+
+  ss[n0].H = -1;	/* this is used as a sentinel - normally H >= 0 */
+  ss[n0].E = 1;
+  f_str->ss = ss;
+
+  /* initialize variable (-S) pam matrix */
+  if ((f_str->waa_s= (int *)calloc((nsq+1)*(n0+1),sizeof(int))) == NULL) {
+    fprintf(stderr,"cannot allocate waa_s array %3d\n",nsq*n0);
+    exit(1);
+  }
+
+  /* initialize pam2p[1] pointers */
+  if ((f_str->pam2p[1]= (int **)calloc((n0+1),sizeof(int *))) == NULL) {
+    fprintf(stderr,"cannot allocate pam2p[1] array %3d\n",n0);
+    exit(1);
+  }
+
+  pam2p = f_str->pam2p[1];
+  if ((pam2p[0]=(int *)calloc((nsq+1)*(n0+1),sizeof(int))) == NULL) {
+    fprintf(stderr,"cannot allocate pam2p[1][] array %3d\n",nsq*n0);
+    exit(1);
+  }
+
+  for (i=1; i<n0; i++) {
+    pam2p[i]= pam2p[0] + (i*(nsq+1));
+  }
+
+  /* initialize universal (alignment) matrix */
+  if ((f_str->waa_a= (int *)calloc((nsq+1)*(n0+1),sizeof(int))) == NULL) {
+    fprintf(stderr,"cannot allocate waa_a struct %3d\n",nsq*n0);
+    exit(1);
+  }
+
+  /* initialize pam2p[0] pointers */
+  if ((f_str->pam2p[0]= (int **)calloc((n0+1),sizeof(int *))) == NULL) {
+    fprintf(stderr,"cannot allocate pam2p[0] array %3d\n",n0);
+    exit(1);
+  }
+
+  pam2p = f_str->pam2p[0];
+  if ((pam2p[0]=(int *)calloc((nsq+1)*(n0+1),sizeof(int))) == NULL) {
+    fprintf(stderr,"cannot allocate pam2p[0][] array %3d\n",nsq*n0);
+    exit(1);
+  }
+
+  for (i=1; i<n0; i++) {
+    pam2p[i]= pam2p[0] + (i*(nsq+1));
+  }
+
+  /*
+     pwaa effectively has a sequence profile --
+     pwaa[0..n0-1] has pam score for residue 0 (-BIGNUM)
+     pwaa[n0..2n0-1] has pam scores for amino acid 1 (A)
+     pwaa[2n0..3n0-1] has pam scores for amino acid 2 (R), ...
+
+     thus: pwaa = f_str->waa_s + (*aa1p++)*n0; sets up pwaa so that
+     *pwaa++ rapidly moves though the scores of the aa1p[] position
+     without further indexing
+
+     For a real sequence profile, pwaa[0..n0-1] vs ['A'] could have
+     a different score in each position.
+  */
+
+  pwaa_s = f_str->waa_s;
+  pwaa_a = f_str->waa_a;
+  if (ppst->pam_pssm) {
+    for (e = 0; e <=nsq; e++)	{	/* for each residue in the alphabet */
+      for (f = 0; f < n0; f++) {	/* for each position in aa0 */
+        *pwaa_s++ = f_str->pam2p[ip][f][e] = ppst->pam2p[ip][f][e];
+        *pwaa_a++ = f_str->pam2p[0][f][e]  = ppst->pam2p[0][f][e];
+      }
+    }
+  }
+  else {	/* initialize scanning matrix */
+    for (e = 0; e <=nsq; e++)	/* for each residue in the alphabet */
+      for (f = 0; f < n0; f++)	{	/* for each position in aa0 */
+        *pwaa_s++ = f_str->pam2p[ip][f][e]= ppst->pam2[ip][aa0[f]][e];
+        *pwaa_a++ = f_str->pam2p[0][f][e] = ppst->pam2[0][aa0[f]][e];
+      }
+  }
+
+#if defined(SW_ALTIVEC)
+
+  /* First we allocate memory for the workspace - i.e. the single row
+   * of storage for H/F. Since this might be run on Linux or AIX too,
+   * we don't assume anything about the memory allocation but align
+   * it ourselves.  We need two vectors (16 bytes each) per element,
+   * and some padding space to make it cache-line aligned.
+
+   * MAXTST+MAXLIB is longest allowed database sequence length...
+   * this should be m_msg.max_tot, but m_msg is not available, but
+   * ppst->maxlen has maxn, which is appropriate.
+   */
+
+  f_str->workspace_memory  = (void *)malloc(2*16*(ppst->maxlen+SEQ_PAD)+256);
+  if (f_str->workspace_memory == NULL) {
+    fprintf(stderr,"cannot allocate Altivec workspace\n");
+    exit(1);
+  }
+  f_str->workspace  = (void *) ((((size_t) f_str->workspace_memory) + 255) & (~0xff));
+  /* We always use a scoring profile in altivec, but the layout is a bit strange
+   * in order to optimize memory access order and thus cache efficiency.
+   * Normally we first try 8-bit scoring in altivec, and if this leads to overflow
+   * we recompute the score with 16-bit accuracy. Because of this we need to construct
+   * two score profiles.
+   * Since altivec always loads 16 bytes from aligned memory, corresponding to 8 or 16
+   * elements (for 16 and 8 bit scoring, respectively), we organize the scoring
+   * profile like this for 8-bit accuracy:
+   *
+   * 1. The profile starts on 256-byte aligned memory (cache line on G5 is 128 bytes).
+   * 2. First we have the score for the full alphabet for the first 16 residues of
+   *    the query, i.e. positions 0-15 are the scores for the first 16 query letters
+   *    vs. the first in the alphabet, positions 16-31 the scores for the same 16
+   *    query positions against alphabet letter two, etc.
+   * 3. After alphabet_size*16bytes we start with the scores for residues 16-31 in
+   *    the query, organized in the same way.
+   * 4. At the end of the query sequence, we pad the scoring to the next 16-tuple
+   *    with neutral scores.
+   * 5. The total size of the profile is thus alphabet_size*N, where N is the
+   *    size of the query rounded up to the next 16-tuple.
+   *
+   * The word (16-bit) profile is identical, but scores are stored as 8-tuples.
+   */
+
+  f_str->word_score_memory = (void *)malloc(10*2*(nsq+2)*(n0+1+16)+256);
+  f_str->byte_score_memory = (void *)malloc(10*(nsq+2)*(n0+1+16)+256);
+  if (f_str->word_score_memory == NULL || f_str->byte_score_memory == NULL) {
+    fprintf(stderr,"cannot allocate Altivec score profiles\n");
+    exit(1);
+  }
+
+  f_str->word_score = (unsigned short *) ((((size_t) f_str->word_score_memory) + 255) & (~0xff));
+  f_str->byte_score = (unsigned char *) ((((size_t) f_str->byte_score_memory) + 255) & (~0xff));
+
+  overflow = 0;
+
+  if (ppst->pam_pssm) {
+    /* Use a position-specific scoring profile.
+     * This is essentially what we are going to construct anyway, but we'll
+     * reorder it to suit altivec.
+     */
+    bias = 127;
+    for(i = 1; i < nsq ; i++) {
+      for(j = 0; j < n0 ; j++) {
+        data = ppst->pam2p[ip][j][i];
+        if(data<bias) bias = data;
+      }
+    }
+
+    /* Fill our specially organized byte- and word-size scoring arrays. */
+    ps = f_str->word_score;
+    for(f = 0; f<n0 ; f+=8) {
+      /* e=0 */
+      for(i=0 ; i<8 ; i++) {
+        *ps++ = (unsigned short) 0;
+      }
+      /* for each chunk of 8 residues in our query */
+      for(e = 1; e<=nsq; e++) {
+        for(i=0 ; i<8 ; i++) {
+          l = f + i;
+          if(l<n0) {
+            data = ppst->pam2p[ip][l][e] - bias;
+          }
+          else {
+            data = 0;
+          }
+          *ps++ = (unsigned short)data;
+        }
+      }
+    }
+    pc = f_str->byte_score;
+    for(f = 0; f<n0 ; f+=16) {
+      /* e=0 */
+      for(i=0 ; i<16 ; i++) {
+        *pc++ = (unsigned char)0;
+      }
+
+      for(e = 1; e<=nsq; e++) {
+        for(i=0 ; i<16 ; i++) {
+          l = f + i;
+          if(l<n0) {
+            data = ppst->pam2p[ip][l][e] - bias;
+          }
+          else {
+            data = 0;
+          }
+          if(data>255) {
+            /*
+            printf("Fatal error. data: %d bias: %d, position: %d/%d, Score out of range for 8-bit Altivec/VMX datatype.\n",data,bias,l,e);
+            exit(1);
+            */
+            overflow = 1;
+          }
+          *pc++ = (unsigned char)data;
+        }
+      }
+    }
+  }
+  else {
+    /* Classical simple substitution matrix */
+    /* Find the bias to use in the substitution matrix */
+    bias = 127;
+    for(i = 1; i < nsq ; i++) {
+      for(j = 1; j < nsq ; j++) {
+        data = ppst->pam2[ip][i][j];
+#ifdef DEBUG
+        if (data < -1000) {
+          fprintf(stderr, "*** low data: %d [%d][%d][%d]\n",data,ip,i,j);
+        }
+#endif
+        if(data<bias) bias = data;
+      }
+    }
+    /* Fill our specially organized byte- and word-size scoring arrays. */
+    ps = f_str->word_score;
+    for(f = 0; f<n0 ; f+=8) {
+      /* e=0 */
+      for(i=0 ; i<8 ; i++) {
+        *ps++ = (unsigned short) 0;
+      }
+      /* for each chunk of 8 residues in our query */
+      for(e = 1; e<=nsq; e++) {
+        for(i=0 ; i<8 ; i++) {
+          l = f + i;
+          if(l<n0) {
+            data = ppst->pam2[ip][aa0[l]][e] - bias;
+          }
+          else {
+            data = 0;
+          }
+          *ps++ = (unsigned short)data;
+        }
+      }
+    }
+    pc = f_str->byte_score;
+    for(f = 0; f<n0 ; f+=16) {
+      /* e=0 */
+      for(i=0 ; i<16 ; i++) {
+        *pc++ = (unsigned char)0;
+      }
+
+      for(e = 1; e<=nsq; e++) {
+        for(i=0 ; i<16 ; i++) {
+          l = f + i;
+          if (l<n0) {
+            data = ppst->pam2[ip][aa0[l]][e] - bias;
+          }
+          else {
+            data = 0;
+          }
+          if(data>255) {
+            /*
+            printf("Fatal error. Score out of range for 8-bit Altivec/VMX datatype.\n");
+            exit(1);
+            */
+            overflow = 1;
+          }
+          *pc++ = (unsigned char)data;
+        }
+      }
+    }
+  }
+
+  f_str->bias = (unsigned char) (-bias);
+  f_str->alphabet_size = nsq+1;
+
+  /* Some variable to keep track of how many 8-bit runs we need to rerun
+   * in 16-bit accuracy. If there are too many reruns it can be faster
+   * to use 16-bit alignments directly.
+   */
+
+  /* We can only do 8-bit alignments if the scores were small enough. */
+  if(overflow==0) f_str->try_8bit = 1;
+  else f_str->try_8bit = 0;
+
+  f_str->done_8bit  = 0;
+  f_str->done_16bit = 0;
+
+#endif /* SW_ALTIVEC */
+
+#if defined(SW_SSE2)
+  /* First we allocate memory for the workspace - i.e. two rows for H and
+   * one row for F.  We also need enough space to hold a temporary
+   * scoring profile which will be query_length * 16 (sse2 word length).
+   * Since this might be run on Linux or AIX too, we don't assume
+   * anything about the memory allocation but align it ourselves.
+   */
+  f_str->workspace_memory  = (void *)malloc(3*16*(MAXTST+MAXLIB+32)+256);
+  if (f_str->workspace_memory == NULL) {
+    fprintf(stderr,"cannot allocate SSE2 workspace\n");
+    exit(1);
+  }
+  f_str->workspace  = (void *) ((((size_t) f_str->workspace_memory) + 255) & (~0xff));
+
+  /* We always use a scoring profile for the SSE2 implementation, but the layout
+   * is a bit strange.  The scoring profile is parallel to the query, but is
+   * accessed in a striped pattern.  The query is divided into equal length
+   * segments.  The number of segments is equal to the number of elements
+   * processed in the SSE2 register.  For 8-bit calculations, the query will
+   * be divided into 16 equal length parts.  If the query is not long enough
+   * to fill the last segment, it will be filled with neutral weights.  The
+   * first element in the SSE register will hold a value from the first segment,
+   * the second element of the SSE register will hold a value from the
+   * second segment and so on.  So if the query length is 288, then each
+   * segment will have a length of 18.  So the first 16 bytes will have
+   * the following weights: Q1, Q19, Q37, ... Q271; the next 16 bytes will
+   * have the following weights: Q2, Q20, Q38, ... Q272; and so on until
+   * all parts of all segments have been written.  The last segment will
+   * have the following weights: Q18, Q36, Q54, ... Q288.  This will be
+   * done for the entire alphabet.
+   */
+
+  f_str->word_score_memory = (void *)malloc((n0 + 32) * sizeof (short) * (nsq + 1) + 256);
+  f_str->byte_score_memory = (void *)malloc((n0 + 32) * sizeof (char) * (nsq + 1) + 256);
+  if (f_str->word_score_memory == NULL || f_str->byte_score_memory == NULL) {
+    fprintf(stderr,"cannot allocate SSE2 score profiles\n");
+    exit(1);
+  }
+
+  f_str->word_score = (unsigned short *) ((((size_t) f_str->word_score_memory) + 255) & (~0xff));
+  f_str->byte_score = (unsigned char *) ((((size_t) f_str->byte_score_memory) + 255) & (~0xff));
+
+  overflow = 0;
+
+  if (ppst->pam_pssm) {
+    /* Use a position-specific scoring profile.
+     * This is essentially what we are going to construct anyway, but we'll
+     * reorder it to suit sse2.
+     */
+    bias = 127;
+    for (i = 1; i < nsq ; i++) {
+      for (j = 0; j < n0 ; j++) {
+        data = ppst->pam2p[ip][j][i];
+        if (data < bias) {
+          bias = data;
+        }
+      }
+    }
+
+    /* Fill our specially organized byte- and word-size scoring arrays. */
+    ps = f_str->word_score;
+    col_len = (n0 + 7) / 8;
+    n_count = (n0 + 7) & 0xfffffff8;
+    for (f = 0; f < n_count; ++f) {
+      *ps++ = 0;
+    }
+    for (f = 1; f < nsq ; f++) {
+      for (e = 0; e < col_len; e++) {
+        for (i = e; i < n_count; i += col_len) {
+          if ( i < n0) { data = ppst->pam2p[ip][i][f];}
+          else {data = 0;}
+          *ps++ = (unsigned short)data;
+        }
+      }
+    }
+    pc = f_str->byte_score;
+    col_len = (n0 + 15) / 16;
+    n_count = (n0 + 15) & 0xfffffff0;
+    for (f = 0; f < n_count; ++f) {
+      *pc++ = 0;
+    }
+    for (f = 1; f < nsq ; f++) {
+      for (e = 0; e < col_len; e++) {
+        for (i = e; i < n_count; i += col_len) {
+          if ( i < n0 ) { data = ppst->pam2p[ip][i][f] - bias;}
+          else {data = 0 - bias;}
+          if (data > 255) {
+            printf("Fatal error. data: %d bias: %d, position: %d/%d, "
+                   "Score out of range for 8-bit SSE2 datatype.\n",
+                   data, bias, f, e);
+            exit(1);
+          }
+          *pc++ = (unsigned char)data;
+        }
+      }
+    }
+  }
+  else
+    {
+      /* Classical simple substitution matrix */
+      /* Find the bias to use in the substitution matrix */
+      bias = 127;
+      for (i = 1; i < nsq ; i++) {
+        for (j = 1; j < nsq ; j++) {
+          data = ppst->pam2[ip][i][j];
+          if (data < -128) {
+            fprintf(stderr,"*** ERROR *** data out of range: %d[%d][%d,%d]\n",
+                    data, ip, i, j);
+          }
+          if (data < bias) {
+            bias = data;
+          }
+        }
+      }
+
+      /* Fill our specially organized byte- and word-size scoring arrays. */
+      ps = f_str->word_score;
+      col_len = (n0 + 7) / 8;
+      n_count = (n0 + 7) & 0xfffffff8;
+      for (f = 0; f < n_count; ++f) {
+        *ps++ = 0;
+      }
+      for (f = 1; f < nsq ; f++) {
+        for (e = 0; e < col_len; e++) {
+          for (i = e; i < n_count; i += col_len) {
+            if (i >= n0) {
+              data = 0;
+            } else {
+              data = ppst->pam2[ip][aa0[i]][f];
+            }
+            *ps++ = (unsigned short)data;
+          }
+        }
+      }
+
+      pc = f_str->byte_score;
+      col_len = (n0 + 15) / 16;
+      n_count = (n0 + 15) & 0xfffffff0;
+      for (f = 0; f < n_count; ++f) {
+        *pc++ = 0;
+      }
+      for (f = 1; f < nsq ; f++) {
+        for (e = 0; e < col_len; e++) {
+          for (i = e; i < n_count; i += col_len) {
+            if (i >= n0) {
+              data = -bias;
+            } else {
+              data = ppst->pam2[ip][aa0[i]][f] - bias;
+            }
+            if (data > 255) {
+              printf("Fatal error. data: %d bias: %d, position: %d/%d, "
+                     "Score out of range for 8-bit SSE2 datatype.\n",
+                     data, bias, f, e);
+              exit(1);
+            }
+            *pc++ = (unsigned char)data;
+          }
+        }
+      }
+    }
+
+  f_str->bias = (unsigned char) (-bias);
+  f_str->alphabet_size = nsq+1;
+
+  /* Some variable to keep track of how many 8-bit runs we need to rerun
+   * in 16-bit accuracy. If there are too many reruns it can be faster
+   * to use 16-bit alignments directly.
+   */
+
+  /* We can only do 8-bit alignments if the scores were small enough. */
+  f_str->try_8bit = (overflow == 0) ? 1 : 0;
+
+  f_str->done_8bit  = 0;
+  f_str->done_16bit = 0;
+#endif /* SW_SSE2 */
+
+  /* minimum allocation for alignment */
+  f_str->max_res =  max(3*n0/2,MIN_RES);
+
+  *f_arg = f_str;
+}
+
+/* close_work() - release everything init_work() allocated and set
+   *f_arg to NULL.  A NULL *f_arg is ignored.  aa0, n0 and ppst are
+   not used. */
+void close_work (const unsigned char *aa0, int n0,
+                 struct pstruct *ppst,
+                 struct f_struct **f_arg)
+{
+  struct f_struct *fs_p = *f_arg;
+
+  if (fs_p == NULL) return;
+
+  free(fs_p->kar_p);		/* free(NULL) is a no-op */
+
+  /* init_work() stored ss one element past the allocation start */
+  free(fs_p->ss - 1);
+
+  /* alignment (universal) profile */
+  free(fs_p->waa_a);
+  free(fs_p->pam2p[0][0]);
+  free(fs_p->pam2p[0]);
+
+  /* variable (-S) profile */
+  free(fs_p->waa_s);
+  free(fs_p->pam2p[1][0]);
+  free(fs_p->pam2p[1]);
+
+#if defined(SW_ALTIVEC) || defined(SW_SSE2)
+  free(fs_p->workspace_memory);
+  free(fs_p->word_score_memory);
+  free(fs_p->byte_score_memory);
+#endif
+
+  free(fs_p);
+  *f_arg = NULL;
+}
+
+
+/* get_param() - describe the algorithm and scoring parameters.
+
+   ppst       -- scoring parameters being reported
+   pstring1   -- pstring1[0] receives "<algorithm> (<version>)",
+                 pstring1[1] receives the matrix / gap penalty summary;
+                 pstring1 is a message to the manager, currently 512
+   pstring2   -- when not NULL, receives the "; pg_..." lines
+   s_cnt_info -- not used
+*/
+void get_param (const struct pstruct *ppst,
+                char **pstring1, char *pstring2,
+                struct score_count_s *s_cnt_info)
+{
+  char pg_str[120];
+  char psi_str[120];
+  const char *xs_str;
+
+  /* if both SIMD macros are defined the SSE2 name is reported */
+#if defined(SW_SSE2)
+  strncpy(pg_str,"Smith-Waterman (SSE2, Michael Farrar 2006)",sizeof(pg_str));
+#elif defined(SW_ALTIVEC)
+  strncpy(pg_str,"Smith-Waterman (Altivec/VMX, Erik Lindahl 2004)",sizeof(pg_str));
+#else
+  strncpy(pg_str,"Smith-Waterman (PGopt)",sizeof(pg_str));
+#endif
+
+  if (!ppst->pam_pssm) {
+    psi_str[0]='\0';
+  }
+  else {
+    strncpy(psi_str,"-PSI",sizeof(psi_str));
+  }
+
+  xs_str = (ppst->ext_sq_set)?"xS":"\0";
+
+  sprintf (pstring1[0], "%s (%s)", pg_str, verstr);
+
+#ifdef OLD_FASTA_GAP
+  sprintf (pstring1[1],
+           "%s matrix%s (%d:%d)%s, gap-penalty: %d/%d",
+           ppst->pam_name, psi_str, ppst->pam_h,ppst->pam_l,
+           xs_str, ppst->gdelval, ppst->ggapval);
+#else
+  sprintf (pstring1[1],
+           "%s matrix%s (%d:%d)%s, open/ext: %d/%d",
+           ppst->pam_name, psi_str, ppst->pam_h,ppst->pam_l,
+           xs_str, ppst->gdelval, ppst->ggapval);
+#endif
+
+  if (pstring2 == NULL) return;
+
+#ifdef OLD_FASTA_GAP
+  sprintf(pstring2,"; pg_name_alg: %s\n; pg_ver_rel: %s\n; pg_matrix: %s%s (%d:%d)%s\n; pg_gap-pen: %d %d\n",
+          pg_str,verstr,ppst->pam_name,psi_str,ppst->pam_h,ppst->pam_l,
+          xs_str,ppst->gdelval,ppst->ggapval);
+#else
+  sprintf(pstring2,"; pg_name_alg: %s\n; pg_ver_rel: %s\n; pg_matrix: %s%s (%d:%d)%s\n; pg_open-ext: %d %d\n",
+          pg_str,verstr,ppst->pam_name,psi_str,ppst->pam_h,ppst->pam_l,
+          xs_str,ppst->gdelval,ppst->ggapval);
+#endif
+}
+
+/* do_work() - score the query (aa0) against one library sequence
+   (aa1) and store the result in rst.
+
+   aa0, n0   -- query sequence and length
+   aa1, n1   -- library sequence and length
+   ppst      -- scoring parameters (gdelval, ggapval, zsflag, pam2)
+   f_str     -- per-query state built by init_work()
+   rst       -- receives score[0] (the Smith-Waterman score),
+                score[1] = score[2] = 0, comp and H
+   sc_info   -- s_cnt[0] and tot_scores are incremented
+   frame, qr_flg, shuff_flg and the local i are not used here
+
+   Which scorer runs is chosen at compile time: SW_ALTIVEC or SW_SSE2
+   use the vector code (8-bit first, redone in 16-bit when the score
+   is >= 255), otherwise FLOCAL_ALIGN() is used.
+*/
+void do_work (const unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1,
+ int frame,
+ const struct pstruct *ppst, struct f_struct *f_str,
+ int qr_flg, int shuff_flg, struct rstruct *rst,
+ struct score_count_s *sc_info)
+{
+ int score;
+ double lambda, H;
+ int i;
+
+#ifdef LALIGN
+ /* identical sequences get the self-score from the profile */
+ /* NOTE(review): this returns before alg_info, valid_stat, the
+    sc_info counters, score[1], score[2], comp and H are assigned -
+    confirm that callers do not read them in this case */
+ if (same_seq(aa0,n0,aa1,n1)) {
+ rst->score[0] = prof_score(aa1, n0, f_str->waa_s);
+ return;
+ }
+#endif
+
+ rst->alg_info = 0;
+ rst->valid_stat = 1;
+ sc_info->s_cnt[0]++;
+ sc_info->tot_scores++;
+
+#ifdef SW_ALTIVEC
+ if(f_str->try_8bit)
+ {
+ score = smith_waterman_altivec_byte(aa0,
+ f_str->byte_score,
+ n0,
+ aa1,
+ n1,
+ f_str->bias,
+#ifndef OLD_FASTA_GAP
+ -(ppst->gdelval + ppst->ggapval),
+#else
+ -ppst->gdelval,
+#endif
+ -ppst->ggapval,
+ f_str);
+
+ f_str->done_8bit++;
+
+ if(score>=255)
+ {
+ /* Overflow, so we have to redo it in 16 bits. */
+ score = smith_waterman_altivec_word(aa0,
+ f_str->word_score,
+ n0,
+ aa1,
+ n1,
+ f_str->bias,
+#ifndef OLD_FASTA_GAP
+ -(ppst->gdelval + ppst->ggapval),
+#else
+ -ppst->gdelval,
+#endif
+ -ppst->ggapval,
+ f_str);
+
+ /* The 8 bit version is roughly 50% faster than the 16 bit version,
+ * so we are fine if less than about 1/3 of the runs have to
+ * be rerun with 16 bits. If it is more, and we have tried at least
+ * 500 sequences, we switch off the 8-bit mode.
+ */
+ f_str->done_16bit++;
+ if(f_str->done_8bit>500 && (3*f_str->done_16bit)>(f_str->done_8bit))
+ f_str->try_8bit = 0;
+ }
+ }
+ else
+ {
+ /* Just use the 16-bit altivec version directly */
+ score = smith_waterman_altivec_word(aa0,
+ f_str->word_score,
+ n0,
+ aa1,
+ n1,
+ f_str->bias,
+#ifndef OLD_FASTA_GAP
+ -(ppst->gdelval + ppst->ggapval),
+#else
+ -ppst->gdelval,
+#endif
+ -ppst->ggapval,
+ f_str);
+ }
+
+#endif /* SW_ALTIVEC */
+
+#if defined(SW_SSE2)
+
+ if(f_str->try_8bit)
+ {
+ score = smith_waterman_sse2_byte(aa0,
+ f_str->byte_score,
+ n0,
+ aa1,
+ n1,
+ f_str->bias,
+#ifndef OLD_FASTA_GAP
+ -(ppst->gdelval + ppst->ggapval),
+#else
+ -ppst->gdelval,
+#endif
+ -ppst->ggapval,
+ f_str);
+
+ f_str->done_8bit++;
+
+ if(score>=255)
+ {
+ /* Overflow, so we have to redo it in 16 bits. */
+ /* unlike the byte version, the word version takes no bias argument */
+ score = smith_waterman_sse2_word(aa0,
+ f_str->word_score,
+ n0,
+ aa1,
+ n1,
+#ifndef OLD_FASTA_GAP
+ -(ppst->gdelval + ppst->ggapval),
+#else
+ -ppst->gdelval,
+#endif
+ -ppst->ggapval,
+ f_str);
+
+ /* The 8 bit version is roughly 50% faster than the 16 bit version,
+ * so we are fine if less than about 1/3 of the runs have to
+ * be rerun with 16 bits. If it is more, and we have tried at least
+ * 500 sequences, we switch off the 8-bit mode.
+ */
+ f_str->done_16bit++;
+ if(f_str->done_8bit>500 && (3*f_str->done_16bit)>(f_str->done_8bit))
+ f_str->try_8bit = 0;
+ }
+ }
+ else
+ {
+ /* Just use the 16-bit SSE2 version directly */
+ score = smith_waterman_sse2_word(aa0,
+ f_str->word_score,
+ n0,
+ aa1,
+ n1,
+#ifndef OLD_FASTA_GAP
+ -(ppst->gdelval + ppst->ggapval),
+#else
+ -ppst->gdelval,
+#endif
+ -ppst->ggapval,
+ f_str);
+ }
+#endif /* SW_SSE2 */
+
+#if !defined(SW_ALTIVEC) && !defined(SW_SSE2)
+
+ /* scalar version; FLOCAL_ALIGN() adds its GG and HH arguments to
+    get the cost of the first residue in a gap */
+ score = FLOCAL_ALIGN(aa0,aa1,n0,n1,0,0,
+ NULL,
+#ifdef OLD_FASTA_GAP
+ -(ppst->gdelval - ppst->ggapval),
+#else
+ -ppst->gdelval,
+#endif
+ -ppst->ggapval,0,f_str);
+#endif
+
+ rst->score[0] = score;
+ rst->score[1] = rst->score[2] = 0;
+
+ /* composition statistics only for zsflag % 10 == 6; -1.0 marks
+    "not available" */
+ if(((ppst->zsflag % 10) == 6) &&
+ (do_karlin(aa1, n1, ppst->pam2[0], ppst,f_str->aa0_f,
+ f_str->kar_p, &lambda, &H)>0)) {
+ rst->comp = 1.0/lambda;
+ rst->H = H;
+ }
+ else {rst->comp = rst->H = -1.0;}
+
+}
+
+/* FLOCAL_ALIGN() - scalar Smith-Waterman score of the query profile
+   in f_str->waa_s against aa1.
+
+   aa1    -- library sequence; must be terminated by a 0 residue
+             (the outer loop runs until *aa1p == 0)
+   n0     -- query length; also the stride of the waa_s profile
+   GG, HH -- gap penalties, both positive: the first residue in a gap
+             costs GG + HH, each additional residue HH
+   f_str  -- supplies the score row ss[] and the profile waa_s
+   aa0, n1, low, up, W and MW are not referenced in the body
+
+   returns the best score seen; score is only updated at "transition",
+   i.e. for cells whose value exceeds GG + HH (see the file header
+   note on the shortcuts used here).
+*/
+static int
+FLOCAL_ALIGN(const unsigned char *aa0, const unsigned char *aa1,
+ int n0, int n1, int low, int up,
+ int **W, int GG,int HH, int MW,
+ struct f_struct *f_str) {
+
+ register int *pwaa;
+ register struct swstr *ssj;
+ struct swstr *ss;
+ register int h, e, f, p;
+ int temp, score;
+ int gap_ext, n_gap_init;
+
+ const unsigned char *aa1p;
+ ss = f_str->ss;
+ /* end-of-row sentinel: H == -1 is tested only when E > 0, so E is 1 */
+ ss[n0].H = -1;
+ ss[n0].E = 1;
+
+ n_gap_init = GG + HH;
+ gap_ext = -HH; /* GG, HH are both positive,
+ gap_ext penalty should be negative */
+
+ score = 0;
+ for (h=0; h<n0; h++) { /* initialize 0th row */
+ ss[h].H = ss[h].E = 0;
+ }
+
+ aa1p=aa1;
+ while (*aa1p) { /* relies on aa1[n1]==0 for EOS flag */
+ /* waa_s has the offsets for each residue in aa0 into pam2 */
+ /* waa_s has complexity (-S) dependent scores */
+ pwaa = f_str->waa_s + (*aa1p++)*n0;
+ ssj = ss;
+
+ e = f = h = p = 0;
+ zero_f: /* in this section left-gap f==0, and is never examined */
+
+ while (1) { /* build until h > n_gap_init (f < 0 until h > n_gap_init) */
+ /* bump through the pam[][]'s for each of the aa1[] matches to
+ aa0[], because of the way *pwaa is set up */
+
+ h = p + *pwaa++; /* increment diag value */
+ p = ssj->H; /* get next diag value */
+ if ((e = ssj->E) > 0 ) { /* >0 from up-gap */
+ if (p == -1) goto next_row; /* done, -1=ss[n0].H sentinel */
+ if (h < e) h = e; /* up-gap better than diag */
+ else
+ if (h > n_gap_init) { /* we won't be starting a new up-gap */
+ e += gap_ext; /* but we might be extending one */
+ goto transition; /* good h > n_gap_diag; scan f */
+ }
+ e += gap_ext; /* up-gap decreased */
+ ssj->E = (e > 0) ? e : 0; /* set to 0 if < 0 */
+ ssj++->H = h; /* diag match updated */
+ }
+ else { /* up-gap (->E) is 0 */
+ if ( h > 0) { /* diag > 0 */
+ if (h > n_gap_init) { /* we won't be starting a new up-gap */
+ e = 0; /* and we won't be extending one */
+ goto transition; /* good h > n_gap_diag; scan f */
+ }
+ ssj++->H = h; /* update diag */
+ }
+ else ssj++->H = 0; /* update diag to 0 */
+ }
+ }
+
+ /* here h > n_gap_init and h > e, => the next f will be > 0 */
+ transition:
+#ifdef DEBUG
+ if ( h > 10000)
+ fprintf(stderr,"h: %d ssj: %d\n",h, (int)(ssj-ss));
+#endif
+ if ( score < h ) score = h; /* save best score, only when h > n_gap_init */
+
+ temp = h - n_gap_init; /* best score for starting a new gap */
+ if ( f < temp ) f = temp; /* start a left-gap? */
+ if ( e < temp ) e = temp; /* start an up-gap? */
+ ssj->E = ( e > 0 ) ? e : 0; /* update up-gap */
+ ssj++->H = h; /* update diag */
+ e = 0;
+
+ do { /* stay here until f <= 0 */
+ h = p + *pwaa++; /* diag + match/mismatch */
+ p = ssj->H; /* save next (right) diag */
+
+ if ( h < f ) h = f; /* update diag using left gap */
+ f += gap_ext; /* update next left-gap */
+
+ if ((e = ssj->E) > 0) { /* good up gap */
+ if (p == -1) goto next_row; /* at the end of the row */
+ if ( h < e ) h = e; /* update diag using up-gap */
+ else
+ if ( h > n_gap_init ) {
+ e += gap_ext; /* update up gap */
+ goto transition; /* good diag > n_gap_init, restart */
+ }
+ e += gap_ext; /* update up-gap */
+ ssj->E = (e > 0) ? e : 0; /* e must be >= 0 */
+ ssj++->H = h; /* update diag */
+ }
+ else { /* up-gap <= 0 */
+ if ( h > n_gap_init ) {
+ e = 0;
+ goto transition; /* good diag > n_gap_init; restart */
+ }
+ ssj++->H = h; /* update diag */
+ }
+ } while ( f > 0 ); /* while left gap f > 0 */
+ goto zero_f; /* otherwise, go to f==0 section */
+ next_row:
+ ;
+ } /* end while (*aa1p) {} */
+
+ return score;
+
+} /* here we should be all done */
+
+/* do_opt() - intentionally empty in this file: no argument is read
+   and rst is not modified */
+void do_opt (const unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1,
+ int frame,
+ struct pstruct *ppst, struct f_struct *f_str,
+ struct rstruct *rst)
+{
+}
+
+/* do_walign() - produce the alignment(s) of aa0 with aa1.
+
+   repeat_thresh -- score threshold passed to nsw_malign() / SIM()
+   have_ares     -- set to 0x3 (the 0x2 bit indicates a local copy)
+
+   returns the head of a list of a_res_str results whose index fields
+   are numbered from 0, or NULL if the first a_res_str cannot be
+   allocated.  Without LALIGN the list comes from nsw_malign(); with
+   LALIGN it is filled in by SIM().
+*/
+struct a_res_str *
+do_walign (const unsigned char *aa0, int n0,
+           const unsigned char *aa1, int n1,
+           int frame, int repeat_thresh,
+           struct pstruct *ppst,
+           struct f_struct *f_str,
+           int *have_ares)
+{
+  struct a_res_str *a_res, *cur_p;
+  int next_index;
+
+  *have_ares = 0x3;	/* set 0x2 bit to indicate local copy */
+
+  a_res = (struct a_res_str *)calloc(1, sizeof(struct a_res_str));
+  if (a_res == NULL) {
+    fprintf(stderr," [do_walign] Cannot allocate a_res");
+    return NULL;
+  }
+
+#ifndef LALIGN
+  a_res = nsw_malign(f_str->pam2p, (ppst->ext_sq_set ? 1 : 0), n0, aa1, n1,
+                     repeat_thresh, f_str->max_res,
+                     -ppst->gdelval, -ppst->ggapval,
+                     f_str->ss, a_res,
+                     &sw_walign, ppst->do_rep);
+
+#else	/* LALIGN */
+  ppst->nseq = (!ppst->show_ident && same_seq(aa0, n0, aa1, n1)) ? 1 : 2;
+
+  SIM(aa0-1, aa1-1, n0, n1, ppst, ppst->nseq, repeat_thresh, ppst->max_repeat, a_res);
+#endif
+
+  /* set a_res->index for alignments */
+  next_index = 0;
+  cur_p = a_res;
+  while (cur_p) {
+    cur_p->index = next_index++;
+    cur_p = cur_p->next;
+  }
+
+  return a_res;
+}
+
+/*
+#define XTERNAL
+#include "upam.h"
+
+void
+print_seq_prof(unsigned char *A, int M,
+ unsigned char *B, int N,
+ int **w, int iw, int dir) {
+ char c_max;
+ int i_max, j_max, i,j;
+
+ char *c_dir="LRlr";
+
+ for (i=1; i<=min(60,M); i++) {
+ fprintf(stderr,"%c",aa[A[i]]);
+ }
+ fprintf(stderr, - %d\n,M);
+
+ for (i=0; i<min(60,M); i++) {
+ i_max = -1;
+ for (j=1; j<21; j++) {
+ if (w[iw+i][j]> i_max) {
+ i_max = w[iw+i][j];
+ j_max = j;
+ }
+ }
+ fprintf(stderr,"%c",aa[j_max]);
+ }
+ fputc(':',stderr);
+
+ for (i=1; i<=min(60,N); i++) {
+ fprintf(stderr,"%c",aa[B[i]]);
+ }
+
+ fprintf(stderr," -%c: %d,%d\n",c_dir[dir],M,N);
+}
+*/
+
+/* pre_cons() - does nothing unless TFAST is defined, in which case
+   aatran() is called on aa1 for the given frame, writing to
+   f_str->aa1x and storing its return value in f_str->n10.
+   NOTE(review): aa1x and n10 are not members of the f_struct declared
+   in dropgsw2.h - confirm the TFAST branch is never compiled with it */
+void
+pre_cons(const unsigned char *aa1, int n1, int frame, struct f_struct *f_str) {
+
+#ifdef TFAST
+ f_str->n10 = aatran(aa1,f_str->aa1x,n1,frame);
+#endif
+
+}
+
+/* aln_func_vals - set up aln.qlfact, qlrev, llfact, llmult, frame, llrev */
+/* call from calcons, calc_id, calc_code */
+/* all factors are 1, llrev and frame are 0; qlrev is 1 only when
+   frame > 0 */
+void
+aln_func_vals(int frame, struct a_struct *aln) {
+
+  aln->qlfact = 1;
+  aln->llmult = 1;
+  aln->llfact = 1;
+  aln->llrev = 0;
+  aln->qlrev = (frame > 0) ? 1 : 0;
+  aln->frame = 0;
+}
+
+/* calculate the 100% identical score */
+/* prof_score() - sum, over the 0-terminated sequence aa1p, the
+   profile entry for residue aa1p[i] at position i, where the profile
+   is laid out as pwaa_s[residue*n0 + position] */
+int
+prof_score(const unsigned char *aa1p, int n0, int *pwaa_s)
+{
+  int total = 0;
+  int pos;
+
+  for (pos = 0; aa1p[pos] != 0; pos++) {
+    total += pwaa_s[aa1p[pos]*n0 + pos];
+  }
+  return total;
+}
+
+/* same_seq() - returns 1 when aa0 and aa1 have the same length and
+   the same residues, 0 otherwise.  The comparison stops at the first
+   0 residue in aa0, so aa0 must be 0-terminated. */
+int same_seq(const unsigned char *aa0, int n0,
+             const unsigned char *aa1, int n1)
+{
+  int n_same;
+
+  if (n0 != n1) return 0;
+
+  /* length of the common prefix, up to the terminator of aa0 */
+  n_same = 0;
+  while (aa0[n_same] != 0 && aa0[n_same] == aa1[n_same]) {
+    n_same++;
+  }
+
+  return (n_same == n0) ? 1 : 0;
+}
diff --git a/src/dropgsw2.h b/src/dropgsw2.h
new file mode 100644
index 0000000..5878321
--- /dev/null
+++ b/src/dropgsw2.h
@@ -0,0 +1,46 @@
+/* $Id: dropgsw2.h 625 2011-03-23 17:21:38Z wrp $ */
+/* $Revision: 625 $ */
+
+/* global definitions shared by dropgsw.c and altivec.c */
+
+/* definitions for SW */
+
+/* Work structure for the SW code that shares this header.  The members
+   inside the conditional section exist only in builds that define
+   SW_ALTIVEC or SW_SSE2. */
+struct f_struct {
+  struct swstr *ss;
+  int *waa_s;
+  int *waa_a;
+  int **pam2p[2];
+  int max_res;
+  double aa0_f[MAXSQ];
+  double *kar_p;
+  double e_cut;
+  int show_ident;
+  int max_repeat;
+#if defined(SW_SSE2) || defined(SW_ALTIVEC)
+  /* vectorized-build members */
+  unsigned char bias;
+  unsigned char ceiling;
+  unsigned short *word_score;
+  unsigned char *byte_score;
+  void *workspace;
+  int alphabet_size;
+  void *word_score_memory;
+  void *byte_score_memory;
+  void *workspace_memory;
+  int try_8bit;
+  int done_8bit;
+  int done_16bit;
+#endif
+};
+
+/* Prototypes that are only declared when LALIGN is defined. */
+#ifdef LALIGN
+/* SIM: the per-parameter comments below give the expected inputs;
+   results are delivered through a_res (the function returns void). */
+void SIM(const unsigned char *A, /* seq1 indexed A[1..M] */
+ const unsigned char *B, /* seq2 indexed B[1..N] */
+ int M, int N, /* len seq1, seq2 */
+ struct pstruct *ppst, /* parameters */
+ int nseq, /* nseq - number of different sequences */
+ int mini_score, /* cut-off score */
+ int max_count, /* number of alignments */
+ struct a_res_str *a_res); /* alignment result structure */
+
+/* same_seq: compares aa0 (length n0) with aa1 (length n1) */
+int same_seq(const unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1);
+#endif
diff --git a/src/dropnfa.c b/src/dropnfa.c
new file mode 100644
index 0000000..f19889b
--- /dev/null
+++ b/src/dropnfa.c
@@ -0,0 +1,2250 @@
+/* $Id: dropnfa.c $ */
+
+/* copyright (c) 1998, 1999, 2014 by William R. Pearson and The Rector &
+ Visitors of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+/* 17-Sept-2008 - modified for multiple non-overlapping alignments */
+
+/* 18-Sep-2006 - removed global variables for alignment from nw_align
+ and bg_align */
+
+/* 18-Oct-2005 - converted to use a_res and aln for alignment coordinates */
+
+/* 14-May-2003 - modified to return alignment start at 0, rather than
+ 1, for begin:end alignments
+*/
+
+/*
+ implements the fasta algorithm, see:
+
+ W. R. Pearson, D. J. Lipman (1988) "Improved tools for biological
+ sequence comparison" Proc. Natl. Acad. Sci. USA 85:2444-2448
+
+ This version uses Smith-Waterman for final protein alignments
+
+ W. R. Pearson (1996) "Effective protein sequence comparison"
+ Methods Enzymol. 266:227-258
+
+
+ 26-April-2001 - -DGAP_OPEN redefines -f, as gap open penalty
+
+ 4-Nov-2001 - modify spam() while(1).
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <math.h>
+
+#include "defs.h"
+#include "param.h"
+
+/* this must be consistent with upam.h */
+#define MAXHASH 32
+#define NMAP MAXHASH+1
+
+/* globals for fasta */
+#define MAXWINDOW 64
+
+#ifndef MAXSAV
+#define MAXSAV 10
+#endif
+
+static char *verstr="3.8 Nov 2011";
+
+extern void w_abort(char *, char *);
+int shscore(const unsigned char *aa0, int n0, int **pam2);
+extern void init_karlin(const unsigned char *aa0, int n0, struct pstruct *ppst,
+ double *aa0_f, double **kp);
+extern void init_karlin_a(struct pstruct *, double *, double **);
+extern int do_karlin(const unsigned char *, int n1, int **,
+ const struct pstruct *, double *, double *,
+ double *, double *);
+extern int ELK_to_s(double E_join, int n0, int n1, double Lambda, double K, double H);
+
+extern void aancpy(char *to, char *from, int count, struct pstruct *ppst);
+char *ckalloc(size_t);
+unsigned long adler32(unsigned long, const unsigned char *, unsigned int);
+
+#ifdef TFASTA
+extern int aatran(const unsigned char *ntseq, unsigned char *aaseq, int maxs, int frame);
+#endif
+
+#include "dropnfa.h"
+
+#define DROP_INTERN
+#include "drop_func.h"
+
+struct swstr { int H, E;};
+
+/* Forward declarations used by the routines below. */
+
+/* dmatch: defined later in this file; wraps the banded local alignment
+   for one query/library pair and returns its score */
+int
+dmatch (const unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1,
+ int hoff, int window,
+ int **pam2, int gdelval, int ggapval,
+ struct f_struct *f_str);
+
+
+/* sw_walign: declared extern, so the definition lives in another file */
+extern int sw_walign (int **pam2p, int n0,
+ const unsigned char *aa1, int n1,
+ int q, int r,
+ struct swstr *ss,
+ struct a_res_str *a_res
+ );
+
+/* bd_walign: definition is not in the part of the file shown here */
+int bd_walign (const unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1,
+ struct pstruct *ppst,
+ struct f_struct *f_str, int hoff,
+ struct a_res_str *a_res);
+
+/* merge_ares_chains: definition is not in the part of the file shown here */
+struct a_res_str *
+merge_ares_chains(struct a_res_str *cur_ares,
+ struct a_res_str *tmpl_ares,
+ int score_ix, const char *msg);
+
+/* initialize for fasta */
+
+/* init_work - build the per-query work structure and return it in *f_arg.
+
+   aa0/n0  query sequence and its length
+   ppst    search parameters; when use_E_thresholds is off this routine
+           also writes param_u.fa.cgap and (unless optcut_set == 1)
+           param_u.fa.optcut, and it always writes param_u.fa.pgap
+   f_arg   receives the newly allocated struct f_struct
+
+   The routine allocates the ktup hash table (harr/link) over aa0, the
+   per-word and per-residue score tables (pamh2/pamh1), the diagonal
+   array, the band (bss) and Smith-Waterman (ss) score arrays, and the
+   query profiles (waa_s/waa_a and pam2p[0..1]).  Any allocation failure
+   prints a message to stderr and calls exit(1).  close_work() releases
+   what is allocated here. */
+void
+init_work (unsigned char *aa0, int n0,
+           struct pstruct *ppst,
+           struct f_struct **f_arg
+           )
+{
+  int mhv, phv;
+  int hmax;
+  int i0, hv;
+  int pamfact;
+  int btemp;
+  struct f_struct *f_str;
+  /* these used to be globals, but do not need to be */
+  int ktup;             /* word size examined */
+  int fact;             /* factor used to scale ktup match value */
+  int kt1;              /* ktup-1 */
+  int lkt;              /* last ktup - initially kt1, but can be increased
+                           for param_u.fa.hsq >= NMAP */
+
+  int maxn0;            /* used in band alignment */
+  int *pwaa_a, *pwaa_s; /* pam[aa0[]] profile */
+  int i, e, f;
+  struct swstr *ss;
+  int **pam2p;
+  int nsq, ip, *hsq;
+
+  if (ppst->ext_sq_set) {
+    nsq = ppst->nsqx; ip = 1;
+    hsq = ppst->hsqx;
+  }
+  else {
+    nsq = ppst->nsqx; ip = 0;
+    hsq = ppst->hsq;
+  }
+
+  /* f_str is dereferenced immediately below, so a failed calloc()
+     must stop here like every other allocation in this routine */
+  if ((f_str = (struct f_struct *)calloc(1,sizeof(struct f_struct))) == NULL) {
+    fprintf (stderr, "*** error [%s:%d] cannot allocate f_str\n",
+             __FILE__, __LINE__);
+    exit (1);
+  }
+
+#ifndef TFASTA
+  if((ppst->zsflag%10) == 6) {
+    f_str->kar_p = NULL;
+    init_karlin(aa0, n0, ppst, &f_str->aa0_f[0], &f_str->kar_p);
+  }
+#endif
+
+  if (!ppst->param_u.fa.use_E_thresholds) {     /* old fashioned thresholds */
+    btemp = 2 * ppst->param_u.fa.bestoff / 3 +
+      n0 / ppst->param_u.fa.bestscale +
+      ppst->param_u.fa.bkfact *
+      (ppst->param_u.fa.bktup - ppst->param_u.fa.ktup);
+
+    if (ppst->nt_align) {
+      btemp = (btemp*ppst->pam_h)/5;    /* normalize to standard +5/-4 */
+    }
+    else {
+      /* correct problem produced by allowing bktup=3, which increases
+         btemp by bkfact for proteins 2-May-2011 */
+      if (ppst->param_u.fa.ktup < 3) btemp -= ppst->param_u.fa.bkfact;
+    }
+
+    btemp = min (btemp, ppst->param_u.fa.bestmax);
+    if (btemp > 3 * n0) btemp = 3 * shscore(aa0,n0,ppst->pam2[0]) / 5;
+
+    ppst->param_u.fa.cgap = btemp + ppst->param_u.fa.bestoff / 3;
+
+    if (ppst->param_u.fa.optcut_set != 1) {
+#ifndef TFASTA
+      ppst->param_u.fa.optcut = btemp;
+#else
+      ppst->param_u.fa.optcut = (btemp*3)/2;
+#endif
+    }
+  }
+
+#ifndef OLD_FASTA_GAP
+  ppst->param_u.fa.pgap = ppst->gdelval + 2*ppst->ggapval;
+#else
+  ppst->param_u.fa.pgap = ppst->gdelval + ppst->ggapval;
+#endif
+  pamfact = ppst->param_u.fa.pamfact;
+  ktup = ppst->param_u.fa.ktup;
+  fact = ppst->param_u.fa.scfact * ktup;
+
+  if (pamfact == -1) pamfact = 0;
+  else if (pamfact == -2) pamfact = 1;
+
+  /* largest hash code below NMAP decides how many bits one residue needs */
+  for (i0 = 1, mhv = -1; i0 < ppst->nsq; i0++)
+    if (hsq[i0] < NMAP && hsq[i0] > mhv) mhv = hsq[i0];
+
+  if (mhv <= 0) {
+    fprintf (stderr, " maximum hsq <=0 %d\n", mhv);
+    exit (1);
+  }
+
+  for (f_str->kshft = 0; mhv > 0; mhv /= 2) f_str->kshft++;
+
+  /*      kshft = 2;      */
+  kt1 = ktup - 1;
+  hv = 1;
+  for (i0 = 0; i0 < ktup; i0++) hv = hv << f_str->kshft;
+  hmax = hv;
+  f_str->hmask = (hmax >> f_str->kshft) - 1;
+
+  if ((f_str->harr = (int *) calloc (hmax, sizeof (int))) == NULL) {
+    fprintf (stderr, " *** cannot allocate hash array: hmax: %d hmask: %d\n",
+             hmax, f_str->hmask);
+    exit (1);
+  }
+
+  if ((f_str->pamh1 = (int *) calloc (nsq+1, sizeof (int))) == NULL) {
+    fprintf (stderr, " *** cannot allocate pamh1 array nsq=%d\n",nsq);
+    exit (1);
+  }
+
+  if ((f_str->pamh2 = (int *) calloc (hmax, sizeof (int))) == NULL) {
+    fprintf (stderr, " *** cannot allocate pamh2 array hmax=%d\n",hmax);
+    exit (1);
+  }
+
+  if ((f_str->link = (int *) calloc (n0, sizeof (int))) == NULL) {
+    /* message now ends with a newline like the other diagnostics */
+    fprintf (stderr, " *** cannot allocate hash link array n0=%d\n",n0);
+    exit (1);
+  }
+
+  for (i0 = 0; i0 < hmax; i0++) f_str->harr[i0] = -1;
+  for (i0 = 0; i0 < n0; i0++) f_str->link[i0] = -1;
+
+  /* encode the aa0 array */
+  phv = hv = 0;
+  lkt = kt1;
+  /* restart hv, phv calculation */
+  for (i0 = 0; i0 < min(lkt,n0); i0++) {
+    if (hsq[aa0[i0]] >= NMAP) {hv=phv=0; lkt = i0+ ktup; continue;}
+    hv = (hv << f_str->kshft) + hsq[aa0[i0]];
+    phv += ppst->pam2[ip][aa0[i0]][aa0[i0]]*ktup;
+  }
+
+  for (; i0 < n0; i0++) {
+    if (hsq[aa0[i0]] >= NMAP) {
+      hv=phv=0;
+      /* restart hv, phv calculation */
+      for (lkt = i0+kt1; (i0 < lkt || hsq[aa0[i0]]>=NMAP) && i0<n0; i0++) {
+        if (hsq[aa0[i0]] >= NMAP) {
+          hv=phv=0;
+          lkt = i0+ktup;
+          continue;
+        }
+        hv = (hv << f_str->kshft) + hsq[aa0[i0]];
+        phv += ppst->pam2[ip][aa0[i0]][aa0[i0]]*ktup;
+      }
+    }
+    if (i0 >= n0) break;
+    hv = ((hv & f_str->hmask) << f_str->kshft) + hsq[aa0[i0]];
+    /* chain earlier occurrences of the same word through link[] */
+    f_str->link[i0] = f_str->harr[hv];
+    f_str->harr[hv] = i0;
+    if (pamfact) {
+      f_str->pamh2[hv] = (phv += ppst->pam2[ip][aa0[i0]][aa0[i0]] * ktup);
+      /* this check should always be true, but just in case */
+      if (hsq[aa0[i0-kt1]]<NMAP)
+        phv -= ppst->pam2[ip][aa0[i0 - kt1]][aa0[i0 - kt1]] * ktup;
+    }
+    else f_str->pamh2[hv] = fact * ktup;
+  }
+
+  /* this has been modified from 0..<ppst->nsq to 1..< ppst->nsq because the
+     pam2[0][0] is now undefined for consistency with blast
+  */
+
+  if (pamfact) {
+    for (i0 = 1; i0 < nsq; i0++) {
+      f_str->pamh1[i0] = ppst->pam2[ip][i0][i0] * ktup;
+    }
+  }
+  else {
+    for (i0 = 1; i0 < nsq; i0++) {
+      f_str->pamh1[i0] = fact;
+    }
+  }
+  f_str->ndo = 0;
+  if ((f_str->diag = (struct dstruct *) calloc ((size_t)MAXDIAG,
+                                                sizeof (struct dstruct)))==NULL) {
+    /* cast: the product is a size_t, which need not be unsigned long */
+    fprintf (stderr," *** cannot allocate diagonal arrays: %lu\n",
+             (unsigned long)(MAXDIAG *sizeof (struct dstruct)));
+    exit (1);
+  }
+
+
+#ifdef TFASTA
+  if ((f_str->aa1x =(unsigned char *)calloc((size_t)ppst->maxlen+2,
+                                            sizeof(unsigned char)))
+      == NULL) {
+    fprintf (stderr, " *** cannot allocate aa1x array %d\n", ppst->maxlen+2);
+    exit (1);
+  }
+  f_str->aa1x++;
+#endif
+
+  f_str->bss_size = ppst->param_u.fa.optwid*2+4;
+  /* bss is incremented right away, so it must not be NULL */
+  if ((f_str->bss = (struct bdstr *) calloc((size_t)ppst->param_u.fa.optwid*2+4,
+                                            sizeof(struct bdstr))) == NULL) {
+    fprintf (stderr, "*** error [%s:%d] cannot allocate bss array %d\n",
+             __FILE__, __LINE__, f_str->bss_size);
+    exit (1);
+  }
+  f_str->bss++;
+
+  /* allocate space for the scoring arrays */
+  maxn0 = n0 + 4;
+  if ((ss = (struct swstr *) calloc (maxn0, sizeof (struct swstr)))
+      == NULL) {
+    fprintf (stderr, " *** cannot allocate ss array %3d\n", n0);
+    exit (1);
+  }
+  ss++;
+
+  ss[n0].H = -1;        /* this is used as a sentinel - normally H >= 0 */
+  ss[n0].E = 1;
+  f_str->ss = ss;
+
+  /* initialize variable (-S) pam matrix */
+  if ((f_str->waa_s= (int *)calloc((nsq+1)*(n0+1),sizeof(int))) == NULL) {
+    fprintf(stderr,"*** error [%s:%d] cannot allocate waa_s array %3d\n",
+            __FILE__, __LINE__, nsq*n0);
+    exit(1);
+  }
+
+  /* initialize pam2p[1] pointers */
+  if ((f_str->pam2p[1]= (int **)calloc((n0+1),sizeof(int *))) == NULL) {
+    fprintf(stderr,"*** error [%s:%d] cannot allocate pam2p[1] array %3d\n",
+            __FILE__, __LINE__, n0);
+    exit(1);
+  }
+
+  pam2p = f_str->pam2p[1];
+  if ((pam2p[0]=(int *)calloc((nsq+1)*(n0+1),sizeof(int))) == NULL) {
+    fprintf(stderr,"*** error [%s:%d] cannot allocate pam2p[1][] array %3d\n",
+            __FILE__, __LINE__, nsq*n0);
+    exit(1);
+  }
+
+  /* row i of the profile starts i*(nsq+1) ints into the single block */
+  for (i=1; i<n0; i++) {
+    pam2p[i]= pam2p[0] + (i*(nsq+1));
+  }
+
+  /* initialize universal (alignment) matrix */
+  if ((f_str->waa_a= (int *)calloc((nsq+1)*(n0+1),sizeof(int))) == NULL) {
+    fprintf(stderr,"*** error [%s:%d] cannot allocate waa_a struct %3d\n",
+            __FILE__, __LINE__, nsq*n0);
+    exit(1);
+  }
+
+  /* initialize pam2p[0] pointers */
+  if ((f_str->pam2p[0]= (int **)calloc((n0+1),sizeof(int *))) == NULL) {
+    /* message corrected: this is the pam2p[0] allocation */
+    fprintf(stderr,"*** error [%s:%d] cannot allocate pam2p[0] array %3d\n",
+            __FILE__, __LINE__, n0);
+    exit(1);
+  }
+
+  pam2p = f_str->pam2p[0];
+  if ((pam2p[0]=(int *)calloc((nsq+1)*(n0+1),sizeof(int))) == NULL) {
+    fprintf(stderr,"*** error [%s:%d] cannot allocate pam2p[0][] array %3d\n",
+            __FILE__, __LINE__, nsq*n0);
+    exit(1);
+  }
+
+  for (i=1; i<n0; i++) {
+    pam2p[i]= pam2p[0] + (i*(nsq+1));
+  }
+
+  /*
+     pwaa effectively has a sequence profile --
+     pwaa[0..n0-1] has pam score for residue 0 (-BIGNUM)
+     pwaa[n0..2n0-1] has pam scores for residue 1 (A)
+     pwaa[2n0..3n-1] has pam scores for residue 2 (R), ...
+
+     thus: pwaa = f_str->waa_s + (*aa1p++)*n0; sets up pwaa so that
+     *pwaa++ rapidly moves though the scores of the aa1p[] position
+     without further indexing
+
+     For a real sequence profile, pwaa[0..n0-1] vs ['A'] could have
+     a different score in each position.
+  */
+
+  if (ppst->pam_pssm) {
+    pwaa_s = f_str->waa_s;
+    pwaa_a = f_str->waa_a;
+    for (e = 0; e < nsq; e++) { /* for each residue in the alphabet */
+      for (f = 0; f < n0; f++) {        /* for each position in aa0 */
+        *pwaa_s++ = f_str->pam2p[ip][f][e] = ppst->pam2p[ip][f][e];
+        *pwaa_a++ = f_str->pam2p[0][f][e] = ppst->pam2p[0][f][e];
+      }
+    }
+  }
+  else {        /* initialize scanning matrix */
+    pwaa_s = f_str->waa_s;
+    pwaa_a = f_str->waa_a;
+    for (e = 0; e <nsq; e++)    /* for each residue in the alphabet */
+      for (f = 0; f < n0; f++) {        /* for each position in aa0 */
+        *pwaa_s++ = f_str->pam2p[ip][f][e]= ppst->pam2[ip][aa0[f]][e];
+        *pwaa_a++ = f_str->pam2p[0][f][e] = ppst->pam2[0][aa0[f]][e];
+      }
+  }
+
+  f_str->max_res = max(3*n0/2,MIN_RES);
+
+  *f_arg = f_str;
+}
+
+/* get_param - format the program name, version, matrix, gap penalties,
+   ktup and join/opt thresholds for the run summary.
+
+   pstring1[0], pstring1[1]  receive the human-readable lines
+   pstring2                  if not NULL, receives the same information
+                             as "; pg_..." records (markx==10 format)
+   s_info                    score counters; s_cnt[0] and s_cnt[2] are
+                             reported as fractions of tot_scores */
+void
+get_param (const struct pstruct *ppstr,
+           char **pstring1, char *pstring2, struct score_count_s *s_info)
+{
+  char options_str1[128];
+  char options_str2[128];
+  double join_frac, opt_frac;
+#ifdef TFASTA
+  char *pg_str="TFASTA";
+#else
+  char *pg_str="FASTA";
+#endif
+
+  /* both branches below print the same two ratios */
+  join_frac = (double)s_info->s_cnt[0]/(double)s_info->tot_scores;
+  opt_frac = (double)s_info->s_cnt[2]/(double)s_info->tot_scores;
+
+  if (ppstr->param_u.fa.use_E_thresholds) {
+    sprintf(options_str1,"E-join: %.2g (%.3g), E-opt: %.2g (%.3g)",
+            ppstr->param_u.fa.E_join, join_frac,
+            ppstr->param_u.fa.E_band_opt, opt_frac);
+    sprintf(options_str2,"pg_join_E(): %.2g (%.3g)\n; pg_optcut_E(): %.2g (%.3g)",
+            ppstr->param_u.fa.E_join, join_frac,
+            ppstr->param_u.fa.E_band_opt, opt_frac);
+  }
+  else {
+    sprintf(options_str1,"join: %d (%0.3g), opt: %d (%0.3g)",
+            ppstr->param_u.fa.cgap, join_frac,
+            ppstr->param_u.fa.optcut, opt_frac);
+    sprintf(options_str2,"pg_join: %d (%.3g)\n; pg_optcut: %d (%.3g)",
+            ppstr->param_u.fa.cgap, join_frac,
+            ppstr->param_u.fa.optcut, opt_frac);
+  }
+
+  /* first line: program, version and scoring mode */
+  if (ppstr->param_u.fa.optflag) {
+    sprintf (pstring1[0], "%s (%s) [optimized]", pg_str, verstr);
+  }
+  else {
+    sprintf (pstring1[0], "%s (%s)", pg_str, verstr);
+    if (ppstr->param_u.fa.iniflag) strcat(pstring1[0]," init1");
+  }
+
+  /* second line: matrix, gap penalties, ktup, thresholds, band width */
+  sprintf (pstring1[1],
+#ifndef OLD_FASTA_GAP
+           "%s matrix (%d:%d)%s, open/ext: %d/%d\n ktup: %d, %s, width: %3d",
+#else
+           "%s matrix (%d:%d)%s, gap-pen: %d/%d\n ktup: %d, %s, width: %3d",
+#endif
+           ppstr->pam_name, ppstr->pam_h,ppstr->pam_l,
+           (ppstr->ext_sq_set) ? "xS":"\0",
+           ppstr->gdelval, ppstr->ggapval,
+           ppstr->param_u.fa.ktup, options_str1,
+           ppstr->param_u.fa.optwid);
+
+  if (pstring2 == NULL) return;
+
+  sprintf (pstring2,
+#ifndef OLD_FASTA_GAP
+           "; pg_name_alg: %s\n; pg_ver_rel: %s\n; pg_matrix: %s (%d:%d)\n"
+           "; pg_open-ext: %d %d\n; pg_ktup: %d\n; %s\n",
+#else
+           "; pg_name_alg: %s\n; pg_ver_rel: %s\n; pg_matrix: %s (%d:%d)\n"
+           "; pg_gap-pen: %d %d\n; pg_ktup: %d\n; %s\n",
+#endif
+           pg_str,verstr,ppstr->pam_name, ppstr->pam_h,ppstr->pam_l, ppstr->gdelval,
+           ppstr->ggapval,ppstr->param_u.fa.ktup,options_str2);
+}
+
+/* close_work - release everything init_work() allocated for one query
+   and set *f_arg to NULL.  aa0, n0 and ppst are not used.  Calling it
+   with *f_arg == NULL only prints a diagnostic to stderr. */
+void
+close_work (const unsigned char *aa0, int n0,
+            struct pstruct *ppst,
+            struct f_struct **f_arg)
+{
+  struct f_struct *f_str;
+
+  f_str = *f_arg;
+
+  if (f_str != NULL) {
+    if (f_str->kar_p!=NULL) free(f_str->kar_p);
+
+    /* ss and bss were advanced by one element after allocation, so step
+       back to the address calloc() returned before freeing */
+    f_str->ss--;
+    free(f_str->ss);
+
+    f_str->bss--;
+    free(f_str->bss);
+    f_str->bss_size = 0;
+
+#ifdef TFASTA
+    /* init_work() allocates aa1x (and advances it by one) only in
+       TFASTA builds; it was previously never released */
+    f_str->aa1x--;
+    free(f_str->aa1x);
+#endif
+
+    /* free(f_str->res); */
+    free(f_str->waa_a);
+    free(f_str->waa_s);
+
+    /* each pam2p[k] is a row-pointer array over one block at pam2p[k][0] */
+    free(f_str->pam2p[1][0]);
+    free(f_str->pam2p[1]);
+
+    free(f_str->pam2p[0][0]);
+    free(f_str->pam2p[0]);
+
+
+    free(f_str->diag);
+    free(f_str->link);
+    free(f_str->pamh2);
+    free(f_str->pamh1);
+    free(f_str->harr);
+
+    free(f_str);
+    *f_arg = NULL;
+  }
+  else {
+    fprintf(stderr, "*** error [%s:%d] close_work() with NULL f_str ***\n",
+            __FILE__, __LINE__);
+  }
+}
+
+/* Forward declarations for the helpers used by do_fasta() and dmatch(). */
+
+/* savemax: record a diagonal run in the vmax[] table (defined below) */
+int savemax (struct dstruct *, int,
+ struct savestr *vmax,
+ struct savestr **lowmax);
+
+
+/* spam: rescore a saved run against the pam2 matrix (defined below) */
+int spam (const unsigned char *, const unsigned char *, struct savestr *,
+ int **, int, int, int);
+/* sconn: combine saved runs into a joined score (defined below) */
+int sconn(struct savestr **, int nsave, int cgap, int pgap, int noff);
+/* kpsort: order saved runs by their start position (defined below) */
+void kpsort(struct savestr **, int);
+
+/* NW_ALIGN: declared extern, so the definition lives in another file */
+extern int
+NW_ALIGN(const unsigned char *, const unsigned char *, int, int,
+ int **pam2p, int, int q, int r, int *res, int *nc);
+
+/* NOTE(review): the banded aligner defined further down is named
+   FLOCAL_ALIGN and takes a different parameter list; confirm whether a
+   LOCAL_ALIGN with this signature is defined later in the file. */
+static int
+LOCAL_ALIGN(const unsigned char *, const unsigned char *,
+ int, int, int, int,
+ int **, int, int, int *, int *, int *, int *, int,
+ struct f_struct *);
+
+/* B_ALIGN: definition is not in the part of the file shown here */
+static int
+B_ALIGN(const unsigned char *A, const unsigned char *B, int M,
+ int N, int low, int up, int **W, int G, int H, int *S,
+ int *nS, int MW, int MX, struct bdstr *bss, struct mtp_str *mtp);
+
+/* do_fasta - score one library sequence aa1[0..n1-1] against the query
+   aa0 using the hash table stored in f_str (harr/link/pamh1/pamh2).
+
+   Steps visible below:
+   1. pick hsq/ip from ppst->ext_sq_set and derive the join (c_gap) and
+      optimization (opt_cut) thresholds, either from ELK_to_s() or from
+      the fixed ppst->param_u.fa values;
+   2. reset diagonal entries between the previous extent (f_str->ndo)
+      and the current one (n0+n1);
+   3. hash the ktup words of aa1, look each one up in harr/link and
+      update the run (start/stop/score) on the matching diagonal,
+      saving runs into vmax[] through savemax();
+   4. rescore every saved run with spam();
+   5. rst->score[1] = best single run; rst->score[0] = max of that and
+      the sconn() result (sconn() is called only when the best run
+      reaches c_gap); rst->score[2] = score[0], or the dmatch() result
+      when optflag is set and score[0] > opt_cut.
+
+   Early exits: all three scores are 0 when n1 < ktup or no run
+   survives spam(); all three are -1 when n0+n1+1 >= MAXDIAG.
+
+   *hoff receives noff - dp of the best run.  s_info->tot_scores counts
+   scored sequences, s_cnt[0] the sconn() calls and s_cnt[2] the
+   dmatch() calls.  shuff_flg is not used (its only reference is
+   commented out). */
+static void
+do_fasta (const unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1,
+ const struct pstruct *ppst, struct f_struct *f_str,
+ struct rstruct *rst, int *hoff, int shuff_flg,
+ struct score_count_s *s_info) {
+ int nd; /* diagonal array size */
+ int lhval;
+ int kfact;
+ struct savestr vmax[MAXSAV]; /* best matches saved for one sequence */
+ struct savestr *vptr[MAXSAV];
+ struct savestr *lowmax;
+ int lowscor;
+ register struct dstruct *dptr;
+ register int tscor;
+
+ register struct dstruct *diagp;
+ int noff;
+ struct dstruct *dpmax, *dpmin;
+ register int lpos;
+ int tpos;
+ struct savestr *vmptr;
+ int scor, ib, nsave;
+ int xdrop, do_extend;
+ int ktup, kt1, lkt, ip, ktup_sq;
+ const int *hsq;
+ int opt_cut, c_gap;
+
+
+ if (ppst->ext_sq_set) {
+ ip = 1;
+ hsq = ppst->hsqx;
+ }
+ else {
+ ip = 0;
+ hsq = ppst->hsq;
+ }
+
+ xdrop = -ppst->pam_l;
+ /* do extended alignment in spam iff protein or short sequences */
+ do_extend = !ppst->nt_align || (n0 < 50) || (n1 < 50);
+
+ ktup = ppst->param_u.fa.ktup;
+ kt1 = ktup-1;
+ /* ktup_sq scales the E-value thresholds passed to ELK_to_s() below */
+ if (ktup <= 3) {
+ ktup_sq = ktup*ktup;
+ }
+ else {
+ ktup_sq = ktup;
+ }
+ if (ktup == 1) ktup_sq *= 2;
+
+ if (n1 < ktup) {
+ rst->score[0] = rst->score[1] = rst->score[2] = 0;
+ return;
+ }
+
+ if (n0+n1+1 >= MAXDIAG) {
+ fprintf(stderr,"*** error [%s:%d] n0,n1 too large: %d + %d (%d) > %d \n",
+ __FILE__, __LINE__, n0,n1,n0+n1+1,MAXDIAG);
+ rst->score[0] = rst->score[1] = rst->score[2] = -1;
+ return;
+ }
+
+ /* dynamically set optcut and cgap */
+ if (ppst->param_u.fa.use_E_thresholds) {
+ c_gap = ELK_to_s(ppst->param_u.fa.E_join*ktup_sq, n0, n1, ppst->pLambda, ppst->pK, ppst->pH);
+ opt_cut = ELK_to_s(ppst->param_u.fa.E_band_opt*ktup_sq, n0, n1, ppst->pLambda, ppst->pK, ppst->pH);
+ rst->valid_stat = 0;
+ }
+ else {
+ c_gap = ppst->param_u.fa.cgap;
+ opt_cut = ppst->param_u.fa.optcut;
+ rst->valid_stat = 1;
+ }
+
+ s_info->tot_scores++;
+
+ nd = n0 + n1;
+
+ /* here we are cleaning up after the previous run, when the
+ structure is initialized (init_work), ndo is set to 0, so the
+ initialization keeps going out to nd only when nd increases
+
+ dpmax now remembers the largest diagp with hits, and only cleans
+ up to there
+ */
+
+ dpmin = dpmax = &f_str->diag[nd];
+ for (dptr = &f_str->diag[f_str->ndo]; dptr < dpmax;){
+ dptr->stop = -1;
+ dptr->dmax = NULL;
+ dptr++->score = 0;
+ }
+ dpmax = f_str->diag;
+
+ for (vmptr = vmax; vmptr < &vmax[MAXSAV]; vmptr++) {
+ vmptr->score = 0;
+ }
+
+ lowmax = vmax;
+ lowscor = 0;
+
+ /* start hashing */
+ lhval = 0;
+ lkt = kt1;
+ /* prime lhval with the first ktup-1 usable residues of aa1 */
+ for (lpos = 0; (lpos < lkt || hsq[aa1[lpos]]>=NMAP) && lpos <n1; lpos++) {
+ /* restart lhval calculation */
+ if (hsq[aa1[lpos]]>=NMAP) {
+ lhval = 0; lkt = lpos + ktup;
+ continue;
+ }
+ lhval = ((lhval & f_str->hmask) << f_str->kshft) + hsq[aa1[lpos]];
+ }
+
+ noff = n0 - 1;
+ diagp = &f_str->diag[noff + lkt];
+ for (; lpos < n1; lpos++, diagp++) {
+ /* skip over low complexity */
+ if (hsq[aa1[lpos]]>=NMAP) {
+ lpos++ ; diagp++;
+ while (hsq[aa1[lpos]]>=NMAP && lpos < n1 ) {lpos++; diagp++;}
+ if (lpos >= n1) break;
+ lhval = 0;
+ }
+
+ /* lhval is the hash value of the library sequence */
+ lhval = ((lhval & f_str->hmask) << f_str->kshft) + hsq[aa1[lpos]];
+ /* tpos gives the locations where the library word matches */
+
+ /* get the diagonal of the initial hit */
+ tpos = f_str->harr[lhval];
+ if (tpos >= 0) { /* we have a hit */
+ /* tpos = link[tpos] means that tpos always gets smaller, so
+ diagp-tpos gets bigger */
+ if (diagp-tpos < dpmin) dpmin = diagp - tpos;
+ for (; tpos >= 0; tpos = f_str->link[tpos]) {
+ /* here tscor is actually the end of the run */
+ if ((tscor = (dptr = &diagp[-tpos])->stop) >= 0) {
+ /* increased by ktup word length */
+ tscor += ktup;
+ /* now tscor becomes the penalty for the unmatched
+ (non-identical) residues in the diagonal between the
+ previous match and the current match (lpos) */
+ if ((tscor -= lpos) <= 0) {
+ scor = dptr->score;
+ /* the score is getting worse; if it is better than lowscor, save it */
+ if ((tscor += (kfact = f_str->pamh2[lhval])) < 0 && lowscor < scor) {
+ lowscor = savemax(dptr, dptr - f_str->diag, vmax, &lowmax);
+ }
+ /* tscor is now a candidate score of the current run;
+ kfact is the score for starting over */
+ if ((tscor += scor) >= kfact) {
+ /* continuing better */
+ dptr->score = tscor;
+ dptr->stop = lpos;
+ }
+ else {
+ /* starting over better */
+ dptr->score = kfact;
+ dptr->start = (dptr->stop = lpos) - kt1;
+ }
+ }
+ else { /* continuing a match */
+ dptr->score += f_str->pamh1[aa0[tpos]];
+ dptr->stop = lpos;
+ }
+ }
+ else { /* no run in this diagonal yet */
+ dptr->score = f_str->pamh2[lhval];
+ dptr->start = (dptr->stop = lpos) - kt1;
+ }
+ /* dptr is biggest at the end of tpos = link[tpos] */
+ if (dptr > dpmax) dpmax = dptr;
+ }
+ } /* end tpos */
+ } /* end lpos */
+
+ /* save the runs still open on the touched diagonals and reset those
+ diagonals for the next call */
+ for (dptr = dpmin; dptr <= dpmax;) {
+ if (dptr->score > lowscor) {
+ lowscor = savemax (dptr, dptr-f_str->diag, vmax, &lowmax);
+ }
+ dptr->stop = -1;
+ dptr->dmax = NULL;
+ dptr++->score = 0;
+ }
+ f_str->ndo = nd;
+
+/*
+ at this point all of the elements of aa1[lpos]
+ have been searched for elements of aa0[tpos]
+ with the results in diag[dpos]
+*/
+ for (nsave = 0, vmptr = vmax; vmptr < &vmax[MAXSAV]; vmptr++) {
+ /*
+ fprintf(stderr,"0: %4d-%4d 1: %4d-%4d dp: %d score: %d\n",
+ noff+vmptr->start-vmptr->dp,
+ noff+vmptr->stop-vmptr->dp,
+ vmptr->start,vmptr->stop,
+ vmptr->dp,vmptr->score);
+
+ */
+ if (vmptr->score > 0) {
+ vmptr->score = spam (aa0, aa1, vmptr, ppst->pam2[ip], xdrop,
+ noff,do_extend);
+ if (vmptr->score > 0 ) vptr[nsave++] = vmptr;
+ }
+ }
+
+ if (nsave <= 0) {
+ rst->score[0] = rst->score[1] = rst->score[2] = 0;
+ return;
+ }
+
+ /*
+ fprintf(stderr,"n0: %d; n1: %d; noff: %d\n",n0,n1,noff);
+ for (ib=0; ib<nsave; ib++) {
+ fprintf(stderr,"0: %4d-%4d 1: %4d-%4d dp: %d score: %d\n",
+ noff+vptr[ib]->start-vptr[ib]->dp,
+ noff+vptr[ib]->stop-vptr[ib]->dp,
+ vptr[ib]->start,vptr[ib]->stop,
+ vptr[ib]->dp,vptr[ib]->score);
+ }
+ fprintf(stderr,"---\n");
+ */
+
+ /* find the best init1 score */
+ for (vmptr=vptr[0],ib=1; ib<nsave; ib++) {
+ if (vptr[ib]->score > vmptr->score) vmptr=vptr[ib];
+ }
+
+ /* sconn does not modify vmptr->score, so only do it if it will help */
+ if (vmptr->score >= c_gap ) {
+ s_info->s_cnt[0]++;
+ scor = sconn (vptr, nsave, c_gap, ppst->param_u.fa.pgap, noff);
+ rst->alg_info |= 1;
+ }
+ else { scor = vmptr->score;}
+
+/* kssort (vptr, nsave); */
+
+ rst->score[1] = vmptr->score;
+ rst->score[0] = max (scor, vmptr->score);
+ rst->score[2] = rst->score[0]; /* initn */
+
+ *hoff= noff - vmptr->dp; /* always need *hoff */
+ if (ppst->param_u.fa.optflag) {
+ if ( /* shuff_flg || */ rst->score[0] > opt_cut ) {
+ rst->score[2] = dmatch (aa0, n0, aa1, n1, *hoff,
+ ppst->param_u.fa.optwid, ppst->pam2[ip],
+ ppst->gdelval,ppst->ggapval,f_str);
+
+ s_info->s_cnt[2]++;
+ rst->alg_info |= 2;
+ rst->valid_stat = 1;
+ }
+ /*
+ else if (rst->score[0] > c_gap) {
+ rst->score[2] = dmatch (aa0, n0, aa1, n1, *hoff, 4, ppst->pam2[ip],
+ ppst->gdelval,ppst->ggapval,f_str);
+ }
+ */
+ }
+ else { /* we never do dmatch, so initn is valid */
+ rst->valid_stat = 1;
+ }
+}
+
+/* do_work - score one library sequence for the search loop.
+
+   *rst is first reset (scores 0, escore 1.0, segnum/seglen 1,
+   valid_stat and alg_info 0).  Sequences shorter than ktup return at
+   that point.  Otherwise do_fasta() fills in the scores; in TFASTA
+   builds aa1 is first translated with aatran() into f_str->aa1x.
+   rst->comp and rst->H come from do_karlin() when zsflag%10 == 6 in a
+   non-TFASTA build and are -1.0 in every other case.  frame is only
+   used by the TFASTA translation; qr_flg is not used. */
+void do_work (const unsigned char *aa0, int n0,
+              const unsigned char *aa1, int n1,
+              int frame,
+              const struct pstruct *ppst, struct f_struct *f_str,
+              int qr_flg, int shuff_flg, struct rstruct *rst,
+              struct score_count_s *s_info)
+{
+  int hoff;
+#ifdef TFASTA
+  int n10;
+#else
+  double lambda, H;
+#endif
+
+  /* defaults reported when the sequence is too short to score */
+  rst->score[2] = 0;
+  rst->score[1] = 0;
+  rst->score[0] = 0;
+  rst->escore = 1.0;
+  rst->seglen = 1;
+  rst->segnum = 1;
+  rst->valid_stat = 0;
+  rst->alg_info = 0;
+
+  if (n1 < ppst->param_u.fa.ktup) return;
+
+#ifdef TFASTA
+  /* translated search: score the translation, no composition statistics */
+  n10 = aatran(aa1,f_str->aa1x,n1,frame);
+  do_fasta (aa0, n0, f_str->aa1x, n10, ppst, f_str, rst, &hoff, shuff_flg, s_info);
+
+  rst->H = -1.0;
+  rst->comp = -1.0;
+#else
+  do_fasta (aa0, n0, aa1, n1, ppst, f_str, rst, &hoff, shuff_flg, s_info);
+
+  if ((ppst->zsflag%10) == 6 &&
+      do_karlin(aa1, n1, ppst->pam2[0], ppst, f_str->aa0_f,
+                f_str->kar_p, &lambda, &H) > 0) {
+    rst->comp = 1.0/lambda;
+    rst->H = H;
+  }
+  else {
+    rst->H = -1.0;
+    rst->comp = -1.0;
+  }
+#endif
+}
+
+/* do_opt - rescore one library sequence with the optimization step
+   forced on.
+
+   ppst->param_u.fa.optflag is set to 1 for the duration of the
+   do_fasta() call and restored afterwards.  In TFASTA builds aa1 is
+   translated with aatran() first.  Results are returned in *rst.
+   The score counters do_fasta() updates are kept in a local scratch
+   structure and discarded. */
+void do_opt (const unsigned char *aa0, int n0,
+             const unsigned char *aa1, int n1,
+             int frame,
+             struct pstruct *ppst,
+             struct f_struct *f_str,
+             struct rstruct *rst)
+{
+  int optflag, hoff;
+#ifdef TFASTA
+  int n10;
+#endif
+  struct score_count_s s_info;
+
+  /* do_fasta() increments tot_scores and s_cnt[] in s_info, so the
+     scratch counters must start from defined values */
+  memset(&s_info, 0, sizeof(s_info));
+
+  optflag = ppst->param_u.fa.optflag;
+  ppst->param_u.fa.optflag = 1;
+
+#ifdef TFASTA
+  n10=aatran(aa1,f_str->aa1x,n1,frame);
+  do_fasta (aa0, n0, f_str->aa1x, n10, ppst, f_str, rst, &hoff, 0, &s_info);
+#else   /* FASTA */
+  do_fasta(aa0,n0,aa1,n1,ppst,f_str,rst, &hoff, 0, &s_info);
+#endif
+  ppst->param_u.fa.optflag = optflag;
+}
+
+/* savemax - record the run held in *dptr (diagonal index dpos) in the
+   vmax[0..MAXSAV-1] table and return the lowest score in the table.
+
+   If dptr->dmax already points at a saved entry for the same diagonal
+   and start, that entry's stop (and score, if improved) is updated in
+   place.  Otherwise the run overwrites the entry *lowmax points at.
+   *lowmax is re-pointed at the lowest-scoring entry whenever the table
+   is rescanned. */
+int
+savemax (struct dstruct *dptr, int dpos,
+         struct savestr *vmax, struct savestr **lowmax)
+{
+  struct savestr *saved = dptr->dmax;
+  int low, k;
+
+  if (saved == NULL || saved->dp != dpos || saved->start != dptr->start) {
+    /* not a continuation: take over the current lowest slot */
+    saved = *lowmax;
+    saved->score = dptr->score;
+    low = saved->score;
+    saved->dp = dpos;
+    saved->start = dptr->start;
+    saved->stop = dptr->stop;
+    dptr->dmax = saved;
+  }
+  else {
+    /* continuation of a run that is already saved */
+    saved->stop = dptr->stop;
+    low = dptr->score;
+    if (low <= saved->score) return (*lowmax)->score;
+    saved->score = low;
+    /* the lowest entry did not change, so no rescan is needed */
+    if (saved != *lowmax) return (*lowmax)->score;
+  }
+
+  /* the lowest slot was modified: find the new lowest entry */
+  for (k = 0; k < MAXSAV; k++) {
+    if (vmax[k].score < low) {
+      low = vmax[k].score;
+      *lowmax = &vmax[k];
+    }
+  }
+  return low;
+}
+
+/* spam - rescore the diagonal run *dmax with the pam2 matrix and trim
+   it to its best-scoring sub-segment.
+
+   aa0, aa1   query and library sequences
+   dmax       run to rescore; start/stop are library positions and are
+              overwritten with the limits of the best segment
+   pam2       scoring matrix, indexed [query residue][library residue]
+   xdrop      an extension stops once the running score falls xdrop
+              below the best score seen
+   noff       offset used to map a library position on diagonal
+              dmax->dp to the query position (lpos - dp + noff)
+   do_extend  non-zero: try to extend past the ends of the run (only
+              compiled in when NOSPAM_EXT is not defined)
+
+   Returns the best segment score (0 if no positive segment exists, in
+   which case dmax->start/stop are left as they were).
+
+   NOTE(review): the extension loops have no explicit bounds check;
+   they appear to rely on the residues just outside each sequence
+   scoring badly enough to end the loop -- confirm with the callers. */
+int spam (const unsigned char *aa0, const unsigned char *aa1,
+          struct savestr *dmax, int **pam2, int xdrop,
+          int noff, int do_extend)
+{
+  register int lpos, tot;
+  register const unsigned char *aa0p, *aa1p;
+
+  int drop_thresh;
+
+  struct {
+    int start, stop, score;
+  } curv, maxv;
+
+  aa1p = &aa1[lpos= dmax->start];       /* get the start of lib seq */
+  aa0p = &aa0[lpos - dmax->dp + noff];  /* start of query */
+#ifdef DEBUG
+  /* also add check in calling routine */
+  if (aa0p < aa0) { return -99; }
+#endif
+  curv.start = curv.stop = lpos;        /* start index in lib seq */
+
+  /* start from the incoming limits so that nothing indeterminate is
+     copied back to dmax when no positive segment is found */
+  maxv.start = dmax->start;
+  maxv.stop = dmax->stop;
+
+  tot = curv.score = maxv.score = 0;
+
+  for (; lpos <= dmax->stop; lpos++) {
+    tot += pam2[*aa0p++][*aa1p++];
+    if (tot > curv.score) {             /* update current score */
+      curv.stop = lpos;
+      curv.score = tot;
+    }
+    else if (tot < 0) {
+      if (curv.score > maxv.score) {    /* save score, start, stop */
+        maxv.start = curv.start;
+        maxv.stop = curv.stop;
+        maxv.score = curv.score;
+      }
+      tot = curv.score = 0;             /* reset running score */
+      curv.start = lpos+1;              /* reset start */
+      if(lpos >= dmax->stop) break;     /* if the zero is beyond stop, quit */
+    }
+  }
+
+  if (curv.score > maxv.score) {
+    maxv.start = curv.start;
+    maxv.stop = curv.stop;
+    maxv.score = curv.score;
+  }
+
+#ifndef NOSPAM_EXT
+
+  /* now check to see if the score gets better by extending */
+  if (do_extend && maxv.score > xdrop) {
+
+    /* scan forwards past the end of the run */
+    if (maxv.stop == dmax->stop) {
+      tot = maxv.score;
+      drop_thresh = maxv.score - xdrop;
+      aa1p = &aa1[lpos= dmax->stop];
+      aa0p = &aa0[lpos - dmax->dp + noff];
+      while (tot > drop_thresh ) {
+        ++lpos;
+        tot += pam2[*(++aa0p)][*(++aa1p)];
+        if (tot > maxv.score) {
+          /* a forward extension moves the END of the segment; the
+             original code wrote maxv.start here, which left
+             start > stop and also disabled the backward scan */
+          maxv.stop = lpos;
+          maxv.score = tot;
+          drop_thresh = tot - xdrop;
+        }
+      }
+    }
+
+    /* scan backwards now */
+
+    if (maxv.start == dmax->start) {
+      tot = maxv.score;
+      drop_thresh = maxv.score - xdrop;
+      aa1p = &aa1[lpos= dmax->start];
+      aa0p = &aa0[lpos - dmax->dp + noff];
+      while (tot > drop_thresh) {
+        --lpos;
+        tot += pam2[*(--aa0p)][*(--aa1p)];
+        if (tot > maxv.score) {
+          maxv.start = lpos;
+          maxv.score = tot;
+          drop_thresh = tot - xdrop;
+        }
+      }
+    }
+  }
+#endif
+
+/*      if (maxv.start != dmax->start || maxv.stop != dmax->stop)
+                printf(" new region: %3d %3d %3d %3d\n",maxv.start,
+                        dmax->start,maxv.stop,dmax->stop);
+*/
+  dmax->start = maxv.start;
+  dmax->stop = maxv.stop;
+
+  return maxv.score;
+}
+
+/* sconn - compute a joined score from the saved runs v[0..n-1].
+
+   The runs are first sorted by library start (kpsort).  Each run
+   scoring at least cgap gets a node whose score is its own score or,
+   if an earlier node ends before it in both sequences, that node's
+   score + its own score + pgap (the first such node in list order is
+   used).  Nodes are kept in a list headed by the highest score; a node
+   that does not outscore any node already in the list is not linked.
+   Returns the score at the head of the list, or 0 if no run reached
+   cgap.  noff maps library positions to query positions. */
+int sconn (struct savestr **v, int n, int cgap, int pgap, int noff)
+{
+  struct slink
+  {
+    int score;
+    struct savestr *vp;
+    struct slink *next;
+  };
+  struct slink nodes[MAXSAV];
+  struct slink *head = NULL;
+  struct slink *node, *scan, *prev;
+  struct savestr *run;
+  int idx, used = 0;
+  int lib_start, qry_start, prev_lib_stop, prev_qry_stop;
+
+  /* sort the runs left to right in lib pos */
+  kpsort (v, n);
+
+  for (idx = 0; idx < n; idx++) {
+    run = v[idx];
+
+    /* a run scoring less than the gap threshold never helps */
+    if (run->score < cgap) continue;
+
+    lib_start = run->start;
+    qry_start = lib_start - run->dp + noff;
+
+    /* start the node with the run's own score */
+    node = &nodes[used++];
+    node->vp = run;
+    node->score = run->score;
+    node->next = NULL;
+
+    /* join to the first listed node that ends before this run starts */
+    scan = head;
+    while (scan != NULL) {
+      prev_lib_stop = scan->vp->stop;
+      prev_qry_stop = prev_lib_stop - scan->vp->dp + noff;
+      if (prev_lib_stop < lib_start && prev_qry_stop < qry_start) {
+        node->score = scan->score + run->score + pgap;
+        break;
+      }
+      scan = scan->next;
+    }
+
+    if (head == NULL) {
+      head = node;
+      continue;
+    }
+
+    /* insert in front of the first node with a lower score */
+    prev = NULL;
+    for (scan = head; scan != NULL; prev = scan, scan = scan->next) {
+      if (node->score > scan->score) {
+        node->next = scan;
+        if (prev == NULL) head = node;
+        else prev->next = node;
+        break;
+      }
+    }
+  }
+
+  return (head == NULL) ? 0 : head->score;
+}
+
+/* kssort - Shell sort of v[0..n-1] into descending score order.
+   The definition now uses a prototype-style parameter list; the old
+   identifier-list (K&R) form is no longer valid C as of C23. */
+void
+kssort (struct savestr *v[], int n)
+{
+  int gap, i, j;
+  struct savestr *tmp;
+
+  for (gap = n / 2; gap > 0; gap /= 2)
+    for (i = gap; i < n; i++)
+      for (j = i - gap; j >= 0; j -= gap)
+        {
+          /* already in order (higher score first): stop sifting */
+          if (v[j]->score >= v[j + gap]->score)
+            break;
+          tmp = v[j];
+          v[j] = v[j + gap];
+          v[j + gap] = tmp;
+        }
+}
+
+/* kpsort - sort v[0..n-1] into ascending order of ->start using
+   gapped insertion passes with the fixed gaps 21, 7, 3, 1. */
+void
+kpsort (struct savestr **v, int n) {
+  static const int increments[4] = { 21, 7, 3, 1 };
+  struct savestr *moving;
+  int pass, step, pos, hole, key;
+
+  for (pass = 0; pass < 4; pass++) {
+    step = increments[pass];
+    for (pos = step; pos < n; pos++) {
+      moving = v[pos];
+      key = moving->start;
+      /* shift larger entries up by one step until the slot is found */
+      for (hole = pos; hole >= step && v[hole - step]->start > key; hole -= step) {
+        v[hole] = v[hole - step];
+      }
+      v[hole] = moving;
+    }
+  }
+}
+
+/* dmatch - run the banded local alignment (FLOCAL_ALIGN) for one
+   query/library pair and return its score.
+
+   window is first limited to n1.  The band passed on starts at
+   -window/2 - hoff and is window wide.  The gap-open argument is
+   -gdelval, or -(gdelval-ggapval) when OLD_FASTA_GAP is defined; the
+   gap-extension argument is -ggapval.  Both sequences are passed as
+   aa0-1 / aa1-1. */
+int dmatch (const unsigned char *aa0, int n0,
+            const unsigned char *aa1, int n1,
+            int hoff, int window,
+            int **pam2, int gdelval, int ggapval,
+            struct f_struct *f_str)
+{
+  int band_lo, band_hi;
+  int gap_open;
+
+  window = min (n1, window);
+
+#ifdef DEBUG
+  if (window > f_str->bss_size) {
+    fprintf(stderr,"*** error [%s:%d] dropnfa.c:dmatch window [%d] out of range [%d]\n",
+            __FILE__, __LINE__, window, f_str->bss_size);
+    window = f_str->bss_size - 4;
+  }
+#endif
+
+  /* centre the band on the diagonal given by hoff */
+  band_lo = -window/2-hoff;
+  band_hi = band_lo+window;
+
+#ifdef OLD_FASTA_GAP
+  gap_open = -(gdelval-ggapval);
+#else
+  gap_open = -gdelval;
+#endif
+
+  return FLOCAL_ALIGN(aa0-1,aa1-1,n0,n1, band_lo, band_hi,
+                      pam2, gap_open, -ggapval,window,f_str);
+}
+
+
+/* A PACKAGE FOR LOCALLY ALIGNING TWO SEQUENCES WITHIN A BAND:
+
+ To invoke, call FLOCAL_ALIGN(A,B,M,N,L,U,W,G,H,MW,f_str).
+ The parameters are explained as follows:
+ A, B : two sequences to be aligned
+ M : the length of sequence A
+ N : the length of sequence B
+ L : lower bound of the band
+ U : upper bound of the band
+ W : scoring table for matches and mismatches
+ G : gap-opening penalty
+ H : gap-extension penalty
+ MW : maximum window size
+*/
+
+#include <stdio.h>
+
+#define MININT -9999999 /* very low score stored in the cells just outside the band */
+
+/* FLOCAL_ALIGN - score-only banded local alignment (forward pass).
+
+   A, B    : sequences, read with indexes starting at 1 (A[i], B[j])
+   M, N    : lengths of A and B
+   low, up : band limits; clipped here to the range [-M, N]
+   W       : scoring matrix, read as W[A[i]][B[j]]
+   G, H    : gap penalties; opening a gap from a cell costs G+H,
+             continuing one costs H
+   MW      : not referenced in this function
+   f_str   : supplies the scratch array f_str->bss; indexes up to
+             band+1 (band = up-low+1) are written
+
+   Returns the largest cell score seen (cell scores are floored at 0),
+   or 0 when M <= 0, N <= 0 or the clipped band is empty. */
+int
+FLOCAL_ALIGN(const unsigned char *A, const unsigned char *B,
+ int M, int N, int low, int up,
+ int **W, int G,int H, int MW,
+ struct f_struct *f_str)
+{
+ int band;
+ register struct bdstr *bssp;
+ int i, j, si, ei;
+ int c, d, e, m;
+ int leftd, rightd;
+ int best_score;
+ int *wa, curd;
+ int ib;
+
+ bssp = f_str->bss;
+
+ m = G+H; /* cost of the first residue of a gap */
+ low = max(-M, low);
+ up = min(N, up);
+
+ if (N <= 0) return 0;
+
+ if (M <= 0) return 0;
+
+ band = up-low+1;
+ if (band < 1) {
+ fprintf(stderr,"*** error [%s:%d] low > up is unacceptable!: M: %d N: %d l/u: %d/%d\n",
+ __FILE__, __LINE__, M, N, low, up);
+ return 0;
+ }
+
+ /* leftd..rightd are the bssp[] slots that are live for the current row */
+ if (low > 0) leftd = 1;
+ else if (up < 0) leftd = band;
+ else leftd = 1-low;
+ rightd = band;
+ si = max(0,-up); /* start index -1 */
+ ei = min(M,N-low); /* end index */
+ bssp[leftd].CC = 0;
+ for (j = leftd+1; j <= rightd; j++) {
+ bssp[j].CC = 0;
+ bssp[j].DD = -G;
+ }
+
+ /* sentinels on both sides of the band */
+ bssp[rightd+1].CC = MININT;
+ bssp[rightd+1].DD = MININT;
+
+ best_score = 0;
+ bssp[leftd-1].CC = MININT;
+ bssp[leftd].DD = -G;
+
+ for (i = si+1; i <= ei; i++) { /* one pass per residue of A */
+ if (i > N-up) rightd--;
+ if (leftd > 1) leftd--;
+ wa = W[A[i]]; /* matrix row for this residue of A */
+ /* leftmost slot: d is the better of opening or extending a gap from slot leftd+1 */
+ if ((c = bssp[leftd+1].CC-m) > (d = bssp[leftd+1].DD-H)) d = c;
+ /* the match score is only added when the B index is inside the sequence */
+ if ((ib = leftd+low-1+i ) > 0) c = bssp[leftd].CC+wa[B[ib]];
+
+ if (d > c) c = d;
+ if (c < 0) c = 0; /* local alignment: never below zero */
+ e = c-G;
+ bssp[leftd].DD = d;
+ bssp[leftd].CC = c;
+ if (c > best_score) best_score = c;
+
+ for (curd=leftd+1; curd <= rightd; curd++) {
+ /* e: gap carried along the row from slot curd-1; d: gap coming from slot curd+1 */
+ if ((c = c-m) > (e = e-H)) e = c;
+ if ((c = bssp[curd+1].CC-m) > (d = bssp[curd+1].DD-H)) d = c;
+ c = bssp[curd].CC + wa[B[curd+low-1+i]];
+ if (e > c) c = e;
+ if (d > c) c = d;
+ if (c < 0) c = 0;
+ bssp[curd].CC = c;
+ bssp[curd].DD = d;
+ if (c > best_score) best_score = c;
+ }
+ }
+
+ return best_score;
+}
+
+/* ckalloc - allocate space; check for success.
+
+   amount : number of bytes requested
+
+   Returns the pointer obtained from malloc().  When malloc() fails,
+   w_abort() is called with an out-of-memory message; if w_abort()
+   returns, NULL is returned.
+
+   The request is handed to malloc() as a size_t.  It used to be cast
+   to unsigned, which silently truncates any size that does not fit
+   in an unsigned int (for example 4 GB or more where size_t is 64
+   bits wide) and gives the caller a block smaller than requested. */
+char *ckalloc(size_t amount)
+{
+  char *p;
+
+  if ((p = malloc(amount)) == NULL)
+    w_abort("Ran out of memory.","");
+  return(p);
+}
+
+/* shscore - score of a sequence aligned against itself (the 100%
+   identical score).
+
+   aa0  : residues, used as indexes into pam2
+   n0   : number of residues in aa0
+   pam2 : scoring matrix; pam2[r][r] is the score of residue r with
+          itself
+
+   Returns the sum of the diagonal scores; 0 when n0 <= 0. */
+int
+shscore(const unsigned char *aa0, int n0, int **pam2)
+{
+  const unsigned char *resp = aa0;
+  int remaining;
+  int total = 0;
+
+  for (remaining = n0; remaining > 0; remaining--, resp++) {
+    total += pam2[*resp][*resp];
+  }
+  return total;
+}
+
+/* BCHECK_SCORE - recompute the score of an alignment from its edit
+   script.
+
+   A, B  : sequences, read with indexes starting at 1
+   M, N  : lengths of A and B
+   S     : edit script: 0 = aligned pair, k > 0 = k residues of B
+           only, k < 0 = -k residues of A only
+   w     : scoring matrix, read as w[A[i]][B[j]]
+   g, h  : gap penalties; a run of k residues costs g + k*h
+   nres  : out - number of alignment columns described by the script
+
+   Script entries are consumed until all M residues of A and all N
+   residues of B are accounted for.  Returns the score. */
+static int
+BCHECK_SCORE(const unsigned char *A, const unsigned char *B,
+             int M, int N, int *S, int **w, int g, int h,
+             int *nres)
+{
+  int a_pos = 0, b_pos = 0;
+  int total = 0, n_cols = 0;
+  int edit;
+
+  while (a_pos < M || b_pos < N) {
+    edit = *S++;
+    if (edit > 0) {		/* residues of B opposite a gap */
+      total -= g + edit*h;
+      b_pos += edit;
+      n_cols += edit;
+    }
+    else if (edit < 0) {	/* residues of A opposite a gap */
+      total -= g - edit*h;
+      a_pos -= edit;
+      n_cols -= edit;
+    }
+    else {			/* aligned pair */
+      a_pos++;
+      b_pos++;
+      total += w[A[a_pos]][B[b_pos]];
+      n_cols++;
+    }
+  }
+  *nres = n_cols;
+  return total;
+}
+
+/* bd_malign is a recursive interface to bd_walign() (or sw_walign()
+   when ppst->sw_flag is set) that is called from do_walign().
+   bd_malign() first does an alignment, then checks to see if the
+   score is greater than the threshold.  If so, it tries doing a left
+   and right alignment on the parts of aa1 that flank the alignment
+   just found.
+
+   aa0, n0      : query sequence and its length
+   aa1, n1      : library sequence (or a flanking piece of it) and
+                  its length
+   score_thresh : rst.score[ppst->score_ix] must exceed this for an
+                  alignment to be kept / extended
+   max_res      : number of ints allocated for cur_ares->res
+   ppst, f_str  : parameters and work space handed to the aligners
+   cur_ares     : caller-allocated result record; ->res is allocated
+                  here
+   first_align  : non-zero - produce the alignment even when the
+                  score does not exceed score_thresh
+
+   Returns the head of the result chain: cur_ares itself, or the
+   value returned by merge_ares_chains() when a flanking alignment
+   scored above the threshold.
+
+   Every allocation made here is checked; on failure a message is
+   written to stderr and the program exits, as was already done for
+   cur_ares->res.
+ */
+struct a_res_str *
+bd_malign (const unsigned char *aa0, int n0,
+           const unsigned char *aa1, int n1,
+           int score_thresh, int max_res,
+           struct pstruct *ppst,
+           struct f_struct *f_str,
+           struct a_res_str *cur_ares, int first_align)
+{
+  struct a_res_str *tmpl_ares, *tmpr_ares, *this_ares;
+  int hoff, l_min, l_max, window;
+  unsigned char *local_aa1;
+  int local_n1;
+  int score_ix;
+  int max_sub_score = -1;
+  int min_alen;
+  int have_local_aa1;
+  struct score_count_s s_info = {0, 0, 0};
+
+  min_alen = min(MIN_LOCAL_LEN,n0);
+
+  score_ix = ppst->score_ix;
+
+  /* now we need alignment storage - get it */
+  if ((cur_ares->res = (int *)calloc((size_t)max_res,sizeof(int)))==NULL) {
+    fprintf(stderr,"*** error [%s:%d] cannot allocate alignment results array %d\n",
+            __FILE__, __LINE__, max_res);
+    exit(1);
+  }
+
+  cur_ares->next = NULL;
+  cur_ares->n1 = n1;
+
+  /* lots of changes to optcut, optwid, and optflag were made in do_walign */
+
+  do_fasta(aa0, n0, aa1, n1, ppst, f_str, &cur_ares->rst, &hoff,0, &s_info);
+
+  if (first_align || cur_ares->rst.score[score_ix]>score_thresh) {
+    if (ppst->sw_flag) {
+      /* hoff gives us a projection of the query on the library
+         sequence, which can be used to limit the portions of the
+         library sequence that will be aligned by Smith-Waterman
+
+         this ensures that the optimal alignment (with pam2p[0], not
+         pam2p[1]), aligns the correct region when the only difference between
+         the regions is the lseg encoding
+      */
+
+      window = min(n1, ppst->param_u.fa.optwid);
+      /* this windowing seems inappropriate when Smith-Waterman is used (sw_flag),
+         but it is done to ensure that seg'ed regions are ignored
+      */
+      l_min = 0;
+      l_max = n1;
+      if (ppst->pam_x_set) {
+        l_min = max(0, -window-hoff);
+        l_max = min(n1, n0-hoff+window);
+      }
+
+      have_local_aa1 = 0;
+      local_aa1 = (unsigned char *)aa1;
+      if (l_min > 0 || l_max < n1 - 1) {
+        if (l_max - l_min < 0) {
+          fprintf(stderr,"*** error [%s:%d] l_min: %d > l_max %d\n",__FILE__, __LINE__, l_min,l_max);
+          exit(1);
+        }
+        /* the region is aligned from a private, zero-filled copy with
+           one spare byte on each side, instead of overwriting bytes
+           of the caller's (const) aa1 */
+        if ((local_aa1 = (unsigned char *)calloc(l_max - l_min +2,sizeof(unsigned char)))==NULL) {
+          fprintf(stderr,"*** error [%s:%d] Cannot allocate local_aa1\n",__FILE__, __LINE__);
+          exit(1);
+        }
+
+        local_aa1++;
+        memcpy(local_aa1,aa1+l_min, l_max - l_min);
+        have_local_aa1 = 1;
+      }
+
+      cur_ares->sw_score = sw_walign(f_str->pam2p[0], n0, local_aa1, l_max - l_min,
+                                     -ppst->gdelval, -ppst->ggapval,
+                                     f_str->ss, cur_ares);
+
+      /* shift the coordinates back from the copied region to aa1 */
+      cur_ares->min1 += l_min;
+      cur_ares->max1 += l_min;
+      cur_ares->n1 += l_min;
+      if (have_local_aa1) {free(--local_aa1);}
+    }
+    else {
+      cur_ares->sw_score = bd_walign(aa0, n0, aa1, n1, ppst, f_str, hoff, cur_ares);
+    }
+  }
+  else {
+    cur_ares->nres = 0;
+    cur_ares->sw_score=0;
+    cur_ares->rst.score[0] = cur_ares->rst.score[1] = cur_ares->rst.score[2] = 0;
+  }
+
+  /* check to see if a variant is better */
+
+  if (!ppst->do_rep || cur_ares->rst.score[score_ix] <= score_thresh) {return cur_ares;}
+
+  if (cur_ares->min1 >= min_alen) { /* try the left */
+    /* allocate a_res */
+    if ((tmpl_ares = (struct a_res_str *)calloc(1, sizeof(struct a_res_str)))==NULL) {
+      fprintf(stderr,"*** error [%s:%d] Cannot allocate left a_res\n",__FILE__, __LINE__);
+      exit(1);
+    }
+
+    /* private copy of aa1[0..min1-1] */
+    if ((local_aa1 = (unsigned char *)calloc(cur_ares->min1+2,sizeof(unsigned char)))==NULL) {
+      fprintf(stderr,"*** error [%s:%d] Cannot allocate left local_aa1\n",__FILE__, __LINE__);
+      exit(1);
+    }
+    local_aa1++;
+    memcpy(local_aa1,aa1,cur_ares->min1);
+
+    tmpl_ares = bd_malign(aa0, n0, local_aa1, cur_ares->min1,
+                          score_thresh, max_res,
+                          ppst, f_str, tmpl_ares,0);
+
+    free(--local_aa1);
+
+    if (tmpl_ares->rst.score[score_ix] > score_thresh) {
+      max_sub_score = tmpl_ares->rst.score[score_ix];
+    }
+    else {
+      if (tmpl_ares->res) free(tmpl_ares->res);
+      free(tmpl_ares);
+      tmpl_ares=NULL;
+    }
+  }
+  else {tmpl_ares = NULL;}
+
+  local_n1 = n1 - cur_ares->max1;
+  if (local_n1 >= min_alen) { /* try the right */
+    /* allocate a_res */
+    if ((tmpr_ares = (struct a_res_str *)calloc(1, sizeof(struct a_res_str)))==NULL) {
+      fprintf(stderr,"*** error [%s:%d] Cannot allocate right a_res\n",__FILE__, __LINE__);
+      exit(1);
+    }
+
+    /* private copy of aa1[max1..n1-1] */
+    if ((local_aa1 = (unsigned char *)calloc(local_n1+2, sizeof(unsigned char)))==NULL) {
+      fprintf(stderr,"*** error [%s:%d] Cannot allocate right local_aa1\n",__FILE__, __LINE__);
+      exit(1);
+    }
+    local_aa1++;
+    memcpy(local_aa1,aa1+cur_ares->max1, local_n1);
+
+    tmpr_ares = bd_malign(aa0, n0, local_aa1, local_n1,
+                          score_thresh, max_res,
+                          ppst, f_str, tmpr_ares,0);
+    free(--local_aa1);
+
+    if (tmpr_ares->rst.score[score_ix] > score_thresh) {
+      /* adjust the left boundary: the copy started at aa1[max1] */
+      for (this_ares = tmpr_ares; this_ares; this_ares = this_ares->next) {
+        this_ares->min1 += cur_ares->max1;
+        this_ares->max1 += cur_ares->max1;
+      }
+      if (tmpr_ares->rst.score[score_ix] > max_sub_score) {
+        max_sub_score = tmpr_ares->rst.score[score_ix];
+      }
+    }
+    else {
+      if (tmpr_ares->res) free(tmpr_ares->res);
+      free(tmpr_ares);
+      tmpr_ares = NULL;
+    }
+  }
+  else {tmpr_ares = NULL;}
+
+  /* neither side produced an alignment above the threshold */
+  if (max_sub_score <= score_thresh) return cur_ares;
+
+  cur_ares = merge_ares_chains(cur_ares, tmpl_ares, score_ix, "left");
+  cur_ares = merge_ares_chains(cur_ares, tmpr_ares, score_ix, "right");
+
+  return cur_ares;
+}
+
+/* A PACKAGE FOR LOCALLY ALIGNING TWO SEQUENCES WITHIN A BAND:
+
+ To invoke, call LOCAL_ALIGN(A,B,M,N,L,U,W,G,H,S,dflag,&SI,&SJ,&EI,&EJ,MW).
+ The parameters are explained as follows:
+ A, B : two sequences to be aligned
+ M : the length of sequence A
+ N : the length of sequence B
+ L : lower bound of the band
+ U : upper bound of the band
+ W : scoring table for matches and mismatches
+ G : gap-opening penalty
+ H : gap-extension penalty
+ dflag : 0 - no display or backward pass
+ *SI : starting position of sequence A in the optimal local alignment
+ *SJ : starting position of sequence B in the optimal local alignment
+ *EI : ending position of sequence A in the optimal local alignment
+ *EJ : ending position of sequence B in the optimal local alignment
+ MW : maximum window size
+*/
+
+/* bd_walign - banded local alignment of aa0 against aa1 with an edit
+   script.
+
+   aa0, n0 : first sequence and its length
+   aa1, n1 : second sequence and its length
+   ppst    : parameters (optwid, pam2[0], gdelval, ggapval are read)
+   f_str   : work space (bss, bss_size, mtp are used)
+   hoff    : diagonal offset on which the band is centered
+   a_res   : receives n1 and, when a positive score is found,
+             min0/min1/max0/max1 plus the script in res/nres
+
+   Returns the LOCAL_ALIGN() score, or 0 (after a message on stderr)
+   when that score is not positive. */
+int bd_walign (const unsigned char *aa0, int n0,
+               const unsigned char *aa1, int n1,
+               struct pstruct *ppst,
+               struct f_struct *f_str, int hoff,
+               struct a_res_str *a_res)
+{
+  int band_lo, band_hi, band_w;
+  int beg0, beg1, end0, end1;	/* alignment ends reported by LOCAL_ALIGN */
+  int aln_score;
+
+  band_w = min (n1, ppst->param_u.fa.optwid);
+  if (band_w > f_str->bss_size) {
+    fprintf(stderr,"*** error [%s:%d] walign window [%d] out of range [%d]\n",
+            __FILE__, __LINE__, band_w, f_str->bss_size);
+    band_w = f_str->bss_size - 4;
+  }
+
+  /* band of width band_w placed around the diagonal given by hoff */
+  band_lo = -band_w/2-hoff;
+  band_hi = band_lo+band_w;
+
+  /* pass 1: score and boundaries; sequences are read from index 1 */
+  aln_score = LOCAL_ALIGN(aa0-1,aa1-1,n0,n1, band_lo, band_hi,
+                          ppst->pam2[0], -ppst->gdelval, -ppst->ggapval,
+                          &beg0,&beg1,&end0,&end1,ppst->param_u.fa.optwid,f_str);
+  a_res->n1 = n1;
+
+  if (aln_score <=0) {
+    fprintf(stderr,"*** [%s:%d] n0/n1: %d/%d hoff: %d window: %d\n",
+            __FILE__, __LINE__, n0, n1, hoff, band_w);
+    return 0;
+  }
+
+  /* starts are stored one lower than reported, ends as reported */
+  a_res->min0 = beg0-1;
+  a_res->max0 = end0;
+  a_res->min1 = beg1-1;
+  a_res->max1 = end1;
+
+  /* pass 2: script for the aligned region only; the band limits are
+     shifted by the offset between the two start positions */
+  B_ALIGN(aa0-1+beg0-1,aa1-1+beg1-1,end0-beg0+1,end1-beg1+1,
+          band_lo-(beg1-beg0),band_hi-(beg1-beg0),
+          ppst->pam2[0], -ppst->gdelval, -ppst->ggapval,
+          a_res->res,&a_res->nres,ppst->param_u.fa.optwid,n0,f_str->bss, &f_str->mtp);
+
+  return aln_score;
+}
+
+/* LOCAL_ALIGN - banded local alignment: best score and the
+   coordinates at which the best alignment starts and ends.
+
+   A, B    : sequences, read with indexes starting at 1
+   M, N    : lengths of A and B
+   low, up : band limits; clipped here to the range [-M, N]
+   W       : scoring matrix, read as W[A[i]][B[j]]
+   G, H    : gap penalties; opening a gap from a cell costs G+H,
+             continuing one costs H
+   psi,psj : out - start of the alignment in A and in B
+   pei,pej : out - end of the alignment in A and in B
+   MW      : not referenced in this function
+   f_str   : supplies the scratch array f_str->bss (allocated
+             elsewhere; indexes up to band+1 are written)
+
+   Returns the best score; 0 when M <= 0, N <= 0 or no cell scores
+   above zero; -1 when the clipped band is empty.  On each of those
+   early returns the four outputs are 0.
+
+   The outputs are now always defined.  The early returns used to do
+   "*psi = *psj = *pei = *pej", which reads the caller's
+   uninitialized *pej, and the start coordinates could be read
+   without ever having been set when the backward pass found no cell
+   equal to the best score. */
+static int
+LOCAL_ALIGN(const unsigned char *A, const unsigned char *B,
+            int M, int N,
+            int low, int up, int **W, int G,int H,
+            int *psi, int *psj, int *pei, int *pej, int MW,
+            struct f_struct *f_str)
+{
+  int band;
+  register struct bdstr *bssp;
+  int i, j, si, ei;
+  int c, d, e, t, m;
+  int leftd, rightd;
+  int best_score, starti, startj, endi, endj;
+  int *wa, curd;
+  int ib;
+  char flag;
+
+  bssp = f_str->bss;
+
+  /* defined values for every path that returns before the end */
+  *psi = *psj = *pei = *pej = 0;
+
+  m = G+H;
+  low = max(-M, low);
+  up = min(N, up);
+
+  if (N <= 0) {
+    return 0;
+  }
+  if (M <= 0) {
+    return 0;
+  }
+  band = up-low+1;
+  if (band < 1) {
+    fprintf(stderr,"*** error [%s:%d] low > up is unacceptable!: M: %d N: %d l/u: %d/%d\n",
+            __FILE__, __LINE__, M, N, low, up);
+    return -1;
+  }
+
+  /* ---- forward pass: best score and where it ends ---- */
+
+  if (low > 0) leftd = 1;
+  else if (up < 0) leftd = band;
+  else leftd = 1-low;
+  rightd = band;
+  si = max(0,-up);
+  ei = min(M,N-low);
+  bssp[leftd].CC = 0;
+  for (j = leftd+1; j <= rightd; j++) {
+    bssp[j].CC = 0;
+    bssp[j].DD = -G;
+  }
+  bssp[rightd+1].CC = MININT;
+  bssp[rightd+1].DD = MININT;
+  best_score = 0;
+  endi = si;
+  endj = si+low;
+  bssp[leftd-1].CC = MININT;
+  bssp[leftd].DD = -G;
+  for (i = si+1; i <= ei; i++) {
+    if (i > N-up) rightd--;
+    if (leftd > 1) leftd--;
+    wa = W[A[i]];
+    if ((c = bssp[leftd+1].CC-m) > (d = bssp[leftd+1].DD-H)) d = c;
+    if ((ib = leftd+low-1+i ) > 0) c = bssp[leftd].CC+wa[B[ib]];
+    if (d > c) c = d;
+    if (c < 0) c = 0;
+    e = c-G;
+    bssp[leftd].DD = d;
+    bssp[leftd].CC = c;
+    if (c > best_score) {
+      best_score = c;
+      endi = i;
+      endj = ib;
+    }
+    for (curd=leftd+1; curd <= rightd; curd++) {
+      if ((c = c-m) > (e = e-H)) e = c;
+      if ((c = bssp[curd+1].CC-m) > (d = bssp[curd+1].DD-H)) d = c;
+      c = bssp[curd].CC + wa[B[curd+low-1+i]];
+      if (e > c) c = e;
+      if (d > c) c = d;
+      if (c < 0) c = 0;
+      bssp[curd].CC = c;
+      bssp[curd].DD = d;
+      if (c > best_score) {
+        best_score = c;
+        endi = i;
+        endj = curd+low-1+i;
+      }
+    }
+  }
+
+  /* nothing scored above zero: there is no alignment to trace back */
+  if (best_score <= 0) {
+    return 0;
+  }
+
+  /* ---- backward pass from (endi,endj): find where it starts ---- */
+
+  /* sentinel values: if no cell reaches best_score the range check
+     after the loop fails deterministically */
+  starti = startj = -1;
+
+  leftd = max(1,-endi-low+1);
+  rightd = band-(up-(endj-endi));
+  bssp[rightd].CC = 0;
+  t = -G;
+  for (j = rightd-1; j >= leftd; j--) {
+    bssp[j].CC = t = t-H;
+    bssp[j].DD = t-G;
+  }
+  for (j = rightd+1; j <= band; ++j) bssp[j].CC = MININT;
+  bssp[leftd-1].CC = bssp[leftd-1].DD = MININT;
+  bssp[rightd].DD = -G;
+  flag = 0;
+  for (i = endi; i >= 1; i--) {
+    if (i+low <= 0) leftd++;
+    if (rightd < band) rightd++;
+    wa = W[A[i]];
+    if ((c = bssp[rightd-1].CC-m) > (d = bssp[rightd-1].DD-H)) d = c;
+    if ((ib = rightd+low-1+i) <= N) c = bssp[rightd].CC+wa[B[ib]];
+
+    if (d > c) c = d;
+    e = c-G;
+    bssp[rightd].DD = d;
+    bssp[rightd].CC = c;
+    if (c == best_score) {	/* reached the start of the alignment */
+      starti = i;
+      startj = ib;
+      flag = 1;
+      break;
+    }
+    for (curd=rightd-1; curd >= leftd; curd--) {
+      if ((c = c-m) > (e = e-H)) e = c;
+      if ((c = bssp[curd-1].CC-m) > (d = bssp[curd-1].DD-H)) d = c;
+
+      c = bssp[curd].CC + wa[B[curd+low-1+i]];
+      if (e > c) c = e;
+      if (d > c) c = d;
+      bssp[curd].CC = c;
+      bssp[curd].DD = d;
+      if (c == best_score) {
+        starti = i;
+        startj = curd+low-1+i;
+        flag = 1;
+        break;
+      }
+    }
+    if (flag == 1) break;
+  }
+
+  if (starti < 0 || starti > M || startj < 0 || startj > N) {
+    printf("starti=%d, startj=%d\n",starti,startj);
+    exit(1);
+  }
+  *psi = starti;
+  *psj = startj;
+  *pei = endi;
+  *pej = endj;
+  return best_score;
+}
+
+/* A PACKAGE FOR GLOBALLY ALIGNING TWO SEQUENCES WITHIN A BAND:
+
+ To invoke, call B_ALIGN(A,B,M,N,L,U,W,G,H,S,MW,MX).
+ The parameters are explained as follows:
+ A, B : two sequences to be aligned
+ M : the length of sequence A
+ N : the length of sequence B
+ L : lower bound of the band
+ U : upper bound of the band
+ W : scoring table for matches and mismatches
+ G : gap-opening penalty
+ H : gap-extension penalty
+ S : script for DISPLAY routine
+ MW : maximum window size
+ MX : maximum length sequence M to be aligned
+*/
+
+/* static int IP; */
+/* static int *MP[3]; */ /* save crossing points */
+/* static int *FP; */ /* forward dividing points */
+/* static char *MT[3]; */ /* 0: rep, 1: del, 2: ins */
+/* static char *FT; */
+
+#define gap(k) ((k) <= 0 ? 0 : g+h*(k)) /* k-symbol indel cost; uses g and h from the enclosing scope */
+
+/* Append "Delete k" op.  The script is written through sapp (int **)
+ and last (int *), which must be in scope where the macro is used.
+ If the previous op was also a delete (*last < 0) that entry is
+ extended by k, otherwise a new entry -k is stored and the script
+ pointer advances. */
+#define DEL(k) \
+{ if (*last < 0) \
+ *last = (*sapp)[-1] -= (k); \
+ else { \
+ *last = (*sapp)[0] = -(k); \
+ (*sapp)++; \
+ } \
+}
+
+/* Append "Insert k" op.  Mirror image of DEL: a previous insert
+ (*last > 0) is extended by k, otherwise a new entry +k is stored.
+ Uses sapp and last from the enclosing scope. */
+#define INS(k) \
+{ if (*last > 0) \
+ *last = (*sapp)[-1] += (k); \
+ else { \
+ *last = (*sapp)[0] = (k); \
+ (*sapp)++; \
+ } \
+}
+
+#define REP { *last = (*sapp)[0] = 0; (*sapp)++;} /* Append "Replace" op: always a new 0 entry; uses sapp and last */
+
+/* bg_align(A,B,M,N,low,up,tb,te,w,g,h,bss,mtp,sapp,last) returns the cost of an
+ optimum conversion between A[1..M] and B[1..N] within the band [low,up] and
+ appends such a conversion to the current script (through sapp/last, using the
+ DEL/INS/REP macros).
+ tb(te)= 1 no gap-open penalty if the conversion begins(ends) with a delete.
+ tb(te)= 2 no gap-open penalty if the conversion begins(ends) with an insert.
+
+ w is the scoring matrix (read as w[A[i]][B[j]]); g and h are the gap
+ penalties (first gap residue costs g+h, each further one h).
+ bss is the per-diagonal scratch array and mtp the crossing-point work
+ space; both are overwritten.  The mtp arrays must already be allocated;
+ rows 0..M are indexed.
+
+ The function works by divide and conquer: one forward pass records where
+ the best path crosses the middle diagonal of the band, then the blocks
+ between crossing points are solved by recursive calls.  The trivial cases
+ (N <= 0, M <= 0, band of width <= 1) append their script and return 0;
+ the recursive calls below ignore the return value.
+*/
+static int
+bg_align(const unsigned char *A, const unsigned char *B,
+ int M, int N,
+ int low, int up, int tb, int te,
+ int **w, int g, int h,
+ struct bdstr *bss, struct mtp_str *mtp,
+ int **sapp, int *last)
+{
+ int rmid, k, l, r, v, kt;
+ int t1, t2, t3;
+
+ {
+ int band, midd;
+ int leftd, rightd; /* for CC, DD, CP and DP */
+ register int curd; /* current index for CC, DD CP and DP */
+ register int i, j;
+ register int c, d, e;
+ int t, fr, *wa, ib, m;
+
+ /* Boundary cases: M <= 0 , N <= 0, or up-low <= 0 */
+ if (N <= 0) {
+ if (M > 0) { DEL(M) }
+ return 0;
+ }
+ if (M <= 0) {
+ INS(N)
+ return 0;
+ }
+ if ((band = up-low+1) <= 1) {
+ for (i = 1; i <= M; i++) { REP }
+ return 0;
+ }
+
+ /* Divide: Find all crossing points */
+
+ /* Initialization */
+ m = g + h;
+
+ midd = band/2 + 1; /* bss[] slot of the middle diagonal */
+ rmid = low + midd - 1; /* the same diagonal as an offset j-i */
+ leftd = 1-low;
+ rightd = up-low+1;
+ /* three cases, depending on which side of the middle diagonal slot leftd falls */
+ if (leftd < midd) {
+ fr = -1;
+ for (j = 0; j < midd; j++)
+ bss[j].CP = bss[j].DP = -1;
+ for (j = midd; j <= rightd; j++) {
+ bss[j].CP = bss[j].DP = 0;
+ }
+ mtp->MP[0][0] = -1;
+ mtp->MP[1][0] = -1;
+ mtp->MP[2][0] = -1;
+ mtp->MT[0][0] = mtp->MT[1][0] = mtp->MT[2][0] = 0;
+ } else if (leftd > midd) {
+ fr = leftd-midd;
+ for (j = 0; j <= midd; j++) {
+ bss[j].CP = bss[j].DP = fr;
+ }
+ for (j = midd+1; j <= rightd; j++)
+ bss[j].CP = bss[j].DP = -1;
+ mtp->MP[0][fr] = -1;
+ mtp->MP[1][fr] = -1;
+ mtp->MP[2][fr] = -1;
+ mtp->MT[0][fr] = mtp->MT[1][fr] = mtp->MT[2][fr] = 0;
+ } else {
+ fr = 0;
+ for (j = 0; j < midd; j++) {
+ bss[j].CP = bss[j].DP = 0;
+ }
+ for (j = midd; j <= rightd; j++) {
+ bss[j].CP = bss[j].DP = 0;
+ }
+ mtp->MP[0][0] = -1;
+ mtp->MP[1][0] = -1;
+ mtp->MP[2][0] = -1;
+ mtp->MT[0][0] = mtp->MT[1][0] = mtp->MT[2][0] = 0;
+ }
+
+ bss[leftd].CC = 0;
+ if (tb == 2) t = 0; /* leading insert: no gap-open charge */
+ else t = -g;
+ for (j = leftd+1; j <= rightd; j++) {
+ bss[j].CC = t = t-h;
+ bss[j].DD = t-g;
+ }
+ bss[rightd+1].CC = MININT; /* sentinel beyond the right edge */
+ bss[rightd+1].DD = MININT;
+ if (tb == 1) bss[leftd].DD = 0; /* leading delete: no gap-open charge */
+ else bss[leftd].DD = -g;
+ bss[leftd-1].CC = MININT; /* sentinel beyond the left edge */
+ for (i = 1; i <= M; i++) { /* one pass per residue of A */
+ if (i > N-up) rightd--;
+ if (leftd > 1) leftd--;
+ wa = w[A[i]];
+ /* leftmost slot: DP follows whichever predecessor supplied d */
+ if ((c = bss[leftd+1].CC-m) > (d = bss[leftd+1].DD-h)) {
+ d = c;
+ bss[leftd].DP = bss[leftd+1].CP;
+ } else bss[leftd].DP = bss[leftd+1].DP;
+ if ((ib = leftd+low-1+i) > 0) c = bss[leftd].CC+wa[B[ib]];
+ if (d > c || ib <= 0) {
+ c = d;
+ bss[leftd].CP = bss[leftd].DP;
+ }
+ e = c-g;
+ bss[leftd].DD = d;
+ bss[leftd].CC = c;
+ mtp->IP = bss[leftd].CP;
+ if (leftd == midd) bss[leftd].CP = bss[leftd].DP = mtp->IP = i;
+ for (curd=leftd+1; curd <= rightd; curd++) {
+ if (curd != midd) {
+ /* ordinary diagonal: CP/DP/IP are carried along with the score they belong to */
+ if ((c = c-m) > (e = e-h)) {
+ e = c;
+ mtp->IP = bss[curd-1].CP;
+ } /* otherwise, mtp->IP is unchanged */
+ if ((c = bss[curd+1].CC-m) > (d = bss[curd+1].DD-h)) {
+ d = c;
+ bss[curd].DP = bss[curd+1].CP;
+ } else {
+ bss[curd].DP = bss[curd+1].DP;
+ }
+ c = bss[curd].CC + wa[B[curd+low-1+i]];
+ if (c < d || c < e) {
+ if (e > d) {
+ c = e;
+ bss[curd].CP = mtp->IP;
+ } else {
+ c = d;
+ bss[curd].CP = bss[curd].DP;
+ }
+ } /* otherwise, CP is unchanged */
+ bss[curd].CC = c;
+ bss[curd].DD = d;
+ } else {
+ /* middle diagonal: record in MP/MT[..][i] the previous crossing row and the move type for each of the three states */
+ if ((c = c-m) > (e = e-h)) {
+ e = c;
+ mtp->MP[1][i] = bss[curd-1].CP;
+ mtp->MT[1][i] = 2;
+ } else {
+ mtp->MP[1][i] = mtp->IP;
+ mtp->MT[1][i] = 2;
+ }
+ if ((c = bss[curd+1].CC-m) > (d = bss[curd+1].DD-h)) {
+ d = c;
+ mtp->MP[2][i] = bss[curd+1].CP;
+ mtp->MT[2][i] = 1;
+ } else {
+ mtp->MP[2][i] = bss[curd+1].DP;
+ mtp->MT[2][i] = 1;
+ }
+ c = bss[curd].CC + wa[B[curd+low-1+i]];
+ if (c < d || c < e) {
+ if (e > d) {
+ c = e;
+ mtp->MP[0][i] = mtp->MP[1][i];
+ mtp->MT[0][i] = 2;
+ } else {
+ c = d;
+ mtp->MP[0][i] = mtp->MP[2][i];
+ mtp->MT[0][i] = 1;
+ }
+ } else {
+ mtp->MP[0][i] = i-1;
+ mtp->MT[0][i] = 0;
+ }
+ if (c-g > e) {
+ mtp->MP[1][i] = mtp->MP[0][i];
+ mtp->MT[1][i] = mtp->MT[0][i];
+ }
+ if (c-g > d) {
+ mtp->MP[2][i] = mtp->MP[0][i];
+ mtp->MT[2][i] = mtp->MT[0][i];
+ }
+ /* from here on, paths through this slot last crossed at row i */
+ bss[curd].CP = bss[curd].DP = mtp->IP = i;
+ bss[curd].CC = c;
+ bss[curd].DD = d;
+ }
+ }
+ }
+
+ /* decide which path to be traced back */
+ if (te == 1 && d+g > c) {
+ k = bss[rightd].DP;
+ l = 2;
+ } else if (te == 2 && e+g > c) {
+ k = mtp->IP;
+ l = 1;
+ } else {
+ k = bss[rightd].CP;
+ l = 0;
+ }
+ /* l is overridden according to the side of the middle diagonal on which the end point (M,N) lies */
+ if (rmid > N-M) l = 2;
+ else if (rmid < N-M) l = 1;
+ v = c; /* value returned to the caller */
+ }
+ /* Conquer: Solve subproblems recursively */
+
+ /* trace back: reverse the MP/MT chain into the forward lists FP/FT */
+ r = -1;
+ for (; k > -1; r=k, k=mtp->MP[l][r], l=mtp->MT[l][r]){
+ mtp->FP[k] = r;
+ mtp->FT[k] = l; /* l=0,1,2 */
+ }
+ /* forward dividing */
+ if (r == -1) { /* optimal alignment did not cross the middle diagonal */
+ if (rmid < 0) {
+ bg_align(A,B,M,N,rmid+1,up,tb,te,w,g,h,bss, mtp, sapp, last);
+ }
+ else {
+ bg_align(A,B,M,N,low,rmid-1,tb,te,w,g,h,bss, mtp, sapp, last);
+ }
+ } else {
+ k = r; /* first crossing row */
+ l = mtp->FP[k];
+ kt = mtp->FT[k];
+
+ /* first block */
+ if (rmid < 0) {
+ bg_align(A,B,r-1,r+rmid,rmid+1,min(up,r+rmid),tb,1,w,g,h,bss, mtp, sapp,last);
+ DEL(1)
+ } else if (rmid > 0) {
+ bg_align(A,B,r,r+rmid-1,max(-r,low),rmid-1,tb,2,w,g,h,bss, mtp, sapp,last);
+ INS(1)
+ }
+
+ /* intermediate blocks */
+ t2 = up-rmid-1;
+ t3 = low-rmid+1;
+ for (; l > -1; k = l, l = mtp->FP[k], kt = mtp->FT[k]) {
+ if (kt == 0) { REP }
+ else if (kt == 1) { /* right-hand side triangle */
+ INS(1)
+ t1 = l-k-1;
+ bg_align(A+k,B+k+rmid+1,t1,t1,0,min(t1,t2),2,1,w,g,h,bss, mtp, sapp,last);
+ DEL(1)
+ }
+ else { /* kt == 2, left-hand side triangle */
+ DEL(1)
+ t1 = l-k-1;
+ bg_align(A+k+1,B+k+rmid,t1,t1,max(-t1,t3),0,1,2,w,g,h,bss, mtp, sapp,last);
+ INS(1)
+ }
+ }
+
+ /* last block */
+ if (N-M > rmid) {
+ INS(1)
+ t1 = k+rmid+1;
+ bg_align(A+k,B+t1,M-k,N-t1,0,min(N-t1,t2),2,te,w,g,h,bss, mtp, sapp,last);
+ } else if (N-M < rmid) {
+ DEL(1)
+ t1 = M-(k+1);
+ bg_align(A+k+1,B+k+rmid,t1,N-(k+rmid),max(-t1,t3),0,1,te,w,g,h,
+ bss,mtp,sapp,last);
+ }
+ }
+ return(v);
+}
+
+/* B_ALIGN - global alignment of A[1..M] with B[1..N] inside a band;
+   writes the edit script and its length.
+
+   A, B    : sequences, read with indexes starting at 1
+   M, N    : lengths of A and B
+   low, up : band limits; adjusted here so that the band contains
+             both diagonal 0 and diagonal N-M
+   W       : scoring matrix, read as W[A[i]][B[j]]
+   G, H    : gap penalties (a gap of k residues costs G + k*H)
+   S       : out - edit script (0 = aligned pair, k > 0 = insert k,
+             k < 0 = delete -k)
+   nS      : out - number of alignment columns described by S
+   MW      : not referenced in this function
+   MX      : the mtp work arrays are allocated with MX+1 entries
+   bss     : per-diagonal scratch array used by bg_align()
+   mtp     : crossing-point work space; its arrays are allocated here
+             when mtp->MT[0] is NULL and released again before
+             returning
+
+   Returns the alignment score.  For the general case the score is
+   recomputed from the script by BCHECK_SCORE(); a disagreement is
+   reported on stdout.
+
+   *nS is now set on the three trivial paths as well (B empty, A
+   empty, single-diagonal band).  They used to return after writing
+   the script but without writing *nS, leaving the caller with a
+   stale length.  The values are those BCHECK_SCORE() computes for
+   the same scripts. */
+int B_ALIGN(const unsigned char *A, const unsigned char *B,
+            int M, int N,
+            int low, int up, int **W, int G, int H, int *S, int *nS,
+            int MW, int MX, struct bdstr *bss, struct mtp_str *mtp)
+{
+  int c, i;
+  int g, h;			/* read by the gap() macro */
+  size_t mj;
+  int check_score;
+  int **sapp, *sapp_v, *last, last_v;	/* script cursor for DEL/INS/REP */
+
+  g = G;
+  h = H;
+  sapp_v = S;
+  sapp = &sapp_v;
+
+  last_v = 0;
+  last = &last_v;
+
+  low = min(max(-M, low),min(N-M,0));
+  up = max(min(N, up),max(N-M,0));
+
+  if (N <= 0) {			/* nothing in B: delete all of A */
+    if (M > 0) { DEL(M); }
+    *nS = (M > 0) ? M : 0;
+    return -gap(M);
+  }
+  if (M <= 0) {			/* nothing in A: insert all of B */
+    INS(N);
+    *nS = N;
+    return -gap(N);
+  }
+  if (up-low+1 <= 1) {		/* single diagonal: M aligned pairs */
+    c = 0;
+    for (i = 1; i <= M; i++) {
+      REP;
+      c += W[A[i]][B[i]];
+    }
+    *nS = M;
+    return c;
+  }
+
+  if (mtp->MT[0]==NULL) {
+    mj = (MX+1) * sizeof(int);
+    mtp->MT[0] = (int *) ckalloc(mj);
+    mtp->MT[1] = (int *) ckalloc(mj);
+    mtp->MT[2] = (int *) ckalloc(mj);
+    mtp->FT = (int *) ckalloc(mj);
+
+    /* mj *= sizeof(int); (already done) */
+    mtp->MP[0] = (int *) ckalloc(mj);
+    mtp->MP[1] = (int *) ckalloc(mj);
+    mtp->MP[2] = (int *) ckalloc(mj);
+    mtp->FP = (int *) ckalloc(mj);
+  }
+
+  c = bg_align(A,B,M,N,low,up,0,0,W,G,H,bss, mtp, sapp, last);
+
+  /* re-score the script; this also produces *nS */
+  check_score = BCHECK_SCORE(A,B,M,N,S,W,G,H,nS);
+
+  free(mtp->FP); free(mtp->MP[2]); free(mtp->MP[1]); free(mtp->MP[0]);
+  free(mtp->FT); free(mtp->MT[2]); free(mtp->MT[1]); free(mtp->MT[0]);
+  mtp->MT[0]=NULL;		/* forces re-allocation on the next call */
+
+  if (check_score != c)
+    printf("\nBCheck_score=%d != %d\n", check_score,c);
+  return c;
+}
+
+/* do_walign - produce the alignment(s) of aa0 with aa1.
+
+   aa0, n0       : query sequence and its length
+   aa1, n1       : library sequence and its length; with TFASTA it is
+                   first passed through aatran() into f_str->aa1x
+   frame         : frame handed to aatran() (TFASTA only)
+   repeat_thresh : score threshold handed to bd_malign()
+   ppst          : parameters; use_E_thresholds, optflag, optcut and
+                   optwid are changed for the duration of the call
+                   and put back afterwards
+   f_str         : work space
+   have_ares     : out - set to 0x3 (0x2 bit: local copy)
+
+   Returns the chain of results from bd_malign() with ->index
+   numbered from 0, or NULL when the first result record cannot be
+   allocated.
+
+   The allocation-failure message now ends with a newline like the
+   other diagnostics in this file; unused locals were removed. */
+struct a_res_str *
+do_walign (const unsigned char *aa0, int n0,
+           const unsigned char *aa1, int n1,
+           int frame, int repeat_thresh,
+           struct pstruct *ppst,
+           struct f_struct *f_str,
+           int *have_ares)
+{
+  int use_E_thresholds_s, optflag_s, optcut_s, optwid_s, n10;
+  const unsigned char *aa1p;
+  struct a_res_str *a_res, *tmp_a_res;
+  int a_res_index;
+#ifdef DEBUG
+  unsigned long adler32_crc;
+#endif
+
+#ifdef TFASTA
+  f_str->n10 = n10=aatran(aa1,f_str->aa1x,n1,frame);
+  aa1p = f_str->aa1x;
+#else
+  n10 = n1;
+  aa1p = aa1;
+#endif
+
+#ifdef DEBUG
+  /* checksum used below to detect modification of aa1 */
+  adler32_crc = adler32(1L,aa1,n1);
+#endif
+
+  *have_ares = 0x3;	/* set 0x2 bit to indicate local copy */
+
+  if ((a_res = (struct a_res_str *)calloc(1, sizeof(struct a_res_str)))==NULL) {
+    fprintf(stderr,"*** error [%s:%d] Cannot allocate a_res\n", __FILE__, __LINE__);
+    return NULL;
+  }
+
+  /* save the search settings, then switch to the alignment settings */
+  use_E_thresholds_s = ppst->param_u.fa.use_E_thresholds;
+  optflag_s = ppst->param_u.fa.optflag;
+  optcut_s = ppst->param_u.fa.optcut;
+  optwid_s = ppst->param_u.fa.optwid;
+  ppst->param_u.fa.use_E_thresholds = 0;
+  ppst->param_u.fa.optflag = 1;
+  ppst->param_u.fa.optcut = 0;
+  if (!ppst->param_u.fa.optwid_set) {
+    ppst->param_u.fa.optwid *= 2;
+  }
+
+  a_res = bd_malign(aa0, n0, aa1p, n10,
+                    repeat_thresh, f_str->max_res,
+                    ppst, f_str, a_res,1);
+
+  /* put the search settings back */
+  ppst->param_u.fa.use_E_thresholds = use_E_thresholds_s;
+  ppst->param_u.fa.optflag = optflag_s;
+  ppst->param_u.fa.optcut = optcut_s;
+  ppst->param_u.fa.optwid = optwid_s;
+
+#ifdef DEBUG
+  if (adler32(1L,aa1,n1) != adler32_crc) {
+    fprintf(stderr,"*** error [%s:%d] adler32_crc mismatch n1: %d\n",__FILE__, __LINE__, n1);
+  }
+#endif
+
+  /* number the results in chain order */
+  a_res_index = 0;
+  for (tmp_a_res=a_res; tmp_a_res; tmp_a_res = tmp_a_res->next) {
+    tmp_a_res->index = a_res_index++;
+  }
+
+  return a_res;
+}
+
+/* pre_cons - with TFASTA, run aa1 through aatran() into f_str->aa1x
+   and store the value it returns in f_str->n10.  Without TFASTA this
+   function does nothing. */
+void
+pre_cons(const unsigned char *aa1, int n1, int frame, struct f_struct *f_str)
+{
+#ifdef TFASTA
+  unsigned char *translated = f_str->aa1x;
+
+  f_str->n10 = aatran(aa1, translated, n1, frame);
+#endif
+}
+
+/* aln_func_vals - fill in aln->qlfact, qlrev, llfact, llmult, frame
+   and llrev for the given frame.
+   (the original note says: call from calcons, calc_id, calc_code)
+
+   TFASTA : qlfact 1, llfact 3, llmult 3, qlrev 0; for frame > 2
+            llrev is 1 and aln->frame is 3 - frame, otherwise llrev
+            is 0 and aln->frame is frame.
+   FASTA  : all three factors 1, llrev 0, aln->frame 0; qlrev is 1
+            when frame > 0, else 0. */
+void
+aln_func_vals(int frame, struct a_struct *aln)
+{
+#ifdef TFASTA
+  aln->qlrev = 0;
+  aln->qlfact = 1;
+  aln->llmult = 3;
+  aln->llfact = 3;
+  if (frame <= 2) {
+    aln->llrev = 0;
+    aln->frame = frame;
+  }
+  else {
+    aln->llrev = 1;
+    aln->frame = 3 - frame;
+  }
+#else /* FASTA */
+  aln->qlfact = 1;
+  aln->llfact = 1;
+  aln->llmult = 1;
+  aln->frame = 0;
+  aln->llrev = 0;
+  aln->qlrev = (frame > 0) ? 1 : 0;
+#endif
+}
diff --git a/src/dropnfa.h b/src/dropnfa.h
new file mode 100644
index 0000000..09fe455
--- /dev/null
+++ b/src/dropnfa.h
@@ -0,0 +1,84 @@
+
+/* $Id: dropnfa.h 795 2011-07-04 15:36:12Z wrp $ */
+/* $Revision: 795 $ */
+
+/* global definitions shared by dropnfa.c and altivec.c */
+
+#ifndef MAXSAV
+#define MAXSAV 10
+#endif
+
+
+
+struct dstruct /* diagonal structure for saving current run; one record describes the run on one diagonal */
+{
+ int score; /* hash score of current match */
+ int start; /* start of current match */
+ int stop; /* end of current match */
+ struct savestr *dmax; /* location in vmax[] where best score data saved (points at a struct savestr) */
+};
+
+struct savestr /* one saved match; arrays of pointers to these are sorted by kssort() (->score, descending) and kpsort() (->start, ascending) */
+{
+ int score; /* pam score with segment optimization */
+ int score0; /* pam score of best single segment */
+ int gscore; /* score from global match */
+ int dp; /* diagonal of match */
+ int start; /* start of match in lib seq */
+ int stop; /* end of match in lib seq */
+};
+
+struct bdstr { /* one slot of the band scratch array (f_struct.bss) used by FLOCAL_ALIGN, LOCAL_ALIGN and bg_align */
+ int CC, DD, CP, DP; /* CC: score of the cell; DD: score of the cell when reached through a gap; CP, DP: crossing-row bookkeeping kept for CC and DD (written only by bg_align) */
+};
+
+struct mtp_str { /* used to hold previous static values; work space for bg_align(), arrays allocated and freed by B_ALIGN() */
+ int IP; /* crossing row carried along the current row in bg_align() */
+ int *MP[3]; /* save crossing points */
+ int *FP; /* forward dividing points */
+ int *MT[3]; /* 0: rep, 1: del, 2: ins -- was char, now int */
+ int *FT; /* was char, now int */
+};
+
+struct f_struct { /* per-search work space handed to the dropnfa.c alignment functions */
+ struct dstruct *diag;
+ int ndo;
+ int hmask; /* hash constants */
+ int *pamh1; /* pam based array */
+ int *pamh2; /* pam based kfact array */
+ int *link, *harr; /* hash arrays */
+ int kshft; /* shift width */
+ int c_gap, opt_cut;
+#ifdef TFASTA
+ unsigned char *aa1x; /* output buffer given to aatran() in do_walign()/pre_cons() */
+ int n10; /* value returned by that aatran() call */
+#endif
+ struct bdstr *bss; /* band scratch array used by FLOCAL_ALIGN, LOCAL_ALIGN, B_ALIGN */
+ struct mtp_str mtp; /* crossing-point work space passed to B_ALIGN() */
+ int bss_size; /* band widths are checked against this in dmatch() (DEBUG) and bd_walign() */
+ struct swstr *ss; /* passed to sw_walign() by bd_malign() */
+ struct swstr *f_ss, *r_ss;
+ int *waa_s, *waa_a;
+ int **pam2p[2]; /* pam2p[0] is the profile passed to sw_walign() */
+ int max_res; /* number of ints allocated for each a_res_str.res (see bd_malign) */
+ double aa0_f[MAXSQ];
+ double *kar_p;
+
+#ifdef FA_ALTIVEC
+ int vec_len;
+ vecInt **vec_matrix;
+ vector signed ALTIVEC_SIZE *vec_HH;
+ vector signed ALTIVEC_SIZE *vec_EE;
+
+ int vec_len2;
+ vecInt2 **vec_matrix2;
+ vector signed ALTIVEC_SIZE2 *vec_HH2;
+ vector signed ALTIVEC_SIZE2 *vec_EE2;
+#endif
+};
+
+static int
+FLOCAL_ALIGN(const unsigned char *A, const unsigned char *B,
+ int M, int N, int low, int up,
+ int **W, int G,int H, int MW,
+ struct f_struct *f_str);
diff --git a/src/dropnnw2.c b/src/dropnnw2.c
new file mode 100644
index 0000000..a1112e6
--- /dev/null
+++ b/src/dropnnw2.c
@@ -0,0 +1,900 @@
+/* $Id: dropnnw2.c $ */
+/* $Revision: 1140 $ */
+
+/* copyright (c) 1996, 2007, 2014 by William R. Pearson and The Rector &
+ Visitors of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+/* 4-April-2007 - convert to global alignment */
+
+/* 17-Aug-2006 - removed globals *sapp/last - alignment should be thread safe */
+
+/* 12-Oct-2005 - converted to use a_res and aln for alignment coordinates */
+
+/* 4-Nov-2004 - Diagonal Altivec Smith-Waterman included */
+
+/* 14-May-2003 - modified to return alignment start at 0, rather than
+ 1, for begin:end alignments
+
+ 25-Feb-2003 - modified to support Altivec parallel Smith-Waterman
+
+ 22-Sep-2003 - removed Altivec support at request of Sencel lawyers
+*/
+
+/* this code uses an implementation of the Smith-Waterman algorithm
+ designed by Phil Green, U. of Washington, that is 1.5 - 2X faster
+ than my Miller and Myers implementation. */
+
+/* the shortcuts used in this program prevent it from calculating scores
+ that are less than the gap penalty for the first residue in a gap. As
+ a result this code cannot be used with very large gap penalties, or
+ with very short sequences, and probably should not be used with prss3.
+*/
+
+/* version 3.2 fixes a subtle bug that was encountered while running
+ do_walign() interspersed with do_work(). This happens only with -m
+ 9 and pvcomplib. The fix was to more explicitly zero-out ss[] at
+ the beginning of do_work.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <math.h>
+
+#include "defs.h"
+#include "param.h"
+
+static char *verstr="6.0 April 2007";
+
+#include "dropgsw2.h"
+
+#define DROP_INTERN
+#include "drop_func.h"
+
+#ifdef SW_SSE2
+#ifdef GLOBAL_GLOBAL
+#include "global_sse2.h"
+#define GLOBAL_BYTE global_sse2_byte
+#define GLOBAL_WORD global_sse2_word
+#else
+#include "glocal_sse2.h"
+#define GLOBAL_BYTE glocal_sse2_byte
+#define GLOBAL_WORD glocal_sse2_word
+#endif
+#endif
+
+struct swstr {int H, E;}; /* one DP cell per query position; the aligners in this file store each row's h in H and its e value in E */
+
+extern void init_karlin(const unsigned char *aa0, int n0, struct pstruct *ppst,
+ double *aa0_f, double **kp);
+extern int do_karlin(const unsigned char *aa1, int n1,
+ int **pam2, const struct pstruct *ppst,
+ double *aa0_f, double *kar_p, double *lambda, double *H);
+
+extern int
+NW_ALIGN(int IW, const unsigned char *B,
+ int M, int N,
+ int **W, int G, int H, int *res, int *nres
+ );
+
+static int
+FGLOBAL_ALIGN(int *pwaa, const unsigned char *aa1,
+ int n0, int n1,
+ int GG,int HH,
+ struct swstr *ss);
+
+extern struct a_res_str *
+nsw_malign (int ***pam2p, int pam_ix, int n0,
+ const unsigned char *aa1, int n1,
+ int score_thresh, int max_res,
+ int gdelval, int ggapval,
+ struct swstr *ss,
+ struct a_res_str *cur_ares,
+ int (*fn_walign)
+ (
+ int **pam2p, int n0,
+ const unsigned char *aa1, int n1,
+ int q, int r,
+ struct swstr *ss,
+ struct a_res_str *a_res
+ ),
+ int do_rep
+ );
+
+static
+void DISPLAY(const unsigned char *A, const unsigned char *B,
+ int M, int N,
+ int *S, int AP, int BP, char *sq);
+
+extern void aancpy(char *to, char *from, int count, struct pstruct *ppst);
+
+/* init_work: allocate the per-query f_struct, fill its score profiles (and the
+ SSE2 profiles when SW_SSE2 is defined); allocation failures print to stderr and exit */
+void
+init_work (unsigned char *aa0, int n0,
+ struct pstruct *ppst,
+ struct f_struct **f_arg)
+{
+ int ip;
+ int *pwaa_s, *pwaa_a;
+ int e, f, i, j;
+ struct f_struct *f_str;
+ int **pam2p;
+ struct swstr *ss;
+ int nsq;
+
+#if defined(SW_ALTIVEC) || defined(SW_SSE2)
+ int data, bias, ceiling, gap;
+ unsigned char * pc;
+ unsigned short * ps;
+ int overflow;
+ int n_count;
+ int col_len;
+#endif
+
+ if (ppst->ext_sq_set) {
+ nsq = ppst->nsqx; ip = 1;
+ }
+ else {
+ /* with memory mapped databases with lc chars, always nsqx */
+ nsq = ppst->nsqx; ip = 0;
+ }
+
+ /* initialize range of length appropriate */
+
+ if (ppst->n1_low == 0 ) {
+ ppst->n1_low = (int)(0.75 * (float)n0 + 0.5);
+ }
+
+#if defined(GLOBAL_GLOBAL)
+ if (ppst->n1_high == BIGNUM) {
+ ppst->n1_high = (int)(1.33 * (float)n0 - 0.5);
+ }
+#endif
+
+ /* allocate space for function globals */
+ if ((f_str = (struct f_struct *)calloc(1,sizeof(struct f_struct))) == NULL) {
+ fprintf(stderr,"cannot allocate f_str\n");
+ exit(1);
+ }
+
+ if((ppst->zsflag%10) == 6) {
+ f_str->kar_p = NULL;
+ init_karlin(aa0, n0, ppst, &f_str->aa0_f[0], &f_str->kar_p);
+ }
+
+ /* allocate space for the scoring arrays */
+ if ((ss = (struct swstr *) calloc (n0+2, sizeof (struct swstr)))
+ == NULL) {
+ fprintf (stderr, "cannot allocate ss array %3d\n", n0);
+ exit (1);
+ }
+ ss++;
+
+ f_str->ss = ss;
+
+ /* initialize variable (-S) pam matrix */
+ if ((f_str->waa_s= (int *)calloc((nsq+1)*(n0+1),sizeof(int))) == NULL) {
+ fprintf(stderr,"cannot allocate waa_s array %3d\n",nsq*n0);
+ exit(1);
+ }
+
+ /* initialize pam2p[1] pointers */
+ if ((f_str->pam2p[1]= (int **)calloc((n0+1),sizeof(int *))) == NULL) {
+ fprintf(stderr,"cannot allocate pam2p[1] array %3d\n",n0);
+ exit(1);
+ }
+
+ pam2p = f_str->pam2p[1];
+ if ((pam2p[0]=(int *)calloc((nsq+1)*(n0+1),sizeof(int))) == NULL) {
+ fprintf(stderr,"cannot allocate pam2p[1][] array %3d\n",nsq*n0);
+ exit(1);
+ }
+
+ for (i=1; i<n0; i++) {
+ pam2p[i]= pam2p[0] + (i*(nsq+1));
+ }
+
+ /* initialize universal (alignment) matrix */
+ if ((f_str->waa_a= (int *)calloc((nsq+1)*(n0+1),sizeof(int))) == NULL) {
+ fprintf(stderr,"cannot allocate waa_a struct %3d\n",nsq*n0);
+ exit(1);
+ }
+
+ /* initialize pam2p[0] pointers */
+ if ((f_str->pam2p[0]= (int **)calloc((n0+1),sizeof(int *))) == NULL) {
+ fprintf(stderr,"cannot allocate pam2p[0] array %3d\n",n0);
+ exit(1);
+ }
+
+ pam2p = f_str->pam2p[0];
+ if ((pam2p[0]=(int *)calloc((nsq+1)*(n0+1),sizeof(int))) == NULL) {
+ fprintf(stderr,"cannot allocate pam2p[0][] array %3d\n",nsq*n0);
+ exit(1);
+ }
+
+ for (i=1; i<n0; i++) {
+ pam2p[i]= pam2p[0] + (i*(nsq+1));
+ }
+
+ /*
+ pwaa effectively has a sequence profile --
+ pwaa[0..n0-1] has pam score for residue 0 (-BIGNUM)
+ pwaa[n0..2n0-1] has pam scores for residue 1 (A)
+ pwaa[2n0..3n0-1] has pam scores for residue 2 (R), ...
+
+ thus: pwaa = f_str->waa_s + (*aa1p++)*n0; sets up pwaa so that
+ *pwaa++ rapidly moves though the scores of the aa1p[] position
+ without further indexing
+
+ For a real sequence profile, pwaa[0..n0-1] vs ['A'] could have
+ a different score in each position.
+ */
+
+ pwaa_s = f_str->waa_s;
+ pwaa_a = f_str->waa_a;
+ if (ppst->pam_pssm) {
+ for (e = 0; e <nsq; e++) { /* for each residue in the alphabet */
+ for (f = 0; f < n0; f++) { /* for each position in aa0 */
+ *pwaa_s++ = f_str->pam2p[ip][f][e] = ppst->pam2p[ip][f][e];
+ *pwaa_a++ = f_str->pam2p[0][f][e] = ppst->pam2p[0][f][e];
+ }
+ }
+ }
+ else { /* initialize scanning matrix */
+ for (e = 0; e <nsq; e++) /* for each residue in the alphabet */
+ for (f = 0; f < n0; f++) { /* for each position in aa0 */
+ *pwaa_s++ = f_str->pam2p[ip][f][e]= ppst->pam2[ip][aa0[f]][e];
+ *pwaa_a++ = f_str->pam2p[0][f][e] = ppst->pam2[0][aa0[f]][e];
+ }
+ }
+
+ /* these structures are used for producing alignments */
+
+#if defined(SW_SSE2)
+ /* First we allocate memory for the workspace - i.e. two rows for H and
+ * one row for F. We also need enough space to hold a temporary
+ * scoring profile which will be query_length * 16 (sse2 word length).
+ * Since this might be run on Linux or AIX too, we don't assume
+ * anything about the memory allocation but align it ourselves.
+ */
+ f_str->workspace_memory = (void *)malloc(3*16*(MAXTST+MAXLIB+32)+256);
+ f_str->workspace = (void *)((((size_t)f_str->workspace_memory) + 255) & (~0xff));
+
+ /* We always use a scoring profile for the SSE2 implementation, but the layout
+ * is a bit strange. The scoring profile is parallel to the query, but is
+ * accessed in a striped pattern. The query is divided into equal length
+ * segments. The number of segments is equal to the number of elements
+ * processed in the SSE2 register. For 8-bit calculations, the query will
+ * be divided into 16 equal length parts. If the query is not long enough
+ * to fill the last segment, it will be filled with neutral weights. The
+ * first element in the SSE register will hold a value from the first segment,
+ * the second element of the SSE register will hold a value from the
+ * second segment and so on. So if the query length is 288, then each
+ * segment will have a length of 18. So the first 16 bytes will have
+ * the following weights: Q1, Q19, Q37, ... Q271; the next 16 bytes will
+ * have the following weights: Q2, Q20, Q38, ... Q272; and so on until
+ * all parts of all segments have been written. The last segment will
+ * have the following weights: Q18, Q36, Q54, ... Q288. This will be
+ * done for the entire alphabet.
+ */
+ f_str->word_score_memory = (void *)malloc((n0 + 32) * sizeof(short) * (nsq + 1) + 256);
+ f_str->byte_score_memory = (void *)malloc((n0 + 32) * sizeof(char) * (nsq + 1) + 256);
+ if (f_str->workspace_memory == NULL || f_str->word_score_memory == NULL || f_str->byte_score_memory == NULL) {
+ fprintf(stderr,"cannot allocate SSE2 workspace/score arrays %3d\n",n0);
+ exit(1);
+ }
+ f_str->word_score = (unsigned short *)((((size_t)f_str->word_score_memory) + 255) & (~0xff));
+ f_str->byte_score = (unsigned char *)((((size_t)f_str->byte_score_memory) + 255) & (~0xff));
+
+ overflow = 0; /* NOTE(review): never set non-zero below, so try_8bit always ends up 1 */
+ gap = -2 * ppst->ggapval;
+
+ if (ppst->pam_pssm) {
+ /* Use a position-specific scoring profile.
+ * This is essentially what we are going to construct anyway, but we'll
+ * reorder it to suit sse2.
+ */
+ bias = 127;
+ ceiling = 0;
+ for (i = 1; i < nsq ; i++) {
+ for (j = 0; j < n0 ; j++) {
+ data = ppst->pam2p[ip][j][i];
+ if (data < bias) {
+ bias = data;
+ }
+ if (data > ceiling) {
+ ceiling = data;
+ }
+ }
+ }
+ bias += gap;
+ if (bias > 0) {
+ bias = 0;
+ }
+
+ /* Fill our specially organized byte- and word-size scoring arrays. */
+ ps = f_str->word_score;
+ col_len = (n0 + 7) / 8;
+ n_count = (n0 + 7) & 0xfffffff8;
+ for (f = 0; f < n_count; ++f) {
+ *ps++ = 0;
+ }
+ for (f = 1; f < nsq ; f++) {
+ for (e = 0; e < col_len; e++) {
+ for (i = e; i < n_count; i += col_len) {
+ if ( i < n0) { data = ppst->pam2p[ip][i][f] + gap;}
+ else {data = 0;}
+ *ps++ = (unsigned short)(data);
+ }
+ }
+ }
+ pc = f_str->byte_score;
+ col_len = (n0 + 15) / 16;
+ n_count = (n0 + 15) & 0xfffffff0;
+ for (f = 0; f < n_count; ++f) {
+ *pc++ = 0;
+ }
+ for (f = 1; f < nsq ; f++) {
+ for (e = 0; e < col_len; e++) {
+ for (i = e; i < n_count; i += col_len) {
+ if ( i < n0 ) { data = ppst->pam2p[ip][i][f] + gap;}
+ else {data = 0;}
+ if (data > 255) {
+ printf("Fatal error. data: %d bias: %d, position: %d/%d, "
+ "Score out of range for 8-bit SSE2 datatype.\n",
+ data, bias, f, e);
+ exit(1);
+ }
+ *pc++ = (unsigned char)(data-bias);
+ }
+ }
+ }
+ } else {
+ /* Classical simple substitution matrix */
+ /* Find the bias to use in the substitution matrix */
+ bias = 127;
+ ceiling = 0;
+ for (i = 1; i < nsq ; i++) {
+ for (j = 1; j < nsq ; j++) {
+ data = ppst->pam2[ip][i][j];
+ if (data < bias) {
+ bias = data;
+ }
+ if (data > ceiling) {
+ ceiling = data;
+ }
+ }
+ }
+ bias += gap;
+ if (bias > 0) {
+ bias = 0;
+ }
+
+ /* Fill our specially organized byte- and word-size scoring arrays. */
+ ps = f_str->word_score;
+ col_len = (n0 + 7) / 8;
+ n_count = (n0 + 7) & 0xfffffff8;
+ for (f = 0; f < n_count; ++f) {
+ *ps++ = 0;
+ }
+ for (f = 1; f < nsq ; f++) {
+ for (e = 0; e < col_len; e++) {
+ for (i = e; i < n_count; i += col_len) {
+ if (i >= n0) {
+ data = 0;
+ } else {
+ data = ppst->pam2[ip][aa0[i]][f] + gap;
+ }
+ *ps++ = (unsigned short)(data);
+ }
+ }
+ }
+
+ pc = f_str->byte_score;
+ col_len = (n0 + 15) / 16;
+ n_count = (n0 + 15) & 0xfffffff0;
+ for (f = 0; f < n_count; ++f) {
+ *pc++ = 0;
+ }
+ for (f = 1; f < nsq ; f++) {
+ for (e = 0; e < col_len; e++) {
+ for (i = e; i < n_count; i += col_len) {
+ if (i >= n0) {
+ data = 0;
+ } else {
+ data = ppst->pam2[ip][aa0[i]][f] + gap;
+ }
+ if (data > 255) {
+ printf("Fatal error. data: %d bias: %d, position: %d/%d, "
+ "Score out of range for 8-bit SSE2 datatype.\n",
+ data, bias, f, e);
+ exit(1);
+ }
+ *pc++ = (unsigned char)(data-bias);
+ }
+ }
+ }
+ }
+
+ f_str->ceiling = (unsigned char) (ceiling + gap - bias);
+ f_str->bias = (unsigned char) (-bias);
+ f_str->alphabet_size = nsq;
+
+ /* Some variable to keep track of how many 8-bit runs we need to rerun
+ * in 16-bit accuracy. If there are too many reruns it can be faster
+ * to use 16-bit alignments directly.
+ */
+ /* We can only do 8-bit alignments if the scores were small enough. */
+ f_str->try_8bit = (overflow == 0) ? 1 : 0;
+ f_str->done_8bit = 0;
+ f_str->done_16bit = 0;
+#endif /* SW_SSE2 */
+
+ /* minimum allocation for alignment */
+ f_str->max_res = max(3*n0/2,MIN_RES);
+
+ *f_arg = f_str;
+}
+
+void close_work (const unsigned char *aa0, int n0,
+                 struct pstruct *ppst,
+                 struct f_struct **f_arg)
+{
+  /* Release everything hanging off *f_arg, then clear the caller's
+     pointer.  A NULL *f_arg is accepted and left untouched. */
+  struct f_struct *fs = *f_arg;
+  int ix;
+
+  if (fs == NULL) return;
+
+  free(fs->kar_p);              /* free(NULL) is a no-op */
+  free(fs->ss - 1);             /* init_work() stored ss one element past the calloc'd base */
+  free(fs->waa_a);
+  free(fs->waa_s);
+  for (ix = 0; ix < 2; ix++) {  /* score storage first, then the row-pointer table */
+    free(fs->pam2p[ix][0]);
+    free(fs->pam2p[ix]);
+  }
+
+#if defined(SW_ALTIVEC) || defined(SW_SSE2)
+  free(fs->workspace_memory);
+  free(fs->word_score_memory);
+  free(fs->byte_score_memory);
+#endif
+  free(fs);
+  *f_arg = NULL;
+}
+
+
+/* get_param: pstring1[0] gets "<algorithm> (<version>)", pstring1[1] the matrix and gap
+ settings, pstring2 (if non-NULL) the "; pg_" lines; pstring1 was noted as "currently 512" */
+void
+get_param (const struct pstruct *ppst,
+ char **pstring1, char *pstring2,
+ struct score_count_s *s_info)
+{
+ /* s_info is not used here */
+ char pg_str[120];
+ char psi_str[120];
+
+#ifdef SW_SSE2
+#if defined(GLOBAL_GLOBAL)
+ char *pg_desc = "Global/Global affine Needleman-Wunsch (SSE2, Michael Farrar 2010)";
+#else
+ char *pg_desc = "Global/Local affine Needleman-Wunsch (SSE2, Michael Farrar 2010)";
+#endif
+#else
+#if defined(GLOBAL_GLOBAL)
+ char *pg_desc = "Global/Global affine Needleman-Wunsch (2007)";
+#else
+ char *pg_desc = "Global/Local affine Needleman-Wunsch (2007)";
+#endif
+#endif
+
+ strncpy(pg_str, pg_desc, sizeof(pg_str));
+ pg_str[sizeof(pg_str)-1] = '\0'; /* strncpy does not terminate when the source fills the buffer */
+ if (ppst->pam_pssm) { strncpy(psi_str,"-PSI",sizeof(psi_str));}
+ else { psi_str[0]='\0';}
+
+ sprintf (pstring1[0], "%s (%s)", pg_str, verstr);
+ sprintf (pstring1[1],
+#ifdef OLD_FASTA_GAP
+ "%s matrix%s (%d:%d)%s, gap-penalty: %d/%d",
+#else
+ "%s matrix%s (%d:%d)%s, open/ext: %d/%d",
+#endif
+ ppst->pam_name, psi_str, ppst->pam_h,ppst->pam_l,
+ (ppst->ext_sq_set)?"xS":"\0", ppst->gdelval, ppst->ggapval);
+
+ if (pstring2 != NULL) {
+#ifdef OLD_FASTA_GAP
+ sprintf(pstring2,"; pg_name_alg: %s\n; pg_ver_rel: %s\n; pg_matrix: %s (%d:%d)%s\n; pg_gap-pen: %d %d\n",
+#else
+ sprintf(pstring2,"; pg_name_alg: %s\n; pg_ver_rel: %s\n; pg_matrix: %s (%d:%d)%s\n; pg_open-ext: %d %d\n",
+#endif
+ pg_str,verstr,ppst->pam_name,ppst->pam_h,ppst->pam_l, /* pg_matrix takes the matrix name, as in pstring1[1]; psi_str was passed here before */
+ (ppst->ext_sq_set)?"xS":"\0",ppst->gdelval,ppst->ggapval);
+ }
+}
+/* do_work: score library sequence aa1 against the query profile held in f_str and store
+ the result in rst->score[0]; comp/H come from do_karlin() when zsflag%10 == 6, else -1.0 */
+void do_work (const unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1,
+ int frame,
+ const struct pstruct *ppst, struct f_struct *f_str,
+ int qr_flg, int shuff_flg, struct rstruct *rst,
+ struct score_count_s *s_info)
+{
+ int score;
+ double lambda, H;
+
+ rst->valid_stat = 1;
+ s_info->s_cnt[0]++;
+ s_info->tot_scores++;
+
+#if defined(SW_SSE2)
+
+ score = OVERFLOW_SCORE;
+
+ if (f_str->try_8bit) {
+ score = GLOBAL_BYTE(n0,
+ f_str->byte_score,
+ aa1,
+ n1,
+#ifndef OLD_FASTA_GAP
+ //-(ppst->gdelval + ppst->ggapval),
+ -ppst->gdelval,
+#else
+ //-ppst->gdelval,
+ -(ppst->gdelval - ppst->ggapval),
+#endif
+ -ppst->ggapval,
+ f_str->ceiling,
+ f_str->bias,
+ f_str);
+
+ f_str->done_8bit++;
+
+ /* The 8 bit version is roughly 50% faster than the 16 bit version,
+ * so we are fine if less than about 1/3 of the runs have to
+ * be rerun with 16 bits. If it is more, and we have tried at least
+ * 500 sequences, we switch off the 8-bit mode.
+ */
+ if (score == OVERFLOW_SCORE) { /* same sentinel as the 16-bit rerun test below; this line used to test OVERFLOW */
+ f_str->done_16bit++;
+ if(f_str->done_8bit>500 && (3*f_str->done_16bit)>(f_str->done_8bit))
+ f_str->try_8bit = 0;
+ }
+ }
+
+ if (score == OVERFLOW_SCORE) {
+ /* Overflow, so we have to redo it in 16 bits. */
+ score = GLOBAL_WORD(n0,
+ f_str->word_score,
+ aa1,
+ n1,
+#ifndef OLD_FASTA_GAP
+ //-(ppst->gdelval + ppst->ggapval),
+ -ppst->gdelval,
+#else
+ //-ppst->gdelval,
+ -(ppst->gdelval - ppst->ggapval),
+#endif
+ -ppst->ggapval,
+ f_str->ceiling,
+ f_str);
+ }
+#else
+
+ score = FGLOBAL_ALIGN(f_str->waa_s,aa1,n0,n1,
+#ifdef OLD_FASTA_GAP
+ -(ppst->gdelval - ppst->ggapval),
+#else
+ -ppst->gdelval,
+#endif
+ -ppst->ggapval,f_str->ss);
+#endif
+
+ rst->score[0] = score;
+
+ if(((ppst->zsflag % 10) == 6) &&
+ (do_karlin(aa1, n1, ppst->pam2[0], ppst,f_str->aa0_f,
+ f_str->kar_p, &lambda, &H)>0)) {
+ rst->comp = 1.0/lambda;
+ rst->H = H;
+ }
+ else {rst->comp = rst->H = -1.0;}
+
+}
+
+/* nw_walign is the equivalent of sw_walign from dropnnw.c -- do_walign()
+ passes it to nsw_malign(). It rescans aa1 to locate the alignment end
+ points, then calls NW_ALIGN to fill a_res->res / a_res->nres. */
+int
+nw_walign (int **pam2p, int n0,
+ const unsigned char *aa1, int n1,
+ int q, int r,
+ struct swstr *ss,
+ struct a_res_str *a_res
+ )
+{
+ const unsigned char *aa1p;
+ register int i, j;
+ register struct swstr *ssj;
+ int e, f, h, p;
+ int qr, t;
+ int score;
+ int cost, I, J, K, L;
+ /* NOTE(review): I (and h, for GLOBAL_GLOBAL) are only assigned inside the row loop; an empty aa1 leaves them unset */
+ qr = q + r;
+
+ score = -BIGNUM;
+ J = n0-1; L = 0; /* alignments are global in aa0[n0] */
+
+ /* initialize 0th row */
+ ss[0].H = 0;
+ ss[0].E = t = -q; /* must be re-initialized because it was
+ filled in the reverse direction in previous
+ invocations */
+ /* must count from ss+1, ss[0].H = 0 */
+ for (ssj=ss+1; ssj <= ss+n0 ; ssj++) {
+ ssj->H = t = t - r;
+ ssj->E = t - q;
+ }
+ /* forward pass: one row per aa1 residue; the loop ends at the first 0 residue (n1 is not consulted) */
+ aa1p = aa1;
+ i = 0;
+ t = -q;
+ while (*aa1p) {
+ p = ss[0].H;
+#ifndef GLOBAL_GLOBAL /* GLOBAL_LOCAL */
+ ss[0].H = h = t = 0;
+#else
+ ss[0].H = h = t = t - r;
+#endif
+ f = t - q;
+ /* pwaa = waa + (*aa1p++ * n0); */
+ /* ssj must start at ss+1, ss[0].H = 0,
+ but j must go 0 .. n0-1 for pam2p[j] */
+ for (ssj = ss+1, j=0; j < n0; ssj++,j++) {
+ if ((h = h - qr) > /* gap open from left best */
+ /* gap extend from left gapped */
+ (f = f - r)) f = h; /* if better, use new gap opened */
+ if ((h = ssj->H - qr) > /* gap open from up best */
+ /* gap extend from up gap */
+ (e = ssj->E - r)) e = h; /* if better, use new gap opened */
+ h = p + pam2p[j][*aa1p];
+ /* h = p + *pwaa++; */ /* diagonal match */
+ if (h < f ) h = f; /* left gap better, reset */
+ if (h < e ) h = e; /* up gap better, reset */
+ p = ssj->H; /* save previous best score */
+ ssj->H = h; /* save (new) up diag-matched */
+ ssj->E = e; /* save upper gap opened */
+ }
+#ifndef GLOBAL_GLOBAL
+ if (h > score) { /* ? new best score at the end of each row */
+ score = h; /* save best */
+ I = i; /* row */
+ }
+#endif
+ /*
+ fprintf(stderr," r %d - score: %d ssj[]: %d\n", i,score,
+ ss[(i <= n0) ? i-1 : n0-1].H);
+ */
+ aa1p++; /* aa1p goes down the path graph, row by row */
+ i++; /* increment the row */
+ } /* done with forward pass */
+
+#ifdef GLOBAL_GLOBAL
+ cost = score = h;
+ K = 0;
+ I = n1 - 1;
+#else
+ /* fprintf(stderr, " r: %d - score: %d\n", I, score); */
+
+ /* to get the start point, go backwards */
+
+ cost = -BIGNUM;
+ K = 0;
+ ss[n0].H = 0;
+ t = -q;
+ for (ssj=ss+n0-1; ssj>=ss; ssj--) {
+ ssj->H = t = t - r;
+ ssj->E= t - q;
+ }
+ /* NOTE(review): the inner loop below pairs ssj=ss+J with j=J-1, whereas the forward pass pairs ss+1 with j=0 -- confirm the offset is intended */
+ t = 0;
+ for (i=I; i>=0; i--) {
+ p = ss[n0].H;
+ ss[n0].H = h = t = t-r;
+ f = t-q;
+ for (ssj=ss+J, j= J-1; j>=0; ssj--, j--) {
+ if ((h = h - qr) > /* gap open from left best */
+ /* gap extend from left gapped */
+ (f = f - r)) f = h; /* if better, use new gap opened */
+ if ((h = ssj->H - qr) > /* gap open from up best */
+ /* gap extend from up gap */
+ (e = ssj->E - r)) e = h; /* if better, use new gap opened */
+ h = p + pam2p[j][aa1[i]]; /* diagonal match */
+ if (h < f ) h = f; /* left gap better, reset */
+ if (h < e ) h = e; /* up gap better, reset */
+ p = ssj->H; /* save previous best score */
+ ssj->H = h; /* save (new) up diag-matched */
+ ssj->E = e; /* save upper gap opened */
+ }
+ if (h > cost) {
+ cost = h;
+ K = i;
+ if (cost >= score) goto found;
+ }
+ }
+ /* at this point, ss[0].E has a very high value for good alignments */
+ found:
+#endif /* not GLOBAL_GLOBAL */
+
+/* fprintf(stderr," *** %d=%d: L: %3d-%3d/%3d; K: %3d-%3d/%3d\n",score,cost,L,J+1,n0,K,I+1,n1); */
+
+/* in the f_str version, the *res array is already allocated at 4*n0/3 */
+
+ a_res->n1 = n1;
+ a_res->max0 = J+1; a_res->min0 = L; a_res->max1 = I+1; a_res->min1 = K;
+ /* &aa1[K-1] points one before aa1 when K == 0 -- presumably NW_ALIGN indexes B from 1; confirm */
+/* this code no longer refers to aa0[], it uses pam2p[0][L] instead */
+ NW_ALIGN(L,&aa1[K-1],J-L+1,I-K+1,pam2p,q,r,a_res->res,&a_res->nres);
+
+/* DISPLAY(&aa0[L-1],&aa1[K-1],J-L+1,I-K+1,res,L,K,ppst->sq); */
+
+/* return *res and nres */
+
+ return score;
+}
+
+#define gap(k) ((k) <= 0 ? 0 : g+h*(k)) /* k-symbol indel cost; expands to the identifiers g and h from the scope where it is used */
+/* FGLOBAL_ALIGN: score-only pass of aa1 (read up to its 0 terminator) against the n0-wide profile waa, using ss[0..n0] as the DP row */
+static int
+FGLOBAL_ALIGN(int *waa, const unsigned char *aa1,
+ int n0, int n1,
+ int q, int r,
+ struct swstr *ss)
+{
+ const unsigned char *aa1p;
+ register int *pwaa;
+ int i,j;
+ struct swstr *ssj;
+ int t;
+ int e, f, h, p;
+ int qr;
+ int score;
+ int ij, max_col, max_ij;
+ /* i, j, ij, max_col, max_ij and the n1 parameter are not used below */
+ /* q - gap open is positive */
+ /* r - gap extend is positive */
+ /* returns h at the end of the last row (GLOBAL_GLOBAL) or the largest end-of-row h over all rows (otherwise) */
+ qr = q+r;
+
+ score = -BIGNUM;
+
+ /* initialize 0th row */
+ ss[0].H = 0;
+ ss[0].E = t = -q;
+ for (ssj = ss+1; ssj <= ss+n0 ; ssj++) {
+ ssj->H = t = t - r;
+ ssj->E = t - q;
+ }
+
+ aa1p = aa1;
+ t = -q;
+ while (*aa1p) {
+ p = ss[0].H;
+#if defined(GLOBAL_GLOBAL)
+ ss[0].H = h = t = t - r;
+#else /* GLOBAL_LOCAL */
+ ss[0].H = h = t = 0;
+#endif
+ f = t - q;
+ pwaa = waa + (*aa1p++ * n0); /* start of the n0 scores for this residue */
+ for (ssj = ss+1; ssj <= ss+n0; ssj++) { /* go across query */
+ if ((h = h - qr) > (f = f - r)) f = h;
+ if ((h = ssj->H - qr) > (e = ssj->E - r)) e = h;
+ h = p + *pwaa++;
+ if (h < f) h = f;
+ if (h < e) h = e;
+ p = ssj->H;
+ ssj->H = h;
+ ssj->E = e;
+ }
+#if !defined(GLOBAL_GLOBAL) /* GLOBAL_LOCAL */
+ if (h > score) {
+ score = h; /* at end of query, update score */
+ }
+#endif
+ } /* done with forward pass */
+#ifdef GLOBAL_GLOBAL
+ score = h; /* NOTE(review): h is only assigned inside the loop; an empty aa1 leaves it unset */
+#endif
+ return score;
+}
+
+void do_opt (const unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1,
+ int frame,
+ struct pstruct *ppst, struct f_struct *f_str,
+ struct rstruct *rst)
+{ /* no-op: this file defines do_opt() with an empty body */
+}
+
+/* this do_walign simply calls nsw_malign using nw_walign; it is
+ modeled after the same do_walign code in dropgsw2.c.
+
+ It makes no sense to use this strategy for GLOBAL_GLOBAL
+ alignments, but hopefully they will take care of themselves.
+*/
+
+struct a_res_str *
+do_walign (const unsigned char *aa0, int n0,
+           const unsigned char *aa1, int n1,
+           int frame, int repeat_thresh,
+           struct pstruct *ppst,
+           struct f_struct *f_str,
+           int *have_ares)
+{
+  /* Hand nw_walign to nsw_malign, then number the returned list entries
+     from 0.  Returns NULL if the first a_res_str cannot be allocated. */
+  struct a_res_str *head, *cur;
+  int pam_ix, idx;
+
+  *have_ares = 0x3;   /* 0x2 bit marks a local copy */
+
+  head = (struct a_res_str *)calloc(1, sizeof(struct a_res_str));
+  if (head == NULL) {
+    fprintf(stderr," [do_walign] Cannot allocate a_res");
+    return NULL;
+  }
+
+  pam_ix = ppst->ext_sq_set ? 1 : 0;
+  head = nsw_malign(f_str->pam2p, pam_ix, n0, aa1, n1,
+                    repeat_thresh, f_str->max_res,
+                    -ppst->gdelval, -ppst->ggapval,
+                    f_str->ss, head, &nw_walign, ppst->do_rep);
+
+  for (idx = 0, cur = head; cur != NULL; cur = cur->next, idx++) {
+    cur->index = idx;
+  }
+  return head;
+}
+
+void
+pre_cons(const unsigned char *aa1, int n1, int frame, struct f_struct *f_str) {
+ /* does nothing unless TFAST is defined; then aa1 is passed through aatran() into f_str->aa1x */
+#ifdef TFAST
+ f_str->n10 = aatran(aa1,f_str->aa1x,n1,frame);
+#endif
+ /* NOTE(review): confirm TFAST is the same macro that guards aa1x/n10 in the f_struct declaration */
+}
+
+/* aln_func_vals - fill in aln->qlfact, qlrev, llfact, llmult, frame, llrev */
+/* callers: calcons, calc_id, calc_code */
+void
+aln_func_vals(int frame, struct a_struct *aln) {
+  /* the three factors are always 1; llrev and frame are always 0 */
+  aln->qlfact = 1;
+  aln->llmult = 1;
+  aln->llfact = 1;
+  aln->llrev = aln->frame = 0;
+  aln->qlrev = (frame > 0) ? 1 : 0;   /* qlrev is 1 only for a positive frame */
+}
diff --git a/src/dropnsw.c b/src/dropnsw.c
new file mode 100644
index 0000000..3aba3ff
--- /dev/null
+++ b/src/dropnsw.c
@@ -0,0 +1,424 @@
+/* $Id: dropnsw.c $ */
+
+/* copyright (c) 1994, 1995, 1996, 2014 by William R. Pearson and the
+ Rector & Visitors of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+/*
+ this is a slower version of dropgsw.c that implements the Smith-Waterman
+ algorithm. It lacks the shortcuts in dropgsw.c that prevent scores less
+ than the penalty for the first residue in a gap from being generated.
+
+ Thus, dropnsw.c should be used for tests with very large gap penalties,
+ and is more appropriate for programs like prss3, which are interested
+ in accurate low scores.
+*/
+
+/* the do_walign() code in this file is not thread_safe */
+/* init_work(), do_work(), are thread safe */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <math.h>
+
+#include "defs.h"
+#include "param.h"
+
+static char *verstr="3.5 Aug 2009";
+
+struct swstr { int H, E;}; /* one DP cell per query position; do_work() stores each row's h in H and its e value in E */
+
+struct f_struct { /* per-query state built by init_work() */
+ struct swstr *ss; /* DP row used by do_work(); points one element past its calloc'd base */
+ struct swstr *f_ss; /* allocated the same way as ss in init_work() */
+ struct swstr *r_ss; /* allocated the same way as ss in init_work() */
+ int *waa_s, *waa_a; /* flat score profiles, n0 scores per residue, filled in init_work() */
+ int **pam2p[2]; /* the same scores indexed as pam2p[x][position][residue] */
+ int max_res; /* max(3*n0/2, MIN_RES), set in init_work() */
+ double aa0_f[MAXSQ]; /* passed to do_karlin() by do_work() */
+ double *kar_p; /* passed to do_karlin() by do_work(); freed in close_work() */
+};
+
+#define DROP_INTERN
+#include "drop_func.h"
+
+extern int do_karlin(const unsigned char *aa1, int n1,
+ int **pam2, const struct pstruct *ppst,
+ double *aa0_f, double *kar_p, double *lambda, double *H);
+extern int sw_walign (int **pam2p, int n0,
+ const unsigned char *aa1, int n1,
+ int q, int r,
+ struct swstr *ss,
+ struct a_res_str *a_res
+ );
+
+extern struct a_res_str *
+nsw_malign (int ***pam2p, int pam_ix, int n0,
+ const unsigned char *aa1, int n1,
+ int score_thresh, int max_res,
+ int gdelval, int ggapval,
+ struct swstr *ss,
+ struct a_res_str *cur_ares,
+ int score_ix,
+ int (*fn_walign)
+ (
+ int **pam2p, int n0,
+ const unsigned char *aa1, int n1,
+ int q, int r,
+ struct swstr *ss,
+ struct a_res_str *a_res
+ ),
+ int do_rep
+ );
+
+void
+SIM(const unsigned char *A, /* seq1 indexed A[1..M] */
+ const unsigned char *B, /* seq2 indexed B[1..N] */
+ int M, int N, /* len seq1, seq2 */
+ struct pstruct *ppst, /* parameters */
+ int nseq, /* nseq - number of different sequences */
+ int mini_score, /* cut-off score */
+ int max_count, /* number of alignments */
+ struct a_res_str *a_res); /* alignment result structure */
+
+/* init_work: allocate the per-query f_struct and fill its score profiles; allocation failures print to stderr and exit.
+ NOTE(review): kar_p stays NULL (calloc) yet do_work() passes it to do_karlin() when zsflag%10 == 6 -- confirm */
+void init_work (unsigned char *aa0, int n0,
+ struct pstruct *ppst,
+ struct f_struct **f_arg)
+{
+ int maxn0;
+ int *pwaa_s, *pwaa_a;
+ int e, f, i;
+ struct f_struct *f_str;
+ int **pam2p;
+ struct swstr *ss, *f_ss, *r_ss;
+ int nsq, ip;
+
+ ppst->stats_mod = 1;
+ if (ppst->ext_sq_set) {
+ nsq = ppst->nsqx; ip = 1;
+ }
+ else {
+ nsq = ppst->nsqx; ip = 0;
+ }
+
+ if ((f_str = (struct f_struct *)calloc(1,sizeof(struct f_struct))) == NULL) {
+ fprintf(stderr,"cannot allocate f_str\n");
+ exit(1);
+ }
+
+ /* allocate space for the scoring arrays */
+ maxn0 = n0 + 2;
+ if ((ss = (struct swstr *) calloc (maxn0, sizeof (struct swstr)))
+ == NULL) {
+ fprintf (stderr, "cannot allocate ss array %3d\n", n0);
+ exit (1);
+ }
+ ss++;
+ f_str->ss = ss;
+
+ if ((f_ss = (struct swstr *) calloc (maxn0, sizeof (struct swstr)))
+ == NULL) {
+ fprintf (stderr, "cannot allocate f_ss array %3d\n", n0);
+ exit (1);
+ }
+ f_ss++;
+ f_str->f_ss = f_ss;
+
+ if ((r_ss = (struct swstr *) calloc (n0+2, sizeof (struct swstr)))
+ == NULL) {
+ fprintf (stderr, "cannot allocate r_ss array %3d\n", n0);
+ exit (1);
+ }
+ r_ss++;
+ f_str->r_ss = r_ss;
+
+ /* initialize variable (-S) pam matrix */
+ if ((f_str->waa_s= (int *)calloc((nsq+1)*(n0+1),sizeof(int))) == NULL) {
+ fprintf(stderr,"cannot allocate waa_s array %3d\n",nsq*n0);
+ exit(1);
+ }
+
+ if ((f_str->pam2p[1]= (int **)calloc((n0+1),sizeof(int *))) == NULL) {
+ fprintf(stderr,"cannot allocate pam2p[1] array %3d\n",n0);
+ exit(1);
+ }
+
+ pam2p = f_str->pam2p[1];
+ if ((pam2p[0]=(int *)calloc((nsq+1)*(n0+1),sizeof(int))) == NULL) {
+ fprintf(stderr,"cannot allocate pam2p[1][] array %3d\n",nsq*n0);
+ exit(1);
+ }
+
+ for (i=1; i<n0; i++) {
+ pam2p[i]= pam2p[0] + (i*(nsq+1));
+ }
+
+ /* initialize universal (alignment) matrix */
+ if ((f_str->waa_a= (int *)calloc((nsq+1)*(n0+1),sizeof(int))) == NULL) {
+ fprintf(stderr,"cannot allocate waa_a struct %3d\n",nsq*n0);
+ exit(1);
+ }
+
+ if ((f_str->pam2p[0]= (int **)calloc((n0+1),sizeof(int *))) == NULL) {
+ fprintf(stderr,"cannot allocate pam2p[0] array %3d\n",n0);
+ exit(1);
+ }
+
+ pam2p = f_str->pam2p[0];
+ if ((pam2p[0]=(int *)calloc((nsq+1)*(n0+1),sizeof(int))) == NULL) {
+ fprintf(stderr,"cannot allocate pam2p[0][] array %3d\n",nsq*n0);
+ exit(1);
+ }
+
+ for (i=1; i<n0; i++) {
+ pam2p[i]= pam2p[0] + (i*(nsq+1));
+ }
+
+ /*
+ pwaa effectively has a sequence profile --
+ pwaa[0..n0-1] has pam score for residue 0 (-BIGNUM)
+ pwaa[n0..2n0-1] has pam scores for residue 1 (A)
+ pwaa[2n0..3n0-1] has pam scores for residue 2 (R), ...
+
+ thus: pwaa = f_str->waa_s + (*aa1p++)*n0; sets up pwaa so that
+ *pwaa++ rapidly moves though the scores of the aa1p[] position
+ without further indexing
+
+ For a real sequence profile, pwaa[0..n0-1] vs ['A'] could have
+ a different score in each position.
+ */
+
+ pwaa_s = f_str->waa_s; /* both branches below fill the profiles from the start */
+ pwaa_a = f_str->waa_a;
+ if (ppst->pam_pssm) {
+ for (e = 0; e <nsq; e++) { /* for each residue in the alphabet */
+ for (f = 0; f < n0; f++) { /* for each position in aa0 */
+ *pwaa_s++ = f_str->pam2p[ip][f][e] = ppst->pam2p[ip][f][e];
+ *pwaa_a++ = f_str->pam2p[0][f][e] = ppst->pam2p[0][f][e];
+ }
+ }
+ }
+ else { /* initialize scanning matrix */
+ for (e = 0; e <nsq; e++) /* for each residue in the alphabet */
+ for (f = 0; f < n0; f++) { /* for each position in aa0 */
+ *pwaa_s++ = f_str->pam2p[ip][f][e]= ppst->pam2[ip][e][aa0[f]];
+ *pwaa_a++ = f_str->pam2p[0][f][e] = ppst->pam2[0][e][aa0[f]];
+ }
+ }
+
+ /* minimum allocation for alignment */
+ f_str->max_res = max(3*n0/2,MIN_RES);
+
+ *f_arg = f_str;
+}
+
+void close_work (const unsigned char *aa0, int n0,
+                 struct pstruct *ppst, struct f_struct **f_arg)
+{
+  /* Free every array init_work() attached to *f_arg (including the f_ss and
+     r_ss rows, which were previously never freed), then clear the caller's pointer. */
+  struct f_struct *f_str = *f_arg;
+  if (f_str != NULL) {
+    free(f_str->kar_p);   /* may be NULL; free(NULL) is a no-op */
+    /* ss, f_ss and r_ss each point one element past their calloc'd base */
+    if (f_str->ss != NULL) free(f_str->ss - 1);
+    if (f_str->f_ss != NULL) free(f_str->f_ss - 1);
+    if (f_str->r_ss != NULL) free(f_str->r_ss - 1);
+    free(f_str->waa_a);
+    free(f_str->pam2p[0][0]);
+    free(f_str->pam2p[0]);
+    free(f_str->waa_s);
+    free(f_str->pam2p[1][0]);
+    free(f_str->pam2p[1]);
+    free(f_str);
+    *f_arg = NULL;
+  }
+}
+
+/* get_param: pstring1[0] gets " <algorithm> (<version>)", pstring1[1] the matrix and gap
+ settings, pstring2 (if non-NULL) the "; pg_" lines; pstring1 was noted as "currently 512" */
+void
+get_param (const struct pstruct *ppst,
+ char **pstring1, char *pstring2)
+{
+ char psi_str[120];
+
+ char *pg_str="Smith-Waterman";
+
+ if (ppst->pam_pssm) { strncpy(psi_str,"-PSI",sizeof(psi_str));}
+ else { psi_str[0]='\0';}
+
+ sprintf (pstring1[0], " %s (%s)", pg_str, verstr);
+ sprintf (pstring1[1],
+#ifdef OLD_FASTA_GAP
+ "%s matrix%s (%d:%d)%s, gap-penalty: %d/%d",
+#else
+ "%s matrix%s (%d:%d)%s, open/ext: %d/%d",
+#endif
+ ppst->pam_name, psi_str, ppst->pam_h,ppst->pam_l,
+ (ppst->ext_sq_set)?"xS":"\0", ppst->gdelval, ppst->ggapval);
+
+ if (pstring2 != NULL) {
+ sprintf(pstring2,
+#ifdef OLD_FASTA_GAP
+ "; pg_name_alg: %s\n; pg_ver_rel: %s\n; pg_matrix: %s (%d:%d)%s\n; pg_gap-pen: %d %d\n",
+#else
+ "; pg_name_alg: %s\n; pg_ver_rel: %s\n; pg_matrix: %s (%d:%d)%s\n; pg_open-ext: %d %d\n",
+#endif
+ pg_str,verstr,ppst->pam_name,ppst->pam_h,ppst->pam_l, /* pg_matrix takes the matrix name, as in pstring1[1]; psi_str was passed here before */
+ (ppst->ext_sq_set)?"xS":"\0",ppst->gdelval,ppst->ggapval);
+ }
+}
+
+/* do_work: local (scores floored at 0) score of aa1, read up to its 0 terminator, against the profile f_str->waa_s; the maximum goes to rst->score[0] */
+void do_work (const unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1,
+ int frame,
+ const struct pstruct *ppst, struct f_struct *f_str,
+ int qr_flg, int shuff_flg,
+ struct rstruct *rst)
+{
+ const unsigned char *aa0p, *aa1p;
+ register struct swstr *ssj;
+ struct swstr *ss, *f_ss, *r_ss;
+ register int *pwaa;
+ int *waa;
+ register int i, j;
+ int e, f, h, p;
+ int q, r, m;
+ int score;
+ /* i, j and K are not used below; aa0p, f_ss and r_ss are assigned but never read */
+ double lambda, H, K;
+
+ rst->escore = 1.0;
+ rst->segnum = rst->seglen = 1;
+
+ waa = f_str->waa_s;
+ ss = f_str->ss;
+ f_ss = f_str->f_ss;
+ r_ss = f_str->r_ss;
+
+#ifdef OLD_FASTA_GAP
+ q = -(ppst->gdelval - ppst->ggapval);
+#else
+ q = -ppst->gdelval;
+#endif
+ r = -ppst->ggapval;
+ m = q + r; /* cost of opening a gap and placing its first residue */
+
+ /* initialize 0th row */
+ for (ssj=ss; ssj<&ss[n0]; ssj++) {
+ ssj->H = 0;
+ ssj->E = -q;
+ }
+
+ rst->valid_stat = 1;
+ score = 0;
+ aa1p = aa1;
+ while (*aa1p) {
+ h = p = 0;
+ f = -q;
+ pwaa = waa + (*aa1p++ * n0); /* start of the n0 scores for this residue */
+ for (ssj = ss, aa0p = aa0; ssj < ss+n0; ssj++) {
+ if ((h = h - m) > (f = f - r)) f = h;
+ if ((h = ssj->H - m) > (e = ssj->E - r)) e = h;
+ h = p + *pwaa++;
+ if (h < 0 ) h = 0;
+ if (h < f ) h = f;
+ if (h < e ) h = e;
+ p = ssj->H;
+ ssj->H = h;
+ ssj->E = e;
+ if (h > score) score = h;
+ }
+ } /* done with forward pass */
+
+ rst->score[0] = score;
+ /* NOTE(review): init_work() in this file never sets kar_p or aa0_f, so do_karlin() receives NULL / zeros here -- confirm */
+ if(((ppst->zsflag % 10) == 6) &&
+ (do_karlin(aa1, n1, ppst->pam2[0], ppst,f_str->aa0_f,
+ f_str->kar_p, &lambda, &H)>0)) {
+ rst->comp = 1.0/lambda;
+ rst->H = H;
+ }
+ else {rst->comp = rst->H = -1.0;}
+} /* here we should be all done */
+
+void do_opt (const unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1,
+ int frame,
+ struct pstruct *pst, struct f_struct *f_str,
+ struct rstruct *rstr)
+{ /* no-op: this file defines do_opt() with an empty body */
+}
+
+struct a_res_str *
+do_walign (const unsigned char *aa0, int n0,
+           const unsigned char *aa1, int n1,
+           int frame, int repeat_thresh,
+           struct pstruct *ppst,
+           struct f_struct *f_str,
+           int *have_ares)
+{
+  /* Build the alignment list (nsw_malign with sw_walign, or SIM when built
+     with LALIGN) and number its entries from 0; NULL if allocation fails. */
+  struct a_res_str *head, *cur;
+  int idx;
+
+  *have_ares = 0x3;   /* 0x2 bit marks a local copy */
+
+  head = (struct a_res_str *)calloc(1, sizeof(struct a_res_str));
+  if (head == NULL) {
+    fprintf(stderr," [do_walign] Cannot allocate a_res");
+    return NULL;
+  }
+
+#ifdef LALIGN
+  /* nseq is 1 only when show_ident is off and same_seq() reports a match */
+  ppst->nseq = (!ppst->show_ident && same_seq(aa0, n0, aa1, n1)) ? 1 : 2;
+  SIM(aa0-1, aa1-1, n0, n1, ppst, ppst->nseq, repeat_thresh, ppst->max_repeat, head);
+#else
+  head = nsw_malign(f_str->pam2p, (ppst->ext_sq_set?1:0), n0, aa1, n1,
+                    repeat_thresh, f_str->max_res,
+                    -ppst->gdelval, -ppst->ggapval,
+                    f_str->ss, head, ppst->score_ix,
+                    &sw_walign, ppst->do_rep);
+#endif
+
+  idx = 0;
+  for (cur = head; cur != NULL; cur = cur->next) {
+    cur->index = idx++;
+  }
+  return head;
+}
+
+void
+pre_cons(const unsigned char *aa1, int n1, int frame, struct f_struct *f_str) { /* no-op in this file */ }
+
+/* aln_func_vals - fill in aln->qlfact, qlrev, llfact, llmult, frame, llrev */
+/* callers: calcons, calc_id, calc_code */
+void
+aln_func_vals(int frame, struct a_struct *aln) {
+  /* the three factors are always 1; llrev and frame are always 0 */
+  aln->qlfact = 1;
+  aln->llmult = 1;
+  aln->llfact = 1;
+  aln->llrev = aln->frame = 0;
+  aln->qlrev = (frame > 0) ? 1 : 0;   /* qlrev is 1 only for a positive frame */
+}
diff --git a/src/dyn_string.h b/src/dyn_string.h
new file mode 100644
index 0000000..1eeb520
--- /dev/null
+++ b/src/dyn_string.h
@@ -0,0 +1,30 @@
+/* $Id: dyn_string.h 1197 2013-07-19 20:25:19Z wrp $ */
+/* $Revision: 1197 $ */
+
+/* structure, functions for dynamic strings */
+
+struct dyn_string_str {
+ char *string; /* character buffer; NULL_dyn_string() empties it by storing '\0' in string[0] */
+ int c_size; /* set to 0 by NULL_dyn_string(); presumably the current string length -- confirm in dyn_string.c */
+ int mx_size; /* presumably the allocated size of string -- confirm in dyn_string.c */
+ int inc; /* presumably the growth step given to init_dyn_string() -- confirm in dyn_string.c */
+};
+
+/* initial allocation: returns a new dyn_string_str; NOTE(review): size and inc presumably seed mx_size and inc -- confirm in dyn_string.c */
+struct dyn_string_str *init_dyn_string(int size, int inc);
+
+/* strcpy: replace the contents of dyn_string with value */
+void dyn_strcpy(struct dyn_string_str *dyn_string, char *value);
+
+/* strcat: append value to the contents of dyn_string */
+void dyn_strcat(struct dyn_string_str *dyn_string, char *value);
+
+/* free: release a string obtained from init_dyn_string() */
+void free_dyn_string(struct dyn_string_str *dyn_string);
+
+/* reset (implemented elsewhere; see also the NULL_dyn_string() macro below, which empties the string in place) */
+void reset_dyn_string(struct dyn_string_str *dyn_string);
+
+/* initialize to '\0' and c_size 0; a single parenthesized expression so both stores stay together under an unbraced if (trailing ';' kept for callers that omit their own) */
+#define NULL_dyn_string(str) ((str)->string[0]='\0', (str)->c_size = 0);
+
diff --git a/src/faatran.c b/src/faatran.c
new file mode 100644
index 0000000..1477456
--- /dev/null
+++ b/src/faatran.c
@@ -0,0 +1,445 @@
+/* $Id: faatran.c $ */
+
+/* copyright (c) 1996, 1997, 1998, 1999, 2014 by William R. Pearson and
+ The Rector & Visitors of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+/* aatran.c translates from nt to aa, 1 char codes */
+/* modified July 2, 1987 for all 6 frames */
+/* 23 Jan 1991 fixed bug for short sequences */
+
+/* this mapping is not alphabet independent */
+
+#define XTERNAL
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "upam.h"
+#include "uascii.h"
+
+/*
+1. The Standard Code (transl_table=1)
+
+By default all transl_table in GenBank flatfiles are equal to id 1, and this
+is not shown. When transl_table is not equal to id 1, it is shown as a
+qualifier on the CDS feature.
+
+*/
+static
+char *AA1="FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"; /* transl_table=1; aainit() starts from it and uses it for tr_type 1 and any unlisted tr_type */
+/*
+ Starts = ---M---------------M---------------M----------------------------
+ Base1 = TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+ Base2 = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
+ Base3 = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
+
+2. The Vertebrate Mitochondrial Code (transl_table=2)
+*/
+static
+char *AA2 ="FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG"; /* transl_table=2; selected by aainit() for tr_type 2 */
+/*
+ Starts = --------------------------------MMMM---------------M------------
+ Base1 = TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+ Base2 = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
+ Base3 = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
+
+3. The Yeast Mitochondrial Code (transl_table=3)
+*/
+static
+char *AA3 ="FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG"; /* transl_table=3; selected by aainit() for tr_type 3 */
+/*
+ Starts = -----------------------------------M----------------------------
+ Base1 = TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+ Base2 = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
+ Base3 = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
+
+4. The Mold, Protozoan, and Coelenterate Mitochondrial Code and the
+Mycoplasma/Spiroplasma Code (transl_table=4)
+*/
+static
+char *AA4 ="FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"; /* transl_table=4; selected by aainit() for tr_type 4 */
+/*
+ Starts = --MM---------------M------------MMMM---------------M------------
+ Base1 = TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+ Base2 = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
+ Base3 = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
+
+5. The Invertebrate Mitochondrial Code (transl_table=5)
+*/
+static
+char *AA5 ="FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG"; /* transl_table=5; selected by aainit() for tr_type 5 */
+/*
+ Starts = ---M----------------------------MMMM---------------M------------
+ Base1 = TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+ Base2 = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
+ Base3 = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
+
+6. The Ciliate, Dasycladacean and Hexamita Nuclear Code (transl_table=6)
+*/
+static
+char *AA6 ="FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"; /* transl_table=6; selected by aainit() for tr_type 6 */
+/*
+ Starts = -----------------------------------M----------------------------
+ Base1 = TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+ Base2 = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
+ Base3 = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
+
+9. The Echinoderm Mitochondrial Code (transl_table=9)
+*/
+static
+char *AA7 ="FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG"; /* holds transl_table=9 (Echinoderm) despite the AA7 name; aainit() selects it for tr_type 7 */
+/*
+ Starts = -----------------------------------M----------------------------
+ Base1 = TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+ Base2 = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
+ Base3 = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
+
+10. The Euplotid Nuclear Code (transl_table=10)
+*/
+static
+char *AA10="FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"; /* transl_table=10; selected by aainit() for tr_type 10 */
+/*
+ Starts = -----------------------------------M----------------------------
+ Base1 = TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+ Base2 = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
+ Base3 = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
+
+11. The Bacterial "Code" (transl_table=11)
+*/
+static
+char *AA11="FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"; /* transl_table=11; selected by aainit() for tr_type 11 */
+/*
+ Starts = ---M---------------M------------MMMM---------------M------------
+ Base1 = TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+ Base2 = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
+ Base3 = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
+
+12. The Alternative Yeast Nuclear Code (transl_table=12)
+*/
+static
+char *AA12 ="FFLLSSSSYY**CC*WLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"; /* transl_table=12; selected by aainit() for tr_type 12 */
+/*
+ Starts = -------------------M---------------M----------------------------
+ Base1 = TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+ Base2 = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
+ Base3 = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
+
+13. The Ascidian Mitochondrial Code (transl_table=13)
+*/
+static
+char *AA13="FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG"; /* transl_table=13; selected by aainit() for tr_type 13 */
+/*
+ Starts = -----------------------------------M----------------------------
+ Base1 = TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+ Base2 = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
+ Base3 = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
+
+14. The Flatworm Mitochondrial Code (transl_table=14)
+*/
+static
+char *AA14 ="FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG"; /* transl_table=14; selected by aainit() for tr_type 14 */
+/*
+ Starts = -----------------------------------M----------------------------
+ Base1 = TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+ Base2 = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
+ Base3 = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
+
+15. Blepharisma Nuclear Code (transl_table=15)
+*/
+static
+char *AA15="FFLLSSSSYY*QCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"; /* transl_table=15; selected by aainit() for tr_type 15 */
+/*
+ Starts = -----------------------------------M----------------------------
+ Base1 = TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+ Base2 = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
+ Base3 = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
+*/
+
+static
+char *AA16 ="FFLLSSSSYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"; /* NCBI id 16 (see the note that follows); selected by aainit() for tr_type 16 */
+/*
+ id 16 ,
+ name "Chlorophycean Mitochondrial" ,
+ sncbieaa "-----------------------------------M----------------------------"
+ -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+ -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
+ -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
+*/
+
+static
+char *AA21 ="FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG"; /* NCBI id 21 (see the note that follows); selected by aainit() for tr_type 21 */
+/*
+ name "Trematode Mitochondrial" ,
+ id 21 ,
+ sncbieaa "-----------------------------------M---------------M------------"
+ -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+ -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
+ -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
+*/
+
+static
+char *AA22 ="FFLLSS*SYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"; /* NCBI id 22 (see the note that follows); selected by aainit() for tr_type 22 */
+/*
+ name "Scenedesmus obliquus Mitochondrial" ,
+ id 22 ,
+ sncbieaa "-----------------------------------M----------------------------"
+ -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+ -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
+ -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
+*/
+
+static
+char *AA23 ="FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"; /* NCBI id 23 (see the note that follows); selected by aainit() for tr_type 23 */
+/*
+ name "Thraustochytrium Mitochondrial" ,
+ id 23 ,
+ sncbieaa "--------------------------------M--M---------------M------------"
+ -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+ -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
+ -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
+*/
+
+/* aacmap[] - amino-acid letter for each codon; index = (b1<<4)+(b2<<2)+b3 with A=0, C=1, G=2, T=3 (the imap[] order in aainit()), e.g. aacmap[14] is ATG = 'M'; aainit() overwrites it in place when tr_type > 0 */
+static char aacmap[64]={
+ 'K','N','K','N','T','T','T','T','R','S','R','S','I','I','M','I',
+ 'Q','H','Q','H','P','P','P','P','R','R','R','R','L','L','L','L',
+ 'E','D','E','D','A','A','A','A','G','G','G','G','V','V','V','V',
+ '*','Y','*','Y','S','S','S','S','*','C','W','C','L','F','L','F'
+};
+
+static int aamap[64]; /* integer aa values: aascii[] code of aacmap[i], filled in by aainit() */
+static int aamapr[64]; /* reverse sequence map: aascii[] code of aacmap[(~i)&63], i.e. of the codon with every base complemented */
+
+/* tnt maps a nucleotide code to its 2-bit value; in this file it is used
+only by aatran(). It must be consistent with lascii and the nt alphabet.
+It uses 3,3 because T and U are considered separately */
+static int tnt[]={0,0,1,2,3,3,0,1,0,0,1,2,0,0,0,1,0,0,
+ 0,1,2,3,3,0,1,0,0,1,2,0,0,0,1,0,0};
+
+static int debug_set; /* copy of the debug argument of the latest aainit() call; gates the DEBUG checks in aatran() */
+/* aatran - translate the maxs nucleotide codes in ntseq into amino-acid codes in aaseq for one frame (0-2: forward from ntseq[frame]; 3-5: backwards from ntseq[maxs-(frame-3)-1] through aamapr[]); appends EOSEQ and returns the codon count, or 0 when there are 3 or fewer codons */
+int
+aatran(const unsigned char *ntseq, unsigned char *aaseq, int maxs, int frame)
+{
+  int iaa, im, nna, i;
+  register int *nnp;
+  const unsigned char *nts0;
+  register int *aamp;
+  register unsigned char *aap;
+
+  iaa=nna=(maxs-(frame<3?frame:frame-3))/3;
+  if (nna <= 3 ) {
+    aaseq[0]=EOSEQ;
+    return 0;
+  }
+
+  nnp = tnt;
+
+  if (frame < 3) {
+    aamp = aamap;
+    nts0 = &ntseq[frame];
+    aap = aaseq;
+    while (nna--) {
+      im = nnp[*nts0++]<<4; /* first base lands in the high bits of the codon index */
+      im += nnp[*nts0++]<<2;
+      im += nnp[*nts0++];
+      *aap++ = aamp[im];
+
+      /* this check is included because of a bug in tfasty
+         which occurs only during the alignment process */
+
+#ifdef DEBUG
+      if (debug_set && aamp[im] > MAXUC) {
+        fprintf(stderr,"faatran: %d %d %d %d %d?%d\n",
+                *(nts0-3),*(nts0-2),*(nts0-1), im, aamp[im],aamap[im]);
+
+        /* this allows recovery, but should not be done frequently */
+        for (i=0; i<64; i++) {
+          aamap[i]=aascii[aacmap[i]];
+          aamapr[i]=aascii[aacmap[(~i)&63]];
+        }
+        *(aap-1) = aamp[im];
+      }
+#endif
+    }
+  }
+  else {
+    aamp = aamapr;
+    nts0 = &ntseq[maxs-(frame-3)];
+    aap = aaseq;
+    while (nna--) {
+      im = nnp[*--nts0]<<4; /* pre-decrement: bases are consumed from high addresses down */
+      im += nnp[*--nts0]<<2;
+      im += nnp[*--nts0];
+      *aap++ = aamp[im];
+      /* this check is included because of a bug in tfasty
+         which occurs only during the alignment process */
+
+#ifdef DEBUG
+      if (debug_set && aamp[im] > MAXUC) {
+        fprintf(stderr,"faatran: %d %d %d %d %d?%d\n", /* the codon just read is nts0[2],nts0[1],nts0[0]; nts0[-3..-1] is not it and may lie before ntseq */
+                nts0[2],nts0[1],nts0[0], im, aamp[im],aamap[im]);
+
+        /* this allows recovery, but should not be done frequently */
+        for (i=0; i<64; i++) {
+          aamap[i]=aascii[aacmap[i]];
+          aamapr[i]=aascii[aacmap[(~i)&63]];
+        }
+        *(aap-1) = aamp[im];
+      }
+#endif
+    }
+  }
+  aaseq[iaa]=EOSEQ;
+  return iaa;
+}
+
+/* slower version that masks out NNN,XXX */
+
+/* - A C G T U R Y M W S K D H V B N X */
+static int snt[]={0,0,1,2,3,3,0,1,0,0,4,4,4,4,4,4,4,4}; /* values 0-3 are 2-bit base codes; a 4 makes saatran() emit 'X' for the whole codon */
+/* saatran - masking variant of aatran().
+   Translates the nucleotide codes in ntseq (maxs of them) into aaseq for
+   one frame: 0-2 read forward from ntseq[frame], 3-5 read backwards from
+   ntseq[maxs-(frame-3)-1] using the complemented table aamapr[].  A codon
+   containing any base whose snt[] value is >= 4 becomes aascii['X'].
+   Returns the number of codons written (0 if there are 3 or fewer). */
+int
+saatran(const unsigned char *ntseq,
+        unsigned char *aaseq, int maxs, int frame)
+{
+  int n_codons, k;
+  int pos, step;
+  int shift, base, codon, masked;
+  const int *codon_aa;
+
+  /* whole codons available once the frame offset is skipped */
+  n_codons = (maxs-(frame<3?frame:frame-3))/3;
+  /* too short to translate: hand back an empty, terminated sequence */
+  if (n_codons <= 3 ) {
+    aaseq[0]=EOSEQ;
+    return 0;
+  }
+
+  if (frame < 3) {
+    codon_aa = aamap; /* forward strand */
+    pos = frame;
+    step = 1;
+  }
+  else {
+    codon_aa = aamapr; /* reverse strand */
+    pos = maxs-(frame-3)-1;
+    step = -1;
+  }
+
+  /* one output residue per codon */
+  for (k = 0; k < n_codons; k++) {
+    codon = 0;
+    masked = 0;
+    /* pack three 2-bit base values, first base read in the high bits */
+    for (shift = 4; shift >= 0; shift -= 2) {
+      base = snt[ntseq[pos]];
+      pos += step;
+      if (base < 4) codon += base<<shift;
+      else masked = 1;
+    }
+    if (masked) aaseq[k] = aascii['X'];
+    else aaseq[k] = codon_aa[codon];
+  }
+
+  /* terminate the translated sequence */
+  aaseq[n_codons]=EOSEQ;
+  return n_codons;
+}
+/* aainit - install the codon table chosen by tr_type into aacmap[] (only when tr_type > 0) and rebuild aamap[]/aamapr[] from it; debug != 0 reports the table changes on stderr and is remembered in debug_set */
+void
+aainit(int tr_type, int debug)
+{
+  int i;
+  char *aasmap;
+  int ascii_star;
+  int imap[4]={3,1,0,2}, i0, i1, i2, ii; /* T,C,A,G (base order of the AAn strings) -> 2-bit codes used to index aacmap[] */
+
+  debug_set = debug;
+
+  aasmap = AA1;
+
+  ascii_star = aascii['*'];
+  aascii['*'] = TERM; /* '*' maps to TERM while the tables are built; restored before returning */
+
+  if (tr_type > 0) {
+    /* need to put in a new translation table */
+    switch (tr_type) {
+    case 1: aasmap = AA1; break;
+    case 2: aasmap = AA2; break;
+    case 3: aasmap = AA3; break;
+    case 4: aasmap = AA4; break;
+    case 5: aasmap = AA5; break;
+    case 6: aasmap = AA6; break;
+    case 7: case 9: aasmap = AA7; break; /* AA7 holds NCBI transl_table=9, so the NCBI number is accepted too */
+    case 10: aasmap = AA10; break;
+    case 11: aasmap = AA11; break;
+    case 12: aasmap = AA12; break;
+    case 13: aasmap = AA13; break;
+    case 14: aasmap = AA14; break;
+    case 15: aasmap = AA15; break;
+    case 16: aasmap = AA16; break;
+    case 21: aasmap = AA21; break;
+    case 22: aasmap = AA22; break;
+    case 23: aasmap = AA23; break;
+
+    default: aasmap = AA1; break;
+    }
+
+    if (debug) fprintf(stderr," codon table: %d\n new old\n",tr_type);
+    for (i0 = 0; i0 < 4; i0++)
+      for (i1 = 0; i1 < 4; i1++)
+        for (i2 = 0; i2 < 4; i2++) {
+          ii = (imap[i0]<<4) + (imap[i1]<<2) + imap[i2];
+          if (debug && aacmap[ii] != *aasmap) {
+            fprintf(stderr," %c%c%c: %c - %c\n",
+                    nt[imap[i0]+1],nt[imap[i1]+1],nt[imap[i2]+1],
+                    *aasmap,aacmap[ii]);
+          }
+          aacmap[ii]= *aasmap++;
+        }
+
+    if (debug) { /* table dump is diagnostic output, so it is gated like the other reports above */
+      for (i=0; i<64; i++) {
+        fprintf(stderr,"'%c',",aacmap[i]);
+        if ((i%16)==15) fputc('\n',stderr);
+      }
+      fputc('\n',stderr);
+    }
+  }
+  for (i=0; i<64; i++) {
+    aamap[i]=aascii[aacmap[i]];
+    if (aamap[i] > TERM) {
+      fprintf(stderr," *** error - codon out of range: %d %d (%c)\n",i,aamap[i], NCBIstdaa_l[aamap[i]] );
+    }
+    aamapr[i]=aascii[aacmap[(~i)&63]]; /* (~i)&63 complements each of the three bases */
+    if (aamapr[i] > TERM) {
+      fprintf(stderr," *** error - codon_r out of range: %d %d (%c)\n",i,aamapr[i], NCBIstdaa_l[aamapr[i]]);
+    }
+  }
+  aascii['*'] = ascii_star;
+}
+/* aagetmap - copy the first n letters of the current codon table aacmap[] into to[]; at most the 64 entries the table holds are copied, so a larger n no longer reads past it */
+void
+aagetmap(char *to, int n)
+{
+  int i;
+  for (i=0; i<n && i<(int)sizeof(aacmap); i++) to[i] = aacmap[i];
+}
diff --git a/src/getenv.c b/src/getenv.c
new file mode 100644
index 0000000..bc3cd9b
--- /dev/null
+++ b/src/getenv.c
@@ -0,0 +1,56 @@
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+#define MAXENV 1024
+char *envstr;
+/* mgetenv - look str up in the file "environment" (current directory), which is read once, on the first call, into envstr as consecutive '\0'-terminated lines ended by an empty string; returns the text after the '=' of the first entry that starts with str, or NULL */
+char *mgetenv(str)
+char *str;
+{
+  static int EnvInit=0;
+
+  char *eptr, *esptr, *bp;
+  int i,esize;
+  FILE *fenv;
+
+  if (EnvInit==0) {
+    EnvInit=1;
+    if ((fenv=fopen("environment","r"))!=NULL) {
+      if ((envstr=malloc((size_t)(esize=MAXENV)))==NULL) {
+        fclose(fenv); goto noenv;}
+      esptr=envstr; esize -= 10;
+      while (esize > 1 && fgets(esptr,esize,fenv)!=NULL) { /* stop once no room is left instead of calling fgets() with a size <= 1 */
+        if ((bp=strchr(esptr,'\n'))!=NULL) *bp='\0';
+        esize -= (i=strlen(esptr)+1);
+        esptr += i;
+      }
+      fclose(fenv);
+      *esptr='\0'; /* empty string marks the end of the list (was esptr='\0', which only nulled the pointer) */
+    }
+    else envstr=NULL;
+  }
+
+  if (envstr==NULL) return NULL;
+  else {
+    for (eptr=envstr; *eptr; eptr += strlen(eptr)+1) {
+      /* prefix match on the name; an entry without '=' is skipped rather than yielding NULL+1 */
+      if (strncmp(str,eptr,strlen(str))==0 &&
+          (bp=strchr(eptr,'='))!=NULL) return bp+1;
+    }
+    return NULL;
+  }
+noenv: envstr=NULL; return NULL;
+  }
+/* strnpcpy - copy a length-prefixed string (from[0] holds the count) into to, at most max characters; a '\0' is appended only when fewer than max characters were copied */
+strnpcpy(to,from,max)
+  char *to; Str255 from; size_t max;
+{
+  size_t k, count;
+
+  count = *from; /* first byte is the character count */
+  if (!(count < max)) count = max;
+
+  for (k=0; k<count; k++) to[k] = from[k+1];
+  if (count<max) to[count]='\0';
+  }
diff --git a/src/getopt.c b/src/getopt.c
new file mode 100644
index 0000000..0d76146
--- /dev/null
+++ b/src/getopt.c
@@ -0,0 +1,64 @@
+/*LINTLIBRARY*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+/* ERR(s, c): when opterr is non-zero, write argv[0], the message s, the character c and a newline to stderr; expands to a bare if-block and needs a variable named argv in the calling scope */
+#define ERR(s, c) if(opterr){\
+ char errbuf[3];\
+ errbuf[0] = c; errbuf[1] = '\n'; errbuf[2]='\0';\
+ (void) fputs(argv[0],stderr);\
+ (void) fputs(s,stderr);\
+ (void) fputs(errbuf,stderr);}
+
+
+int opterr = 1; /* non-zero: getopt() reports errors on stderr through ERR() */
+int optind = 1; /* index in argv of the word getopt() examines next */
+int optopt; /* set by getopt() to the option character it has just examined */
+char *optarg; /* argument of the latest option that takes one; NULL after a plain flag */
+/* getopt - return the next option letter found in argv according to opts (a letter followed by ':' takes an argument, delivered in optarg); EOF when the options are exhausted, '?' for an unknown letter or a missing argument */
+int
+getopt(argc, argv, opts)
+int argc;
+char **argv, *opts;
+{
+  static int sp = 1; /* position of the next letter inside argv[optind] */
+  register int c;
+  register char *cp;
+
+  if (sp == 1) {
+    /* start of a word: stop at the end of argv, at a non-option word or a bare "-"; "--" is skipped and also stops */
+    if (optind >= argc || argv[optind][0] != '-' || argv[optind][1] == '\0') return(EOF);
+    if (strcmp(argv[optind], "--") == 0) { optind++; return(EOF); }
+  }
+  c = argv[optind][sp];
+  optopt = c;
+  cp = (c == ':') ? NULL : strchr(opts, c);
+  if (cp == NULL) {
+    ERR(": illegal option -- ", c);
+    sp++;
+    if (argv[optind][sp] == '\0') { optind++; sp = 1; }
+    return('?');
+  }
+  if (cp[1] != ':') {
+    /* plain flag: move on inside this word, or to the next word when it is used up */
+    sp++;
+    if (argv[optind][sp] == '\0') { sp = 1; optind++; }
+    optarg = NULL;
+    return(c);
+  }
+  /* option with an argument: the rest of this word if any, otherwise the following word */
+  if (argv[optind][sp+1] != '\0') optarg = &argv[optind][sp+1];
+  else {
+    optind++;
+    if (optind >= argc) {
+      ERR(": option requires an argument -- ", c);
+      sp = 1;
+      return('?');
+    }
+    optarg = argv[optind];
+  }
+  optind++;
+  sp = 1;
+  return(c);
+}
diff --git a/src/getseq.c b/src/getseq.c
new file mode 100644
index 0000000..ef912d3
--- /dev/null
+++ b/src/getseq.c
@@ -0,0 +1,313 @@
+/* getseq.c */
+
+/* copyright (c) 1987,1988,1989,1992,1995,2000, 2014 by William R. Pearson
+ and The Rectors and Visitors of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+/* May, June 1987 - modified for rapid read of database
+ This is one of three alternative files that can be used to
+ read a database. The three files are nxgetaa.c, nmgetaa.c, and
+ mmgetaa.c.
+
+ nxgetaa.c contains the original code for reading databases, and
+ is still used for Mac and PC versions of fasta33 (which do not
+ use mmap).
+
+ nmgetaa.c and mmgetaa.c are used together. nmgetaa.c provides
+ the same functions as nxgetaa.c if memory mapping is not used,
+ mmgetaa.c provides the database reading functions if memory
+ mapping is used. The decision to use memory mapping is made on
+ a file-by-file basis.
+
+ June 2, 1987 - added TFASTA
+ March 30, 1988 - combined ffgetaa, fgetgb;
+ April 8, 1988 - added PIRLIB format for unix
+ Feb 4, 1989 - added universal subroutines for libraries
+ December, 1995 - added range option file.name:1-1000
+ Feb 22, 2002 - fix to allow "plain" text file queries
+
+ getnt.c associated subroutines for matching sequences */
+
+/* $Id: getseq.c 625 2011-03-23 17:21:38Z wrp $ */
+/* $Revision: 625 $ */
+
+/*
+ 8-April-88
+ The compile time #define PIRLIB allows this routine to be used
+ to read protein and DNA sequence libraries in the NBRF/PIR
+ VAX/VMS library format. That is:
+
+ >P1;LCBO
+ This is a line of description
+ GTYH ... the sequence starts on this line
+
+ This may ease conversion from UWGCG format libraries. It
+ has not been extensively tested.
+
+ In addition, sequence libraries with a '>' in the 4th position
+ are recognized as NBRF format libraries for consistency with
+ UWGCG
+*/
+
+/* Nov 12, 1987 - this version checks to see if the sequence
+ is DNA or protein by asking whether > 85% is A, C, G, T
+
+ May 5, 1988 - modify the DNA/PROTEIN checker by re-reading
+ DNA sequences in order to check for 'U'.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "defs.h"
+#include "structs.h"
+
+#ifndef SFCHAR
+#define SFCHAR ':'
+#endif
+
+#define XTERNAL
+#include "uascii.h"
+#include "upam.h"
+#undef XTERNAL
+
+#define YES 1
+#define NO 0
+#define MAXLINE 512
+
+#ifndef min
+#define min(x,y) ((x) > (y) ? (y) : (x))
+#endif
+
+#define NO_FORMAT 0
+#define FASTA_FORMAT 1
+#define GCG_FORMAT 2
+
+static int seq_format=NO_FORMAT; /* set by getseq() to FASTA_FORMAT or GCG_FORMAT; nothing in this file sets it back to NO_FORMAT */
+static char seq_title[200]; /* header line (without its '>') of the FASTA query last read by getseq(); gettitle() returns it for "-"/"@" queries */
+
+int scanseq(unsigned char *, int, char *);
+void sf_sort(int *, int);
+extern void init_ascii(int is_ext, int *sascii, int is_dna);
+
+/* getseq - read a query sequence from a file (or stdin) into seq[]
+   returns - length of query sequence or error = 0
+
+   char *filen - name of file to be opened; "-" or "@" reads stdin;
+                 a ":start-stop" suffix selects part of the query
+   int *qascii - maps an input character (masked with AAMASK) to its code
+   unsigned char *seq - destination for query sequence (maxs+1 bytes)
+   int maxs - maximum length of query
+   char *libstr - short description (locus or acc), 12 chars + '\0'
+   long *sq0off - receives the 1-based range start when filen has a range
+   Any file opened is closed and filen's ':' is restored on every return.
+*/
+
+int
+getseq(char *filen, int *qascii, unsigned char *seq, int maxs, char *libstr, long *sq0off)
+{
+  FILE *fptr = NULL;
+  char line[512],*bp;
+  int i, rn, n;
+  int ic = 0; /* defined even when a scan loop below never evaluates a character */
+  int sstart, sstop, sset=0;
+  int llen, l_offset = 0;
+
+  seq_title[0]='\0';
+  libstr[0]='\0';
+  line[0]='\0'; /* so strlen(line) below is defined even if no line is ever read */
+  sstart = sstop = -1;
+#ifndef DOS
+  if ((bp=strchr(filen,':'))!=NULL && *(bp+1)!='\0') {
+#else
+  if ((bp=strchr(filen+3,':'))!=NULL && *(bp+1)!='\0') {
+#endif
+    *bp='\0'; /* cut the range off the name; undone before returning */
+    if (*(bp+1)=='-') {
+      sstart = 0;
+      sscanf(bp+2,"%d",&sstop);
+    }
+    else {
+      sscanf(bp+1,"%d-%d",&sstart,&sstop);
+      sstart--;
+      if (sstop <= 0 ) sstop = BIGNUM;
+    }
+    sset=1;
+  }
+  else {
+    sstart = 0;
+    sstop = BIGNUM;
+  }
+
+  /* check for input from stdin */
+  if (strcmp(filen,"-") && strcmp(filen,"@")) {
+    if ((fptr=fopen(filen,"r"))==NULL) {
+      fprintf(stderr," could not open %s\n",filen);
+      goto fail;
+    }
+  }
+  else {
+    fptr = stdin;
+  }
+  rn = n=0;
+
+  while(fgets(line,sizeof(line),fptr)!=NULL) {
+    l_offset = 0;
+    if (line[0]=='>') {
+      seq_format = FASTA_FORMAT;
+      if ((bp=(char *)strchr(line,'\n'))!=NULL) *bp='\0';
+      strncpy(seq_title,line+1,sizeof(seq_title));
+      seq_title[sizeof(seq_title)-1]='\0';
+      if ((bp=(char *)strchr(line,' '))!=NULL) *bp='\0';
+      strncpy(libstr,line+1,12);
+      libstr[12]='\0';
+    }
+    else if (seq_format==NO_FORMAT && strcmp(line,"..")==0) {
+      seq_format = GCG_FORMAT;
+      /* NOTE(review): fgets() keeps the trailing '\n', so the strcmp() above
+         only matches a final ".." line that has no newline; the test may have
+         been meant to look for ".." at the end of a line -- confirm */
+      l_offset = 10;
+      llen = strlen(line);
+      while (llen < 3 || strncmp(&line[llen-3],"..\n",(size_t)3) != 0) {
+        if (fgets(line,sizeof(line),fptr)==NULL) goto fail;
+        llen = strlen(line);
+      }
+      bp = strtok(line," \t");
+      if (bp!=NULL) strncpy(libstr,bp,12);
+      else strncpy(libstr,filen,12);
+      libstr[12]='\0';
+      if (fgets(line,sizeof(line),fptr)==NULL) goto fail;
+    }
+    else {
+      if (libstr[0]=='\0') strncpy(libstr,filen,12);
+      libstr[12]='\0';
+    }
+
+    if (seq_format==GCG_FORMAT && strlen(line)<l_offset) continue;
+
+    if (line[0]!='>'&& line[0]!=';') {
+      for (i=l_offset; (n<maxs && rn < sstop)&&
+             ((ic=qascii[line[i]&AAMASK])<EL); i++)
+        if (ic<NA && ++rn > sstart) seq[n++]= ic;
+      if (ic == ES || rn > sstop) break;
+    }
+  }
+
+  if (n==maxs) {
+    fprintf(stderr," sequence may be truncated %d %d\n",n,maxs);
+    fflush(stderr);
+  }
+  if ((bp=strchr(libstr,'\n'))!=NULL) *bp = '\0';
+  if ((bp=strchr(libstr,'\r'))!=NULL) *bp = '\0';
+  seq[n]= EOSEQ;
+
+  if (seq_format !=GCG_FORMAT)
+    while(fgets(line,sizeof(line),fptr)!=NULL) {
+      if (line[0]!='>'&& line[0]!=';') {
+        for (i=0; (n<maxs && rn < sstop)&&
+               ((ic=qascii[line[i]&AAMASK])<EL); i++)
+          if (ic<NA && ++rn > sstart ) seq[n++]= ic;
+        if (ic == ES || rn > sstop) break;
+      }
+    }
+  else {
+    llen = strlen(line);
+    while (llen < 3 || strncmp(&line[llen-3],"..\n",(size_t)3) != 0) {
+      if (fgets(line,sizeof(line),fptr)==NULL) goto fail;
+      llen = strlen(line);
+    }
+    while (fgets(line,sizeof(line),fptr)!=NULL) {
+      if (strlen(line)<l_offset) continue;
+      for (i=l_offset; (n<maxs && rn < sstop) &&
+             ((ic=qascii[line[i]&AAMASK])<EL); i++)
+        if (ic<NA && ++rn > sstart ) seq[n++]= ic;
+      if (ic == ES || rn > sstop ) break;
+    }
+  }
+
+  if (n==maxs) {
+    fprintf(stderr," sequence may be truncated %d %d\n",n,maxs);
+    fflush(stderr);
+  }
+  seq[n]= EOSEQ;
+
+  if (fptr!=stdin) fclose(fptr);
+
+  if (sset==1) {
+    sstart++;
+    filen[strlen(filen)]=':';
+    if (*sq0off==1 || sstart>=1) *sq0off = sstart;
+  }
+
+  return n;
+
+fail: /* error exit: release the file and restore the ':' as the normal exit does */
+  if (fptr!=NULL && fptr!=stdin) fclose(fptr);
+  if (sset==1) filen[strlen(filen)]=':';
+  return 0;
+}
+/* gettitle - copy the first line of file filen that starts with '>' or ';' (newline removed) into title[len] and return its length, 0 if the file cannot be opened or has no such line; for names starting with "-" or "@" the title saved by getseq() is returned; a ":range" suffix on filen is ignored and always left intact */
+int
+gettitle(char *filen, char *title, int len) {
+  FILE *fptr;
+  char line[512];
+  char *bp;
+  int sset;
+#ifdef WIN32
+  char *strpbrk();
+#endif
+
+  sset = 0;
+
+  if (strncmp(filen,"-",1)==0 || strncmp(filen,"@",1)==0) {
+    strncpy(title,seq_title,len);
+    title[len-1]='\0';
+    return (int)strlen(title);
+  }
+
+  if ((bp=strchr(filen,':'))!=NULL) { *bp='\0'; sset=1;}
+
+  fptr = fopen(filen,"r");
+  if (fptr==NULL) {
+    fprintf(stderr," file %s was not found\n",filen); /* printed before the ':' is restored, so the message is unchanged */
+    fflush(stderr);
+  }
+  /* restore the ':' whether or not the open worked, so the caller's name is never left truncated */
+  if (sset==1) filen[strlen(filen)]=':';
+  if (fptr==NULL) return 0;
+
+  while(fgets(line,sizeof(line),fptr)!=NULL) {
+    if (line[0]=='>'|| line[0]==';') goto found;
+  }
+  fclose(fptr);
+  title[0]='\0';
+  return 0;
+
+ found:
+
+#ifdef WIN32
+  bp = strpbrk(line,"\n\r");
+#else
+  bp = strchr(line,'\n');
+#endif
+  if (bp!=NULL) *bp = 0;
+  strncpy(title,line,len);
+  title[len-1]='\0';
+  fclose(fptr);
+  return strlen(title);
+}
+
diff --git a/src/global_sse2.c b/src/global_sse2.c
new file mode 100644
index 0000000..87999ea
--- /dev/null
+++ b/src/global_sse2.c
@@ -0,0 +1,547 @@
+/******************************************************************
+ Copyright 2010 by Michael Farrar. All rights reserved.
+ This program may not be sold or incorporated into a commercial product,
+ in whole or in part, without written consent of Michael Farrar. For
+ further information regarding permission for use or reproduction, please
+ contact: Michael Farrar at farrar.michael at gmail.com.
+*******************************************************************/
+
+/*
+ Written by Michael Farrar, 2010.
+ Please send bug reports and/or suggestions to farrar.michael at gmail.com.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "defs.h"
+#include "param.h"
+#include "dropgsw2.h"
+#include "global_sse2.h"
+
+#ifdef __SUNPRO_C
+#include <sunmedia_intrin.h>
+#else
+#include <emmintrin.h>
+#endif
+
+#ifdef SW_SSE2
+/* max_epu16 - lane-wise maximum of the eight unsigned 16-bit values in a and b */
+static inline __m128i
+max_epu16(__m128i a, __m128i b)
+{
+  /* the saturating difference is 0 where a <= b and a-b elsewhere, so b + difference is the larger value */
+  const __m128i excess = _mm_subs_epu16 (a, b);
+  return _mm_adds_epu16 (b, excess);
+}
+/* global_sse2_word - SSE2 alignment score (global, going by the file and function name) of a query, given as a 16-bit profile, against dbSeq, computed eight 16-bit lanes at a time. */
+/* Returns the score, or OVERFLOW_SCORE when the values cannot be kept within signed 16 bits; f_str->workspace holds the H and E columns (2*iter vectors). */
+int
+global_sse2_word(int queryLength,
+ unsigned short *profile,
+ const unsigned char *dbSeq,
+ int dbLength,
+ unsigned short gapOpen,
+ unsigned short gapExtend,
+ unsigned short ceiling,
+ struct f_struct *f_str)
+{
+ int i, j;
+
+ int score;
+ int scale;
+ int temp;
+ int distance;
+
+ int offset;
+ int position;
+
+ int cmp;
+ int iter;
+
+ __m128i *pvH;
+ __m128i *pvE;
+
+ __m128i vE, vF, vH;
+ __m128i vHNext;
+ __m128i vFPrev;
+
+ __m128i vGapOpen;
+ __m128i vGapExtend;
+ __m128i vCeiling;
+
+ __m128i vScale;
+ __m128i vScaleAmt;
+ __m128i vScaleTmp;
+
+ __m128i vTemp;
+ __m128i vNull;
+
+ __m128i *pvScore;
+
+ scale = 0;
+ iter = (queryLength + 7) / 8; /* number of 8-lane vectors needed for one column of the query */
+ offset = (queryLength - 1) % iter; /* vector index holding the last query position (used after the main loop) */
+ position = 7 - (queryLength - 1) / iter; /* 16-bit shifts needed to bring that position into lane 7 */
+
+ pvH = (__m128i *)f_str->workspace; /* accessed with aligned loads/stores, so workspace must be 16-byte aligned */
+ pvE = pvH + iter;
+
+ /* Load gap opening penalty to all elements of a constant */
+ vGapOpen = _mm_setzero_si128(); /* transferred from Apple Devel smith_waterman_sse2.c fix */
+ vGapOpen = _mm_insert_epi16 (vGapOpen, gapOpen, 0);
+ vGapOpen = _mm_shufflelo_epi16 (vGapOpen, 0);
+ vGapOpen = _mm_shuffle_epi32 (vGapOpen, 0);
+
+ /* Load gap extension penalty to all elements of a constant */
+ vGapExtend = _mm_setzero_si128(); /* transferred from Apple Devel smith_waterman_sse2.c fix */
+ vGapExtend = _mm_insert_epi16 (vGapExtend, gapExtend, 0);
+ vGapExtend = _mm_shufflelo_epi16 (vGapExtend, 0);
+ vGapExtend = _mm_shuffle_epi32 (vGapExtend, 0);
+
+ /* Generate the ceiling before scaling */
+ vTemp = _mm_setzero_si128(); /* transferred from Apple Devel smith_waterman_sse2.c fix */
+ vTemp = _mm_insert_epi16 (vTemp, ceiling, 0);
+ vTemp = _mm_shufflelo_epi16 (vTemp, 0);
+ vTemp = _mm_shuffle_epi32 (vTemp, 0);
+ vCeiling = _mm_cmpeq_epi16 (vTemp, vTemp);
+ vCeiling = _mm_srli_epi16 (vCeiling, 1); /* 0x7fff in every lane */
+ vCeiling = _mm_subs_epi16 (vCeiling, vTemp);
+ vCeiling = _mm_subs_epi16 (vCeiling, vGapOpen); /* = 0x7fff - ceiling - gapOpen */
+
+ vNull = _mm_cmpeq_epi16 (vTemp, vTemp);
+ vNull = _mm_slli_epi16 (vNull, 15); /* 0x8000 in every lane: the most negative signed 16-bit value */
+ vScaleAmt = _mm_xor_si128 (vNull, vNull); /* all zero */
+
+ /* Initialize the storage vectors: H = vNull + gapOpen (saturating), E = vNull */
+ vTemp = _mm_adds_epi16 (vNull, vGapOpen);
+ for (i = 0; i < iter; i++) {
+ _mm_store_si128 (pvH + i, vTemp);
+ _mm_store_si128 (pvE + i, vNull);
+ }
+
+ /* initialize F */
+ vF = vNull;
+ vFPrev = vNull;
+
+ /* load and scale H for the next round */
+ vTemp = _mm_srli_si128 (vGapOpen, 14);
+ vH = _mm_load_si128 (pvH + iter - 1);
+ vH = _mm_adds_epi16 (vH, vTemp);
+
+ for (i = 0; i < dbLength; ++i) {
+ /* fetch first data asap. */
+ pvScore = (__m128i *) profile + dbSeq[i] * iter; /* profile holds iter vectors per residue code */
+
+ vF = vNull;
+
+ vH = _mm_max_epi16 (vH, vFPrev);
+ for (j = 0; j < iter; j++) {
+ /* correct H from the previous columns F */
+ vHNext = _mm_load_si128 (pvH + j);
+ vHNext = _mm_max_epi16 (vHNext, vFPrev);
+
+ /* load and correct E value */
+ vE = _mm_load_si128 (pvE + j);
+ vTemp = _mm_subs_epi16 (vHNext, vGapOpen);
+ vE = _mm_max_epi16 (vE, vTemp);
+ _mm_store_si128 (pvE + j, vE);
+
+ /* add score to vH */
+ vH = _mm_adds_epi16 (vH, *pvScore++);
+
+ /* get max from vH, vE and vF */
+ vH = _mm_max_epi16 (vH, vE);
+ vH = _mm_max_epi16 (vH, vF);
+ _mm_store_si128 (pvH + j, vH);
+
+ /* update vF value */
+ vH = _mm_subs_epi16 (vH, vGapOpen);
+ vF = _mm_max_epi16 (vF, vH);
+
+ /* load the next h values */
+ vH = vHNext;
+ }
+
+ /* check if we need to scale before the next round */
+ vTemp = _mm_cmpgt_epi16 (vF, vCeiling);
+ cmp = _mm_movemask_epi8 (vTemp);
+
+ /* broadcast F values */
+ vF = _mm_xor_si128 (vF, vNull); /* flipping the sign bit turns the signed lanes into unsigned ones for the epu16 operations below */
+
+ vTemp = _mm_slli_si128 (vF, 2);
+ vTemp = _mm_subs_epu16 (vTemp, vScaleAmt);
+ vF = max_epu16 (vF, vTemp);
+
+ vTemp = _mm_slli_si128 (vF, 4);
+ vScaleTmp = _mm_slli_si128 (vScaleAmt, 2);
+ vScaleTmp = _mm_adds_epu16 (vScaleTmp, vScaleAmt);
+ vTemp = _mm_subs_epu16 (vTemp, vScaleTmp);
+ vF = max_epu16 (vF, vTemp);
+
+ vTemp = _mm_slli_si128 (vScaleTmp, 4);
+ vScaleTmp = _mm_adds_epu16 (vScaleTmp, vTemp);
+ vTemp = _mm_slli_si128 (vF, 8);
+ vTemp = _mm_subs_epu16 (vTemp, vScaleTmp);
+ vF = max_epu16 (vF, vTemp);
+
+ /* scale if necessary */
+ if (cmp != 0x0000) {
+ __m128i vScale1;
+ __m128i vScale2;
+
+ vScale = _mm_slli_si128 (vF, 2);
+ vScale = _mm_subs_epu16 (vScale, vGapOpen);
+ vScale = _mm_subs_epu16 (vScale, vScaleAmt);
+
+ vTemp = _mm_slli_si128 (vScale, 2);
+ vTemp = _mm_subs_epu16 (vScale, vTemp);
+ vScaleAmt = _mm_adds_epu16 (vScaleAmt, vTemp);
+ vTemp = _mm_slli_si128 (vScale, 2);
+ vTemp = _mm_subs_epu16 (vTemp, vScale);
+ vScaleAmt = _mm_subs_epu16 (vScaleAmt, vTemp);
+
+ /* rescale the previous F */
+ vF = _mm_subs_epu16 (vF, vScale);
+
+ /* check if we can continue in signed 16-bits */
+ vTemp = _mm_xor_si128 (vF, vNull);
+ vTemp = _mm_cmpgt_epi16 (vTemp, vCeiling);
+ cmp = _mm_movemask_epi8 (vTemp);
+ if (cmp != 0x0000) {
+ return OVERFLOW_SCORE; /* still above the ceiling after rescaling: give up on 16-bit lanes */
+ }
+
+ vTemp = _mm_adds_epi16 (vCeiling, vCeiling);
+ vScale1 = _mm_subs_epu16 (vScale, vTemp);
+ vScale2 = _mm_subs_epu16 (vScale, vScale1); /* vScale1 + vScale2 == vScale; the split lets it be applied as two signed saturating subtractions */
+
+ /* scale all the vectors */
+ for (j = 0; j < iter; j++) {
+ /* load H and E */
+ vH = _mm_load_si128 (pvH + j);
+ vE = _mm_load_si128 (pvE + j);
+
+ /* subtract the scale, in its two parts, from H and E */
+ vH = _mm_subs_epi16 (vH, vScale1);
+ vH = _mm_subs_epi16 (vH, vScale2);
+ vE = _mm_subs_epi16 (vE, vScale1);
+ vE = _mm_subs_epi16 (vE, vScale2);
+
+ /* save the H and E */
+ _mm_store_si128 (pvH + j, vH);
+ _mm_store_si128 (pvE + j, vE);
+ }
+
+ vScale = vScaleAmt;
+ for (j = 0; j < position; ++j) {
+ vScale = _mm_slli_si128 (vScale, 2);
+ }
+
+ /* calculate the final scaling amount */
+ vTemp = _mm_xor_si128 (vTemp, vTemp);
+ vScale1 = _mm_unpacklo_epi16 (vScale, vTemp);
+ vScale2 = _mm_unpackhi_epi16 (vScale, vTemp);
+ vScale = _mm_add_epi32 (vScale1, vScale2);
+ vTemp = _mm_srli_si128 (vScale, 8);
+ vScale = _mm_add_epi32 (vScale, vTemp);
+ vTemp = _mm_srli_si128 (vScale, 4);
+ vScale = _mm_add_epi32 (vScale, vTemp);
+ scale = (int) (unsigned short) _mm_extract_epi16 (vScale, 0);
+ temp = (int) (unsigned short) _mm_extract_epi16 (vScale, 1);
+ scale = scale + (temp << 16); /* 32-bit sum of the shifted vScaleAmt lanes; added back to the score at the end */
+ }
+
+ /* scale the F value for the next round */
+ vFPrev = _mm_slli_si128 (vF, 2);
+ vFPrev = _mm_subs_epu16 (vFPrev, vScaleAmt);
+ vFPrev = _mm_xor_si128 (vFPrev, vNull);
+
+ /* load and scale H for the next round */
+ vH = _mm_load_si128 (pvH + iter - 1);
+ vH = _mm_xor_si128 (vH, vNull);
+ vH = _mm_slli_si128 (vH, 2);
+ vH = _mm_subs_epu16 (vH, vScaleAmt);
+ vH = _mm_insert_epi16 (vH, gapOpen, 0);
+ vH = _mm_xor_si128 (vH, vNull);
+ }
+
+ vH = _mm_load_si128 (pvH + offset);
+ vH = _mm_max_epi16 (vH, vFPrev);
+ for (j = 0; j < position; ++j) {
+ vH = _mm_slli_si128 (vH, 2);
+ }
+ score = (int) (signed short) _mm_extract_epi16 (vH, 7); /* lane 7 now holds the value for the last query position */
+ score = score + SHORT_BIAS;
+
+ /* final score: remove the gap-penalty terms and add back the accumulated scale */
+ distance = (queryLength + dbLength) * gapExtend;
+ score = score - (gapOpen * 2) - distance + scale;
+
+ return score;
+}
+
+int /* global alignment in 16 unsigned 8-bit lanes; returns the score, or OVERFLOW_SCORE when rescaling cannot bring F back under the ceiling */
+global_sse2_byte(int queryLength,
+ unsigned char *profile,
+ const unsigned char *dbSeq,
+ int dbLength,
+ unsigned short gapOpen,
+ unsigned short gapExtend,
+ unsigned short ceiling,
+ unsigned short bias,
+ struct f_struct *f_str)
+{
+ int i, j;
+
+ int score;
+ int scale;
+ int distance;
+
+ int offset;
+ int position;
+
+ int dup;
+ int cmp;
+ int iter;
+
+ __m128i *pvH;
+ __m128i *pvE;
+
+ __m128i vE, vF, vH;
+ __m128i vHInit;
+ __m128i vHNext;
+ __m128i vFPrev;
+
+ __m128i vBias;
+ __m128i vGapOpen;
+ __m128i vGapExtend;
+ __m128i vCeiling;
+
+ __m128i vScale;
+ __m128i vScaleAmt;
+ __m128i vScaleTmp;
+
+ __m128i vTemp;
+ __m128i vNull;
+
+ __m128i *pvScore;
+
+ scale = 0; /* amount removed by rescaling; added back to the final score */
+ iter = (queryLength + 15) / 16; /* number of 16-lane vectors per column */
+ offset = (queryLength - 1) % iter; /* vector that holds the last query position */
+ position = 15 - (queryLength - 1) / iter; /* byte shifts needed to move that position's lane to byte 15 */
+
+ pvH = (__m128i *)f_str->workspace; /* assumes workspace is 16-byte aligned with room for 2 * iter vectors — TODO confirm */
+ pvE = pvH + iter;
+
+ /* Load the bias to all elements of a constant */
+ dup = (bias << 8) | (bias & 0x00ff);
+ vBias = _mm_setzero_si128(); /* initialize cf Apple Devel smith_waterman_sse2.c */
+ vBias = _mm_insert_epi16 (vBias, dup, 0);
+ vBias = _mm_shufflelo_epi16 (vBias, 0);
+ vBias = _mm_shuffle_epi32 (vBias, 0);
+
+ /* Load gap opening penalty to all elements of a constant */
+ dup = (gapOpen << 8) | (gapOpen & 0x00ff);
+ vGapOpen = _mm_setzero_si128(); /* initialize cf Apple Devel smith_waterman_sse2.c */
+ vGapOpen = _mm_insert_epi16 (vGapOpen, dup, 0);
+ vGapOpen = _mm_shufflelo_epi16 (vGapOpen, 0);
+ vGapOpen = _mm_shuffle_epi32 (vGapOpen, 0);
+
+ /* Load gap extension penalty to all elements of a constant (vGapExtend is not read again in this function) */
+ dup = (gapExtend << 8) | (gapExtend & 0x00ff);
+ vGapExtend = _mm_setzero_si128(); /* initialize cf Apple Devel smith_waterman_sse2.c */
+ vGapExtend = _mm_insert_epi16 (vGapExtend, dup, 0);
+ vGapExtend = _mm_shufflelo_epi16 (vGapExtend, 0);
+ vGapExtend = _mm_shuffle_epi32 (vGapExtend, 0);
+
+ /* Generate the ceiling before scaling */
+ dup = (ceiling << 8) | (ceiling & 0x00ff);
+ vTemp = _mm_setzero_si128(); /* initialize cf Apple Devel smith_waterman_sse2.c */
+ vTemp = _mm_insert_epi16 (vTemp, dup, 0);
+ vTemp = _mm_shufflelo_epi16 (vTemp, 0);
+ vTemp = _mm_shuffle_epi32 (vTemp, 0);
+ vCeiling = _mm_cmpeq_epi8 (vTemp, vTemp);
+ vCeiling = _mm_subs_epu8 (vCeiling, vTemp);
+ vCeiling = _mm_subs_epu8 (vCeiling, vGapOpen); /* 255 - ceiling - gapOpen in every lane, saturating at 0 */
+
+ /* since we want to use the full range, zero is redefined as */
+ /* 2 * gapOpen. the lowest scaled score will be an insert followed */
+ /* by a delete. */
+ vHInit = _mm_srli_si128 (vGapOpen, 15); /* gapOpen in byte 0, zero elsewhere */
+
+ /* vNull = _mm_xor_si128 (vNull, vNull); */
+ vNull = _mm_setzero_si128(); /* initialize cf Apple Devel smith_waterman_sse2.c */
+ vScaleAmt = vNull;
+
+ /* Initialise the storage vectors: every H lane to gapOpen, every E lane to zero */
+ for (i = 0; i < iter; i++) {
+ _mm_store_si128 (pvH + i, vGapOpen);
+ _mm_store_si128 (pvE + i, vNull);
+ }
+
+ /* initialize F */
+ vF = vNull;
+ vFPrev = vNull;
+
+ /* build H for the first column: shift one lane and put 2 * gapOpen in byte 0 */
+ vH = _mm_load_si128 (pvH + iter - 1);
+ vH = _mm_slli_si128 (vH, 1);
+ vH = _mm_adds_epu8 (vH, vHInit);
+ vH = _mm_adds_epu8 (vH, vHInit);
+
+ for (i = 0; i < dbLength; ++i) {
+ /* fetch first data asap. */
+ pvScore = (__m128i *) profile + dbSeq[i] * iter; /* iter profile vectors per residue code; read as aligned __m128i */
+
+ vF = _mm_xor_si128 (vF, vF); /* clear F */
+
+ vH = _mm_max_epu8 (vH, vFPrev);
+ for (j = 0; j < iter; j++) {
+ /* correct H from the previous columns F */
+ vHNext = _mm_load_si128 (pvH + j);
+ vHNext = _mm_max_epu8 (vHNext, vFPrev);
+
+ /* load and correct E value */
+ vE = _mm_load_si128 (pvE + j);
+ vTemp = _mm_subs_epu8 (vHNext, vGapOpen);
+ vE = _mm_max_epu8 (vE, vTemp);
+ _mm_store_si128 (pvE + j, vE);
+
+ /* add the profile score to vH, then subtract the bias */
+ vH = _mm_adds_epu8 (vH, *pvScore++);
+ vH = _mm_subs_epu8 (vH, vBias);
+
+ /* get max from vH, vE and vF */
+ vH = _mm_max_epu8 (vH, vE);
+ vH = _mm_max_epu8 (vH, vF);
+ _mm_store_si128 (pvH + j, vH);
+
+ /* update vF value */
+ vH = _mm_subs_epu8 (vH, vGapOpen);
+ vF = _mm_max_epu8 (vF, vH);
+
+ /* load the next h values */
+ vH = vHNext;
+ }
+
+ /* check if we need to scale before the next round */
+ vTemp = _mm_subs_epu8 (vCeiling, vF); /* zero in every lane where vF >= vCeiling */
+ vTemp = _mm_cmpeq_epi8 (vTemp, vNull);
+ cmp = _mm_movemask_epi8 (vTemp);
+
+ /* broadcast F values */
+ vTemp = _mm_slli_si128 (vF, 1);
+ vTemp = _mm_subs_epu8 (vTemp, vScaleAmt);
+ vF = _mm_max_epu8 (vF, vTemp);
+
+ vScaleTmp = _mm_slli_si128 (vScaleAmt, 1);
+ vScaleTmp = _mm_adds_epu8 (vScaleTmp, vScaleAmt);
+ vTemp = _mm_slli_si128 (vF, 2);
+ vTemp = _mm_subs_epu8 (vTemp, vScaleTmp);
+ vF = _mm_max_epu8 (vF, vTemp);
+
+ vTemp = _mm_slli_si128 (vScaleTmp, 2);
+ vScaleTmp = _mm_adds_epu8 (vScaleTmp, vTemp);
+ vTemp = _mm_slli_si128 (vF, 4);
+ vTemp = _mm_subs_epu8 (vTemp, vScaleTmp);
+ vF = _mm_max_epu8 (vF, vTemp);
+
+ vTemp = _mm_slli_si128 (vScaleTmp, 4);
+ vScaleTmp = _mm_adds_epu8 (vScaleTmp, vTemp);
+ vTemp = _mm_slli_si128 (vF, 8);
+ vTemp = _mm_subs_epu8 (vTemp, vScaleTmp);
+ vF = _mm_max_epu8 (vF, vTemp);
+
+ /* scale if necessary */
+ if (cmp != 0x0000) {
+ vScale = _mm_slli_si128 (vF, 1);
+ vScale = _mm_subs_epu8 (vScale, vGapOpen);
+ vScale = _mm_subs_epu8 (vScale, vScaleAmt);
+
+ vTemp = _mm_slli_si128 (vScale, 1);
+ vTemp = _mm_subs_epu8 (vScale, vTemp);
+ vScaleAmt = _mm_adds_epu8 (vScaleAmt, vTemp);
+ vTemp = _mm_slli_si128 (vScale, 1);
+ vTemp = _mm_subs_epu8 (vTemp, vScale);
+ vScaleAmt = _mm_subs_epu8 (vScaleAmt, vTemp);
+
+ /* rescale the previous F */
+ vF = _mm_subs_epu8 (vF, vScale);
+
+ /* check if we can continue in 8-bits */
+ vTemp = _mm_subs_epu8 (vCeiling, vF);
+ vTemp = _mm_cmpeq_epi8 (vTemp, vNull);
+ cmp = _mm_movemask_epi8 (vTemp);
+ if (cmp != 0x0000) {
+ return OVERFLOW_SCORE;
+ }
+
+ /* scale all the vectors */
+ for (j = 0; j < iter; j++) {
+ /* load H and E */
+ vH = _mm_load_si128 (pvH + j);
+ vE = _mm_load_si128 (pvE + j);
+
+ /* subtract the scale from H and E, saturating at zero */
+ vH = _mm_subs_epu8 (vH, vScale);
+ vE = _mm_subs_epu8 (vE, vScale);
+
+ /* save the H and E */
+ _mm_store_si128 (pvH + j, vH);
+ _mm_store_si128 (pvE + j, vE);
+ }
+
+ /* calculate the final scaling amount: shift by position, then sum the remaining lanes */
+ vScale = vScaleAmt;
+ for (j = 0; j < position; ++j) {
+ vScale = _mm_slli_si128 (vScale, 1);
+ }
+ vTemp = _mm_unpacklo_epi8 (vScale, vNull);
+ vScale = _mm_unpackhi_epi8 (vScale, vNull);
+ vScale = _mm_adds_epi16 (vScale, vTemp);
+ vTemp = _mm_srli_si128 (vScale, 8);
+ vScale = _mm_adds_epi16 (vScale, vTemp);
+ vTemp = _mm_srli_si128 (vScale, 4);
+ vScale = _mm_adds_epi16 (vScale, vTemp);
+ vTemp = _mm_srli_si128 (vScale, 2);
+ vScale = _mm_adds_epi16 (vScale, vTemp);
+ scale = (int) _mm_extract_epi16 (vScale, 0);
+ }
+
+ /* scale the F value for the next round */
+ vFPrev = _mm_slli_si128 (vF, 1);
+ vFPrev = _mm_subs_epu8 (vFPrev, vScaleAmt);
+
+ /* load and scale H for the next round */
+ vH = _mm_load_si128 (pvH + iter - 1);
+ vH = _mm_slli_si128 (vH, 1);
+ vH = _mm_subs_epu8 (vH, vScaleAmt);
+ vH = _mm_or_si128 (vH, vHInit);
+ }
+
+ /* calculate the max global score; NOTE(review): the 16-bit routine earlier in this file takes the max with vFPrev at this step — confirm vF is intended */
+ vH = _mm_load_si128 (pvH + offset);
+ vH = _mm_max_epu8 (vH, vF);
+ for (j = 0; j < position; ++j) {
+ vH = _mm_slli_si128 (vH, 1);
+ }
+ score = (int) (unsigned short) _mm_extract_epi16 (vH, 7);
+ score >>= 8; /* keep byte 15, the high byte of word 7 */
+
+ /* remove the 2 * gapOpen zero offset and the gap-extension distance, add back the scaling */
+ distance = (queryLength + dbLength) * gapExtend;
+ score = score - (gapOpen * 2) - distance + scale;
+
+ return score;
+}
+#else
+
+/* No SSE2 support. Avoid compiler complaints about empty object */
+
+int nw_dummy; /* NOTE(review): glocal_sse2.c defines a global with this same name in its own non-SSE2 branch; linking both objects built without SW_SSE2 may give a duplicate symbol (e.g. with -fno-common) — confirm */
+
+#endif
+
diff --git a/src/global_sse2.h b/src/global_sse2.h
new file mode 100644
index 0000000..d45d14d
--- /dev/null
+++ b/src/global_sse2.h
@@ -0,0 +1,41 @@
+/******************************************************************
+ Copyright 2010 by Michael Farrar. All rights reserved.
+ This program may not be sold or incorporated into a commercial product,
+ in whole or in part, without written consent of Michael Farrar. For
+ further information regarding permission for use or reproduction, please
+ contact: Michael Farrar at farrar.michael at gmail.com.
+*******************************************************************/
+
+/*
+ Written by Michael Farrar, 2010.
+ Please send bug reports and/or suggestions to farrar.michael at gmail.com.
+*/
+
+#ifndef INCLUDE_GLOBAL_SSE2_H
+#define INCLUDE_GLOBAL_SSE2_H
+
+#define SHORT_BIAS 32768 /* added to the signed 16-bit lane value when the word routine extracts its score */
+#define OVERFLOW_SCORE 0x7f000000 /* returned by the alignment routines when F is still above the ceiling after rescaling */
+
+int /* word counterpart of global_sse2_byte: same arguments except that profile is unsigned short and there is no bias */
+global_sse2_word(int queryLength,
+ unsigned short *profile,
+ const unsigned char *dbSeq,
+ int dbLength,
+ unsigned short gapOpen,
+ unsigned short gapExtend,
+ unsigned short ceiling,
+ struct f_struct *f_str); /* presumably supplies the H/E workspace as in the byte routine — confirm */
+
+int /* returns the global alignment score, or OVERFLOW_SCORE if 8-bit lanes cannot hold it */
+global_sse2_byte(int queryLength, /* sets the vector count: (queryLength + 15) / 16 */
+ unsigned char *profile, /* query profile, read as (vector count) __m128i values per residue code */
+ const unsigned char *dbSeq, /* database residues, used as indices into profile */
+ int dbLength, /* number of entries of dbSeq that are processed */
+ unsigned short gapOpen, /* subtracted for every gap step; 2 * gapOpen is removed from the final score */
+ unsigned short gapExtend, /* (queryLength + dbLength) * gapExtend is removed from the final score */
+ unsigned short ceiling, /* headroom: rescaling starts once F reaches 255 - ceiling - gapOpen */
+ unsigned short bias, /* subtracted after each profile score is added */
+ struct f_struct *f_str); /* f_str->workspace holds the H and E vectors */
+
+#endif /* INCLUDE_GLOBAL_SSE2_H */
diff --git a/src/glocal_sse2.c b/src/glocal_sse2.c
new file mode 100644
index 0000000..e0f5ffd
--- /dev/null
+++ b/src/glocal_sse2.c
@@ -0,0 +1,596 @@
+/******************************************************************
+ Copyright 2010 by Michael Farrar. All rights reserved.
+ This program may not be sold or incorporated into a commercial product,
+ in whole or in part, without written consent of Michael Farrar. For
+ further information regarding permission for use or reproduction, please
+ contact: Michael Farrar at farrar.michael at gmail.com.
+*******************************************************************/
+
+/*
+ Written by Michael Farrar, 2010.
+ Please send bug reports and/or suggestions to farrar.michael at gmail.com.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "defs.h"
+#include "param.h"
+#include "dropgsw2.h"
+#include "global_sse2.h"
+
+#ifdef __SUNPRO_C
+#include <sunmedia_intrin.h>
+#else
+#include <emmintrin.h>
+#endif
+
+#ifdef SW_SSE2
+
+static inline __m128i /* lane-wise maximum of unsigned 16-bit values, built from saturating subtract and add */
+max_epu16(__m128i a, __m128i b)
+{
+ a = _mm_subs_epu16 (a, b); /* a - b where a > b, otherwise 0 */
+ b = _mm_adds_epu16 (b, a); /* b + (a - b) gives a where a > b, otherwise b is unchanged */
+ return b;
+}
+
+int /* 16-bit-lane alignment that scores the last query position in every database column and returns the best one, or OVERFLOW_SCORE */
+glocal_sse2_word(int queryLength,
+ unsigned short *profile,
+ const unsigned char *dbSeq,
+ int dbLength,
+ unsigned short gapOpen,
+ unsigned short gapExtend,
+ unsigned short ceiling,
+ struct f_struct *f_str)
+{
+ int i, j;
+
+ int max;
+ int score;
+ int scale;
+ int temp;
+ int distance;
+ int initScale;
+ int hinit;
+ int zero;
+
+ int offset;
+ int position;
+
+ int cmp;
+ int iter;
+
+ __m128i *pvH;
+ __m128i *pvE;
+
+ __m128i vE, vF, vH;
+ __m128i vHNext;
+ __m128i vFPrev;
+
+ __m128i vGapOpen;
+ __m128i vGapExtend;
+ __m128i vCeiling;
+
+ __m128i vScale;
+ __m128i vScaleAmt;
+ __m128i vScaleTmp;
+
+ __m128i vTemp;
+ __m128i vNull;
+
+ __m128i *pvScore;
+
+ scale = 0;
+ initScale = 0;
+
+ max = 0x80000000; /* running best score; NOTE(review): presumably meant as the most negative int — the constant is out of range for int, INT_MIN would be explicit */
+ iter = (queryLength + 7) / 8; /* number of 8-lane vectors per column */
+ offset = (queryLength - 1) % iter; /* vector that holds the last query position */
+ position = 7 - (queryLength - 1) / iter; /* lane shifts needed to move that position to lane 7 */
+
+ pvH = (__m128i *)f_str->workspace; /* assumes workspace is 16-byte aligned with room for 2 * iter vectors — TODO confirm */
+ pvE = pvH + iter;
+
+ /* Load gap opening penalty to all elements of a constant */
+ vGapOpen = _mm_setzero_si128(); /* transferred from Apple Devel smith_waterman_sse2.c fix */
+ vGapOpen = _mm_insert_epi16 (vGapOpen, gapOpen, 0);
+ vGapOpen = _mm_shufflelo_epi16 (vGapOpen, 0);
+ vGapOpen = _mm_shuffle_epi32 (vGapOpen, 0);
+
+ /* Load gap extension penalty to all elements of a constant */
+ vGapExtend = _mm_setzero_si128(); /* transferred from Apple Devel smith_waterman_sse2.c fix */
+ vGapExtend = _mm_insert_epi16 (vGapExtend, gapExtend, 0);
+ vGapExtend = _mm_shufflelo_epi16 (vGapExtend, 0);
+ vGapExtend = _mm_shuffle_epi32 (vGapExtend, 0);
+
+ /* Generate the ceiling before scaling */
+ vTemp = _mm_setzero_si128(); /* transferred from Apple Devel smith_waterman_sse2.c fix */
+ vTemp = _mm_insert_epi16 (vTemp, ceiling, 0);
+ vTemp = _mm_shufflelo_epi16 (vTemp, 0);
+ vTemp = _mm_shuffle_epi32 (vTemp, 0);
+ vCeiling = _mm_cmpeq_epi16 (vTemp, vTemp);
+ vCeiling = _mm_srli_epi16 (vCeiling, 1);
+ vCeiling = _mm_subs_epi16 (vCeiling, vTemp);
+ vCeiling = _mm_subs_epi16 (vCeiling, vGapOpen); /* 0x7fff - ceiling - gapOpen in every lane (signed, saturating) */
+
+ vGapExtend = _mm_srli_si128 (vGapExtend, 14); /* keep gapExtend in lane 0 only */
+ vNull = _mm_cmpeq_epi16 (vTemp, vTemp);
+ vNull = _mm_slli_epi16 (vNull, 15); /* 0x8000 in every lane; XOR with it flips the sign bit of each lane */
+ vScaleAmt = _mm_xor_si128 (vNull, vNull); /* all zero */
+
+ hinit = gapOpen * 2 - SHORT_BIAS;
+ zero = hinit;
+
+ /* Initialise the storage vectors: H lanes to 0x8000 + gapOpen, E lanes to 0x8000 */
+ vTemp = _mm_adds_epi16 (vNull, vGapOpen);
+ for (i = 0; i < iter; i++) {
+ _mm_store_si128 (pvH + i, vTemp);
+ _mm_store_si128 (pvE + i, vNull);
+ }
+
+ /* initialize F */
+ vF = vNull;
+ vFPrev = vNull;
+
+ /* build H for the first column: shift one lane and put the zero value in lane 0 */
+ vH = _mm_load_si128 (pvH + iter - 1);
+ vH = _mm_slli_si128 (vH, 2);
+ vH = _mm_insert_epi16 (vH, zero, 0);
+
+ for (i = 0; i < dbLength; ++i) {
+ /* fetch first data asap. */
+ pvScore = (__m128i *) profile + dbSeq[i] * iter; /* iter profile vectors per residue code; read as aligned __m128i */
+
+ vF = _mm_insert_epi16 (vNull, hinit, 0); /* lane 0 becomes hinit + gapExtend - gapOpen, other lanes stay 0x8000 */
+ vF = _mm_adds_epi16 (vF, vGapExtend);
+ vF = _mm_subs_epi16 (vF, vGapOpen);
+
+ vH = _mm_max_epi16 (vH, vFPrev);
+ for (j = 0; j < iter; j++) {
+ /* correct H from the previous columns F */
+ vHNext = _mm_load_si128 (pvH + j);
+ vHNext = _mm_max_epi16 (vHNext, vFPrev);
+
+ /* load and correct E value */
+ vE = _mm_load_si128 (pvE + j);
+ vTemp = _mm_subs_epi16 (vHNext, vGapOpen);
+ vE = _mm_max_epi16 (vE, vTemp);
+ _mm_store_si128 (pvE + j, vE);
+
+ /* add score to vH */
+ vH = _mm_adds_epi16 (vH, *pvScore++);
+
+ /* get max from vH, vE and vF */
+ vH = _mm_max_epi16 (vH, vE);
+ vH = _mm_max_epi16 (vH, vF);
+ _mm_store_si128 (pvH + j, vH);
+
+ /* update vF value */
+ vH = _mm_subs_epi16 (vH, vGapOpen);
+ vF = _mm_max_epi16 (vF, vH);
+
+ /* load the next h values */
+ vH = vHNext;
+ }
+
+ /* check if we need to scale before the next round */
+ vTemp = _mm_cmpgt_epi16 (vF, vCeiling);
+ cmp = _mm_movemask_epi8 (vTemp);
+
+ /* broadcast F values (sign bit flipped first so the unsigned saturating ops below apply) */
+ vF = _mm_xor_si128 (vF, vNull);
+
+ vTemp = _mm_slli_si128 (vF, 2);
+ vTemp = _mm_subs_epu16 (vTemp, vScaleAmt);
+ vF = max_epu16 (vF, vTemp);
+
+ vTemp = _mm_slli_si128 (vF, 4);
+ vScaleTmp = _mm_slli_si128 (vScaleAmt, 2);
+ vScaleTmp = _mm_adds_epu16 (vScaleTmp, vScaleAmt);
+ vTemp = _mm_subs_epu16 (vTemp, vScaleTmp);
+ vF = max_epu16 (vF, vTemp);
+
+ vTemp = _mm_slli_si128 (vScaleTmp, 4);
+ vScaleTmp = _mm_adds_epu16 (vScaleTmp, vTemp);
+ vTemp = _mm_slli_si128 (vF, 8);
+ vTemp = _mm_subs_epu16 (vTemp, vScaleTmp);
+ vF = max_epu16 (vF, vTemp);
+
+ /* scale if necessary */
+ if (cmp != 0x0000) {
+ __m128i vScale1;
+ __m128i vScale2;
+
+ scale = hinit - gapOpen * 2 + SHORT_BIAS; /* how far hinit has moved above its starting value */
+ initScale = initScale + scale;
+
+ vScale = _mm_slli_si128 (vF, 2);
+ vScale = _mm_subs_epu16 (vScale, vGapOpen);
+ vScale = _mm_subs_epu16 (vScale, vScaleAmt);
+ vScale = _mm_insert_epi16 (vScale, scale, 0);
+
+ vTemp = _mm_slli_si128 (vScale, 2);
+ vTemp = _mm_subs_epu16 (vScale, vTemp);
+ vScaleAmt = _mm_adds_epu16 (vScaleAmt, vTemp);
+ vTemp = _mm_slli_si128 (vScale, 2);
+ vTemp = _mm_subs_epu16 (vTemp, vScale);
+ vScaleAmt = _mm_subs_epu16 (vScaleAmt, vTemp);
+ vTemp = _mm_subs_epu8 (vTemp, vTemp); /* clears vTemp */
+ vTemp = _mm_insert_epi16 (vTemp, scale, 0);
+ vScaleAmt = _mm_subs_epu16 (vScaleAmt, vTemp);
+
+ /* rescale the previous F */
+ vF = _mm_subs_epu16 (vF, vScale);
+
+ /* rescale the initial H value */
+ hinit = zero;
+
+ /* check if we can continue in signed 16-bits */
+ vTemp = _mm_xor_si128 (vF, vNull);
+ vTemp = _mm_cmpgt_epi16 (vTemp, vCeiling);
+ cmp = _mm_movemask_epi8 (vTemp);
+ if (cmp != 0x0000) {
+ return OVERFLOW_SCORE;
+ }
+
+ vTemp = _mm_adds_epi16 (vCeiling, vCeiling);
+ vScale1 = _mm_subs_epu16 (vScale, vTemp);
+ vScale2 = _mm_subs_epu16 (vScale, vScale1); /* vScale1 + vScale2 equals vScale */
+
+ /* scale all the vectors */
+ for (j = 0; j < iter; j++) {
+ /* load H and E */
+ vH = _mm_load_si128 (pvH + j);
+ vE = _mm_load_si128 (pvE + j);
+
+ /* subtract the scale from H and E in two signed saturating steps */
+ vH = _mm_subs_epi16 (vH, vScale1);
+ vH = _mm_subs_epi16 (vH, vScale2);
+ vE = _mm_subs_epi16 (vE, vScale1);
+ vE = _mm_subs_epi16 (vE, vScale2);
+
+ /* save the H and E */
+ _mm_store_si128 (pvH + j, vH);
+ _mm_store_si128 (pvE + j, vE);
+ }
+
+ vScale = vScaleAmt;
+ for (j = 0; j < position; ++j) {
+ vScale = _mm_slli_si128 (vScale, 2);
+ }
+
+ /* calculate the final scaling amount */
+ /* vTemp = _mm_xor_si128 (vTemp, vTemp); */
+ vTemp = _mm_setzero_si128(); /* transferred from Apple Devel fix for smith_waterman_sse2.c */
+ vScale1 = _mm_unpacklo_epi16 (vScale, vTemp);
+ vScale2 = _mm_unpackhi_epi16 (vScale, vTemp);
+ vScale = _mm_add_epi32 (vScale1, vScale2);
+ vTemp = _mm_srli_si128 (vScale, 8);
+ vScale = _mm_add_epi32 (vScale, vTemp);
+ vTemp = _mm_srli_si128 (vScale, 4);
+ vScale = _mm_add_epi32 (vScale, vTemp);
+ scale = (int) (unsigned short) _mm_extract_epi16 (vScale, 0);
+ temp = (int) (unsigned short) _mm_extract_epi16 (vScale, 1);
+ scale = scale + (temp << 16) + initScale; /* 32-bit lane sum rebuilt from its two halves, plus the accumulated hinit scaling */
+ }
+
+ /* scale the F value for the next round */
+ vFPrev = _mm_slli_si128 (vF, 2);
+ vFPrev = _mm_subs_epu16 (vFPrev, vScaleAmt);
+ vFPrev = _mm_xor_si128 (vFPrev, vNull);
+
+ vF = _mm_xor_si128 (vF, vNull); /* NOTE(review): vF is reassigned at the top of the next iteration before it is read — this store looks unused; confirm */
+
+ vH = _mm_load_si128 (pvH + offset);
+ vH = _mm_max_epi16 (vH, vFPrev);
+ for (j = 0; j < position; ++j) {
+ vH = _mm_slli_si128 (vH, 2);
+ }
+ score = (int) (signed short) _mm_extract_epi16 (vH, 7);
+ score = score + SHORT_BIAS;
+
+ /* convert this column's value to a score and keep the running maximum */
+ distance = (queryLength + i + 1) * gapExtend;
+ score = score - (gapOpen * 2) - distance + scale;
+ max = (max > score) ? max : score;
+
+ /* load and scale H for the next round */
+ hinit += gapExtend;
+ vH = _mm_load_si128 (pvH + iter - 1);
+ vH = _mm_slli_si128 (vH, 2);
+ vH = _mm_xor_si128 (vH, vNull);
+ vH = _mm_subs_epu16 (vH, vScaleAmt);
+ vH = _mm_xor_si128 (vH, vNull);
+ vH = _mm_insert_epi16 (vH, hinit, 0);
+ }
+
+ return max;
+}
+
+int /* 8-bit-lane alignment that scores the last query position in every database column and returns the best one, or OVERFLOW_SCORE */
+glocal_sse2_byte(int queryLength,
+ unsigned char *profile,
+ const unsigned char *dbSeq,
+ int dbLength,
+ unsigned short gapOpen,
+ unsigned short gapExtend,
+ unsigned short ceiling,
+ unsigned short bias,
+ struct f_struct *f_str)
+{
+ int i, j;
+
+ int max;
+ int score;
+ int scale;
+ int distance;
+ int initScale;
+
+ int offset;
+ int position;
+
+ int dup;
+ int cmp;
+ int iter;
+
+ __m128i *pvH;
+ __m128i *pvE;
+
+ __m128i vE, vF, vH;
+ __m128i vHInit;
+ __m128i vHNext;
+ __m128i vFPrev;
+
+ __m128i vBias;
+ __m128i vGapOpen;
+ __m128i vGapExtend;
+ __m128i vCeiling;
+
+ __m128i vScale;
+ __m128i vScaleAmt;
+ __m128i vScaleTmp;
+
+ __m128i vTemp;
+ __m128i vZero;
+ __m128i vNull;
+
+ __m128i *pvScore;
+
+ scale = 0;
+ initScale = 0;
+
+ max = 0x80000000; /* running best score; NOTE(review): presumably meant as the most negative int — the constant is out of range for int, INT_MIN would be explicit */
+ iter = (queryLength + 15) / 16; /* number of 16-lane vectors per column */
+ offset = (queryLength - 1) % iter; /* vector that holds the last query position */
+ position = 15 - (queryLength - 1) / iter; /* byte shifts needed to move that position's lane to byte 15 */
+
+ pvH = (__m128i *)f_str->workspace; /* assumes workspace is 16-byte aligned with room for 2 * iter vectors — TODO confirm */
+ pvE = pvH + iter;
+
+ /* Load the bias to all elements of a constant */
+ dup = (bias << 8) | (bias & 0x00ff);
+ vBias = _mm_setzero_si128(); /* transferred from Apple Devel smith_waterman_sse2.c fix */
+ vBias = _mm_insert_epi16 (vBias, dup, 0);
+ vBias = _mm_shufflelo_epi16 (vBias, 0);
+ vBias = _mm_shuffle_epi32 (vBias, 0);
+
+ /* Load gap opening penalty to all elements of a constant */
+ dup = (gapOpen << 8) | (gapOpen & 0x00ff);
+ vGapOpen = _mm_setzero_si128(); /* transferred from Apple Devel smith_waterman_sse2.c fix */
+ vGapOpen = _mm_insert_epi16 (vGapOpen, dup, 0);
+ vGapOpen = _mm_shufflelo_epi16 (vGapOpen, 0);
+ vGapOpen = _mm_shuffle_epi32 (vGapOpen, 0);
+
+ /* Load gap extension penalty to all elements of a constant */
+ dup = (gapExtend << 8) | (gapExtend & 0x00ff);
+ vGapExtend = _mm_setzero_si128(); /* transferred from Apple Devel smith_waterman_sse2.c fix */
+ vGapExtend = _mm_insert_epi16 (vGapExtend, dup, 0);
+ vGapExtend = _mm_shufflelo_epi16 (vGapExtend, 0);
+ vGapExtend = _mm_shuffle_epi32 (vGapExtend, 0);
+
+ /* Generate the ceiling before scaling */
+ dup = (ceiling << 8) | (ceiling & 0x00ff);
+ vTemp = _mm_setzero_si128(); /* transferred from Apple Devel smith_waterman_sse2.c fix */
+ vTemp = _mm_insert_epi16 (vTemp, dup, 0);
+ vTemp = _mm_shufflelo_epi16 (vTemp, 0);
+ vTemp = _mm_shuffle_epi32 (vTemp, 0);
+ vCeiling = _mm_cmpeq_epi8 (vTemp, vTemp);
+ vCeiling = _mm_subs_epu8 (vCeiling, vTemp);
+ vCeiling = _mm_subs_epu8 (vCeiling, vGapOpen); /* 255 - ceiling - gapOpen in every lane, saturating at 0 */
+
+ /* since we want to use the full range, zero is redefined as */
+ /* 2 * gapOpen. the lowest scaled score will be an insert followed */
+ /* by a delete. */
+ vHInit = _mm_adds_epu8 (vGapOpen, vGapOpen);
+ vHInit = _mm_srli_si128 (vHInit, 15); /* 2 * gapOpen (saturated) in byte 0, zero elsewhere */
+ vZero = vHInit;
+
+ vGapExtend = _mm_srli_si128 (vGapExtend, 15); /* keep gapExtend in byte 0 only */
+ /* vNull = _mm_xor_si128 (vNull, vNull); */
+ vNull = _mm_setzero_si128(); /* transferred from Apple Devel smith_waterman_sse2.c fix */
+ vScaleAmt = vNull;
+
+ /* Initialise the storage vectors: every H lane to gapOpen, every E lane to zero */
+ for (i = 0; i < iter; i++) {
+ _mm_store_si128 (pvH + i, vGapOpen);
+ _mm_store_si128 (pvE + i, vNull);
+ }
+
+ /* initialize F */
+ vF = vNull;
+ vFPrev = vNull;
+
+ /* build H for the first column: shift one lane and put the zero value in byte 0 */
+ vH = _mm_load_si128 (pvH + iter - 1);
+ vH = _mm_slli_si128 (vH, 1);
+ vH = _mm_or_si128 (vH, vZero);
+
+ for (i = 0; i < dbLength; ++i) {
+ /* fetch first data asap. */
+ pvScore = (__m128i *) profile + dbSeq[i] * iter; /* iter profile vectors per residue code; read as aligned __m128i */
+
+ vF = _mm_adds_epu8 (vHInit, vGapExtend); /* byte 0 becomes vHInit + gapExtend - gapOpen, other bytes zero */
+ vF = _mm_subs_epu8 (vF, vGapOpen);
+
+ vH = _mm_max_epu8 (vH, vFPrev);
+ for (j = 0; j < iter; j++) {
+ /* correct H from the previous columns F */
+ vHNext = _mm_load_si128 (pvH + j);
+ vHNext = _mm_max_epu8 (vHNext, vFPrev);
+
+ /* load and correct E value */
+ vE = _mm_load_si128 (pvE + j);
+ vTemp = _mm_subs_epu8 (vHNext, vGapOpen);
+ vE = _mm_max_epu8 (vE, vTemp);
+ _mm_store_si128 (pvE + j, vE);
+
+ /* add the profile score to vH, then subtract the bias */
+ vH = _mm_adds_epu8 (vH, *pvScore++);
+ vH = _mm_subs_epu8 (vH, vBias);
+
+ /* get max from vH, vE and vF */
+ vH = _mm_max_epu8 (vH, vE);
+ vH = _mm_max_epu8 (vH, vF);
+ _mm_store_si128 (pvH + j, vH);
+
+ /* update vF value */
+ vH = _mm_subs_epu8 (vH, vGapOpen);
+ vF = _mm_max_epu8 (vF, vH);
+
+ /* load the next h values */
+ vH = vHNext;
+ }
+
+ /* check if we need to scale before the next round */
+ vTemp = _mm_subs_epu8 (vCeiling, vF); /* zero in every lane where vF >= vCeiling */
+ vTemp = _mm_cmpeq_epi8 (vTemp, vNull);
+ cmp = _mm_movemask_epi8 (vTemp);
+
+ /* broadcast F values */
+ vTemp = _mm_slli_si128 (vF, 1);
+ vTemp = _mm_subs_epu8 (vTemp, vScaleAmt);
+ vF = _mm_max_epu8 (vF, vTemp);
+
+ vScaleTmp = _mm_slli_si128 (vScaleAmt, 1);
+ vScaleTmp = _mm_adds_epu8 (vScaleTmp, vScaleAmt);
+ vTemp = _mm_slli_si128 (vF, 2);
+ vTemp = _mm_subs_epu8 (vTemp, vScaleTmp);
+ vF = _mm_max_epu8 (vF, vTemp);
+
+ vTemp = _mm_slli_si128 (vScaleTmp, 2);
+ vScaleTmp = _mm_adds_epu8 (vScaleTmp, vTemp);
+ vTemp = _mm_slli_si128 (vF, 4);
+ vTemp = _mm_subs_epu8 (vTemp, vScaleTmp);
+ vF = _mm_max_epu8 (vF, vTemp);
+
+ vTemp = _mm_slli_si128 (vScaleTmp, 4);
+ vScaleTmp = _mm_adds_epu8 (vScaleTmp, vTemp);
+ vTemp = _mm_slli_si128 (vF, 8);
+ vTemp = _mm_subs_epu8 (vTemp, vScaleTmp);
+ vF = _mm_max_epu8 (vF, vTemp);
+
+ /* scale if necessary */
+ if (cmp != 0x0000) {
+ vHInit = _mm_subs_epu8 (vHInit, vGapOpen);
+ vHInit = _mm_subs_epu8 (vHInit, vGapOpen);
+ scale = _mm_extract_epi16 (vHInit, 0); /* how far vHInit has moved above 2 * gapOpen */
+ initScale = initScale + scale;
+
+ vScale = _mm_slli_si128 (vF, 1);
+ vScale = _mm_subs_epu8 (vScale, vGapOpen);
+ vScale = _mm_subs_epu8 (vScale, vScaleAmt);
+ vScale = _mm_or_si128 (vScale, vHInit);
+
+ vTemp = _mm_slli_si128 (vScale, 1);
+ vTemp = _mm_subs_epu8 (vScale, vTemp);
+ vScaleAmt = _mm_adds_epu8 (vScaleAmt, vTemp);
+ vTemp = _mm_slli_si128 (vScale, 1);
+ vTemp = _mm_subs_epu8 (vTemp, vScale);
+ vScaleAmt = _mm_subs_epu8 (vScaleAmt, vTemp);
+ vScaleAmt = _mm_subs_epu8 (vScaleAmt, vHInit);
+
+ /* rescale the previous F */
+ vF = _mm_subs_epu8 (vF, vScale);
+
+ /* rescale the initial H value */
+ vHInit = vZero;
+
+ /* check if we can continue in 8-bits */
+ vTemp = _mm_subs_epu8 (vCeiling, vF);
+ vTemp = _mm_cmpeq_epi8 (vTemp, vNull);
+ cmp = _mm_movemask_epi8 (vTemp);
+ if (cmp != 0x0000) {
+ return OVERFLOW_SCORE;
+ }
+
+ /* scale all the vectors */
+ for (j = 0; j < iter; j++) {
+ /* load H and E */
+ vH = _mm_load_si128 (pvH + j);
+ vE = _mm_load_si128 (pvE + j);
+
+ /* subtract the scale from H and E, saturating at zero */
+ vH = _mm_subs_epu8 (vH, vScale);
+ vE = _mm_subs_epu8 (vE, vScale);
+
+ /* save the H and E */
+ _mm_store_si128 (pvH + j, vH);
+ _mm_store_si128 (pvE + j, vE);
+ }
+
+ /* calculate the final scaling amount: shift by position, then sum the remaining lanes */
+ vScale = vScaleAmt;
+ for (j = 0; j < position; ++j) {
+ vScale = _mm_slli_si128 (vScale, 1);
+ }
+ vTemp = _mm_unpacklo_epi8 (vScale, vNull);
+ vScale = _mm_unpackhi_epi8 (vScale, vNull);
+ vScale = _mm_adds_epi16 (vScale, vTemp);
+ vTemp = _mm_srli_si128 (vScale, 8);
+ vScale = _mm_adds_epi16 (vScale, vTemp);
+ vTemp = _mm_srli_si128 (vScale, 4);
+ vScale = _mm_adds_epi16 (vScale, vTemp);
+ vTemp = _mm_srli_si128 (vScale, 2);
+ vScale = _mm_adds_epi16 (vScale, vTemp);
+ scale = (int) _mm_extract_epi16 (vScale, 0);
+ scale = scale + initScale;
+ }
+
+ /* scale the F value for the next round */
+ vFPrev = _mm_slli_si128 (vF, 1);
+ vFPrev = _mm_subs_epu8 (vFPrev, vScaleAmt);
+
+ /* calculate the max glocal score for this column; NOTE(review): glocal_sse2_word takes the max with vFPrev at this step — confirm vF is intended */
+ vH = _mm_load_si128 (pvH + offset);
+ vH = _mm_max_epu8 (vH, vF);
+ for (j = 0; j < position; ++j) {
+ vH = _mm_slli_si128 (vH, 1);
+ }
+ score = (int) (unsigned short) _mm_extract_epi16 (vH, 7);
+ score >>= 8; /* keep byte 15, the high byte of word 7 */
+
+ /* convert this column's value to a score and keep the running maximum */
+ distance = (queryLength + i + 1) * gapExtend;
+ score = score - (gapOpen * 2) - distance + scale;
+ max = (max > score) ? max : score;
+
+ /* load and scale H for the next round */
+ vHInit = _mm_adds_epu8 (vHInit, vGapExtend);
+ vH = _mm_load_si128 (pvH + iter - 1);
+ vH = _mm_slli_si128 (vH, 1);
+ vH = _mm_subs_epu8 (vH, vScaleAmt);
+ vH = _mm_or_si128 (vH, vHInit);
+ }
+
+ return max;
+}
+#else
+
+/* No SSE2 support. Avoid compiler complaints about empty object */
+
+int nw_dummy; /* NOTE(review): global_sse2.c defines a global with this same name in its own non-SSE2 branch; linking both objects built without SW_SSE2 may give a duplicate symbol (e.g. with -fno-common) — confirm */
+
+#endif
+
diff --git a/src/glocal_sse2.h b/src/glocal_sse2.h
new file mode 100644
index 0000000..2f77642
--- /dev/null
+++ b/src/glocal_sse2.h
@@ -0,0 +1,41 @@
+/******************************************************************
+ Copyright 2010 by Michael Farrar. All rights reserved.
+ This program may not be sold or incorporated into a commercial product,
+ in whole or in part, without written consent of Michael Farrar. For
+ further information regarding permission for use or reproduction, please
+ contact: Michael Farrar at farrar.michael at gmail.com.
+*******************************************************************/
+
+/*
+ Written by Michael Farrar, 2010.
+ Please send bug reports and/or suggestions to farrar.michael at gmail.com.
+*/
+
+#ifndef INCLUDE_GLOCAL_SSE2_H
+#define INCLUDE_GLOCAL_SSE2_H
+
+#define SHORT_BIAS 32768 /* glocal_sse2_word starts H at gapOpen * 2 - SHORT_BIAS and adds SHORT_BIAS back when it extracts a score */
+#define OVERFLOW_SCORE 0x7f000000 /* returned by the glocal routines when F is still above the ceiling after rescaling */
+
+int /* returns the best score of the last query position over all database columns, or OVERFLOW_SCORE */
+glocal_sse2_word(int queryLength, /* sets the vector count: (queryLength + 7) / 8 */
+ unsigned short *profile, /* query profile, read as (vector count) __m128i values per residue code */
+ const unsigned char *dbSeq, /* database residues, used as indices into profile */
+ int dbLength, /* number of entries of dbSeq that are processed */
+ unsigned short gapOpen, /* subtracted for every gap step; 2 * gapOpen is removed from each column score */
+ unsigned short gapExtend, /* (queryLength + column) * gapExtend is removed from each column score */
+ unsigned short ceiling, /* headroom: rescaling starts once F exceeds 0x7fff - ceiling - gapOpen */
+ struct f_struct *f_str); /* f_str->workspace holds the H and E vectors */
+
+int /* returns the best score of the last query position over all database columns, or OVERFLOW_SCORE */
+glocal_sse2_byte(int queryLength, /* sets the vector count: (queryLength + 15) / 16 */
+ unsigned char *profile, /* query profile, read as (vector count) __m128i values per residue code */
+ const unsigned char *dbSeq, /* database residues, used as indices into profile */
+ int dbLength, /* number of entries of dbSeq that are processed */
+ unsigned short gapOpen, /* subtracted for every gap step; 2 * gapOpen is removed from each column score */
+ unsigned short gapExtend, /* (queryLength + column) * gapExtend is removed from each column score */
+ unsigned short ceiling, /* headroom: rescaling starts once F reaches 255 - ceiling - gapOpen */
+ unsigned short bias, /* subtracted after each profile score is added */
+ struct f_struct *f_str); /* f_str->workspace holds the H and E vectors */
+
+#endif /* INCLUDE_GLOCAL_SSE2_H */
diff --git a/src/h_altlib.h b/src/h_altlib.h
new file mode 100644
index 0000000..3f95c0e
--- /dev/null
+++ b/src/h_altlib.h
@@ -0,0 +1,28 @@
+
+/* $Id: h_altlib.h 625 2011-03-23 17:21:38Z wrp $ */
+/* $Revision: 625 $ */
+
+#define LASTENTRY 10
+#define LASTLIB 10 /* number of slots in h_getliba[] and h_getntliba[] below */
+#define BINARYGB 9 /* presumably this and the following values are library-format codes used as table indices — confirm against callers */
+#define DEFAULT 0
+#define FULLGB 1
+#define UNIXPIR 2
+#define EMBLSWISS 3
+#define INTELLIG 4
+#define VMSPIR 5 /* slot 5 of h_getliba[] is the only one set to vgetlib_h */
+
+int agetlib_h(); /* pearson fasta format; declared without a parameter list */
+int agetntlib_h(); /* pearson fasta format nucleotides; declared without a parameter list */
+int vgetlib_h(); /* PIR VMS format; declared without a parameter list */
+
+int (*h_getliba[LASTLIB])()={ /* reader table: every slot is agetlib_h except slot 5, which is vgetlib_h; NOTE(review): this is a definition inside a header, so each file that includes it gets its own copy — confirm it is included only once */
+ agetlib_h,agetlib_h,agetlib_h,agetlib_h,
+ agetlib_h,vgetlib_h,agetlib_h,agetlib_h,
+ agetlib_h,agetlib_h};
+
+int (*h_getntliba[LASTLIB])()={ /* nucleotide reader table: all ten slots are agetntlib_h; NOTE(review): this is a definition inside a header, so each file that includes it gets its own copy — confirm it is included only once */
+ agetntlib_h,agetntlib_h,agetntlib_h,agetntlib_h,
+ agetntlib_h,agetntlib_h,agetntlib_h,agetntlib_h,
+ agetntlib_h,agetntlib_h};
+
diff --git a/src/htime.c b/src/htime.c
new file mode 100644
index 0000000..c873493
--- /dev/null
+++ b/src/htime.c
@@ -0,0 +1,43 @@
+/* Concurrent read version */
+
+/* $Id: htime.c 867 2011-10-30 14:44:04Z wrp $ */
+/* $Revision: 867 $ */
+
+#include <stdio.h>
+#include <time.h>
+
+#ifdef UNIX
+#include <sys/types.h>
+#include <sys/time.h>
+#ifdef TIMES
+#include <sys/times.h>
+#else
+#undef TIMES
+#endif
+#endif
+
+#ifndef HZ
+#define HZ 100
+#endif
+
+long s_time () /* returns a time stamp in milliseconds: time() seconds * 1000 without TIMES, user CPU time from times() with TIMES */
+{
+#ifndef TIMES
+ time_t time(), tt;
+ return time(&tt)*1000; /* NOTE(review): the product may not fit in the long return type where long is 32 bits — confirm */
+#else
+ struct tms tt;
+ times(&tt);
+#ifdef CLK_TCK
+ return tt.tms_utime*1000/CLK_TCK; /* user CPU ticks converted to milliseconds */
+#else
+ return tt.tms_utime*1000/HZ; /* HZ is set to 100 above when nothing else defines it */
+#endif
+#endif
+}
+
+void ptime (FILE *fp, long time) /* prints a millisecond count to fp as seconds with three decimals */
+{
+ fprintf (fp, "%6.3f",(double)(time)/1000.0); /* the parameter named time hides the library function of that name inside this body */
+}
+
diff --git a/src/initfa.c b/src/initfa.c
new file mode 100644
index 0000000..1f11bf6
--- /dev/null
+++ b/src/initfa.c
@@ -0,0 +1,3183 @@
+/* initfa.c */
+/* $Id: initfa.c 1274 2014-08-07 18:30:56Z wrp $ */
+
+/* copyright (c) 1996, 1997, 1998, 2014 by William R. Pearson and The
+ Rector & Visitors of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+/* init??.c files provide function specific initializations */
+
+/* h_init() - called from comp_lib.c, comp_thr.c to initialize pstruct ppst
+ which includes the alphabet, and pam matrix
+
+ alloc_pam() - allocate pam matrix space
+ init_pam2() - convert from 1D to 2D pam
+
+ init_pamx() - extend pam2 for 'X', 'N', or lower case characters
+
+ f_initenv() - set up mngmsg and pstruct defaults
+ f_getopt() - read fasta specific command line options
+ f_getarg() - read ktup
+
+ resetp() - reset the parameters, scoring matrix for DNA-DNA/DNA-prot
+
+ query_parm() - ask for ktup
+ last_init() - some things must be done last
+
+ f_initpam() - set some parameters based on the pam matrix
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <string.h>
+#include <math.h>
+
+#ifdef UNIX
+#include <sys/types.h>
+#include <sys/stat.h>
+#endif
+
+#include "defs.h"
+#include "structs.h"
+#include "param.h"
+#include "best_stats.h"
+
+#define XTERNAL
+#include "upam.h"
+#include "uascii.h"
+#undef XTERNAL
+
+#define MAXWINDOW 32
+
+int initpam(char *, struct pstruct *);
+void init_pam2 (struct pstruct *ppst);
+void init_altpam(struct pstruct *ppst);
+void init_pamx (struct pstruct *ppst);
+void extend_pssm(unsigned char *aa0, int n0, struct pstruct *ppst);
+void build_xascii(int *qascii, char *save_str);
+void add_ascii_ann(int *qascii, unsigned char *ann_arr);
+void re_ascii(int *qascii, int *pascii, int max_ann_arr);
+extern int my_nrand(int, void *);
+
+/* at some point, all the defaults should be driven from this table */
+/*
+#pgm q_seq l_seq p_seq matrix g_open g_ext fr_shft e_cut ktup E_band_opt
+# -n/-p -s -e -f -h/-j -E argv[3]
+fasta prot(0) prot(0) prot(0) bl50 -10 -2 - 10.0 2 0.02
+fasta dna(1) dna(1) dna(1) +5/-4 -12 -4 - 2.0 6 0.01
+ssearch prot(0) prot(0) prot(0) bl50 -10 -2 - 10.0 - -
+ssearch dna(1) dna(1) dna(1) +5/-4 -16 -4 - 2.0 - -
+fastx dna(1) prot(0) prot(0) BL50 -12 -2 -20 5.0 2 0.02
+fasty dna(1) prot(0) prot(0) BL50 -12 -2 -20/-24 5.0 2 0.02
+tfastx dna(1) prot(0) prot(0) BL50 -14 -2 -20 5.0 2 0.01
+tfasty dna(1) prot(0) prot(0) BL50 -14 -2 -20/-24 5.0 2 0.01
+fasts prot(0) prot(0) prot(0) MD20-MS - - - 5.0 - -
+fasts dna(1) dna(1) dna(1) +2/-4 - - - 5.0 1 -
+tfasts prot(0) dna(1) prot(0) MD10-MS - - - 2.0 1 -
+fastf prot(0) prot(0) prot(0) MD20 - - - 2.0 1 -
+tfastf prot(0) dna(1) prot(0) MD10 - - - 1.0 1 -
+fastm prot(0) prot(0) prot(0) MD20 - - - 5.0 1 -
+fastm dna(1) dna(1) dna(1) +2/-4 - - - 2.0 1 -
+tfastm prot(0) dna(1) prot(0) MD10 - - - 2.0 1 -
+lalign prot(0) prot(0) prot(0) BL50 -12 -2 - 1.0 - -
+lalign dna(1) dna(1) dna(1) +5/-4 -12 -4 - 0.1 - -
+*/
+
+void show_help(char *, int );
+
+char *ref_str_a[]={ /* citation strings; get_pgm_id() stores one of these,
+   chosen by index, in pgm_def_arr[].ref_str.  The final "" entry is index 6
+   and is what GLSEARCH and GGSEARCH receive. */
+/* 0 */ "W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448\n",
+/* 1 */ "T. F. Smith and M. S. Waterman, (1981) J. Mol. Biol. 147:195-197; \n W.R. Pearson (1991) Genomics 11:635-650\n",
+/* 2 */ "Pearson et al, Genomics (1997) 46:24-36\n",
+/* 3 */ "Mackey et al. Mol. Cell. Proteomics (2002) 1:139-147\n",
+/* 4 */ "W.R. Pearson (1996) Meth. Enzymol. 266:227-258\n",
+/* 5 */ "X. Huang and W. Miller (1991) Adv. Appl. Math. 12:373-381\n",
+ "" /* 6 - empty citation */
+};
+
+#define FA_PID 1
+#define SS_PID 2
+#define FX_PID 3
+#define FY_PID 4
+#define FS_PID 5
+#define FF_PID 6
+#define FM_PID 7
+#define RSS_PID 8
+#define RFX_PID 9
+#define SSS_PID 10 /* old (slow) non-PG Smith-Waterman */
+#define TFA_PID FA_PID+10
+#define TFX_PID FX_PID+10
+#define TFY_PID FY_PID+10
+#define TFS_PID FS_PID+10
+#define TFF_PID FF_PID+10
+#define TFM_PID FM_PID+10
+#define LAL_PID 18
+#define LNW_PID 19
+#define GNW_PID 20
+
+struct pgm_def_str { /* per-program defaults; one entry per program id in
+   pgm_def_arr[] */
+ int pgm_id; /* must equal the entry's index; checked in h_init() */
+ char *prog_func; /* program name; h_init() copies it to the global prog_func */
+ char *info_pgm_abbr; /* abbreviation; h_init() copies it to its info_pgm_abbr argument */
+ char *iprompt0; /* one-line description; h_init() copies it to the global iprompt0 */
+ char *ref_str; /* citation; NULL in the table, set by get_pgm_id() */
+ int PgmDID; /* not referenced in the code near this definition */
+ char *smstr; /* default scoring-matrix name; h_init() copies it to ppst->pamfile */
+ int g_open_mod; /* added to ppst->gdelval in f_initenv() */
+ int g_ext_mod; /* added to ppst->ggapval in f_initenv() */
+ int gshift; /* copied to ppst->gshift in f_initenv() (FASTX/FASTY builds) */
+ int hshift; /* copied to ppst->gsubs in f_initenv() (FASTX/FASTY builds) */
+ double e_cut; /* copied to ppst->e_cut and m_msp->e_cut in f_initenv() */
+ int ktup; /* > 0 makes f_initenv() initialize ppst->param_u.fa */
+ double E_band_opt; /* copied to ppst->param_u.fa.E_band_opt in f_initenv() */
+ int can_pre_align; /* copied to ppst->can_pre_align in h_init() */
+};
+
+static struct pgm_def_str
+pgm_def_arr[21] = { /* indexed by program id (the *_PID values); h_init()
+   exits if an entry's pgm_id differs from its index.  ref_str is NULL
+   here and is filled in by get_pgm_id().  Entries that list only 14
+   values leave can_pre_align at 0. */
+ {0, "", "", "", NULL, 400, "", 0, 0, 0, 0, 1.0, 0, 0 }, /* 0 */
+ {FA_PID, "FASTA", "fa",
+ "FASTA searches a protein or DNA sequence data bank",
+ NULL, 401, "BL50", 0, 0, 0, 0, 10.0, 2, 0.2, 1}, /* 1 - FASTA */
+ {SS_PID, "SSEARCH","gsw","SSEARCH performs a Smith-Waterman search",
+ NULL, 404, "BL50", 0, 0, 0, 0, 10.0, 0, 0.0, 1}, /* 2 - SSEARCH */
+ {FX_PID, "FASTX","fx",
+ "FASTX compares a DNA sequence to a protein sequence data bank",
+ NULL, 405, "BL50", -2, 0, -20, 0, 5.0, 2, 0.10, 1}, /* 3 - FASTX */
+ {FY_PID, "FASTY", "fy",
+ "FASTY compares a DNA sequence to a protein sequence data bank",
+ NULL, 405, "BL50", -2, 0, -20, -24, 5.0, 2, 0.10, 1}, /* 4 - FASTY */
+ {FS_PID, "FASTS", "fs",
+ "FASTS compares linked peptides to a protein data bank",
+ NULL, 400, "MD20-MS", 0, 0, 0, 0, 5.0, 1, 0.0, 0}, /* 5 - FASTS */
+ {FF_PID, "FASTF", "ff",
+ "FASTF compares mixed peptides to a protein databank",
+ NULL, 400, "MD20", 0, 0, 0, 0, 2.0, 1, 0.0, 0 }, /* 6 - FASTF */
+ {FM_PID, "FASTM", "fm",
+ "FASTM compares ordered peptides to a protein data bank",
+ NULL, 400, "MD20", 0, 0, 0, 0, 5.0, 1, 0.0, 0 }, /* 7 - FASTM */
+ {RSS_PID, "PRSS", "rss",
+ "PRSS evaluates statistical signficance using Smith-Waterman",
+ NULL, 401, "BL50", 0, 0, 0, 0, 1000.0, 0, 0.0, 1 }, /* 8 - PRSS */
+ {RFX_PID,"PRFX", "rfx",
+ "PRFX evaluates statistical signficance using FASTX",
+ NULL, 401, "BL50", -2, 0, -20, -24, 1000.0, 2, 0.2, 1 }, /* 9 - PRFX */
+ {SSS_PID, "OSEARCH","ssw","OSEARCH searches a sequence data bank",
+ NULL, 404, "BL50", 0, 0, 0, 0, 10.0, 0, 0.0, 1}, /* 10 - OSEARCH */
+ {TFA_PID, "TFASTA", "tfa",
+ "TFASTA compares a protein to a translated DNA data bank",
+ NULL, 402, "BL50", -2, 0, 0, 0, 5.0, 2, 0.1, 1}, /* 11 - TFASTA */
+ {0, "", "", "", NULL, 400, "", 0, 0, 0, 0, 1.0, 0, 0.0 }, /* 12 - placeholder, no program uses this id */
+ {TFX_PID, "TFASTX", "tfx",
+ "TFASTX compares a protein to a translated DNA data bank",
+ NULL, 406, "BL50", -2, 0, -20, 0, 2.0, 2, 0.10, 1}, /* 13 - TFASTX */
+ {TFY_PID, "TFASTY", "tfy",
+ "TFASTY compares a protein to a translated DNA data bank",
+ NULL, 406, "BL50", -2, 0, -20, -24, 2.0, 2, 0.10, 1}, /* 14 - TFASTY */
+ {TFS_PID, "TFASTS", "tfs",
+ "TFASTS compares linked peptides to a translated DNA data bank",
+ NULL, 400, "MD10-MS", 0, 0, 0, 0, 2.0, 2, 0.0, 0 }, /* 15 - TFASTS */
+ {TFF_PID, "TFASTF", "tff",
+ "TFASTF compares mixed peptides to a protein databank",
+ NULL, 400, "MD10", 0, 0, 0, 0, 1.0, 1, 0.0, 0 }, /* 16 - TFASTF */
+ {TFM_PID, "TFASTM", "tfm",
+ "TFASTM compares ordered peptides to a translated DNA databank",
+ NULL, 400, "MD10", 0, 0, 0, 0, 1.0, 1, 0.0, 0 }, /* 17 - TFASTM */
+ {LAL_PID, "LALIGN", "lal",
+ "LALIGN finds non-overlapping local alignments",
+ NULL, 404, "BL50", -2, 0, 0, 0, 1.0, 0, 0.0, 1}, /* 18 - LALIGN */
+ {LNW_PID, "GLSEARCH", "lnw",
+ "GLSEARCH performs a global-query/local-library search",
+ NULL, 404, "BL50", -2, 0, 0, 0, 10.0, 0, 0.0, 1}, /* 19 - GLSEARCH */
+ {GNW_PID, "GGSEARCH", "gnw",
+ "GGSEARCH performs a global/global database searches",
+ NULL, 404, "BL50", 0, 0, 0, 0, 10.0, 0, 0.0, 1}, /* 20 - GGSEARCH */
+};
+
+struct msg_def_str { /* per-program message/display defaults; one entry per
+   program id in msg_def_arr[] */
+ int pgm_id; /* must equal the entry's index; checked in h_init() */
+ int q_seqt; /* query sequence type; SEQT_DNA makes h_init() load nascii into qascii */
+ int l_seqt; /* not referenced in the code near this definition */
+ int p_seqt; /* not referenced in the code near this definition */
+ int sw_flag; /* copied to ppst->sw_flag in f_initenv() */
+ int stages; /* copied to m_msp->stages in f_initenv() */
+ int qframe; /* copied to m_msp->qframe in f_initenv() */
+ int nframe; /* copied to m_msp->nframe in f_initenv() */
+ int nrelv, srelv, arelv; /* copied to the same-named m_msp fields in f_initenv() */
+ char *f_id0, *f_id1, *label, *alabel; /* copied to the same-named m_msp
+   fields in f_initenv(); alabel is assigned by get_pgm_id() */
+};
+
+/* align_label must be < MAX_SSTR (32)
+   get_pgm_id() stores one of these strings, chosen by index, in
+   msg_def_arr[].alabel */
+char *align_label[]={
+ "Smith-Waterman", /* 0 */
+ "banded Smith-Waterman", /* 1 */
+ "Waterman-Eggert", /* 2 */
+ "trans. Smith-Waterman", /* 3 */
+ "global/local", /* 4 */
+ "trans. global/local", /* 5 */
+ "global/global (N-W)" /* 6 */
+};
+
+/* pgm_id q_seqt l_seqt p_seqt sw_f st qf nf nrv srv arv f_id0 f_id1 label
+
+   Indexed by program id; h_init() exits if an entry's pgm_id differs from
+   its index.  Each initializer supplies only three strings, so the fourth
+   string member (alabel) starts out NULL and is assigned by get_pgm_id(). */
+static struct msg_def_str
+msg_def_arr[21] = {
+ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, "", "", ""}, /* ID=0 */
+ {FA_PID, SEQT_UNK, SEQT_PROT, SEQT_PROT, 1, 1, 1, -1, 3, 1, 3,
+ "fa","sw", "opt"},
+ {SS_PID, SEQT_UNK, SEQT_PROT, SEQT_PROT, 1, 1, 1, -1, 1, 1, 1,
+ "sw","sw", "s-w"},
+ {FX_PID, SEQT_DNA, SEQT_PROT, SEQT_PROT, 1, 1, 2, -1, 3, 1, 3,
+ "fx","sx", "opt"},
+ {FY_PID, SEQT_DNA, SEQT_PROT, SEQT_PROT, 1, 1, 2, -1, 3, 1, 3,
+ "fy","sy", "opt"},
+ {FS_PID, SEQT_UNK, SEQT_PROT, SEQT_PROT, 1, 1, 1, -1, 3, 2, 3,
+ "fs","fs", "initn init1"},
+ {FF_PID, SEQT_PROT,SEQT_PROT, SEQT_PROT, 1, 1, 1, -1, 3, 2, 3,
+ "ff","ff", "initn init1"},
+ {FM_PID, SEQT_UNK,SEQT_PROT, SEQT_PROT, 1, 1, 1, -1, 3, 2, 3,
+ "fm","fm","initn init1"},
+ {RSS_PID, SEQT_UNK,SEQT_PROT, SEQT_PROT, 0, 1, 1, -1, 1, 1, 1,
+ "rss","sw","s-w"},
+ {RFX_PID, SEQT_DNA,SEQT_PROT, SEQT_PROT, 0, 1, 2, -1, 3, 1, 3,
+ "rfx","sx","opt"},
+ {SSS_PID, SEQT_UNK,SEQT_PROT, SEQT_PROT, 1, 1, 1, -1, 1, 1, 1,
+ "sw","sw", "s-w"},
+ {TFA_PID, SEQT_PROT,SEQT_DNA, SEQT_PROT, 0, 1, 1, 6, 3, 1, 3,
+ "tfa","fa","initn init1"},
+ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, "", "", ""}, /* ID=12 */
+ {TFX_PID, SEQT_PROT,SEQT_DNA, SEQT_PROT, 1, 1, 1, 2, 3, 2, 3,
+ "tfx","sx","initn opt"},
+ {TFY_PID, SEQT_PROT,SEQT_DNA, SEQT_PROT, 1, 1, 1, 2, 3, 2, 3,
+ "tfy","sy","initn opt"},
+ {TFS_PID, SEQT_PROT,SEQT_DNA, SEQT_PROT, 1, 1, 1, 6, 3, 2, 3,
+ "tfs","fs","initn init1"},
+ {TFF_PID, SEQT_PROT,SEQT_DNA, SEQT_PROT, 1, 1, 1, 6, 3, 2, 3,
+ "tff","ff","initn init1"},
+ {TFM_PID, SEQT_PROT,SEQT_DNA, SEQT_PROT, 1, 1, 1, 6, 3, 2, 3,
+ "tfm","fm","initn init1"},
+ {LAL_PID, SEQT_UNK, SEQT_PROT, SEQT_PROT, 1, 1, 1, -1, 1, 1, 1,
+ "lsw","lsw", "ls-w"},
+ {LNW_PID, SEQT_UNK, SEQT_PROT, SEQT_PROT, 1, 1, 1, -1, 1, 1, 1,
+ "gnw","gnw", "n-w"},
+ {GNW_PID, SEQT_UNK, SEQT_PROT, SEQT_PROT, 1, 1, 1, -1, 1, 1, 1,
+ "gnw","gnw", "n-w"},
+};
+
+/*
+ * get_pgm_id(): returns the program id selected at compile time.
+ *
+ * The id is chosen by whichever of FASTA, FASTX, FASTY, FASTS, FASTF,
+ * FASTM, SSEARCH, OSEARCH, LALIGN, GLSEARCH or GGSEARCH is defined, with
+ * TFAST and PRSS selecting the translated / shuffling variants.  Returns 0
+ * when none of them is defined.
+ *
+ * Side effects: stores the program's citation in
+ * pgm_def_arr[id].ref_str and its alignment label in
+ * msg_def_arr[id].alabel.  The SSEARCH, LALIGN, GLSEARCH and GGSEARCH
+ * sections also #define CAN_PSSM, which later code in this file tests.
+ */
+int
+get_pgm_id() {
+
+  int rval=0;
+
+#ifdef FASTA
+#ifndef TFAST
+  pgm_def_arr[FA_PID].ref_str = ref_str_a[0];
+  msg_def_arr[FA_PID].alabel = align_label[0];
+  rval=FA_PID;
+#else
+  pgm_def_arr[TFA_PID].ref_str = ref_str_a[0];
+  msg_def_arr[TFA_PID].alabel = align_label[2];
+  rval=TFA_PID;
+#endif
+#endif
+
+#ifdef FASTX
+#ifndef TFAST
+#ifndef PRSS
+  pgm_def_arr[FX_PID].ref_str = ref_str_a[2];
+  msg_def_arr[FX_PID].alabel = align_label[3];
+  rval=FX_PID;
+#else
+  pgm_def_arr[RFX_PID].ref_str = ref_str_a[2];
+  /* The label must go into the RFX_PID entry, the id returned here.
+     This previously wrote to msg_def_arr[FX_PID], which left
+     msg_def_arr[RFX_PID].alabel NULL for f_initenv() to copy from. */
+  msg_def_arr[RFX_PID].alabel = align_label[3];
+  rval=RFX_PID;
+#endif
+#else
+  pgm_def_arr[TFX_PID].ref_str = ref_str_a[2];
+  msg_def_arr[TFX_PID].alabel = align_label[3];
+  rval=TFX_PID;
+#endif
+#endif
+
+#ifdef FASTY
+#ifndef TFAST
+  pgm_def_arr[FY_PID].ref_str = ref_str_a[2];
+  msg_def_arr[FY_PID].alabel = align_label[3];
+  rval=FY_PID;
+#else
+  pgm_def_arr[TFY_PID].ref_str = ref_str_a[2];
+  msg_def_arr[TFY_PID].alabel = align_label[3];
+  rval=TFY_PID;
+#endif
+#endif
+
+#ifdef FASTS
+#ifndef TFAST
+  pgm_def_arr[FS_PID].ref_str = ref_str_a[3];
+  msg_def_arr[FS_PID].alabel = align_label[4];
+  rval=FS_PID;
+#else
+  pgm_def_arr[TFS_PID].ref_str = ref_str_a[3];
+  msg_def_arr[TFS_PID].alabel = align_label[5];
+  rval=TFS_PID;
+#endif
+#endif
+
+#ifdef FASTF
+#ifndef TFAST
+  pgm_def_arr[FF_PID].ref_str = ref_str_a[3];
+  msg_def_arr[FF_PID].alabel = align_label[4];
+  rval=FF_PID;
+#else
+  pgm_def_arr[TFF_PID].ref_str = ref_str_a[3];
+  msg_def_arr[TFF_PID].alabel = align_label[5];
+  rval=TFF_PID;
+#endif
+#endif
+
+#ifdef FASTM
+#ifndef TFAST
+  pgm_def_arr[FM_PID].ref_str = ref_str_a[3];
+  msg_def_arr[FM_PID].alabel = align_label[4];
+  rval=FM_PID;
+#else
+  pgm_def_arr[TFM_PID].ref_str = ref_str_a[3];
+  msg_def_arr[TFM_PID].alabel = align_label[5];
+  rval=TFM_PID;
+#endif
+#endif
+
+#ifdef SSEARCH
+#define CAN_PSSM
+  pgm_def_arr[SS_PID].ref_str = ref_str_a[1];
+  msg_def_arr[SS_PID].alabel = align_label[0];
+  rval=SS_PID;
+#endif
+
+#ifdef OSEARCH
+  pgm_def_arr[SSS_PID].ref_str = ref_str_a[1];
+  msg_def_arr[SSS_PID].alabel = align_label[0];
+  rval=SSS_PID;
+#endif
+
+#ifdef LALIGN
+#define CAN_PSSM
+  pgm_def_arr[LAL_PID].ref_str = ref_str_a[5];
+  msg_def_arr[LAL_PID].alabel = align_label[2];
+  rval=LAL_PID;
+#endif
+
+  /* ref_str_a[6] is the empty string that ends the citation table */
+#ifdef GLSEARCH
+#define CAN_PSSM
+  pgm_def_arr[LNW_PID].ref_str = ref_str_a[6];
+  msg_def_arr[LNW_PID].alabel = align_label[4];
+  rval=LNW_PID;
+#endif
+
+#ifdef GGSEARCH
+#define CAN_PSSM
+  pgm_def_arr[GNW_PID].ref_str = ref_str_a[6];
+  msg_def_arr[GNW_PID].alabel = align_label[6];
+  rval=GNW_PID;
+#endif
+
+  return rval;
+}
+
+extern struct opt_def_str g_options[];
+extern void set_opt_disp_defs(char opt_char, struct opt_def_str *options, int type,
+ int i_param1, int i_param2,
+ double d_param1, double d_param2, char *s_param);
+
+static char z_opt_descr[] = "Statistics estimation method:\n 1 - regression; -1 - no stats.; 0 - no scaling; 2 - Maximum Likelihood Est.;\n 3 - Altschul/Gish; 4 - iter. regress.; 5 - regress w/variance;\n 6 - MLE with comp. adj.;\n 11 - 16 - estimates from shuffled library sequences;\n 21 - 26 - E2()-stats from shuffled high-scoring sequences;"; /* long description referenced by the 'z' entry of f_options[] */
+
+static char s_opt_descr[] = "Scoring matrix: (protein)\n BL50, BP62 (sets -f -11 -g -1); P250, OPT5, VT200,\n VT160, P120, VT120, BL80, VT80, MD40, VT40, MD20, VT20, MD10, VT10;\n scoring matrix file name; -s ?BL50 adjusts matrix for short queries;"; /* long description referenced by the 's' entry of f_options[] */
+
+
+struct opt_def_str f_options[] = { /* option table for this program: each
+   entry starts with the option character, followed by an int that is 1
+   for the options described as taking a value, then a long name and a
+   short description.  Which entries exist is decided at compile time by
+   the program macros (FASTA, FASTX, LALIGN, ...); CAN_PSSM is #defined
+   inside get_pgm_id() above.  The entry whose option character is '\0'
+   ends the table.  The numeric fields are zero here; f_init_opts() passes
+   the current defaults to set_opt_disp_defs(). */
+ {'3', 0, "norevcomp", "compare forward strand only", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
+#if defined(FASTA) || defined(SSEARCH)
+ {'a', 0, "show_all", "show complete Query/Sbjct sequences in alignment", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
+ {'W', 1, "context", "alignment context length (surrounding unaligned sequence)", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
+#endif
+#if defined(FASTA)
+ {'A', 0, "sw_align", "Smith-Waterman for final DNA alignment, band alignment for protein\n default is band-alignment for DNA, Smith-Waterman for protein", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
+#endif
+ {'b', 1, "num_descriptions", "high scores reported (limited by -E by default)",
+ "high scores reported (limited by -E by default);\n =<int> forces <int> results;", 0, 0, 0, 0, 0.0, 0.0, NULL},
+ {'d', 1, "num_alignments", "number of alignments shown (limited by -E by default)", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
+#if defined(FASTA) || defined(FASTX) || defined(FASTY)
+ {'c', 1, "opt_join", "expected fraction for band-optimization, joining", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
+#endif
+ {'E', 1, "evalue", "E()-value threshold", "E()-value,E()-repeat threshold", 0, 0, 0, 0, 0.0, 0.0, NULL},
+ {'f', 1, "gapopen", "gap-open penalty", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
+ {'g', 1, "gapext", "gap-extension penalty", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
+#ifdef SHOW_HELP
+ {'h', 0, "help", "help - show options, arguments", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
+#ifdef FASTY
+ {'j', 1, "frame_subs", "frame-shift, codon substitution penalty", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
+#else
+#ifdef FASTX
+ {'j', 1, "frame_shift", "frame-shift penalty", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
+#endif
+#endif
+#else
+#ifndef LALIGN
+ {'h', 1, "frame", "frameshift penalty", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
+ {'j', 1, "codon_subs", "codon substitution", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
+#endif
+#endif
+#if defined(LALIGN)
+ {'J', 0, "show_ident", "show identity alignment", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
+ {'K', 1, "max_repeat", "maximum number of non-intersecting alignments", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
+#endif
+ {'k', 1, "nshuffle", "number of shuffles", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
+ {'M', 1, "range", "filter on library sequence length", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
+ {'n', 0, "dna", "DNA/RNA query", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
+ {'p', 0, "prot", "protein query", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
+#if defined(CAN_PSSM)
+ {'P', 1, "pssm", "PSSM file", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
+#endif
+ {'r', 1, "dna_ratio", " +match/-mismatch for DNA/RNA", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
+ {'s', 1, "matrix", "scoring matrix", &s_opt_descr[0], 0, 0, 0, 0, 0.0, 0.0, NULL},
+#if !defined(LALIGN) && !defined(FASTS) && !defined(FASTM)
+ {'S', 0, "seg", "filter lowercase (seg) residues", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
+#endif
+#if defined(TFAST) || defined(FASTX) || defined(FASTY)
+ {'t', 1, "gencode", "translation genetic code", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
+#endif
+ {'U', 0, "rna", "RNA query", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
+ {'X', 1, "ext_opts", "Extended options", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
+ {'z', 1, "stats", "Statistics estimation method", &z_opt_descr[0], 0, 0, 0, 0, 0.0, 0.0, NULL},
+ {'\0', 0, "", "", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL}
+};
+
+/* f_init_opts(): hands the current default values held in *ppst and
+   *m_msp to set_opt_disp_defs(), one call per option character of
+   f_options[].  pgm_id is accepted but not used. */
+void f_init_opts(int pgm_id, struct mngmsg *m_msp, struct pstruct *ppst) {
+#if defined(FASTA) || defined(FASTX) || defined(FASTY)
+  {
+    /* band-optimization threshold, shown with an upper value of at least 1.0 */
+    const double band_e = ppst->param_u.fa.E_band_opt;
+
+    set_opt_disp_defs('c', f_options, 4, 0, 0, band_e, max(band_e*5.0,1.0), NULL);
+  }
+#endif
+
+#if defined(LALIGN)
+  set_opt_disp_defs('E', f_options, 3, 0, 0, ppst->e_cut, 0.0, NULL);
+#else
+  set_opt_disp_defs('E', f_options, 4, 0, 0, ppst->e_cut, ppst->e_cut_r, NULL);
+#endif
+
+  /* scoring matrix and gap penalties */
+  set_opt_disp_defs('s', f_options, 5, 0, 0, 0.0, 0.0, ppst->pamfile);
+  set_opt_disp_defs('f', f_options, 1, ppst->gdelval, 0, 0.0, 0.0, NULL);
+  set_opt_disp_defs('g', f_options, 1, ppst->ggapval, 0, 0.0, 0.0, NULL);
+
+#if defined(FASTY)
+  set_opt_disp_defs('j', f_options, 2, ppst->gshift, ppst->gsubs, 0.0, 0.0, NULL);
+#endif
+#if defined(FASTX)
+  set_opt_disp_defs('j', f_options, 1, ppst->gshift, 0, 0.0, 0.0, NULL);
+#endif
+#if defined(LALIGN)
+  set_opt_disp_defs('K', f_options, 1, ppst->max_repeat, 0, 0.0, 0.0, NULL);
+#endif
+
+  set_opt_disp_defs('k', f_options, 1, m_msp->shuff_max, 0, 0.0, 0.0, NULL);
+  set_opt_disp_defs('r', f_options, 2, ppst->p_d_mat, ppst->p_d_mis, 0.0, 0.0, NULL);
+  set_opt_disp_defs('t', f_options, 1, ppst->tr_type, 0, 0.0, 0.0, NULL);
+  set_opt_disp_defs('z', f_options, 1, ppst->zsflag, 0, 0.0, 0.0, NULL);
+
+#if defined(FASTA) || defined(SSEARCH)
+  set_opt_disp_defs('W', f_options, 1, m_msp->aln.llcntx, 0, 0.0, 0.0, NULL);
+#endif
+}
+
+char *iprompt1=" test sequence file name: "; /* prompt text for the query file */
+char *iprompt2=" database file name: "; /* prompt text for the library file */
+
+#ifdef PCOMPLIB
+char *verstr="36.3.8f May, 2017 MPI"; /* version string when PCOMPLIB is defined */
+#else
+char *verstr="36.3.8f May, 2017"; /* version string otherwise */
+#endif
+
+static int mktup=3; /* f_initenv() resets this to 3 and copies it to param_u.fa.bktup */
+static int ktup_set = 0;
+static int gap_set=0; /* passed to standard_pam() by h_init() */
+static int del_set=0; /* passed to standard_pam() by h_init() */
+static int mshuff_set = 0;
+static int prot2dna = 0;
+void
+parse_ext_opts(char *opt_arg, int pgm_id, struct mngmsg *m_msp, struct pstruct *ppst);
+
+extern int fa_max_workers;
+
+extern void s_abort(char *, char *);
+extern void init_ascii0(int *sascii, char *sq_map, int sq_map_n, struct pstruct *ppst);
+extern void init_ascii(int ext_sq, int *sascii, int nsq, int dnaseq);
+extern void validate_novel_aa(int *sascii, int p_nsq, int dnaseq);
+extern int standard_pam(char *smstr, struct pstruct *ppst,
+ int del_set, int gap_set);
+extern int min_pam_bits(int n0_eff, double bit_thresh, struct pstruct *ppst,
+ int del_set, int gap_set);
+extern void mk_n_pam(int *arr,int siz, int mat, int mis);
+extern int karlin(int , int, double *, double *, double *);
+extern void init_karlin_a(struct pstruct *, double *, double **);
+extern int do_karlin_a(int **, const struct pstruct *, double *,
+ double *, double *, double *, double *);
+
+#if defined(TFAST) || defined(FASTX) || defined(FASTY)
+extern void aainit(int tr_type, int debug);
+#endif
+
+char *iprompt0, *prog_func, *refstr;
+
+/* Sets defaults assuming a protein sequence
+
+   h_init(): takes the program id from get_pgm_id(), checks that the
+   matching pgm_def_arr[] and msg_def_arr[] entries carry that id
+   (exit(1) with a message on stderr otherwise), and then initializes
+   *ppst and *m_msp: alphabet maps, the default scoring matrix (through
+   standard_pam()), the residue tables and the c_nt[] mapping.
+
+   ppst          - parameter structure to fill in
+   m_msp         - message structure; max_tot and qdnaseq are set here
+   info_pgm_abbr - output buffer that receives the program abbreviation,
+                   copied with a MAX_SSTR limit
+
+   Also sets the globals iprompt0, refstr, prog_func and pascii.
+ */
+void h_init (struct pstruct *ppst, struct mngmsg *m_msp, char *info_pgm_abbr)
+{
+ struct pgm_def_str pgm_def;
+ int i, pgm_id;
+
+ ppst->pgm_id = pgm_id = get_pgm_id();
+ pgm_def = pgm_def_arr[pgm_id]; /* struct copy of the table entry */
+
+ /* check that pgm_def_arr[] is valid */
+ if (pgm_def.pgm_id != pgm_id) {
+ fprintf(stderr,
+ "**pgm_def integrity failure: def.pgm_id %d != pgm_id %d**\n",
+ pgm_def.pgm_id, pgm_id);
+ exit(1);
+ }
+
+ /* check that msg_def_arr[] is valid */
+ if (msg_def_arr[pgm_id].pgm_id != pgm_id) {
+ fprintf(stderr,
+ "**msg_def integrity failure: def.pgm_id %d != pgm_id %d**\n",
+ msg_def_arr[pgm_id].pgm_id, pgm_id);
+ exit(1);
+ }
+
+ SAFE_STRNCPY(info_pgm_abbr,pgm_def.info_pgm_abbr,MAX_SSTR);
+ iprompt0 = pgm_def.iprompt0;
+ refstr = pgm_def.ref_str;
+ prog_func = pgm_def.prog_func;
+
+ /* used to be MAXTOT = MAXTST+MAXLIB, but now fixed at MAXLIB for
+ pre-loaded libraries */
+ m_msp->max_tot = MAXLIB;
+
+ init_ascii0(aascii, NCBIstdaa, NCBIstdaa_n, ppst);
+ pascii = aascii;
+
+ /* set up DNA query sequence if required*/
+ if (msg_def_arr[pgm_id].q_seqt == SEQT_DNA) {
+ memcpy(qascii,nascii,sizeof(qascii));
+ m_msp->qdnaseq = SEQT_DNA;
+ }
+ else { /* when SEQT_UNK, start with protein */
+ memcpy(qascii,aascii,sizeof(qascii));
+ m_msp->qdnaseq = msg_def_arr[pgm_id].q_seqt;
+ }
+
+#if defined(FASTF) || defined(FASTS) || defined(FASTM)
+ qascii[','] = ESS;
+ /* also initialize aascii, nascii for databases */
+ qascii['*'] = NA;
+ ppst->pam_ms = 1;
+ ppst->do_rep=0; /* disable multiple alignments */
+ ppst->pseudocts = 200;
+#else
+ ppst->pam_ms = 0;
+ ppst->do_rep=1; /* enable multiple alignments */
+#endif
+
+ /* initialize a pam matrix */
+ SAFE_STRNCPY(ppst->pamfile,pgm_def.smstr,MAX_FN);
+ standard_pam(ppst->pamfile,ppst,del_set,gap_set);
+ ppst->have_pam2 = 0; /* alloc_pam() sets this to 1 once pam2[] is allocated */
+
+ /* specify pre-alignment */
+ ppst->can_pre_align = pgm_def.can_pre_align;
+
+ ppst->p_d_mat = 5;
+ ppst->p_d_mis = -4;
+
+ /* this is always protein by default */
+ ppst->nsq = NCBIstdaa_n;
+ ppst->nsqx = NCBIstdaa_ext_n;
+ /* we need to populate ppst->sq to nsqx for direct lc mapping */
+ for (i=0; i<ppst->nsqx; i++) {
+ ppst->sq[i] = NCBIstdaa_l[i];
+ ppst->hsq[i] = h_NCBIstdaa[i];
+ }
+ for (i=0; i<ppst->nsqx; i++) {
+ ppst->sqx[i]=NCBIstdaa_ext[i];
+ ppst->hsqx[i]=h_NCBIstdaa_ext[i];
+ }
+ ppst->sq[ppst->nsq] = ppst->sqx[ppst->nsqx] = '\0'; /* terminate sq[] at nsq and sqx[] at nsqx */
+
+ /* set up the c_nt[] mapping */
+
+#if defined(FASTS) || defined(FASTF) || defined(FASTM)
+ ppst->c_nt[ESS] = ESS;
+#endif
+ ppst->c_nt[0]=0;
+ for (i=1; i<nnt; i++) {
+ ppst->c_nt[i]=gc_nt[i];
+ ppst->c_nt[i+nnt]=gc_nt[i]+nnt; /* second half: same mapping offset by nnt */
+ }
+
+#ifdef CAN_PSSM
+ ppst->pam2p[0] = NULL;
+ ppst->pam2p[1] = NULL;
+#endif
+}
+
+/*
+ * alloc_pam(): builds the two d1 x d2 integer matrices ppst->pam2[0] and
+ * ppst->pam2[1].  Each matrix is a table of d1 row pointers into a single
+ * contiguous block of d1*d2 ints; the block behind pam2[0] is zero-filled
+ * (calloc), the block behind pam2[1] is not (malloc).  A failed
+ * allocation is reported through s_abort().  Sets ppst->have_pam2 to 1.
+ */
+void
+alloc_pam (int d1, int d2, struct pstruct *ppst)
+{
+  int row;
+  int *cells;
+  char msg[128];
+
+  /* row-pointer tables for both matrices */
+  ppst->pam2[0] = (int **) malloc (d1 * sizeof (int *));
+  if (ppst->pam2[0] == NULL) {
+    sprintf(msg,"Cannot allocate 2D pam matrix: %d",d1);
+    s_abort (msg,"");
+  }
+
+  ppst->pam2[1] = (int **) malloc (d1 * sizeof (int *));
+  if (ppst->pam2[1] == NULL) {
+    sprintf(msg,"Cannot allocate 2D pam matrix: %d",d1);
+    s_abort (msg,"");
+  }
+
+  /* zero-filled storage for pam2[0], sliced into d1 rows of d2 ints */
+  cells = (int *) calloc (d1 * d2, sizeof (int));
+  if (cells == NULL) {
+    sprintf(msg,"Cannot allocate 2D pam matrix: %d",d1);
+    s_abort (msg,"");
+  }
+
+  row = 0;
+  while (row < d1) {
+    ppst->pam2[0][row] = cells;
+    cells += d2;
+    row++;
+  }
+
+  /* uninitialized storage for pam2[1], sliced the same way */
+  cells = (int *) malloc (d1 * d2 * sizeof (int));
+  if (cells == NULL) {
+    sprintf(msg,"Cannot allocate 2d pam matrix: %d",d2);
+    s_abort (msg,"");
+  }
+
+  row = 0;
+  while (row < d1) {
+    ppst->pam2[1][row] = cells;
+    cells += d2;
+    row++;
+  }
+
+  ppst->have_pam2 = 1;
+}
+
+/*
+ * init_pam2(struct pstruct pst): Converts 1-D pam matrix to 2-D
+ * currently, this function is very protein centric
+ *
+ * Reads the global 1-D array pam[] as a lower triangle (row i holds
+ * columns 1..i) and stores each value, minus ppst->pamoff, symmetrically
+ * in ppst->pam2[0], using pascii[pam_sq[]] to translate positions into
+ * matrix indices.  Row 0 and column 0 are set to -BIGNUM.  The lowest and
+ * highest stored values are recorded in ppst->pam_l and ppst->pam_h.
+ * Selects the globals pam_sq / pam_sq_n from ppst->dnaseq, and calls
+ * init_altpam() for the non-DNA case.
+ *
+ * nsq, ix_j, ix_l and ix_i are declared but not otherwise used here.
+ *
+ * NOTE(review): the first loop stops before i == sa_t and the second one
+ * starts at sa_t+1, so position sa_t is visited by neither loop --
+ * confirm that this is intended.
+ */
+void
+init_pam2 (struct pstruct *ppst) {
+ int i, j, k, nsq, sa_t;
+ int ix_j, ix_l, ix_i, p_i, p_j;
+
+ nsq = ppst->nsq;
+
+ ppst->pam2[0][0][0] = -BIGNUM;
+ ppst->pam_h = -1; ppst->pam_l = 1;
+
+ k = 0; /* running index into the 1-D pam[] array */
+
+ if (ppst->dnaseq == 0) { /* not DNA */
+ sa_t = aascii['*']; /* this is the last character for which pam[] is available */
+ pam_sq = apam_sq;
+ pam_sq_n = apam_sq_n;
+ }
+ else { /* have DNA, no '*' */
+ sa_t = nascii['X'];
+ pam_sq = npam_sq;
+ pam_sq_n = npam_sq_n;
+ }
+
+ /* we use sa_t here because that is the last position in the 1-D
+ matrix */
+ for (i = 1; i < sa_t; i++) {
+ p_i = pascii[pam_sq[i]];
+ ppst->pam2[0][0][p_i] = ppst->pam2[0][p_i][0] = -BIGNUM;
+ for (j = 1; j <= i; j++) {
+ /* here is where the pam file is actually set */
+ p_j = pascii[pam_sq[j]];
+ ppst->pam2[0][p_j][p_i] = ppst->pam2[0][p_i][p_j] = pam[k++] - ppst->pamoff;
+ if (ppst->pam_l > ppst->pam2[0][p_i][p_j]) ppst->pam_l = ppst->pam2[0][p_i][p_j];
+ if (ppst->pam_h < ppst->pam2[0][p_i][p_j]) ppst->pam_h = ppst->pam2[0][p_i][p_j];
+ }
+ }
+
+ /* need to do the same thing for characters > sa_t */
+ for (i = sa_t+1; i < pam_sq_n; i++) {
+ p_i = pascii[pam_sq[i]];
+ ppst->pam2[0][0][p_i] = ppst->pam2[0][p_i][0] = -BIGNUM;
+ }
+
+ if (ppst->dnaseq == 0) {
+ init_altpam(ppst);
+ }
+}
+
+void
+init_altpam(struct pstruct *ppst) { /* Adds pam2[0] rows and columns for 'J',
+   'O' and 'U'.  'J' is added only when pam_sq has no 'J' and takes the
+   larger of the 'I' and 'L' values.  'O' copies 'K' and 'U' copies 'C'
+   when their index is below ppst->nsq; otherwise pascii[] is changed so
+   that 'O'/'o' map to 'K'/'k' and 'U'/'u' map to 'C'/'c'.  p_j is declared
+   but not used. */
+ int ix_i, ix_l, ix_j, p_i, p_j, i;
+
+ /* add values for 'J' (I/L) value, which are not present in 1-D matrices */
+ ix_i = pascii['I'];
+ ix_l = pascii['L'];
+ ix_j = pascii['J'];
+ if (strchr(pam_sq,'J')==NULL) {
+ ppst->pam2[0][ix_j][0] = ppst->pam2[0][0][ix_j] = -BIGNUM;
+ /* get the identities */
+ ppst->pam2[0][ix_j][ix_j] =
+ max(ppst->pam2[0][ix_i][ix_i],ppst->pam2[0][ix_l][ix_l]);
+ for (i=1; i < pam_sq_n; i++) {
+ p_i = pascii[pam_sq[i]];
+ /* do not assume symmetric matrices */
+ ppst->pam2[0][ix_j][p_i] =
+ max(ppst->pam2[0][ix_i][p_i],ppst->pam2[0][ix_l][p_i]);
+ ppst->pam2[0][p_i][ix_j] =
+ max(ppst->pam2[0][p_i][ix_i],ppst->pam2[0][p_i][ix_l]);
+ }
+ }
+ /* add values for 'O' (K) value, which are not present in 1-D matrices */
+ ix_i = pascii['K'];
+ ix_j = pascii['O'];
+ if (ix_j < ppst->nsq) { /* is it in the NCBIstdaa alphabet ? */
+ ppst->pam2[0][ix_j][0] = ppst->pam2[0][0][ix_j] = -BIGNUM;
+ /* get the identity */
+ ppst->pam2[0][ix_j][ix_j] = ppst->pam2[0][ix_i][ix_i];
+ /* do not assume symmetric matrices */
+ for (i=1; i < pam_sq_n; i++) {
+ p_i = pascii[pam_sq[i]];
+ ppst->pam2[0][ix_j][p_i] = ppst->pam2[0][ix_i][p_i];
+ ppst->pam2[0][p_i][ix_j] = ppst->pam2[0][p_i][ix_i];
+ }
+ }
+ else { /* not in the alphabet: read 'O'/'o' as 'K'/'k' instead */
+ pascii['O'] = pascii['K'];
+ pascii['o'] = pascii['k'];
+ }
+
+ /* add values for 'U' (C) value, which are not present in 1-D matrices */
+ ix_i = pascii['C'];
+ ix_j = pascii['U'];
+ if (ix_j < ppst->nsq) { /* is it in the NCBIstdaa alphabet */
+ ppst->pam2[0][ix_j][0] = ppst->pam2[0][0][ix_j] = -BIGNUM;
+ /* get the identity */
+ ppst->pam2[0][ix_j][ix_j] = ppst->pam2[0][ix_i][ix_i];
+ /* do not assume symmetric matrices */
+ for (i=1; i < pam_sq_n; i++) {
+ p_i = pascii[pam_sq[i]];
+ ppst->pam2[0][ix_j][p_i] = ppst->pam2[0][ix_i][p_i];
+ ppst->pam2[0][p_i][ix_j] = ppst->pam2[0][p_i][ix_i];
+ }
+ }
+ else { /* not in the alphabet: read 'U'/'u' as 'C'/'c' instead */
+ pascii['U'] = pascii['C'];
+ pascii['u'] = pascii['c'];
+ }
+}
+
+/* extend the standard pam matrix for special residues
+ (a) 'X' for protein and 'N' for DNA, 'G' and 'U' for RNA,
+ (b) lower-case characters for ext_sq_set
+
+ must be called after init_pam2()
+
+ Steps, in order:
+  1. sets ppst->nt_align when dnaseq is SEQT_DNA or SEQT_RNA;
+  2. for RNA without a user-set matrix (pam_set == 0), sets the A:G, C:T
+     and C:U scores to the G:G score minus 3;
+  3. if pam_x_set, writes pam_xm / pam_xx into the sa_x and sa_t rows and
+     columns; otherwise reads pam_xx / pam_xm back from the matrix;
+  4. replicates pam2[0] into its [i+nsq] / [j+nsq] quadrants and fills
+     pam2[1], whose upper quadrants take the value pam2[0][sa_x][j];
+  5. sets every score against index nsq to pam_xm.
+
+ k is declared but not used.
+*/
+void
+init_pamx (struct pstruct *ppst) {
+ int i, j, k, nsq;
+ int sa_x, sa_t, tmp;
+
+ nsq = ppst->nsq;
+
+ ppst->nt_align = (ppst->dnaseq== SEQT_DNA || ppst->dnaseq == SEQT_RNA);
+
+ /* sa_x is the 'unknown' character, 'X' for proteins, 'N' for DNA/RNA */
+ /* sa_t is the termination character -- not used for DNA */
+
+ if (ppst->nt_align) {
+ sa_x = pascii['N'];
+ sa_t = sa_x;
+ }
+ else {
+ sa_x = pascii['X'];
+ sa_t = pascii['*'];
+ }
+
+ /* build an asymmetric matrix for RNA */
+ if (ppst->dnaseq == SEQT_RNA && !ppst->pam_set) {
+ tmp = ppst->pam2[0][nascii['G']][nascii['G']] - 3;
+ ppst->pam2[0][nascii['A']][nascii['G']] =
+ ppst->pam2[0][nascii['C']][nascii['T']] =
+ ppst->pam2[0][nascii['C']][nascii['U']] = tmp;
+ }
+
+ if (ppst->pam_x_set) {
+ for (i=1; i<nsq; i++) {
+ if (sa_x < nsq) ppst->pam2[0][sa_x][i] = ppst->pam2[0][i][sa_x]=ppst->pam_xm;
+ if (sa_t < nsq) ppst->pam2[0][sa_t][i] = ppst->pam2[0][i][sa_t]=ppst->pam_xm;
+ }
+ if (sa_x < nsq) ppst->pam2[0][sa_x][sa_x]=ppst->pam_xx;
+ if (sa_t < nsq) ppst->pam2[0][sa_t][sa_t]=ppst->pam_xx;
+ }
+ else { /* no override: take the values already in the matrix */
+ ppst->pam_xx = ppst->pam2[0][sa_x][sa_x];
+ ppst->pam_xm = ppst->pam2[0][1][sa_x];
+ }
+
+ /* fill in pam2[1] matrix */
+ ppst->pam2[1][0][0] = -BIGNUM;
+ /* fill in additional parts of the matrix */
+ for (i = 0; i < nsq; i++) {
+ /* -BIGNUM to all matches vs 0 */
+ ppst->pam2[0][0][i+nsq] = ppst->pam2[0][i+nsq][0] =
+ ppst->pam2[1][0][i+nsq] = ppst->pam2[1][i+nsq][0] =
+ ppst->pam2[0][0][i] = ppst->pam2[0][i][0] =
+ ppst->pam2[1][0][i] = ppst->pam2[1][i][0] = -BIGNUM;
+
+ for (j = 0; j < nsq; j++) {
+ /* replicate pam2[0] to i+nsq, j+nsq, also initialize lowest of pam2[1]*/
+ ppst->pam2[0][i+nsq][j] = ppst->pam2[0][i][j+nsq] = ppst->pam2[0][i+nsq][j+nsq] =
+ ppst->pam2[1][i][j] = ppst->pam2[0][i][j];
+
+ /* set the high portion of pam2[1] to the corresponding value
+ of pam2[1][sa_x][j] */
+
+ ppst->pam2[1][i+nsq][j] = ppst->pam2[1][i][j+nsq]=
+ ppst->pam2[1][i+nsq][j+nsq]=ppst->pam2[0][sa_x][j];
+ }
+ }
+
+ /* set matches to the internal '-' to pam_xx */
+ /* this needs to be adjusted for multiple internal '-----' */
+ for (i=1; i< nsq; i++) {
+ ppst->pam2[0][nsq][i] = ppst->pam2[0][i][nsq] =
+ ppst->pam2[0][nsq][i+nsq] = ppst->pam2[0][i+nsq][nsq] =
+ ppst->pam2[1][nsq][i] = ppst->pam2[1][i][nsq] =
+ ppst->pam2[1][nsq][i+nsq] = ppst->pam2[1][i+nsq][nsq] = ppst->pam_xm;
+ }
+ ppst->pam2[0][nsq][nsq] = ppst->pam2[1][nsq][nsq] = ppst->pam_xm; /* NOTE(review): the comment above says pam_xx but the code stores pam_xm -- confirm */
+}
+
+/* function specific initializations
+
+   f_initenv(): copies the per-program defaults from msg_def_arr[] and
+   pgm_def_arr[] (selected by ppst->pgm_id) into *m_msp and *ppst: the
+   labels, statistics flags, gap-penalty adjustments, E()-value cutoffs,
+   frame and shuffle settings and pam flags.  When the program's ktup
+   default is > 0 it also initializes ppst->param_u.fa.  Finishes by
+   calling f_init_opts().
+
+   aa0 is accepted but not used in this function.
+ */
+void
+f_initenv (struct mngmsg *m_msp, struct pstruct *ppst, unsigned char **aa0) {
+ struct msg_def_str m_msg_def;
+ int pgm_id;
+
+ pgm_id = ppst->pgm_id;
+ m_msg_def = msg_def_arr[pgm_id]; /* struct copy of the table entry */
+
+ m_msp->last_calc_flg=0;
+
+ SAFE_STRNCPY(m_msp->f_id0,m_msg_def.f_id0,sizeof(m_msp->f_id0));
+ SAFE_STRNCPY(m_msp->f_id1,m_msg_def.f_id1,sizeof(m_msp->f_id1));
+ SAFE_STRNCPY (m_msp->label, m_msg_def.label, sizeof(m_msp->label));
+ SAFE_STRNCPY(m_msp->alabel, m_msg_def.alabel, sizeof(m_msp->alabel)); /* alabel is assigned in get_pgm_id(), not in the table initializer */
+
+#if !defined(SSEARCH) && !defined(GGSEARCH) && !defined(GLSEARCH) && !defined(LALIGN)
+ SAFE_STRNCPY (m_msp->alab[0],"initn",20);
+ SAFE_STRNCPY (m_msp->alab[1],"init1",20);
+ SAFE_STRNCPY (m_msp->alab[2],"opt",20);
+#else
+#if defined(SSEARCH) || defined(LALIGN)
+ SAFE_STRNCPY (m_msp->alab[0],"s-w opt",20);
+#else
+ SAFE_STRNCPY (m_msp->alab[0],"n-w opt",20);
+#endif
+#endif
+
+#if defined(GGSEARCH) || defined(GLSEARCH)
+ m_msp->zsflag = ppst->zsflag = ppst->zsflag_f = 0;
+#else
+ m_msp->zsflag = ppst->zsflag = ppst->zsflag_f = 1;
+ m_msp->zsflag2 = ppst->zsflag2 = 1;
+#endif
+
+ ppst->gdelval += pgm_def_arr[pgm_id].g_open_mod; /* adjusts the existing value rather than replacing it */
+ ppst->ggapval += pgm_def_arr[pgm_id].g_ext_mod;
+#if defined(FASTX) || defined(FASTY)
+ ppst->gshift = pgm_def_arr[pgm_id].gshift;
+ ppst->gsubs = pgm_def_arr[pgm_id].hshift;
+#endif
+ ppst->sw_flag = m_msg_def.sw_flag;
+ ppst->e_cut = m_msp->e_cut=pgm_def_arr[pgm_id].e_cut;
+#ifndef LALIGN
+ ppst->e_cut_r = ppst->e_cut/10.0; /* more significant */
+#else
+ ppst->e_cut_r = ppst->e_cut; /* everything if local */
+#endif
+
+ ppst->score_ix = 0;
+ ppst->histint = 2;
+ m_msp->qframe = m_msg_def.qframe;
+ m_msp->nframe = m_msg_def.nframe;
+ m_msp->nrelv = m_msg_def.nrelv;
+ m_msp->srelv = m_msg_def.srelv;
+ m_msp->arelv = m_msg_def.arelv;
+ m_msp->stages = m_msg_def.stages;
+ m_msp->shuff_wid = 0;
+#if defined(GGSEARCH)
+ m_msp->shuff_max = 100;
+#else
+ m_msp->shuff_max = MAX_RSTATS;
+#endif
+ m_msp->shuff_max_save = m_msp->shuff_max;
+
+ /* see param.h for the definition of all these */
+
+ m_msp->qshuffle = 0;
+ m_msp->nm0 = 1;
+ m_msp->escore_flg = 0;
+
+ /* pam information */
+ ppst->pam_pssm = 0;
+#if defined(FASTS) || defined(FASTF) || defined(FASTM)
+ ppst->pam_xx = ppst->pam_xm = 0;
+#else
+ ppst->pam_xx = 1; /* set >0 to use pam['X']['X'] value */
+ ppst->pam_xm = -1; /* set >0 to use pam['X']['A-Z'] value */
+#endif
+ ppst->pam_x_set = 0;
+ ppst->pam_x_id_sim = 0;
+ ppst->pam_set = ppst->pam_variable = 0;
+ ppst->pam_pssm = 0; /* NOTE(review): second assignment of pam_pssm = 0 in this function; redundant */
+ ppst->p_d_set = 0;
+ ppst->pamoff = 0;
+ ppst->ext_sq_set = 0;
+ ppst->nsq_e = ppst->nsq;
+
+ /* initial settings for protein */
+ if (pgm_def_arr[ppst->pgm_id].ktup > 0) {
+ mktup = 3;
+ ppst->param_u.fa.bestscale = 300;
+ ppst->param_u.fa.bestoff = 36;
+ ppst->param_u.fa.bkfact = 6;
+ ppst->param_u.fa.scfact = 3;
+ ppst->param_u.fa.bktup = mktup;
+ ppst->param_u.fa.ktup = 0;
+ ppst->param_u.fa.bestmax = 50;
+ ppst->param_u.fa.pamfact = 1;
+ ppst->param_u.fa.altflag = 0;
+ ppst->param_u.fa.optflag = 1;
+ ppst->param_u.fa.iniflag = 0;
+ ppst->param_u.fa.optcut = 0;
+ ppst->param_u.fa.optcut_set = 0;
+ ppst->param_u.fa.cgap = 0;
+ ppst->param_u.fa.optwid = 16;
+ ppst->param_u.fa.optwid_set = 0;
+ ppst->param_u.fa.E_band_opt = pgm_def_arr[ppst->pgm_id].E_band_opt;
+ ppst->param_u.fa.use_E_thresholds = 1; /* disable E-thresholds for now -- NOTE(review): the value stored is 1, which looks like "enabled"; confirm which is intended */
+ }
+
+ f_init_opts(pgm_id, m_msp, ppst);
+}
+
+/* switches for fasta only; each one is raised by f_getopt() when the
+   matching command line option has been parsed */
+
+static int shift_set=0; /* ppst->gshift was given as an option */
+static int subs_set=0; /* ppst->gsubs was given as an option */
+static int sw_flag_set=0; /* -A toggled ppst->sw_flag */
+static int nframe_set=0; /* -3 seen; resetp() then leaves qframe/nframe alone */
+static int E_thresh_set = 0; /* -c supplied param_u.fa.E_band_opt */
+static int E_cgap_set = 0; /* -c supplied param_u.fa.E_join */
+
+/* f_getopt() - apply one command line option to the search settings.
+
+   copt   - the option letter
+   optarg - the option's argument text (only read by options that take one)
+   m_msg  - display/bookkeeping settings, updated in place
+   ppst   - scoring/search parameters, updated in place
+
+   Several options also raise a file-level *_set flag (shift_set,
+   sw_flag_set, nframe_set, ...) so that later set-up code such as
+   resetp() does not overwrite the user's value.  Option letters that
+   have no case below are ignored.
+*/
+void
+f_getopt (char copt, char *optarg,
+ struct mngmsg *m_msg, struct pstruct *ppst)
+{
+ int pgm_id;
+ double tmp_f, tmp_f1;
+ /* start at 0.0 so a malformed -E argument (sscanf() converts nothing)
+    cannot leave these indeterminate when they are tested below */
+ double tmp_e_cut = 0.0, tmp_e_rep = 0.0;
+ int dnaseq_save;
+ char *bp;
+
+ pgm_id = ppst->pgm_id;
+
+ switch (copt) {
+ case '3':
+ nframe_set = 1;
+ if (pgm_id == TFA_PID) {
+ m_msg->nframe = 3; break;
+ }
+ else {
+ m_msg->nframe = 1; /* for TFASTXY */
+ m_msg->qframe = 1; /* for FASTA, FASTX */
+ }
+ break;
+ case 'a': m_msg->aln.showall = 1; break;
+ case 'A':
+ /* toggle, rather than set, the Smith-Waterman flag */
+ if (ppst->sw_flag) ppst->sw_flag=0;
+ else ppst->sw_flag= 1;
+ sw_flag_set = 1;
+ break;
+ case 'b':
+ /* -b '$' | '=N' | '>N' | 'N' : number of scores to show */
+ if (optarg[0] == '$') {
+ m_msg->mshow = -1;
+ m_msg->e_cut = 10000000.0;
+ break;
+ }
+ else if (optarg[0] == '=') {
+ m_msg->e_cut = 10000000.0;
+ m_msg->e_cut_set = 1;
+ m_msg->mshow_min = 1;
+ sscanf (optarg+1, "%d", &m_msg->mshow);
+ }
+ else if (optarg[0] == '>') {
+ m_msg->mshow_min = 2;
+ sscanf (optarg+1, "%d", &m_msg->mshow);
+ }
+ else {
+ sscanf (optarg, "%d", &m_msg->mshow);
+ m_msg->mshow_min = 0;
+ }
+ m_msg->mshow_set = 1;
+ break;
+ case 'c':
+ /* -c [O]value [value2]: value > 1.0 is an integer optcut,
+    0.0 < value <= 1.0 is an E-value band threshold */
+ tmp_f = tmp_f1 = 0.0;
+ if (*optarg == 'O') {
+ ppst->param_u.fa.use_E_thresholds = 0;
+ optarg++;
+ }
+ if (*optarg != '\0' && pgm_def_arr[pgm_id].ktup > 0) {
+ sscanf (optarg, "%lf %lf", &tmp_f, &tmp_f1);
+ if (tmp_f > 1.0) {
+ ppst->param_u.fa.optcut = (int)(tmp_f+0.1);
+ ppst->param_u.fa.use_E_thresholds = 0;
+ ppst->param_u.fa.optcut_set = 1;
+ }
+ else if (tmp_f <= 0.0) {
+ ppst->param_u.fa.use_E_thresholds = 1;
+ }
+ else { /* 0.0 < tmp_f <= 1.0 */
+ ppst->param_u.fa.use_E_thresholds = 1;
+ ppst->param_u.fa.E_band_opt = min(tmp_f,1.0);
+ E_thresh_set = 1;
+ if (tmp_f1 > 0.0) {
+ tmp_f1 = min(tmp_f1,1.0); /* may want to do max(tmp_f1,tmp_f) */
+ ppst->param_u.fa.E_join = tmp_f1;
+ E_cgap_set=1;
+ }
+ }
+ }
+ break;
+ case 'd': sscanf(optarg,"%d",&m_msg->ashow);
+ if ((m_msg->mshow > 0) && (m_msg->ashow > m_msg->mshow)) m_msg->mshow=m_msg->ashow;
+ m_msg->ashow_set = 1;
+ break;
+ case 'E':
+ if (strchr(optarg,' ')) { /* check for 1 or 2 values */
+ sscanf(optarg,"%lf %lf",&tmp_e_cut, &tmp_e_rep);
+ if (tmp_e_rep <= 0.0) { /* two values, 2nd <= 0.0, no do_rep */
+ ppst->do_rep = 0;
+ ppst->e_cut_r = 1E-100;
+ tmp_e_rep = -2.0;
+ }
+ else {ppst->do_rep = 1;}
+ }
+ else { /* one value, do_rep; tmp_e_rep=10.0 */
+ sscanf(optarg,"%lf",&tmp_e_cut);
+#ifndef LALIGN
+ tmp_e_rep = 10.0;
+#else
+ tmp_e_rep = 1.0;
+#endif
+ ppst->do_rep = 1;
+ }
+ if (!m_msg->e_cut_set && tmp_e_cut > 0.0 ) {
+ ppst->e_cut = m_msg->e_cut = tmp_e_cut;
+ }
+ m_msg->e_cut_set = 1;
+
+ /* tmp_e_rep >= 1.0 is a divisor of e_cut, below 1.0 it is the threshold itself */
+ if (tmp_e_rep > 0.0) {
+ if (tmp_e_rep >= 1.0) { ppst->e_cut_r = ppst->e_cut/tmp_e_rep;}
+ else { ppst->e_cut_r = tmp_e_rep;}
+ }
+ break;
+ case 'f':
+ /* penalties are stored as negative numbers whatever sign was typed */
+ sscanf (optarg, "%d", &ppst->gdelval);
+ if (ppst->gdelval > 0) ppst->gdelval = -ppst->gdelval;
+ del_set = 1;
+ break;
+ case 'g':
+ sscanf (optarg, "%d", &ppst->ggapval);
+ if (ppst->ggapval > 0) ppst->ggapval = -ppst->ggapval;
+ gap_set = 1;
+ break;
+#ifndef SHOW_HELP
+ case 'h':
+ sscanf (optarg, "%d", &ppst->gshift);
+ if (ppst->gshift > 0) ppst->gshift = -ppst->gshift;
+ shift_set = 1;
+ break;
+ case 'j':
+ sscanf (optarg, "%d", &ppst->gsubs);
+ if (ppst->gsubs > 0) ppst->gsubs = -ppst->gsubs;
+ subs_set = 1;
+ break;
+#else
+ case 'h':
+ show_help(m_msg->pgm_name, pgm_id);
+ break;
+ case 'j':
+#ifdef FASTY
+ if (strchr(optarg,' ')) {
+ sscanf (optarg, "%d %d", &ppst->gshift, &ppst->gsubs);
+ subs_set = 1;
+ if (ppst->gsubs > 0) ppst->gsubs = -ppst->gsubs;
+ }
+ else if (strchr(optarg,',')) {
+ sscanf (optarg, "%d,%d", &ppst->gshift, &ppst->gsubs);
+ subs_set = 1;
+ if (ppst->gsubs > 0) ppst->gsubs = -ppst->gsubs;
+ }
+ else {
+ sscanf (optarg, "%d", &ppst->gshift);
+ }
+#else
+#ifdef FASTX
+ sscanf (optarg, "%d", &ppst->gshift);
+#endif
+#endif
+ if (ppst->gshift > 0) ppst->gshift = -ppst->gshift;
+ shift_set = 1;
+ break;
+#endif
+ case 'J':
+#ifdef LALIGN
+ ppst->show_ident=1;
+#else
+ ppst->show_ident=0;
+#endif
+ break;
+
+
+#ifdef LALIGN
+ case 'K':
+ sscanf(optarg,"%d", &ppst->max_repeat);
+ break;
+#endif
+ case 'k':
+ sscanf (optarg, "%d", &m_msg->shuff_max);
+ m_msg->shuff_max_save = m_msg->shuff_max;
+ mshuff_set = 1;
+ break;
+ case 'M':
+ /* -M low-high; a single negative number "-N" means 0..N */
+ sscanf(optarg,"%d-%d",&m_msg->n1_low,&m_msg->n1_high);
+ if (m_msg->n1_low < 0) {
+ m_msg->n1_high = -m_msg->n1_low;
+ m_msg->n1_low = 0;
+ }
+ if (m_msg->n1_high == 0) m_msg->n1_high = BIGNUM;
+ if (m_msg->n1_low > m_msg->n1_high) {
+ fprintf(stderr," low cutoff %d greater than high %d\n",
+ m_msg->n1_low, m_msg->n1_high);
+ m_msg->n1_low = 0;
+ m_msg->n1_high = BIGNUM;
+ }
+ ppst->n1_low = m_msg->n1_low;
+ ppst->n1_high = m_msg->n1_high;
+ break;
+ case 'n':
+ m_msg->qdnaseq = SEQT_DNA;
+ re_ascii(qascii,nascii,strlen((char *)m_msg->ann_arr+1));
+ SAFE_STRNCPY(m_msg->sqnam,"nt",4);
+ prot2dna = 1;
+ break;
+ case 'o':
+ case 'p':
+ m_msg->qdnaseq = SEQT_PROT;
+ ppst->dnaseq = SEQT_PROT;
+ SAFE_STRNCPY(m_msg->sqnam,"aa",4);
+ break;
+ case 'P':
+ /* -P "file [type]": text after the first blank is the numeric file type */
+ SAFE_STRNCPY(ppst->pgpfile,optarg,MAX_FN);
+ if ((bp=strchr(ppst->pgpfile,' '))!=NULL) {
+ *bp='\0';
+ ppst->pgpfile_type = atoi(bp+1);
+ }
+ else ppst->pgpfile_type = 0;
+ ppst->pam_pssm = 1;
+ break;
+ case 'r':
+ sscanf(optarg,"%d/%d",&ppst->p_d_mat,&ppst->p_d_mis);
+ ppst->pam_set = 0;
+ ppst->p_d_set = 1;
+
+ SAFE_STRNCPY(ppst->pam_name, "DNA", 4);
+ if (ppst->dnaseq != SEQT_RNA) ppst->dnaseq = SEQT_DNA;
+ if (ppst->p_d_mat > 0 && ppst->p_d_mis < 0) {
+ ppst->p_d_set = 1;
+ SAFE_STRNCPY(ppst->pamfile,optarg,40);
+ }
+ break;
+ /* modified Sept, 2011, to recognize that a scoring matrix
+ specifies a sequence alphabet */
+ case 's':
+ if (*optarg == '?') {
+ ppst->pam_variable = 1;
+ optarg++;
+ }
+ if (*optarg == '\0') break;
+ SAFE_STRNCPY (ppst->pamfile, optarg, MAX_FN);
+ dnaseq_save = ppst->dnaseq;
+ /* check for default abbreviation */
+ if (!standard_pam(ppst->pamfile,ppst,del_set, gap_set)) {
+ /* check/load matrix file */
+ if (!initpam (ppst->pamfile, ppst)) {
+ /* matrix file failed, use default matrix */
+ SAFE_STRNCPY(ppst->pamfile,pgm_def_arr[pgm_id].smstr,MAX_FN);
+ }
+ }
+ ppst->pam_set=1;
+ /* check for changing alphabet here */
+ if (ppst->dnaseq != dnaseq_save && ppst->dnaseq >= SEQT_DNA) {
+ m_msg->qdnaseq = SEQT_DNA;
+ re_ascii(qascii,nascii,strlen((char *)m_msg->ann_arr+1));
+ SAFE_STRNCPY(m_msg->sqnam,"nt",4);
+ prot2dna = 1;
+ }
+ break;
+ case 'S': /* turn on extended alphabet for seg */
+ ppst->ext_sq_set = 1;
+ ppst->nsq_e = ppst->nsqx;
+ break;
+ case 't':
+ /* a leading 't'/'T' sets the library termination code to aascii['*'] */
+ if (tolower(optarg[0])=='t') {
+ m_msg->ldb_info.term_code = aascii['*'];
+ optarg++;
+ }
+ if (*optarg) {sscanf (optarg, "%d", &ppst->tr_type);}
+ break;
+ case 'U':
+ m_msg->qdnaseq = SEQT_RNA;
+ memcpy(qascii,nascii,sizeof(qascii));
+ SAFE_STRNCPY(m_msg->sqnam,"nt",4);
+ nt[nascii['T']]='U';
+ prot2dna=1;
+ break;
+ case 'W':
+ sscanf (optarg,"%d",&m_msg->aln.llcntx);
+ m_msg->aln.llcntx_set = 1;
+ break;
+ case 'X':
+ parse_ext_opts(optarg, pgm_id, m_msg, ppst);
+ break;
+ case 'z':
+ /* -z "N M" | "N,M" | "N"; an out-of-range second value becomes 2 */
+ if (strchr(optarg,' ')!=NULL) {
+ sscanf(optarg,"%d %d",&ppst->zsflag,&ppst->zsflag2);
+ if (ppst->zsflag2 < 1 || ppst->zsflag2 > 6) ppst->zsflag2 = 2;
+ }
+ else if (strchr(optarg,',')!=NULL) {
+ sscanf(optarg,"%d,%d",&ppst->zsflag,&ppst->zsflag2);
+ if (ppst->zsflag2 < 1 || ppst->zsflag2 > 6) ppst->zsflag2 = 2;
+ }
+ else {
+ sscanf(optarg,"%d",&ppst->zsflag);
+ ppst->zsflag2 = (ppst->zsflag % 10);
+ }
+ break;
+ }
+}
+
+/* letters accepted after -X; a trailing ':' means the letter is followed
+   directly by an argument */
+static char my_opts[] = "1BIM:ox:y:N:";
+
+/* parse_ext_opts() - handle the extended "-X<letter>[argument]" options.
+
+   opt_arg - text that followed -X: one option letter, then its argument
+   pgm_id  - index into pgm_def_arr[] / msg_def_arr[]
+   m_msp   - display/bookkeeping settings, updated in place
+   ppst    - scoring/search parameters, updated in place
+
+   Letters that are not in my_opts[] are ignored.
+*/
+void
+parse_ext_opts(char *opt_arg, int pgm_id, struct mngmsg *m_msp, struct pstruct *ppst) {
+ long l_arg;
+ char c_arg, c_opt, *the_arg, *bp;
+
+ c_opt = *opt_arg;
+ /* strchr() also matches the terminating '\0', so an empty opt_arg has
+    to be rejected here or bp+1 would point past the end of my_opts[] */
+ if (c_opt == '\0' || (bp=strchr(my_opts, c_opt))==NULL) {
+ return;
+ }
+
+ /* always defined; it is only meaningful for letters followed by ':' */
+ the_arg = opt_arg+1;
+
+ switch (c_opt) {
+ case '1':
+ if (pgm_def_arr[pgm_id].ktup > 0) {
+ ppst->param_u.fa.iniflag=1;
+ }
+ break;
+ case 'B': m_msp->z_bits = 0; break;
+ case 'I':
+ m_msp->tot_ident = 1;
+ /*
+ l_arg = 0;
+ sscanf(the_arg,"%ld",&l_arg);
+ if (l_arg > 0) m_msp->tot_ident = l_arg;
+ */
+ break;
+ case 'M':
+ /* -XM<n>[G]: n is multiplied by 1024, and by 1024 again with a 'G'
+    suffix; a negative n selects BIGNUM */
+ c_arg = '\0';
+ l_arg = 0;
+ /* no number at all: leave the current limit untouched */
+ if (sscanf(the_arg,"%ld%c",&l_arg,&c_arg) < 1) break;
+ if (l_arg < 0) m_msp->max_memK = BIGNUM;
+ else {
+ l_arg *= 1024;
+ if (c_arg == 'G') l_arg *= 1024;
+ m_msp->max_memK = l_arg;
+ }
+ break;
+ case 'N':
+ case 'X': /* NOTE(review): 'X' is not in my_opts[], so only 'N' gets here */
+ ppst->pam_x_id_sim = 0;
+ if (*the_arg == 'S' || *the_arg == '+') {
+ ppst->pam_x_id_sim = 1;
+ }
+ else if (*the_arg == 'D' || *the_arg == '-') {
+ ppst->pam_x_id_sim = -1;
+ }
+ break;
+ case 'o':
+ if (pgm_def_arr[pgm_id].ktup > 0) {
+ ppst->param_u.fa.optflag = 0;
+ msg_def_arr[pgm_id].nrelv = m_msp->nrelv = 2;
+ }
+ break;
+ case 'x':
+ /* -Xx"xx xm" | "xx,xm" | "xx" (a single value is used for both) */
+ if (strchr(the_arg,' ')!=NULL) {
+ sscanf (the_arg,"%d %d",&ppst->pam_xx, &ppst->pam_xm);
+ }
+ else if (strchr(the_arg,',')!=NULL) {
+ sscanf (the_arg,"%d,%d",&ppst->pam_xx, &ppst->pam_xm);
+ }
+ else {
+ sscanf (the_arg,"%d",&ppst->pam_xx);
+ ppst->pam_xm = ppst->pam_xx;
+ }
+ ppst->pam_x_set=1;
+ break;
+ case 'y':
+ if (pgm_def_arr[pgm_id].ktup > 0) {
+ sscanf (the_arg, "%d", &ppst->param_u.fa.optwid);
+ ppst->param_u.fa.optwid_set = 1;
+ }
+ break;
+ }
+}
+/* f_lastenv() - finish the query character map (qascii).
+
+   save_str is "*" in most builds and "," when built as FASTM, FASTS or
+   FASTF.  While the query type is still SEQT_UNK, qascii is built with
+   build_xascii() (plus add_ascii_ann() when m_msg->ann_flg is set).
+   Otherwise init_ascii() is applied - except in FASTS/FASTM/FASTF/
+   FASTX/FASTY builds - followed by validate_novel_aa().
+*/
+void
+f_lastenv (struct mngmsg *m_msg, struct pstruct *ppst)
+{
+ char save_str[MAX_SSTR];
+
+#if !defined(FASTM) && !defined(FASTS) && !defined(FASTF)
+ SAFE_STRNCPY(save_str,"*",sizeof(save_str));
+#else
+ SAFE_STRNCPY(save_str,",",sizeof(save_str));
+#endif
+
+ if (m_msg->qdnaseq == SEQT_UNK) {
+ build_xascii(qascii,save_str);
+ if (m_msg->ann_flg) add_ascii_ann(qascii,m_msg->ann_arr);
+ }
+/* this check allows lc DNA sequence queries with FASTX */
+ else {
+#if !defined(FASTS) && !defined(FASTM) && !defined(FASTF) && !defined(FASTX) && !defined(FASTY)
+ init_ascii(ppst->ext_sq_set,qascii, ppst->nsq, m_msg->qdnaseq);
+#endif
+ validate_novel_aa(qascii, ppst->nsq, m_msg->qdnaseq);
+ }
+}
+
+/* f_getarg() - read the optional trailing positional arguments.
+
+   For programs with a positive default ktup, argv[optind+3] (when
+   present) is the ktup and ktup_set is raised; otherwise the negated
+   program default is stored.  For RSS_PID argv[optind+3], and for
+   RFX_PID argv[optind+4], is read as the shuffle count.  The resulting
+   shuffle count is always copied to m_msg->shuff_max_save.
+*/
+void
+f_getarg (int argc, char **argv, int optind,
+          struct mngmsg *m_msg, struct pstruct *ppst)
+{
+  const int n_left = argc - optind;   /* arguments from argv[optind] on */
+
+  if (pgm_def_arr[ppst->pgm_id].ktup > 0) {
+    if (n_left < 4) {
+      /* negative marks "default, not chosen by the user" */
+      ppst->param_u.fa.ktup = -pgm_def_arr[ppst->pgm_id].ktup;
+    }
+    else {
+      sscanf (argv[optind + 3], "%d", &ppst->param_u.fa.ktup);
+      ktup_set = 1;
+    }
+  }
+
+  if (n_left > 3 && ppst->pgm_id == RSS_PID) {
+    sscanf (argv[optind + 3], "%d", &m_msg->shuff_max);
+  }
+
+  if (n_left > 4 && ppst->pgm_id == RFX_PID) {
+    sscanf (argv[optind + 4], "%d", &m_msg->shuff_max);
+  }
+
+  m_msg->shuff_max_save = m_msg->shuff_max;
+}
+
+/* re_ascii() - refresh the query character map from the parameter map.
+
+   Every one of the 128 entries of qascii[] whose value lies outside
+   ESS .. NANN+max_ann_arr is replaced by the matching entry of
+   pascii[]; entries inside that range are kept.
+*/
+
+void
+re_ascii(int *qascii, int *pascii, int max_ann_arr) {
+  const int keep_hi = NANN+max_ann_arr;   /* largest value that is kept */
+  int c;
+
+  for (c = 0; c < 128; c++) {
+    if (qascii[c] >= ESS && qascii[c] <= keep_hi) continue;
+    qascii[c] = pascii[c];
+  }
+}
+
+
+/* recode has become function specific to accommodate FASTS/M */
+/* modified 28-Dec-2004 to ensure that all mapped characters
+   are valid */
+/* recode() - convert a query from characters to residue codes in place.
+
+   seq    - n characters, followed by room for one terminator
+   qascii - character -> residue code map
+   nsqx   - largest valid residue code
+
+   Characters above '@' and '*' are mapped through qascii[].  Any result
+   above nsqx, other than ESS, is reported on stderr and replaced by
+   qascii['X'].  seq[n] is set to EOSEQ.  Returns the number of
+   residues (n, or 0 when n is negative).
+*/
+int
+recode(unsigned char *seq, int n, int *qascii, int nsqx) {
+  int i;
+  char save_c;
+
+#if defined(FASTS) || defined(FASTM)
+  qascii[',']=ESS;
+#endif
+
+  for (i=0; i < n; i++) {
+    save_c = seq[i];
+    /* re_ascii() in this file treats these maps as 128-entry tables, so a
+       byte >= 128 must not be used as an index; it is left unmapped and
+       is then handled by the unrecognized-residue check below */
+    if ((seq[i] > '@' && seq[i] < 128) || seq[i]=='*') seq[i] = qascii[seq[i]];
+    if (seq[i] > nsqx && seq[i]!=ESS) {
+      fprintf(stderr, "*** Warning - unrecognized residue at %d:%c - %2d\n",
+              i,save_c,save_c);
+      seq[i] = qascii['X'];
+    }
+  }
+  seq[i]=EOSEQ;
+  return i;
+}
+
+/* here we have the query sequence, all the command line options,
+ but we need to set various parameter options based on the type
+ of the query sequence (m_msg->qdnaseq = 0:protein/1:DNA) and
+ the function (FASTA/FASTX/TFASTA)
+
+ 29-Jun-2008 add code to ensure that weird ('O', 'U') amino-acids
+ are read properly.
+
+ 15-Nov-2010 -- modify scoring matrix for very short query sequences
+ (e.g. short read metagenomics)
+*/
+
+/* this resetp is for a conventional FASTA/TFASTXYZ search
+
+   m_msg - display/bookkeeping settings; frames, labels and the library
+           alphabet type are updated here
+   ppst  - scoring parameters; alphabet, gap penalties, heuristics and
+           the default scoring matrix name are updated here
+
+   Values the user set explicitly (tracked by the *_set flags) are kept.
+   Exits with status 1 when the query alphabet does not fit the program.
+*/
+void
+resetp (struct mngmsg *m_msg, struct pstruct *ppst) {
+ int i, pgm_id;
+
+ pgm_id = ppst->pgm_id;
+
+ /* check for alphabet conflict */
+
+ ppst->shuffle_dna3 = 0;
+#if defined(TFAST)
+ if (m_msg->qdnaseq == SEQT_DNA || m_msg->qdnaseq == SEQT_RNA) {
+ fprintf(stderr," %s compares a protein to a translated\n\
+DNA sequence library. Do not use a DNA query/scoring matrix.\n",prog_func);
+ exit(1);
+ }
+ ppst->shuffle_dna3 = 1;
+#else
+#if (defined(FASTX) || defined(FASTY))
+ if (!(m_msg->qdnaseq == SEQT_DNA || m_msg->qdnaseq == SEQT_RNA)) {
+ fprintf(stderr," FASTX/Y compares a DNA sequence to a protein database\n");
+ fprintf(stderr," Use a DNA query\n");
+ exit(1);
+ }
+#endif
+#endif
+
+ /* **************************************************************** */
+ /* adjust alphabets for prot:prot or DNA:DNA alignments */
+
+ /* this code changes parameters for programs (FA_PID, SS_PID, FS_PID,
+ RSS_PID) that can examine either protein (initial state) or DNA
+ Modified May, 2006 to reset e_cut for DNA comparisons.
+ */
+ /* **************************************************************** */
+
+ /* runs once per program entry: q_seqt is no longer SEQT_UNK afterwards */
+ if (msg_def_arr[pgm_id].q_seqt == SEQT_UNK) {
+ if (m_msg->qdnaseq == SEQT_DNA || m_msg->qdnaseq == SEQT_RNA) {
+ msg_def_arr[pgm_id].q_seqt = m_msg->qdnaseq;
+ msg_def_arr[pgm_id].p_seqt = SEQT_DNA;
+ msg_def_arr[pgm_id].l_seqt = SEQT_DNA;
+ if (m_msg->qdnaseq == SEQT_DNA) msg_def_arr[pgm_id].qframe = 2;
+ if (!m_msg->e_cut_set) {
+ pgm_def_arr[pgm_id].e_cut /= 5.0;
+ ppst->e_cut_r = 0.001;
+ }
+ }
+ else {
+ msg_def_arr[pgm_id].q_seqt = SEQT_PROT;
+ }
+ }
+
+ /* set the comparison type (PROT/DNA) in ppst */
+ ppst->dnaseq = msg_def_arr[pgm_id].p_seqt;
+
+ if (!sw_flag_set) ppst->sw_flag = msg_def_arr[pgm_id].sw_flag;
+ if (!m_msg->e_cut_set) {
+ ppst->e_cut = m_msg->e_cut=pgm_def_arr[pgm_id].e_cut;
+#ifdef LALIGN
+ ppst->e_cut_r = ppst->e_cut;
+#endif
+ }
+
+ if (ppst->dnaseq == SEQT_DNA && m_msg->qdnaseq==SEQT_RNA) {
+ ppst->dnaseq = SEQT_RNA;
+ ppst->nt_align = 1;
+ }
+ if (ppst->dnaseq==SEQT_DNA) pascii = &nascii[0];
+ else if (ppst->dnaseq==SEQT_RNA) {
+ pascii = &nascii[0];
+ ppst->sq[nascii['T']] = 'U';
+ }
+ else pascii = &aascii[0];
+ m_msg->ldb_info.ldnaseq = msg_def_arr[pgm_id].l_seqt;
+
+ /* build the library character map (lascii) for the library alphabet */
+ if (m_msg->ldb_info.ldnaseq & SEQT_DNA) {
+ memcpy(lascii,nascii,sizeof(lascii));
+#ifndef TFAST
+#ifdef DNALIB_LC
+ init_ascii(ppst->ext_sq_set,lascii, ppst->nsq, m_msg->ldb_info.ldnaseq);
+#endif
+#else
+ /* no init_ascii() because we translate lower case library sequences */
+#endif
+ validate_novel_aa(lascii, ppst->nsq, m_msg->ldb_info.ldnaseq);
+ }
+ else {
+ memcpy(lascii,aascii,sizeof(lascii)); /* initialize lib mapping */
+ if (m_msg->ann_flg && strchr((char *)m_msg->ann_arr,'*')) {lascii['*'] = NA;}
+
+#if defined(FASTF) || defined(FASTS) || defined(FASTM)
+ lascii['*'] = NA;
+#endif
+ init_ascii(ppst->ext_sq_set,lascii, ppst->nsq, m_msg->ldb_info.ldnaseq);
+ validate_novel_aa(lascii, ppst->nsq, m_msg->ldb_info.ldnaseq);
+ }
+
+ /* have lascii - initialize l_ann_ascii[] if necessary */
+ if (m_msg->ann_flg) {
+ memcpy(l_ann_ascii,lascii,sizeof(l_ann_ascii));
+ /* make certain that '*' is treated correctly */
+ if (strchr((char *)m_msg->ann_arr,'*')) {l_ann_ascii['*'] = NA;}
+ add_ascii_ann(l_ann_ascii, m_msg->ann_arr);
+ }
+
+ /* **************************************************************** */
+ /* adjust qframe/nframe if DNA/translated DNA search */
+ /* **************************************************************** */
+
+ if (!nframe_set) {
+ m_msg->qframe = msg_def_arr[pgm_id].qframe;
+ m_msg->nframe = msg_def_arr[pgm_id].nframe;
+ }
+
+ /* the possibilities:
+ -i -3 qframe revcomp
+ FA_D/FX - - 2 0
+ FA_D/FX + - 2 1
+ FA_D/FX - + 1 0
+ FA_D/FX + + 2 1
+ */
+
+ if (m_msg->qdnaseq == SEQT_DNA) {
+ m_msg->nframe = 1;
+ if (m_msg->qframe == 1 && m_msg->revcomp==1) {
+ m_msg->qframe = m_msg->revcomp+1;
+ }
+ }
+ else if (m_msg->qdnaseq == SEQT_RNA) {
+ m_msg->qframe = m_msg->revcomp+1;
+ m_msg->nframe = 1;
+ }
+
+ /* **************************************************************** */
+ /* adjust FASTA heuristics for DNA/translated DNA search */
+ /* **************************************************************** */
+
+ if (ppst->dnaseq == SEQT_DNA || ppst->dnaseq == SEQT_RNA) {
+ ppst->histint = 4;
+
+ if (!del_set) {
+#ifdef OLD_FASTA_GAP
+ ppst->gdelval = -16; /* def. del penalty */
+#else
+ ppst->gdelval = -12; /* def. open penalty */
+#endif
+ }
+ if (!gap_set) ppst->ggapval = -4; /* def. gap penalty */
+
+ ppst->nsq = nnt;
+ ppst->nsqx = nntx;
+ ppst->sq[ppst->nsqx+1] = ppst->sqx[ppst->nsqx+1] = '\0';
+
+ /* reset parameters for DNA */
+ if (pgm_def_arr[pgm_id].ktup > 0) {
+ /* these parameters are used to scale optcut, and are being replaced
+ by statistically based parameters */
+ /* largest ktup */
+ pgm_def_arr[pgm_id].ktup = mktup = 6;
+ if (!ppst->param_u.fa.optwid_set) ppst->param_u.fa.optwid = 16;
+ ppst->param_u.fa.bestscale = 80;
+ ppst->param_u.fa.bkfact = 5;
+ ppst->param_u.fa.scfact = 1;
+ ppst->param_u.fa.bktup = mktup;
+ ppst->param_u.fa.bestmax = 80;
+ ppst->param_u.fa.bestoff = 45;
+ if (!E_thresh_set) ppst->param_u.fa.E_band_opt = 0.05;
+
+ if (!sw_flag_set) {
+ ppst->sw_flag = 0;
+ SAFE_STRNCPY(m_msg->f_id1,"bs",sizeof(m_msg->f_id1));
+ SAFE_STRNCPY(m_msg->alabel, align_label[1], sizeof(m_msg->alabel));
+ }
+
+ /* largest ktup */
+ mktup = 6;
+
+ if (ppst->param_u.fa.pamfact >= 0) ppst->param_u.fa.pamfact = 0;
+ /* a negative ktup is a default that the user did not choose */
+ if (ppst->param_u.fa.ktup < 0)
+ ppst->param_u.fa.ktup = -ppst->param_u.fa.bktup;
+ }
+
+ for (i=0; i<=ppst->nsqx; i++) {
+ ppst->hsq[i] = hnt[i];
+ ppst->sq[i] = nt[i];
+ ppst->hsqx[i] = hntx[i];
+ ppst->sqx[i] = ntx[i];
+ }
+
+ /* **************************************************************** */
+ /* adjust scoring matrix for DNA:DNA search */
+ /* **************************************************************** */
+
+ if (!ppst->pam_set) {
+ if (ppst->p_d_set)
+ mk_n_pam(npam,nnt,ppst->p_d_mat,ppst->p_d_mis);
+#if !defined(FASTS) && !defined(FASTM)
+ else if (ppst->pamfile[0]=='\0' || strncmp(ppst->pamfile,"BL50",4)==0) {
+ SAFE_STRNCPY (ppst->pamfile, "+5/-4", sizeof(ppst->pamfile));
+ SAFE_STRNCPY(ppst->pamfile_save, ppst->pamfile, sizeof(ppst->pamfile_save));
+ /* size the copy by its destination, as the FASTS/FASTM branch does */
+ SAFE_STRNCPY (ppst->pam_name, "+5/-4", sizeof(ppst->pam_name));
+ }
+#else
+ else if (strncmp(ppst->pamfile,"MD20",4)==0) {
+ SAFE_STRNCPY (ppst->pamfile, "+2/-2", sizeof(ppst->pamfile));
+ SAFE_STRNCPY (ppst->pam_name, "+2/-2", sizeof(ppst->pam_name));
+ SAFE_STRNCPY(ppst->pamfile_save, ppst->pamfile, sizeof(ppst->pamfile_save));
+ ppst->p_d_mat = +2;
+ ppst->p_d_mis = -2;
+ mk_n_pam(npam,nnt,ppst->p_d_mat,ppst->p_d_mis);
+ }
+#endif
+ pam = npam;
+ }
+
+ SAFE_STRNCPY (m_msg->sqnam, "nt",sizeof(m_msg->sqnam));
+ SAFE_STRNCPY (m_msg->sqtype, "DNA",sizeof(m_msg->sqtype));
+ } /* end DNA reset */
+
+ else { /* other parameters for protein comparison */
+ if (pgm_def_arr[pgm_id].ktup > 0) {
+ if (!ppst->param_u.fa.optwid_set) {
+ if (ppst->param_u.fa.ktup==1) ppst->param_u.fa.optwid = 32;
+ else ppst->param_u.fa.optwid = 16;
+ }
+ }
+ if (!shift_set) {ppst->gshift = pgm_def_arr[pgm_id].gshift;}
+ if (!subs_set) {ppst->gsubs = pgm_def_arr[pgm_id].hshift;}
+ }
+
+ /* sized by the destination, like the other pamfile_save copies above */
+ SAFE_STRNCPY(ppst->pamfile_save, ppst->pamfile, sizeof(ppst->pamfile_save));
+}
+
+/* query_parm() this function asks for any additional parameters
+ that have not been provided. Could be null.
+
+ For programs with a positive default ktup: a negative (default) ktup
+ is made positive; a ktup of 0 is asked for on stdin, and when it is
+ still 0 afterwards the program default is used.  Any other value
+ raises ktup_set.  In PRSS builds the number of shuffles and the
+ shuffle window are asked for as well.  End of input on stdin ends
+ the program with exit(0).
+*/
+void
+query_parm (struct mngmsg *m_msp, struct pstruct *ppst)
+{
+ char qline[40];
+
+ if (pgm_def_arr[ppst->pgm_id].ktup > 0) {
+ if (ppst->param_u.fa.ktup < 0)
+ ppst->param_u.fa.ktup = -ppst->param_u.fa.ktup;
+
+ if (ppst->param_u.fa.ktup == 0) {
+ printf (" ktup? (1 to %d) [%d] ", mktup, pgm_def_arr[ppst->pgm_id].ktup);
+ if (fgets (qline, sizeof(qline), stdin) == NULL) exit (0);
+ else sscanf(qline,"%d",&ppst->param_u.fa.ktup);
+ }
+ if (ppst->param_u.fa.ktup == 0) /* empty or non-numeric answer */
+ ppst->param_u.fa.ktup = pgm_def_arr[ppst->pgm_id].ktup;
+ else ktup_set = 1;
+ }
+
+#if defined(PRSS)
+ if (m_msp->shuff_max < 10) m_msp->shuff_max = MAX_RSTATS;
+
+ if (!mshuff_set) {
+ printf(" number of shuffles [%d]? ",m_msp->shuff_max);
+ fflush(stdout);
+ if (fgets (qline, sizeof(qline), stdin) == NULL) exit (0);
+ else sscanf(qline,"%d",&m_msp->shuff_max);
+ }
+
+ if (ppst->zs_win == 0) {
+ printf (" local (window) (w) or uniform (u) shuffle [u]? ");
+ if (fgets (qline, sizeof(qline), stdin) == NULL) exit (0);
+ else if (qline[0]=='w' || qline[0]=='W') {
+ m_msp->shuff_wid = 20; /* offered as the default window size */
+ printf(" local shuffle window size [%d]? ",m_msp->shuff_wid);
+ if (fgets (qline, sizeof(qline), stdin) == NULL) exit (0);
+ else sscanf(qline,"%d",&m_msp->shuff_wid);
+ }
+ }
+#endif
+}
+
+/* last_init() cannot look at aa0, n0, because it is only run once,
+ it is not run before each new aa0 search
+
+ In order, it: copies zsflag/zsflag2 to m_msg; applies per-program
+ display and shuffle settings; validates ktup and picks the alignment
+ labels; derives nframe/nitt1 and thr_fact; chooses score_ix from
+ iniflag/optflag; sets up the scoring matrices (alloc_pam(),
+ init_pam2(), init_pamx()); when ppst->pam_ms is set equalises the
+ L/I and Q/K scores for protein queries; and finally computes Lambda,
+ K and H with init_karlin_a()/do_karlin_a().
+*/
+void
+last_init (struct mngmsg *m_msg, struct pstruct *ppst)
+{
+ int ix_l, ix_i, i, pgm_id;
+ double *kar_p;
+ double aa0_f[MAXSQ];
+
+ m_msg->zsflag = ppst->zsflag;
+ m_msg->zsflag2 = ppst->zsflag2;
+
+ if (ppst->zsflag < 0) {
+ ppst->do_rep = 0;
+ }
+
+ pgm_id = ppst->pgm_id;
+
+#ifdef LALIGN
+ m_msg->do_showbest = 1;
+ m_msg->quiet = 1;
+#endif
+
+#if defined(FASTF) || defined(FASTS) || defined(FASTM)
+ m_msg->nohist = 1;
+ m_msg->shuff_max = 2000;
+ ppst->shuff_node = m_msg->shuff_max/fa_max_workers;
+#else
+ m_msg->shuff_max = m_msg->shuff_max_save;
+#endif
+
+ if (m_msg->aln.llen < 1) {
+ m_msg->aln.llen = 60;
+ }
+
+ if (m_msg->ldb_info.ldnaseq== SEQT_PROT) {
+ m_msg->max_tot = MAXLIB_P;
+ }
+
+#if defined(FASTX) || defined(FASTY) || defined(TFAST)
+ /* set up translation tables: faatran.c */
+ aainit(ppst->tr_type,ppst->debug_lib);
+#endif
+
+/* a sanity check */
+#if !defined(TFAST)
+ if (m_msg->revcomp && m_msg->qdnaseq!=SEQT_DNA && m_msg->qdnaseq!=SEQT_RNA) {
+ fprintf(stderr," cannot reverse complement protein\n");
+ m_msg->revcomp = 0;
+ }
+#endif
+
+ if (pgm_def_arr[pgm_id].ktup > 0) {
+
+ if (ppst->param_u.fa.ktup < 0)
+ ppst->param_u.fa.ktup = -ppst->param_u.fa.ktup;
+
+ if (ppst->param_u.fa.ktup < 1 || ppst->param_u.fa.ktup > mktup) {
+ fprintf(stderr," warning ktup = %d out of range [1..%d], reset to %d\n",
+ ppst->param_u.fa.ktup, mktup, ppst->param_u.fa.bktup);
+ ppst->param_u.fa.ktup = ppst->param_u.fa.bktup;
+ }
+
+ if (ppst->sw_flag) {
+ SAFE_STRNCPY(m_msg->f_id1,"sw",sizeof(m_msg->f_id1));
+ SAFE_STRNCPY(m_msg->alabel, align_label[0], sizeof(m_msg->alabel));
+ }
+ else {
+ SAFE_STRNCPY(m_msg->f_id1,"bs",sizeof(m_msg->f_id1));
+ SAFE_STRNCPY(m_msg->alabel, align_label[1], sizeof(m_msg->alabel));
+ }
+ }
+
+ if (pgm_id == TFA_PID) {
+ m_msg->revcomp *= 3;
+ if (m_msg->nframe == 3) m_msg->nframe += m_msg->revcomp;
+ }
+ else if (pgm_id == TFX_PID || pgm_id == TFY_PID) {
+ if (m_msg->nframe == 1) m_msg->nframe += m_msg->revcomp;
+ }
+
+#if !defined(TFAST)
+ /* for fasta/fastx searches, itt iterates the query strand */
+ m_msg->nitt1 = m_msg->qframe-1;
+#else
+ /* for tfasta/tfastxy searches, itt iterates the library frames */
+ m_msg->nitt1 = m_msg->nframe-1;
+#endif
+
+ if (pgm_def_arr[pgm_id].ktup > 0) { /* it's FASTA, not SSEARCH */
+ if (ppst->param_u.fa.ktup>=2 && !ppst->param_u.fa.optwid_set) {
+ ppst->param_u.fa.optwid=16;
+ switch (pgm_id) {
+ case FA_PID:
+ case FX_PID:
+ case FY_PID:
+ m_msg->thr_fact = 8;
+ m_msg->thr_fact = 8; /* NOTE(review): repeats the line above */
+ break;
+ case TFA_PID:
+ case TFX_PID:
+ case TFY_PID:
+ m_msg->thr_fact = 4;
+ break;
+ default:
+ m_msg->thr_fact = 4;
+ }
+ }
+ else { m_msg->thr_fact = 4;}
+ }
+ else {
+#if !defined(SW_ALTIVEC) && !defined(SW_SSE2)
+ m_msg->thr_fact = 1; /* unvectorized SSEARCH */
+#else
+ m_msg->thr_fact = 8; /* vectorized SSEARCH */
+#endif
+ }
+
+#ifdef PCOMPLIB
+ m_msg->thr_fact = 1; /* use much larger buffers */
+#endif
+
+#if defined(PRSS)
+ if (m_msg->shuff_max < 10) m_msg->shuff_max = MAX_RSTATS;
+ if (ppst->zsflag < 10) ppst->zsflag += 10;
+ if (ppst->zs_win > 0) {
+ m_msg->shuff_wid = ppst->zs_win;
+ }
+#endif
+
+ if (pgm_def_arr[ppst->pgm_id].ktup > 0) {
+ if (ppst->param_u.fa.iniflag) {
+ ppst->score_ix = 1;
+ SAFE_STRNCPY (m_msg->label, "initn init1", sizeof(m_msg->label));
+ }
+ else if (ppst->param_u.fa.optflag) {
+ ppst->score_ix = 2;
+ m_msg->stages = 1;
+ }
+ }
+
+ if (!ppst->have_pam2) {
+ alloc_pam (MAXSQ, MAXSQ, ppst);
+ init_pam2(ppst);
+ }
+ init_pamx(ppst);
+
+ if (ppst->pam_ms) {
+ if (m_msg->qdnaseq == SEQT_PROT) {
+ /* code to make 'L'/'I' identical scores */
+ ix_l = pascii['L'];
+ ix_i = pascii['I'];
+ ppst->pam2[0][ix_l][ix_i] = ppst->pam2[0][ix_i][ix_l] =
+ ppst->pam2[0][ix_l][ix_l] = ppst->pam2[0][ix_i][ix_i] =
+ max(ppst->pam2[0][ix_l][ix_l],ppst->pam2[0][ix_i][ix_i]);
+ for (i=1; i<=ppst->nsq; i++) {
+ ppst->pam2[0][i][ix_i] = ppst->pam2[0][i][ix_l] =
+ max(ppst->pam2[0][i][ix_l],ppst->pam2[0][i][ix_i]);
+ ppst->pam2[0][ix_i][i] = ppst->pam2[0][ix_l][i] =
+ max(ppst->pam2[0][ix_i][i],ppst->pam2[0][ix_l][i]);
+ }
+
+ /* code to make 'Q'/'K' identical scores */
+ if (!shift_set) { /* NOTE(review): gated on the frameshift option flag - confirm this is intended */
+ ix_l = pascii['Q']; /* ix_l/ix_i are reused here for Q and K */
+ ix_i = pascii['K'];
+ ppst->pam2[0][ix_l][ix_i] = ppst->pam2[0][ix_i][ix_l] =
+ ppst->pam2[0][ix_l][ix_l] = ppst->pam2[0][ix_i][ix_i] =
+ (ppst->pam2[0][ix_l][ix_l]+ppst->pam2[0][ix_i][ix_i]+1)/2;
+ for (i=1; i<=ppst->nsq; i++) {
+ ppst->pam2[0][i][ix_i] = ppst->pam2[0][i][ix_l] =
+ (ppst->pam2[0][i][ix_l]+ppst->pam2[0][i][ix_i]+1)/2;
+ ppst->pam2[0][ix_i][i] = ppst->pam2[0][ix_l][i] =
+ (ppst->pam2[0][ix_i][i]+ppst->pam2[0][ix_l][i]+1)/2;
+ }
+ }
+ }
+ }
+
+ /*
+ print_pam(ppst);
+ */
+
+ /* once we have a complete pam matrix, we can calculate Lambda and K
+ for "average" sequences */
+ kar_p = NULL;
+ init_karlin_a(ppst, aa0_f, &kar_p);
+ do_karlin_a(ppst->pam2[0], ppst, aa0_f,
+ kar_p, &m_msg->Lambda, &m_msg->K, &m_msg->H);
+ ppst->pLambda = m_msg->Lambda;
+ ppst->pK = m_msg->K;
+ ppst->pH = m_msg->H;
+ ppst->LK_set = 1;
+ free(kar_p);
+
+#if defined(FASTF) || defined(FASTS) || defined(FASTM)
+ if (ppst->ext_sq_set) {
+ fprintf(stderr," -S not available on [t]fast[fs]\n");
+ ppst->ext_sq_set = 0;
+ ppst->nsq_e = ppst->nsq;
+
+ /* reset sascii to ignore -S, map lc */
+ init_ascii(0,lascii, ppst->nsq, 0);
+ validate_novel_aa(lascii, ppst->nsq, 0);
+ }
+#endif
+}
+
+/* alloc_pam2p creates a profile structure
+
+   pam2p - NULL to create a new table, or a table returned earlier by
+           this function that is to be resized
+   len   - number of rows (must be >= 1)
+   nsq   - each row has nsq+1 entries (must be >= 0)
+
+   The table is one array of len row pointers plus one zero-filled
+   block of len*(nsq+1) ints stored at pam2p[0]; row i starts at
+   pam2p[0] + i*(nsq+1).  Release it with free_pam2p().
+
+   Returns the table, or NULL on failure.  After a failed resize the
+   table passed in is still valid and still owned by the caller.
+*/
+int **
+alloc_pam2p(int **pam2p, int len, int nsq) {
+  int i;
+  size_t n_cells;
+  int *cells;
+  int **rows;
+
+  /* pam2p[0] must exist, so an empty table cannot be represented */
+  if (len < 1 || nsq < 0) {
+    fprintf(stderr," Cannot allocate pam2p: %d\n",len);
+    return NULL;
+  }
+
+  /* size_t arithmetic: the products can exceed the range of int */
+  n_cells = ((size_t)nsq+1)*(size_t)len;
+
+  if (pam2p == NULL) {
+    if ((pam2p = (int **)calloc((size_t)len,sizeof(int *)))==NULL) {
+      fprintf(stderr," Cannot allocate pam2p: %d\n",len);
+      return NULL;
+    }
+
+    if((pam2p[0] = (int *)calloc(n_cells,sizeof(int)))==NULL) {
+      fprintf(stderr, "Cannot allocate pam2p[0]: %d\n", (nsq+1)*len);
+      free(pam2p);
+      return NULL;
+    }
+  }
+  else {
+    if ((cells = (int *)realloc(pam2p[0],n_cells*sizeof(int)))==NULL) {
+      fprintf(stderr,
+              "Cannot reallocate pam2p[0]: %lu\n",
+              (unsigned long)(n_cells*sizeof(int)));
+      return NULL;
+    }
+    /* realloc() may have moved the block: record it in the old table at
+       once, so the caller's table stays usable if the next step fails */
+    pam2p[0] = cells;
+    memset(cells,0,n_cells*sizeof(int));
+
+    if ((rows = (int **)realloc(pam2p,(size_t)len*sizeof(int *)))==NULL) {
+      fprintf(stderr," Cannot reallocate pam2p: %d\n",len);
+      return NULL;
+    }
+    pam2p = rows;
+    pam2p[0] = cells;
+  }
+
+  for (i=1; i<len; i++) {
+    pam2p[i] = pam2p[0] + (i*(nsq+1));
+  }
+
+  return pam2p;
+}
+
+/* free_pam2p() - release a table built by alloc_pam2p(): first the
+   block of ints held in pam2p[0], then the array of row pointers.
+   A NULL table is accepted and ignored. */
+void free_pam2p(int **pam2p) {
+  if (pam2p == NULL) return;
+
+  free(pam2p[0]);
+  free(pam2p);
+}
+
+/* sortbest has now become comparison function specific so that we can use
+ a different comparison for fasts/f
+*/
+#if !defined(FASTS) && !defined (FASTF) && !defined(FASTM)
+/* qshuffle() - nothing to shuffle in this build; the parameter list
+   matches the FASTS/FASTM and FASTF versions of this function */
+void
+qshuffle(unsigned char *aa0, int n0, int nm0, void *rand_state) {}
+
+#ifndef LALIGN /* LALIGN has last_calc() in last_thresh.c */
+int
+last_calc(
+ unsigned char *aa0, unsigned char *aa1, int maxn,
+ struct beststr **bestp_arr, int nbest,
+ struct mngmsg m_msg, struct pstruct *ppst
+ , void **f_str
+ , void *pstat_str)
+{
+ return nbest; /* stub: no arguments are used, the count comes back unchanged */
+}
+#endif
+
+/* this function is almost never called, thus a slow shell sort */
+/* sortbest() - order bptr[0..nbest-1] by decreasing rst.score[irelv].
+
+   bptr  - array of pointers to results; only the pointers are moved
+   nbest - number of entries in bptr
+   irelv - which entry of rst.score[] to sort on
+*/
+void sortbest (struct beststr **bptr, int nbest, int irelv)
+{
+  int gap, i, j;
+  struct beststr *tmp;
+
+  for (gap = nbest/2; gap > 0; gap /= 2)
+    for (i = gap; i < nbest; i++)
+      for (j = i - gap; j >= 0; j-= gap) {
+        /* this pair is already in order, so the insertion is finished */
+        if (bptr[j]->rst.score[irelv] >= bptr[j + gap]->rst.score[irelv]) break;
+        tmp = bptr[j];
+        bptr[j] = bptr[j + gap];
+        bptr[j + gap] = tmp;
+      }
+}
+
+void show_aux(FILE *fp, struct beststr *bptr) {} /* empty: prints nothing in this build */
+void header_aux(FILE *fp) {} /* empty: prints nothing in this build */
+
+#else
+/* this function is almost never called, thus a slow shell sort */
+/* sortbest() - order bptr[0..nbest-1] by increasing rst.escore.
+
+   bptr  - array of pointers to results; only the pointers are moved
+   nbest - number of entries in bptr
+   irelv - not used by this version
+*/
+void sortbest (struct beststr **bptr, int nbest, int irelv)
+{
+  int gap, i, j;
+  struct beststr *tmp;
+
+  for (gap = nbest/2; gap > 0; gap /= 2)
+    for (i = gap; i < nbest; i++)
+      for (j = i - gap; j >= 0; j-= gap) {
+        /* this pair is already in order, so the insertion is finished */
+        if (bptr[j]->rst.escore < bptr[j + gap]->rst.escore) break;
+        tmp = bptr[j];
+        bptr[j] = bptr[j + gap];
+        bptr[j + gap] = tmp;
+      }
+}
+
+#if defined(FASTS) || defined(FASTM)
+
+/* this shuffle is for FASTS */
+/* convert ',' -> '\0', shuffle each of the substrings */
+/* qshuffle() - shuffle every fragment of a multi-fragment query in place.
+
+   aa0        - encoded query; fragments are separated by ESS
+   n0         - number of entries of aa0 to scan for separators
+   nm0        - number of fragments
+   rand_state - passed through to my_nrand()
+
+   The separators are set to 0 while the fragments are shuffled; the
+   separator in front of fragments 1..nm0-1 is then put back as ESS.
+   Exits with status 1 when the work array cannot be allocated.
+*/
+void
+qshuffle(unsigned char *aa0, int n0, int nm0, void *rand_state) {
+
+  unsigned char **aa0start, *aap, tmp;
+  int i,j,k, ns;
+
+  if (nm0 < 1) return;   /* no fragments: leave the query untouched */
+
+  if ((aa0start=(unsigned char **)calloc(nm0+1,
+                                         sizeof(unsigned char *)))==NULL) {
+    fprintf(stderr,"cannot calloc for qshuffle %d\n",nm0);
+    exit(1);
+  }
+
+  aa0start[0]=aa0;
+  for (k=1,i=0; i<n0; i++) {
+    if (aa0[i]==EOSEQ || aa0[i]==ESS) {
+      aa0[i]='\0';
+      /* aa0start[] has nm0+1 slots; starts beyond that are not recorded */
+      if (k <= nm0) aa0start[k++] = &aa0[i+1];
+    }
+  }
+
+  /* aa0start has the beginning of each substring */
+  for (k=0; k<nm0; k++) {
+    aap=aa0start[k];
+    if (aap == NULL) continue;   /* fewer separators than nm0 implied */
+    ns = strlen((const char *)aap);
+    for (i=ns; i>1; i--) {
+      j = my_nrand(i, rand_state);
+      tmp = aap[j];
+      aap[j] = aap[i-1];
+      aap[i-1] = tmp;
+    }
+    aap[ns] = 0;
+  }
+
+  for (k=1; k<nm0; k++) {
+/* aap = aa0start[k];
+ while (*aap) fputc(pst.sq[*aap++],stderr);
+ fputc('\n',stderr);
+*/
+    if (aa0start[k] != NULL) aa0start[k][-1]=ESS;
+  }
+
+  free(aa0start);
+}
+#endif
+
+#ifdef FASTF
+/* qshuffle() for FASTF - reverse the column order of the query fragments.
+
+   aa0        - nm0 fragments stored at a stride of ncols+1 entries,
+                where ncols = (n0 - nm0 - 1)/nm0 + 1
+   n0         - total query length used to derive ncols
+   nm0        - number of fragments; nothing is done when it is < 1
+   rand_state - unused (the random column shuffle is disabled)
+
+   Column c of every fragment is exchanged with column ncols-1-c.
+*/
+void qshuffle(unsigned char *aa0, int n0, int nm0, void *rand_state) {
+
+  int lo, hi, row, base;
+  int ncols;
+  unsigned char held;
+
+  (void)rand_state;
+
+  /* nm0 is a divisor below; without fragments there is nothing to do */
+  if (nm0 < 1) return;
+
+  ncols = (n0 - nm0 - 1)/nm0 + 1;
+
+  /* walk in from both ends, exchanging whole columns */  /* reverse columns */
+  for (lo = 0, hi = ncols - 1; lo < hi; lo++, hi--) {
+
+    /* swap all hi'th column residues for all lo'th column residues */
+    for (row = 0, base = 0; row < nm0; row++, base += ncols+1) {
+      held = aa0[base + hi];
+      aa0[base + hi] = aa0[base + lo];
+      aa0[base + lo] = held;
+    }
+  }
+}
+#endif
+
+
+/* show additional best_str values: writes rst.segnum and rst.seglen
+   to fp, in the columns labelled by header_aux() */
+void show_aux(FILE *fp, struct beststr *bptr) {
+ fprintf(fp," %2d %3d",bptr->rst.segnum,bptr->rst.seglen);
+}
+
+/* header_aux() - write the column labels for the values that
+   show_aux() prints */
+void header_aux(FILE *fp) {
+  static const char aux_labels[] = " sn sl";
+
+  fputs(aux_labels, fp);
+}
+#endif
+
+/* fill_pam() - turn scaled frequencies into integer profile scores.
+
+   pam2p    - profile being filled, indexed [position][residue]
+   n0       - number of positions
+   nsq      - not used here
+   freq2d   - input values, indexed [position][0..19]
+   scale    - multiplier applied to every freq2d value
+   no_remap - when non-zero for [position][k], the score is stored in
+              column k+1; otherwise in column qascii[pssm_aa[k+1]]
+
+   Each product is rounded to the nearest integer, halves away from zero.
+*/
+void
+fill_pam(int **pam2p, int n0, int nsq, double **freq2d, double scale, int **no_remap) {
+  int pos, aa, mapped;
+  double scaled;
+
+  /* fprintf(stderr, "scale: %g\n", scale); */
+
+  /* now fill in the pam matrix: */
+  for (aa = 1; aa <= 20; aa++) {
+    mapped = qascii[pssm_aa[aa]];
+
+    for (pos = 0; pos < n0; pos++) {
+      scaled = scale * freq2d[pos][aa-1];
+      /* bias by half a unit so that the (int) truncation rounds */
+      scaled = (scaled < 0.0) ? scaled - 0.5 : scaled + 0.5;
+
+      if (no_remap[pos][aa-1]) pam2p[pos][aa] = (int)scaled;
+      else pam2p[pos][mapped] = (int)scaled;
+    }
+  }
+}
+/* get_lambda() - estimate lambda for a position specific score matrix.
+
+   pam2p - scores indexed [position][residue]
+   n0    - number of positions
+   nsq   - residues 1..nsq-1 are examined
+   query - query residues, either already encoded (values below 'A')
+           or as letters, which are mapped through aascii[]
+
+   The score range is found first, then a probability for every score
+   is accumulated from rrcounts[] of the query residue and of the
+   column residue, normalised to sum to 1, and passed to karlin().
+   Exits with status 1 when the histogram cannot be allocated.
+
+   NOTE(review): lambda is returned even when karlin() reports failure;
+   whether it has been assigned in that case depends on karlin(), which
+   is not visible here.
+*/
+double
+get_lambda(int **pam2p, int n0, int nsq, unsigned char *query) {
+ double lambda, H;
+ double *pr, tot, sum;
+ int i, ioff, j, min, max, q_i;
+
+ /* get min and max scores */
+ min = BIGNUM;
+ max = -BIGNUM;
+ if(pam2p[0][1] == -BIGNUM) { /* row 0 holds the -BIGNUM marker: skip it, read one row further */
+ ioff = 1;
+ n0++;
+ } else {
+ ioff = 0;
+ }
+
+ for (i = ioff ; i < n0 ; i++) {
+ for (j = 1; j < nsq ; j++) {
+ if (min > pam2p[i][j])
+ min = pam2p[i][j];
+ if (max < pam2p[i][j])
+ max = pam2p[i][j];
+ }
+ }
+
+ /* fprintf(stderr, "min: %d\tmax:%d\n", min, max); */
+
+ if ((pr = (double *) calloc(max - min + 1, sizeof(double))) == NULL) {
+ fprintf(stderr, "Couldn't allocate memory for score probabilities: %d\n", max - min + 1);
+ exit(1);
+ }
+
+ tot = (double) rrtotal * (double) rrtotal * (double) n0;
+ for (i = ioff ; i < n0 ; i++) {
+
+ if (query[i] < 'A') {q_i = query[i];}
+ else {q_i= aascii[query[i]];}
+
+ for (j = 1; j < nsq ; j++) {
+ pr[pam2p[i][j] - min] +=
+ (double) ((double) rrcounts[q_i] * (double) rrcounts[j]) / tot;
+ }
+ }
+
+ sum = 0.0;
+ for(i = 0 ; i <= max-min ; i++) {
+ sum += pr[i];
+ /* fprintf(stderr, "%3d: %g %g\n", i+min, pr[i], sum); */
+ }
+ /* fprintf(stderr, "sum: %g\n", sum); */
+
+ for(i = 0 ; i <= max-min ; i++) { pr[i] /= sum; } /* normalise so the probabilities sum to 1 */
+
+ if (!karlin(min, max, pr, &lambda, &H)) {
+ fprintf(stderr, "Karlin lambda estimation failed\n");
+ }
+
+ /* fprintf(stderr, "lambda: %g\n", lambda); */
+ free(pr);
+
+ return lambda;
+}
+
+/*
+ *aa0 - query sequence
+ n0 - length
+ pamscale - scaling for pam matrix - provided by apam.c, either
+ 0.346574 = ln(2)/2 (P120, BL62) or
+ 0.231049 = ln(2)/3 (P250, BL50)
+*/
+
+/* forward declaration: scale_pssm() is defined after read_pssm(), which calls it */
+void
+scale_pssm(int **pssm2p, double **freq2d,
+ unsigned char *query, int n0,
+ int **pam2, double pamscale);
+
+/* the 20 standard amino acids as ASCII letters, preceded by a NUL so that
+   the first letter sits at index 1; scale_pssm() passes this to get_lambda()
+   as the "query" when measuring lambda for the plain scoring matrix.
+   NOTE(review): the letter order presumably mirrors the matrix row order -- confirm */
+static unsigned char ustandard_aa[] ="\0ARNDCQEGHILKMFPSTWYV";
+
+/* read_pssm() loads a checkpoint profile from the open stream fp and
+   converts it into scores in ppst->pam2p[0] by calling scale_pssm().
+
+   aa0        - query sequence (not consulted; the query stored in the
+                file is used instead, and the two are not compared)
+   n0         - query length; must equal the length stored in the file
+   nsq        - not used
+   pamscale   - scoring-matrix scale passed through to scale_pssm()
+   fp         - open profile file, nothing read from it yet
+   pgpf_type  - 0: binary file (int length, length query bytes, then
+                   n0 x 20 doubles)
+                1: text file (a line with the length, a line with the
+                   query, then one line per position holding a character
+                   and up to 20 numbers, each divided by 10.0 on input)
+   ppst       - supplies pam2p[0] (destination) and pam2[0] (matrix)
+
+   Any read, allocation or format problem prints a message and exits.
+   The stream is left open; closing it is up to the caller.
+*/
+void
+read_pssm(unsigned char *aa0, int n0, int nsq,
+          double pamscale,
+          FILE *fp, int pgpf_type, struct pstruct *ppst) {
+  int len = 0;	/* zeroed so a failed read never prints an indeterminate value */
+  int k;
+  int qi, rj;	/* qi - index query; rj - index residues (0-19) */
+  unsigned char *query, ctmp;
+  char dline[512];
+  double freq, **freq2d;
+
+  /* step 1: the stored profile length */
+  if (pgpf_type == 0) {
+    if (1 != fread(&len, sizeof(int), 1, fp)) {
+      fprintf(stderr, "error reading from checkpoint file: %d\n", len);
+      exit(1);
+    }
+  }
+  else if (pgpf_type == 1) {
+    if ((fgets(dline,sizeof(dline),fp) == NULL) ||
+        (1 != sscanf(dline, "%d",&len))) {
+      fprintf(stderr, "error reading from checkpoint file: %d\n", len);
+      exit(1);
+    }
+  }
+  else {
+    fprintf(stderr," Unrecognized PSSM file type: %d\n",pgpf_type);
+    exit(1);
+  }
+
+  if (len != n0) {
+    fprintf(stderr, "profile length (%d) and query length (%d) don't match!\n",
+            len,n0);
+    exit(1);
+  }
+
+  /* step 2: read over query sequence stored in BLAST profile */
+  if(NULL == (query = (unsigned char *) calloc(len+2, sizeof(char)))) {
+    fprintf(stderr, "Couldn't allocate memory for query!\n");
+    exit(1);
+  }
+
+  if (pgpf_type == 0) {
+    if (fread(query, sizeof(char), len, fp) != (size_t)len) {
+      fprintf(stderr, "Couldn't read query sequence from profile: %s\n", query);
+      exit(1);
+    }
+  }
+  else {
+    if (fgets((char *)query,len+2,fp)==NULL) {
+      fprintf(stderr, "Couldn't read query sequence from profile: %s\n", query);
+      exit(1);
+    }
+  }
+
+  /* currently we don't do anything with query except hand it to
+     scale_pssm(); ideally, we should check to see that it actually
+     matches aa0 ... */
+
+  /* step 3: n0 x 20 frequency table -- one pointer vector plus one
+     contiguous slab that the row pointers index into */
+  if((freq2d = (double **) calloc(n0, sizeof(double *))) == NULL) {
+    fprintf(stderr, "Couldn't allocate memory for frequencies!\n");
+    exit(1);
+  }
+
+  if((freq2d[0] = (double *) calloc((size_t)n0 * 20, sizeof(double))) == NULL) {
+    fprintf(stderr, "Couldn't allocate memory for frequencies!\n");
+    exit(1);
+  }
+
+  for (qi = 1 ; qi < n0 ; qi++) {
+    freq2d[qi] = freq2d[qi-1] + 20;
+  }
+
+  /* step 4: fill the table */
+  if (pgpf_type == 0) {
+    for (qi = 0 ; qi < n0 ; qi++) {
+      for (rj = 0 ; rj < 20 ; rj++) {
+        if(1 != fread(&freq, sizeof(double), 1, fp)) {
+          fprintf(stderr, "Error while reading frequencies!\n");
+          exit(1);
+        }
+        freq2d[qi][rj] = freq;
+      }
+    }
+  }
+  else {
+    for (qi = 0 ; qi < n0 ; qi++) {
+      /* NOTE(review): a line is accepted as soon as one field converts, so a
+         short line leaves the remaining entries at 0.0 -- confirm that this
+         leniency is intended */
+      if ((fgets(dline,sizeof(dline),fp) ==NULL) ||
+          (k = sscanf(dline,"%c %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg\n",
+                      &ctmp, &freq2d[qi][0], &freq2d[qi][1], &freq2d[qi][2], &freq2d[qi][3], &freq2d[qi][4],
+                      &freq2d[qi][5], &freq2d[qi][6], &freq2d[qi][7], &freq2d[qi][8], &freq2d[qi][9],
+                      &freq2d[qi][10], &freq2d[qi][11], &freq2d[qi][12], &freq2d[qi][13], &freq2d[qi][14],
+                      &freq2d[qi][15], &freq2d[qi][16], &freq2d[qi][17], &freq2d[qi][18], &freq2d[qi][19]))<1) {
+        fprintf(stderr, "Error while reading frequencies: %d read!\n",k);
+        exit(1);
+      }
+      for (rj=0; rj<20; rj++) { freq2d[qi][rj] /= 10.0; }	/* reverse scaling */
+    }
+  }
+
+  /* step 5: convert to scores, then release the scratch storage */
+  scale_pssm(ppst->pam2p[0], freq2d, query, n0, ppst->pam2[0],pamscale);
+
+  free(freq2d[0]);
+  free(freq2d);
+
+  free(query);
+}
+
+/* before fasta-36.3.6 (with reordered amino-acid mapping), scale_pssm()
+ simply produced a log(q_ij/p_j) and put it into pam2p.
+
+ But PSSMs use pssm_aa encoding, while fasta-36.3.6 uses NCBIstdaa
+ encoding, so the pam2p must be re-mapped
+*/
+
+/* scale_pssm() converts the frequencies in freq2d[][] to log-odds values
+   (in place) and then searches for the multiplier that makes the integer
+   profile written to pssm2p[][] have the same lambda as the scoring
+   matrix pam2.
+
+   pssm2p    - destination profile, filled via fill_pam()
+   freq2d    - n0 x 20 frequencies on entry; overwritten with log-odds
+               (or matrix) values
+   query     - residue at each profile position; values below 'A' are used
+               directly as matrix indices, others are mapped via aascii[]
+   n0        - number of profile positions
+   pam2      - scoring matrix used for missing entries and as the lambda target
+   pamscale  - divisor applied to the natural-log odds
+
+   Exits on allocation failure or if a trial profile yields lambda <= 0.
+*/
+void
+scale_pssm(int **pssm2p, double **freq2d, unsigned char *query, int n0, int **pam2, double pamscale) {
+ int i, qi, rj;
+ double freq, new_lambda, lambda;
+ int first, too_high;
+ double scale, scale_high, scale_low;
+ int **no_remap;
+
+
+ /* quick 2d array alloc: pointer vector plus one contiguous n0 x 20 slab */
+ if((no_remap = (int **) calloc(n0, sizeof(int *))) == NULL) {
+ fprintf(stderr, "***error [%s:%d] Couldn't allocate memory for remap[%d]\n",__FILE__, __LINE__, n0);
+ exit(1);
+ }
+
+ if((no_remap[0] = (int *) calloc(n0 * 20, sizeof(int))) == NULL) {
+ fprintf(stderr, "***error [%s:%d] Couldn't allocate memory for remap[%d]\n",__FILE__, __LINE__, n0);
+ exit(1);
+ }
+
+ for (qi=1; qi < n0; qi++) {
+ no_remap[qi] = no_remap[qi-1]+20;
+ }
+
+ /* convert freq2d from frequencies to log_scores;
+ fill zeros with scoring matrix (pam2) values */
+
+ for (rj = 0 ; rj < 20 ; rj++) {
+ for (qi = 0 ; qi < n0 ; qi++) {
+ if (freq2d[qi][rj] > 1e-20) {
+ /* log of (observed frequency / background frequency of residue rj+1) */
+ freq = log(freq2d[qi][rj] /((double) (rrcounts[rj+1])/(double) rrtotal));
+ freq /= pamscale; /* this gets us close to original pam scores */
+ freq2d[qi][rj] = freq;
+ }
+ else {
+ /* when blastpgp decides to leave something out, it puts 0's in all the frequencies
+ in the binary checkpoint file. In the ascii version, however, it uses BLOSUM62
+ values. I will put in scoring matrix values as well */
+ /* 11-Oct-2015 -- this does not work properly, because the
+ correct amino-acid ordering is not used -- pam2 uses
+ NCBIStdaa ordering, but the rest of the matrix uses pssm_aa
+ ordering, which is changed in fill_pam */
+
+ /* flag the entry so fill_pam() stores it in column rj+1 without re-mapping */
+ no_remap[qi][rj] = 1;
+ if (query[qi] < 'A') {
+ freq2d[qi][rj] = pam2[query[qi]][rj+1];
+ }
+ else {
+ freq2d[qi][rj] = pam2[aascii[query[qi]]][rj+1];
+ }
+ }
+ }
+ }
+
+ /* now figure out the right scale */
+ scale = 1.0;
+ /* target: lambda of the scoring matrix itself over the 20 standard residues */
+ lambda = get_lambda(pam2, 20, 20, ustandard_aa);
+
+#ifdef DEBUG
+ /*
+ fill_pam(pssm2p, n0, 20, freq2d, scale, no_remap);
+ fprintf(stderr," ");
+ for (rj = 1; rj <= 20; rj++) {
+ fprintf(stderr," %c", NCBIstdaa[rj]);
+ }
+ fprintf(stderr,"\n");
+ for (qi = 0 ; qi < n0 ; qi++) {
+ fprintf(stderr, "%4d %c: ", qi+1, NCBIstdaa[query[qi]]);
+ for (rj = 1 ; rj <= 20 ; rj++) {
+ fprintf(stderr, "%4d", pssm2p[qi][rj]);
+ }
+ fprintf(stderr, "\n");
+ }
+ */
+#endif
+
+ /* should be near 1.0 because of our initial scaling by ppst->pamscale */
+ /* fprintf(stderr, "real_lambda: %g\n", lambda); */
+
+ /* get initial high/low scale values: the first pass (scale 1.0) decides
+ the search direction; after that the distance of the moving bound from
+ 1.0 doubles on every pass until the comparison with lambda flips */
+ first = 1;
+ while (1) {
+ fill_pam(pssm2p, n0, 20, freq2d, scale, no_remap);
+ new_lambda = get_lambda(pssm2p, n0, 20, query);
+
+ if (new_lambda > lambda) {
+ if (first) {
+ first = 0;
+ scale = scale_high = 1.0 + 0.05;
+ scale_low = 1.0;
+ too_high = 1;
+ } else {
+ if (!too_high) break;
+ scale = (scale_high += scale_high - 1.0);
+ }
+ } else if (new_lambda > 0) {
+ if (first) {
+ first = 0;
+ scale_high = 1.0;
+ scale = scale_low = 1.0 - 0.05;
+ too_high = 0;
+ } else {
+ if (too_high) break;
+ scale = (scale_low += scale_low - 1.0);
+ }
+ } else {
+ /* also reached when new_lambda is not a number, since both comparisons fail */
+ fprintf(stderr, "new_lambda (%g) <= 0; matrix has positive average score", new_lambda);
+ exit(1);
+ }
+ }
+
+ /* now do binary search between low and high (10 halvings of the interval) */
+ for (i = 0 ; i < 10 ; i++) {
+ scale = 0.5 * (scale_high + scale_low);
+ fill_pam(pssm2p, n0, 20, freq2d, scale, no_remap);
+ new_lambda = get_lambda(pssm2p, n0, 20, query);
+
+ if (new_lambda > lambda) scale_low = scale;
+ else scale_high = scale;
+ }
+
+ /* final profile: written with the midpoint of the last interval */
+ scale = 0.5 * (scale_high + scale_low);
+ fill_pam(pssm2p, n0, 20, freq2d, scale, no_remap);
+
+ free(no_remap[0]);
+ free(no_remap);
+
+#ifdef DEBUG
+ /*
+ fprintf(stderr, "final scale: %g\n", scale);
+
+ fprintf(stderr," ");
+ for (rj = 1; rj <= 20; rj++) {
+ fprintf(stderr," %c", NCBIstdaa[rj]);
+ }
+ fprintf(stderr,"\n");
+ for (qi = 0 ; qi < n0 ; qi++) {
+ fprintf(stderr, "%4d %c: ", qi+1, NCBIstdaa[query[qi]]);
+ for (rj = 1 ; rj <= 20 ; rj++) {
+ fprintf(stderr, "%4d", pssm2p[qi][rj]);
+ }
+ fprintf(stderr, "\n");
+ }
+ */
+#endif
+
+}
+
+#if defined(CAN_PSSM)
+/* forward declaration of the ASN.1 PSSM parser (defined outside this part
+   of the file).  read_asn_pssm() treats a return value <= 0 as failure and
+   otherwise reads the output arguments.
+   NOTE(review): ownership of *query, *wfreqs, *freqs and *iscores appears
+   to pass to the caller, which frees them -- confirm against the definition */
+int
+parse_pssm_asn_fa(FILE *afd, int *n_rows, int *n_cols,
+ unsigned char **query,
+ double ***wfreqs, double ***freqs, int ***iscores,
+ char *matrix, int *gap_open, int *gap_extend,
+ double *lambda);
+
+/* the ASN.1 pssm includes information about the scoring matrix used
+ (though not the gap penalty in the current version PSSM:2) The PSSM
+ scoring matrix and gap penalties should become the default if they
+ have not been set explicitly.
+*/
+
+/* read the PSSM from an open FILE *fp - but nothing has been read
+ from *fp */
+
+/* read_asn_pssm() loads an ASN.1 PSSM from the open stream fp into
+   ppst->pam2p[0].
+
+   aa0       - query sequence; used as the query when the file has none
+   n0        - query length; the PSSM must have at least n0 columns
+   nsq       - not used
+   pamscale  - passed to scale_pssm() when frequencies must be converted
+   fp        - open PSSM file, nothing read from it yet
+   ppst      - receives gap penalties, scoring-matrix choice and the profile
+
+   Side effects: unless already set, the gap-open/gap-extend penalties are
+   taken from the PSSM (or the BLOSUM62 defaults -11/-1), and gap_set /
+   del_set are raised; if the PSSM names BLOSUM62 and no matrix was chosen,
+   BL62 is installed.
+
+   If the file carries integer scores they are copied directly (values
+   below -256 become 0); otherwise the frequencies go through scale_pssm().
+
+   Returns 1 on success, -1 if the file cannot be parsed or carries
+   neither scores nor frequencies.  Exits if the PSSM has fewer than n0
+   columns.  The stream is left open; closing it is up to the caller.
+*/
+int
+read_asn_pssm(unsigned char *aa0, int n0, int nsq,
+              double pamscale, FILE *fp, struct pstruct *ppst) {
+
+  int itmp;
+  int qi, rj;	/* qi - index query; rj - index residues */
+  int **pam2p;
+  unsigned char *query = NULL;	/* stays NULL unless the parser supplies one */
+  char matrix[MAX_SSTR];
+  double psi2_lambda;
+  double **wfreq2d=NULL, **freq2d=NULL;
+  int **iscores2d=NULL;
+  int gap_open, gap_extend;
+  int n_rows, n_cols;
+
+  pam2p = ppst->pam2p[0];
+
+  /* get the information from the ASN.1 (binary) file */
+  if (parse_pssm_asn_fa(fp, &n_rows, &n_cols, &query, &wfreq2d, &freq2d, &iscores2d,
+                        matrix, &gap_open, &gap_extend, &psi2_lambda)<=0) {
+    return -1;
+  }
+
+  /* not using wfreq2d[][] right now, free it */
+  if (wfreq2d != NULL) {
+    if (wfreq2d[0] != NULL) {free(wfreq2d[0]);}
+    free(wfreq2d);
+  }
+
+  /* do we have a query sequence */
+  if (query == NULL) { query = aa0;}
+
+  /* penalties are stored as negative numbers, whatever sign the file used */
+  if (!gap_set) {
+    if (gap_open) {
+      if (gap_open > 0) {gap_open = -gap_open;}
+      ppst->gdelval = gap_open;
+    }
+    else if (strncmp(matrix,"BLOSUM62",8)==0) {
+      ppst->gdelval = -11;
+    }
+    gap_set = 1;
+  }
+  if (!del_set) {
+    if (gap_extend) {
+      if (gap_extend > 0) {gap_extend = -gap_extend;}
+      ppst->ggapval = gap_extend;
+    }
+    else if (strncmp(matrix,"BLOSUM62",8)==0) {
+      ppst->ggapval = -1;
+    }
+    del_set = 1;
+  }
+
+  if (strncmp(matrix, "BLOSUM62", 8)== 0 && !ppst->pam_set) {
+    SAFE_STRNCPY(ppst->pamfile, "BL62", 120);
+    SAFE_STRNCPY(ppst->pamfile_save, ppst->pamfile, 120);
+    standard_pam(ppst->pamfile,ppst,del_set, gap_set);
+    if (!ppst->have_pam2) {
+      alloc_pam (MAXSQ, MAXSQ, ppst);
+    }
+    init_pam2(ppst);
+    ppst->pam_set = 1;
+  }
+
+  if (n_cols < n0) {
+    fprintf(stderr, " query length: %d != n_cols: %d\n",n0, n_cols);
+    exit(1);
+  }
+
+  /* try to just use the iscores2d values */
+  if (iscores2d != NULL) {
+    for (qi = 0 ; qi < n0 ; qi++) {
+      for (rj = 1 ; rj <= 24 ; rj++) {
+        itmp = iscores2d[qi][rj];
+        if (itmp < -256) itmp=0;
+        pam2p[qi][rj] = itmp;
+      }
+    }
+    /* all done, free it */
+    free(iscores2d[0]);
+    free(iscores2d);
+  }
+  else if (freq2d != NULL) {
+    scale_pssm(ppst->pam2p[0], freq2d, query, n0, ppst->pam2[0], pamscale);
+  }
+  else {
+    /* nothing to build a profile from; previously this path handed a NULL
+       table to scale_pssm() */
+    fprintf(stderr, " PSSM has neither integer scores nor frequencies\n");
+    if (query != aa0) free(query);
+    return -1;
+  }
+
+#if DEBUG
+  if (ppst->debug_lib) {
+    /*    fprintf(stderr, "final scale: %g\n", scale); */
+
+    fprintf(stderr," ");
+    for (rj = 1; rj <= 24; rj++) {
+      fprintf(stderr," %c", NCBIstdaa[rj]);
+    }
+    fprintf(stderr,"\n");
+    for (qi = 0 ; qi < n0 ; qi++) {
+      fprintf(stderr, "%3d %c: ", qi+1, NCBIstdaa[aa0[qi]]);
+      for (rj = 1 ; rj <= 24 ; rj++) {
+        fprintf(stderr, "%3d", pam2p[qi][rj]);
+      }
+      fprintf(stderr, "\n");
+    }
+  }
+#endif
+
+  if (freq2d != NULL) {
+    free(freq2d[0]);
+    free(freq2d);
+  }
+
+  if (query != aa0) free(query);
+  return 1;
+}
+#endif
+
+/* last_params() sets up values in pstruct *ppst now that all
+ parameters and data are available.
+
+ It:
+ (1) moves m_msg->n0 to ppst->n0 for statistics calculations
+ (2) sets ppst->nsq_e
+ (3) reads the PSSM file if one is being used
+ (4) calculates m_msg->nm0 for FASTF/S/M
+ (5) determines statistical strategy for FASTF/S, sets last_calc_flg
+ and qshuffle
+ (6) lowers ktup for short sequences
+*/
+
+/* last_params() finishes parameter setup once the query is known.
+
+   aa0    - encoded query sequence
+   n0     - query length; nothing is done when it is negative
+   m_msp  - run-wide message structure (n0 is read; nm0, escore_flg,
+            last_calc_flg, qshuffle and, for variable matrices,
+            Lambda/K/H are written)
+   ppst   - scoring parameters to finalize
+
+   When a PSSM file is requested it is opened, read and closed here (the
+   stream used to be left open).  If the file cannot be opened or parsed,
+   pam_pssm is cleared and the function returns early, skipping the
+   remaining steps.
+*/
+void
+last_params(unsigned char *aa0, int n0,
+            struct mngmsg *m_msp,
+            struct pstruct *ppst
+            ) {
+  int i, nsq;
+  FILE *fp;
+  int is_fastxy=0;
+  int n0_eff;
+  /* do_karlin_a() must be re-run everytime the scoring matrix changes */
+  double *kar_p;
+  double aa0_f[MAXSQ];
+
+  if (n0 < 0) { return;}
+
+
+  n0_eff = m_msp->n0;
+  ppst->n0 = m_msp->n0;
+#if !defined(TFAST) && (defined(FASTX) || defined(FASTY))
+  /* translated DNA query: effective protein length is one third */
+  n0_eff /= 3;
+  is_fastxy = 1;
+#endif
+
+  /* reset the PAMFILE to the original value */
+  if (strncmp(ppst->pamfile, ppst->pamfile_save,120)!=0) {
+    SAFE_STRNCPY(ppst->pamfile, ppst->pamfile_save, 120);
+    standard_pam(ppst->pamfile,ppst,del_set, gap_set);
+    init_pam2(ppst);
+    init_pamx(ppst);
+  }
+
+  /* **************************************************************** */
+  /* adjust scoring matrix for short protein/translated protein queries */
+  /* **************************************************************** */
+
+  if (ppst->pam_variable) {
+    if (min_pam_bits(n0_eff, DEF_MIN_BITS, ppst, del_set, gap_set)) {
+      init_pam2(ppst);
+      init_pamx(ppst);
+      kar_p = NULL;
+      init_karlin_a(ppst, aa0_f, &kar_p);
+      do_karlin_a(ppst->pam2[0], ppst, aa0_f,
+                  kar_p, &m_msp->Lambda, &m_msp->K, &m_msp->H);
+      ppst->pLambda = m_msp->Lambda;
+      ppst->pK = m_msp->K;
+      ppst->pH = m_msp->H;
+      ppst->LK_set = 1;
+      free(kar_p);
+    }
+    else {
+      fprintf(stderr,"+++ warning [%s:%d] - query too short [%d] for %d bit signal -- fasts36 may be more useful +++\n",
+              __FILE__, __LINE__, n0, DEF_MIN_BITS);
+    }
+  }
+
+  if (ppst->ext_sq_set) { ppst->nsq_e = nsq = 2*ppst->nsq; }
+  else {ppst->nsq_e = nsq = ppst->nsq;}
+
+#if defined(CAN_PSSM)
+  ppst->pam2p[0] = alloc_pam2p(ppst->pam2p[0],n0,MAXSQ);
+  ppst->pam2p[1] = alloc_pam2p(ppst->pam2p[1],n0,MAXSQ);
+
+  if (ppst->pam_pssm) {
+    if ((ppst->pgpfile_type == 0) && (fp=fopen(ppst->pgpfile,"rb"))) {
+      read_pssm(aa0, n0, ppst->nsq, ppst->pamscale, fp, 0, ppst);
+      fclose(fp);
+      extend_pssm(aa0, n0, ppst);
+    }
+    else if ((ppst->pgpfile_type == 1) && (fp=fopen(ppst->pgpfile,"r"))) {
+      read_pssm(aa0, n0, ppst->nsq, ppst->pamscale, fp, 1, ppst);
+      fclose(fp);
+      extend_pssm(aa0, n0, ppst);
+    }
+    else if ((ppst->pgpfile_type == 2) && (fp=fopen(ppst->pgpfile,"rb"))) {
+      int asn_status;
+
+      asn_status = read_asn_pssm(aa0, n0, ppst->nsq, ppst->pamscale, fp, ppst);
+      fclose(fp);	/* done with the file whether or not parsing worked */
+      if (asn_status > 0) {
+        extend_pssm(aa0, n0, ppst);
+      }
+      else {
+        fprintf(stderr," Could not parse PSSM file: %s\n",ppst->pgpfile);
+        ppst->pam_pssm = 0;
+        return;
+      }
+    }
+    else {
+      fprintf(stderr," Could not open PSSM file: %s\n",ppst->pgpfile);
+      ppst->pam_pssm = 0;
+      return;
+    }
+  }
+#endif
+
+#if defined(FASTF) || defined(FASTS) || defined(FASTM)
+  /* count query fragments: one more than the number of separators */
+  m_msp->nm0 = 1;
+  for (i=0; i<n0; i++)
+    if (aa0[i]==EOSEQ || aa0[i]==ESS) m_msp->nm0++;
+
+/*
+  for FASTS, we can do statistics in one of two different ways
+  if there are <= 10 query fragments, then we calculate probabilistic
+  scores for every library sequence.  If there are > 10 fragments, this
+  takes much too long and too much memory, so we use the old fashioned
+  raw score only z-score normalized method initially, and then calculate
+  the probabilistic scores for the best hits.  To scale those scores, we
+  also need a set of random probabilistic scores.  So we do the qshuffle
+  to get them.
+
+  For FASTF, precalculating probabilities is prohibitively expensive,
+  so we never do it; FASTF always acts like FASTS with nfrags>10.
+
+*/
+
+#if defined(FASTS) || defined(FASTM)
+  if (m_msp->nm0 > 10) m_msp->escore_flg = 0;
+  else m_msp->escore_flg = 1;
+#endif
+
+  if (m_msp->escore_flg && (ppst->zsflag&1)) {
+    m_msp->last_calc_flg = 0;
+    m_msp->qshuffle = 0;
+  }
+  else {	/* need random query, second set of 2000 scores */
+    m_msp->last_calc_flg = 1;
+    m_msp->qshuffle = 1;
+  }
+#else
+#ifndef LALIGN
+  m_msp->last_calc_flg = 0;
+#else
+  m_msp->last_calc_flg = 1;	/* LALIGN needs last_calc for threshold */
+#endif
+  m_msp->qshuffle = 0;
+  m_msp->escore_flg = 0;
+  m_msp->nm0 = 1;
+#endif
+
+/* adjust the ktup if appropriate */
+
+  if (pgm_def_arr[ppst->pgm_id].ktup > 0) {
+    if (!ktup_set ) {
+      ppst->param_u.fa.ktup = pgm_def_arr[ppst->pgm_id].ktup;
+      if (m_msp->qdnaseq == SEQT_PROT || is_fastxy) {
+#if defined(FASTS) || defined(FASTM)
+        if (n0_eff > 100 && ppst->param_u.fa.ktup > 2) ppst->param_u.fa.ktup = 2;
+#endif
+        if (n0_eff <= 40 && ppst->param_u.fa.ktup > 1) ppst->param_u.fa.ktup = 1;
+      }
+      else if (m_msp->qdnaseq == SEQT_DNA || m_msp->qdnaseq == SEQT_RNA) {
+        if (n0_eff <= 20 && ppst->param_u.fa.ktup > 1) ppst->param_u.fa.ktup = 1;
+#if defined(FASTS) || defined(FASTM)
+        /* with the current (April 12 2005) dropfs2.c - ktup cannot be > 2 */
+        else ppst->param_u.fa.ktup = 2;
+#else
+        else if (n0 < 50 && ppst->param_u.fa.ktup > 2) ppst->param_u.fa.ktup = 2;
+        else if (n0 < 100 && ppst->param_u.fa.ktup > 3) ppst->param_u.fa.ktup = 3;
+#endif
+      }
+    }
+    /* regardless of ktup state */
+    if (ppst->param_u.fa.use_E_thresholds) {
+      ppst->param_u.fa.use_E_thresholds = ppst->LK_set;
+    }
+    if (!E_cgap_set) {
+      ppst->param_u.fa.E_join = ppst->param_u.fa.E_band_opt * 5;
+    }
+    else {
+      if (ppst->param_u.fa.E_join > 1.0) {
+        ppst->param_u.fa.E_join = ppst->param_u.fa.E_band_opt * ppst->param_u.fa.E_join;
+      }
+    }
+  }
+}
+
+/* validate_params() exists because of bugs that keep appearing
+
+ (1) pam2[0][x][0] or pam2[0][0][x] are not -BIGNUM
+ (2) sascii[] (or qascii[], lascii[]) have values outside nsq_e.
+ */
+
+/* validate_params() sanity-checks the scoring tables and the encoded query.
+
+   Checks, each reported on stderr when violated:
+     - row 0 and column 0 of pam2[0] (first nsq entries) are <= -1000
+     - the same for pam2[1] (first nsqx entries) when ext_sq_set is on
+     - every query residue is <= nsq_e, or is ESS
+     - every lascii[0..127] entry below NA is <= nsq_e
+
+   m_msg and pascii are accepted but not examined.
+
+   Returns 1 when every check passes, 0 otherwise.
+*/
+int
+validate_params(const unsigned char *aa0, int n0,
+                const struct mngmsg *m_msg,
+                const struct pstruct *ppst,
+                const int *lascii, const int *pascii) {
+  int ok = 1;
+  int idx;
+  int **mat;
+
+  /* boundary row/column of the standard matrix */
+  mat = ppst->pam2[0];
+  for (idx = 0; idx < ppst->nsq; idx++) {
+    if (mat[0][idx] > -1000) {
+      fprintf(stderr," *** ERROR *** pam2[0][0][%d/%c] == %d\n",
+              idx, NCBIstdaa[idx], mat[0][idx]);
+      ok = 0;
+    }
+    if (mat[idx][0] > -1000) {
+      fprintf(stderr," *** ERROR *** pam2[0][%d/%c][0] == %d\n",
+              idx, NCBIstdaa[idx], mat[idx][0]);
+      ok = 0;
+    }
+  }
+
+  /* boundary row/column of the extended matrix, only when it is in use */
+  if (ppst->ext_sq_set) {
+    mat = ppst->pam2[1];
+    for (idx = 0; idx < ppst->nsqx; idx++) {
+      if (mat[0][idx] > -1000) {
+        fprintf(stderr," *** ERROR *** pam2[1][0][%d] == %d\n",
+                idx, mat[0][idx]);
+        ok = 0;
+      }
+      if (mat[idx][0] > -1000) {
+        fprintf(stderr," *** ERROR *** pam2[1][%d][0] == %d\n",
+                idx, mat[idx][0]);
+        ok = 0;
+      }
+    }
+  }
+
+  /* query residues must fall inside the alphabet (ESS separators excepted) */
+  for (idx = 0; idx < n0; idx++) {
+    if (aa0[idx] == ESS) continue;
+    if (aa0[idx] > ppst->nsq_e) {
+      fprintf(stderr," *** ERROR *** aa0[%d] = %c[%d > %d] out of range\n",
+              idx, aa0[idx], aa0[idx], ppst->nsq_e);
+      ok = 0;
+    }
+  }
+
+  /* library mapping: real residue codes (below NA) must fit the alphabet.
+     pascii[] is deliberately not checked: it is not reset for upper-case
+     only alphabets */
+  for (idx = 0; idx < 128; idx++) {
+    if (lascii[idx] >= NA) continue;
+    if (lascii[idx] > ppst->nsq_e) {
+      fprintf(stderr," *** ERROR *** lascii [%c|%d] = %d > %d out of range\n",
+              idx, idx, lascii[idx], ppst->nsq_e);
+      ok = 0;
+    }
+  }
+
+  return ok;
+}
+
+/* given a good profile in ppst->pam2p[0], make an extended profile
+ in ppst->pam2p[1]
+*/
+/* extend_pssm() completes the profile in ppst->pam2p[0] and derives the
+   second profile ppst->pam2p[1] from it.
+
+   For each of the n0 positions:
+     - column 0 becomes -BIGNUM
+     - B, Z and J columns are filled from the N/D, Q/E and I/L columns
+     - X and '*' columns get ppst->pam_xm ('*' shares the X column when its
+       code is >= nsq)
+     - columns 1..nsq are then copied into pam2p[1]
+   With ext_sq_set, columns nsq+1..2*nsq of pam2p[0] repeat columns 1..nsq,
+   while the same columns of pam2p[1] are set to pam_xm.
+
+   aa0 is accepted but not used.
+*/
+void
+extend_pssm(unsigned char *aa0, int n0, struct pstruct *ppst) {
+
+  int pos, col, nsq;
+  int ix_x, ix_stop, ix_b, ix_z, ix_j;
+  int **prof, **prof_ext;
+  int *row, *row_ext;
+
+  nsq = ppst->nsq;
+
+  prof = ppst->pam2p[0];
+  prof_ext = ppst->pam2p[1];
+
+  ix_x = pascii['X'];
+  ix_stop = pascii['*'];
+  if (ix_stop >= ppst->nsq) {ix_stop = ix_x;}
+  ix_b = pascii['B'];
+  ix_z = pascii['Z'];
+  ix_j = pascii['J'];
+
+  /* fill in boundaries, B, Z, J, X, * (in that order) */
+  for (pos = 0; pos < n0; pos++) {
+    row = prof[pos];
+    row[0] = -BIGNUM;
+    row[ix_b] = (int)
+      (((float)row[pascii['N']]+(float)row[pascii['D']]+0.5)/2.0);
+    row[ix_z] = (int)
+      (((float)row[pascii['Q']]+(float)row[pascii['E']]+0.5)/2.0);
+    row[ix_j] = (int)
+      (((float)row[pascii['I']]+(float)row[pascii['L']]+0.5)/2.0);
+    row[ix_x] = ppst->pam_xm;
+    row[ix_stop] = ppst->pam_xm;
+  }
+
+  /* duplicate the finished profile into the second one */
+  for (pos = 0; pos < n0; pos++) {
+    row = prof[pos];
+    row_ext = prof_ext[pos];
+    row_ext[0] = -BIGNUM;
+    for (col = 1; col <= nsq; col++) {
+      row_ext[col] = row[col];
+    }
+  }
+
+  /* then fill in extended characters, if necessary */
+  if (ppst->ext_sq_set) {
+    for (pos = 0; pos < n0; pos++) {
+      row = prof[pos];
+      row_ext = prof_ext[pos];
+      for (col = 1; col <= nsq; col++) {
+        row[nsq+col] = row[col];
+        row_ext[nsq+col] = ppst->pam_xm;
+      }
+    }
+  }
+}
+
+/* format_params() writes the bracketed default value(s) of one option into
+   string, for use in the help listings.
+
+   The 'r' option is always shown as " [+i1/i2]".  Otherwise fmt_type picks
+   the layout: 1 one int, 2 two ints, 3 one double, 4 two doubles,
+   5 a string; any other value leaves string empty.
+   The caller must supply a buffer large enough for the result.
+*/
+void format_params(struct opt_def_str *opt_ptr, char *string) {
+
+  if (opt_ptr->opt_char == 'r') {
+    sprintf(string, " [+%d/%d]", opt_ptr->i_param1, opt_ptr->i_param2);
+  }
+  else if (opt_ptr->fmt_type == 1) {
+    sprintf(string, " [%d]", opt_ptr->i_param1);
+  }
+  else if (opt_ptr->fmt_type == 2) {
+    sprintf(string, " [%d,%d]", opt_ptr->i_param1, opt_ptr->i_param2);
+  }
+  else if (opt_ptr->fmt_type == 3) {
+    sprintf(string, " [%.4g]", opt_ptr->d_param1);
+  }
+  else if (opt_ptr->fmt_type == 4) {
+    sprintf(string, " [%.4g,%.4g]", opt_ptr->d_param1, opt_ptr->d_param2);
+  }
+  else if (opt_ptr->fmt_type == 5) {
+    sprintf(string, " [%s]", opt_ptr->s_param);
+  }
+  else {
+    /* fmt_type 0 or unknown: nothing to show */
+    string[0] = '\0';
+  }
+}
+
+
+/* option letters that show_help() lists under "COMMON OPTIONS"; the set is
+   selected at compile time: FASTX/FASTY, LALIGN, or every other program */
+#if defined(FASTX) || defined(FASTY)
+static char *common_opts = "sfgjSEbdI";
+#else
+#if defined(LALIGN)
+static char *common_opts = "sfgEZI";
+#else
+static char *common_opts = "sfgSbdI";
+#endif
+#endif
+
+/* show_help() prints the short usage message -- synopsis, description and
+   the options named in common_opts -- to stdout, then exits with status 0.
+
+   Each common option is looked up first in g_options (first match wins);
+   only if it is absent there is f_options searched, and then every
+   matching entry is printed.
+*/
+void
+show_help(char *pgm_name, int pgm_id) {
+  int ci, oi, found;
+  int opt_line_cnt=0;
+  size_t n_common;
+  char tmp_string[MAX_STR];
+  struct opt_def_str *tbl;
+
+  printf("USAGE\n");
+#ifndef LALIGN
+  printf(" %s [-options] query_file library_file",pgm_name);
+  if (pgm_def_arr[pgm_id].ktup > 0) {
+    printf(" [ktup]\n");
+  }
+  else {printf("\n");}
+#else
+  printf(" %s [-options] seq_file1 seq_file2\n",pgm_name);
+#endif
+  printf(" %s -help for a complete option list\n",pgm_name);
+  printf("\nDESCRIPTION\n");
+  printf(" %s\n version: %s\n",pgm_def_arr[pgm_id].iprompt0, verstr);
+  printf("\n");
+  printf("COMMON OPTIONS (options must preceed query_file library_file)\n");
+
+  n_common = strlen(common_opts);
+  for (ci = 0; (size_t)ci < n_common; ci++) {
+
+    /* general options: stop at the first entry with this letter */
+    found = 0;
+    tbl = g_options;
+    for (oi = 0; tbl[oi].opt_char != '\0'; oi++) {
+      if (common_opts[ci] != tbl[oi].opt_char) continue;
+      format_params(&tbl[oi], tmp_string);
+      printf(" -%c%c %s %s;",tbl[oi].opt_char, (tbl[oi].has_arg? ':' : ' '),
+             tmp_string, tbl[oi].opt_descr_s);
+      printf("\n");
+      found = 1;
+      break;
+    }
+    if (found) continue;
+
+    /* program-specific options: every entry with this letter is shown */
+    tbl = f_options;
+    for (oi = 0; tbl[oi].opt_char != '\0'; oi++) {
+      if (common_opts[ci] != tbl[oi].opt_char) continue;
+      format_params(&tbl[oi], tmp_string);
+      printf(" -%c%c %s %s;",tbl[oi].opt_char, (tbl[oi].has_arg? ':' : ' '),
+             tmp_string, tbl[oi].opt_descr_s);
+      printf("\n");
+    }
+  }
+  /* opt_line_cnt is never advanced, so this never prints */
+  if ((opt_line_cnt % 2) != 0) printf("\n");
+  exit(0);
+}
+
+/* sorts a list of options, with upper and lower case characters
+ sorted together */
+/* sort_opt_list() orders the n option letters in v[] alphabetically without
+   regard to case, so that a letter's lower- and upper-case forms end up
+   side by side, and then puts the lower-case form first in each adjacent
+   same-letter pair. */
+void sort_opt_list(char *v, int n) {
+  static const int gap_seq[7] = { 336, 112, 48, 21, 7, 3, 1 };
+  int g, step, pos, slot;
+  char moving, moving_key, swap_c;
+
+  /* pass 1: shell sort keyed on toupper() */
+  for (g = 0; g < 7; g++) {
+    step = gap_seq[g];
+    for (pos = step; pos < n; pos++) {
+      moving = v[pos];
+      moving_key = toupper(moving);
+      for (slot = pos;
+           slot >= step && toupper(v[slot - step]) > moving_key;
+           slot -= step) {
+        v[slot] = v[slot - step];
+      }
+      v[slot] = moving;
+    }
+  }
+
+  /* pass 2: within a same-letter neighbour pair, the character with the
+     larger code (the lower-case one) goes first */
+  for (pos = 1; pos < n; pos++) {
+    if (toupper(v[pos]) != toupper(v[pos-1])) continue;
+    if (v[pos] > v[pos-1]) {
+      swap_c = v[pos];
+      v[pos] = v[pos-1];
+      v[pos-1] = swap_c;
+    }
+  }
+}
+
+/* sort_options() collects the option letters of both tables (each ended by
+   an entry whose opt_char is '\0') into one newly allocated, NUL-terminated
+   string, sorts it with sort_opt_list(), and returns it.  The caller frees
+   the result.  Exits if the string cannot be allocated. */
+char *
+sort_options (struct opt_def_str *g_options, struct opt_def_str *f_options) {
+  struct opt_def_str *tables[2];
+  struct opt_def_str *opt;
+  char *sorted_list;
+  int t, opt_count, fill;
+
+  tables[0] = g_options;
+  tables[1] = f_options;
+
+  /* first pass: how many letters are there in total? */
+  opt_count = 0;
+  for (t = 0; t < 2; t++) {
+    for (opt = tables[t]; opt->opt_char != '\0'; opt++) {
+      opt_count++;
+    }
+  }
+
+  if ((sorted_list = (char *)calloc(opt_count+1, sizeof(char)))==NULL) {
+    fprintf(stderr," cannot allocate sorted_list[%d]\n",opt_count+1);
+    exit(1);
+  }
+
+  /* second pass: copy the letters, general options first */
+  fill = 0;
+  for (t = 0; t < 2; t++) {
+    for (opt = tables[t]; opt->opt_char != '\0'; opt++) {
+      sorted_list[fill++] = opt->opt_char;
+    }
+  }
+
+  sort_opt_list(sorted_list, opt_count);
+
+  return sorted_list;
+}
+
+/* show_all_help() prints the full usage message to stdout -- synopsis,
+   library format notes, description, and every option of g_options and
+   f_options in the order produced by sort_options() -- then exits with
+   status 0.
+
+   For each letter, the first matching g_options entry is printed; only
+   when none exists is f_options searched, and then all matching entries
+   are printed.  The long description is used when there is one, otherwise
+   the short one.
+*/
+void
+show_all_help(char *pgm_name, int pgm_id) {
+  int li, oi, found;
+  size_t n_letters;
+  struct opt_def_str *tbl;
+  char tmp_string[MAX_STR];
+  char *descr_ptr;
+  char *sorted_list;
+
+  sorted_list = sort_options(g_options, f_options);
+
+  printf("USAGE\n");
+  printf(" %s [-options] query_file library_file",pgm_name);
+  if (pgm_def_arr[pgm_id].ktup > 0) {
+    printf(" [ktup]\n");
+  }
+  else {printf("\n");}
+
+  printf(" \"@\" query_file uses stdin; query_file:begin-end sets subset range\n");
+  printf(" library file formats: 0:FASTA; 1:GenBankFF; 3:EMBL_FF; 7:FASTQ; 10:subset; 12:NCBI blastdbcmd;\n");
+  printf(" alternate library formats: \"library_file 7\" for 7:FASTQ\n");
+
+  printf("\nDESCRIPTION\n");
+  printf(" %s\n version: %s\n",pgm_def_arr[pgm_id].iprompt0, verstr);
+  printf("\n");
+  printf("OPTIONS (options must preceed query_file library_file)\n");
+
+  n_letters = strlen(sorted_list);
+  for (li = 0; (size_t)li < n_letters; li++) {
+
+    /* general options: the first entry with this letter wins */
+    found = 0;
+    tbl = g_options;
+    for (oi = 0; tbl[oi].opt_char != '\0'; oi++) {
+      if (sorted_list[li] != tbl[oi].opt_char) continue;
+      if (tbl[oi].opt_descr_l) { descr_ptr = tbl[oi].opt_descr_l; }
+      else { descr_ptr = tbl[oi].opt_descr_s; }
+      format_params(&tbl[oi], tmp_string);
+      printf(" -%c%c %s %s\n",tbl[oi].opt_char, (tbl[oi].has_arg? ':' : ' '),tmp_string, descr_ptr);
+      found = 1;
+      break;
+    }
+    if (found) continue;
+
+    /* program-specific options: every entry with this letter is shown */
+    tbl = f_options;
+    for (oi = 0; tbl[oi].opt_char != '\0'; oi++) {
+      if (sorted_list[li] != tbl[oi].opt_char) continue;
+      if (tbl[oi].opt_descr_l) { descr_ptr = tbl[oi].opt_descr_l; }
+      else { descr_ptr = tbl[oi].opt_descr_s; }
+      format_params(&tbl[oi], tmp_string);
+      printf(" -%c%c %s %s\n",tbl[oi].opt_char, (tbl[oi].has_arg? ':' : ' '), tmp_string, descr_ptr);
+    }
+  }
+
+  free(sorted_list);
+
+  exit(0);
+}
diff --git a/src/karlin.c b/src/karlin.c
new file mode 100644
index 0000000..093aec8
--- /dev/null
+++ b/src/karlin.c
@@ -0,0 +1,519 @@
+/**************** Statistical Significance Parameter Subroutine ****************
+
+ $Id: karlin.c 625 2011-03-23 17:21:38Z wrp $
+ $Revision: 625 $
+
+ Version 1.0 February 2, 1990
+ Version 2.0 March 18, 1993
+
+ Program by: Stephen Altschul
+
+ Address: National Center for Biotechnology Information
+ National Library of Medicine
+ National Institutes of Health
+ Bethesda, MD 20894
+
+ Internet: altschul at ncbi.nlm.nih.gov
+
+ See: Karlin, S. & Altschul, S.F. "Methods for Assessing the Statistical
+ Significance of Molecular Sequence Features by Using General Scoring
+ Schemes," Proc. Natl. Acad. Sci. USA 87 (1990), 2264-2268.
+
+ Computes the parameters lambda and K for use in calculating the
+ statistical significance of high-scoring segments or subalignments.
+
+ The scoring scheme must be integer valued. A positive score must be
+ possible, but the expected (mean) score must be negative.
+
+ A program that calls this routine must provide the value of the lowest
+ possible score, the value of the greatest possible score, and a pointer
+ to an array of probabilities for the occurrence of all scores between
+ these two extreme scores. For example, if score -2 occurs with
+ probability 0.7, score 0 occurs with probability 0.1, and score 3
+ occurs with probability 0.2, then the subroutine must be called with
+ low = -2, high = 3, and pr pointing to the array of values
+ { 0.7, 0.0, 0.1, 0.0, 0.0, 0.2 }. The calling program must also provide
+ pointers to lambda and K; the subroutine will then calculate the values
+ of these two parameters. In this example, lambda=0.330 and K=0.154.
+
+ The parameters lambda and K can be used as follows. Suppose we are
+ given a length N random sequence of independent letters. Associated
+ with each letter is a score, and the probabilities of the letters
+ determine the probability for each score. Let S be the aggregate score
+ of the highest scoring contiguous segment of this sequence. Then if N
+ is sufficiently large (greater than 100), the following bound on the
+ probability that S is greater than or equal to x applies:
+
+ P( S >= x ) <= 1 - exp [ - KN exp ( - lambda * x ) ].
+
+ In other words, the p-value for this segment can be written as
+ 1-exp[-KN*exp(-lambda*S)].
+
+ This formula can be applied to pairwise sequence comparison by assigning
+ scores to pairs of letters (e.g. amino acids), and by replacing N in the
+ formula with N*M, where N and M are the lengths of the two sequences
+ being compared.
+
+ In addition, letting y = KN*exp(-lambda*S), the p-value for finding m
+ distinct segments all with score >= S is given by:
+
+ 2 m-1 -y
+ 1 - [ 1 + y + y /2! + ... + y /(m-1)! ] e
+
+ Notice that for m=1 this formula reduces to 1-exp(-y), which is the same
+ as the previous formula.
+
+*******************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+#define MAXIT 25 /* Maximum number of iterations used in calculating lambda */
+#define NMAP_X 23
+#define NMAP 33
+
+#define TINY 1e-6
+
+/* first build a residue map to automatically put residues in score bins */
+
+#include "defs.h"
+#include "param.h"
+
+/* initialize the Karlin frequency, probability arrays using
+ a specific query sequence */
+
+int karlin(int , int, double *, double *, double *);
+static int karlin_k(int , int , double *, double *, double *, double *);
+
+/* init_karlin() allocates the score-probability array (if *kp is
+   NULL) and fills aa0_f[1..nsq] with the residue composition of the
+   query aa0.
+
+   aa0   - query residues; scanned up to the terminating 0 residue
+   n0    - query length, used only to scale the frequencies
+   ppst  - supplies nsq (alphabet size), pam_l/pam_h (score range)
+           and hsqx[] (residue -> count bin map)
+   aa0_f - output: aa0_f[i] = 100*count[i]/(n0+nsq) for i in 1..nsq,
+           where every residue type starts with a count of 1
+   kp    - in/out: if *kp is NULL a zeroed array of
+           (pam_h - pam_l + 2) doubles is allocated and stored there
+
+   Exits with status 1 if the allocation fails. */
+void init_karlin(const unsigned char *aa0, int n0, struct pstruct *ppst,
+		 double *aa0_f, double **kp)
+{
+  int kar_nsq, kar_range;
+
+  const unsigned char *aa0p;
+  int i;
+  int r_cnt[NMAP+1];
+  double fn0, *kar_p;
+
+  kar_range = ppst->pam_h - ppst->pam_l + 1;
+  if (*kp == NULL) {
+    if ((kar_p=(double *)calloc(kar_range+1,sizeof(double)))==NULL) {
+      fprintf(stderr," cannot allocate kar_p array: %d\n",kar_range+1);
+      exit(1);
+    }
+    *kp = kar_p;
+  }
+  kar_nsq = ppst->nsq;	/* alphabet size */
+
+  /* clear every bin first: bins above kar_nsq (including NMAP_X when
+     the alphabet is small) are incremented and read below, so they
+     must not hold indeterminate values */
+  for (i=0; i<=NMAP; i++) r_cnt[i]=0;
+
+  /* must have at least 1 residue of each type */
+  for (i=1; i<=kar_nsq; i++) r_cnt[i]=1;
+
+  fn0 = 100.0/(double)(n0+kar_nsq);	/* weight of each residue */
+
+  aa0p = aa0;
+  /* increment residue count for each residue in query sequence */
+  while (*aa0p) r_cnt[ppst->hsqx[*aa0p++]]++;
+
+  /* map all unmapped residues to 'X' */
+  r_cnt[NMAP_X] += r_cnt[NMAP];
+
+  for (i=1; i<=kar_nsq; i++) aa0_f[i] = fn0*(double)r_cnt[i];
+}
+
+double nt_f[] = {0.0, 0.25, 0.25, 0.25, 0.25 }; /* entries 1..4 are read by init_karlin_a()/do_karlin_a(); entry 0 is a 0.0 placeholder */
+
+/* Robinson and Robinson frequencies;
+   entry 0 is a 0.00 placeholder, entries 1..20 are read by
+   init_karlin_a()/do_karlin_a() in the residue order labelled below */
+double aa_f[] = {
+/* NULL */ 0.00,
+/* A */ 0.0780474700897585,
+/* R */ 0.0512953149316987,
+/* N */ 0.0448725775979007,
+/* D */ 0.0536397361638076,
+/* C */ 0.0192460110427568,
+/* Q */ 0.0426436013507063,
+/* E */ 0.0629485981204668,
+/* G */ 0.0737715654561964,
+/* H */ 0.0219922696262025,
+/* I */ 0.0514196403000682,
+/* L */ 0.090191394464413,
+/* K */ 0.0574383201866657,
+/* M */ 0.0224251883196316,
+/* F */ 0.0385564048655621,
+/* P */ 0.0520279465667327,
+/* S */ 0.0711984743501224,
+/* T */ 0.0584129422708473,
+/* W */ 0.013298374223799,
+/* Y */ 0.0321647488738564,
+/* V */ 0.0644094211988074};
+
+/* init_karlin_a() allocates the score-probability array (when *kp is
+   NULL) and fills aa0_f[] with an "average" composition instead of a
+   query-specific one:
+     nt_align set              -> nt_f[1..4]
+     SEQT_PROT or SEQT_UNK     -> aa_f[1..20]
+     anything else             -> uniform 1/(nsq-1) for 1..nsq-1,
+                                  and 0.0 for entry nsq
+   Exits with status 1 if the allocation fails. */
+void
+init_karlin_a(struct pstruct *ppst, double *aa0_f, double **kp)
+{
+  int n_res, n_scores;
+  int idx;
+  double uniform_f, *score_p;
+
+  n_scores = ppst->pam_h - ppst->pam_l + 1;
+  if (*kp == NULL) {
+    score_p = (double *)calloc(n_scores+1,sizeof(double));
+    if (score_p == NULL) {
+      fprintf(stderr," cannot allocate kar_p array: %d\n",n_scores+1);
+      exit(1);
+    }
+    *kp = score_p;
+  }
+
+  if (ppst->nt_align) {
+    n_res = 4;
+    idx = 1;
+    while (idx <= n_res) { aa0_f[idx] = nt_f[idx]; idx++; }
+    return;
+  }
+
+  if (ppst->dnaseq==SEQT_PROT || ppst->dnaseq == SEQT_UNK) {
+    n_res = 20;
+    idx = 1;
+    while (idx <= n_res) { aa0_f[idx] = aa_f[idx]; idx++; }
+    return;
+  }
+
+  /* other alphabets: equal weight for all but the last residue code */
+  n_res = ppst->nsq;
+  uniform_f = 1.0/(double)(n_res-1);
+  for (idx=1; idx< n_res; idx++) aa0_f[idx] = uniform_f;
+  aa0_f[n_res]=0.0;
+}
+
+/* do_karlin() builds the score probability table kar_p[] from the
+   query frequencies aa0_f[] and the composition of library sequence
+   aa1, then calls karlin() to calculate Lambda and H.
+
+   aa1    - library residues; scanned up to the terminating 0 residue
+   n1     - library sequence length, used only to scale frequencies
+   pam2   - scoring matrix, indexed pam2[i][j] for i,j in 1..nsq
+   ppst   - supplies nsq, pam_l, pam_h and hsqx[]
+   aa0_f  - query residue frequencies, entries 1..nsq
+   kar_p  - work/output array of at least (pam_h - pam_l + 2) doubles;
+            on return holds the normalized score probabilities
+   lambda, H - outputs filled in by karlin()
+
+   Returns 0 if the total probability is <= 0.00001, otherwise the
+   value returned by karlin(). */
+int
+do_karlin(const unsigned char *aa1, int n1,
+	  int **pam2, const struct pstruct *ppst,
+	  double *aa0_f, double *kar_p, double *lambda, double *H)
+{
+  register unsigned const char *aap;
+  int kar_range, kar_min, kar_max, kar_nsq;
+  int r_cnt[NMAP+1];
+  double aa1_f[NMAP+1];		/* same extent as r_cnt[] */
+  double fn1, kar_tot;
+  int i, j;
+
+  kar_nsq = ppst->nsq;
+  kar_min = ppst->pam_l;
+  kar_max = ppst->pam_h;
+  kar_range = kar_max - kar_min + 1;
+
+  /* clear every bin first: bins above kar_nsq (including NMAP_X when
+     the alphabet is small) are incremented and read below, so they
+     must not hold indeterminate values */
+  for (i=0; i<=NMAP; i++) r_cnt[i]=0;
+
+  /* every residue type starts with a count of 1 */
+  for (i=1; i<=kar_nsq; i++) r_cnt[i]=1;
+
+  /* residue counts */
+
+  aap=aa1;
+  while (*aap) r_cnt[ppst->hsqx[*aap++]]++;
+
+  /* unmapped residues are counted as 'X' */
+  r_cnt[NMAP_X] += r_cnt[NMAP];
+
+  /* residue frequencies */
+  fn1 = 100.0/(double)(n1+kar_nsq);
+  for (i=1; i<=kar_nsq; i++) aa1_f[i]= fn1*(double)r_cnt[i];
+
+  for (i=0; i<=kar_range; i++) kar_p[i] = 0.0;
+
+  /* probability of each score = sum over residue pairs with that score */
+  for (i=1; i<=kar_nsq; i++) {
+    for (j=1; j<=kar_nsq; j++)
+      kar_p[pam2[i][j]-kar_min] += aa0_f[i]*aa1_f[j];
+  }
+
+  kar_tot = 0.0;
+  for (i=0; i<=kar_range; i++) kar_tot += kar_p[i];
+  if (kar_tot <= 0.00001) return 0;
+
+  for (i=0; i<=kar_range; i++) kar_p[i] /= kar_tot;
+
+  return karlin(kar_min, kar_max, kar_p, lambda, H);
+}
+
+/* do_karlin_a() builds the score probability table kar_p[] from the
+   query frequencies aa0_f[] and an "average" library composition
+   (nt_f[] when ppst->nt_align is set, aa_f[] otherwise), then calls
+   karlin_k() to calculate Lambda, K and H.
+
+   pam2   - scoring matrix, indexed pam2[i][j]
+   ppst   - supplies nt_align, pam_l and pam_h
+   aa0_f  - query residue frequencies (entries 1..4 or 1..20 are read)
+   kar_p  - work/output array of at least (pam_h - pam_l + 2) doubles
+   lambda, K, H - outputs filled in by karlin_k()
+
+   Returns 0 if the total probability is <= 0.00001, otherwise the
+   value returned by karlin_k().
+
+   NOTE(review): the original code had a third, uniform-frequency
+   branch after "if (nt_align) ... else if (!nt_align) ...", which
+   could never execute; it has been removed without changing which
+   table is selected.  init_karlin_a() chooses its table with
+   ppst->dnaseq instead -- confirm whether the two should agree. */
+int
+do_karlin_a(int **pam2, struct pstruct *ppst,
+	    double *aa0_f, double *kar_p, double *lambda, double *K, double *H)
+{
+  double *aa1fp;
+  int kar_range, kar_min, kar_max, kar_nsq;
+  double aa1_f[NMAP];
+  double kar_tot;
+  int i, j;
+
+  kar_min = ppst->pam_l;
+  kar_max = ppst->pam_h;
+  kar_range = kar_max - kar_min + 1;
+
+  if (ppst->nt_align ) {
+    kar_nsq = 4;
+    aa1fp = nt_f;
+  }
+  else {
+    kar_nsq = 20;
+    aa1fp = aa_f;
+  }
+
+  /* normalize the selected table so aa1_f[1..kar_nsq] sums to 1 */
+  kar_tot = 0.0;
+  for (i=1; i<=kar_nsq; i++) {kar_tot += aa1fp[i];}
+  for (i=1; i<=kar_nsq; i++) {aa1_f[i]= aa1fp[i]/kar_tot;}
+
+  for (i=0; i<=kar_range; i++) kar_p[i] = 0.0;
+
+  /* j must run through kar_nsq (as in do_karlin()); stopping at
+     kar_nsq-1 left the last residue (T or V) out of the library
+     composition */
+  for (i=1; i<=kar_nsq; i++) {
+    for (j=1; j<=kar_nsq; j++)
+      kar_p[pam2[i][j]-kar_min] += aa0_f[i]*aa1_f[j];
+  }
+
+  kar_tot = 0.0;
+  for (i=0; i<=kar_range; i++) kar_tot += kar_p[i];
+  if (kar_tot <= 0.00001) return 0;
+
+  for (i=0; i<=kar_range; i++) kar_p[i] /= kar_tot;
+
+  return karlin_k(kar_min, kar_max, kar_p, lambda, K, H);
+}
+
+/* take a score range and the probability of each score and get
+   *lambda, *H (K is not computed here; see karlin_k()).
+
+   pr[0] is the probability of score low, pr[high-low] that of high.
+   Returns 1 on success; returns 0 (leaving *lambda_p and *H_p
+   untouched) if the expected score is >= 0.0 or if the search ends
+   with lambda <= 1e-10. */
+int
+karlin(int low, /* Lowest score (must be negative) */
+ int high, /* Highest score (must be positive) */
+ double *pr, /* Probabilities for various scores */
+ double *lambda_p, /* Pointer to parameter lambda */
+ double *H_p) /* Pointer to parameter H */
+{
+ int i,range, nit;
+ double up,new,sum,av,beta,ftemp;
+ double lambda;
+ double *ptr1;
+
+ /* Calculate the parameter lambda */
+
+ range = high-low;
+
+ /* check for E() < 0.0 */
+ sum = 0;
+ ptr1 = pr;
+ for (i=low; i <= high ; i++) sum += i* (*ptr1++);
+ if (sum >= 0.0) {
+#ifdef DEBUG
+ fprintf(stderr," (karlin lambda) non-negative expected score: %.4lg\n",
+ sum);
+#endif
+ return 0;
+ }
+
+ /* up is upper bound on lambda: start at 1.0 and keep doubling until
+    sum(pr[s]*exp(up*s)) reaches 1.0; ftemp starts at exp(up*(low-1))
+    and is multiplied by beta=exp(up) once per score */
+ up=0.5;
+ do {
+ up *= 2.0;
+ ptr1=pr;
+
+ beta=exp(up);
+
+ ftemp=exp(up*(low-1));
+ sum = 0.0;
+ for (i=0; i<=range; ++i) sum+= *ptr1++ * (ftemp*=beta);
+ }
+ while (sum<1.0);
+
+ /* avoid overflow from very large lambda*S (disabled) */
+/*
+ do {
+ up /= 2.0;
+ ptr1=pr;
+ beta=exp(up);
+
+ ftemp=exp(up*(low-1));
+ sum = 0.0;
+ for (i=0; i<=range; ++i) sum+= *ptr1++ * (ftemp*=beta);
+ } while (sum > 2.0);
+
+ up *= 2.0;
+*/ /* we moved past, now back up */
+
+ /* bisection between lambda (lower bound) and up, at most MAXIT steps;
+    was: for (lambda=j=0;j<25;++j) { */
+ lambda = 0.0;
+ nit = 0;
+ while ( nit++ < MAXIT ) {
+ new = (lambda+up)/2.0;
+ beta = exp(new);
+ ftemp = exp(new*(low-1));
+ ptr1=pr;
+ sum = 0.0;
+ /* multiply by exp(new) for each score */
+ for (i=0;i<=range;++i) sum+= *ptr1++ * (ftemp*=beta);
+
+ if (sum > 1.0 + TINY) up=new;
+ else {
+ if ( fabs(lambda - new) < TINY ) goto done; /* lower bound moved < TINY: converged */
+ lambda = new;
+ }
+ }
+
+ if (lambda <= 1e-10) {
+ lambda = -1.0; /* local only; the caller's *lambda_p is not written */
+ return 0;
+ }
+
+ done:
+ *lambda_p = lambda;
+
+ /* Calculate the parameter H = lambda * sum(s*pr[s]*exp(lambda*s));
+    beta still holds exp(new) from the last bisection step */
+
+ ptr1=pr;
+ ftemp=exp(lambda*(low-1));
+ for (av=0.0, i=low; i<=high; ++i)
+ av+= *ptr1++ *i*(ftemp*=beta);
+ *H_p= lambda*av;
+
+ return 1; /* Parameters calculated successfully */
+}
+
+static int a_gcd (int, int);
+
+/* take a score range and the probability of each score and get
+   *lambda, *K, *H.
+
+   pr[0] is the probability of score low, pr[high-low] that of high.
+   Returns 1 on success.  Returns 0 if the expected score is >= 0.0,
+   if the search ends with lambda <= 1e-10, or if the work array for
+   K cannot be allocated (in that last case *lambda_p and *H_p have
+   already been stored, *K_p has not). */
+static int
+karlin_k(int low,		/* Lowest score (must be negative) */
+	 int high,		/* Highest score (must be positive) */
+	 double *pr,		/* Probabilities for various scores */
+	 double *lambda_p,	/* Pointer to parameter lambda */
+	 double *K_p,		/* Pointer to parameter K */
+	 double *H_p)		/* Pointer to parameter H */
+{
+  int i,j,range,lo,hi,first,last, nit;
+  double up,new,sum,Sum,av,beta,oldsum,ratio,ftemp;
+  double lambda;
+  double *P,*ptrP,*ptr2;
+  double *ptr1;
+
+  /* Calculate the parameter lambda */
+
+  range = high-low;
+
+  /* check for E() < 0.0 */
+  sum = 0;
+  ptr1 = pr;
+  for (i=low; i <= high ; i++) sum += i* (*ptr1++);
+  if (sum >= 0.0) {
+    fprintf(stderr," (karlin lambda) non-negative expected score: %.4lg\n",
+	    sum);
+    /* perhaps we should return values for BLOSUM50 here to avoid fp
+       underflow later */
+    return 0;
+  }
+
+  /* up is upper bound on lambda: double it until
+     sum(pr[s]*exp(up*s)) reaches 1.0 */
+  up=0.5;
+  do {
+    up *= 2.0;
+    ptr1=pr;
+
+    beta=exp(up);
+
+    ftemp=exp(up*(low-1));
+    sum = 0.0;
+    for (i=0; i<=range; ++i) sum+= *ptr1++ * (ftemp*=beta);
+  }
+  while (sum<1.0);
+
+  /* avoid overflow from very large lambda*S (disabled) */
+  /*
+  do {
+    up /= 2.0;
+    ptr1=pr;
+    beta=exp(up);
+
+    ftemp=exp(up*(low-1));
+    sum = 0.0;
+    for (i=0; i<=range; ++i) sum+= *ptr1++ * (ftemp*=beta);
+  } while (sum > 2.0);
+
+  up *= 2.0;
+  */
+  /* we moved past, now back up */
+
+  /* bisection between lambda (lower bound) and up, at most MAXIT steps;
+     was: for (lambda=j=0;j<25;++j) { */
+  lambda = 0.0;
+  nit = 0;
+  while ( nit++ < MAXIT ) {
+    new = (lambda+up)/2.0;
+    beta = exp(new);
+    ftemp = exp(new*(low-1));
+    ptr1=pr;
+    sum = 0.0;
+    /* multiply by exp(new) for each score */
+    for (i=0;i<=range;++i) sum+= *ptr1++ * (ftemp*=beta);
+
+    if (sum > 1.0 + TINY) up=new;
+    else {
+      if ( fabs(lambda - new) < TINY ) goto done;
+      lambda = new;
+    }
+  }
+
+  if (lambda <= 1e-10) {
+    lambda = -1.0;
+    return 0;
+  }
+
+ done:
+  *lambda_p = lambda;
+
+  /* Calculate the parameter H */
+
+  ptr1=pr;
+  ftemp=exp(lambda*(low-1));
+  for (av=0.0, i=low; i<=high; ++i) av+= *ptr1++ *i*(ftemp*=beta);
+  *H_p= lambda*av;
+
+  /* Calculate the parameter K */
+  Sum=lo=hi=0;
+  P= (double *) calloc(MAXIT*range+1,sizeof(double));
+  if (P == NULL) {
+    /* the loop below writes through P immediately; fail cleanly instead */
+    fprintf(stderr," cannot allocate karlin_k work array: %d\n",MAXIT*range+1);
+    return 0;
+  }
+  /* P[] holds the score distribution of a j-step walk; each pass
+     convolves it with pr[] in place, from the top index downwards */
+  for (*P=sum=oldsum=j=1;j<=MAXIT && sum>0.001;Sum+=sum/=j++) {
+    first=last=range;
+    for (ptrP=P+(hi+=high)-(lo+=low); ptrP>=P; *ptrP-- =sum) {
+      ptr1=ptrP - first;
+      ptr2=pr + first;
+      for (sum=0,i=first; i<=last; ++i) sum += *ptr1-- * *ptr2++;
+      if (first) --first;
+      if (ptrP-P<=range) --last;
+    }
+    ftemp=exp(lambda*(lo-1));
+    for (sum=0,i=lo;i;++i) sum+= *++ptrP * (ftemp*=beta);
+    for (;i<=hi;++i) sum+= *++ptrP;
+    ratio=sum/oldsum;
+    oldsum=sum;
+  }
+  /* extend the series out to 200 terms using the last observed ratio */
+  for (;j<=200;Sum+=oldsum/j++) oldsum*=ratio;
+  /* j becomes the gcd of the scores that have non-zero probability */
+  for (i=low; !pr[i-low]; ++i);
+  for (j= -i;i<high && j>1;) if (pr[++i-low]) j=a_gcd(j,i);
+  *K_p = (j*exp(-2*Sum))/(av*(1.0-exp(- lambda*j)));
+  free(P);
+
+  return 1; /* Parameters calculated successfully */
+}
+
+/* a_gcd() returns the greatest common divisor of a and |b| using
+   Euclid's remainder algorithm.  Only b has its sign removed; a is
+   used as given.  a_gcd(x, 0) returns x. */
+int
+a_gcd(int a, int b)
+{
+  int rem;
+
+  b = (b < 0) ? -b : b;
+
+  /* keep the larger value in a */
+  if (a < b) {
+    rem = a;
+    a = b;
+    b = rem;
+  }
+
+  while (b != 0) {
+    rem = a % b;
+    a = b;
+    b = rem;
+  }
+  return a;
+}
+
diff --git a/src/last_tat.c b/src/last_tat.c
new file mode 100644
index 0000000..913bf0b
--- /dev/null
+++ b/src/last_tat.c
@@ -0,0 +1,155 @@
+/* $Id: last_tat.c 938 2012-06-04 16:15:06Z wrp $ */
+
+/* copyright (c) 1996, 1997, 1998, 1999, 2014 by William R. Pearson and
+ The Rector & Visitors of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "defs.h"
+
+#include "structs.h"
+#include "param.h"
+#include "mm_file.h"
+#include "best_stats.h"
+
+
+extern int (*ranlib) (char *str, int cnt,
+ fseek_t libpos, char *libstr,
+ struct lmf_str *lm_fd);
+
+#define RANLIB (m_fptr->ranlib)
+
+#define MAX_BLINE 200
+
+int
+re_getlib(unsigned char *, struct annot_str **,int, int, int, int, int, long *, long *,
+ struct lmf_str *m_fptr);
+
+void
+do_work(unsigned char *aa0, int n0, unsigned char *aa1, int n1, int frame,
+ struct pstruct *ppst, void *f_str, int qr_flg, struct rstruct *rst);
+
+extern void
+do_opt (unsigned char *aa0, int n0, unsigned char *aa1, int n1,
+ int frame, struct pstruct *pst, void *f_str,
+ struct rstruct *rst);
+
+struct lmf_str *re_openlib(struct lmf_str *, int outtty);
+
+void sortbestz (struct beststr **bptr, int nbest);
+
+double zs_to_E(double zs,int n1, int isdna, long entries, struct db_str db);
+
+double scale_one_score(int ipos, double escore, struct db_str db, void *rs_str);
+
+/* sortbests() sorts bptr[0..nbest-1] in place into decreasing order
+   of rst.score[0], using a shell sort with gaps nbest/2, nbest/4, ... */
+void sortbests (struct beststr **bptr, int nbest)
+{
+  int gap, hi, lo;
+  struct beststr *swap_p;
+
+  gap = nbest/2;
+  while (gap > 0) {
+    for (hi = gap; hi < nbest; hi++) {
+      /* sift the entry at hi down its gap-spaced chain while the
+	 earlier entry has the strictly smaller score */
+      lo = hi - gap;
+      while (lo >= 0 &&
+	     bptr[lo]->rst.score[0] < bptr[lo + gap]->rst.score[0]) {
+	swap_p = bptr[lo];
+	bptr[lo] = bptr[lo + gap];
+	bptr[lo + gap] = swap_p;
+	lo -= gap;
+      }
+    }
+    gap /= 2;
+  }
+}
+
+/* last_calc() (last_tat version) re-scores the best hits with
+   do_opt() in batches of 500 (at most five batches), after sorting
+   them with sortbestz().
+
+   For each hit the library sequence is taken from bbp->seq->aa1b when
+   present, otherwise it is re-read into aa1save via re_openlib(),
+   RANLIB() and re_getlib().  The result of do_opt() replaces
+   bbp->rst.  Processing stops after a batch whose smallest
+   scale_one_score() value exceeds m_msg->e_cut, or early within a
+   batch at the first hit whose rst.score[0] is negative.
+
+   Returns the index one past the last hit examined.  Exits with
+   status 1 if a library file cannot be re-opened. */
+int
+last_calc(
+	  unsigned char **aa0, unsigned char *aa1save, int maxn,
+	  struct beststr **bptr, int nbest,
+	  const struct mngmsg *m_msg, struct pstruct *ppst
+	  , void **f_str
+	  , void *rstat_str)
+{
+  unsigned char *lib_seq;
+  int batch_end, ib;
+  struct beststr *cur_bp;
+  long loffset, l_off;
+  int n0, n1;
+  struct rstruct rst;
+  struct lmf_str *m_fptr;	/* name is required by the RANLIB macro */
+  char bline[60];
+  int tat_samp, tat_inc, pass;
+  double min_escore, ess;
+
+  n0 = m_msg->n0;
+
+  sortbestz(bptr,nbest);
+
+  tat_inc = 500;
+/*
+  if (zs_to_E(bptr[0]->zscore,bptr[0]->n1,0,ppst->zdb_size,m_msg->db)/
+      zs_to_E(bptr[nbest-1]->zscore,bptr[nbest-1]->n1,0,ppst->zdb_size,m_msg->db)
+      < 1e-20) { tat_inc /= 4 ;}
+*/
+
+/* || (zs_to_E(bptr[0]->zscore,bptr[0]->n1,0,ppst->zdb_size,m_msg->db)< 1e-5); */
+
+  ib = 0;
+  tat_samp = 0;
+  pass = 0;
+  while (pass < 5) {
+    tat_samp += tat_inc;
+    batch_end = min(nbest,tat_samp);
+    min_escore = 1000000.0;
+
+    while (ib < batch_end) {
+      cur_bp = bptr[ib];
+
+      if (cur_bp->rst.score[0] < 0) break;
+
+      if (cur_bp->seq->aa1b != NULL) {
+	/* sequence is still buffered */
+	n1 = cur_bp->seq->n1;
+	lib_seq = cur_bp->seq->aa1b;
+	loffset = cur_bp->seq->l_offset;
+	l_off = cur_bp->seq->l_off;
+      }
+      else {
+	/* sequence must be read again from its library file */
+	m_fptr = re_openlib(cur_bp->mseq->m_file_p,!m_msg->quiet);
+	if (m_fptr == NULL) {
+	  fprintf(stderr,"*** cannot re-open %s\n",cur_bp->mseq->m_file_p->lb_name);
+	  exit(1);
+	}
+
+	RANLIB(bline,sizeof(bline),cur_bp->mseq->lseek,cur_bp->mseq->libstr,m_fptr);
+
+	n1 = re_getlib(aa1save,NULL,maxn,m_msg->ldb_info.maxt3,
+		       m_msg->ldb_info.l_overlap,cur_bp->mseq->cont,
+		       m_msg->ldb_info.term_code,
+		       &loffset,&l_off,cur_bp->mseq->m_file_p);
+	lib_seq = aa1save;
+      }
+
+      do_opt(aa0[cur_bp->frame],n0,lib_seq,n1,cur_bp->frame,ppst,
+	     f_str[cur_bp->frame],&rst);
+      memcpy(&(cur_bp->rst),&rst,sizeof(struct rstruct));
+
+      /* track the smallest scaled score seen in this batch */
+      ess = scale_one_score(ib, cur_bp->rst.escore, m_msg->db, rstat_str);
+      if (ess < min_escore) { min_escore = ess;}
+      /*
+      fprintf(stderr,"%d: %4d %2d %3d %.4g %.4g\n",
+	      ib, bbp->rst.score[0], bbp->segnum,bbp->seglen,bbp->escore, ess);
+      */
+      ib++;
+    }
+
+    if (min_escore > m_msg->e_cut) return ib;
+    pass++;
+  }
+  return ib;
+}
diff --git a/src/last_thresh.c b/src/last_thresh.c
new file mode 100644
index 0000000..0d63130
--- /dev/null
+++ b/src/last_thresh.c
@@ -0,0 +1,62 @@
+/* $Id: last_thresh.c 808 2011-07-19 20:05:24Z wrp $ */
+
+/* copyright (c) 2011, 2014 by William R. Pearson and The
+ Rector & Visitors of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "defs.h"
+
+#include "structs.h"
+#include "param.h"
+#include "mm_file.h"
+#include "best_stats.h"
+
+#ifdef PCOMPLIB
+#include "msg.h"
+void do_stage2(struct beststr **bptr, int nbest, const struct mngmsg *m_msp0,
+ int s_func, struct qmng_str *qm_msp);
+#endif
+
+static char thresh_str[MAX_STR];
+
+int E1_to_s(double e_val, int n0, int n1, int db_size, void *pu);
+
+/* last_calc() (last_thresh version) does no re-scoring.  It converts
+   ppst->e_cut into a score threshold with E1_to_s(), using the length
+   of the first entry in bptr[], stores it in ppst->repeat_thresh, and
+   points ppst->other_info at a static string describing it.
+
+   If ppst->zdb_size is negative it is first set to
+   m_msg->db.entries.  Returns nbest unchanged. */
+int
+last_calc(
+#ifndef PCOMPLIB
+	  unsigned char **aa0, unsigned char *aa1, int maxn,
+#endif
+	  struct beststr **bptr, int nbest,
+	  const struct mngmsg *m_msg, struct pstruct *ppst
+#ifdef PCOMPLIB
+	  , struct qmng_str *qm_msp
+#else
+	  , void **f_str
+#endif
+	  , void *pstat_str)
+{
+  /* a negative zdb_size means "use the number of library entries" */
+  if (ppst->zdb_size < 0 ) {
+    ppst->zdb_size = m_msg->db.entries;
+  }
+
+  ppst->repeat_thresh = E1_to_s(ppst->e_cut, m_msg->n0,
+				bptr[0]->seq->n1, ppst->zdb_size,
+				pstat_str);
+
+  /* thresh_str is file-static, so the pointer stays valid on return */
+  sprintf(thresh_str,"Threshold: E() < %.2g score: %d\n",ppst->e_cut, ppst->repeat_thresh);
+  ppst->other_info = thresh_str;
+
+  return nbest;
+}
diff --git a/src/lav_defs.h b/src/lav_defs.h
new file mode 100644
index 0000000..7471bab
--- /dev/null
+++ b/src/lav_defs.h
@@ -0,0 +1,44 @@
+
+#define MAX_STR 512 /* standard label/message buffer */
+
+/* Without XTERNAL this header DEFINES the plotting globals below and
+   only declares have_bits/have_zdb; with XTERNAL it instead defines
+   have_bits, have_zdb and zdb_size. */
+#ifndef XTERNAL
+long pminx, pmaxx, pminy, pmaxy;
+int max_x=540, max_y=540;
+double fxscal, fyscal, fxoff, fyoff;
+
+int *linarr;
+int nlinarr=5;
+
+char lvstr[MAX_STR];
+
+/* three parallel 4-entry threshold tables; NOTE(review): presumably
+   E()-value, bit-score and raw-score cutoffs -- confirm against users */
+double elinval[4]={1e-4,1e-2,1.0,100.0};
+double blinval[4]={40.0,30.0,20.0,10.0};
+int ilinval[4]={200,100,50,25};
+extern int have_bits, have_zdb;
+#else
+int have_bits=0, have_zdb=0;
+long zdb_size=1;
+#endif
+
+/* prototypes for routines defined elsewhere; move() and clsline()
+   are each declared twice below with identical signatures */
+void openplt(long, long, int, int, char *, char *);
+void closeplt();
+void drawdiag(long n0, long n1);
+void closepl();
+void move(int, int);
+void cont(int, int);
+void clsline();
+void xaxis(long, int, char *);
+void yaxis(long, int, char *);
+void legend();
+void linetype(int);
+void opnline(int s, double bits);
+void newline();
+void clsline();
+void move(int, int);
+void draw(int, int);
+void sxy_move(int, int);
+void sxy_draw(int, int);
+void draw_str(char *);
+void draw_sstr(char *);
+
+double bit_to_E(double bits);
diff --git a/src/lib_sel.c b/src/lib_sel.c
new file mode 100644
index 0000000..9b1be12
--- /dev/null
+++ b/src/lib_sel.c
@@ -0,0 +1,341 @@
+/* $Id: lib_sel.c 792 2011-06-26 18:20:30Z wrp $ */
+
+/* copyright (c) 1996, 1997, 1998, 1999, 2014 by William R. Pearson and
+ The Rector & Visitors of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+/* modified Dec 13, 1989 requires different FASTLIBS */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <ctype.h>
+#include <string.h>
+
+#include "defs.h"
+#include "structs.h"
+
+#ifdef NCBIBL13
+#define LASTLIB NCBIBL13+1
+#else
+#define LASTLIB 11
+#endif
+
+
+struct lib_struct *get_lnames(char *tname, struct lib_struct *cur_lib_p);
+struct lib_struct *add_file(char *name, char *env, struct lib_struct *cur_lib_p);
+void lib_choice(char *lname, int nl, char *flstr, int ldnaseq);
+void subs_env(char *dest, char *src, int dest_size);
+char *ulindex(char *str, char *chr);
+
+static char ldname[MAX_FN];
+static char *libenv;
+
+/* get_lnames() adds library file names to a list and returns the
+   HEAD of that list (not the end).
+
+   iname has its environment variables expanded by subs_env() first.
+     - If the result does not start with '@', it is a single library
+       file and is appended with add_file().
+     - Otherwise the rest (up to the first ' ') names a file of file
+       names, read one per line: lines starting with ';' are skipped,
+       a line starting with '<' sets the default directory (kept in
+       the file-static libenv) for later entries, and every other line
+       is appended with add_file().
+
+   If cur_lib_p is NULL the new list head is returned; otherwise the
+   entries are linked after cur_lib_p's list and cur_lib_p is
+   returned.  Returns NULL if the file of names cannot be opened. */
+struct lib_struct *
+get_lnames(char *iname, struct lib_struct *cur_lib_p)
+{
+  char exp_name[MAX_STR], *list_name, *cut_p;
+  char lline[MAX_FN];
+  struct lib_struct *tail_p;
+  FILE *list_fp;
+
+  /* expand environment variables */
+  subs_env(exp_name, iname, sizeof(exp_name));
+
+  /* plain library file: add it and hand back the list head */
+  if (exp_name[0] != '@') {
+    tail_p = add_file(exp_name,"\0",cur_lib_p);
+    return (cur_lib_p == NULL) ? tail_p : cur_lib_p;
+  }
+
+  list_name = exp_name + 1;	/* skip the '@' */
+
+  /* remove ' ' before deftype if present */
+  cut_p = strchr(list_name,' ');
+  if (cut_p != NULL) *cut_p='\0';
+
+  list_fp = fopen(list_name,"r");
+  if (list_fp == NULL) {
+    fprintf(stderr," could not open file of names: %s\n",list_name);
+    return NULL;
+  }
+
+  /* exp_name is re-used for each expanded line from here on */
+  tail_p = cur_lib_p;
+  while (fgets(lline,sizeof(lline),list_fp)!=NULL) {
+    if (lline[0]==';') continue;
+
+    cut_p = strchr(lline,'\n');
+    if (cut_p != NULL) *cut_p='\0';
+
+    subs_env(exp_name, lline, sizeof(exp_name));
+
+    if (exp_name[0]=='<') {
+      /* "<dir" : remember dir as the default directory */
+      strncpy(ldname,&exp_name[1],sizeof(ldname));
+      ldname[sizeof(ldname)-1]='\0';
+      libenv=ldname;
+      continue;
+    }
+
+    tail_p = add_file(exp_name,libenv,tail_p);
+    if (cur_lib_p == NULL) cur_lib_p = tail_p;
+  }
+  fclose(list_fp);
+  return cur_lib_p;
+}
+
+/* lib_choice() interactively asks the user (prompts on stderr, reply
+   from stdin) for a library and copies the reply into lname.
+
+   lname   - output buffer for the reply
+   nl      - number of bytes copied into lname; the result is always
+             NUL-terminated within those nl bytes (assumes lname holds
+             at least nl bytes -- confirm against callers)
+   flstr   - name of the library choice file; if empty, or if it
+             cannot be opened, a plain "library file name:" prompt is
+             used instead of the menu
+   ldnaseq - library type; only choice-file lines whose type digit
+             (the character after '$') equals ldnaseq are listed
+
+   Choice-file lines have the form "title$<type><letter>...". Lines
+   starting with ';' or lacking '$' are ignored.  Empty replies are
+   re-prompted; end of input on stdin calls exit(0). */
+void
+lib_choice(char *lname, int nl, char *flstr, int ldnaseq)
+{
+  FILE *fch = NULL;
+  char line[MAX_STR], *bp;
+  char *chstr[MAX_CH],*chfile[MAX_CH];
+  char *chtmp = NULL, *charr = NULL;
+  int j,k,chlen = 0;
+  int have_menu = 0;
+
+  if (strlen(flstr)> (size_t)0) {
+    chlen = MAX_CH*MAX_FN;
+    if ((chtmp=charr=calloc((size_t)chlen,sizeof(char)))==NULL) {
+      fprintf(stderr,"cannot allocate choice file array\n");
+    }
+    else if ((fch=fopen(flstr,"r"))==NULL) {
+      fprintf(stderr," cannot open choice file: %s\n",flstr);
+    }
+    else have_menu = 1;
+  }
+
+  if (have_menu) {
+    chlen--;		/* keep one byte in reserve for a final NUL */
+    fprintf(stderr,"\n Choose sequence library:\n\n");
+
+    j = 0;
+    while (j<MAX_CH && fgets(line,sizeof(line),fch)!=NULL) {
+      if (line[0]==';') continue;			/* comment */
+      if ((bp=strchr(line,'\n'))!=NULL) *bp='\0';	/* remove \n */
+      if ((bp=strchr(line,'$'))==NULL) continue;	/* if no '$', continue */
+      *bp++='\0';	/* replace $ with \0, bp points to libtype */
+
+      /* if libtypes don't match, continue */
+      if ((*bp++ -'0')!=ldnaseq) continue;
+
+      /* if the library title is too long, quit */
+      if ((k=strlen(line))>chlen) break;
+
+      /* save the library title */
+      strncpy(chstr[j]=chtmp,line,chlen);
+      chtmp += k+1; chlen -= k+1;
+
+      /* save the abbreviation letter and file name */
+      if ((k=strlen(bp))>chlen) break;
+      strncpy(chfile[j]=chtmp,bp,chlen);
+      chtmp += k+1; chlen -= k+1;
+      fprintf(stderr," %c: %s\n",*chfile[j++],line);
+    }
+
+    do {
+      fprintf(stderr,"\n Enter library filename (e.g. %s), letter (e.g. P)\n",
+	      (ldnaseq==0)? "prot.lib" : "dna.lib");
+      fprintf(stderr," or a %% followed by a list of letters (e.g. %%PN): ");
+      fflush(stderr);
+      if (fgets(line,sizeof(line),stdin)==NULL) exit(0);
+      if ((bp=strchr(line,'\n'))!=NULL) *bp='\0';
+    } while (strlen(line)==0);
+  }
+  else {
+    /* no usable choice file: just ask for a file name */
+    do {
+      fprintf(stderr," library file name: ");
+      fflush(stderr);
+      if (fgets(line,sizeof(line),stdin)==NULL) exit(0);
+      if ((bp=strchr(line,'\n'))!=NULL) *bp='\0';
+    } while (strlen(line)==0);
+  }
+
+  if (nl > 0) {
+    strncpy(lname,line,nl);
+    lname[nl-1] = '\0';	/* strncpy() does not terminate a full buffer */
+  }
+
+  /* fch is NULL when the file could not be opened even though the
+     buffer was allocated, so the two are released independently */
+  if (fch != NULL) fclose(fch);
+  if (charr != NULL) free(charr);
+}
+
+/* add_lib_title() stores the first library title in ltitle, or
+   appends ",\n " plus another title to it.  ltitle is treated as a
+   MAX_STR byte buffer: the text is truncated to fit and is always
+   NUL-terminated. */
+static void
+add_lib_title(char *ltitle, const char *title)
+{
+  size_t used;
+
+  if (ltitle[0] == '\0') {
+    strncpy(ltitle,title,MAX_STR-1);
+    ltitle[MAX_STR-1] = '\0';
+    return;
+  }
+
+  used = strlen(ltitle);
+  if (used + 1 >= (size_t)MAX_STR) return;	/* already full */
+
+  /* strncat()'s count excludes the NUL it adds, hence the -1 */
+  strncat(ltitle,",\n ",MAX_STR-1-used);
+  used = strlen(ltitle);
+  strncat(ltitle,title,MAX_STR-1-used);
+}
+
+/* lib_select parses the choices in char *lname and builds the list
+   of library files.
+
+   lib_select returns the head of the list of files, or NULL if the
+   choice file flstr cannot be opened.  It exits with status 1 if an
+   abbreviation is requested while flstr is empty.
+
+   lib_select recognizes:
+   %prm -- leading '%' indicates a list of one letter abbreviations
+   +abrev1+abrev2+abrev3 -- leading '+' indicates a list of word abbreviations
+     (lname is modified in place: each '+' separator becomes '\0')
+
+   @library.fil -- leading '@' indicates a list of library files
+     (handled by get_lnames())
+
+   Any other lname longer than one character is passed to
+   get_lnames() as a file name; a single character is treated as a
+   one letter abbreviation.
+
+   ltitle receives the titles of the selected choice-file entries,
+   joined with ",\n "; it is assumed to be a MAX_STR byte buffer.
+
+   Support for NCBI .pal/.nal files needs to be added
+*/
+struct lib_struct *
+lib_select(char *lname, char *ltitle, const char *flstr, int ldnaseq)
+{
+  char line[MAX_FN*2], *bp, *bp1;
+  char *llnames[MAX_LF];	/* pointers into new list of names */
+  int new_abbr,ich, nch;	/* use new multi-letter abbr */
+  FILE *fch;
+  struct lib_struct *cur_lib_p = NULL;
+
+  new_abbr = 0;
+  nch = 0;
+  *ltitle = '\0';
+
+  if (strlen(lname) > (size_t)1 && *lname != '%' && *lname != '+') {
+    return get_lnames(lname, cur_lib_p);	/* file name */
+  }
+
+  if (*flstr=='\0') {
+    fprintf(stderr," abbrv. list request but FASTLIBS undefined, cannot use %s\n",lname);
+    exit(1);
+  }
+
+  if (strchr(lname,'+')) {
+    /* indicates list of database abbrevs (not files) */
+    new_abbr=1;
+    bp = lname+1; if (*bp == '+') bp++;
+    /* split on '+'; stop after the piece that has no further '+' */
+    while (nch < MAX_LF && bp != NULL) {
+      if ((bp1=strchr(bp,'+'))!=NULL) *bp1='\0';
+      llnames[nch++] = bp;
+      bp = (bp1 != NULL) ? bp1+1 : NULL;
+    }
+  }
+  else if (*lname=='%') {	/* list of single letter abbreviations */
+    lname++;			/* bump over '%' to get letters */
+  }
+
+  /* else just use a single character abbreviation */
+
+  /* flstr is known to be non-empty here */
+  if ((fch=fopen(flstr,"r"))==NULL) {
+    fprintf(stderr," cannot open choice file: %s\n",flstr);
+    return NULL;
+  }
+
+  /* read each line of FASTLIBS */
+  while (fgets(line,sizeof(line),fch)!=NULL) {
+    if (line[0]==';') continue;			/* skip comments */
+    if ((bp=strchr(line,'\n'))!=NULL) *bp='\0';	/* remove '\n' */
+    if ((bp=strchr(line,'$'))==NULL) continue;	/* no delim, continue */
+    *bp++='\0';					/* point to library type */
+    if ((*bp++ -'0')!=ldnaseq) continue;	/* doesn't match, continue */
+
+    /* if !new_abbr, match on one letter with ulindex() */
+    if (!new_abbr) {
+      if (*bp=='+') continue;	/* entry uses a word abbreviation */
+      if (ulindex(lname,bp)!=NULL) {
+	add_lib_title(ltitle,line);
+	cur_lib_p = get_lnames(bp+1, cur_lib_p);
+      }
+    }
+    else {
+      if (*bp!='+') continue;	/* entry uses a letter abbreviation */
+      bp++;
+      if ((bp1 = strchr(bp,'+'))==NULL) {
+	fprintf(stderr,"%s missing final '+'\n",bp);
+	continue;
+      }
+      /* temporarily terminate the abbreviation for strcmp() */
+      *bp1='\0';
+      for (ich = 0; ich<nch; ich++) {
+	if (strcmp(llnames[ich],bp)==0) {
+	  add_lib_title(ltitle,line);
+	  cur_lib_p = get_lnames(bp1+1, cur_lib_p);
+	  break;
+	}
+      }
+      *bp1='+';
+    }
+  }
+  fclose(fch);
+
+  return cur_lib_p;
+}
+
+/* unlike lib_select() and get_lnames(), add_file() returns a new
+   pointer, to which library files can be added
+
+   fname     - library file name to add
+   env       - default directory; when non-NULL and non-empty it is
+               prefixed to fname (followed by "/" when UNIX is
+               defined) unless fname starts with '#'
+   cur_lib_p - any entry of the current list, or NULL; the new entry
+               is linked after the last entry reachable from it
+
+   The combined name is truncated to MAX_STR-1 characters.
+
+   Returns the newly allocated entry.  If the name cannot be
+   allocated, a message is printed and the current last entry (or
+   NULL for an empty list) is returned; if the lib_struct cannot be
+   allocated, NULL is returned.
+*/
+struct lib_struct *
+add_file(char *fname, char *env, struct lib_struct *cur_lib_p)
+{
+  char tname[MAX_STR];
+  char *lbptr;
+  size_t len;
+  struct lib_struct *this_lib_p;
+
+  tname[0]='\0';
+
+  /* check for default directory for files */
+  if (env != NULL && *env != '\0' && *fname != '#') {
+    /* add default directory to file name */
+    strncpy(tname,env,sizeof(tname)-1);
+    tname[sizeof(tname)-1]='\0';	/* strncpy() may not terminate */
+#ifdef UNIX
+    strncat(tname,"/",sizeof(tname)-strlen(tname)-1);
+#endif
+  }
+
+  /* get to the end of the current list */
+  while (cur_lib_p && cur_lib_p->next) {cur_lib_p = cur_lib_p->next;}
+
+  /* add fname to tname, allocate space, and move to space */
+  strncat(tname,fname,sizeof(tname)-strlen(tname)-1);
+  len=strlen(tname)+1;
+  if ((lbptr=calloc(len,sizeof(char)))==NULL) {
+    fprintf(stderr,"no more space for filenames: %s ignored\n",fname);
+    return cur_lib_p;
+  }
+  strncpy(lbptr,tname,len);
+  lbptr[len-1]='\0';
+
+  /* have a file name to add, I need a lib_struct */
+  if ((this_lib_p = (struct lib_struct *)calloc(1,sizeof(struct lib_struct)))==NULL) {
+    fprintf (stderr,"*** Error -- Cannot allocate lib_struct for %s\n",tname);
+    free(lbptr);	/* nothing will own the name copy */
+    return NULL;
+  }
+
+  this_lib_p->file_name = lbptr;
+  if (cur_lib_p != NULL) {cur_lib_p->next = this_lib_p;}
+  return this_lib_p;
+}
+
+/* ulindex() returns a pointer to the first character of str that
+   matches the first character of chr, ignoring case, or NULL if
+   there is none.  Only chr[0] is examined; if chr is empty the
+   result is NULL. */
+char *
+ulindex(char *str, char *chr)
+{
+  int c;
+
+  /* tolower() is only defined for unsigned char values and EOF, so
+     plain (possibly negative) chars are converted first */
+  c = tolower((unsigned char)*chr);
+
+  while (*str != '\0' && tolower((unsigned char)*str) != c ) str++;
+  if (*str=='\0') return NULL;
+  else return str;
+}
diff --git a/src/list_db.c b/src/list_db.c
new file mode 100644
index 0000000..40cb97d
--- /dev/null
+++ b/src/list_db.c
@@ -0,0 +1,245 @@
+/* list_db.c - report values from map_db.c */
+
+/* $Id: list_db.c $ */
+
+/* copyright (c) 1999, 2014 by William R. Pearson and The Rector &
+ Visitors of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+/* format of the index file:
+
+1) map_db version number ["MP"+2 bytes]
+2) number of sequences in database [4 bytes]
+3) total length of database [8 bytes]
+4) longest sequence in database [8 bytes]
+5) list of offsets to definitions [num_seq+1] int*8
+6) list of offsets to sequences [num_seq+1] int*8
+7) list of flag characters for sequences [num_seq+1] bytes
+ (used for GCG binary to encode 2bit or 4 bit representation)
+
+ sequence files will be as defined by their format
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "uascii.h"
+#include "mm_file.h"
+/* #include "ncbl2_head.h" */
+
+void src_int4_write(FILE *, int);
+void src_int4_read(FILE *, int *);
+void src_long4_read(FILE *, long *);
+void src_long8_write(FILE *, int64_t);
+void src_long8_read(FILE *, int64_t *);
+
+void newname(char *nname, char *oname, char *suff, int maxn);
+
+/* list_db - print the contents of a map_db index file.
+
+   usage: list_db libname
+
+   If no argument is given the library name is read from stdin.  Anything
+   after the first blank in the name is dropped.  The index is read from
+   "<libname>.xin" (see newname()).
+
+   Output: the four header bytes, then the number of entries, the total
+   length and the longest sequence, then one line per entry with the offset
+   of its description and the offset of its sequence.
+
+   Exits with status 1 if the index cannot be opened, is truncated, or
+   memory cannot be allocated.
+*/
+int
+main(int argc, char **argv)
+{
+  FILE *libi;
+  char lname[256];
+  char iname[256];
+  char format[4];
+  char *bp;
+
+  int i;
+  int lib_aa;		/* 0 => DNA, 1 => protein */
+  int nlib;		/* number of entries */
+  fseek_t f_size;
+  long lf_size;		/* (long) version of f_size */
+  long max_len;		/* longest sequence */
+  MM_OFF tot_len;	/* total sequence length */
+
+  fseek_t *d_pos_arr;	/* array of description pointers */
+  fseek_t *s_pos_arr;	/* array of sequence pointers */
+  long *tmp_pos_arr = NULL;	/* 4-byte offsets before they are widened */
+  char *attr_arr;	/* array of attribute chars */
+
+  int mm64_flag;	/* non-zero: offsets are stored in 8 bytes */
+
+  /* get the library name */
+  if (argc > 1) {
+    /* strncpy() does not terminate when argv[1] fills the buffer */
+    strncpy(lname, argv[1], sizeof(lname)-1);
+    lname[sizeof(lname)-1] = '\0';
+  }
+  else {
+    fprintf(stderr," Entry library name: ");
+    if (fgets(lname,sizeof(lname),stdin)==NULL) {
+      fprintf(stderr," no library name given\n");
+      exit(1);
+    }
+    if ((bp=strchr(lname,'\n'))!=NULL) *bp='\0';
+  }
+
+  /* a library type may follow the name; it is not needed here */
+  if ((bp=strchr(lname,' '))!=NULL) *bp='\0';
+
+  newname(iname,lname,"xin",sizeof(iname));
+
+  /* the index is binary data: "rb" avoids text-mode translation */
+  if ((libi=fopen(iname,"rb"))==NULL) {
+    fprintf(stderr," cannot open %s\n",iname);
+    exit(1);
+  }
+
+  if (fread(format,1,sizeof(format),libi) != sizeof(format)) {
+    fprintf(stderr," cannot read format from %s\n",iname);
+    exit(1);
+  }
+  printf("%c%c%d %d\n",format[0],format[1],format[2],format[3]);
+  mm64_flag = (format[2]==1);
+
+  src_int4_read(libi,&lib_aa);
+
+  if (mm64_flag) src_long8_read(libi,&f_size);
+  else {
+    src_long4_read(libi,&lf_size);
+    f_size = lf_size;
+  }
+
+  src_int4_read(libi,&nlib);
+
+  if (mm64_flag) {
+    src_long8_read(libi,&tot_len);
+  }
+  else {
+    src_long4_read(libi,&lf_size);
+    tot_len = lf_size;
+  }
+  src_long4_read(libi,&max_len);
+
+  /* nlib sizes every allocation below, so reject a damaged header now */
+  if (feof(libi) || ferror(libi) || nlib < 0) {
+    fprintf(stderr," %s: bad or truncated index header\n",iname);
+    exit(1);
+  }
+
+  /* tot_len is a project typedef; cast so that it matches %lld */
+  printf(" %d entries; tot: %lld; max: %ld\n",nlib,(long long)tot_len,max_len);
+
+  /* allocate temporary array for 4-byte offsets */
+  if (!mm64_flag) {
+    if ((tmp_pos_arr=(long *)calloc((size_t)nlib+1,sizeof(long)))==NULL) {
+      fprintf(stderr," cannot allocate %d for tmp_pos array\n",
+	      nlib+1);
+      exit(1);
+    }
+  }
+
+  /* allocate array of description pointers */
+  if ((d_pos_arr=(fseek_t *)calloc((size_t)nlib+1, sizeof(fseek_t)))==NULL) {
+    fprintf(stderr," cannot allocate %d for desc. array\n",nlib+1);
+    exit(1);
+  }
+  /* allocate array of sequence pointers */
+  if ((s_pos_arr=(fseek_t *)calloc((size_t)nlib+1, sizeof(fseek_t)))==NULL) {
+    fprintf(stderr," cannot allocate %d for seq. array\n",nlib+1);
+    exit(1);
+  }
+  if ((attr_arr=(char *)calloc((size_t)nlib+1, sizeof(char)))==NULL) {
+    fprintf(stderr," cannot allocate %d for attr. array\n",nlib+1);
+    exit(1);
+  }
+
+  if (mm64_flag) {
+    for (i=0; i<=nlib; i++) src_long8_read(libi,&d_pos_arr[i]);
+    for (i=0; i<=nlib; i++) src_long8_read(libi,&s_pos_arr[i]);
+  }
+  else {
+    for (i=0; i<=nlib; i++) src_long4_read(libi,&tmp_pos_arr[i]);
+    for (i=0; i<=nlib; i++) d_pos_arr[i] = tmp_pos_arr[i];
+    for (i=0; i<=nlib; i++) src_long4_read(libi,&tmp_pos_arr[i]);
+    for (i=0; i<=nlib; i++) s_pos_arr[i] = tmp_pos_arr[i];
+  }
+
+  if (feof(libi) || ferror(libi)) {
+    fprintf(stderr," %s: offset tables are truncated\n",iname);
+    exit(1);
+  }
+
+  /* the attribute bytes are read but not reported */
+  fread(attr_arr,(size_t)nlib+1,sizeof(char),libi);
+  fclose(libi);
+
+  printf("header\tseq\n");
+
+  for (i=0; i<nlib; i++)
+    printf("%lld\t%lld\n",(long long)d_pos_arr[i],(long long)s_pos_arr[i]);
+
+  exit(0);
+}
+
+/* src_int4_read() - read a 4-byte integer, most significant byte first,
+   from fd into *val.
+
+   *val is set to 0 when fewer than 4 bytes can be read; the caller can tell
+   this apart from a stored 0 with feof()/ferror().
+*/
+void src_int4_read(FILE *fd, int *val)
+{
+  unsigned char b[4];
+  unsigned int uval;
+
+  *val = 0;
+  if (fread(b,(size_t)1,(size_t)4,fd) != (size_t)4) return;
+
+  /* assemble in unsigned arithmetic: shifting a set bit into the sign bit
+     of an int is undefined.  Building the value from bytes gives the same
+     result on big- and little-endian hosts. */
+  uval = ((unsigned int)b[0]<<24) | ((unsigned int)b[1]<<16)
+    | ((unsigned int)b[2]<<8) | (unsigned int)b[3];
+  *val = (int)uval;
+}
+
+/* src_long4_read() - read a 4-byte integer, most significant byte first,
+   from fd and store it, sign-extended, in the long *val.
+
+   *val is set to 0 when fewer than 4 bytes can be read; the caller can tell
+   this apart from a stored 0 with feof()/ferror().
+*/
+void src_long4_read(FILE *fd, long *val)
+{
+  unsigned char b[4];
+  unsigned int uval;
+
+  *val = 0;
+  if (fread(b,(size_t)1,(size_t)4,fd) != (size_t)4) return;
+
+  /* assemble in unsigned arithmetic: shifting a set bit into the sign bit
+     of an int is undefined */
+  uval = ((unsigned int)b[0]<<24) | ((unsigned int)b[1]<<16)
+    | ((unsigned int)b[2]<<8) | (unsigned int)b[3];
+
+  /* go through int so that the stored 32-bit value keeps its sign */
+  *val = (long)(int)uval;
+}
+
+/* src_long8_read() - read an 8-byte integer, most significant byte first,
+   from fd into *val.
+
+   *val is set to 0 when fewer than 8 bytes can be read; the caller can tell
+   this apart from a stored 0 with feof()/ferror().
+*/
+void src_long8_read(FILE *fd, int64_t *val)
+{
+  unsigned char b[8];
+  unsigned long long uval;
+  int i;
+
+  *val = 0;
+  if (fread(b,(size_t)1,(size_t)8,fd) != (size_t)8) return;
+
+  /* the value must be accumulated in a 64-bit type; accumulating in int
+     discards the four most significant bytes */
+  uval = 0;
+  for (i=0; i<8; i++) uval = (uval<<8) | (unsigned long long)b[i];
+
+  *val = (int64_t)uval;
+}
+
+/* src_int4_write() - write the low 32 bits of val to fd as 4 bytes, most
+   significant byte first. */
+void src_int4_write(FILE *fd, int val)
+{
+#ifdef IS_BIG_ENDIAN
+  fwrite(&val,(size_t)4,(size_t)1,fd);
+#else
+  unsigned char bytes[4];
+  int k;
+
+  /* peel off the low byte and store it from the end of the buffer back */
+  for (k = 3; k >= 0; k--) {
+    bytes[k] = val & 255;
+    val = val >> 8;
+  }
+
+  fwrite(bytes,(size_t)1,(size_t)4,fd);
+#endif
+}
+
+/* newname() - build "<oname>.<suff>" in nname.
+
+   maxn is the size of the nname buffer.  The result is always terminated
+   and is truncated when it does not fit; nothing is written when
+   maxn <= 0.
+*/
+void
+newname(char *nname, char *oname, char *suff, int maxn)
+{
+  size_t room;		/* characters that fit in front of the '\0' */
+
+  if (maxn <= 0) return;
+  room = (size_t)maxn - 1;
+
+  /* strncat() always terminates, and each call is limited to the space
+     that is still free, so nname cannot be overrun */
+  nname[0] = '\0';
+  strncat(nname, oname, room);
+  strncat(nname, ".", room - strlen(nname));
+  strncat(nname, suff, room - strlen(nname));
+}
diff --git a/src/llgetaa.c b/src/llgetaa.c
new file mode 100644
index 0000000..4e1910a
--- /dev/null
+++ b/src/llgetaa.c
@@ -0,0 +1,497 @@
+/* $Id: llgetaa.c 625 2011-03-23 17:21:38Z wrp $ */
+
+/* copyright (c) 1996, 1997, 1998, 1999, 2007 by William R. Pearson and
+ The Rector & Visitors of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+/*
+ Feb, 1998 - version for prss
+
+ March, 2001 - modifications to support comp_thr.c: use libpos to indicate
+ whether the score is shuffled==1 or unshuffled==0. This simplifies
+ complib.c and makes comp_thr.c possible
+
+ modified version of nxgetaa.c that generates random sequences
+ for a library
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "defs.h"
+#include "mm_file.h"
+
+#include "uascii.h"
+#include "structs.h"
+
+#define XTERNAL
+#include "upam.h"
+#undef XTERNAL
+
+#define YES 1
+#define NO 0
+#define MAXLINE 512
+
+#ifndef min
+#define min(x,y) ((x) > (y) ? (y) : (x))
+#endif
+
+static int use_stdin=0;
+static char llibstr0[256];
+static char llibstr1[256];
+static char o_line[256];
+
+#define NO_FORMAT 0
+#define FASTA_FORMAT 1
+#define GCG_FORMAT 2
+static int seq_format=NO_FORMAT;
+static char seq_title[200];
+
+extern int irand(int);
+extern void shuffle(unsigned char *from, unsigned char *to, int n);
+extern void wshuffle(unsigned char *from, unsigned char *to, int n, int wsiz, int *ieven);
+/* getseq() - read one sequence from filen (or from stdin when filen is "-"
+   or "@") into seq[] and return the number of residues stored.
+
+   filen    - file name, optionally followed by ":start-stop" or ":-stop";
+              the ':' is overwritten with '\0' while the file is opened and
+              is put back afterwards
+   qascii   - table that maps an input character (masked with AAMASK) to a
+              residue code; qascii['*'] is overwritten with qascii['X'] when
+              the input is taken to be GCG format
+   seq      - receives the residue codes; seq[n] is set to EOSEQ
+   maxs     - maximum number of residues to store
+   libstr   - receives up to n_libstr-1 characters of the description line
+   sq0off   - set to start when a subset was given and either *sq0off==1 on
+              entry or start > 1
+
+   A line that starts with '>' selects FASTA_FORMAT.  Otherwise, while
+   seq_format is still NO_FORMAT, the input is treated as GCG_FORMAT and
+   lines are skipped up to the one that ends with "..".  Reading stops at
+   end of file, at the second '>' or ';' line (which is saved in o_line),
+   or at a character that maps to ES.
+
+   Returns 0 if the file cannot be opened or the GCG header is incomplete.
+
+   NOTE(review): the strncpy() calls into seq_title, llibstr0, llibstr1 and
+   o_line do not add a terminating '\0' when the source fills the buffer.
+   NOTE(review): l_offset looks unset when the first line is not a '>' line
+   and seq_format is no longer NO_FORMAT (seq_format is static and keeps
+   its value between calls) - confirm.
+*/
+int
+getseq(char *filen, int *qascii,
+ unsigned char *seq, int maxs, char *libstr,
+ int n_libstr, long *sq0off)
+{
+ FILE *fptr;
+ char line[512],*bp;
+ int i, j, n;
+ int ic;
+ int sstart, sstop, sset=0;
+ int have_desc = 0;
+ int desc_complete = 0;
+ int llen, l_offset;
+
+ seq_title[0]='\0';
+
+ sstart = sstop = -1;
+#ifndef DOS
+ if ((bp=strchr(filen,':'))!=NULL) {
+#else
+ if ((bp=strchr(filen+3,':'))!=NULL) {
+#endif
+ *bp='\0'; /* cut the subset off so that filen can be opened */
+ if (*(bp+1)=='-') sscanf(bp+2,"%d",&sstop);
+ else sscanf(bp+1,"%d-%d",&sstart,&sstop);
+ sset=1;
+ }
+
+ if (strcmp(filen,"-") && strcmp(filen,"@")) {
+ if ((fptr=fopen(filen,"r"))==NULL) {
+ fprintf(stderr," could not open %s\n",filen);
+ return 0;
+ }
+ }
+ else {
+ fptr = stdin;
+ use_stdin++; /* counts how many times stdin has been selected */
+ }
+
+ if (use_stdin > 1) { /* description line was saved in o_line by the previous call */
+ have_desc = 1;
+ if ((bp=strchr(o_line,'\001'))!=NULL) *bp='\0';
+ strncpy(llibstr1,o_line,sizeof(llibstr1));
+ strncpy(libstr,o_line,n_libstr);
+ libstr[n_libstr-1]='\0';
+ l_offset = 0;
+ }
+
+ if (sset==1) {
+ filen[strlen(filen)]=':'; /* put the ':' back */
+ if (*sq0off==1 || sstart>1) *sq0off = sstart;
+ }
+
+ desc_complete = 0;
+ n=0;
+ while(fgets(line,sizeof(line),fptr)!=NULL) {
+ if (line[0]=='>') {
+ if (have_desc) { /* start of the next entry: keep it for the next call */
+ strncpy(o_line,line,sizeof(o_line));
+ goto last;
+ }
+ l_offset = 0;
+ seq_format = FASTA_FORMAT;
+
+ if ((bp=(char *)strchr(line,'\n'))!=NULL) {
+ *bp='\0'; /* have newline */
+ desc_complete = 1;
+ }
+
+ if ((bp=strchr(line+1,'\001'))!=NULL) *bp='\0';
+ strncpy(seq_title,line+1,sizeof(seq_title));
+ strncpy(llibstr0,line+1,sizeof(llibstr0));
+ if (n_libstr <= 20) { /* short libstr: keep only the first word */
+ if ((bp=(char *)strchr(line,' '))!=NULL) *bp='\0';
+ }
+ strncpy(libstr,line+1,n_libstr);
+ libstr[n_libstr-1]='\0';
+
+ if (!desc_complete) { /* description longer than line[]: skip the rest of it */
+ while (fgets(line, sizeof(line), fptr) != NULL) {
+ if (strchr(line,'\n') != NULL) {
+ line[0]='>'; /* mark the tail as description so it is not read as sequence */
+ break;
+ }
+ }
+ desc_complete = 1;
+ }
+ }
+ else if (seq_format==NO_FORMAT) {
+ seq_format = GCG_FORMAT;
+ qascii['*'] = qascii['X'];
+ l_offset = 10;
+ llen = strlen(line);
+ while (strncmp(&line[llen-3],"..\n",(size_t)3) != 0) { /* NOTE(review): reads before line[] when llen < 3 */
+ if (fgets(line,sizeof(line),fptr)==NULL) return 0;
+ llen = strlen(line);
+ }
+ if (n_libstr <= 20) {
+ if ((bp=(char *)strchr(line,' '))!=NULL) *bp='\0';
+ else if ((bp=(char *)strchr(line,'\n'))!=NULL) *bp='\0';
+ }
+ strncpy(libstr,line,n_libstr);
+ libstr[n_libstr-1]='\0';
+ if (fgets(line,sizeof(line),fptr)==NULL) return 0;
+ }
+
+ if (seq_format==GCG_FORMAT && strlen(line)<l_offset) continue;
+
+ if (line[0]!='>'&& line[0]!=';') {
+ for (i=l_offset; (n<maxs)&&
+ ((ic=qascii[line[i]&AAMASK])<EL); i++)
+ if (ic<NA) seq[n++]= ic; /* only codes below NA are stored */
+ if (ic == ES) break;
+ }
+ else {
+ if (have_desc) {
+ strncpy(o_line,line,sizeof(o_line));
+ goto last;
+ }
+ else {
+ have_desc = 1;
+ }
+ }
+ }
+
+ last:
+ if (n==maxs) {
+ fprintf(stderr," sequence may be truncated %d %d\n",n,maxs);
+ fflush(stderr);
+ }
+ if ((bp=strchr(libstr,'\n'))!=NULL) *bp = '\0';
+ if ((bp=strchr(libstr,'\r'))!=NULL) *bp = '\0';
+ seq[n]= EOSEQ;
+
+ if (fptr!=stdin) fclose(fptr);
+
+ if (sset) { /* keep only residues start..stop (1-based, inclusive) */
+ if (sstart <= 0) sstart = 1;
+ if (sstop <= 0) sstop = n;
+ sstart--;
+ sstop--;
+ for (i=0, j=sstart; j<=sstop; i++,j++) /* NOTE(review): sstop is not checked against n */
+ seq[i] = seq[j];
+ n = sstop - sstart +1;
+ seq[n]=EOSEQ;
+ }
+
+ return n;
+}
+
+/* gettitle() - copy the title (first line that starts with '>' or ';') of
+   filen into title, using at most len bytes including the '\0'.
+
+   When a sequence has already been read from stdin (use_stdin != 0) the
+   file is not opened; the description saved by getseq() is used instead
+   (llibstr0 for the first read, llibstr1 afterwards).
+
+   A ":..." suffix on filen is hidden while the file is opened and is
+   always put back before returning.
+
+   Returns the length of the title, or 0 if len <= 0, the file cannot be
+   opened, or it contains no title line.
+*/
+int
+gettitle(filen,title,len)
+     char *filen, *title; int len;
+{
+  FILE *fptr;
+  char line[512];
+  char *bp;
+  char *colon_p;	/* position of the ':' that was overwritten */
+#ifdef WIN32
+  char *strpbrk();
+#endif
+
+  if (len <= 0) return 0;
+
+  if (use_stdin) {
+    strncpy(title, (use_stdin == 1) ? llibstr0 : llibstr1, len);
+    /* strncpy() does not terminate when the source fills title */
+    title[len-1]='\0';
+    if ((bp=strchr(title,'\001'))!=NULL) *bp='\0';
+    return strlen(title);
+  }
+
+  if ((colon_p=strchr(filen,':'))!=NULL) *colon_p='\0';
+
+  fptr=fopen(filen,"r");
+
+  /* restore the caller's string on every path, also when fopen() failed */
+  if (colon_p!=NULL) *colon_p=':';
+
+  if (fptr==NULL) {
+    fprintf(stderr," file %s was not found\n",filen);
+    fflush(stderr);
+    return 0;
+  }
+
+  while(fgets(line,sizeof(line),fptr)!=0) {
+    if (line[0]=='>'|| line[0]==';') goto found;
+  }
+  fclose(fptr);
+  title[0]='\0';
+  return 0;
+
+ found:
+  if ((bp=strchr(line,'\001'))!=NULL) *bp = 0;
+#ifdef WIN32
+  bp = strpbrk(line,"\n\r");
+#else
+  bp = strchr(line,'\n');
+#endif
+  if (bp!=NULL) *bp = 0;
+  strncpy(title,line,len);
+  title[len-1]='\0';
+  fclose(fptr);
+  return strlen(title);
+}
+
+FILE *libf=NULL;
+
+long lpos;
+char lline[MAXLINE];
+int lfflag=0; /* flag for CRLF in EMBL CDROM files */
+#define LFCHAR '\015' /* for MWC 5.5 */
+
+int agetlib(); void aranlib(); /* pearson fasta format */
+
+/* the following is from fgetgb.c */
+
+/* a file name for open_lib may now include a library type suffix */
+/* only opens fasta format files */
+
+static char libn_save[MAX_FN];
+static int ldna_save=0;
+static int do_shuffle;
+static int shuff_cnt=10;
+static int w_flag = 0;
+#ifdef DEBUG
+static FILE *dfile=NULL;
+#endif
+static unsigned char *aa_save;
+static int n1_save;
+static int i_even;
+
+/* lmf_str * is used here for compatibility with the "normal" open_lib,
+ but is largely unnecessary */
+
+/* set_shuffle() - take the shuffle settings from m_msg.
+
+   m_msg.shuff_wid > 0 becomes the window size (w_flag) used by agetlib();
+   m_msg.shuff_max replaces the shuffle count (shuff_cnt) only when it is
+   larger than the current value.
+
+   With DEBUG defined, and a non-empty m_msg.dfile, the file
+   "<dfile>_rlib" is opened for writing.
+*/
+void
+set_shuffle(struct mngmsg m_msg) {
+#ifdef DEBUG
+  char dfname[MAX_FN];
+#endif
+
+  if (m_msg.shuff_wid > 0) w_flag = m_msg.shuff_wid;
+  if (m_msg.shuff_max > shuff_cnt) shuff_cnt = m_msg.shuff_max;
+
+#ifdef DEBUG
+  if (m_msg.dfile[0]!='\0') {
+    /* strncat() takes the space that is LEFT, not the size of the buffer;
+       the base name is shortened so that the suffix always fits
+       (assumes MAX_FN > sizeof("_rlib") - TODO confirm) */
+    dfname[0] = '\0';
+    strncat(dfname,m_msg.dfile,sizeof(dfname)-sizeof("_rlib"));
+    strncat(dfname,"_rlib",sizeof(dfname)-strlen(dfname)-1);
+    dfile = fopen(dfname,"w");
+  }
+#endif
+}
+
+/* open_lib() - set up the "library" of shuffled sequences.
+
+   No file is opened here: the name is saved in libn_save and is read by
+   getseq() on the first call to agetlib().  A new lmf_str is allocated and
+   filled with the name, the sascii table and the agetlib()/aranlib()
+   functions; shuffling is reset (do_shuffle = 0) and irand(0) is called.
+
+   ldnaseq, quiet and m_fd are accepted for compatibility with the "normal"
+   open_lib and are not used.
+
+   Returns the new lmf_str, or NULL if it cannot be allocated.
+*/
+struct lmf_str *
+open_lib(char *lname, int ldnaseq, int *sascii, int quiet, struct lmf_str *m_fd)
+{
+  struct lmf_str *m_fptr;
+
+  /* strncpy() does not terminate when lname fills the buffer */
+  strncpy(libn_save,lname,sizeof(libn_save)-1);
+  libn_save[sizeof(libn_save)-1]='\0';
+
+  /* now allocate the structure that describes the library */
+  if ((m_fptr = calloc(1,sizeof(struct lmf_str)))==NULL) {
+    /* sizeof yields size_t; cast so that it matches %ld */
+    fprintf(stderr," cannot allocate lmf_str (%ld) for %s\n",
+	    (long)sizeof(struct lmf_str),lname);
+    return NULL;
+  }
+
+  strncpy(m_fptr->lb_name,lname,MAX_FN);
+  m_fptr->lb_name[MAX_FN-1]='\0';
+
+  m_fptr->sascii = sascii;
+  m_fptr->getlib = agetlib;
+  m_fptr->ranlib = aranlib;
+  m_fptr->mm_flg = 0;
+
+  do_shuffle = 0;
+  irand(0);	/* initialize the random number generator */
+
+  return m_fptr;
+}
+
+/* closelib() - close the library file, and the DEBUG output file, if they
+   are open.  Both pointers are reset, so calling closelib() again is
+   harmless. */
+void
+closelib()
+{
+  if (libf!=NULL) {
+    fclose(libf);
+    libf = NULL;
+  }
+#ifdef DEBUG
+  if (dfile) {
+    fclose(dfile);
+    dfile = NULL;	/* a second fclose() of the same stream is undefined */
+  }
+#endif
+}
+
+static int ieven=0;
+static char *desc_save;
+
+/* agetlib() - "read" the next library sequence.
+
+   First call after open_lib(): the sequence named by libn_save is read
+   with getseq(), returned unshuffled in seq, and saved together with its
+   description; *libpos is set to 0.
+
+   Later calls: a shuffled copy of the saved sequence is returned and
+   *libpos is set to 1.  shuffle() is used, or wshuffle() with window
+   w_flag when w_flag > 0.  shuff_cnt is decremented on every such call and
+   -1 is returned once it has run out.
+
+   libstr receives up to n_libstr-1 characters of the description; *l_off
+   is always set to 1.  lcont is not used.
+
+   Returns the sequence length, getseq()'s value when that is < 1, or -1
+   when the sequence cannot be saved or no more shuffles are due.
+*/
+int
+agetlib(unsigned char *seq,
+	int maxs,
+	char *libstr,
+	int n_libstr,
+	fseek_t *libpos,
+	int *lcont,
+	struct lmf_str *lf_fd,
+	long *l_off)
+{
+  long sq1_off;
+  char lib_desc[120];
+  size_t desc_len;
+#ifdef DEBUG
+  int i;
+#endif
+
+  *l_off = 1;
+
+  if (!do_shuffle) {
+    do_shuffle = 1;
+
+    /* getseq() reads both of these before it is certain to write them */
+    sq1_off = 1;
+    lib_desc[0] = '\0';
+
+    if ((n1_save = getseq(libn_save,lf_fd->sascii,
+			  seq,maxs,lib_desc,sizeof(lib_desc),&sq1_off)) < 1)
+      return n1_save;
+
+    strncpy(libstr,lib_desc,n_libstr);
+    libstr[n_libstr-1]='\0';
+
+    if ((aa_save = (unsigned char *)calloc(n1_save+1,sizeof(unsigned char)))==
+	NULL) {
+      fprintf(stderr," cannot allocate %d for saved sequence\n",
+	      n1_save);
+      return -1;	/* nothing to shuffle without the saved copy */
+    }
+    memcpy((void *)aa_save,(void *)seq,n1_save);
+
+    desc_len = strlen(lib_desc);
+    if ((desc_save =
+	 (char *)calloc(desc_len+1,sizeof(char)))== NULL) {
+      fprintf(stderr," cannot allocate saved desciption [%d]\n",
+	      (int)(desc_len+1));
+    }
+    else {
+      memcpy(desc_save,lib_desc,desc_len+1);	/* includes the '\0' */
+    }
+
+    *libpos = 0;
+    return n1_save;
+  }
+  else {  /* return a shuffled sequence - here we need a window size; */
+    /* desc_save is NULL if its allocation failed on the first call */
+    strncpy(libstr,(desc_save != NULL) ? desc_save : "",n_libstr);
+    libstr[n_libstr-1]='\0';
+
+    if (aa_save == NULL) return -1;
+    if (shuff_cnt-- <= 0 ) return -1;
+    if (w_flag > 0) wshuffle(aa_save,seq,n1_save,w_flag,&ieven);
+    else shuffle(aa_save,seq,n1_save);
+    seq[n1_save] = EOSEQ;
+#ifdef DEBUG
+    if (dfile!=NULL) {
+      fprintf(dfile,">%d\n",shuff_cnt);
+      for (i=0; i<n1_save; i++) {
+	if (aa[seq[i]]>0) fputc(aa[seq[i]],dfile);
+	else {fprintf(stderr,"error aa0[%d]: %d %d\n",
+		      i,seq[i],aa[seq[i]]);}
+	if (i%60 == 59) fputc('\n',dfile);
+      }
+      fputc('\n',dfile);
+    }
+#endif
+    *libpos = 1;
+    return n1_save;
+  }
+}
+
+/* aranlib() - copy the saved description into str (cnt bytes at most,
+   always terminated).
+
+   When stdin has been read twice (use_stdin == 2) the description comes
+   from llibstr1, without a leading '>' or ';'; otherwise it comes from
+   desc_save.  The result ends at the first '\001' or, when there is none,
+   at the first newline.  seek, libstr and lm_fd are not used.
+*/
+void
+aranlib(char *str,
+	int cnt,
+	fseek_t seek,
+	char *libstr,
+	struct lmf_str *lm_fd)
+{
+  const char *from;	/* description to copy */
+  char *cut;		/* where the copy is ended */
+
+  if (use_stdin != 2) {
+    from = desc_save;
+  }
+  else {
+    from = (llibstr1[0]=='>' || llibstr1[0]==';') ? llibstr1+1 : llibstr1;
+  }
+
+  strncpy(str,from,cnt);
+  str[cnt-1]='\0';
+
+  cut = strchr(str,'\001');
+  if (cut == NULL) cut = strchr(str,'\n');
+  if (cut != NULL) *cut='\0';
+}
+
+/*
+void
+revcomp(unsigned char *seq, int n, int *c_nt)
+{
+ unsigned char tmp;
+ int i, ni;
+
+
+ for (i=0, ni = n-1; i< n/2; i++,ni--) {
+ tmp = c_nt[seq[i]];
+ seq[i] = c_nt[seq[ni]];
+ seq[ni] = tmp;
+ }
+ if ((n%2)==1) {
+ i = n/2;
+ seq[i] = c_nt[seq[i]];
+ }
+}
+*/
+
+/* re_openlib() - nothing has to be re-opened for the shuffled library: the
+   descriptor that is passed in is handed back unchanged.  outtty is not
+   used. */
+struct lmf_str *
+re_openlib(struct lmf_str *om_fptr, int outtty)
+{
+  struct lmf_str *same_fptr = om_fptr;
+
+  (void)outtty;
+  return same_fptr;
+}
+
+/* re_getlib() - the sequence is already in aa1, so nothing is read again:
+   *loffset is set to 0, *l_off to 1, and the length n1 is returned as it
+   was given.  The other arguments are not used. */
+int re_getlib(unsigned char *aa1, int n1, int maxt3, int loff, int cont,
+	      int term_code, long *loffset, long *l_off,
+	      struct lmf_str *m_file_p)
+{
+  int n_seq = n1;
+
+  *l_off = 1;
+  *loffset = 0;
+
+  return n_seq;
+}
+
diff --git a/src/lsim4.c b/src/lsim4.c
new file mode 100644
index 0000000..cb0a016
--- /dev/null
+++ b/src/lsim4.c
@@ -0,0 +1,998 @@
+/*
+ lsim4.c - calculate non-overlapping local alignments
+
+ derived from lsim2.c from Webb Miller
+*/
+
+/* $Id: lsim4.c 1070 2012-10-12 16:44:16Z wrp $ */
+/* $Revision: 1070 $ */
+
+/* March 2007 - modified to avoid global references */
+
+/* October, 2008 - modified following changes from Xiaoqiu Huang to
+ prevent alignments from crossing the identity diagonal during
+ self-comparison */
+
+/* October 27, 2008 - modified to free pair_ptr memory more reliably
+ (it is still imperfect) */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "defs.h"
+#include "param.h"
+
+#include "lsim4.h"
+
+/* SIM(A,B,M,N,mini_score,Q,R) reports best non-intersecting alignments with
+ score >= mini_score of the segments of A and B in order of similarity
+ scores, where pam2[a][b] is the score of aligning a and b, and
+ -(Q+R*i) is the score of an i-symbol indel. */
+
+/* SIM uses A[1..M], B[1..N] FORTRAN indexing */
+
+/* SIM() - report up to max_count non-intersecting local alignments of
+   A[1..M] and B[1..N], best first.
+
+   The first alignment is stored in *a_res; every further one is stored in
+   a newly allocated a_res_str that is linked through ->next.  For each
+   alignment the score, the boundaries (min0/max0, min1/max1) and the edit
+   script (res) are filled in; nres is set by CHECK_SCORE().
+
+   nseq == 2 compares two different sequences; any other value is treated
+   as a self-comparison, for which every row list starts with an entry for
+   column i.
+
+   If memory for a further alignment cannot be allocated, a message is
+   printed and the alignments found so far are kept.
+*/
+void SIM(const unsigned char *A, /* seq1 indexed A[1..M] */
+	 const unsigned char *B, /* seq2 indexed B[1..N] */
+	 int M, int N,		/* len seq1, seq2 */
+	 struct pstruct *ppst,	/* parameters */
+	 int nseq,		/* nseq - number of different sequences */
+	 int mini_score,	/* cut-off score */
+	 int max_count,		/* number of alignments */
+	 struct a_res_str *a_res)	/* alignment result structure */
+{
+  int endi, endj, stari, starj;	/* endpoint and startpoint */
+  int  score;			/* the max score in LIST */
+  int ck;
+  int  i;			/* row and column indices */
+
+  int flag;
+  struct a_res_str *cur_ares, *tmp_ares;
+
+  bool first_pass;
+  int q, r;
+
+  int *sapp, last;
+  struct vert_str vLIST;
+
+  struct l_struct *l_ptr;
+  pair_ptr z, z_save;
+  int count;			/* maximum size of list */
+  vertex_p v_cur;		/* temporary pointer */
+
+  /* allocate space for all vectors */
+
+  l_ptr = (struct l_struct *)ckalloc(1, sizeof(struct l_struct));
+
+  l_ptr->CCC = ( space_ptr ) ckalloc(N+1,sizeof(space));
+
+  l_ptr->r_ss = (struct lrr_str *) ckalloc(N + 1,sizeof(struct lrr_str));
+
+  l_ptr->c_ss = (struct lcc_str *) ckalloc(M + 1, sizeof(struct lcc_str));
+
+  l_ptr->row = ( pair_ptr * ) ckalloc( M + 1, sizeof(pair_ptr));
+
+  /* set up list for each row */
+  if ( nseq == 2 ) {
+    for ( i = 1; i <= M; i++ ) {l_ptr->row[i] = NULL;}
+  }
+  else {
+    for ( i = 1; i <= M; i++ ) {
+      l_ptr->row[i] = z = (pair_ptr) ckalloc(1, sizeof(pair));
+      z->COL = i;
+      z->NEXT = NULL;
+    }
+  }
+
+#ifdef OLD_FASTA_GAP
+  q = -(ppst->gdelval - ppst->ggapval);
+#else
+  q = -ppst->gdelval;
+#endif
+
+  r = -ppst->ggapval;
+
+  vLIST.LIST = vLIST.most = NULL;
+  vLIST.numnode = 0;
+
+  /* fill in l_ptr->CCC and vLIST */
+  big_pass(A,B,M,N,mini_score,ppst->pam2[0],q,r,nseq, &vLIST, l_ptr);
+  first_pass= 1;
+  cur_ares = a_res;
+
+  /* Report the K best alignments one by one. After each alignment is
+     output, recompute part of the matrix. First determine the size
+     of the area to be recomputed, then do the recomputation */
+
+  count = 0;
+  while (count < max_count) {
+    if ( vLIST.numnode == 0 ) break;	/* no more alignments */
+
+    /* get the best next alignment */
+    v_cur = findmax(&vLIST);
+
+    score = v_cur->SCORE;
+    stari = ++v_cur->STARI;
+    starj = ++v_cur->STARJ;
+    endi = v_cur->ENDI;
+    endj = v_cur->ENDJ;
+
+    l_ptr->m1 = v_cur->TOP;
+    l_ptr->mm = v_cur->BOT;
+    l_ptr->n1 = v_cur->LEFT;
+    l_ptr->nn = v_cur->RIGHT;
+
+    l_ptr->rl = endi - stari + 1;
+    l_ptr->cl = endj - starj + 1;
+
+    l_ptr->I = stari - 1;
+    l_ptr->J = starj - 1;
+
+    /* minimum allocation for alignment; allocated before the result is
+       linked, so that a failure leaves the result list consistent */
+    sapp = (int *)calloc(2*min(l_ptr->rl,l_ptr->cl), sizeof(int));
+    if (sapp == NULL) {
+      fprintf(stderr," cannot allocate alignment %d\n",count+1);
+      free(v_cur);
+      break;
+    }
+
+    if (!first_pass) {	/* need a new a_res */
+      tmp_ares = (struct a_res_str *)calloc(1, sizeof(struct a_res_str));
+      if (tmp_ares == NULL) {
+	fprintf(stderr," cannot allocate a_res_str for alignment %d\n",count+1);
+	free(sapp);
+	free(v_cur);
+	break;
+      }
+      cur_ares->next = tmp_ares;
+      cur_ares = tmp_ares;
+    }
+
+    cur_ares->res = sapp;
+    last = 0;
+
+    cur_ares->n1 = N;
+    cur_ares->sw_score = cur_ares->rst.score[ppst->score_ix] = score;
+    cur_ares->min0 = stari-1;
+    cur_ares->min1 = starj-1;
+    cur_ares->max0 = stari+l_ptr->rl-1;
+    cur_ares->max1 = starj+l_ptr->cl-1;
+    cur_ares->next = NULL;
+
+    /* produce an alignment, encoded in sapp - equivalent to "align() in dropgsw.c" */
+    (void) diff(&A[stari]-1, &B[starj]-1, l_ptr->rl, l_ptr->cl,
+		q, q, (nseq == 2),ppst->pam2[0], q, r, &sapp, &last, l_ptr);
+
+    ck = CHECK_SCORE(&A[stari]-1,&B[starj]-1,l_ptr->rl,l_ptr->cl,
+		     cur_ares->res,ppst->pam2[0],q,r,&cur_ares->nres);
+
+#ifdef DEBUG
+    /* the same errors are produced by Miller and Huang's sim96 code, so I hope
+       this reflects a mistake in CHECK_SCORE */
+
+    if (score != ck) {
+      fprintf(stderr,"*** Check_score error: orig %d != %d recons ***\n aa0[%d-%d] : aa1[%d-%d]\n",
+	      score,ck, cur_ares->min0, cur_ares->max0, cur_ares->min1, cur_ares->max1);
+    }
+#endif
+
+    free(v_cur);
+
+    flag = 0;
+    if (first_pass && maxi(l_ptr->rl, l_ptr->cl) > maxi(M,N)/4) {
+      /*printf("no locate\n");*/
+      flag = 1; l_ptr->n1 = l_ptr->m1 = 0;
+    }
+    else {
+      locate(A,B,mini_score,ppst->pam2[0],q,r, nseq, &flag, &vLIST, l_ptr);
+    }
+    if ( flag ) {
+      /*printf("small pass\n");*/
+      small_pass(A,B,mini_score,ppst->pam2[0],q,r, nseq, &vLIST, l_ptr);
+    }
+    first_pass= 0;
+    count++;
+  }
+  /* start cleaning up */
+
+  while (vLIST.numnode > 0) {
+    v_cur = findmax(&vLIST);
+    if (v_cur) free(v_cur);
+  }
+
+  /* free every node of every row list; the next pointer is saved before
+     the node is released, so a list with a single node is freed as well */
+  for (i=M; i>0; i--) {
+    for (z = l_ptr->row[i]; z != NULL; z = z_save) {
+      z_save = z->NEXT;
+      free(z);
+    }
+  }
+
+  free(l_ptr->row);
+  free(l_ptr->c_ss);
+  free(l_ptr->r_ss);
+  free(l_ptr->CCC);
+  free(l_ptr);
+}
+
+/* big_pass() - a big pass to compute classes scoring over K.
+
+   Fills in the l_ptr->CCC structure for further analysis, adds nodes to
+   v_ptr (through addnode()) when a score is above mini_score, and does not
+   produce alignments.
+
+   A, B       - the sequences; A[i] is read for i = 1..M and B[j] up to j = N
+   pam2       - scoring matrix, used as pam2[A[i]][B[j]]
+   Q, R       - gap penalties; only R and Q+R are used here
+   nseq       - 2: every row starts at column 1; otherwise row i starts at
+                column i+1
+
+   NOTE(review): DIAG and ORDER1 are macros that are not defined in this
+   part of the file; the comments on them below are taken from the
+   original source.
+*/
+
+static void
+big_pass(const unsigned char *A,
+ const unsigned char *B,
+ int M, int N,
+ int mini_score,
+ int **pam2,
+ int Q, int R,
+ int nseq,
+ struct vert_str *v_ptr,
+ struct l_struct *l_ptr)
+{
+ int i, j; /* row and column indices */
+ int c; /* best score at current point */
+ int f; /* best score ending with insertion */
+ int d; /* best score ending with deletion */
+ int p; /* best score at (i-1, j-1) */
+ int ci, cj; /* end-point associated with c */
+ int fi, fj; /* end-point associated with f */
+ int pi, pj; /* end-point associated with p */
+ space_ptr sp;
+ pair_ptr z; /* not used directly here; presumably referenced by the DIAG macro - TODO confirm */
+ int qr;
+ int *va; /* pointer to v(A[i], B[j]) */
+
+ qr = Q+R;
+
+ /* Compute the matrix and save the best scores in LIST
+ CC : the scores of the current row
+ RR and EE : the starting point that leads to score CC
+ DD : the scores of the current row, ending with deletion
+ SS and FF : the starting point that leads to score DD
+ */
+
+ /* Initialize the 0 th row */
+ for ( sp=&l_ptr->CCC[1], j = 1; j <= N; j++, sp++ ) {
+ sp->CC = sp->RR = 0;
+ sp->EE = j;
+ sp->DD = - (qr);
+ sp->SS = 1;
+ sp->FF = j;
+ }
+
+ for ( i = 1; i <= M; i++) {
+ c = 0; /* Initialize column 0 */
+ f = - (qr);
+ va = pam2[A[i]];
+ ci = fi = i;
+ if ( nseq == 2 ) {
+ p = 0; /* score of current row */
+ pi = (i - 1); /* starting point */
+ cj = fj = pj = 0; /* pj starting point */
+ }
+ else {
+ p = l_ptr->CCC[i].CC;/* score of current row */
+ pi = l_ptr->CCC[i].RR; /* starting point */
+ pj = l_ptr->CCC[i].EE; /* starting point */
+ cj = fj = i;
+ }
+ j = (nseq == 2 ? 1: i+1);
+ for ( sp = &l_ptr->CCC[j]; sp <= &l_ptr->CCC[N]; j++, sp++) {
+
+ d = sp->DD;
+ c = -1;
+ /* assign p+va[B[j]] to c if i, j not part of path */
+ DIAG(i, j, c, p+va[B[j]]) /* diagonal */
+ if (c < 0) { /* c was left at -1 or the diagonal score is negative */
+ p = sp->CC; pi = sp->RR; pj = sp->EE;
+ if (f >= 0) {
+ c = f; ci = fi; cj = fj;
+ /* replace ci, cj with sp->SS, sp->FF if c < d */
+ ORDER1(c, ci, cj, d, sp->SS, sp->FF)
+ sp->CC = c; sp->RR = ci; sp->EE = cj;
+ sp->DD -= R; f-=R;
+ } else if (d >= 0) {
+ sp->CC = d; sp->RR = sp->SS; sp->EE = sp->FF;
+ sp->DD -= R;
+ } else {
+ sp->CC = 0; sp->RR=i; sp->EE = j;
+ }
+ } else {
+ ci = pi; cj = pj;
+ ORDER1(c, ci, cj, f, fi, fj)
+ ORDER1(c, ci, cj, d, sp->SS, sp->FF)
+ p = sp->CC; /* save the old cell for the next diagonal before it is overwritten */
+ sp->CC = c;
+ pi = sp->RR;
+ sp->RR = ci;
+ pj = sp->EE;
+ sp->EE = cj;
+ f -= R;
+ if (c >= qr) {
+ if ( c > mini_score) { /* add the score into list */
+ addnode(c, ci, cj, i, j, v_ptr);
+ }
+ d -= R; c -= qr;
+ ORDER1(f, fi, fj, c, ci, cj)
+ ORDER1(d, sp->SS, sp->FF, c, ci, cj)
+ sp->DD = d;
+ } else {
+ sp->DD -= R;
+ }
+ }
+ }
+ }
+}
+
+/* locate() - determine the left and top boundaries of the recomputed area.
+
+   This function is not recursive.
+
+   On entry l_ptr->m1, mm, n1, nn bound the area (rows m1..mm, columns
+   n1..nn).  The area is first recomputed in reverse (from mm,nn back to
+   m1,n1); then it is extended by one row (m1--) and/or one column (n1--) at
+   a time for as long as rflag/cflag are set, until m1 and n1 are both 1 or
+   no_cross() returns non-zero.  m1 and n1 are decremented once more before
+   returning.
+
+   *flag_p is set to 1 as soon as a recomputed score exceeds mini_score; it
+   is never cleared here.  l_ptr->rl and l_ptr->cl are overwritten with the
+   values m1 and n1 have when the extension starts.
+
+   nseq == 2 recomputes all columns of a row; otherwise columns are limited
+   relative to the row index (see the uses of limit).
+
+   NOTE(review): DIAG, ORDER and ORDER1 are macros that are not defined in
+   this part of the file.
+*/
+
+static void
+locate(const unsigned char *A,
+ const unsigned char *B,
+ int mini_score,
+ int **pam2, int Q, int R,
+ int nseq,
+ int *flag_p,
+ struct vert_str *v_ptr,
+ struct l_struct *l_ptr) {
+ int i, j; /* row and column indices */
+ int c; /* best score at current point */
+ int f; /* best score ending with insertion */
+ int d; /* best score ending with deletion */
+ int p; /* best score at (i-1, j-1) */
+ int ci, cj; /* end-point associated with c */
+ int di, dj;
+ int fi, fj; /* end-point associated with f */
+ int pi, pj; /* end-point associated with p */
+
+ space_ptr sp;
+ pair_ptr z; /* not used directly here; presumably referenced by the DIAG macro - TODO confirm */
+ bool cflag, rflag; /* for recomputation */
+ int *va; /* pointer to v(A[i], B[j]) */
+ int limit; /* the bound on j */
+ int qr;
+
+ qr = Q + R;
+
+ /* Reverse pass
+ rows come from CCC
+ CC : the scores on the current row
+ RR and EE : the endpoints that lead to CC
+ DD : the deletion scores
+ SS and FF : the endpoints that lead to DD
+
+ columns come from c_ss[]
+ HH : the scores on the current columns
+ II and JJ : the endpoints that lead to HH
+ WW : the deletion scores
+ XX and YY : the endpoints that lead to WW
+ */
+
+ for ( j = l_ptr->nn; j >= l_ptr->n1 ; j-- ) {
+ l_ptr->CCC[j].CC = 0;
+ l_ptr->CCC[j].EE = j;
+ l_ptr->CCC[j].DD = - (Q);
+ l_ptr->CCC[j].FF = j;
+ if ( nseq == 2 || j > l_ptr->mm )
+ l_ptr->CCC[j].RR = l_ptr->CCC[j].SS = l_ptr->mm + 1;
+ else
+ l_ptr->CCC[j].RR = l_ptr->CCC[j].SS = j;
+ }
+
+ for ( i = l_ptr->mm; i >= l_ptr->m1; i-- ) {
+ c = p = 0;
+ f = - (Q);
+ ci = fi = i;
+ pi = i + 1;
+ cj = fj = pj = l_ptr->nn + 1;
+ va = pam2[A[i]];
+ if ( nseq == 2 || l_ptr->n1 > i ) limit = l_ptr->n1;
+ else limit = i + 1;
+
+ for ( j = l_ptr->nn, sp = &l_ptr->CCC[j]; j >= limit ; j--, sp-- ) {
+ f = f - R;
+ c = c - qr;
+ ORDER(f, fi, fj, c, ci, cj)
+ c = sp->CC - qr;
+ d = sp->DD - R;
+ ORDER(d, sp->SS, sp->FF, c, sp->RR, sp->EE)
+ c = 0;
+ DIAG(i, j, c, p+va[B[j]]) /* diagonal */
+ if ( c <= 0 ) { c = 0; ci = i; cj = j; }
+ else { ci = pi; cj = pj; }
+ ORDER1(c, ci, cj, d, sp->SS, sp->FF)
+ ORDER1(c, ci, cj, f, fi, fj)
+ p = sp->CC;
+ sp->CC = c;
+ pi = sp->RR;
+ pj = sp->EE;
+ sp->RR = ci;
+ sp->EE = cj;
+ sp->DD = d;
+ if ( c > mini_score ) *flag_p = 1;
+ }
+
+ if ( nseq == 2 || i < l_ptr->n1 ) { /* save the left edge of this row for the column pass */
+ l_ptr->c_ss[i].HH = l_ptr->CCC[l_ptr->n1].CC;
+ l_ptr->c_ss[i].II = l_ptr->CCC[l_ptr->n1].RR;
+ l_ptr->c_ss[i].JJ = l_ptr->CCC[l_ptr->n1].EE;
+ l_ptr->c_ss[i].WW = f;
+ l_ptr->c_ss[i].XX = fi;
+ l_ptr->c_ss[i].YY = fj;
+ }
+ }
+
+ for ( l_ptr->rl = l_ptr->m1, l_ptr->cl = l_ptr->n1; ; ) { /* left only through the break at the end of the body */
+ for ( rflag = cflag = 1; ( rflag && l_ptr->m1 > 1 ) || ( cflag && l_ptr->n1 > 1 ) ; ) {
+ if ( rflag && l_ptr->m1 > 1 ) { /* Compute one row */
+ rflag = 0;
+ l_ptr->m1--;
+ c = p = 0;
+ f = - (Q);
+ ci = fi = l_ptr->m1;
+ pi = l_ptr->m1 + 1;
+ cj = fj = pj = l_ptr->nn + 1;
+ va = pam2[A[l_ptr->m1]];
+ for ( j = l_ptr->nn, sp = &l_ptr->CCC[j]; j >= l_ptr->n1 ; j--, sp-- ) {
+ f = f - R;
+ c = c - qr;
+ ORDER(f, fi, fj, c, ci, cj)
+ c = sp->CC - qr;
+ ci = sp->RR;
+ cj = sp->EE;
+ d = sp->DD - R;
+ di = sp->SS;
+ dj = sp->FF;
+ ORDER(d, di, dj, c, ci, cj)
+ c = 0;
+ DIAG(l_ptr->m1, j, c, p+va[B[j]]) /* diagonal */
+ if ( c <= 0 ) { c = 0; ci = l_ptr->m1; cj = j; }
+ else { ci = pi; cj = pj; }
+ ORDER1(c, ci, cj, d, di, dj)
+ ORDER1(c, ci, cj, f, fi, fj)
+ sp->SS = di;
+ sp->FF = dj;
+ p = sp->CC;
+ sp->CC = c;
+ pi = sp->RR;
+ pj = sp->EE;
+ sp->RR = ci;
+ sp->EE = cj;
+ sp->DD = d;
+ if ( c > mini_score ) *flag_p = 1;
+ if ( ! rflag && ( (ci > l_ptr->rl && cj > l_ptr->cl) || (di > l_ptr->rl && dj > l_ptr->cl) || (fi > l_ptr->rl && fj > l_ptr->cl) ) ) rflag = 1;
+ }
+
+ l_ptr->c_ss[l_ptr->m1].HH = l_ptr->CCC[l_ptr->n1].CC;
+ l_ptr->c_ss[l_ptr->m1].II = l_ptr->CCC[l_ptr->n1].RR;
+ l_ptr->c_ss[l_ptr->m1].JJ = l_ptr->CCC[l_ptr->n1].EE;
+ l_ptr->c_ss[l_ptr->m1].WW = f;
+ l_ptr->c_ss[l_ptr->m1].XX = fi;
+ l_ptr->c_ss[l_ptr->m1].YY = fj;
+
+ if ( ! cflag && ( (ci > l_ptr->rl && cj > l_ptr->cl) || (di > l_ptr->rl && dj > l_ptr->cl) || (fi > l_ptr->rl && fj > l_ptr->cl ) )) cflag = 1;
+ }
+
+ if ( nseq == 1 && l_ptr->n1 == (l_ptr->m1 + 1) && ! rflag ) cflag = 0;
+ if ( cflag && l_ptr->n1 > 1 ) { /* Compute one column */
+ cflag = 0;
+ l_ptr->n1--;
+ c = 0;
+ f = - (Q);
+ cj = fj = l_ptr->n1;
+ va = pam2[B[l_ptr->n1]];
+ if ( nseq == 2 || l_ptr->mm < l_ptr->n1 ) {
+ p = 0;
+ ci = fi = pi = l_ptr->mm + 1;
+ pj = l_ptr->n1 + 1;
+ limit = l_ptr->mm;
+ }
+ else {
+ p = l_ptr->c_ss[l_ptr->n1].HH;
+ pi = l_ptr->c_ss[l_ptr->n1].II;
+ pj = l_ptr->c_ss[l_ptr->n1].JJ;
+ ci = fi = l_ptr->n1;
+ limit = l_ptr->n1 - 1;
+ }
+
+ for ( i = limit; i >= l_ptr->m1 ; i-- ) {
+ f = f - R;
+ c = c - qr;
+ ORDER(f, fi, fj, c, ci, cj)
+ c = l_ptr->c_ss[i].HH - qr;
+ ci = l_ptr->c_ss[i].II;
+ cj = l_ptr->c_ss[i].JJ;
+ d = l_ptr->c_ss[i].WW - R;
+ di = l_ptr->c_ss[i].XX;
+ dj = l_ptr->c_ss[i].YY;
+ ORDER(d, di, dj, c, ci, cj)
+ c = 0;
+ DIAG(i, l_ptr->n1, c, p+va[A[i]])
+ if ( c <= 0 ) { c = 0; ci = i; cj = l_ptr->n1; }
+ else { ci = pi; cj = pj; }
+ ORDER1(c, ci, cj, d, di, dj)
+ ORDER1(c, ci, cj, f, fi, fj)
+ p = l_ptr->c_ss[i].HH;
+ l_ptr->c_ss[i].HH = c;
+ pi = l_ptr->c_ss[i].II;
+ pj = l_ptr->c_ss[i].JJ;
+ l_ptr->c_ss[i].II = ci;
+ l_ptr->c_ss[i].JJ = cj;
+ l_ptr->c_ss[i].WW = d;
+ l_ptr->c_ss[i].XX = di;
+ l_ptr->c_ss[i].YY = dj;
+ if ( c > mini_score ) *flag_p = 1;
+ if ( ! cflag && ( (ci > l_ptr->rl && cj > l_ptr->cl) || (di > l_ptr->rl && dj > l_ptr->cl)
+ || (fi > l_ptr->rl && fj > l_ptr->cl) ) ) cflag = 1;
+ }
+
+ l_ptr->CCC[l_ptr->n1].CC = l_ptr->c_ss[l_ptr->m1].HH;
+ l_ptr->CCC[l_ptr->n1].RR = l_ptr->c_ss[l_ptr->m1].II;
+ l_ptr->CCC[l_ptr->n1].EE = l_ptr->c_ss[l_ptr->m1].JJ;
+ l_ptr->CCC[l_ptr->n1].DD = f;
+ l_ptr->CCC[l_ptr->n1].SS = fi;
+ l_ptr->CCC[l_ptr->n1].FF = fj;
+ if ( ! rflag && ( (ci > l_ptr->rl && cj > l_ptr->cl) || (di > l_ptr->rl && dj > l_ptr->cl)
+ || (fi > l_ptr->rl && fj > l_ptr->cl )) ) rflag = 1;
+ }
+ }
+ if (( l_ptr->m1 == 1 && l_ptr->n1 == 1) || no_cross(flag_p, v_ptr->LIST, l_ptr) ) break;
+ }
+ l_ptr->m1--;
+ l_ptr->n1--;
+}
+
+/* small_pass() - recompute the area on forward pass.
+
+ Re-runs the scoring recurrence over rows l_ptr->m1+1 .. l_ptr->mm and
+ columns limit .. l_ptr->nn, keeping the per-column state in
+ l_ptr->CCC[] (CC/RR/EE = score and start point of the best path,
+ DD/SS/FF = the same for the column-wise gap state). Every cell whose
+ score exceeds mini_score is reported to addnode().
+
+ A, B - the two sequences; pam2[A[i]][B[j]] is the substitution score
+ Q, R - indel parameters; a k-symbol indel costs Q+R*k (see gap())
+ nseq - when it is not 2, rows i > l_ptr->n1 start at column i+1
+ instead of n1+1 (presumably the self-comparison case that
+ only visits the upper triangle - TODO confirm with caller)
+ v_ptr - list of candidate alignments, updated through addnode()
+ l_ptr - work arrays and the m1/mm/n1/nn bounds of the recomputed area
+*/
+static void
+small_pass(const unsigned char *A,
+ const unsigned char *B,
+ int mini_score,
+ int **pam2, int Q, int R,
+ int nseq,
+ struct vert_str *v_ptr,
+ struct l_struct *l_ptr) {
+
+ int i, j; /* row and column indices */
+ int c; /* best score at current point */
+ int f; /* best score ending with insertion */
+ int d; /* best score ending with deletion */
+ int p; /* best score at (i-1, j-1) */
+ int ci, cj; /* end-point associated with c */
+ int fi, fj; /* end-point associated with f */
+ int pi, pj; /* end-point associated with p */
+ space_ptr sp;
+ pair_ptr z; /* scratch list cursor required by the DIAG() macro */
+ int q, r, qr;
+ int *va; /* pointer to pam2(A[i], B[j]) */
+
+ int limit; /* lower bound on j */
+
+ q = Q; r = R; qr = q + r;
+
+ /* reset the column state for columns n1+1 .. nn before the first row */
+ for ( sp = &l_ptr->CCC[l_ptr->n1 + 1], j = l_ptr->n1+1; sp <= &l_ptr->CCC[l_ptr->nn] ; sp++, j++ ) {
+ sp->CC = 0;
+ sp->RR = l_ptr->m1;
+ sp->EE = j;
+ sp->DD = - (qr);
+ sp->SS = l_ptr->m1+1;
+ sp->FF = j;
+ }
+
+ for ( i = l_ptr->m1 + 1; i <= l_ptr->mm; i++) {
+ c = 0; /* Initialize column 0 */
+ f = - (qr);
+ ci = fi = i;
+ va = pam2[A[i]];
+ if ( nseq == 2 || i <= l_ptr->n1 ) {
+ /* the row starts at the left edge of the area (column n1+1) */
+ p = 0;
+ pi = i - 1;
+ cj = fj = pj = l_ptr->n1;
+ limit = l_ptr->n1 + 1;
+ }
+ else {
+ /* the row starts just right of the diagonal; seed p from CCC[i] */
+ p = l_ptr->CCC[i].CC;
+ pi = l_ptr->CCC[i].RR;
+ pj = l_ptr->CCC[i].EE;
+ cj = fj = i;
+ limit = i + 1;
+ }
+
+ for ( j = limit, sp = &l_ptr->CCC[j] ; j <= l_ptr->nn ; j++, sp++ ) {
+ d = sp->DD;
+ c = -1;
+ DIAG(i, j, c, p+va[B[j]]) /* diagonal */
+ /* c is still negative when (i,j) was already used by an earlier
+ alignment (DIAG() left it at -1) or when the diagonal score is < 0 */
+ if (c < 0) {
+ p = sp->CC; pi = sp->RR; pj = sp->EE;
+ if (f >= 0) {
+ c = f; ci = fi; cj = fj;
+ ORDER1(c, ci, cj, d, sp->SS, sp->FF)
+ sp->CC = c; sp->RR = ci; sp->EE = cj;
+ sp->DD -= r; f-=r;
+ }
+ else if (d >= 0) {
+ sp->CC = d; sp->RR = sp->SS; sp->EE = sp->FF;
+ sp->DD -= r;
+ }
+ else {
+ /* nothing non-negative reaches this cell: restart here with score 0 */
+ sp->CC = 0;
+ sp->RR=i;
+ sp->EE = j;
+ }
+ }
+ else {
+ ci = pi; cj = pj;
+ ORDER1(c, ci, cj, f, fi, fj)
+ ORDER1(c, ci, cj, d, sp->SS, sp->FF)
+ /* save the old column value as the next diagonal predecessor
+ before overwriting it with the new best score */
+ p = sp->CC;
+ sp->CC = c;
+ pi = sp->RR;
+ sp->RR = ci;
+ pj = sp->EE;
+ sp->EE = cj;
+ f-=r;
+ if (c >= qr) {
+ if ( c > mini_score ) /* add the score into list */
+ addnode(c, ci, cj, i, j, v_ptr);
+ /* opening a new gap from this cell costs qr; keep it if it beats
+ extending the existing gap states */
+ d -= r; c-=qr;
+ ORDER1(f, fi, fj, c, ci, cj)
+ ORDER1(d, sp->SS, sp->FF, c, ci, cj)
+ sp->DD = d;
+ }
+ else {
+ sp->DD -= r;
+ }
+ }
+ }
+ }
+}
+
+/* addnode - record score c, ending at (i,j), for the alignment that
+   starts at (ci,cj).
+
+   If a node with that start point is already in v_ptr->LIST its best
+   score / end point and its bounding box (TOP, BOT, LEFT, RIGHT) are
+   updated; otherwise a new node is allocated and pushed on the front of
+   the list.  v_ptr->most is left pointing at the node that was touched,
+   which lets the next call with the same start point skip the search. */
+
+static void
+addnode(int c, int ci, int cj, int i, int j, struct vert_str *v_ptr) {
+
+  vertex_p node;
+
+  /* fast path: the most recently touched node; otherwise scan the list */
+  node = v_ptr->most;
+  if ( node == NULL || node->STARI != ci || node->STARJ != cj ) {
+    for ( node = v_ptr->LIST; node != NULL; node = node->next ) {
+      if ( node->STARI == ci && node->STARJ == cj ) break;
+    }
+  }
+
+  if ( node == NULL ) {
+    /* unknown start point: create a node and link it at the list head */
+    v_ptr->numnode++;
+    node = (vertex_p) ckalloc(1,sizeof(vertex));
+    node->SCORE = c;
+    node->STARI = ci;
+    node->STARJ = cj;
+    node->ENDI = i;
+    node->ENDJ = j;
+    node->TOP = node->BOT = i;
+    node->LEFT = node->RIGHT = j;
+    node->next = v_ptr->LIST;
+    v_ptr->LIST = node;
+  }
+  else {
+    if ( node->SCORE < c ) {
+      node->SCORE = c;
+      node->ENDI = i;
+      node->ENDJ = j;
+    }
+    /* grow the bounding box so that it contains (i,j) */
+    if ( node->TOP > i ) node->TOP = i;
+    if ( node->BOT < i ) node->BOT = i;
+    if ( node->LEFT > j ) node->LEFT = j;
+    if ( node->RIGHT < j ) node->RIGHT = j;
+  }
+
+  v_ptr->most = node;
+}
+
+/* findmax - unlink and return the node with the largest SCORE.
+
+   The first node with the maximal score wins ties.  v_ptr->LIST must be
+   non-empty.  numnode is decremented and v_ptr->most is reset to the new
+   list head. */
+
+static vertex_p
+findmax(struct vert_str *v_ptr) {
+  vertex_p before_best = NULL;	/* node preceding the best one; NULL = head is best */
+  vertex_p walk, best;
+  int best_score;
+
+  best_score = v_ptr->LIST->SCORE;
+  for ( walk = v_ptr->LIST; walk->next != NULL; walk = walk->next ) {
+    if ( walk->next->SCORE > best_score ) {
+      before_best = walk;
+      best_score = walk->next->SCORE;
+    }
+  }
+
+  if ( before_best != NULL ) {
+    best = before_best->next;
+    before_best->next = best->next;
+  }
+  else {
+    best = v_ptr->LIST;
+    v_ptr->LIST = best->next;
+  }
+
+  v_ptr->numnode--;
+  v_ptr->most = v_ptr->LIST;
+  return best;
+}
+
+/* no_cross - return 1 if no node in LIST shares vertices with the area.
+
+   On the first node that does overlap the area and starts above
+   l_ptr->rl or left of l_ptr->cl, rl/cl are lowered to that start point,
+   *flag_p is set to 1 and 0 is returned. */
+
+static bool
+no_cross(int *flag_p, vertex_p LIST, struct l_struct *l_ptr) {
+
+  vertex_p node;
+
+  for ( node = LIST; node != NULL; node = node->next ) {
+    /* skip nodes that lie entirely outside the recomputed area */
+    if ( node->STARI > l_ptr->mm || node->STARJ > l_ptr->nn ) continue;
+    if ( node->BOT < l_ptr->m1-1 || node->RIGHT < l_ptr->n1-1 ) continue;
+    /* skip nodes whose start point is already inside the (rl,cl) bounds */
+    if ( node->STARI >= l_ptr->rl && node->STARJ >= l_ptr->cl ) continue;
+
+    if ( node->STARI < l_ptr->rl ) l_ptr->rl = node->STARI;
+    if ( node->STARJ < l_ptr->cl ) l_ptr->cl = node->STARJ;
+    *flag_p = 1;
+    return 0;
+  }
+  return 1;
+}
+
+/* The following definitions are for function diff() */
+
+/* gap(k) evaluates to 0 for k <= 0, otherwise Q+R*k. It reads Q and R
+ from the scope in which it is expanded. */
+#define gap(k) ((k) <= 0 ? 0 : Q+R*(k)) /* k-symbol indel score */
+
+/* The two script macros below expand to a brace block and are invoked
+ without a trailing semicolon. They need l_ptr, sapp and last in scope:
+ *sapp points at the next free slot of the edit script and *last holds
+ the most recently appended op (negative = delete). */
+
+/* Append "Delete k" op */
+/* Advances l_ptr->I by k. When the previous op was also a delete the
+ two are merged into the previous script slot instead of adding one. */
+#define DEL(k) \
+{ l_ptr->I += k; \
+ if (*last < 0) \
+ *last = (*sapp)[-1] -= (k); \
+ else { \
+ *last = (*sapp)[0] = -(k); \
+ (*sapp)++; \
+ } \
+}
+
+/* Append "Insert k" op */
+/* Advances l_ptr->J by k. When the previous op was a delete, the insert
+ is written into the delete's slot and the delete is re-appended after
+ it, so the insert precedes the delete and *last stays the delete. */
+#define INS(k) \
+{ l_ptr->J += k; \
+ if (*last < 0) { \
+ (*sapp)[-1] = (k); \
+ (*sapp)[0] = *last; \
+ (*sapp)++; \
+ } \
+ else { \
+ *last = (*sapp)[0] = (k); \
+ (*sapp)++; \
+ } \
+}
+
+/* diff(A,B,M,N,tb,te) returns the score of an optimum conversion between
+ A[1..M] and B[1..N] that begins(ends) with a delete if tb(te) is zero
+ and appends such a conversion to the current script.
+
+ two_seq - when zero, columns at or left of the diagonal position
+ (computed from l_ptr->I and l_ptr->J) are excluded
+ pam2 - substitution scores, indexed pam2[A[i]][B[j]]
+ Q, R - indel parameters; a k-symbol indel costs Q+R*k (see gap())
+ sapp - *sapp is the next free slot of the edit script
+ last - *last is the most recently appended script op
+ l_ptr - r_ss[] work rows, row[] lists of already used pairs, and the
+ running offsets I, J of A and B inside the full sequences
+
+ Aligned pairs recorded in l_ptr->row[] are never reused: DIAG() and
+ the explicit list walk in the M == 1 case skip them. */
+
+static int
+diff(const unsigned char *A,
+ const unsigned char *B,
+ int M, int N,
+ int tb, int te,
+ int two_seq,
+ int **pam2, int Q, int R,
+ int **sapp, int *last,
+ struct l_struct *l_ptr) {
+
+ int midi, midj, type; /* Midpoint, type, and cost */
+ int limit;
+ int midc;
+
+ register int i, j;
+ register int c, e, d, s;
+
+ pair_ptr z; /* list cursor; also used implicitly by DIAG() */
+
+ int t;
+ int *va;
+ int qr;
+
+ bool tt;
+
+ qr = Q + R;
+
+ /* Boundary cases: M <= 1 or N == 0 */
+
+ if (N <= 0){
+ if (M > 0) {DEL(M)}
+ return - gap(M);
+ }
+
+ if (M <= 1) {
+ if (M <= 0) {
+ INS(N)
+ return - gap(N);
+ }
+
+ /* M == 1: either delete A[1] and insert all of B, or align A[1] with
+ the best still-unused B[j] and insert the rest of B around it */
+ if (tb > te) tb = te;
+ midc = - (tb + R + gap(N) );
+ midj = 0;
+
+ va = pam2[A[1]];
+ j = 2 + l_ptr->I - l_ptr->J;
+ if (two_seq || j < 1) j = 1;
+ for ( ; j <= N; j++) {
+ /* tt stays 1 only if the pair (I+1, j+J) has not been used before */
+ for ( tt = 1, z = l_ptr->row[l_ptr->I+1]; z != NULL; z = z->NEXT ) {
+ if ( z->COL == j+l_ptr->J ) { tt = 0; break; }
+ }
+ if (tt) {
+ c = va[B[j]] - ( gap(j-1) + gap(N-j) );
+ if (c > midc) { midc = c; midj = j; }
+ }
+ }
+
+ if (midj == 0) { INS(N) DEL(1) }
+ else {
+ if (midj > 1) INS(midj-1)
+ /* a 0 entry in the script is an aligned pair */
+ *last = (*sapp)[0] = 0;
+ (*sapp)++;
+
+ /* mark (A[I],B[J]) as used: put J into list row[I] */
+ l_ptr->I++; l_ptr->J++;
+ z = ( pair_ptr ) ckalloc(1,sizeof(pair));
+ z->COL = l_ptr->J;
+ z->NEXT = l_ptr->row[l_ptr->I];
+ l_ptr->row[l_ptr->I] = z;
+ if (midj < N) INS(N-midj)
+ }
+ return midc;
+ }
+
+ /* Divide: Find optimum midpoint (midi,midj) of cost midc */
+
+ midi = M/2; /* Forward phase: */
+ l_ptr->r_ss[0].CC = 0; /* Compute C(M/2,k) & D(M/2,k) for all k */
+ t = -Q;
+ for (j = 1; j <= N; j++) {
+ l_ptr->r_ss[j].CC = t = t-R;
+ l_ptr->r_ss[j].DD = t-Q;
+ }
+ t = -tb;
+ for (i = 1; i <= midi; i++) {
+ va = pam2[A[i]];
+ t = t-R;
+ j = i + l_ptr->I - l_ptr->J;
+ if (two_seq || j <= 0) {
+ j = 0;
+ s = l_ptr->r_ss[0].CC;
+ l_ptr->r_ss[0].CC = c = t;
+ }
+ else {
+ /* the row starts at column j instead of 0 */
+ if ( (c = (s = l_ptr->r_ss[j].CC) - qr) < (d = l_ptr->r_ss[j].DD)) c = d;
+ l_ptr->r_ss[j].CC = l_ptr->r_ss[j].DD = c;
+ }
+ e = c-Q;
+ for (j++ ; j <= N; j++) {
+ if ((c = c - qr) > (e = e - R)) e = c;
+ if ((c = l_ptr->r_ss[j].CC - qr) > (d = l_ptr->r_ss[j].DD - R)) d = c;
+ DIAG(i+l_ptr->I, j+l_ptr->J, c, s+va[B[j]])
+ if (c < d) c = d;
+ if (c < e) c = e;
+ s = l_ptr->r_ss[j].CC;
+ l_ptr->r_ss[j].CC = c;
+ l_ptr->r_ss[j].DD = d;
+ }
+ }
+ l_ptr->r_ss[0].DD = l_ptr->r_ss[0].CC;
+
+ l_ptr->r_ss[N].RR = 0; /* Reverse phase: */
+ t = -Q; /* Compute R(M/2,k) & S(M/2,k) for all k */
+ for (j = N-1; j >= 0; j--) {
+ l_ptr->r_ss[j].RR = t = t-R;
+ l_ptr->r_ss[j].SS = t-Q;
+ }
+ t = -te;
+
+ for (i = M-1; i >= midi; i--) {
+ s = l_ptr->r_ss[N].RR;
+ l_ptr->r_ss[N].RR = c = t = t-R;
+ e = t-Q;
+ va = pam2[A[i+1]];
+ limit = i + l_ptr->I - l_ptr->J + 1;
+ if (two_seq || limit < 0) limit = 0;
+ for (j = N-1; j >= limit; j--) {
+ if ((c = c - qr) > (e = e - R)) e = c;
+ if ((c = l_ptr->r_ss[j].RR - qr) > (d = l_ptr->r_ss[j].SS - R)) d = c;
+ DIAG(i+1+l_ptr->I, j+1+l_ptr->J, c, s+va[B[j+1]])
+ if (c < d) c = d;
+ if (c < e) c = e;
+ s = l_ptr->r_ss[j].RR;
+ l_ptr->r_ss[j].RR = c;
+ l_ptr->r_ss[j].SS = d;
+ }
+ }
+ l_ptr->r_ss[N].SS = l_ptr->r_ss[N].RR;
+
+ midc = l_ptr->r_ss[0].CC+l_ptr->r_ss[0].RR; /* Find optimal midpoint */
+ midj = 0;
+ type = 1;
+ limit = midi + l_ptr->I - l_ptr->J + 1;
+ if (two_seq || limit < 0) limit = 0;
+ /* type 1: forward and reverse halves meet at (midi,j) */
+ for (j = limit; j <= N; j++) {
+ if ((c = l_ptr->r_ss[j].CC + l_ptr->r_ss[j].RR) >= midc) {
+ if (c > midc ||
+ (l_ptr->r_ss[j].CC != l_ptr->r_ss[j].DD && l_ptr->r_ss[j].RR == l_ptr->r_ss[j].SS)) {
+ midc = c;
+ midj = j;
+ }
+ }
+ }
+ /* type 2: a deletion spans the midpoint row; Q is added back because
+ both halves charged the gap-open penalty */
+ for (j = N; j >= limit; j--)
+ if ((c = l_ptr->r_ss[j].DD + l_ptr->r_ss[j].SS + Q) > midc)
+ { midc = c;
+ midj = j;
+ type = 2;
+ }
+
+/* Conquer: recursively around midpoint */
+
+ if (type == 1) {
+ (void)diff(A,B,midi,midj,tb,Q,two_seq,pam2,Q,R, sapp, last, l_ptr);
+ (void)diff(A+midi,B+midj,M-midi,N-midj,Q,te,two_seq,pam2,Q,R, sapp, last, l_ptr);
+ }
+ else {
+ /* rows midi and midi+1 are emitted as one 2-symbol delete between
+ the two sub-problems */
+ (void)diff(A,B,midi-1,midj,tb,0,two_seq,pam2,Q,R, sapp, last, l_ptr);
+ DEL(2);
+ (void)diff(A+midi+1,B+midj,M-midi-1,N-midj,0,te,two_seq,pam2,Q,R, sapp, last, l_ptr);
+ }
+ return midc;
+}
+
+/* CHECK_SCORE - return the score of the alignment stored in S.
+
+   S is an edit script: 0 aligns the next residue of A with the next of
+   B (scored w[A[i]][B[j]], both 1-based), k > 0 skips k residues of B
+   and k < 0 skips -k residues of A; a gap of length n costs qq + n*rr.
+   The script is consumed until M residues of A and N of B are used.
+   The number of alignment columns is stored in *NC. */
+
+static int CHECK_SCORE(const unsigned char *A, const unsigned char *B,
+		       int M, int N,
+		       int *S, int **w,
+		       int qq, int rr, int *NC)
+{
+  int a_pos = 0, b_pos = 0;	/* residues consumed from A and B */
+  int total = 0;		/* running alignment score */
+  int columns = 0;		/* alignment columns seen so far */
+  int step, delta;
+#ifdef SHOW_ALIGN_SCORE
+  int mx_l_score = 0;
+#endif
+
+  /* print_seq_prof(A,M,w,iw); */
+
+#ifdef SHOW_ALIGN_SCORE
+  printf("#===start\n");
+  printf("#i j pam2 score mx_l_score\n");
+#endif
+  while (a_pos < M || b_pos < N) {
+    step = *S++;
+    if (step > 0) {		/* gap in A: skip step residues of B */
+      delta = -(qq + step*rr);
+      b_pos += step;
+      columns += step;
+    }
+    else if (step < 0) {	/* gap in B: skip -step residues of A */
+      delta = -(qq - step*rr);
+      a_pos -= step;
+      columns -= step;
+    }
+    else {			/* aligned pair */
+      a_pos++;
+      b_pos++;
+      delta = w[A[a_pos]][B[b_pos]];
+      columns++;
+    }
+    total += delta;
+#ifdef SHOW_ALIGN_SCORE
+    if (total > mx_l_score) mx_l_score = total;
+    printf("%d\t%d\t%d\t%d\t%d\n",a_pos, b_pos, delta, total, mx_l_score);
+#endif
+  }
+#ifdef SHOW_ALIGN_SCORE
+  printf("%d\t%d\tend\t%d\t%d\n====\n",a_pos, b_pos, total, mx_l_score);
+#endif
+  *NC = columns;
+  return total;
+}
+
+/* ckalloc - allocate space for amount objects of size bytes each;
+   check for success.
+
+   Returns uninitialized memory from malloc().  On failure, including a
+   request whose byte count does not fit in a size_t, a message is
+   written to stderr and the process exits with status 1, so callers
+   never see NULL.  A running total of the bytes requested is kept for
+   the diagnostic. */
+void *ckalloc(size_t amount, size_t size)
+{
+  void *p;
+  static size_t mtotal;		/* bytes requested so far */
+  size_t nbytes;
+
+  /* amount*size must not wrap around, or malloc() would silently
+     return a block that is far too small */
+  if (size != 0 && amount > (size_t)-1 / size) {
+    fprintf(stderr,"Ran out of near memory: %lu*%lu/%lu\n",
+	    (unsigned long)amount,(unsigned long)size,(unsigned long)mtotal);
+    exit(1);
+  }
+
+  nbytes = amount * size;
+  mtotal += nbytes;
+
+  if ((p = malloc(nbytes)) == NULL) {
+    /* size_t is unsigned and need not be a long: cast for %lu */
+    fprintf(stderr,"Ran out of near memory: %lu*%lu/%lu\n",
+	    (unsigned long)amount,(unsigned long)size,(unsigned long)mtotal);
+    exit(1);
+  }
+  return(p);
+}
diff --git a/src/lsim4.h b/src/lsim4.h
new file mode 100644
index 0000000..96c8628
--- /dev/null
+++ b/src/lsim4.h
@@ -0,0 +1,145 @@
+
+/* $Id: lsim4.h 625 2011-03-23 17:21:38Z wrp $ */
+/* $Revision: 625 $ */
+
+/* global definitions, #defines, for lsim4.c */
+
+typedef int bool;
+
+void *ckalloc(size_t amount, size_t size);
+
+/* maxi(x, y) - the larger of x and y; each argument is evaluated twice,
+   so do not pass expressions with side effects that must run once */
+#define maxi(x, y) (((x) > (y)) ? (x) : (y))
+
+/* singly linked list of column numbers; l_struct.row[i] holds the
+ columns already aligned with row i */
+typedef struct ONE {int COL ; struct ONE *NEXT ;} pair, *pair_ptr;
+
+#define PAIRNULL (pair_ptr)NULL
+
+/* one candidate alignment kept in vert_str.LIST (see addnode()) */
+typedef struct NODE
+{ int SCORE; /* best score seen for this start point */
+ int STARI, STARJ; /* start point of the alignment */
+ int ENDI, ENDJ; /* end point at which SCORE was reached */
+ int TOP, BOT; /* smallest and largest row reported for it */
+ int LEFT, RIGHT; /* smallest and largest column reported for it */
+ struct NODE *next;
+} vertex, *vertex_p;
+
+struct lrr_str {
+ int CC, DD; /* saving row matrix scores */
+ int RR, SS, EE, FF; /* saving row start-points */
+};
+
+struct lcc_str {
+ int HH, WW; /* saving col matrix scores HH=CC, */
+ int II, JJ, XX, YY; /* saving col start-points , II=RR, JJ=EE */
+};
+
+/* per-column state of the forward passes: score CC with start point
+ (RR,EE) and gap-state score DD with start point (SS,FF) */
+typedef struct spa {
+ int CC, RR, EE, DD, SS, FF;
+} space;
+
+typedef struct spa *space_ptr;
+
+/* list of candidate alignments */
+struct vert_str {
+ int numnode; /* number of nodes currently in LIST */
+ vertex_p LIST, most; /* list head; most = node touched last by addnode() */
+};
+
+struct l_struct {
+ space_ptr CCC;
+ struct lrr_str *r_ss;
+ struct lcc_str *c_ss;
+ pair_ptr *row; /* for saving used aligned pairs */
+ int m1, mm, n1, nn; /* boundaries of recomputed area */
+ int rl, cl; /* left and top boundaries */
+ int I, J; /* current positions of A ,B - used by diff() */
+};
+
+/* Prototypes for the file-local (static) functions of lsim4.c. They are
+ declared static here, so this header is meant to be included by the one
+ source file that defines them. */
+
+static void big_pass(const unsigned char *A,
+ const unsigned char *B,
+ int M, int N,
+ int mini_score,
+ int **pam2, int Q, int R,
+ int nseq,
+ struct vert_str *v_ptr,
+ struct l_struct *l_ptr);
+
+static void locate(const unsigned char *A,
+ const unsigned char *B,
+ int mini_score,
+ int **pam2, int Q, int R,
+ int nseq,
+ int *flag_p,
+ struct vert_str *v_ptr,
+ struct l_struct *l_ptr);
+
+/* recompute the area on forward pass */
+static void small_pass(const unsigned char *A,
+ const unsigned char *B,
+ int mini_score,
+ int **pam2, int Q, int R,
+ int nseq,
+ struct vert_str *v_ptr,
+ struct l_struct *l_ptr);
+
+/* add or update the list node for start point (ci,cj) */
+static void addnode(int c, int ci, int cj, int i, int j,
+ struct vert_str *v_ptr);
+
+/* return 1 if no node in LIST shares vertices with the area */
+static bool no_cross(int *flag_p, vertex_p LIST, struct l_struct *l_ptr);
+
+/* q and r are the indel parameters named Q and R in the definition */
+static int diff(const unsigned char *A,
+ const unsigned char *B,
+ int M, int N, int tb, int te,
+ int two_seq,
+ int **pam2, int q, int r,
+ int **sapp, int *last,
+ struct l_struct *l_ptr);
+
+/* return the score of the alignment stored in S */
+static int CHECK_SCORE(const unsigned char *A, const unsigned char *B,
+ int M, int N,
+ int *S, int **W, int G, int H, int *nres);
+
+/* find and remove the largest score in the list */
+static vertex_p findmax(struct vert_str *v_ptr);
+
+/* DIAG() assigns value to x if (ii,jj) is never used before */
+/* It walks the list l_ptr->row[ii] looking for column jj and leaves x
+ untouched when the pair is found. Requires l_ptr and a pair_ptr named
+ z in the expanding scope; z is clobbered. */
+#define DIAG(ii, jj, x, value) \
+{ for ( z = l_ptr->row[(ii)]; z != 0 && z->COL != (jj); z = z->NEXT ) ; \
+ if ( !z ) x = ( value ); \
+ }
+
+/* replace (ss1, xx1, yy1) by (ss2, xx2, yy2) if the latter is large */
+/* Larger score wins; on equal scores the larger xx wins, and on equal
+ xx the larger yy. The arguments are used as lvalues and are evaluated
+ more than once. */
+#define ORDER(ss1, xx1, yy1, ss2, xx2, yy2) \
+{ if ( ss1 < ss2 ) \
+ { ss1 = ss2; xx1 = xx2; yy1 = yy2; } \
+else \
+if ( ss1 == ss2 ) \
+ { if ( xx1 < xx2 ) { xx1 = xx2; yy1 = yy2; } \
+ else \
+ if ( xx1 == xx2 && yy1 < yy2 ) yy1 = yy2; \
+ } \
+}
+
+/* ORDER1() applies the same replacement rule as ORDER(), written so that
+ the common case ss1 > ss2 needs a single comparison */
+#define ORDER1(ss1, xx1, yy1, ss2, xx2, yy2) \
+{ if (ss1 <= ss2) { \
+ if (ss1 == ss2) { \
+ if (xx1 < xx2) { \
+ xx1 = xx2; yy1 = yy2; \
+ } else { \
+ if (xx1 == xx2 && yy1 < yy2) \
+ yy1 = yy2; \
+ } \
+ } else { \
+ ss1 = ss2; xx1 = xx2; yy1 = yy2; \
+ } \
+ } \
+}
+
+/* ORDER2() is the two-field variant: score plus a single coordinate */
+#define ORDER2(ss1, xx1, ss2, xx2) \
+{ \
+ if (ss1 <= ss2) { \
+ if (ss1 == ss2) { \
+ if (xx1 < xx2) xx1 = xx2; \
+ } else { \
+ ss1 = ss2; xx1 = xx2; \
+ } \
+ } \
+}
+
diff --git a/src/map_db.c b/src/map_db.c
new file mode 100644
index 0000000..5429508
--- /dev/null
+++ b/src/map_db.c
@@ -0,0 +1,600 @@
+/* map_db.c - read a FASTA or GCG format database and generate a list
+ of indices for rapid memory mapping */
+
+/* $Id: map_db.c 1239 2013-11-02 01:09:58Z wrp $ */
+
+/* copyright (c) 1999, 2014 by William R. Pearson and The Rector &
+ Visitors of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+/* input is a libtype 1,5, or 6 sequence database */
+/* output is a BLAST2 formatdb type index file */
+
+/* format of the index file:
+
+1) map_db version number ["MP"+2 bytes]
+2) number of sequences in database [4 bytes]
+3) total length of database [8 bytes] (MP1, 4 bytes for MP0)
+4) longest sequence in database [8 bytes] (MP1, 4 bytes for MP0)
+5) list of offsets to definitions [num_seq+1] int*8 (MP1, 4 bytes for MP0)
+6) list of offsets to sequences [num_seq+1] int*8 (MP1, 4 bytes for MP0)
+7) list of flag characters for sequences [num_seq+1]bytes
+ (used for GCG binary to encode 2bit or 4 bit representation)
+
+ sequence files will be as defined by their format
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+
+/* File-offset abstraction: fseek_t/FSEEK/FTELL map to off_t/fseeko/ftello
+   when USE_FSEEKO is defined and to long/fseek/ftell otherwise.
+   FSEEK_T_DEF is defined in both configurations so that a second copy of
+   this block (it is repeated in several sources) is skipped instead of
+   redeclaring fseek_t. */
+#ifndef FSEEK_T_DEF
+#define FSEEK_T_DEF
+#ifndef USE_FSEEKO
+#define FSEEK fseek
+#define FTELL ftell
+typedef long fseek_t;
+#else
+#define FSEEK fseeko
+#define FTELL ftello
+typedef off_t fseek_t;
+#endif
+#endif
+
+/* largest library type number accepted as an index into get_ent_arr[] */
+#define LASTLIB 6
+
+/* entry reader for the library being indexed; set by openlib() */
+int (*get_entry) ();
+
+int a_get_ent(unsigned char *, int, fseek_t *, fseek_t *);
+int gbf_get_ent(unsigned char *, int, fseek_t *, fseek_t *);
+
+void src_int4_write(FILE *, int);
+void src_int4_read(FILE *, int *);
+void src_long4_write(FILE *, long);
+void src_long4_read(FILE *, long *);
+void src_long8_write(FILE *, int64_t);
+void src_long8_read(FILE *, int64_t *);
+
+void newname(char *nname, char *oname, char *suff, int maxn);
+void init_ascii0(int *xascii, char *sq_map);
+
+/* entry readers indexed by library type; NULL = type cannot be indexed */
+int (*get_ent_arr[LASTLIB+1])()={a_get_ent, gbf_get_ent, NULL, NULL, NULL,
+ NULL, NULL};
+
+fseek_t openlib(char *, int);
+
+/* codes stored in the ascii mapping table; values below NA are residues */
+#define NA 123
+#define TERM 24
+#define EL 125
+#define ES 126
+#define AAMASK 127
+
+/* sascii points at the active mapping table (aascii) once main() has
+ initialized it */
+static int *sascii, aascii[128];
+char *NCBIstdaa_ext = "-ABCDEFGHIKLMNPQRSTVWXYZU*OJ-abcdefghiklmnpqrstvwxyzu*oj";
+/* number of characters in NCBIstdaa_ext */
+char NCBIstdaa_ext_n = 56;
+
+/* main - index a sequence library for memory mapped access.
+
+   usage: map_db [-n] [-b] "library_name [library_type]"
+
+     -n   the library is DNA (default: protein)
+     -b   also write a binary sequence file <library>.bsq and its
+          index <library>.xin_b (protein libraries only)
+
+   The library type is the optional number after a blank in the library
+   name; it selects the entry reader from get_ent_arr[].  If no name is
+   given on the command line it is read from stdin.
+
+   Every entry is scanned once to record the file offsets of its
+   description and sequence; the offsets are then written to
+   <library>.xin.  Exits with status 1 on any error, 0 on success. */
+int
+main(int argc, char **argv) {
+  FILE *libi, *b_fd;	/* b_fd is used for both the binary data file and the index file */
+  char lname[256];
+  char bname[256];
+  char iname[256];
+  char format[4];
+  char *bp;
+  struct stat stat_buf;
+
+  int i;
+  int nlib;		/* number of entries */
+  long max_len;		/* longest sequence */
+  fseek_t tot_len;	/* total sequence length */
+
+  int n1;
+
+  fseek_t f_size;	/* file size from stat() */
+  int lib_size;		/* current space available - may be realloc'ed */
+  int lib_inc;
+  int lib_type;		/* library format, index into get_ent_arr[] */
+  int lib_aa;		/* 1 = protein, 0 = DNA */
+  int build_binary;	/* build binary sequence file */
+
+  /* file offsets */
+  fseek_t d_pos;	/* start of description */
+  fseek_t s_pos;	/* start of sequence */
+  fseek_t b_pos;	/* start of binary encoding */
+  fseek_t *d_pos_arr;	/* array of description pointers */
+  fseek_t *s_pos_arr;	/* array of ascii sequence pointers */
+  fseek_t *b_pos_arr;	/* array of binary sequence pointers */
+  unsigned char *zbuff;	/* tmp buffer for writes */
+  unsigned char *sbuff;	/* sequence buffer */
+  int sbuff_max, sbuff_dup;
+
+  lib_type = 0;
+  lib_size = 200000;
+  lib_inc = 100000;
+  lib_aa = 1;
+  sbuff = NULL;
+  sbuff_max = 50000;
+  sbuff_dup = 0;
+  build_binary = 0;
+  /* the entry readers may return without setting these (e.g. an empty
+     library); give them a defined value for the end-of-file slot */
+  d_pos = 0;
+  s_pos = 0;
+
+  while (argc > 1 && *argv[1]=='-') {
+    if (strcmp(argv[1],"-n")==0) lib_aa = 0;
+    else if (strcmp(argv[1], "-b")==0) build_binary = 1;
+    argv++;
+    argc--;
+  }
+
+  /* open the database */
+  if (argc > 1) {
+    /* strncpy() does not terminate the copy when the name fills lname */
+    strncpy(lname, argv[1],sizeof(lname)-1);
+    lname[sizeof(lname)-1] = '\0';
+  }
+  else {
+    fprintf(stderr," Entry library name: ");
+    if (fgets(lname,sizeof(lname),stdin)==NULL) {
+      fprintf(stderr," no library name given\n");
+      exit(1);
+    }
+    if ((bp=strchr(lname,'\n'))!=NULL) *bp='\0';
+  }
+
+  if ((bp=strchr(lname,' '))!=NULL) {
+    lib_type = atoi(bp+1);
+    *bp='\0';
+  }
+  else lib_type = 0;
+
+  /* lib_type comes from user input: range-check it before it is used
+     as an index into get_ent_arr[] */
+  if (lib_type < 0 || lib_type > LASTLIB || get_ent_arr[lib_type] == NULL) {
+    fprintf(stderr," cannot index file %s type %d\n",lname,lib_type);
+    exit(1);
+  }
+
+  if (lib_type == 6) lib_aa = 0;
+  if (lib_type == 1) lib_aa = 0;
+
+  if (lib_aa == 1) {
+    init_ascii0(aascii, NCBIstdaa_ext);
+  }
+  else {
+    if (build_binary) {
+      fprintf(stderr,"*** WARNING *** map_db -- binary files not available for DNA libraries\n");
+      build_binary = 0;
+    }
+    init_ascii0(aascii, "\0ACGTURYMWSKDHVBNacgturymwskdhvbn");
+    aascii['X'] = aascii['N'];
+    aascii['x'] = aascii['n'];
+  }
+  sascii = &aascii[0];
+
+  if ((f_size=openlib(lname,lib_type))==0) {
+    fprintf(stderr," cannot open %s (type: %d)\n",lname,lib_type);
+    exit(1);
+  }
+
+  if (build_binary) {
+    newname(bname, lname, "bsq",sizeof(bname));
+    if ((b_fd = fopen(bname, "w")) == NULL) {
+      fprintf(stderr, "cannot open %s binary file for writing\n",bname);
+      exit(1);
+    }
+
+    if ((zbuff=(unsigned char *)calloc(256,sizeof(char)))==NULL) {
+      fprintf(stderr," cannot allocate zbuff[256]\n");
+      exit(1);
+    }
+
+    if ((sbuff=(unsigned char *)calloc(sbuff_max,sizeof(char)))==NULL) {
+      fprintf(stderr," cannot allocate sbuff[%d]\n",sbuff_max);
+      exit(1);
+    }
+
+    /* write out the initial NULL */
+    fwrite(zbuff,sizeof(char),1, b_fd);
+  }
+  b_pos = 1;	/* initialize whether used or not */
+
+  /* allocate array of description pointers */
+  if ((d_pos_arr=(fseek_t *)calloc(lib_size, sizeof(fseek_t)))==NULL) {
+    fprintf(stderr," cannot allocate %d for desc. array\n",lib_size);
+    exit(1);
+  }
+  /* allocate array of sequence pointers */
+  if ((s_pos_arr=(fseek_t *)calloc(lib_size, sizeof(fseek_t)))==NULL) {
+    fprintf(stderr," cannot allocate %d for seq. array\n",lib_size);
+    exit(1);
+  }
+  /* allocate array of binary sequence pointers */
+  if ((b_pos_arr=(fseek_t *)calloc(lib_size, sizeof(fseek_t)))==NULL) {
+    fprintf(stderr," cannot allocate %d for binary array\n",lib_size);
+    exit(1);
+  }
+
+  /* scan the library: one iteration per entry.  With sbuff == NULL
+     (no binary file) the reader only counts residues. */
+  nlib = 0; tot_len=0; max_len=-1;
+  while ((n1=get_entry(sbuff, sbuff_max, &d_pos, &s_pos)) > 0) {
+    /* n1+1: the terminating NUL of each sequence is written as well */
+    if (build_binary) fwrite(sbuff, sizeof(char), n1+1, b_fd);
+
+    d_pos_arr[nlib] = d_pos;
+    s_pos_arr[nlib] = s_pos;
+    b_pos_arr[nlib] = b_pos;
+
+    b_pos += n1+1;
+
+    nlib++;
+    tot_len += n1;
+
+    if (n1 > max_len) max_len = n1;
+    if (nlib >= lib_size) { /* too many entries */
+
+      lib_size += lib_inc;
+      if ((d_pos_arr=(fseek_t *)realloc(d_pos_arr,lib_size*sizeof(fseek_t)))==NULL) {
+	fprintf(stderr," cannot realloc allocate %d for desc.. array\n",
+		lib_size);
+	exit(1);
+      }
+      if ((s_pos_arr=(fseek_t *)realloc(s_pos_arr,lib_size*sizeof(fseek_t)))==NULL) {
+	fprintf(stderr," cannot realloc allocate %d for seq. array\n",
+		lib_size);
+	exit(1);
+      }
+      if ((b_pos_arr=(fseek_t *)realloc(b_pos_arr,lib_size*sizeof(fseek_t)))==NULL) {
+	fprintf(stderr," cannot realloc allocate %d for binary aseq. array\n",
+		lib_size);
+	exit(1);
+      }
+    }
+  }
+
+  if (build_binary) fclose(b_fd);
+
+  if (stat(lname,&stat_buf)<0) {
+    fprintf(stderr," cannot stat library: %s\n",lname);
+    exit(1);
+  }
+  else {
+    f_size = stat_buf.st_size;
+  }
+
+  d_pos_arr[nlib]= d_pos;	/* put in the end of the file */
+  s_pos_arr[nlib]=0;
+  b_pos_arr[nlib]= b_pos;
+
+  /* all the information is in, write it out */
+
+  newname(iname,lname,"xin",sizeof(iname));
+  if ((libi=fopen(iname,"w"))==NULL) {
+    fprintf(stderr," cannot open %s for writing\n",iname);
+    exit(1);
+  }
+
+  /* write out format version, etc. to .xin file */
+  format[0]='M';
+  format[1]='P';
+#ifdef BIG_LIB64
+  format[2]= 1;		/* format 1,2 for 8-byte offsets */
+#else
+  format[2]='\0';	/* format '\0' for original 4-byte */
+#endif
+
+  format[3]=lib_type;
+  fwrite(format,4,sizeof(char),libi);
+
+  /* write out sequence type */
+  src_int4_write(libi, lib_aa);
+  /* write out file fstat as integrity check */
+#ifdef BIG_LIB64
+  src_long8_write(libi, f_size);
+#else
+  src_int4_write(libi, f_size);
+#endif
+  /* write out num_seq */
+  src_int4_write(libi, nlib);
+
+#ifdef BIG_LIB64
+  /* write out tot_len, max_len */
+  src_long8_write(libi, tot_len);
+#else
+  src_int4_write(libi, tot_len);
+#endif
+  src_int4_write(libi, max_len);
+
+#ifdef BIG_LIB64
+  for (i=0; i<=nlib; i++) src_long8_write(libi,d_pos_arr[i]);
+  for (i=0; i<=nlib; i++) src_long8_write(libi,s_pos_arr[i]);
+#else
+  for (i=0; i<=nlib; i++) src_int4_write(libi,d_pos_arr[i]);
+  for (i=0; i<=nlib; i++) src_int4_write(libi,s_pos_arr[i]);
+#endif
+  fclose(libi);
+
+  if (build_binary) {	/* do the same thing for the .xin_b file */
+    if (stat(bname,&stat_buf)<0) {
+      fprintf(stderr," cannot stat library: %s\n",bname);
+      exit(1);
+    }
+    else {
+      f_size = stat_buf.st_size;
+    }
+
+    newname(iname,lname,"xin_b",sizeof(iname));
+    if ((b_fd=fopen(iname,"w"))==NULL) {
+      fprintf(stderr," cannot open %s for writing\n",iname);
+      exit(1);
+    }
+
+    /* Everything below goes to b_fd, the stream just opened for the
+       .xin_b file.  libi was closed above and must not be used again. */
+
+    /* write out format version, etc. to .xin_b file */
+    format[0]='M';
+    format[1]='P';
+#ifdef BIG_LIB64
+    format[2]= 2;	/* format 1,2 for 8-byte offsets */
+#else
+    format[2]='\0';	/* format '\0' for original 4-byte */
+#endif
+
+    format[3]=lib_type;
+    fwrite(format,4,sizeof(char),b_fd);
+
+    /* write out sequence type */
+    src_int4_write(b_fd, lib_aa);
+    /* write out file fstat as integrity check */
+    src_long8_write(b_fd, f_size);
+    /* write out num_seq */
+    src_int4_write(b_fd, nlib);
+
+    /* write out tot_len, max_len */
+    src_long8_write(b_fd, tot_len);
+    src_int4_write(b_fd, max_len);
+    /* write out maximum length */
+    src_int4_write(b_fd, sbuff_max);
+    /* write out overlap */
+    src_int4_write(b_fd, sbuff_dup);
+
+    for (i=0; i<=nlib; i++) src_long8_write(b_fd,b_pos_arr[i]);
+    fclose(b_fd);
+  }
+
+  /* fseek_t is long or off_t depending on the build: cast tot_len to the
+     type each format string names */
+#ifdef BIG_LIB64
+  fprintf(stderr," wrote %d sequences (tot=%lld, max=%ld) to %s\n",
+	  nlib,(long long)tot_len,max_len,iname);
+#else
+  fprintf(stderr," wrote %d sequences (tot=%ld, max=%ld) to %s\n",
+	  nlib,(long)tot_len,max_len,iname);
+#endif
+  if (build_binary) {
+    fprintf(stderr," binary file: %s.bsq written\n",lname);
+  }
+
+  exit(0);
+}
+
+
+FILE *libf=NULL;	/* the library file being indexed */
+fseek_t lpos;		/* file offset at which the line in lline starts */
+
+#define MAXLINE 4096
+char lline[MAXLINE+1];	/* most recently read line of the library */
+
+/* openlib - open library lname for reading and select its entry reader.
+
+   Sets libf, get_entry (from get_ent_arr[lib_type]) and lpos, and reads
+   the first line of the file into lline.  Returns the file size reported
+   by stat(), or 0 if the file cannot be stat()ed, cannot be opened, or
+   has no first line. */
+fseek_t
+openlib(char *lname, int lib_type)
+{
+  struct stat lib_stat;
+  char *first_line;
+
+  if (stat(lname,&lib_stat) < 0) {
+    fprintf(stderr," cannot stat library: %s\n",lname);
+    return 0;
+  }
+
+  libf = fopen(lname,"r");
+  if (libf == NULL) {
+    fprintf(stderr," cannot open library: %s (type: %d)\n",
+	    lname, lib_type);
+    return 0;
+  }
+
+  get_entry = get_ent_arr[lib_type];
+
+  /* prime lline/lpos so that the entry readers start on the first line */
+  lpos = FTELL(libf);
+  first_line = fgets(lline,MAXLINE,libf);
+  if (first_line == NULL) return 0;
+
+  return lib_stat.st_size;
+}
+
+/* a_get_ent - read the next FASTA entry from libf.
+
+   Skips forward to the next line starting with '>' or ';', stores its
+   file offset in *d_pos and the offset of the first sequence line in
+   *s_pos, then reads sequence lines up to the next '>' line or EOF.
+
+   With sbuff != NULL the mapped residue codes (table values below NA)
+   are stored in sbuff, NUL terminated, and their count is returned; the
+   program exits if the entry fills max_sbuff bytes.  With sbuff == NULL
+   the residues are only counted.
+
+   Returns 0 at end of file (with *d_pos set to the final offset) and -1
+   when a ';' line is longer than the line buffer. */
+int
+a_get_ent(unsigned char *sbuff, int max_sbuff,
+	  fseek_t *d_pos, fseek_t *s_pos)
+{
+  char *cp;
+  unsigned char *sptr, *sptr_max;
+  int *ap, n1;
+
+  sptr = sbuff;
+  sptr_max = sbuff+max_sbuff;
+
+  ap = sascii;
+
+  while (lline[0]!='>' && lline[0]!=';') {
+    lpos = FTELL(libf);
+    if (fgets(lline,sizeof(lline),libf)==NULL) {
+      *d_pos = lpos;
+      return 0;
+    }
+  }
+
+  *d_pos = lpos;
+
+  /* make certain we have the end of the line */
+  while (strchr((char *)lline,'\n')==NULL) {
+    if (fgets(lline,sizeof(lline),libf)==NULL) break;
+  }
+
+  *s_pos = FTELL(libf);
+  lline[0]='\0';
+  n1 = 0;
+  while (fgets(lline,sizeof(lline),libf)!=NULL) {
+    if (lline[0]=='>') break;
+    if (lline[0]==';') {
+      if (strchr(lline,'\n')==NULL) {
+	fprintf(stderr," excessive continuation\n%s",lline);
+	return -1;
+      }
+    }
+
+    /* The mapping table has 128 entries and plain char may be signed,
+       so every byte is masked with AAMASK before it is used as an
+       index; a byte >= 0x80 would otherwise index before the table. */
+    if (sbuff) {
+      for (cp=lline; *cp && sptr < sptr_max;) {
+	if ((*sptr = ap[*cp++ & AAMASK])<NA) {sptr++;}
+      }
+
+      if (sptr >= sptr_max) {
+	fprintf(stderr," sequence too long: %ld\n",(long)(sptr-sbuff));
+	exit(1);
+      }
+    }
+    else {
+      for (cp=lline; *cp; ) if (ap[*cp++ & AAMASK]<NA) n1++;
+    }
+
+    lpos = FTELL(libf);
+  }
+  if (sbuff) {
+    *sptr = '\0';
+    return (int)(sptr - sbuff);
+  }
+  else {
+    return n1;
+  }
+}
+
+/* gbf_get_ent - read the next GenBank flat-file entry from libf.
+
+   Finds the next LOCUS line (offset stored in *d_pos) and the following
+   ORIGIN line (*s_pos is the offset just after it), then counts the
+   residues on the lines up to the one starting with '/'.
+
+   sbuff and max_sbuff are not used: this reader only counts residues.
+   Returns the residue count, or -1 if end of file is reached before a
+   LOCUS or ORIGIN line is found. */
+int
+gbf_get_ent(unsigned char *sbuff, int max_sbuff,
+	    fseek_t *d_pos, fseek_t *s_pos)
+{
+  int n1;
+  char *cp;
+  register int *ap;
+
+#if !defined(TFAST)
+  ap = sascii;
+#else
+  ap = nascii;
+#endif
+
+  while (lline[0]!='L' || lline[1]!='O' ||
+	 strncmp(lline,"LOCUS",5)) { /* find LOCUS */
+    lpos = FTELL(libf);
+    if (fgets(lline,MAXLINE,libf)==NULL) return (-1);
+  }
+  *d_pos=lpos;
+
+  while (lline[0]!='O' || lline[1]!='R' ||
+	 strncmp(lline,"ORIGIN",6)) { /* find ORIGIN */
+    if (fgets(lline,MAXLINE,libf)==NULL) return (-1);
+  }
+  *s_pos = FTELL(libf);
+
+  lline[0]='\0';
+  n1=0;
+  while (fgets(lline,MAXLINE,libf)!=NULL) {
+    if (lline[0]=='/') break;
+    /* mask with AAMASK: the table has 128 entries and plain char may
+       be signed, so a byte >= 0x80 must not become a negative index */
+    for (cp=lline; *cp; ) if (ap[*cp++ & AAMASK]<NA) n1++;
+  }
+  /* read one line ahead so the next call starts after the terminator;
+     at end of file lline keeps its old contents and the next call
+     returns -1 from the LOCUS search */
+  lpos = FTELL(libf);
+  fgets(lline,MAXLINE,libf);
+
+  return n1;
+}
+
+/* src_int4_read - read a 4-byte big-endian integer from fd into *val.
+
+   This is the inverse of src_int4_write().  *val is set to 0 when fewer
+   than 4 bytes can be read. */
+void src_int4_read(FILE *fd, int *val)
+{
+#ifdef IS_BIG_ENDIAN
+  if (fread((char *)val,(size_t)4,(size_t)1,fd) != 1) *val = 0;
+#else
+  unsigned char b[4];
+
+  *val = 0;
+  if (fread((char *)&b[0],(size_t)1,(size_t)4,fd) != 4) return;
+
+  /* b[0] is the most significant byte.  Each byte gets its own shift;
+     the work is done in unsigned arithmetic so that a set top bit does
+     not overflow a signed shift. */
+  *val = (int)(((unsigned int)b[0]<<24) | ((unsigned int)b[1]<<16) |
+	       ((unsigned int)b[2]<<8) | (unsigned int)b[3]);
+#endif
+}
+
+/* src_int4_write - write the low 32 bits of val to fd as 4 bytes, most
+   significant byte first */
+void src_int4_write(FILE *fd, int val)
+{
+#ifdef IS_BIG_ENDIAN
+  fwrite(&val,(size_t)4,(size_t)1,fd);
+#else
+  unsigned char bytes[4];
+  int k;
+
+  /* fill the buffer from the last byte backwards, peeling off the low
+     8 bits of val each time */
+  for (k = 3; k >= 0; k--) {
+    bytes[k] = val & 255;
+    val = val >> 8;
+  }
+
+  fwrite(bytes,(size_t)1,(size_t)4,fd);
+#endif
+}
+
+/* src_long8_write - write val to fd as 8 bytes, most significant byte
+   first */
+void src_long8_write(FILE *fd, int64_t val)
+{
+#ifdef IS_BIG_ENDIAN
+  fwrite(&val,(size_t)8,(size_t)1,fd);
+#else
+  unsigned char bytes[8];
+  int k;
+
+  /* fill the buffer from the last byte backwards, peeling off the low
+     8 bits of val each time */
+  for (k = 7; k >= 0; k--) {
+    bytes[k] = val & 255;
+    val = val >> 8;
+  }
+
+  fwrite(bytes,(size_t)1,(size_t)8,fd);
+#endif
+}
+
+/* newname - build "<oname>.<suff>" in nname.
+
+   nname is a buffer of maxn bytes.  The result is always NUL terminated
+   and never longer than maxn-1 characters; a name that does not fit is
+   truncated.  Nothing is written when nname is NULL or maxn <= 0. */
+void
+newname(char *nname, char *oname, char *suff, int maxn)
+{
+  size_t room;		/* characters available before the final NUL */
+  size_t len;
+
+  if (nname == NULL || maxn <= 0) return;
+  room = (size_t)maxn - 1;
+
+  /* strncpy() leaves the buffer unterminated when oname fills it */
+  strncpy(nname,oname,room);
+  nname[room] = '\0';
+
+  len = strlen(nname);
+  if (len < room) {
+    nname[len++] = '.';
+    nname[len] = '\0';
+  }
+
+  /* strncat() writes up to n characters plus a NUL, so n must be the
+     room that is left, not counting the terminator */
+  strncat(nname,suff,room - len);
+}
+
+/* init_ascii0 -- fill the 128-entry table xascii from the residue
+   ordering in sq_map.
+
+   sq_map[i], for i >= 1, is mapped to code i; sq_map[0] is skipped (it
+   may be a NUL, so the length is measured from sq_map+1).  Maps of at
+   most 28 symbols are taken to be uppercase only, and the character 32
+   positions above each symbol gets the same code.  Every other
+   character is NA, except NUL (ES) and LF/CR (EL).
+*/
+void
+init_ascii0(int *xascii, char *sq_map) {
+  int idx, symbol;
+  int map_len = strlen(sq_map+1) + 1;
+  int fold_case = (map_len <= 28);	/* only uppercase */
+
+  /* first map everything as non-sequence */
+  for (idx = 0; idx < 128; idx++) xascii[idx] = NA;
+
+  /* then map the actual sequence letters */
+  for (idx = 1; idx < map_len; idx++) {
+    symbol = sq_map[idx];
+    xascii[symbol] = idx;
+    if (fold_case) xascii[symbol+32] = idx;	/* map lowercase */
+  }
+
+  /* then map the other stuff, EL etc */
+  xascii[0] = ES;
+  xascii[10] = EL;
+  xascii[13] = EL;
+}
+
diff --git a/src/mm_file.h b/src/mm_file.h
new file mode 100644
index 0000000..8011153
--- /dev/null
+++ b/src/mm_file.h
@@ -0,0 +1,153 @@
+/* mm_file.h - defines m_file_str for mmap()ed files */
+
+/* $Id: mm_file.h 938 2012-06-04 16:15:06Z wrp $ */
+
+/* copyright (c) 1999, 2014 by William R. Pearson and The Rector &
+ Visitors of the University of Virginia */
+
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+#include <sys/types.h>
+
+/* File-offset abstraction: fseek_t/FSEEK/FTELL map to off_t/fseeko/ftello
+   when USE_FSEEKO is defined and to long/fseek/ftell otherwise.
+   FSEEK_T_DEF is defined in both configurations so that including this
+   header after another file that carries the same block (or including
+   it twice) does not redeclare fseek_t. */
+#ifndef FSEEK_T_DEF
+#define FSEEK_T_DEF
+#ifndef USE_FSEEKO
+#define FSEEK fseek
+#define FTELL ftell
+typedef long fseek_t;
+#else
+#define FSEEK fseeko
+#define FTELL ftello
+typedef off_t fseek_t;
+#endif
+#endif
+
+/* 64-bit integer types: taken from <inttypes.h> when HAS_INTTYPES is
+ defined, otherwise declared here.
+ NOTE(review): the non-WIN32 fallback uses long, which is 64 bits wide
+ only on LP64 platforms - confirm before building without HAS_INTTYPES
+ on a 32-bit target. */
+#ifdef HAS_INTTYPES
+#include <inttypes.h>
+#else
+#ifdef WIN32
+typedef __int64 int64_t;
+typedef unsigned __int64 uint64_t;
+#else
+typedef long int64_t;
+typedef unsigned long uint64_t;
+#endif
+#endif
+/* type of the offsets stored in the d/s/b/a_pos_arr index arrays */
+typedef int64_t MM_OFF;
+
+#ifdef MYSQL_DB
+#include <mysql.h>
+#endif
+#ifdef PGSQL_DB
+#include <libpq-fe.h>
+#endif
+
+#ifndef MAX_FN
+#include "defs.h"
+#endif
+
+extern unsigned long adler32();
+
+/* lmf_str - state of one open sequence library. One structure carries
+ the fields for every access method (flat file, BLAST2.0 database, GCG,
+ memory mapped file, SQL query); the getlib/ranlib/get_mmap_chain
+ function pointers at the end select the reader to call. */
+struct lmf_str {
+ FILE *libf; /* sequence file being read */
+ char *lb_name; /* file name */
+ char opt_text[MAX_FN]; /* text after filename */
+ int lb_type; /* library type */
+ int *sascii; /* ascii -> sq mapping */
+ int *vascii; /* annotation to ann mapping */
+
+ char *annot_sname; /* annotation script name */
+
+ /* used by flat files */
+ char *lline; /* last line read */
+ int acc_off; /* start of libstr (+1 for agetlib/fasta) */
+ unsigned char *cpsave; /* position in line for lgetlib() */
+ fseek_t lpos; /* position in file */
+
+ /* blast2.0 stuff */
+ FILE *hfile; /* BLAST2.0 description file */
+ int bl_format_ver; /* blast formatdb version */
+ int bl_lib_pos; /* for ncbl2 */
+ int pref_db; /* preferred database */
+ int have_oid_list; /* we have an oid file, must read oid's */
+ unsigned int *oid_list; /* oid list for subsets */
+ int oid_seqs; /* start offset for mask array */
+ unsigned int max_oid; /* presumably the largest oid in oid_list - TODO confirm (the original comment repeated the one for oid_seqs) */
+
+ /* Genbank Flat files */
+ int lfflag; /* flag for CRLF in EMBL CDROM files */
+
+ /* stuff for GCG format files (5,6) */
+ int gcg_binary; /* flag for binary gcg format */
+ long gcg_len; /* length of GCG sequence */
+
+ /* used when memory mapping */
+ int mm_flg; /* mmap worked */
+ int mmap_fd; /* mmap_fd */
+ char *mmap_base; /* base */
+ char *mmap_addr; /* current pos */
+ long st_size; /* file size */
+
+ MM_OFF *d_pos_arr; /* pointer to desc. offsets */
+ MM_OFF *s_pos_arr; /* pointer to seq. offsets */
+ MM_OFF *b_pos_arr; /* pointer to binary seq. offsets */
+ MM_OFF *a_pos_arr; /* pointer to aux offsets */
+
+ /* currently available only for memory mapped files */
+ int max_cnt; /* # database entries */
+ int64_t tot_len; /* total residue length */
+ long max_len; /* maximum sequence length */
+ long maxn; /* maximum possible length */
+ long mdup; /* duplication for overlapping sequences */
+ int lib_aa; /* 0 = DNA, 1 = prot */
+ char *tmp_buf; /* temporary buffer */
+ int tmp_buf_max; /* max size */
+ int (*sel_acc_p)(char *, int gi, void *); /* used to select subset of library */
+ void *sel_local; /* local data structure for sel_acc_p() */
+
+ /* used for SQL database queries */
+ char *sql_db, *sql_query, *sql_getdesc, *sql_getseq, *sql_close_tables;
+ int sql_reopen;
+ char **sql_uid_arr; /* indexed by lpos */
+ /* used to get sequence data */
+ char *sql_seqp;
+
+#ifdef MYSQL_DB
+ /* used to open the database */
+ MYSQL *mysql_conn;
+ MYSQL_RES *mysql_res;
+ MYSQL_ROW mysql_row;
+#endif
+
+#ifdef PGSQL_DB
+ /* used to open the database */
+ PGconn *pgsql_conn;
+ PGresult *pgsql_res;
+#endif
+
+ /* reader for the next library sequence */
+ int (*getlib)(unsigned char *seq, int maxs, char *ann,
+ int n_libstr,
+ fseek_t *libpos,
+ int *lcont,
+ struct lmf_str *lm_fd,
+ long *l_off);
+
+ /* positions the library at libpos (random access) */
+ void (*ranlib)(char *str, int cnt,
+ fseek_t libpos, char *libstr,
+ struct lmf_str *lm_fd);
+
+ /* NOTE(review): struct seqr_chain and struct db_str are not declared in
+ this header; they presumably come from structs.h - confirm the
+ include order */
+ int (*get_mmap_chain)(struct seqr_chain *cur_seqr_chain,
+ struct lmf_str *m_fd, struct db_str *db);
+};
diff --git a/src/mmgetaa.c b/src/mmgetaa.c
new file mode 100644
index 0000000..77d58fe
--- /dev/null
+++ b/src/mmgetaa.c
@@ -0,0 +1,1116 @@
+/* mmgetaa.c - functions for mmap()ed access to libraries */
+
+/* $Id: mmgetaa.c 1153 2013-05-20 13:29:29Z wrp $ */
+
+/* copyright (c) 1999, 2000, 2014 by William R. Pearson and The Rector &
+ Visitors of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+/*
+ This is one of two alternative files that can be used to
+ read a database. The two files are nmgetlib.c and mmgetaa.c
+ (nxgetaa.c has been retired).
+
+ nmgetlib.c and mmgetaa.c are used together. nmgetlib.c provides
+ the same functions as nxgetaa.c if memory mapping is not used,
+ mmgetaa.c provides the database reading functions if memory
+ mapping is used. The decision to use memory mapping is made on
+ a file-by-file basis.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <limits.h>
+#include <string.h>
+#include <errno.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+
+#define MAXLINE 512
+#define EOSEQ 0
+
+#define XTERNAL
+#include "uascii.h"
+/* #include "upam.h" */
+#undef XTERNAL
+
+#define GCGBIN 6
+
+#ifndef MAP_FILE
+#define MAP_FILE 0
+#endif
+
+#include "defs.h"
+#include "structs.h"
+#include "mm_file.h"
+
+extern int64_t bl2_long8_cvt(int64_t);
+extern int bl2_uint4_cvt(int);
+extern void newname(char *, char *, char *, int);
+unsigned long adler32(unsigned long, const unsigned char *, unsigned int);
+
+long crck(char *, int);
+extern void src_int4_read(FILE *fd, int *val);
+extern void src_long4_read(FILE *fd, long *valp);
+extern void src_long8_read(FILE *fd, int64_t *val);
+
+int
+agetlib_mb(unsigned char *seq,int maxs,char *libstr,int n_libstr,fseek_t *libpos,
+ int *lcont, struct lmf_str *m_fd, long *l_off);
+
+extern void
+aranlib(char *str, int cnt, fseek_t seek, char *libstr, struct lmf_str *lm_fd);
+void
+aranlib_mb(char *str, int cnt, fseek_t seek, char *libstr, struct lmf_str *lm_fd);
+
+/* mmap()ed functions */
+#ifdef USE_MMAP
+int agetlibm(); void aranlibm();
+int lgetlibm(); void lranlibm();
+void vranlibm();
+int gcg_getlibm();
+
+/* getlibam[] - memory-mapped sequence readers, indexed by lib_type.
+ load_mmap() installs getlibam[lib_type] as m_fd->getlib; a NULL
+ entry means that library type has no mmap()ed reader (can_mmap()
+ tests for this). Per load_mmap()'s parameter comment the types
+ are 0-Fasta, 5-vms_pir, 6-gcg_binary; type 1 uses lgetlibm. */
+int (*getlibam[])()={
+ agetlibm,lgetlibm, NULL, NULL,NULL,agetlibm,gcg_getlibm
+};
+
+/* ranlibam[] - matching description ("random access") readers,
+ indexed the same way; installed as m_fd->ranlib by load_mmap(). */
+void (*ranlibam[])()={
+ aranlibm,lranlibm,NULL,NULL,NULL,vranlibm,vranlibm
+};
+#endif
+
+int
+bmap_get_mmap_chain(struct seqr_chain *cur_seqr_chain,
+ struct lmf_str *m_fd, struct db_str *db);
+
+/* load_mmap() loads the d_pos[] and s_pos[] arrays for rapid access */
+/* 24-July-2011 -- checks and maps bsq file */
+
+/* can_mmap() - returns non-zero if library format lib_type has a
+   memory-mapped reader in getlibam[].  lib_type values outside the
+   table return 0 instead of indexing past the end of the array. */
+int can_mmap(int lib_type) {
+  if (lib_type < 0 ||
+      lib_type >= (int)(sizeof(getlibam)/sizeof(getlibam[0]))) {
+    return 0;
+  }
+  return (getlibam[lib_type] != NULL);
+}
+
+
+/* load_mmap() - memory map a sequence library using its ".xin" index
+
+   libi      already open ".xin" index file for sname
+   sname     name of the sequence database file
+   lib_type  library format; must match the format byte in the index
+   ldnaseq   1 for DNA, 0 for protein
+   m_fd      caller-supplied lmf_str that is filled in
+
+   If the "bsq" file named by newname() can be opened and its "xin_b"
+   index is readable and consistent, the binary file is mapped and
+   b_pos_arr[] is loaded; on any problem with the binary files the
+   function falls back to mapping sname itself.  In both cases
+   d_pos_arr[] and s_pos_arr[] are then read from libi.
+
+   Returns m_fd on success, NULL if the library cannot be mapped.
+   Calls exit(1) if an offset array cannot be allocated.  All
+   descriptors, mappings and arrays acquired here are released before
+   a NULL return.
+*/
+struct lmf_str *
+load_mmap(FILE *libi,	/* fd for already open ".xin" file */
+	  char *sname,	/* name of sequence database file */
+	  int lib_type,	/* 0-Fasta, 5-vms_pir, 6-gcg_binary */
+	  int ldnaseq,	/* 1 for DNA, 0 for protein */
+	  struct lmf_str *m_fd)
+{
+  char format[4];		/* .xin header */
+  char b_format[4];		/* .xin_b header (kept separate from .xin) */
+  char bname[MAX_FN], xbname[MAX_FN];
+  int i, lib_aa, b_lib_aa;
+  fseek_t f_size;		/* ascii library size recorded in .xin */
+  fseek_t bf_size;		/* .bsq size recorded in .xin_b */
+  long lf_size;
+  struct stat statbuf;
+  int max_cnt;
+  fseek_t *d_pos_arr = NULL, *s_pos_arr = NULL, *b_pos_arr = NULL;
+  int mm_flag, mm64_flag, mmb_flag;
+  int is_mapped = 0;		/* m_fd->mmap_base currently holds a mapping */
+  FILE *libi_b = NULL;		/* FILE * for .xin_b file */
+  int *tmp_pos_arr = NULL;
+  void *map_p;
+
+  /* first check that the necessary indices are up-to-date */
+  /* read the offsets in ".xin" file */
+  if (fread(format,1,4,libi)!=4) {	/* a short header is unusable too */
+    fprintf(stderr," cannot read .xin format\n");
+    return NULL;
+  }
+
+  mm64_flag = (format[2]>=1);	/* 4 bytes or 8 bytes for long? */
+
+#ifndef BIG_LIB64
+  if (mm64_flag) {return NULL;}
+#endif
+
+  if (format[3]!=lib_type) {
+    fprintf(stderr," cannot read format %d != lib_type %d\n",
+	    format[3],lib_type);
+    return NULL;
+  }
+
+  src_int4_read(libi,&lib_aa);
+  if (lib_aa == ldnaseq) { /* database residue mismatch */
+    fprintf(stderr," residue type mismatch %s != %s (.xin) in %s\n",
+	    (lib_aa ? "DNA" : "prot."),(ldnaseq ? "prot." : "DNA"),
+	    sname);
+    return NULL;
+  }
+
+  /* everything looks good, fill in the lmf_str */
+  m_fd->lib_aa = lib_aa;
+
+  /* get ascii file size from index */
+  if (mm64_flag) src_long8_read(libi,&f_size);
+  else {
+    src_long4_read(libi,&lf_size);
+    f_size = lf_size;
+  }
+
+  if (sizeof(char *) < sizeof(fseek_t) && f_size > UINT_MAX) {
+    fprintf(stderr,"\n *** Warning *** database too large (%lld) for 32-bit mmap()\n",
+	    (long long)f_size);
+    return NULL;
+  }
+
+  /* check for .bsq binary mapping */
+  newname(bname,sname,"bsq",sizeof(bname));
+  /* keep the assignment and the comparison separate: writing
+     fd = open() >= 0 stores 0/1 in fd and leaks the descriptor */
+  m_fd->mmap_fd = open(bname,O_RDONLY);
+  mm_flag = mmb_flag = (m_fd->mmap_fd >= 0);
+  if (mmb_flag) {
+    /* fstat the binary sequence file */
+    if (fstat(m_fd->mmap_fd, &statbuf) < 0) {
+      fprintf(stderr," cannot stat %s for mmap()", bname);
+      perror("...");
+      mmb_flag = 0;
+      goto next_mmap;
+    }
+
+    /* now open the .xin_b file and read the offsets */
+    newname(xbname, sname, "xin_b",sizeof(xbname));
+    if ((libi_b = fopen(xbname,"r"))==NULL) {
+      fprintf(stderr,"Cannot open %s binary index file\n",xbname);
+      mmb_flag = 0;
+      goto next_mmap;
+    }
+
+    /* now read the .xin_b file */
+    if (fread(b_format,1,4,libi_b)!=4) {
+      fprintf(stderr," cannot read .xin_b format\n");
+      mmb_flag = 0;
+      goto next_mmap;
+    }
+    src_int4_read(libi_b,&b_lib_aa);
+    /* get .bsq file size from .xin_b; f_size must stay untouched
+       because the ascii fallback below still compares against it */
+    src_long8_read(libi_b,&bf_size);
+    if (bf_size != statbuf.st_size) {
+      fprintf(stderr," %s file size (%lld) and expected size (%lld) don't match\n",
+	      bname,(long long)statbuf.st_size,(long long)bf_size);
+      mmb_flag = 0;
+      goto next_mmap;
+    }
+
+    /* the index file and library file are open and the sizes match */
+    /* map the file */
+    m_fd->st_size = statbuf.st_size;
+    map_p = mmap(NULL, m_fd->st_size, PROT_READ,
+		 MAP_FILE | MAP_SHARED, m_fd->mmap_fd, 0);
+    if (map_p == MAP_FAILED) {
+#ifdef DEBUG
+      fprintf(stderr," cannot mmap %s", bname);
+      perror("...");
+#endif
+      mmb_flag = 0;
+      goto next_mmap;
+    }
+    m_fd->mmap_base = (char *)map_p;
+    is_mapped = 1;
+
+    /* now finish reading the index file */
+    src_int4_read(libi_b,&max_cnt);
+
+    src_long8_read(libi_b,&m_fd->tot_len);
+    src_long4_read(libi_b,&lf_size);
+    m_fd->max_len = lf_size;
+    /* get seqbuf_max */
+    src_long4_read(libi_b,&lf_size);
+    /* get seqbuf_dup */
+    src_long4_read(libi_b,&lf_size);
+
+#ifdef DEBUG
+    fprintf(stderr,
+	    "\n%s\tformat: %c%c%d %d; max_cnt: %d; tot_len: %lld max_len: %ld\n",
+	    sname,b_format[0],b_format[1],b_format[2],b_format[3],
+	    max_cnt,m_fd->tot_len,m_fd->max_len);
+#endif
+
+    if (max_cnt < 0) {
+      fprintf(stderr," invalid entry count %d in %s\n",max_cnt,xbname);
+      mmb_flag = 0;
+      goto next_mmap;
+    }
+
+    /* allocate array of binary sequence offsets */
+    if ((b_pos_arr=(fseek_t *)calloc(max_cnt+1, sizeof(fseek_t)))==NULL) {
+      fprintf(stderr," cannot allocate %d for binary seq array\n",max_cnt+1);
+      exit(1);
+    }
+
+    /* now read the binary offsets (b_pos_arr) */
+    if (fread(b_pos_arr,sizeof(fseek_t),max_cnt+1,libi_b)!=
+	(size_t)(max_cnt+1)) {
+      fprintf(stderr," error reading bseq offsets: %s\n",xbname);
+      mmb_flag = 0;
+      goto next_mmap;
+    }
+
+#ifndef IS_BIG_ENDIAN
+    for (i=0; i<=max_cnt; i++) {
+      b_pos_arr[i] = bl2_long8_cvt(b_pos_arr[i]);
+    }
+#endif
+    /* now have the b_pos_arr[] allocated and read, close libi_b */
+    fclose(libi_b);
+    libi_b = NULL;
+  }
+
+ next_mmap:
+  /* here if no mmb_flag or mmb read failed */
+  if (!mmb_flag) {
+    /* release whatever a failed .bsq attempt left behind */
+    if (libi_b != NULL) { fclose(libi_b); libi_b = NULL; }
+    if (is_mapped) {
+      munmap(m_fd->mmap_base, m_fd->st_size);
+      is_mapped = 0;
+    }
+    free(b_pos_arr);
+    b_pos_arr = NULL;
+    if (m_fd->mmap_fd >= 0) { close(m_fd->mmap_fd); m_fd->mmap_fd = -1; }
+
+    /* now, start to open mmap()ed file */
+    mm_flag=((m_fd->mmap_fd=open(sname,O_RDONLY))>=0);
+    if (!mm_flag) {
+      fprintf(stderr," cannot open %s for mmap()", sname);
+      perror("...");
+      return NULL;	/* file did not open */
+    }
+
+    /* fstat the library file and get size */
+    if(fstat(m_fd->mmap_fd, &statbuf) < 0) {
+      fprintf(stderr," cannot stat %s for mmap()", sname);
+      perror("...");
+      m_fd->mm_flg = 0;
+      mm_flag = 0;	/* the local flag is what "finish" tests */
+      goto finish;
+    }
+
+    /* check for identical sizes - if different, do not mmap */
+    if (f_size != statbuf.st_size) {
+      fprintf(stderr," %s file size (%lld) and expected size (%lld) don't match\n",
+	      sname,(long long)statbuf.st_size,(long long)f_size);
+      mm_flag = 0;
+      goto finish;
+    }
+
+    /* the index file and library file are open and the sizes match */
+    /* map the file */
+    m_fd->st_size = statbuf.st_size;
+    map_p = mmap(NULL, m_fd->st_size, PROT_READ,
+		 MAP_FILE | MAP_SHARED, m_fd->mmap_fd, 0);
+    if (map_p == MAP_FAILED) {
+      mm_flag = 0;
+#ifdef DEBUG
+      fprintf(stderr," cannot mmap %s", sname);
+      perror("...");
+#endif
+    }
+    else {
+      m_fd->mmap_base = (char *)map_p;
+      is_mapped = 1;
+    }
+  }
+
+  /* here, we have a memory mapped file, and if mmb_flag, we have the
+     b_pos_arr[] read, but not s_pos_arr[] or d_pos_arr[] */
+
+ finish:
+  /* the mapping (if any) stays valid after the descriptor is closed */
+  close(m_fd->mmap_fd);
+  if (!mm_flag) { return NULL; }
+
+  /* now finish reading the index file */
+  src_int4_read(libi,&max_cnt);
+
+  if (mm64_flag) {
+    src_long8_read(libi,&m_fd->tot_len);
+  }
+  else {
+    src_long4_read(libi,&lf_size);
+    m_fd->tot_len = lf_size;
+  }
+  src_long4_read(libi,&lf_size);
+  m_fd->max_len = lf_size;
+
+#ifdef DEBUG
+  fprintf(stderr,
+	  "\n%s\tformat: %c%c%d %d; max_cnt: %d; tot_len: %lld max_len: %ld\n",
+	  sname,format[0],format[1],format[2],format[3],
+	  max_cnt,m_fd->tot_len,m_fd->max_len);
+#endif
+
+  if (max_cnt < 0) {
+    fprintf(stderr," invalid entry count %d in index for %s\n",max_cnt,sname);
+    goto fail;
+  }
+
+  /* 32-bit indices are read into a temporary int array first */
+  if (!mm64_flag) {
+    if ((tmp_pos_arr=(int *)calloc(max_cnt+1,sizeof(int)))==NULL) {
+      fprintf(stderr," cannot allocate %d for tmp_pos array\n",
+	      max_cnt+1);
+      goto fail;
+    }
+  }
+
+  /* allocate array of description pointers */
+  if ((d_pos_arr=(fseek_t *)calloc(max_cnt+1, sizeof(fseek_t)))==NULL) {
+    fprintf(stderr," cannot allocate %d for desc. array\n",max_cnt+1);
+    exit(1);
+  }
+
+  /* read m_fd->d_pos[max_cnt+1] */
+  if (mm64_flag) {
+    if (fread(d_pos_arr,sizeof(fseek_t),max_cnt+1,libi)!=
+	(size_t)(max_cnt+1)) {
+      fprintf(stderr," error reading desc. offsets: %s\n",sname);
+      goto fail;
+    }
+  }
+  else {
+    if (fread(tmp_pos_arr,sizeof(int),max_cnt+1,libi)!=
+	(size_t)(max_cnt+1)) {
+      fprintf(stderr," error reading desc. offsets: %s\n",sname);
+      goto fail;
+    }
+#ifdef DEBUG
+    fprintf(stderr,"d_pos_crc: %ld\n",
+	    crck((char *)tmp_pos_arr,sizeof(int)*(max_cnt+1)));
+#endif
+  }
+
+#ifndef IS_BIG_ENDIAN
+  if (mm64_flag)
+    for (i=0; i<=max_cnt; i++) {
+      d_pos_arr[i] = bl2_long8_cvt(d_pos_arr[i]);
+    }
+  else
+    for (i=0; i<=max_cnt; i++) {
+      d_pos_arr[i] = bl2_uint4_cvt(tmp_pos_arr[i]);
+    }
+#else
+  if (!mm64_flag) {
+    for (i=0; i<=max_cnt; i++) {
+      d_pos_arr[i] = tmp_pos_arr[i];
+    }
+  }
+#endif
+
+#ifdef DEBUG
+  for (i=0; i<max_cnt-1; i++) {
+    if (d_pos_arr[i+1] <= d_pos_arr[i] )
+      fprintf(stderr," ** dpos_error [%d]\t%lld\t%lld\n",
+	      i,d_pos_arr[i],d_pos_arr[i+1]);
+  }
+#endif
+
+  /* allocate array of sequence pointers */
+  if ((s_pos_arr=(fseek_t *)calloc(max_cnt+1,sizeof(fseek_t)))==NULL) {
+    fprintf(stderr," cannot allocate %d for seq. array\n",max_cnt+1);
+    exit(1);
+  }
+
+  /* read m_fd->s_pos[max_cnt+1] */
+  if (mm64_flag) {
+    if (fread(s_pos_arr,sizeof(fseek_t),max_cnt+1,libi)!=
+	(size_t)(max_cnt+1)) {
+      fprintf(stderr," error reading seq offsets: %s\n",sname);
+      goto fail;
+    }
+  }
+  else {
+    if (fread(tmp_pos_arr,sizeof(int),max_cnt+1,libi)!=
+	(size_t)(max_cnt+1)) {
+      fprintf(stderr," error reading seq offsets: %s\n",sname);
+      goto fail;
+    }
+#ifdef DEBUG
+    fprintf(stderr,"s_pos_crc: %ld\n",
+	    crck((char *)tmp_pos_arr,sizeof(int)*(max_cnt+1)));
+#endif
+  }
+
+#ifndef IS_BIG_ENDIAN
+  if (mm64_flag)
+    for (i=0; i<=max_cnt; i++)
+      s_pos_arr[i] = bl2_long8_cvt(s_pos_arr[i]);
+  else
+    for (i=0; i<=max_cnt; i++)
+      s_pos_arr[i] = (long)bl2_uint4_cvt(tmp_pos_arr[i]);
+#else
+  if (!mm64_flag)
+    for (i=0; i<=max_cnt; i++)
+      s_pos_arr[i] = (long)tmp_pos_arr[i];
+#endif
+
+#ifdef DEBUG
+  for (i=1; i<max_cnt-1; i++) {
+    if (s_pos_arr[i+1]<s_pos_arr[i])
+      fprintf(stderr," ** spos_error [%d]\t%lld\t%lld\n",
+	      i,s_pos_arr[i],s_pos_arr[i+1]);
+  }
+#endif
+
+  free(tmp_pos_arr);	/* NULL (a no-op) for 64-bit indices */
+
+  m_fd->max_cnt = max_cnt;
+  m_fd->d_pos_arr = d_pos_arr;
+  m_fd->s_pos_arr = s_pos_arr;
+  if (mmb_flag) m_fd->b_pos_arr = b_pos_arr;
+  m_fd->lpos = 0;
+  m_fd->lb_type = lib_type;
+  m_fd->getlib = getlibam[lib_type];
+  m_fd->ranlib = ranlibam[lib_type];
+  m_fd->get_mmap_chain = NULL;
+  m_fd->mm_flg = 1;
+  if (mmb_flag) {
+    m_fd->getlib = agetlib_mb;
+    m_fd->ranlib = aranlib_mb;
+    m_fd->get_mmap_chain = bmap_get_mmap_chain;
+  }
+
+  /* check_mmap(m_fd,-2); */
+
+  return m_fd;
+
+ fail:
+  /* index could not be loaded after the file was mapped: undo it all */
+  free(tmp_pos_arr);
+  free(d_pos_arr);
+  free(s_pos_arr);
+  free(b_pos_arr);
+  if (is_mapped) munmap(m_fd->mmap_base, m_fd->st_size);
+  m_fd->mm_flg = 0;
+  return NULL;
+}
+
+/* mgets() - fgets() work-alike for a memory mapped library.
+
+   Copies bytes from m_fd->mmap_addr into s until n-1 bytes have been
+   copied, a '\n' has been copied, or the next byte equals (char)EOF.
+   s is always NUL terminated and m_fd->mmap_addr is advanced past the
+   bytes consumed.  Returns NULL only when nothing was copied and the
+   next byte is (char)EOF; otherwise returns s.
+*/
+char *mgets (char *s, int n, struct lmf_str *m_fd)
+{
+  char *src = m_fd->mmap_addr;
+  char *dst = s;
+  int room;
+
+  for (room = n - 1; room > 0 && *src != (char)EOF; room--) {
+    char ch = *src++;
+
+    *dst++ = ch;
+    if (ch == '\n') { break; }
+  }
+  *dst = '\0';
+
+  m_fd->mmap_addr = src;
+
+  if (*src == (char)EOF && dst == s) { return NULL; }
+  return s;
+}
+
+/* agetlibm() - read the next sequence (or the next piece of a long
+ sequence) from a memory mapped library.
+
+ seq/maxs output buffer and its size
+ libstr/n_libstr receives the description (filled only when *lcont==0)
+ libpos set to the entry number of the sequence
+ lcont 0 on entry starts a new sequence; incremented on return
+ when seq filled up before the sequence ended, reset to 0
+ otherwise
+ l_off set to 1, or to the value after "@C:" near the end of the
+ description when one is found
+
+ Bytes are translated through m_fd->sascii[]. Returns the number of
+ values stored in seq, or -1 at the end of the library, when the
+ offsets look inconsistent, or when sel_acc_p() returns < 0.
+
+ NOTE(review): seq_len, desc_len and cp_max are static, so the state
+ needed for a continuation call is shared by every library that uses
+ this function - confirm callers never interleave libraries/threads.
+*/
+int
+agetlibm(unsigned char *seq,
+ int maxs,
+ char *libstr,
+ int n_libstr,
+ fseek_t *libpos,
+ int *lcont,
+ struct lmf_str *m_fd,
+ long *l_off)
+{
+ register unsigned char *cp, *seqp;
+ register int *ap;
+ int sel_status;
+ char *desc;
+ int lpos; /* entry number in library */
+ long l;
+ unsigned char *seqm, *seqm1;
+ char *bp;
+ static long seq_len, desc_len;
+ static unsigned char *cp_max;
+
+ *l_off = 1;
+
+ lpos = m_fd->lpos;
+
+ seqp = seq;
+ /* the copy loop below stores up to 10 values per pass, so it must
+ stop this far before the end of seq */
+ seqm = &seq[maxs-9];
+ seqm1 = seqm-1;
+
+ ap = m_fd->sascii;
+
+ if (*lcont==0) {
+ start_seq:
+ if (lpos >= m_fd->max_cnt) return (-1);
+ /* the sequence runs from s_pos_arr[lpos] to the next description */
+ seq_len = m_fd->d_pos_arr[lpos+1] - m_fd->s_pos_arr[lpos];
+ desc_len = m_fd->s_pos_arr[lpos] - m_fd->d_pos_arr[lpos]-m_fd->acc_off;
+ if (seq_len < 0 || (seq_len > m_fd->max_len && seq_len > (m_fd->max_len*5)/4)) {
+ fprintf(stderr," ** sequence over-run: %ld at %d\n",seq_len,lpos);
+ return(-1);
+ }
+ *libpos = (fseek_t)lpos;
+
+ desc = m_fd->mmap_base+m_fd->d_pos_arr[lpos]+m_fd->acc_off;
+ strncpy(libstr,desc,n_libstr-1);
+ libstr[n_libstr-1]='\0';
+
+ /* optional subset selection: <0 stops the scan, 0 skips this entry */
+ if ((m_fd->sel_acc_p != NULL) &&
+ (sel_status = (m_fd->sel_acc_p)(libstr, 0, m_fd->sel_local)) <= 0) {
+ if (sel_status < 0) return (-1);
+ lpos++;
+ goto start_seq;
+ }
+
+ if ((bp=strchr(libstr,'\r'))!=NULL) *bp='\0';
+ if ((bp=strchr(libstr,'\n'))!=NULL) *bp='\0';
+
+ if (n_libstr > MAX_UID) {
+ bp = libstr;
+ /* NOTE(review): bp is advanced before the comparison, so
+ libstr[0] itself is never examined */
+ while (*bp++) if ( *bp=='\001' || *bp=='\t') *bp=' ';
+ }
+
+ /* find @C:offset in the last 11 characters of the description */
+ /* check that we can offset desc by 12 characters to get to ' @C:' */
+ if ((desc_len > 12) && (bp = memchr(desc+desc_len-12,'@', 11)) && !strncmp(bp+1,"C:",2)) {
+ *l_off = atol(bp+3); /* this addresses an apparent bug in sscanf for non-null terminated strings */
+ }
+
+ m_fd->mmap_addr = m_fd->mmap_base+m_fd->s_pos_arr[lpos];
+ cp_max = (unsigned char *)(m_fd->mmap_addr+seq_len);
+ }
+
+ /* unrolled copy: keep going while every translated value is < NA;
+ the first value >= NA is dropped (--seqp) and the loop ends once
+ cp has reached the end of this sequence */
+ for (cp=(unsigned char *)m_fd->mmap_addr; seqp<seqm1; ) {
+ if ((*seqp++=ap[*cp++])<NA &&
+ (*seqp++=ap[*cp++])<NA &&
+ (*seqp++=ap[*cp++])<NA &&
+ (*seqp++=ap[*cp++])<NA &&
+ (*seqp++=ap[*cp++])<NA &&
+ (*seqp++=ap[*cp++])<NA &&
+ (*seqp++=ap[*cp++])<NA &&
+ (*seqp++=ap[*cp++])<NA &&
+ (*seqp++=ap[*cp++])<NA &&
+ (*seqp++=ap[*cp++])<NA) continue;
+ --seqp;
+ if (cp >= cp_max) break;
+ }
+ m_fd->mmap_addr = (char *)cp;
+
+ /* buffer full: ask to be called again; otherwise move to next entry */
+ if (seqp>=seqm1) (*lcont)++;
+ else {
+ *lcont=0;
+ lpos++;
+ m_fd->lpos = lpos;
+ }
+ *seqp = EOSEQ;
+ /* if ((int)(seqp-seq)==0) return 1; */
+ return (int)(seqp-seq);
+}
+
+/* agetlib_mb() - return the next entry of a memory mapped binary
+   sequence file.
+
+   The entry's bytes are copied from b_pos_arr[lpos] into seq (at most
+   maxs bytes), libstr is cleared, *libpos is set to the entry number
+   and *l_off to 1.  m_fd->lpos is advanced on every call.  Returns
+   the sequence length, or -1 at the end of the library or when the
+   offsets give an implausible length.
+*/
+int
+agetlib_mb(unsigned char *seq,
+	   int maxs,
+	   char *libstr,
+	   int n_libstr,
+	   fseek_t *libpos,
+	   int *lcont,
+	   struct lmf_str *m_fd,
+	   long *l_off)
+{
+  int entry;	/* entry number in library */
+  int n_res;	/* sequence length from the offset table */
+
+  *l_off = 1;
+
+  entry = m_fd->lpos++;
+  if (entry >= m_fd->max_cnt) { return -1; }
+
+  /* consecutive offsets differ by the length plus one extra byte */
+  n_res = m_fd->b_pos_arr[entry+1] - m_fd->b_pos_arr[entry]-1;
+
+  if (n_res < 0) { goto over_run; }
+  if (n_res > m_fd->max_len && n_res > (m_fd->max_len*5)/4) { goto over_run; }
+
+  *libpos = (fseek_t)entry;
+
+  /* no description is available here; zero the caller's buffer */
+  strncpy(libstr,"",n_libstr-1);
+
+  memcpy(seq, m_fd->mmap_base+m_fd->b_pos_arr[entry], min(n_res+1,maxs));
+
+  *lcont = 0;
+  return n_res;
+
+ over_run:
+  fprintf(stderr," ** sequence over-run: %d at %d\n",n_res,entry);
+  return -1;
+}
+
+/* aranlibm() - fetch the description of entry libpos from a memory
+   mapped library.
+
+   Copies the text between d_pos_arr[libpos]+1 and s_pos_arr[libpos]
+   into str (at most cnt-1 characters), cuts it at the first '\r' or
+   '\n', turns '\001' and tab characters after the first character
+   into blanks, and leaves m_fd->lpos at the entry.  libstr is unused.
+*/
+void
+aranlibm(char *str,
+	 int cnt,
+	 fseek_t libpos,
+	 char *libstr,
+	 struct lmf_str *m_fd)
+{
+  char *eol, *p;
+  int entry = (int) libpos;
+  int n_copy;
+
+  n_copy = m_fd->s_pos_arr[entry]-m_fd->d_pos_arr[entry];
+  if (n_copy >= cnt) { n_copy = cnt-1; }
+
+  strncpy(str,m_fd->mmap_base+m_fd->d_pos_arr[entry]+1,n_copy);
+  str[n_copy]='\0';
+
+  eol = strchr(str,'\r');
+  if (eol != NULL) { *eol = '\0'; }
+  eol = strchr(str,'\n');
+  if (eol != NULL) { *eol = '\0'; }
+
+  /* starts at str[1]: the first character is left untouched */
+  for (p = str+1; p[-1] != '\0'; p++) {
+    if (*p=='\001' || *p=='\t') { *p=' '; }
+  }
+
+  m_fd->lpos = entry;
+}
+
+/* aranlib_mb is a hybrid of aranlib/aranlibm that uses
+   d_pos_arr[libpos] to get the fseek offset for the description
+
+   The description itself is read by aranlib() (declared extern
+   above); afterwards m_fd->lpos is set to libpos.
+*/
+
+void
+aranlib_mb(char *str,
+	   int cnt,
+	   fseek_t libpos,
+	   char *libstr,
+	   struct lmf_str *m_fd)
+{
+  aranlib(str, cnt, (fseek_t)m_fd->d_pos_arr[(int)libpos],
+	  libstr, m_fd);
+  m_fd->lpos = libpos;
+}
+
+/* bmap_get_mmap_chain fills cur_seqr_chain with sequence pointers
+ from the memory mapped file at *m_fd
+
+ because the database is opened read-only, this code only works with
+ an amino acid mapping identical to that used by blastdbcmd, aa_b2toa[]
+
+ bmap_get_mmap_chain must return EOF AND a set of sequences for the
+ comp_lib9.c/next_seqr_chain() logic to work properly.
+*/
+
+/* Fills up to cur_seqr_chain->max_chain_seqs seq_record/mseq_record
+   pairs with pointers into the mapped binary file, starting at entry
+   m_fd->lpos, and adds each entry to the db totals.
+
+   Returns EOF when the library was already exhausted on entry or has
+   just been exhausted (cur_seq_cnt then holds the number of records
+   filled by this call); otherwise returns the number filled.
+*/
+int bmap_get_mmap_chain(struct seqr_chain *cur_seqr_chain,
+			struct lmf_str *m_fd, struct db_str *db) {
+  int i, lib_cnt;
+  struct seq_record *seq_a, *seq_p;
+  struct mseq_record *mseq_a, *mseq_p;
+
+  lib_cnt = m_fd->lpos;
+  if (lib_cnt >= m_fd->max_cnt) return EOF;
+  seq_a = cur_seqr_chain->seqr_base;
+  mseq_a = cur_seqr_chain->mseqr_base;
+
+  for (i=0; i < cur_seqr_chain->max_chain_seqs; i++) {
+    if (lib_cnt >= m_fd->max_cnt) break;
+    seq_p = &seq_a[i];
+    mseq_p = &mseq_a[i];
+    seq_p->n1 = m_fd->b_pos_arr[lib_cnt+1] - m_fd->b_pos_arr[lib_cnt]-1; /* value is +1 off to get the NULL */
+
+    db->entries++;
+    db->length += seq_p->n1;
+    if (db->length > LONG_MAX) {
+      db->length -= LONG_MAX; db->carry++;
+    }
+
+    mseq_p->m_file_p = m_fd;
+    mseq_p->n1tot_p=NULL;
+    mseq_p->cont = 0;
+    seq_p->index = mseq_p->index = mseq_p->lseek = lib_cnt;
+#ifndef DEBUG
+    mseq_p->libstr[0] = '\0';
+#endif
+    /* the sequence is used in place, directly from the mapping */
+    seq_p->aa1b = (unsigned char *)(m_fd->mmap_base + m_fd->b_pos_arr[lib_cnt++]);
+    seq_p->l_offset = 0;
+    seq_p->l_off = 1;
+    /* #ifdef, like every other DEBUG test in this file; "#if DEBUG"
+       does not preprocess when DEBUG is defined without a value */
+#ifdef DEBUG
+    seq_p->adler32_crc = mseq_p->adler32_crc = adler32(1L,seq_p->aa1b,seq_p->n1);
+#endif
+  }
+  cur_seqr_chain->cur_seq_cnt = i;
+  m_fd->lpos = lib_cnt;
+  if (lib_cnt >= m_fd->max_cnt) return EOF;
+  else return i;
+}
+
+/* there is no vgetlibm() because vgetlibm() and agetlibm() are
+ identical - the difference in the two file formats relates to the
+ location of the sequence, which is already available in s_pos_arr[].
+
+ however vranlibm must accommodate both type 5 and 6 files;
+ type 6 has extra stuff after the seq_id.
+*/
+
+/* vranlibm() - build the description for entry libpos of a type 5/6
+ library in str (cnt bytes): up to 20 characters of the id that
+ starts 4 bytes into the entry, followed by text taken from the line
+ after the first '\n' of the entry. Sets m_fd->lpos to libpos.
+ libstr is not used.
+
+ NOTE(review): str[20] is written unconditionally, so this assumes
+ cnt > 20 - confirm against callers.
+*/
+void
+vranlibm(char *str,
+ int cnt,
+ fseek_t libpos,
+ char *libstr,
+ struct lmf_str *m_fd)
+{
+ char *bp, *mp;
+ int llen;
+ int lpos;
+
+ lpos = (int)libpos;
+
+ /* length of the whole description region for this entry */
+ llen = m_fd->s_pos_arr[lpos]-m_fd->d_pos_arr[lpos];
+
+ mp = m_fd->mmap_base+m_fd->d_pos_arr[lpos];
+
+ /* copy the id and cut it after the first blank */
+ strncpy(str,mp+4,20);
+ str[20]='\0';
+ if ((bp=strchr(str,' '))!=NULL) *(bp+1) = '\0';
+ else if ((bp=strchr(str,'\n'))!=NULL) *bp = ' ';
+ /* NOTE(review): mp points into the mapping, which is not NUL
+ terminated, and bp is used below without a NULL check */
+ bp = strchr(mp,'\n');
+
+ llen -= (bp-mp)-5;
+ /* NOTE(review): cnt-strlen(str) is unsigned, so llen is compared
+ as an unsigned value here */
+ if (llen > cnt-strlen(str)) llen = cnt-strlen(str)-1;
+
+ strncat(str,bp+1,llen);
+ if ((bp = strchr(str,'\n'))!=NULL) *bp='\0';
+ str[cnt-1]='\0';
+ m_fd->lpos = lpos;
+}
+
+/* close_mmap() - release the offset arrays of m_fd and, when the
+   library is memory mapped (m_fd->mm_flg set), unmap the file and
+   free m_fd itself.
+
+   m_fd must not be used by the caller after a mapped library has been
+   closed.  The flag is cleared before the structure is freed; writing
+   it afterwards would touch freed memory.
+*/
+void
+close_mmap(struct lmf_str *m_fd) {
+  if (m_fd == NULL) return;
+
+  free(m_fd->s_pos_arr);
+  m_fd->s_pos_arr = NULL;
+  free(m_fd->d_pos_arr);
+  m_fd->d_pos_arr = NULL;
+
+  if (m_fd->mm_flg) {
+    munmap(m_fd->mmap_base,m_fd->st_size);
+    m_fd->mm_flg = 0;
+    free(m_fd);
+  }
+}
+
+#ifndef min
+#define min(x,y) ((x) > (y) ? (y) : (x))
+#endif
+
+static int gcg_bton[4]={2,4,1,3};
+
+/* gcg_getlibm() - read the next sequence (or next piece of a long
+   sequence) from a memory mapped GCG library.
+
+   When *lcont==0 the entry header, 4 bytes past d_pos_arr[lpos], is
+   parsed for the id, type and length; a type starting with '2' marks
+   2-bit packed data.  Text sequences are translated through
+   m_fd->sascii[], packed sequences through gcg_bton[] (4 residues
+   per byte).
+
+   Returns the number of residues stored in seq, or -1 at the end of
+   the library.  *lcont is incremented when more of the sequence
+   remains and reset to 0 (advancing m_fd->lpos) when it is complete.
+
+   The read position is saved in m_fd->mmap_addr after every call so
+   that a continuation call resumes where this one stopped instead of
+   re-reading the start of the sequence.  Header fields are scanned
+   with explicit widths so an over-long field cannot overrun the
+   local buffers.
+*/
+int
+gcg_getlibm(unsigned char *seq,
+	    int maxs,
+	    char *libstr,
+	    int n_libstr,
+	    fseek_t *libpos,
+	    int *lcont,
+	    struct lmf_str *m_fd,
+	    long *l_off)
+{
+  char gcg_id[256];
+  char dummy[20];
+  char gcg_date[6];
+  char gcg_type[10];
+  register unsigned char *cp, *seqp, stmp;
+  register int *ap, lpos;
+  unsigned char *seqm;
+  long r_block, b_block, r_fact;
+  int id_max;
+
+  *l_off = 1;
+
+  seqp = seq;
+  seqm = &seq[maxs-9];
+
+  ap = m_fd->sascii;
+  lpos = m_fd->lpos;
+
+  if (*lcont==0) {
+    if (lpos >= m_fd->max_cnt) return (-1);
+
+    gcg_id[0] = gcg_type[0] = '\0';
+    sscanf(m_fd->mmap_base+m_fd->d_pos_arr[lpos]+4,"%255s %5s %9s %19s %ld\n",
+	   gcg_id,gcg_date,gcg_type,dummy,&(m_fd->gcg_len));
+
+    m_fd->gcg_binary = (gcg_type[0]=='2');
+
+    /* the id is reported with at most 12 characters */
+    id_max = (n_libstr > 12) ? 12 : n_libstr-1;
+    if (id_max >= 0) {
+      strncpy(libstr,gcg_id,id_max);
+      libstr[id_max]='\0';
+    }
+    *libpos = lpos;
+    m_fd->mmap_addr = m_fd->mmap_base+m_fd->s_pos_arr[lpos];
+  }
+
+  /* b_block: residues to deliver; r_block: bytes to read for them */
+  r_block = b_block = min((size_t)(seqm-seqp),m_fd->gcg_len);
+  if (m_fd->gcg_binary) {
+    r_block = (r_block+3)/4;
+  }
+
+  cp=(unsigned char *)m_fd->mmap_addr;
+  if (!m_fd->gcg_binary) {
+    r_fact = 1;
+    while (seqp<seq+r_block) *seqp++ = ap[*cp++];
+  }
+  else {
+    r_fact = 4;
+    /* each byte holds four residues, most significant pair first */
+    while (seqp < seq+4*r_block) {
+      stmp = *cp++;
+      *seqp++ = gcg_bton[(stmp>>6) &3];
+      *seqp++ = gcg_bton[(stmp>>4) &3];
+      *seqp++ = gcg_bton[(stmp>>2) &3];
+      *seqp++ = gcg_bton[(stmp) &3];
+    }
+  }
+  /* remember how far we got for a continuation call */
+  m_fd->mmap_addr = (char *)cp;
+
+  if (r_fact * r_block >= m_fd->gcg_len) {
+    *lcont = 0;
+    m_fd->lpos++;
+  }
+  else {
+    if (m_fd->gcg_binary) b_block = 4*r_block;
+    m_fd->gcg_len -= b_block;
+    (*lcont)++;
+  }
+
+  seq[b_block] = EOSEQ;
+  /*   if (b_block==0) return 1; else */
+  return b_block;
+}
+
+void lget_ann_m(struct lmf_str *lm_fd, char *libstr, int n_libstr);
+
+/* lgetlibm() - read the next sequence (or next piece of a long
+   sequence) from a memory mapped flat-file library whose entries end
+   with a line starting with '/'.
+
+   When *lcont==0 the description is produced first: for short buffers
+   (n_libstr <= 21) the 12 characters found 12 bytes into the entry,
+   otherwise the header built by lget_ann_m().  Sequence bytes are
+   translated through m_fd->sascii[].
+
+   Returns the number of residues stored in seq, or -1 at the end of
+   the library.  *lcont is incremented when seq filled up before the
+   entry ended and reset to 0 (advancing m_fd->lpos) otherwise.
+*/
+int
+lgetlibm(unsigned char *seq,
+	 int maxs,
+	 char *libstr,
+	 int n_libstr,
+	 fseek_t *libpos,
+	 int *lcont,
+	 struct lmf_str *m_fd,
+	 long *l_off)
+{
+  register unsigned char *cp, *seqp;
+  register int *ap, lpos;
+  unsigned char *seqm, *seqm1;
+
+  *l_off = 1;
+
+  seqp = seq;
+  /* the copy loop stores up to 11 values per pass */
+  seqm = &seq[maxs-11];
+  seqm1 = seqm-1;
+
+  lpos = m_fd->lpos;
+  ap = m_fd->sascii;
+
+  if (*lcont==0) {
+    if (lpos >= m_fd->max_cnt) return (-1);
+
+    if (n_libstr <= 21) {
+      strncpy(libstr,m_fd->mmap_base+m_fd->d_pos_arr[lpos]+12,12);
+      libstr[12]='\0';
+    }
+    else {
+      /* lget_ann_m() parses the header starting at m_fd->mmap_addr,
+	 so it has to point at this entry's description first */
+      m_fd->mmap_addr = m_fd->mmap_base+m_fd->d_pos_arr[lpos];
+      lget_ann_m(m_fd,libstr,n_libstr);
+    }
+    *libpos = lpos;
+
+    m_fd->mmap_addr = m_fd->mmap_base+m_fd->s_pos_arr[lpos];
+    cp = (unsigned char *)m_fd->mmap_addr;
+  }
+  else cp = (unsigned char *)m_fd->mmap_addr;
+
+  while (seqp<seqm1) {
+    /* a '/' at the start of a line ends the entry */
+    if (*cp=='/' && *(cp-1)=='\n') break;
+    if ((*seqp++=ap[*cp++])<NA &&
+	(*seqp++=ap[*cp++])<NA &&
+	(*seqp++=ap[*cp++])<NA &&
+	(*seqp++=ap[*cp++])<NA &&
+	(*seqp++=ap[*cp++])<NA &&
+	(*seqp++=ap[*cp++])<NA &&
+	(*seqp++=ap[*cp++])<NA &&
+	(*seqp++=ap[*cp++])<NA &&
+	(*seqp++=ap[*cp++])<NA &&
+	(*seqp++=ap[*cp++])<NA &&
+	(*seqp++=ap[*cp++])<NA) continue;
+    --seqp;
+    /* skip the prefix of a sequence line that starts with a blank */
+    if (*cp=='\n' && *(cp+1)==' ') cp += 11;
+  }
+
+  if (seqp>=seqm1) {
+    (*lcont)++;
+    m_fd->mmap_addr = (char *)cp;
+  }
+  else {
+    *lcont=0;
+    m_fd->lpos++;
+  }
+
+  *seqp = EOSEQ;
+  return (int)(seqp-seq);
+}
+
+/* append src to dst without letting dst exceed n_dst bytes
+   (including the terminating NUL) */
+static void
+lget_ann_cat(char *dst, const char *src, int n_dst) {
+  strncat(dst,src,n_dst-1-strlen(dst));
+}
+
+/* lget_ann_m() - build a FASTA-style header in libstr (n_libstr
+   bytes) from the entry header at lm_fd->mmap_addr.
+
+   The locus name is taken 12 bytes into the current line; the
+   DEFINITION, ACCESSION and VERSION lines are then located with
+   mgets(), which advances lm_fd->mmap_addr.  The result is
+     gi|<gi>|gb|<version>|<locus> <definition>   (VERSION with a ':')
+     <version>|<locus> <definition>              (VERSION without one)
+     <accession> <locus> <definition>            (no VERSION line)
+
+   Each search stops when mgets() returns NULL; the VERSION search is
+   skipped once ORIGIN has been reached, since later lines are not
+   part of this entry's header.  Fields that were not found are left
+   out.
+*/
+void
+lget_ann_m(struct lmf_str *lm_fd, char *libstr, int n_libstr) {
+  char *bp, *bp_gid = NULL;
+  char locus[120], desc[120] = "", acc[120] = "", ver[120] = "";
+  int at_origin = 0;
+  int have_desc, have_acc, have_ver;
+
+  if (n_libstr <= 0) return;
+
+  /* copy in locus from the current line; force termination */
+  strncpy(locus,&lm_fd->mmap_addr[12],sizeof(locus)-1);
+  locus[sizeof(locus)-1] = '\0';
+  if ((bp=strchr(locus,' '))!=NULL) *(bp+1) = '\0';
+
+  /* get description */
+  while (mgets(desc,sizeof(desc),lm_fd) != NULL) {
+    if (strncmp(desc,"DEFINITION",10)==0) break;
+  }
+  have_desc = (strncmp(desc,"DEFINITION",10)==0 && strlen(desc) > 11);
+  if (have_desc && (bp = strchr(&desc[11],'\n'))!=NULL) *bp='\0';
+
+  /* get accession */
+  while (mgets(acc,sizeof(acc),lm_fd) != NULL) {
+    if (strncmp(acc,"ACCESSION",9)==0) break;
+    if (strncmp(acc,"ORIGIN",6)==0) { at_origin = 1; break; }
+  }
+  have_acc = (strncmp(acc,"ACCESSION",9)==0 && strlen(acc) > 12);
+  if (have_acc) {
+    if ((bp = strchr(&acc[12],'\n'))!=NULL) *bp='\0';
+    if ((bp = strchr(&acc[12],' '))!=NULL) *bp='\0';
+  }
+
+  /* get version */
+  if (!at_origin) {
+    while (mgets(ver,sizeof(ver),lm_fd) != NULL) {
+      if (strncmp(ver,"VERSION",7)==0) break;
+      if (strncmp(ver,"ORIGIN",6)==0) break;
+    }
+  }
+  have_ver = (strncmp(ver,"VERSION",7)==0 && strlen(ver) > 12);
+  if (have_ver) {
+    if ((bp = strchr(&ver[12],'\n'))!=NULL) *bp='\0';
+
+    /* extract gi:123456 from version line */
+    bp_gid = strchr(&ver[12],':');
+    if (bp_gid != NULL) {
+      if ((bp=strchr(bp_gid+1,' '))!=NULL) *bp='\0';
+      bp_gid++;
+    }
+    if ((bp = strchr(&ver[12],' '))!=NULL) *bp='\0';
+  }
+
+  /* build up FASTA header line */
+  libstr[0]='\0';
+  if (bp_gid != NULL) {
+    lget_ann_cat(libstr,"gi|",n_libstr);
+    lget_ann_cat(libstr,bp_gid,n_libstr);
+    lget_ann_cat(libstr,"|gb|",n_libstr);
+  }
+
+  /* if we have a version number, use it, otherwise accession,
+     otherwise locus/description */
+
+  if (have_ver) {
+    lget_ann_cat(libstr,&ver[12],n_libstr);
+    lget_ann_cat(libstr,"|",n_libstr);
+  }
+  else if (have_acc) {
+    lget_ann_cat(libstr,&acc[12],n_libstr);
+    lget_ann_cat(libstr," ",n_libstr);
+  }
+
+  lget_ann_cat(libstr,locus,n_libstr);
+  if (have_desc) lget_ann_cat(libstr,&desc[11],n_libstr);
+  libstr[n_libstr-1]='\0';
+}
+
+/* lranlibm() - fetch the description of entry seek into str (cnt
+   bytes) using lget_ann_m(), and leave m_fd->lpos at that entry.
+   libstr is not used.
+
+   lget_ann_m() parses from m_fd->mmap_addr, so the read position is
+   first moved to the start of the requested entry's description.
+*/
+void
+lranlibm(char *str,
+	 int cnt,
+	 fseek_t seek,
+	 char *libstr,
+	 struct lmf_str *m_fd)
+{
+  m_fd->mmap_addr = m_fd->mmap_base + m_fd->d_pos_arr[(int)seek];
+
+  lget_ann_m(m_fd,str,cnt);
+
+  str[cnt-1]='\0';
+
+  m_fd->lpos = seek;
+}
+
+/* number of times check_mmap() has been entered */
+static int check_status=0;
+
+/* check_mmap() - sanity check of the offset tables of m_fd.
+
+   Only the first five calls do any work.  Every entry whose length
+   (next description offset minus sequence offset) is negative or
+   implausibly large is listed on stderr; if none is found an OK line
+   is printed.  ntt is only echoed in the messages.
+*/
+void
+check_mmap(struct lmf_str *m_fd,long ntt) {
+
+  int entry, len, n_bad;
+
+  n_bad = 0;
+  check_status++;
+  if (check_status > 5) { return; }
+
+  fprintf(stderr," ** checking %s %ld**\n", m_fd->lb_name,ntt);
+
+  for (entry = 0; entry < m_fd->max_cnt; entry++) {
+    len = m_fd->d_pos_arr[entry+1] - m_fd->s_pos_arr[entry];
+
+    if (len >= 0 &&
+	!(len > m_fd->max_len && len > (m_fd->max_len*5)/4)) {
+      continue;		/* entry looks fine */
+    }
+
+    fprintf(stderr,"%d:\t%lld\t%lld\t%lld\n",
+	    entry,m_fd->d_pos_arr[entry],m_fd->s_pos_arr[entry],
+	    m_fd->d_pos_arr[entry+1]-m_fd->s_pos_arr[entry]);
+    n_bad++;
+  }
+
+  if (n_bad == 0 && check_status) {
+    fprintf(stderr," ** check_mmap OK %s %ld**\n",
+	    m_fd->lb_name,ntt);
+  }
+}
+
+#ifdef DEBUG
+/* C H K 3 -- Compute a type-3 Kermit block check. */
+/*
+ Calculate the 16-bit CRC of a null-terminated string using a byte-oriented
+ tableless algorithm invented by Andy Lowry (Columbia University). The
+ magic number 010201 is derived from the CRC-CCITT polynomial x^16+x^12+x^5+1.
+ Note - this function could be adapted for strings containing imbedded 0's
+ by including a length argument.
+*/
+/* crck() - 16-bit CRC over the first n bytes of s; each byte is
+   masked to 7 bits and folded in one nibble at a time. */
+long
+crck(char *s, int n)
+{
+  unsigned int c, q;
+  long crc = 0;
+
+  for ( ; n > 0; n--) {
+    c = *s;
+    s++;
+    /* if (parity)*/
+    c &= 0177;
+
+    /* Low-order nibble */
+    q = (crc ^ c) & 017;
+    crc = (crc >> 4) ^ (q * 010201);
+
+    /* High order nibble */
+    q = (crc ^ (c >> 4)) & 017;
+    crc = (crc >> 4) ^ (q * 010201);
+  }
+  return crc;
+}
+#endif
diff --git a/src/mrandom.c b/src/mrandom.c
new file mode 100644
index 0000000..77ab0fb
--- /dev/null
+++ b/src/mrandom.c
@@ -0,0 +1,97 @@
+/* mrandom.c 28-Jan-2010 */
+
+/* $Id: */
+/* $Revision: 625 $ */
+
+/* system versions of random/nrand48/nrand tend to have thread contention
+ issues. This version uses a random number generator from Wikipedia
+ that maintains state in a separate buffer, so that there is no contention.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#ifdef UNIX
+#include <sys/time.h>
+#else
+#include <time.h>
+#endif
+
+/* minimal standard random number generator taken from:
+ S. K. Park and K. W. Miller (1988) "Random number generators: Good
+ ones are hard to find" Comm. ACM 31:1192-1201
+*/
+#define MIN_STD_RAND
+
+/* per-caller generator state, allocated by my_srand() and passed to
+ my_nrand(); MIN_STD_RAND (defined above) selects the single-seed
+ layout, otherwise two state words are kept */
+struct m_rand_struct {
+#ifndef MIN_STD_RAND
+ unsigned int mw;
+ unsigned int mz;
+#else
+ int seed;
+#endif
+};
+
+#ifdef MIN_STD_RAND
+#define A 16807
+#define M 2147483647
+#define Q 127773 /* M / A */
+#define R 2836 /* M % A */
+#endif
+
+/* my_srand() - allocate and seed a private random number state.
+
+   With MIN_STD_RAND a positive set is used as the seed; otherwise the
+   seed is an odd value derived from the clock.  Returns the new state
+   for use with my_nrand(); exits if it cannot be allocated.
+*/
+void *
+my_srand(int set)	/* initialize random number generator */
+{
+#ifdef UNIX
+  struct timeval now;
+#endif
+  int clock_seed;
+  struct m_rand_struct *state;
+
+  state = (struct m_rand_struct *)calloc(1, sizeof(struct m_rand_struct));
+  if (state == NULL) {
+    fprintf(stderr," *** [my_srand] cannot allocate random state ***\n");
+    exit(1);
+  }
+
+#ifdef UNIX
+  gettimeofday(&now,NULL);
+  clock_seed = now.tv_usec % 65535;
+#else
+  clock_seed = time(NULL);
+#endif
+  /* force the clock-derived value to be odd */
+  if ((clock_seed % 2)==0) { clock_seed += 1; }
+
+#ifndef MIN_STD_RAND
+  state->mw = clock_seed;
+  /* swap things around, since the next time will be close */
+  clock_seed = ((clock_seed & 0xFFF) << 12) + ((clock_seed>>12) & 0xFFF);
+  if ((clock_seed%2)==0) { clock_seed += 1; }
+  state->mz = clock_seed;
+#else
+  state->seed = (set > 0) ? set : clock_seed;
+#endif
+  return state;
+}
+
+/* returns a random number between 0 and n-1 (where n < 2^31),
+   advancing the generator state in *my_rand_state */
+unsigned int
+my_nrand(int n, struct m_rand_struct *my_rand_state)
+{
+  unsigned int rn;
+#ifdef MIN_STD_RAND
+  int quot, rem, next;
+
+  /* seed = (A * seed) mod M, computed without overflow by splitting
+     the seed into quotient and remainder by Q */
+  quot = my_rand_state->seed / Q;
+  rem = my_rand_state->seed % Q;
+  next = A * rem - R * quot;
+  my_rand_state->seed = (next > 0) ? next : next + M;
+  rn = my_rand_state->seed;
+#else
+  my_rand_state->mz = 36969 * (my_rand_state->mz & 65535) + (my_rand_state->mz >> 16);
+  my_rand_state->mw = 18000 * (my_rand_state->mw & 65535) + (my_rand_state->mw >> 16);
+  rn = (my_rand_state->mz << 16) + my_rand_state->mw;  /* 32-bit result */
+#endif
+
+  return rn%n;
+}
diff --git a/src/msg.h b/src/msg.h
new file mode 100644
index 0000000..4e6e4f9
--- /dev/null
+++ b/src/msg.h
@@ -0,0 +1,57 @@
+/* Concurrent read version */
+
+/* $Id: msg.h 625 2011-03-23 17:21:38Z wrp $ */
+/* $Revision: 625 $ */
+
+/* Cube definitions */
+
+#ifdef PVM_SRC
+#define FIRSTNODE 1
+#define FIRSTWORK 1
+#else
+#define FIRSTNODE 1
+#define FIRSTWORK 1
+#endif
+
+#define MAXNOD 128
+#define ALLTYPES -1
+#define HOSTPID 0
+#define MANAGEPID 0
+#define WORKPID 0
+
+#define MANAGER 0
+#define ALLNODES -1
+#define ALLPIDS -1
+
+#define STARTTYPE0 0 /* configuration buffer values */
+#define STARTTYPE1 1 /* struct mngmsg m_msp */
+#define STARTTYPE2 2 /* struct pstruct ppst */
+#define STARTTYPE3 3 /* pam2[0,1] matrix */
+#define STARTTYPE4 4 /* *pascii for fasty/tfasty */
+
+#define QSEQTYPE0 5
+#define QSEQTYPE1 6
+
+#define MSEQTYPE0 10 /* cur_buf->hdr */
+#define MSEQTYPE1 11 /* cur_buf->buf2_data * cur_buf->hdr.buf2_cnt */
+#define MSEQTYPE2 12 /* bulk - seq_b, seq_record * hdr.buf2_cnt */
+#define MSEQTYPE3 13 /* bulk - aa1b_start, aa1b_used+1 */
+#define MSEQTYPE4 14 /* individ. - seq_record */
+#define MSEQTYPE5 15 /* individ. - aa1b */
+#define MSEQTYPE6 16
+
+#define RES_TYPE0 20
+#define RES_TYPE1 21
+#define RES_TYPE2 22
+
+#define ALN_TYPE0 30
+#define ALN_TYPE1 31
+#define ALN_TYPE2 32
+#define ALN_TYPE3 33
+
+#define FINISHED 16384 /* this must be larger than BFR */
+
+#define DO_SEARCH_FLG 0
+#define DO_OPT_FLG 1
+#define DO_ALIGN_FLG 2
+#define DO_CALC_FLG 3
diff --git a/src/mshowalign2.c b/src/mshowalign2.c
new file mode 100644
index 0000000..090343b
--- /dev/null
+++ b/src/mshowalign2.c
@@ -0,0 +1,999 @@
+/* $Id: mshowalign2.c 1269 2014-07-29 21:24:25Z wrp $ */
+
+/* copyright (c) 1996, 1997, 1998, 1999, 2014 by William R. Pearson and
+ The Rector & Visitors of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+/* mshowalign.c - show sequence alignments in pvcomplib */
+
+/*
+ In the serial and current threaded versions of the programs,
+ showalign gets a list of high scoring sequences and must
+ re_getlib() the sequence, do_walign(), and then calculate the
+ alignment.
+
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "defs.h"
+#include "msg.h"
+#include "structs.h"
+#include "param.h"
+
+#include "mm_file.h"
+
+/* best_stats.h must come after mm_file.h */
+#include "best_stats.h"
+
+/* used to position the library sequence for re_getlib - also gets
+ description */
+#define RANLIB (m_fptr->ranlib)
+
+extern struct lmf_str *
+re_openlib(struct lmf_str *, int outtty);
+
+int
+re_getlib(unsigned char *aa1, struct annot_str **annot_p,
+ int maxn, int maxt,
+ int loff, int cont, int term_code,
+ long *l_offset, long *l_off,
+ struct lmf_str *m_fptr);
+
+#include "drop_func.h"
+/* drop_func.h includes dyn_string.h */
+
+extern void calc_astruct(struct a_struct *aln_p, struct a_res_str *a_res_p, void *f_str);
+
+extern void calc_coord(int n0, int n1, long qoffset, long loffset,
+ struct a_struct *aln);
+
+void initseq(char **, char **, char **, int);
+void initseq_ann(char **, char **, int);
+
+void freeseq(char **, char **, char **);
+void freeseq_ann(char **, char **);
+
+void do_show(FILE *fp, int n0, int n1, int score,
+ char *name0, char *name1, int nml, char *link_name,
+ const struct mngmsg *m_msp, const struct pstruct *ppst,
+ char *seqc0, char *seqc0a, char *seqc1, char *seqc1a,
+ char *seqca, int *cumm_seq_score, int nc,
+ float percent, float gpercent, int lc,
+ struct a_struct *aln, const char *annot_var_s,
+ const struct annot_str *q_annot_p,
+ const struct annot_str *l_annot_p);
+
+void
+do_lav(FILE *fp, struct a_struct *aln, char *seqc, float percent, int is_mirror);
+
+void
+buf_align_seq(unsigned char **aa0, int n0,
+ struct beststr **bestp_arr, int nbest,
+ struct pstruct *ppst, struct mngmsg *m_msp,
+ const struct mng_thr *m_bufi_p
+#if !defined(COMP_THR) && !defined(PCOMPLIB)
+ , void **f_str
+#endif
+ );
+
+/* pre-alignment */
+extern void
+pre_load_best(unsigned char *aa1, int maxn,struct beststr **bbp_arr,
+ int nbest, struct mngmsg *m_msp, int debug);
+
+float
+calc_fpercent_id(float scale, int n_ident, int n_alen, int tot_ident, float fail);
+
+extern int E1_to_s(double e_val, int n0, int n1, int db_size, void *pu);
+
+extern void discons(FILE *fd, const struct mngmsg *m_msg,
+ char *seqc0, char *seqc0a,
+ char *seqc1, char *seqc1a,
+ char *seqca, int *cumm_seq_score, int nc,
+ int n0, int n1, char *name0, char *name1, int nml,
+ struct a_struct *aln);
+
+extern void disgraph(FILE *fd, int n0, int n1,
+ float percent, int score,
+ int min0, int min1, int max0, int max1, long sq0off,
+ char *name0, char *name1, int nml, int llen, int markx);
+
+extern double find_z(int score, double escore, int length, double comp,void *);
+extern double zs_to_bit(double, int, int);
+extern double s_to_bit(int score, int n0, int n1, void *pu);
+extern double bit_to_E (double bit, int n0, int n1, long db_size, void *pu);
+extern double zs_to_E(double zs, int n1, int dnaseq, long db_size, struct db_str db);
+
+extern void
+do_url1(FILE *, const struct mngmsg *, const struct pstruct *, char *, int,
+ const struct a_struct *, const char *,
+ const struct annot_str *, const struct annot_str *);
+
+#ifndef A_MARK
+#define A_MARK ">>"
+#endif
+
+/* this version does not check for m_msg->e_cut because nshow/nbest has
+ already been set to limit on e_cut */
+
+/* showalign() - display alignments for the best-scoring library sequences.
+
+   At most min(nbest, m_msp->ashow, m_msp->nshow) entries of bptr[] are
+   shown (a negative ashow means "use nshow").  For each entry:
+     1. the description line comes from bbp->mseq->bline, or is re-read
+        from the library with RANLIB() when missing or too short;
+     2. the sequence comes from bbp->seq, or is re-read with re_getlib()
+        into aa1save when the residues (or, for m_msp->ann_flg==1, the
+        annotation) are missing;
+     3. the alignment encoding is produced by do_walign() unless bit 0
+        of bbp->have_ares is already set (then only pre_cons() is called);
+     4. every entry of the bbp->a_res list is turned into alignment
+        strings with calc_cons_a() and printed through do_show() in the
+        format selected by m_msp->markx.
+
+   fp             output stream
+   aa0            query sequences, indexed by bbp->frame
+   aa1save        scratch buffer (maxn residues) for re-read library
+                  sequences
+   bptr, nbest    best-score entries and their count
+   qlib           not referenced in this function
+   m_msp, ppst    run/message and scoring parameters
+   info_gstring2  not referenced in this function
+   f_str          per-frame function state, indexed by bbp->frame
+   m_bufi_p       passed through to buf_align_seq()
+
+   Exits with status 1 if a library file cannot be re-opened. */
+void showalign (FILE *fp, unsigned char **aa0, unsigned char *aa1save, int maxn,
+                struct beststr **bptr, int nbest, int qlib,
+                struct mngmsg *m_msp, struct pstruct *ppst,
+                char *info_gstring2
+                , void **f_str, struct mng_thr *m_bufi_p
+                )
+{
+  unsigned char *aa1, *aa1a;
+  char tmp_str[20];
+  char info_str[200];
+  char bline[2048], *qline_p, *bline_p, *bl_ptr, *bp, *bp1, fmt[40];
+  struct dyn_string_str *annot_var_dyn, *align_code_dyn;
+  char *annot_var_s10;
+  int tmp_len, ttmp_len, l_llen, desc_llen, ranlib_done;
+  char name0[80], name0s[80], name1[200];
+  char l_name[128], link_name[140];	/* link name */
+  int istop, i = 0, ib, nml, first_line;
+  int l_ashow;
+  int n1tot;
+  struct beststr *bbp;
+  struct a_res_str *cur_ares_p;
+  struct rstruct *rst_p;
+  int nc, lc, maxc;
+  double lzscore, lzscore2, lbits;
+  struct a_struct l_aln, *l_aln_p;
+  float percent, gpercent;
+  /* strings, lengths for conventional alignment */
+  char *seqc0, *seqc0a, *seqc1, *seqc1a, *seqca;
+  int *cumm_seq_score;
+  /* strings, lengths, for encoded alignment for MX10 */
+  char *seq_code=NULL, *annot_code=NULL;
+  int seq_code_len=0, annot_code_len=0;
+  long loffset, l_off;
+  long qt_offset, lt_offset;
+  int lsw_score, l_score0;
+  char html_pre_E[120], html_post_E[120];
+  int disp_dna_align = ((m_msp->qdnaseq>0) && (m_msp->ldb_info.ldnaseq > 0));
+  int score_delta = 0;
+#ifdef LALIGN
+  int lalign_repeat_thresh_done = 0;
+#endif
+
+  int n1;
+  struct lmf_str *m_fptr;
+  int ngap;
+
+  align_code_dyn = init_dyn_string(4096, 4096);
+  annot_var_dyn = init_dyn_string(4096, 4096);
+
+  /* skip a leading "gi|12345|" in the query title, but only when another
+     '|'-delimited field (db|acc) follows it */
+  qline_p = m_msp->qtitle;
+  if (!strncmp(m_msp->qtitle,"gi|",3)) {
+    qline_p = strchr(qline_p+4,'|');
+    /* check for additional '|'s associated with NCBI gi|12346|db|acc entry */
+    if (!qline_p || strchr(qline_p+1,'|')==NULL) {
+      qline_p = m_msp->qtitle;
+    }
+    else { qline_p += 1;}
+  }
+
+  /* work on a private copy of the alignment display parameters */
+  memcpy(&l_aln, &(m_msp->aln),sizeof(struct a_struct));
+  l_aln_p = &l_aln;	/* aln_p = &m_msp->aln; */
+
+  /* set the name0,1 label length */
+  if (m_msp->markx & (MX_M10FORM+MX_MBLAST)) nml = 12;
+  else if (m_msp->markx & MX_M11OUT) nml = MAX_UID;
+  else nml = m_msp->nmlen;
+
+  /* name0s: query name without a leading '>' and cut at the first blank */
+  if (strlen(qline_p) > 0) {
+    if (qline_p[0]=='>') {SAFE_STRNCPY(name0s,qline_p+1,sizeof(name0s));}
+    else {SAFE_STRNCPY(name0s,qline_p,sizeof(name0s));}
+  }
+  else {
+    SAFE_STRNCPY(name0s,m_msp->tname,sizeof(name0s));
+  }
+
+  if ((bp=strchr(name0s,' '))!=NULL) *bp='\0';
+
+  /* NOTE(review): name0 has not been filled in yet at this point; it is
+     (re)built from name0s for every library sequence below - confirm
+     whether this store is still needed */
+  if (m_msp->revcomp) name0[nml-1]='-';
+
+  if (m_msp->markx & MX_HTML) {
+    SAFE_STRNCPY(html_pre_E,"<font color=\"darkred\">",sizeof(html_pre_E));
+    SAFE_STRNCPY(html_post_E,"</font>",sizeof(html_post_E));
+
+  }
+  else {
+    html_pre_E[0] = html_post_E[0] = '\0';
+  }
+
+  /* line length for the description; widened (max 200) for -m 9 output
+     unless an identity code was requested */
+  desc_llen = l_llen = m_msp->aln.llen;
+  if ((m_msp->markx & MX_M9SUMM) && (m_msp->show_code != SHOW_CODE_ID && m_msp->show_code != SHOW_CODE_IDD)) {
+    l_llen += 40;
+    if (l_llen > 200) l_llen=200;
+  }
+
+  /* build the printf format used for the ">>name description (length)" line */
+  if (m_msp->markx & MX_MBLAST) {
+    sprintf(fmt,">%%-%ds\n%%sLength=%%d\n",l_llen+15);
+    desc_llen = l_llen+25;
+  }
+  else {
+    sprintf(fmt,"%s%%-%ds (%%d %s)\n",A_MARK,l_llen-5,m_msp->sqnam);
+  }
+
+  if (m_msp->std_output && !(m_msp->markx&MX_M10FORM)) fprintf (fp,"\n");
+
+  l_ashow = m_msp->ashow;
+  if (l_ashow < 0) l_ashow = m_msp->nshow;
+  istop = min(min(nbest,l_ashow),m_msp->nshow);
+
+  /* tmp_len: number of description characters that are required */
+  tmp_len = sizeof(bline)-1;
+  if (!(m_msp->markx & MX_M10FORM) && !m_msp->long_info) {tmp_len = l_llen-5;}
+
+  /* don't call pre_load_best if we already have sequences and alignments */
+  if (!m_msp->align_done) {
+    if (!m_msp->pre_load_done) { pre_load_best(aa1save, maxn, bptr, istop, m_msp, ppst->debug_lib); }
+
+    /* don't call buf_align_seq if the algorithm does not support
+       pre-alignment */
+    if (ppst->can_pre_align) {
+#ifdef LALIGN
+      for (ib=0; ib<istop; ib++) {
+        bbp = bptr[ib];
+        bbp->repeat_thresh =
+          min(E1_to_s(ppst->e_cut_r, m_msp->n0, bbp->seq->n1,ppst->zdb_size, m_msp->pstat_void),
+              bbp->rst.score[ppst->score_ix]);
+      }
+      lalign_repeat_thresh_done = 1;
+#endif
+
+      buf_align_seq(aa0, m_msp->n0, bptr, istop, ppst, m_msp, m_bufi_p
+#if !defined(COMP_THR) && !defined(PCOMPLIB)
+                    , f_str
+#endif
+                    );
+    }
+  }
+
+  for (ib=0; ib<istop; ib++) {
+    bbp = bptr[ib];
+
+#ifdef LALIGN
+    if (!lalign_repeat_thresh_done) {
+      bbp->repeat_thresh =
+        min(E1_to_s(ppst->e_cut_r, m_msp->n0, bbp->seq->n1,ppst->zdb_size, m_msp->pstat_void),
+            bbp->rst.score[ppst->score_ix]);
+    }
+#endif
+    /* preload stuff guaranteed to be in bbp->seq */
+    n1 = bbp->seq->n1;
+    aa1 = bbp->seq->aa1b;
+    if (bbp->seq->annot_p != NULL) aa1a = bbp->seq->annot_p->aa1_ann;
+    else aa1a = NULL;
+    l_off = bbp->seq->l_off;
+    loffset = bbp->seq->l_offset;
+
+    /* make sure we have a description */
+    if (bbp->mseq->bline == NULL || bbp->mseq->bline_max < tmp_len) {
+      if ((m_fptr=re_openlib(bbp->mseq->m_file_p,!m_msp->quiet))==NULL)
+        exit(1);
+      RANLIB(bline,tmp_len,bbp->mseq->lseek,bbp->mseq->libstr,bbp->mseq->m_file_p);
+      bline[tmp_len]='\0';
+      ranlib_done = 1;
+    }
+    else {
+      ranlib_done = 0;
+      SAFE_STRNCPY(bline, bbp->mseq->bline, sizeof(bline));
+    }
+
+    /* make sure we have a sequence (and its annotation, if requested);
+       the annotation test must look at the pointer itself - the address
+       of the member can never be NULL */
+    if (bbp->seq->aa1b == NULL || (m_msp->ann_flg==1 && bbp->seq->annot_p==NULL)) {
+      if (!ranlib_done) {
+        if ((m_fptr=re_openlib(bbp->mseq->m_file_p,!m_msp->quiet))==NULL)
+          exit(1);
+        RANLIB(bline,tmp_len,bbp->mseq->lseek,bbp->mseq->libstr,bbp->mseq->m_file_p);
+        bline[tmp_len]='\0';
+        ranlib_done = 1;
+      }
+
+      n1 = re_getlib(aa1save, (m_msp->ann_flg==1) ? &(bbp->seq->annot_p) : NULL, maxn,
+                     m_msp->ldb_info.maxt3,
+                     m_msp->ldb_info.l_overlap,bbp->mseq->cont,m_msp->ldb_info.term_code,
+                     &loffset,&l_off,bbp->mseq->m_file_p);
+      aa1 = aa1save;
+      /* annot_p may still be NULL if the re-read sequence carries no annotation */
+      if (m_msp->ann_flg==1 && bbp->seq->annot_p != NULL && bbp->seq->annot_p->aa1_ann) {
+        aa1a = bbp->seq->annot_p->aa1_ann;
+      }
+    }
+
+#ifdef DEBUG
+    if (n1 != bbp->seq->n1) {
+      fprintf(stderr," library sequence: %s lengths differ: %d != %d\n",
+              bline,bbp->seq->n1, n1);
+      fprintf(stderr, "offset is: %lld\n",bbp->mseq->lseek);
+    }
+#endif
+
+    /* make sure we have an alignment encoding */
+    if (!(bbp->have_ares & 0x1)) {
+
+      bbp->a_res = do_walign(aa0[bbp->frame],m_msp->n0, aa1, n1,
+                             bbp->frame, bbp->repeat_thresh, ppst,
+                             f_str[bbp->frame], &bbp->have_ares);
+    }
+    else {
+      pre_cons(aa1,n1,bbp->frame,f_str[bbp->frame]);
+    }
+
+    cur_ares_p = bbp->a_res;
+    /* current do_walign()'s provide valid rst */
+    /*
+    memcpy(&cur_ares_p->rst,&bbp->rst, sizeof(struct rstruct));
+    */
+    aln_func_vals(bbp->frame, l_aln_p);
+
+    /* no description available: fall back to ">" + library name */
+    if (strlen(bline)==0) {
+      SAFE_STRNCPY(bline,">",sizeof(bline));
+      SAFE_STRNCAT(bline,m_msp->lname,l_llen-5);
+    }
+
+    bline_p = bline;
+    /* always remove "gi|" for alignments */
+    if (!strncmp(bline,"gi|",3)) {
+      bline_p = strchr(bline+4,'|');
+      if (!bline_p || !strchr(bline_p+1,'|')) {bline_p = bline;}
+      else bline_p += 1;
+    }
+
+    /* re-format bline */
+    while ((bp=strchr(bline_p,'\n'))!=NULL) *bp=' ';
+    if (m_msp->long_info) {
+      ttmp_len = strlen(bline_p);
+      bl_ptr = bline_p;
+      if (!(m_msp->markx & MX_M10FORM)) {
+        /* wrap the long description: replace the last blank at or before
+           desc_llen (but after column 10) with a newline, and repeat */
+        while (ttmp_len > desc_llen) {
+          for (i=desc_llen; i>10; i--)
+            if (bl_ptr[i]==' ') {
+              bl_ptr[i]='\n';
+              break;
+            }
+          if (i <= 10) break;
+          ttmp_len -= i;
+          bl_ptr += i;
+        }
+      }
+      bline[tmp_len]='\0';
+    }
+
+    n1tot = (bbp->mseq->n1tot_p) ? *bbp->mseq->n1tot_p : bbp->seq->n1;
+
+    /* name1 is used to label the display */
+    /* bline_p does not have gi|12345, but could have pf26|12345 or sp|P09488 */
+    SAFE_STRNCPY(name1,bline_p,sizeof(name1));
+
+    if (!(m_msp->markx & MX_M10FORM)) name1[nml]='\0';
+    if ((bp = strchr(name1,' '))!=NULL) *bp = '\0';
+
+    /* l_name is used to build an HTML link from the bestscore line to
+       the alignment.  It can also be used to discriminate multiple hits
+       from the same long sequence.  Text must match that in mshowbest.c */
+
+    SAFE_STRNCPY(l_name,bline_p,sizeof(l_name));
+    l_name[sizeof(l_name)-1]='\0';
+    if ((bp = strchr(l_name,' '))!=NULL) *bp = '\0';
+    if ((bp=strchr(&l_name[6],'|'))!=NULL) *bp='\0'; 	/* increase to [6] from [3] to allow longer db names "ref", "unk", */
+    if (m_msp->nframe > 2) sprintf(&l_name[strlen(l_name)],"_%d",bbp->frame+1);
+    else if (m_msp->qframe >= 0 && bbp->frame == 1) {
+      SAFE_STRNCAT(l_name,"_r",sizeof(l_name));
+    }
+    if (bbp->mseq->cont-1 > 0) {
+      sprintf(tmp_str,":%d",bbp->mseq->cont-1);
+      SAFE_STRNCAT(l_name,tmp_str,sizeof(l_name)-strlen(l_name));
+    }
+
+    if (m_msp->markx & MX_MBLAST) { SAFE_STRNCPY(name1,"Sbjct",sizeof(name1));}
+
+    if (!(m_msp->markx & MX_M10FORM)) name1[nml]='\0';
+
+    /* print out score information; */
+
+    if (m_msp->markx & MX_HTML ) {
+      SAFE_STRNCPY(link_name, l_name, sizeof(link_name));
+      fprintf (fp,"<a name=\"%s\"><pre>",link_name);
+    }
+    SAFE_STRNCPY(name0,name0s,nml+1);
+    if (m_msp->markx & MX_MBLAST) { SAFE_STRNCPY(name0,"Query",sizeof(name0));}
+    name0[nml]='\0';
+
+    if (ppst->zsflag%10 == 6) {
+      sprintf(info_str," comp: %.5f H: %.5f",bbp->rst.comp,bbp->rst.H);
+    }
+    else info_str[0]='\0';
+
+    if (m_msp->markx & MX_M11OUT) {
+      qt_offset = m_msp->q_offset + (m_msp->q_off-1)+(m_msp->sq0off);
+      lt_offset = loffset + (bbp->seq->l_off-1) + (m_msp->sq1off);
+      fprintf (fp, "s {\n \"%s\" %ld %ld \n \"%s\" %ld %ld\n}\n",
+               name0, qt_offset, qt_offset + m_msp->n0 - 1,
+               name1, lt_offset, lt_offset + bbp->seq->n1 - 1);
+      fprintf (fp, "h {\n \"%s\"\n \"%s\"\n}\n", qline_p, bline_p);
+    }
+
+
+    /* enables >>seq_acc seq_description length for first alignment, >- after */
+    first_line = 1;
+
+    while (cur_ares_p != NULL && cur_ares_p->nres > 0) {
+
+      /* estimate space for alignment consensus */
+      if (m_msp->aln.showall==1) {
+        maxc = cur_ares_p->nres + max(cur_ares_p->min0,cur_ares_p->min1)+
+          max((m_msp->n0-cur_ares_p->max0),(n1-cur_ares_p->max1))+4;
+      }
+      else {
+        maxc = cur_ares_p->nres + 4*m_msp->aln.llen+4;
+      }
+
+      /* get space to put the sequence alignment consensus */
+      initseq(&seqc0, &seqc1, &seqca, maxc);
+      cumm_seq_score = NULL;
+      if (m_msp->markx & MX_RES_ALIGN_SCORE) {
+        /* on failure the alignment is still shown, without per-residue scores */
+        if ((cumm_seq_score = (int *)calloc(maxc,sizeof(int)))==NULL) {
+          fprintf(stderr,"***error*** [%s:%d] cannot allocate cumm_seq_score[%d]\n",
+                  __FILE__, __LINE__, (int)(maxc*sizeof(int)));
+        }
+      }
+      if (m_msp->ann_flg && (m_msp->aa0a != NULL || aa1a!=NULL || m_msp->annot_p)) {
+        initseq_ann(&seqc0a, &seqc1a, maxc);
+      }
+      else { seqc0a = seqc1a = NULL;}
+
+      calc_astruct(l_aln_p, cur_ares_p, f_str[bbp->frame]);
+
+      calc_coord(m_msp->n0,bbp->seq->n1,
+                 m_msp->q_offset+(m_msp->q_off-1)+(m_msp->sq0off-1),
+                 loffset+(l_off-1)+(m_msp->sq1off-1),
+                 l_aln_p);
+
+#ifdef LALIGN
+      if ((m_msp->markx & MX_M11OUT) == MX_M11OUT) {	/* lav output - skip lots of stuff */
+        lsw_score = cur_ares_p->sw_score;
+        lzscore = find_z(lsw_score, 0.0, bbp->seq->n1, 0.0, m_msp->pstat_void);
+        lzscore2 = find_z(lsw_score, 0.0, bbp->seq->n1, 0.0, m_msp->pstat_void2);
+        lbits = zs_to_bit(lzscore, m_msp->n0, bbp->seq->n1);
+
+        NULL_dyn_string(annot_var_dyn);
+        NULL_dyn_string(align_code_dyn);
+
+        lc=calc_code(aa0[bbp->frame],m_msp->n0,
+                     aa1,n1,
+                     l_aln_p, cur_ares_p,
+                     ppst,
+                     align_code_dyn,
+                     /* seqc0, maxc, */
+                     m_msp->ann_arr,
+                     m_msp->aa0a, m_msp->annot_p,
+                     aa1a, bbp->seq->annot_p,
+                     annot_var_dyn,
+                     &score_delta,
+                     f_str[bbp->frame],m_msp->pstat_void,m_msp->show_code + SHOW_ANNOT_FULL);
+
+        if (lc > 0) {
+          percent = (100.0*(float)l_aln_p->nident)/(float)lc;
+        }
+        else { percent = -1.00; }
+
+        fprintf (fp, "a {\n");
+        if (annot_var_dyn->string[0]) {
+          /* print each annotation line as a "# ..." comment */
+          bp = annot_var_dyn->string;
+          while ((bp1=strchr(bp, '\n'))) {
+            *bp1 = '\0';
+            fprintf (fp, "# %s\n", bp);
+            *bp1 = '\n';
+            bp = bp1 + 1;
+          }
+        }
+        fprintf (fp, " s %d %.1f\n", lsw_score, lbits);
+        do_lav(fp, l_aln_p, align_code_dyn->string, percent, 0);
+
+        if (ppst->nseq == 1) {
+          fprintf (fp, "a {\n");
+          fprintf (fp, " s %d %.1f\n", lsw_score, lbits);
+          do_lav(fp, l_aln_p, align_code_dyn->string, percent, 1);
+        }
+
+        /* release the per-alignment buffers before skipping to the next
+           alignment */
+        freeseq(&seqc0,&seqc1, &seqca);
+        freeseq_ann(&seqc0a, &seqc1a);
+        free(cumm_seq_score);
+        cumm_seq_score = NULL;
+
+        cur_ares_p = cur_ares_p->next;
+        continue;
+      }
+#endif		/* ifdef LALIGN */
+
+      NULL_dyn_string(annot_var_dyn);
+      NULL_dyn_string(align_code_dyn);
+
+      nc=calc_cons_a(aa0[bbp->frame],m_msp->n0, aa1, n1,
+                     &lc,l_aln_p, cur_ares_p, ppst,
+                     seqc0, seqc1, seqca, cumm_seq_score,
+                     m_msp->ann_arr,
+                     m_msp->aa0a, m_msp->annot_p, seqc0a,
+                     aa1a, bbp->seq->annot_p, seqc1a,
+                     &score_delta,
+                     annot_var_dyn,
+                     f_str[bbp->frame],
+                     m_msp->pstat_void
+                     );
+
+      if (cur_ares_p->score_delta > 0) score_delta -= cur_ares_p->score_delta;
+
+      percent = calc_fpercent_id(100.0, l_aln_p->nident,lc,m_msp->tot_ident, -1.0);
+
+      ngap = l_aln_p->ngap_q + l_aln_p->ngap_l;
+#ifndef SHOWSIM
+      gpercent = calc_fpercent_id(100.0,l_aln_p->nident,lc-ngap,m_msp->tot_ident, -1.0);
+#else
+      gpercent = calc_fpercent_id(100.0,l_aln_p->nsim,lc,m_msp->tot_ident, -1.0);
+#endif
+
+      lsw_score = cur_ares_p->sw_score + score_delta;
+      /* the ">>name description (length)" line is printed only for the
+         first alignment of each library sequence, and never for lav
+         (MX_M11OUT) output */
+      if (first_line && !(m_msp->markx&MX_M11OUT )) {
+        if ((m_msp->markx & MX_ATYPE)!=7 && !(m_msp->markx & MX_M10FORM)) {
+          if (m_msp->markx & MX_MBLAST) {
+            /* provides >>id description (length) line */
+            fprintf (fp, fmt,bline_p,annot_var_dyn->string,n1tot);
+          }
+          else {
+            fprintf (fp, fmt,bline_p,n1tot);
+          }
+        }
+        else if (m_msp->markx & MX_M10FORM) {
+          if (annot_var_dyn->string[0]) {	/* have annotation with '\n', replace with ';' in copy */
+            if ((annot_var_s10=(char *)calloc(strlen(annot_var_dyn->string)+1,sizeof(char)))==NULL) {
+              fprintf(stderr," ***error*** [%s:%d] -m 10 cannot allocate annot_var_s10[%d]\n",
+                      __FILE__,__LINE__,(int)strlen(annot_var_dyn->string));
+              fprintf (fp,">>%s\n",bline_p);
+            }
+            else {
+              SAFE_STRNCPY(annot_var_s10,annot_var_dyn->string,strlen(annot_var_dyn->string));
+              bp = annot_var_s10;
+              while ((bp = strchr(bp+1,'\n'))) {
+                if (bp[-1] != ';') {*bp = ';';}
+                else {*bp = ' ';}
+              }
+              fprintf (fp,">>%s;%s\n",bline_p, annot_var_s10);
+              free(annot_var_s10); annot_var_s10 = NULL;
+            }
+          }
+          else {
+            fprintf (fp,">>%s\n",bline_p);
+          }
+        }
+      }
+
+      /* this is required because cur_ares_p can carry scores from an
+         alignment without -S low-complexity re-scored using low
+         complexity */
+#ifndef LALIGN
+      if (first_line) {
+        rst_p = &bbp->rst;
+        first_line = 0;
+      }
+      else {
+        rst_p = &cur_ares_p->rst;
+      }
+#else
+      /* ensures that LALIGN alignments do not report 100% match
+         values */
+      rst_p = &cur_ares_p->rst;
+      first_line = 0;
+#endif
+
+      l_score0 = rst_p->score[ppst->score_ix] + score_delta;
+
+      /* sanity check: the consensus strings should not be longer than nc */
+      if (max(strlen(seqc0),strlen(seqc1)) > nc) {
+        fprintf(stderr," mshowalign: nc/maxc: %d/%d seqc0/1: %lu/%lu\n",
+                nc,maxc,strlen(seqc0),strlen(seqc1));
+      }
+
+#ifdef DEBUG
+      /*
+      if (lsw_score < bbp->rst.score[ppst->score_ix]) {
+        fprintf(stderr," *** warning - SW score=%d < opt score=%d ***\n",
+                lsw_score, bbp->rst.score[ppst->score_ix]);
+      }
+      */
+#endif
+      calc_coord(m_msp->n0,bbp->seq->n1,
+                 m_msp->q_offset+(m_msp->q_off-1)+(m_msp->sq0off-1),
+                 bbp->seq->l_offset+(bbp->seq->l_off-1)+(m_msp->sq1off-1),
+                 l_aln_p);
+
+      lzscore = find_z(l_score0, rst_p->escore, bbp->seq->n1, rst_p->comp, m_msp->pstat_void);
+      if (ppst->zsflag > 20) {
+        lzscore2 = find_z(l_score0, rst_p->escore, bbp->seq->n1, rst_p->comp, m_msp->pstat_void2);
+      }
+      lbits = zs_to_bit(lzscore, m_msp->n0, bbp->seq->n1);
+
+      if (m_msp->markx & MX_MBLAST) {
+        /* BLAST-like score block */
+        fprintf(fp, "\n Score = %.1f bits (%d), Expect = %.1g\n",
+                lbits, rst_p->score[ppst->score_ix] + score_delta,
+                zs_to_E(lzscore, bbp->seq->n1, ppst->dnaseq, ppst->zdb_size, m_msp->db));
+
+        fprintf(fp, " Identities = %d/%d (%d%%)", l_aln_p->nident, lc-ngap,
+                (int)((100.0*(float)l_aln_p->nident+0.5)/(float)(lc-ngap)));
+
+        if (!disp_dna_align) {
+          fprintf(fp, ", Positives = %d/%d (%d%%)", l_aln_p->npos, lc-ngap,
+                  (int)((100.0*(float)l_aln_p->npos+0.5)/(float)(lc-ngap)));
+        }
+
+        fprintf(fp, ", Gaps = %d/%d (%d%%)\n",ngap, lc-ngap,
+                (int)((100.0*(float)(l_aln_p->ngap_q+l_aln_p->ngap_l)+0.5)/(float)(lc-ngap)));
+
+        if (disp_dna_align) {
+          if (m_msp->qframe > 1 && bbp->frame > 0) {
+            fprintf (fp, " Strand=Minus/Plus\n");
+          }
+          else {
+            fprintf (fp, " Strand=Plus/Plus\n");
+          }
+        }
+        else {
+          if (m_msp->nframe > 2) {
+            fprintf (fp, " Frame= %d\n",bbp->frame+1);
+          }
+          else if (m_msp->nframe > 1) {
+            fprintf (fp, " Frame = %s\n",(bbp->frame>0 ? "Reverse" : "Forward"));
+          }
+          else if (m_msp->qframe > 1) {
+            fprintf (fp, " Frame = %s\n",(bbp->frame>0 ? "Reverse" : "Forward"));
+          }
+        }
+      }
+      else if ((m_msp->markx & MX_ATYPE)!=7 && !(m_msp->markx & MX_M10FORM)) {
+        if (annot_var_dyn->string[0]) {
+          if (m_msp->markx & MX_HTML) {
+            fprintf(fp,"<!-- ANNOT_START \"%s\" -->",link_name);}
+          /* ensure that last character is "\n" */
+          if (annot_var_dyn->string[strlen(annot_var_dyn->string)-1] != '\n') {
+            annot_var_dyn->string[strlen(annot_var_dyn->string)-1] = '\n';
+          }
+          fputs(annot_var_dyn->string, fp);
+          if (m_msp->markx & MX_HTML) {fputs("<!-- ANNOT_STOP -->",fp);}
+        }
+
+        /* this code makes sense for library searches, but not for
+           multiple non-intersecting alignments */
+
+#ifndef LALIGN
+        if (m_msp->nframe > 2)
+          fprintf (fp, "Frame: %d",bbp->frame+1);
+        else if (m_msp->nframe > 1)
+          fprintf (fp, "Frame: %c",(bbp->frame? 'r': 'f'));
+        else if (m_msp->qframe >= 0 && bbp->frame > 0 ) {
+          fputs("rev-comp",fp);
+          name0[nml-1]='\0';
+          if (!(m_msp->markx & MX_MBLAST)) SAFE_STRNCAT(name0,"-",sizeof(name0)-1);
+        }
+
+        if (m_msp->arelv > 0)
+          fprintf (fp, " %s: %3d", m_msp->alab[0],rst_p->score[0] + score_delta);
+        if (m_msp->arelv > 1)
+          fprintf (fp, " %s: %3d", m_msp->alab[1],rst_p->score[1] + score_delta);
+        if (m_msp->arelv > 2)
+          fprintf (fp, " %s: %3d", m_msp->alab[2],rst_p->score[2] + score_delta);
+        fprintf (fp,"%s",info_str);
+        if (ppst->zsflag>=0) {
+          fprintf (fp, " Z-score: %4.1f bits: %3.1f %sE(%ld): %4.2g%s",
+                   lzscore, lbits,
+                   html_pre_E, ppst->zdb_size,
+                   zs_to_E(lzscore, bbp->seq->n1, ppst->dnaseq, ppst->zdb_size, m_msp->db),
+                   html_post_E);
+          if (ppst->zsflag > 20) {
+            fprintf(fp," E2(): %4.2g",zs_to_E(lzscore2, bbp->seq->n1, ppst->dnaseq, ppst->zdb_size, m_msp->db));
+          }
+        }
+        fprintf (fp, "\n");
+
+#else	/* LALIGN */
+        if ((m_msp->markx & MX_M11OUT) == 0) {
+          fprintf (fp, " %s score: %d; ", m_msp->alabel, lsw_score);
+          fprintf (fp," %3.1f bits; E(%ld) < %.2g\n", lbits, ppst->zdb_size,
+                   zs_to_E(lzscore, bbp->seq->n1, ppst->dnaseq, ppst->zdb_size, m_msp->db));
+        }
+#endif
+      }
+      else if (m_msp->markx & MX_M10FORM) {
+#ifndef LALIGN
+        if (m_msp->qframe > -1) {
+          if (m_msp->nframe > 2) {
+            fprintf(fp,"; %s_frame: %d\n",m_msp->f_id0,bbp->frame+1);
+          }
+          else {
+            fprintf(fp,"; %s_frame: %c\n",m_msp->f_id0,(bbp->frame > 0? 'r':'f'));
+          }
+        }
+        fprintf (fp, "; %s_%s: %3d\n", m_msp->f_id0,m_msp->alab[0],cur_ares_p->rst.score[0]+score_delta);
+        if (m_msp->arelv > 1)
+          fprintf (fp,"; %s_%s: %3d\n", m_msp->f_id0,m_msp->alab[1],cur_ares_p->rst.score[1]+score_delta);
+        if (m_msp->arelv > 2)
+          fprintf (fp,"; %s_%s: %3d\n", m_msp->f_id0,m_msp->alab[2],cur_ares_p->rst.score[2]+score_delta);
+        if (info_str[0]) fprintf(fp,"; %s_info: %s\n",m_msp->f_id0,info_str);
+        if (ppst->zsflag>=0)
+          fprintf (fp,"; %s_z-score: %4.1f\n; %s_bits: %3.1f\n; %s_expect: %6.2g\n",
+                   m_msp->f_id0,bbp->zscore,
+                   m_msp->f_id0,zs_to_bit(bbp->zscore, m_msp->n0, bbp->seq->n1),
+                   m_msp->f_id0,bbp->rst.escore);
+#else
+        if ((m_msp->markx & MX_M11OUT) == 0) {
+          fprintf (fp,"; %s_%s: %d\n", m_msp->f_id0, m_msp->alab[0], lsw_score);
+          fprintf (fp,"; %s_z-score: %4.1f\n; %s_bits: %3.1f\n; %s_expect: %6.2g\n",
+                   m_msp->f_id0, lzscore, m_msp->f_id0, lbits, m_msp->f_id0,
+                   zs_to_E(lzscore, bbp->seq->n1, ppst->dnaseq, ppst->zdb_size, m_msp->db));
+        }
+#endif
+      }
+
+      do_show(fp, m_msp->n0, bbp->seq->n1, lsw_score, name0, name1, nml,
+              link_name,
+              m_msp, ppst, seqc0, seqc0a, seqc1, seqc1a, seqca, cumm_seq_score,
+              nc, percent, gpercent, lc, l_aln_p, annot_var_dyn->string,
+              m_msp->annot_p, bbp->seq->annot_p);
+
+      /* display the encoded alignment left over from showbest()*/
+
+      if ((m_msp->markx & MX_M10FORM) &&
+          (m_msp->markx & MX_M9SUMM) &&
+          ((m_msp->show_code & SHOW_CODE_ALIGN) == SHOW_CODE_ALIGN)) {
+
+        seq_code = cur_ares_p->aln_code;
+        seq_code_len = cur_ares_p->aln_code_n;
+        annot_code = cur_ares_p->annot_code;
+        annot_code_len = cur_ares_p->annot_code_n;
+
+        if (seq_code_len > 0 && seq_code != NULL) {
+          fprintf(fp,"; al_code: %s\n",seq_code);
+          /* free(seq_code); -- this is now freed in comp_lib2.c */
+          if (annot_code_len > 0 && annot_code != NULL) {
+            fprintf(fp,"; al_code_ann: %s\n",annot_code);
+            /* free(ann_code); -- this is now freed in comp_lib2.c */
+          }
+        }
+      }
+
+      if (m_msp->markx & MX_HTML) fprintf(fp,"</pre><hr />");
+      fflush(fp);
+
+      /* release everything that was allocated for this alignment */
+      freeseq(&seqc0,&seqc1, &seqca);
+      freeseq_ann(&seqc0a, &seqc1a);
+      free(cumm_seq_score);	/* NULL unless MX_RES_ALIGN_SCORE was set */
+      cumm_seq_score = NULL;
+
+      cur_ares_p = cur_ares_p->next;
+
+      if (cur_ares_p != NULL) {
+        if (m_msp->markx & MX_HTML) {
+          sprintf(link_name,"%s_%d",l_name, cur_ares_p->index);
+          fprintf (fp,"<a name=\"%s\"><pre>",link_name);
+        }
+        else {
+          if (!(m_msp->markx & MX_MBLAST)) fprintf(fp,">--\n");
+        }
+      }		/* done finishing up */
+    }		/* while (cur_ares_p) */
+    /* we are done displaying the alignment - be sure to free a_res memory */
+  }
+
+  /* both dynamic strings were created at the top of this function */
+  free_dyn_string(annot_var_dyn);
+  free_dyn_string(align_code_dyn);
+
+  if (!(m_msp->markx & (MX_M8OUT+MX_HTML))) fprintf(fp,"\n");
+}
+
+/* do_show() - print one alignment in the layout selected by m_msp->markx:
+     MX_AMAP with (markx & MX_ATYPE)==7 : disgraph() only;
+     MX_M10FORM                         : "; key: value" lines, then discons();
+     otherwise                          : identity summary (unless MX_MBLAST),
+                                          optional HTML links (do_url1),
+                                          optional disgraph(), then discons(). */
+void do_show(FILE *fp, int n0,int n1, int score,
+             char *name0, char *name1, int nml, char *link_name,
+             const struct mngmsg *m_msp, const struct pstruct *ppst,
+             char *seqc0, char *seqc0a, char *seqc1, char *seqc1a,
+             char *seqca, int *cumm_seq_score, int nc,
+             float percent, float gpercent, int lc,
+             struct a_struct *aln, const char *annot_var_s,
+             const struct annot_str * q_annot_p,
+             const struct annot_str * l_annot_p)
+{
+  int graph_n0;
+  int want_map = ((m_msp->markx & MX_AMAP) != 0);
+  int atype_is7 = ((m_msp->markx & MX_ATYPE) == 7);
+  int html_out = ((m_msp->markx & MX_HTML) != 0);
+
+  /* graph-only display */
+  if (want_map && atype_is7) {
+    disgraph(fp, n0, n1, percent, score,
+             aln->amin0, aln->amin1, aln->amax0, aln->amax1, m_msp->sq0off,
+             name0, name1, nml, aln->llen, m_msp->markx);
+    return;
+  }
+
+  /* -m 10 style "; key: value" output */
+  if (m_msp->markx & MX_M10FORM) {
+    if (ppst->sw_flag && m_msp->arelv>0) {
+      fprintf(fp,"; %s_score: %d\n",m_msp->f_id1,score);
+    }
+    fprintf(fp,"; %s_ident: %5.3f\n",m_msp->f_id1,percent/100.0);
+#ifndef SHOWSIM
+    fprintf(fp,"; %s_gident: %5.3f\n",m_msp->f_id1,gpercent/100.0);
+#else
+    fprintf(fp,"; %s_sim: %5.3f\n",m_msp->f_id1,gpercent/100.0);
+#endif
+
+    fprintf(fp,"; %s_overlap: %d\n",m_msp->f_id1,lc);
+    discons(fp, m_msp,
+            seqc0, seqc0a, seqc1, seqc1a, seqca, cumm_seq_score, nc,
+            n0, n1, name0, name1, nml, aln);
+    return;
+  }
+
+  /* conventional output: summary line first (not for BLAST-like output) */
+  if (!(m_msp->markx & MX_MBLAST)) {
+#ifndef LALIGN
+    fprintf(fp,"%s score: %d; ",m_msp->alabel, score);
+#endif
+#ifndef SHOWSIM
+    fprintf(fp,"%4.1f%% identity (%4.1f%% ungapped) in %d %s overlap (%ld-%ld:%ld-%ld)\n",
+            percent,gpercent,lc,m_msp->sqnam,aln->d_start0,aln->d_stop0,
+            aln->d_start1,aln->d_stop1);
+#else
+    fprintf(fp,"%4.1f%% identity (%4.1f%% similar) in %d %s overlap (%ld-%ld:%ld-%ld)\n",
+            percent,gpercent,lc,m_msp->sqnam,aln->d_start0,aln->d_stop0,
+            aln->d_start1,aln->d_stop1);
+#endif
+  }
+
+  if (html_out) {
+    do_url1(fp, m_msp, ppst, link_name,n1, aln,
+            annot_var_s, q_annot_p, l_annot_p);
+  }
+
+  if (want_map && !atype_is7) {
+    fputc('\n',fp);
+
+    /* a DNA query against a protein library is graphed at one third of
+       the query length */
+    graph_n0 = n0;
+    if (m_msp->qdnaseq == SEQT_DNA && m_msp->ldb_info.ldnaseq== SEQT_PROT) {
+      graph_n0 = graph_n0 / 3;
+    }
+
+    disgraph(fp, graph_n0, n1, percent, score,
+             aln->amin0, aln->amin1,
+             aln->amax0, aln->amax1,
+             m_msp->sq0off,
+             name0, name1, nml, aln->llen,m_msp->markx);
+  }
+
+  /* the alignment itself, bracketed by HTML comment markers if requested */
+  if (html_out) {
+    fprintf(fp, "<!-- ALIGN_START \"%s\" -->",link_name);
+  }
+  discons(fp, m_msp,
+          seqc0, seqc0a, seqc1, seqc1a, seqca, cumm_seq_score, nc,
+          n0, n1, name0, name1, nml, aln);
+  if (html_out) {
+    fputs("<!-- ALIGN_STOP -->",fp);
+  }
+  fputc('\n',fp);
+}
+
+/* do_lav() - write the body of one lav "a { ... }" stanza to fp.
+
+   Prints the "b"/"e" begin/end coordinates taken from aln_p, then walks
+   the encoded alignment in seqc: each operation is one character
+   followed by a decimal count.  '=' emits an "l" line for an ungapped
+   run and advances both sequences; '+' advances only the sequence-1
+   position; any other character advances only the sequence-0 position.
+   When is_mirror is non-zero the two sequences are swapped in every
+   printed line.  percent is rounded to an integer for the "l" lines. */
+void
+do_lav(FILE *fp, struct a_struct *aln_p, char *seqc,
+       float percent, int is_mirror) {
+  int cur_b0, cur_b1, cur_e0, cur_e1;
+  int ipercent;
+  long run_len;
+  char *op_p, *next_p;
+
+  ipercent = (int)(percent+0.5);
+
+  cur_b0 = aln_p->d_start0;
+  cur_b1 = aln_p->d_start1;
+  cur_e0 = aln_p->d_stop0;
+  cur_e1 = aln_p->d_stop1;
+
+  if (is_mirror) {
+    fprintf (fp, " b %d %d\n e %d %d\n",
+             cur_b1, cur_b0, cur_e1, cur_e0);
+  }
+  else {
+    fprintf (fp, " b %d %d\n e %d %d\n",
+             cur_b0, cur_b1, cur_e0, cur_e1);
+  }
+
+  for (op_p = seqc; *op_p != '\0'; op_p = next_p) {
+    /* every operation character is followed by its run length */
+    run_len = strtol(op_p+1, &next_p, 10);
+
+    switch (*op_p) {
+    case '=':	/* extend match in both sequences */
+      cur_e0 = cur_b0 + run_len - 1;
+      cur_e1 = cur_b1 + run_len - 1;
+      if (is_mirror) {
+        fprintf(fp, " l %d %d %d %d %d\n",
+                cur_b1, cur_b0, cur_e1, cur_e0,
+                ipercent);
+      }
+      else {
+        fprintf(fp, " l %d %d %d %d %d\n",
+                cur_b0, cur_b1, cur_e0, cur_e1,
+                ipercent);
+      }
+      cur_b0 = cur_e0 + 1;
+      cur_b1 = cur_e1 + 1;
+      break;
+    case '+':	/* extend insertion in seq0 by incrementing seq1 */
+      cur_b1 += run_len;
+      break;
+    default:	/* extend insertion in seq1 by incrementing seq0 */
+      cur_b0 += run_len;
+      break;
+    }
+  }
+
+  fprintf (fp, "}\n");
+}
+
+/* initseq() - allocate the three consensus arrays as one zero-filled
+   block of 3*seqsiz chars; *seqc0, *seqc1 and *seqca point at
+   consecutive seqsiz-sized thirds of it.  Prints a message and exits
+   with status 1 if the allocation fails.  Release with freeseq(). */
+void
+initseq(char **seqc0, char **seqc1, char **seqca, int seqsiz)
+{
+  char *cons_buf;
+
+  cons_buf = (char *)calloc((size_t)seqsiz*3,sizeof(char));
+  if (cons_buf == NULL) {
+    fprintf(stderr,"cannot allocate consensus arrays %d\n",seqsiz);
+    exit(1);
+  }
+
+  *seqc0 = cons_buf;
+  *seqc1 = cons_buf + seqsiz;
+  *seqca = (cons_buf + seqsiz) + seqsiz;
+}
+
+/* freeseq() - release the block allocated by initseq() (it is owned by
+   *seqc0) and reset all three pointers to NULL. */
+void freeseq(char **seqc0, char **seqc1, char **seqca)
+{
+  free(*seqc0);
+
+  *seqca = NULL;
+  *seqc1 = NULL;
+  *seqc0 = NULL;
+}
+
+/* initseq_ann() - allocate the consensus annotation arrays as one
+   zero-filled block of 5*seqsiz chars; *seqc0a points at its start and
+   *seqc1a seqsiz chars further on.  Prints a message and exits with
+   status 1 if the allocation fails.  Release with freeseq_ann(). */
+void
+initseq_ann(char **seqc0a, char **seqc1a, int seqsiz)
+{
+  char *ann_buf;
+
+  ann_buf = (char *)calloc((size_t)seqsiz*5,sizeof(char));
+  if (ann_buf == NULL) {
+    fprintf(stderr,"cannot allocate consensus arrays %d\n",seqsiz);
+    exit(1);
+  }
+
+  *seqc0a = ann_buf;
+  *seqc1a = ann_buf + seqsiz;
+}
+
+/* freeseq_ann() - release the block allocated by initseq_ann() and
+   reset both pointers to NULL.  Nothing is changed (not even *seqc1a)
+   when *seqc0a is already NULL. */
+void freeseq_ann(char **seqc0a, char **seqc1a)
+{
+  if (*seqc0a == NULL) {
+    return;
+  }
+
+  free(*seqc0a);
+  *seqc1a = NULL;
+  *seqc0a = NULL;
+}
+
+#include <math.h>
+/* calc_fpercent_id() - scaled fraction of identical positions.
+
+   scale      multiplier for the fraction (100.0 gives a percentage)
+   n_ident    number of identical positions
+   n_alen     alignment length used as the denominator
+   tot_ident  if non-zero, a fraction above 0.999 is reported as 0.999
+              unless n_ident == n_alen, so that the maximum value is
+              only reported for completely identical alignments
+   fail       value returned when n_alen <= 0
+
+   Returns scale * n_ident/n_alen (possibly capped as described), or
+   fail when there is nothing to divide by. */
+float calc_fpercent_id(float scale, int n_ident, int n_alen, int tot_ident, float fail) {
+  const float max_partial = 0.999f;	/* cap for almost-identical alignments */
+  float f_id;
+
+  if (n_alen <= 0) { return fail;}
+
+  f_id = (float)n_ident/(float)n_alen;
+
+  /* do not let rounding turn a near-perfect alignment into 100% */
+  if (tot_ident && n_ident != n_alen && f_id > max_partial) {
+    f_id = max_partial;
+  }
+
+  return scale*f_id;
+}
diff --git a/src/mshowbest.c b/src/mshowbest.c
new file mode 100644
index 0000000..18cf4bf
--- /dev/null
+++ b/src/mshowbest.c
@@ -0,0 +1,665 @@
+/* $Id: mshowbest.c 1281 2014-08-21 17:32:06Z wrp $ */
+
+/* copyright (c) 1996, 1997, 1998, 1999, 2014 by William R. Pearson and
+ The Rector and Visitors of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License"); you
+ may not use this file except in compliance with the License. You
+ may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+/* 2-April-2009 changes to simplify interactive display logic. Coming
+ into showbest(), things are interactive (quiet==0) or use
+ m_msg.nshow */
+
+/* 29-Oct-2003 - changes so that bbp->mseq->cont < 0 => aa1 sequence is
+ already in aa1, no re_openlib or re_getlib required
+*/
+
+/* 14-May-2003 Changes to use a more consistent coordinate numbering
+ system for displays. aln->d_start[01] is now consistently used
+ to report the start of the alignment in all functions, and
+ mshowbest.c has been modified to use d_start[01] instead of
+ d_start[01]-1. aln->min[01] now starts at 0 for all functions;
+ instead of 1 for some functions (dropnfa.c, dropgsw.c, dropfs2.c
+ earlier).
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "defs.h"
+#include "structs.h"
+#include "param.h"
+#include "mm_file.h"
+#include "best_stats.h"
+
+#define MAX_BLINE 256
+
+/* function calls necessary to re_getlib() the sequence and, do
+ alignments, if necessary
+*/
+
+#define RANLIB (m_fptr->ranlib)
+
+
+int
+re_getlib(unsigned char *, struct annot_str **,
+ int, int, int, int, int, long *, long *,
+ struct lmf_str *m_fptr);
+
+#include "drop_func.h"
+
+void
+s_annot_to_aa1a(int n1, struct annot_str *annot_p, unsigned char *ann_arr);
+
+struct a_res_str *
+build_ares_code(unsigned char *aa0, int n0,
+ unsigned char *aa1, struct seq_record *seq,
+ int frame, int *have_ares, int repeat_thresh,
+ struct mngmsg *m_msp, struct pstruct *ppst,
+ void *f_str
+ );
+
+struct lmf_str *re_openlib(struct lmf_str *, int outtty);
+
+extern void calc_coord(int n0, int n1, long qoffset, long loffset,
+ struct a_struct *aln);
+
+extern float
+calc_fpercent_id(float scale, int n_ident, int n_alen, int tot_ident, float fail);
+
+extern int
+get_annot(char *sname, struct mngmsg *m_msp, char *bline, int n1, struct annot_str **annot_p,
+ int target, int debug);
+extern double find_z(int score, double escore, int length, double comp,void *);
+extern double zs_to_E(double zs, int n1, int dnaseq, long db_size, struct db_str db);
+extern double zs_to_bit(double, int, int);
+extern int E1_to_s(double e_val, int n0, int n1, int db_size, void *pu);
+
+void header_aux(FILE *);
+void show_aux(FILE *, struct beststr *);
+void w_abort (char *p, char *p1);
+
+extern double zs_to_bit(double, int, int);
+
+/* showbest() shows a list of high scoring sequence descriptions, and
+ their rst.scores. If -m 9, then an additional complete set of
+ alignment information is provided.
+
+ If PCOMPLIB or m_msg.quiet then the number of high scores to be
+ shown is pre-determined by m_msg.mshow before showbest is called.
+
+ The comp_lib2.c version re_getlib()'s the sequence for its
+ description, and then does another alignment for -m 9 (thus, it
+ needs an f_str). The PCOMPLIB version has everything available in
+ beststr before showbest() is called.
+*/
+
+/* showbest() - print the "best scores" summary list to fp.
+
+   fp        - output stream for the list
+   aa0       - query sequences, indexed by frame (aa0[bbp->frame])
+   aa1save   - buffer re_getlib() reads a library sequence into
+   maxn      - passed to re_getlib() as the buffer limit
+   bptr      - array of best-hit entries; nbest is its length
+   m_msp     - message/display parameters; m_msp->aln.llen may be
+               clamped, m_msp->qtitle is truncated in place, and
+               m_msp->nshow / m_msp->align_done are updated on exit
+   ppst      - scoring/statistics parameters
+   f_str     - per-frame function state passed to do_opt() and
+               build_ares_code()
+   qlib, db, info_gstring2 are not used in this function.
+
+   Returns early (printing nothing) for -m 11 output, and for lalign
+   (label "ls-w") unless -m 9 summaries are requested.  When
+   m_msp->quiet == 0 the user is prompted on stdout/stdin for the
+   number of scores; EOF on stdin exits the program.
+*/
+void showbest (FILE *fp, unsigned char **aa0, unsigned char *aa1save, int maxn,
+               struct beststr **bptr,int nbest,
+               int qlib, struct mngmsg *m_msp,
+               struct pstruct *ppst, struct db_str db,
+               char **info_gstring2
+               ,void **f_str
+)
+{
+  unsigned char *aa1;
+  int best_align_done = 0;
+  int ntmp = 0;
+  /* bline needs MAX_BLINE+1: llen may be MAX_BLINE and bline[llen] is written */
+  char bline[MAX_BLINE+1], fmt[40], pad[MAX_BLINE], fmt2[40], rline[40];
+  char l_name[128], link_name[140];
+  int istart = 0, istop, ib;
+  int nshow;            /* number of sequences shown before prompt,
+                           and ultimately displayed */
+  int first_line, link_shown;
+  int quiet;
+  int r_margin;
+  int pad_len;          /* clamped length/terminator position for pad[] */
+  int cut_pos;          /* clamped truncation position for bline_p */
+  struct beststr *bbp;
+  int n1tot;
+  char *bp, *bline_p;
+  char rel_label[12];
+  char score_label[120];
+  char tmp_str[20], *seq_code, *annot_str;
+  int seq_code_len, annot_str_len;
+  long loffset;         /* loffset is offset from beginning of real sequence */
+  long l_off;           /* l_off is the virtual coordinate of residue 1 */
+  int n1, ranlib_done;
+  struct rstruct rst;
+  int l_score0, ngap;
+  double lzscore, lzscore2, lbits;
+  float percent, gpercent, ng_percent;
+  struct a_struct *aln_p;
+  struct a_res_str *cur_ares_p;
+  struct rstruct *rst_p;
+  int gi_num;
+  char html_pre_E[120], html_post_E[120];
+  int have_lalign = 0;
+
+  struct lmf_str *m_fptr;
+
+  /* for lalign alignments, only show stuff when -m != 11 */
+
+  if (m_msp->markx & MX_M11OUT) return;
+  if (strcmp(m_msp->label,"ls-w")==0) {
+    have_lalign = 1;
+    if ((m_msp->markx & MX_M9SUMM) == 0) return;
+  }
+
+  rel_label[0]='\0';
+  SAFE_STRNCPY(score_label,"scores", sizeof(score_label));
+
+  quiet = m_msp->quiet;
+
+  if (m_msp->aln.llen > MAX_BLINE) m_msp->aln.llen = MAX_BLINE;
+
+  if (ppst->zsflag < 0) r_margin = 10;
+  else if (ppst->zsflag>=0 && m_msp->srelv > 1 ) r_margin = 19;
+  else r_margin = 10;
+
+  if (m_msp->markx & MX_M9SUMM && (m_msp->show_code == SHOW_CODE_ID || m_msp->show_code == SHOW_CODE_IDD)) {
+#ifdef SHOWSIM
+    r_margin += 15;
+#else
+    r_margin += 10;
+#endif
+  }
+  else if (m_msp->markx & MX_MBLAST2) {
+    r_margin -= 10;
+  }
+  else if (m_msp->markx & (MX_M9SUMM + MX_M8OUT)) {
+    r_margin = 0;
+  }
+
+  if (m_msp->markx & MX_HTML) {
+    strncpy(html_pre_E,"<font color=\"darkred\">",sizeof(html_pre_E));
+    strncpy(html_post_E,"</font>",sizeof(html_post_E));
+
+  }
+  else {
+    html_pre_E[0] = html_post_E[0] = '\0';
+  }
+
+  if (m_msp->nframe < 0) {
+    sprintf(fmt,"%%-%ds (%%4d)",m_msp->aln.llen-r_margin);
+  }
+  else {
+    sprintf(fmt,"%%-%ds (%%4d)",m_msp->aln.llen-(r_margin+4));
+  }
+  sprintf(fmt2,"%%-%ds",m_msp->aln.llen-r_margin+8);
+
+  /* build the blank padding for the header line; lengths and
+     terminator positions are clamped at 0 so that a short line length
+     cannot produce a negative memset() size or a negative index */
+  pad_len = m_msp->aln.llen-(r_margin+6);
+  if (pad_len < 0) pad_len = 0;
+  memset(pad,' ',pad_len);
+  pad_len = m_msp->aln.llen-(r_margin+12);
+  pad[(pad_len > 0) ? pad_len : 0]='\0';
+  if (have_lalign) {
+    if (ppst->show_ident) {
+      SAFE_STRNCPY(score_label,"alignments", sizeof(score_label));
+      pad_len = m_msp->aln.llen-(r_margin+16);
+    }
+    else {
+      SAFE_STRNCPY(score_label,"non-identical alignments", sizeof(score_label));
+      pad_len = m_msp->aln.llen-(r_margin+30);
+    }
+    pad[(pad_len > 0) ? pad_len : 0]='\0';
+  }
+
+  nshow = min(m_msp->nshow,nbest);
+
+  if ((bp = strchr (m_msp->qtitle, '\n')) != NULL) *bp = '\0';
+  if (m_msp->markx & MX_M8OUT) {
+    if ((bp = strchr (m_msp->qtitle, ' ')) != NULL) *bp = '\0';
+  }
+
+/*   fprintf (fp, "%3d %s\n", qlib,m_msp->qtitle); */
+
+  if (m_msp->markx & MX_HTML) fprintf(fp,"<pre>");
+
+  /* **************************************************************** */
+  /* done with display format */
+  /* **************************************************************** */
+
+  /* **************************************************************** */
+  /* prompt for number of best scores if quiet == 0 */
+  /* **************************************************************** */
+
+  if (quiet == 0) {     /* interactive */
+    nshow = min(m_msp->nshow, nbest);
+    printf(" How many scores would you like to see? [%d] ",nshow);
+    fflush(stdout);
+    if (fgets(rline,20,stdin)==NULL) exit(0);
+    if (rline[0]!='\n' && rline[0]!=0) sscanf(rline,"%d",&nshow);
+    if (nshow > nbest) nshow=nbest;
+    if (nshow<=0) nshow = min(20,nbest);
+  }
+
+  /* display number of hits for -m 8C (Blast Tab-commented format) */
+  if (m_msp->markx & MX_M8COMMENT) {
+    /* line below copied from BLAST+ output */
+    fprintf(fp,"# Fields: query id, subject id, %% identity, alignment length, mismatches, gap opens, q. start, q. end, s. start, s. end, evalue, bit score");
+    if (ppst->zsflag > 20) {fprintf(fp,", eval2");}
+    if (m_msp->show_code & (SHOW_CODE_ALIGN+SHOW_CODE_CIGAR)) { fprintf(fp,", aln_code");}
+    else if ((m_msp->show_code & SHOW_CODE_BTOP)==SHOW_CODE_BTOP) { fprintf(fp,", BTOP");}
+
+    fprintf(fp,"\n");
+    fprintf(fp,"# %d hits found\n",nshow);
+  }
+
+  /* **************************************************************** */
+  /* have number of scores in interactive or quiet mode */
+  /* display "The best scores are" */
+  /* **************************************************************** */
+
+  if (m_msp->markx & MX_MBLAST2) {
+    fprintf(fp, "%81s\n"," Score E");
+    fprintf(fp, "Sequences producing significant alignments: (Bits) Value\n\n");
+  }
+  else if (!(m_msp->markx & MX_M8OUT)) {
+    if (ppst->zsflag >= 0) {
+      if (m_msp->z_bits==1) {/* show bit score */
+        fprintf(fp,"\nThe best%s %s are:%s%s bits %sE(%ld)%s",
+                rel_label,score_label,pad,m_msp->label,html_pre_E,ppst->zdb_size,html_post_E);
+        if (ppst->zsflag > 20) {
+          fprintf(fp," E2()");
+        }
+      }
+      else {/* show z-score */
+        fprintf(fp,"\nThe best%s %s are:%s%s z-sc %sE(%ld)%s",
+                rel_label,score_label,pad,m_msp->label,html_pre_E,ppst->zdb_size,html_post_E);
+        if (ppst->zsflag > 20) {
+          fprintf(fp," E2()");
+        }
+      }
+      header_aux(fp);
+      if (m_msp->markx & MX_M9SUMM) {
+        if (m_msp->show_code == SHOW_CODE_ID || m_msp->show_code == SHOW_CODE_IDD) {
+#ifdef SHOWSIM
+          fprintf(fp," %%_id %%_sim alen");
+#else
+          fprintf(fp," %%_id alen");
+#endif
+        }
+        else {
+          if (m_msp->markx & MX_HTML && m_msp->show_code != SHOW_CODE_ID && m_msp->show_code != SHOW_CODE_IDD) { fprintf(fp,"<!-- ");}
+#ifndef SHOWSIM
+          fprintf(fp,"\t%%_id %%_gid %4s alen an0 ax0 pn0 px0 an1 ax1 pn1 px1 gapq gapl fs ",m_msp->f_id1);
+#else
+          fprintf(fp,"\t%%_id %%_sim %4s alen an0 ax0 pn0 px0 an1 ax1 pn1 px1 gapq gapl fs ",m_msp->f_id1);
+#endif
+        }
+        if (m_msp->show_code & (SHOW_CODE_ALIGN+SHOW_CODE_CIGAR)) { fprintf(fp," aln_code"); }
+        if (m_msp->markx & MX_HTML && m_msp->show_code != SHOW_CODE_ID && m_msp->show_code != SHOW_CODE_IDD) { fprintf(fp," -->");}
+      }
+      fprintf(fp,"\n");
+    }
+    else {
+      fprintf(fp,"\nThe best%s %s are:%s%s",rel_label,score_label,pad,m_msp->label);
+      header_aux(fp);
+      if (m_msp->markx & MX_M9SUMM) {
+        if (m_msp->show_code == SHOW_CODE_ID || m_msp->show_code == SHOW_CODE_IDD) {
+#ifdef SHOWSIM
+          fprintf(fp," %%_id %%_sm alen");
+#else
+          fprintf(fp," %%_id alen");
+#endif
+        }
+        else {
+#ifndef SHOWSIM
+          fprintf(fp,"\t%%_id %%_gid %4s alen an0 ax0 pn0 px0 an1 ax1 pn1 px1 gapq gapl fs ",m_msp->f_id1);
+#else
+          fprintf(fp,"\t%%_id %%_sim %4s alen an0 ax0 pn0 px0 an1 ax1 pn1 px1 gapq gapl fs ",m_msp->f_id1);
+#endif  /* SHOWSIM */
+        }
+      }
+      if (m_msp->show_code & (SHOW_CODE_ALIGN+SHOW_CODE_CIGAR)) { fprintf(fp," aln_code"); }
+      fprintf(fp,"\n");
+    }
+  }  /* !(m_msp->markx & MX_M8OUT) */
+
+  istart = 0;
+l1:
+  istop = min(nshow, nbest);
+
+  for (ib=istart; ib<istop; ib++) {
+    bbp = bptr[ib];
+    if (ppst->do_rep) {
+      bbp->repeat_thresh =
+        min(E1_to_s(ppst->e_cut_r, m_msp->n0, bbp->seq->n1,ppst->zdb_size, m_msp->pstat_void),
+            bbp->rst.score[ppst->score_ix]);
+    }
+
+#ifdef DEBUG
+    if (bbp->seq->n1 != bbp->n1 ) {
+      fprintf(stderr, " *** lib len error [%d!=%d] *** %s score %d\n",
+              bbp->seq->n1,bbp->n1, bbp->mseq->libstr, bbp->rst.score[0]);
+    }
+#endif
+
+    /* this gets us a valid bline[] and the library for searching if necessary
+       do not read if we have a long enough bline or we don't need a sequence
+    */
+    if (bbp->mseq->bline != NULL && bbp->mseq->bline_max >= m_msp->aln.llen) {
+      ranlib_done = 0;
+
+      /* copy m_msp->aln.llen, not llen-r_margin, because the r_margin
+         will be set later, possibly after the gi|12345 is removed */
+      strncpy(bline,bbp->mseq->bline,m_msp->aln.llen);
+      bline[m_msp->aln.llen]='\0';
+    }
+    else {
+      if ((m_fptr=re_openlib(bbp->mseq->m_file_p,!m_msp->quiet))==NULL) {
+        fprintf(stderr,"*** cannot re-open %s\n",bbp->mseq->m_file_p->lb_name);
+        exit(1);
+      }
+      RANLIB(bline,m_msp->aln.llen,bbp->mseq->lseek,bbp->mseq->libstr,m_fptr);
+      ranlib_done = 1;
+    }
+
+    /* get a valid cur_ares_p chain and put it in bbp->ares */
+    if (!m_msp->align_done && (m_msp->stages>1 || (m_msp->markx & MX_M9SUMM))) {  /* we need a sequence */
+      if (bbp->seq->aa1b == NULL || (m_msp->ann_flg==1 && bbp->seq->annot_p==NULL)) {
+        if (!ranlib_done) {     /* we didn't open the library already */
+          if ((m_fptr=re_openlib(bbp->mseq->m_file_p,!m_msp->quiet))==NULL) {
+            fprintf(stderr,"*** cannot re-open %s\n",bbp->mseq->m_file_p->lb_name);
+            exit(1);
+          }
+          RANLIB(bline,m_msp->aln.llen,bbp->mseq->lseek,bbp->mseq->libstr,m_fptr);
+          ranlib_done = 1;
+        }
+        n1 = re_getlib(aa1save,
+                       (m_msp->ann_flg==1) ? &(bbp->seq->annot_p) : NULL,
+                       maxn,m_msp->ldb_info.maxt3,
+                       m_msp->ldb_info.l_overlap,bbp->mseq->cont,m_msp->ldb_info.term_code,
+                       &bbp->seq->l_offset,&bbp->seq->l_off,bbp->mseq->m_file_p);
+
+        aa1 = aa1save;
+
+        if (m_msp->ann_flg==2 && bbp->seq->annot_p==NULL ) {
+          /* get information about this sequence from bline */
+          if (get_annot(m_msp->annot1_sname, m_msp, bline, bbp->seq->n1, &(bbp->seq->annot_p), 1, ppst->debug_lib) > 0) {
+            /* do something with annotation */
+            s_annot_to_aa1a(bbp->n1, bbp->seq->annot_p, m_msp->ann_arr);
+          }
+        }
+      }
+      else {
+        n1 = bbp->seq->n1;
+        aa1 = bbp->seq->aa1b;
+      }
+
+      if (n1 != bbp->n1) {
+        fprintf(stderr," *** sequence length conflict %d != %d: %s\n", n1, bbp->n1, bline);
+        continue;
+      }
+
+      if ( m_msp->stages > 1 && bbp->rst.score[2] == -BIGNUM) {
+        /* this is not typically done unless m_msp->stages > 1 */
+        do_opt (aa0[bbp->frame], m_msp->n0, aa1, n1, bbp->frame, ppst, f_str[bbp->frame], &rst);
+        bbp->rst.score[2]=rst.score[2];
+      }
+
+      /* test bit 0 of have_ares; '!' binds tighter than '&', so the
+         parentheses are required for a bit test */
+      if (!(bbp->have_ares & 0x1)) {
+        bbp->a_res = build_ares_code(aa0[bbp->frame], m_msp->n0, aa1, bbp->seq,
+                                     bbp->frame, &bbp->have_ares,
+                                     bbp->repeat_thresh, m_msp, ppst, f_str[bbp->frame] );
+        best_align_done = 1;
+      }
+    }   /* end stages > 1 || MX_M9SUMM9 */
+
+    n1tot = (bbp->mseq->n1tot_p) ? *bbp->mseq->n1tot_p : bbp->seq->n1;
+
+    /* skip a leading "gi|12345|" -- only when the closing '|' exists */
+    bline_p = bline;
+    if (!(m_msp->markx & (MX_M8OUT)) && !strncmp(bline,"gi|",3)
+        && bline[3] != '\0' && (bp = strchr(bline+4,'|')) != NULL) {
+      bline_p = bp+1;
+      *bp = 0;
+      gi_num = atoi(bline+3);
+    }
+
+  /* l_name is used to build an HTML link from the bestscore line to
+     the alignment.  It can also be used to discriminate multiple hits
+     from the same long sequence.  This requires that fast_pan use -m 6.
+
+     (6-April-2013) Add ability to specify additional alignments with
+     link_name;
+  */
+
+    SAFE_STRNCPY(l_name,bline_p,sizeof(l_name)); /* get rid of text after second "|" */
+    if ((bp=strchr(l_name,' '))!=NULL) *bp=0;
+    if ((bp=strchr(&l_name[6],'|'))!=NULL) *bp='\0';    /* increase to [6] from [3] to allow longer db names "ref", "unk", */
+    if (m_msp->nframe > 2) sprintf(&l_name[strlen(l_name)],"_%d",bbp->frame+1);
+    else if (m_msp->nframe > 0 && bbp->frame == 1)
+      SAFE_STRNCAT(l_name,"_r",sizeof(l_name));
+    if (bbp->mseq->cont-1 > 0) {
+      sprintf(tmp_str,":%d",bbp->mseq->cont-1);
+      SAFE_STRNCAT(l_name,tmp_str,sizeof(l_name));
+    }
+
+    if (m_msp->markx & MX_M8OUT) {
+      if ((bp=strchr(bline_p,' '))!=NULL) *bp = '\0';
+    }
+    else {
+      /* truncate the description to the display width; only write the
+         terminator inside the existing string so that bline_p (which
+         may point past a removed gi| prefix) is never indexed outside
+         bline[] */
+      cut_pos = m_msp->aln.llen-r_margin;
+      /* check for translated frame info */
+      if (m_msp->nframe > -1) cut_pos -= 4;
+      if (cut_pos < 0) cut_pos = 0;
+      if (cut_pos < (int)strlen(bline_p)) bline_p[cut_pos]='\0';
+    }
+    /* now its time to report the summary numbers for all the alignments */
+
+    /* in the next loop, cur_ares_p could be NULL if we haven't done do_walign() */
+    cur_ares_p = bbp->a_res;
+
+    first_line = 1;
+    do {
+      /* if cur_res_p != NULL, then we get rst from a_res->rst
+         Otherwise, it comes from bbp->rst
+      */
+
+      if ((!first_line || (have_lalign && !ppst->show_ident)) && cur_ares_p ) {
+        rst_p = &cur_ares_p->rst;
+      }
+      else {
+        rst_p = &bbp->rst;
+      }
+
+      n1 = bbp->seq->n1;
+      l_score0 = rst_p->score[ppst->score_ix];
+      lzscore = find_z(l_score0, rst_p->escore, n1, rst_p->comp, m_msp->pstat_void);
+      if (ppst->zsflag > 20) {
+        lzscore2 = find_z(l_score0, rst_p->escore, n1, rst_p->comp, m_msp->pstat_void2);
+      }
+      lbits = zs_to_bit(lzscore, m_msp->n0, n1);
+
+      /* *********************************** */
+      /* standard "The best scores are" here */
+      /* *********************************** */
+
+      if (!(m_msp->markx & (MX_M8OUT + MX_MBLAST2))) {
+        if (first_line) {
+          first_line = 0;
+          fprintf (fp, fmt,bline_p,n1tot);
+          if (m_msp->nframe > 2) fprintf (fp, " [%d]", bbp->frame+1);
+          else if (m_msp->nframe >= 0) fprintf(fp," [%c]",(bbp->frame > 0 ?'r':'f'));
+        }
+        else {
+          fprintf (fp, fmt2,"\n+-");
+        }
+
+        if (m_msp->srelv == 1) fprintf (fp, " %4d", rst_p->score[ppst->score_ix]);
+        else {
+          if (m_msp->srelv-1 > 0) fprintf (fp, " %4d", rst_p->score[0]);
+          if (m_msp->srelv-1 > 1 || m_msp->stages>1)
+            fprintf (fp, " %4d", rst_p->score[1]);
+          fprintf (fp, " %4d", rst_p->score[ppst->score_ix]);
+        }
+
+        if (ppst->zsflag>=0) {
+          if (m_msp->z_bits==1) {
+            fprintf (fp, " %.1f %s%7.2g%s",lbits,html_pre_E,
+                     zs_to_E(lzscore, n1, ppst->dnaseq, ppst->zdb_size, m_msp->db),
+                     html_post_E);
+            if (ppst->zsflag > 20) {
+              fprintf (fp, " %7.2g",zs_to_E(lzscore2, n1, ppst->dnaseq, ppst->zdb_size, m_msp->db));
+            }
+          }
+          else {
+            fprintf (fp, " %.1f %s%7.2g%s",lzscore,html_pre_E,
+                     zs_to_E(lzscore, n1, ppst->dnaseq, ppst->zdb_size, m_msp->db),
+                     html_post_E);
+            if (ppst->zsflag > 20) {
+              fprintf (fp, " %7.2g",zs_to_E(lzscore2, n1, ppst->dnaseq, ppst->zdb_size, m_msp->db));
+            }
+          }
+        }
+        show_aux(fp,bbp);
+      }
+      else if (m_msp->markx & MX_M8OUT) {       /* MX_M8OUT -- provide query, library */
+        if (first_line) {first_line = 0;}
+        fprintf (fp,"%s\t%s",m_msp->qtitle,bline_p);
+      }
+      else if (m_msp->markx & MX_MBLAST2) {     /* blast "Sequences producing" */
+        if (first_line) {first_line = 0;}
+        fprintf (fp,"%-67s %6.1f %.1g", bline_p, lbits,
+                 zs_to_E(lzscore,n1,ppst->dnaseq,ppst->zdb_size,m_msp->db));
+      }
+
+      /* alignment details need an a_res entry; cur_ares_p can be NULL
+         (see above), so it is checked before being dereferenced */
+      if ((m_msp->markx & MX_M9SUMM || m_msp->markx & MX_M8OUT) && cur_ares_p != NULL) {
+        loffset = bbp->seq->l_offset;
+        l_off = bbp->seq->l_off;
+        aln_p = &cur_ares_p->aln;
+        seq_code = cur_ares_p->aln_code;
+        seq_code_len = cur_ares_p->aln_code_n;
+        annot_str = cur_ares_p->annot_code;
+        annot_str_len = cur_ares_p->annot_code_n;
+
+        ngap = cur_ares_p->aln.ngap_q + cur_ares_p->aln.ngap_l;
+        percent = calc_fpercent_id(100.0,aln_p->nident,aln_p->lc, m_msp->tot_ident, -100.0);
+        ng_percent = calc_fpercent_id(100.0,aln_p->nident,aln_p->lc-ngap, m_msp->tot_ident, -100.0);
+
+#ifndef SHOWSIM
+        gpercent = calc_fpercent_id(100.0, aln_p->nident, aln_p->lc-ngap, m_msp->tot_ident, -100.0);
+#else
+        gpercent = calc_fpercent_id(100.0, cur_ares_p->aln.nsim, aln_p->lc, m_msp->tot_ident, -100.0);
+#endif  /* SHOWSIM */
+
+        if (m_msp->show_code != SHOW_CODE_ID && m_msp->show_code != SHOW_CODE_IDD) {    /* show more complete info than just identity */
+
+          /* calc_astruct(aln_p, cur_ares_p); -- this function
+             should not be used after calc_code or any other
+             alignment that calculates amax0/amax1 */
+
+          /* we need the coordinates for annotated SHOW_CODE_ALIGN */
+          calc_coord(m_msp->n0,bbp->seq->n1,
+                     m_msp->q_offset + (m_msp->q_off-1) + (m_msp->sq0off-1),
+                     loffset + (l_off-1) + (m_msp->sq1off-1),
+                     aln_p);
+
+          /* if (m_msp->markx & MX_HTML) fprintf(fp,"<!-- "); */
+          /* %_id %_sim s-w alen an0 ax0 pn0 px0 an1 ax1 pn1 px1 gapq gapl fs */
+          /* alignment min max min max */
+          /* sequence coordinate min max min max */
+          if (!(m_msp->markx & MX_M8OUT)) {
+            fprintf(fp,"\t%5.3f %5.3f %4d %4d %4ld %4ld %4ld %4ld %4ld %4ld %4ld %4ld %3d %3d %3d",
+                    percent/100.0,gpercent/100.0,
+                    cur_ares_p->sw_score,
+                    aln_p->lc,
+                    aln_p->d_start0,aln_p->d_stop0,
+                    aln_p->q_start_off, aln_p->q_end_off,
+                    aln_p->d_start1,aln_p->d_stop1,
+                    aln_p->l_start_off, aln_p->l_end_off,
+                    aln_p->ngap_q,aln_p->ngap_l,aln_p->nfs);
+            if ((m_msp->show_code & (SHOW_CODE_ALIGN+SHOW_CODE_CIGAR+SHOW_CODE_BTOP))
+                && seq_code_len > 0 && seq_code != NULL) {
+              fprintf(fp,"\t%s",seq_code);
+              if (annot_str_len > 0 && annot_str != NULL) {
+                fprintf(fp,"\t%s",annot_str);
+              }
+            }
+          }
+          else {        /* MX_M8OUT -- blast order, tab separated */
+            fprintf(fp,"\t%.2f\t%d\t%d\t%d\t%ld\t%ld\t%ld\t%ld\t%.2g\t%.1f",
+                    ng_percent,aln_p->lc,aln_p->nmismatch,
+                    aln_p->ngap_q + aln_p->ngap_l+aln_p->nfs,
+                    aln_p->d_start0, aln_p->d_stop0,
+                    aln_p->d_start1, aln_p->d_stop1,
+                    zs_to_E(lzscore,n1,ppst->dnaseq,ppst->zdb_size,m_msp->db),
+                    lbits);
+            if (ppst->zsflag > 20) {
+              fprintf(fp,"\t%.2g",zs_to_E(lzscore2, n1, ppst->dnaseq, ppst->zdb_size, m_msp->db));
+            }
+            if ((m_msp->show_code & (SHOW_CODE_ALIGN+SHOW_CODE_CIGAR+SHOW_CODE_BTOP)) && seq_code_len > 0 && seq_code != NULL) {
+              fprintf(fp,"\t%s",seq_code);
+              if (annot_str_len > 0 && annot_str != NULL) {
+                fprintf(fp,"\t%s",annot_str);
+              }
+            }
+            fprintf(fp,"\n");
+          }
+        }
+        else {  /* !SHOW_CODE -> SHOW_ID or SHOW_IDD*/
+#ifdef SHOWSIM
+          fprintf(fp," %5.3f %5.3f %4d",
+                  percent/100.0,
+                  (float)aln_p->nsim/(float)aln_p->lc,aln_p->lc);
+#else
+          fprintf(fp," %5.3f %4d", percent/100.0,aln_p->lc);
+#endif
+          if (m_msp->markx & MX_HTML) {
+            if (cur_ares_p->index > 0) {
+              sprintf(link_name,"%s_%d",l_name, cur_ares_p->index);
+            }
+            else {
+              /* size limit is that of the destination buffer */
+              SAFE_STRNCPY(link_name, l_name, sizeof(link_name));
+            }
+            fprintf(fp," <a href=\"#%s\">align</a>",link_name);
+            link_shown = 1;
+          }
+          else { link_shown = 0;}
+
+          if ((m_msp->show_code & SHOW_CODE_ID) == SHOW_CODE_ID) {
+            annot_str = cur_ares_p->annot_var_id;
+          }
+          else if ((m_msp->show_code & SHOW_CODE_IDD) == SHOW_CODE_IDD) {
+            annot_str = cur_ares_p->annot_var_idd;
+          }
+          else {
+            annot_str = NULL;
+          }
+          if (annot_str && annot_str[0]) {
+            fprintf(fp," %s",annot_str);
+          }
+        }
+      }
+      else if (m_msp->markx & MX_M8OUT) {
+        /* no alignment data available: still terminate the -m 8 record */
+        fprintf(fp,"\n");
+      }
+    } while ( cur_ares_p && (cur_ares_p = cur_ares_p->next));
+
+    /*    if ((m_msp->markx & MX_HTML) && !link_shown) fprintf(fp," <a href=\"#%s\">align</a>",l_name); */
+    if (!(m_msp->markx & MX_M8OUT)) fprintf(fp, "\n");
+    fflush(fp);
+  }     /* end of for (ib) loop */
+
+  if (quiet==0) {
+    printf(" More scores? [0] ");
+    fflush(stdout);
+    if (fgets(rline,20,stdin)==NULL) exit(0);
+    ntmp = 0;
+    if (rline[0]!='\n' && rline[0]!=0) sscanf(rline,"%d",&ntmp);
+    if (ntmp<=0) ntmp = 0;
+    if (ntmp>0) {
+      istart = istop;
+      nshow = min(nshow+ntmp, nbest);
+      goto l1;
+    }
+  }     /* end of interactive "More scores?" prompt */
+
+  if (m_msp->markx & MX_MBLAST2) {fprintf(fp, "\n\n");}
+
+  m_msp->nshow = nshow; /* save the number of hits displayed for showalign */
+
+  if (best_align_done) { m_msp->align_done = 1;}        /* note that alignments are done */
+
+  if (m_msp->markx & MX_HTML) fprintf(fp,"</pre><hr>\n");
+}
diff --git a/src/mw.h b/src/mw.h
new file mode 100644
index 0000000..eafd43a
--- /dev/null
+++ b/src/mw.h
@@ -0,0 +1,49 @@
+/* Concurrent read version */
+
+/* $Id: mw.h 625 2011-03-23 17:21:38Z wrp $ */
+/* $Revision: 625 $ */
+
+#include "param.h"
+
+/* fseek_t: the type used for library file offsets -- off_t when
+   USE_FSEEKO is defined, long otherwise.
+   NOTE(review): this header tests FSEEK_T_DEF but never defines it,
+   so the guard only helps if another header defines the macro along
+   with its own fseek_t typedef -- confirm against the other headers. */
+#ifndef FSEEK_T_DEF
+#ifndef USE_FSEEKO
+typedef long fseek_t;
+#else
+typedef off_t fseek_t;
+#endif
+#endif
+
+/* beststr: one entry in the list of best-scoring library sequences.
+   Each entry carries pointers to the sequence and its meta-data, the
+   scores for the hit, and the head of its chain of alignment results. */
+struct beststr {
+  struct seq_record *seq;       /* the library sequence record */
+  struct mseq_record *mseq;     /* meta-information for that sequence */
+  struct beststr *bbp_link;     /* earlier beststr entry for the same sequence */
+  struct rstruct rst;           /* score results */
+
+  int n1;                       /* copy of seq.n1, kept for error checking/debugging */
+#ifdef DEBUG
+  long adler32_crc;             /* copy of seq.adler32_crc, kept for error checking/debugging */
+#endif
+  int frame;                    /* in buf2_str */
+  int repeat_thresh;            /* score threshold for additional alignments */
+  double zscore;                /* z-score; exists mostly for sorting best scores */
+  double zscore2;               /* z-score from high-scoring shuffles */
+  double bit_score;             /* bit score (move to bit-scores for consistency) */
+  double bit_score2;            /* bit score for the second shuffle */
+
+  int a_res_cnt;
+  struct a_res_str *a_res;      /* a single a_res (not a_res[2]) is enough, because
+                                   different frames of the same sequence are
+                                   stored as separate entries */
+  int have_ares;
+  float percent;
+  float gpercent;
+};
+
+/* stat_str: per-sequence values saved for statistics.
+   NOTE(review): field meanings are not documented in this header;
+   the names suggest a score, a sequence length (n1), composition/
+   entropy terms (comp, H), an expected score and segment counts --
+   confirm against the code that fills this structure. */
+struct stat_str {
+  int score;
+  int n1;
+  double comp;
+  double H;
+  double escore;
+  int segnum;
+  int seglen;
+};
+
diff --git a/src/mysql_lib.c b/src/mysql_lib.c
new file mode 100644
index 0000000..41362ca
--- /dev/null
+++ b/src/mysql_lib.c
@@ -0,0 +1,636 @@
+/* $Id: mysql_lib.c 817 2011-08-02 03:54:02Z wrp $ */
+/* $Revision: 817 $ */
+
+/* mysql_lib.c copyright (c) 2000, 2014 by William R. Pearson and The
+ Rector & Visitors of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+/* functions for opening, reading, seeking a mySQL database */
+
+/* (close_tables added June, 2011)
+ For the moment, this interface assumes that the file to be searched
+ will be specified in a single, long, string with 4 required and 1
+ optional parts:
+
+ (1) a database open string. This string has four fields, separated by
+ whitespace (' \t'):
+ hostname:port dbname user password
+
+ '--' dashes at the beginning of lines are ignored -
+ thus the first line could be:
+ -- hostname:port dbname user password
+
+ (2) a database query string that will return a unique ID (not
+ necessarily numeric, but it must be < 12 characters as libstr[12]
+ is used) and a sequence string
+
+ (2a) a series of mySQL commands that do not generate results
+ starting with 'DO', followed by a select() statement.
+
+ (3) a database select string that will return a description
+ given a unique ID
+
+ (4) a database select string that will return a sequence given a
+ unique ID
+
+ (5) [optional] an SQL statement to be run when closing the database
+ (e.g. a DROP TABLE statement)
+
+ Lines (3) and (4) are not required for pv34comp* libraries, but
+ line (2) must generate a complete description as well as a sequence.
+
+ 18-July-2001
+ Additional syntax has been added to support multiline SQL queries.
+
+ If the host line begins with '+', then the SQL is opened on the same
+ connection as the previous SQL file.
+
+ If the host line contains '-' just before the terminal ';', then
+ the file will not produce any output.
+
+ This string can contain "\n". ";" are used to separate the four
+ functions, which must be specified in the order shown above.
+ The last (fourth) query must terminate with a ';' */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+
+#include <mysql.h>
+#define MYSQL_LIB 16
+
+#include "defs.h"
+#include "structs.h"
+#include "mm_file.h"
+
+#define XTERNAL
+#include "uascii.h"
+#define EOSEQ 0
+/* #include "upam.h" */
+
+char *alloc_file_name(char *f_name);
+int mysql_getlib(unsigned char *, int, char *, int, fseek_t *, int *, struct lmf_str *, long *);
+void mysql_ranlib(char *, int, fseek_t, char *, struct lmf_str *m_fd);
+
+#define MYSQL_BUF 4096
+
+struct lmf_str *
+mysql_openlib(char *sname, int ldnaseq, int *sascii) {
+ FILE *sql_file;
+ char *tmp_str, *ttmp_str;
+ int tmp_str_len;
+ char *bp, *bps, *bdp, *tp, tchar;
+ int i, qs_len, qqs_len;
+ char *sql_db, *sql_host, *sql_dbname, *sql_user, *sql_pass;
+ char *sql_do;
+ int sql_do_cnt;
+ int sql_port;
+ struct lmf_str *m_fptr;
+
+ /* if (sql_reopen) return NULL; - should not be called for re-open */
+
+ tmp_str_len = MYSQL_BUF;
+ if ((tmp_str=(char *)calloc(tmp_str_len,sizeof(char)))==NULL) {
+ fprintf(stderr,"cannot allocate %d for mySQL buffer\n",tmp_str_len);
+ return NULL;
+ }
+
+ /* immediate mysql scripts start with '%' */
+ if (sname[0] == '%') {
+ strncpy(tmp_str,sname+1,tmp_str_len);
+ tmp_str[sizeof(tmp_str)-1]='\0';
+ }
+ else { /* read the script from a file */
+ if ((sql_file=fopen(sname,"r"))==NULL) {
+ fprintf(stderr," cannot open mySQL file: %s\n",sname);
+ return NULL;
+ }
+
+ if ((qs_len=fread(tmp_str,sizeof(char),tmp_str_len-1,sql_file))<=0) {
+ fprintf(stderr," cannot read mySQL file: %s\n",sname);
+ return NULL;
+ }
+ else { /* read the entire file in MYSQL_BUF (4096) byte
+ chunks, reallocating as necessary */
+ tmp_str[qs_len]='\0';
+ qqs_len = qs_len;
+ while (qqs_len >= tmp_str_len-1) {
+ tmp_str_len += MYSQL_BUF;
+ if ((tmp_str=(char *)realloc(tmp_str,tmp_str_len))==NULL) {
+ fprintf(stderr,
+ " cannot reallocate %d for mySQL buffer\n",tmp_str_len);
+ return NULL;
+ }
+ ttmp_str = &tmp_str[qqs_len];
+ if ((qs_len=fread(ttmp_str,sizeof(char),MYSQL_BUF,sql_file))<0) {
+ fprintf(stderr," cannot read mySQL file: %s\n",sname);
+ return NULL;
+ }
+ ttmp_str[qs_len]='\0';
+ qqs_len += qs_len;
+ }
+ }
+ fclose(sql_file);
+ }
+
+ /* tmp_str has the entire contents of the file */
+ bps = tmp_str;
+ if ((bp=strchr(bps,';'))!=NULL) {
+ /* get the connection info */
+ *bp='\0';
+ if ((sql_db=calloc(strlen(bps)+1,sizeof(char)))==NULL) {
+ fprintf(stderr, " cannot allocate space for database name [%d], %s\n",
+ (int)strlen(bps),bps);
+ return NULL;
+ }
+ /* have database name, parse the fields */
+ else { /* copy connection info into sql_db and parse */
+ strcpy(sql_db,bps); /* strcpy OK because allocated strlen(bps) */
+ bps = bp+1; /* points to next char after ';' */
+ while (isspace(*bps)) bps++;
+ *bp=';'; /* replace ; */
+ bp = sql_db;
+ while (*bp=='-') {*bp++ = ' ';}
+ sql_host = strtok(bp," \t\n");
+ sql_dbname = strtok(NULL," \t\n");
+ sql_user = strtok(NULL," \t\n");
+ sql_pass = strtok(NULL," \t\n");
+ if ((tp=strchr(sql_host,':'))!=NULL) {
+ *tp='\0';
+ sql_port=atoi(tp+1);
+ }
+ else sql_port = 0;
+ }
+ }
+ else {
+ fprintf(stderr," cannot find database fields:\n%s\n",tmp_str);
+ return NULL;
+ }
+
+ /* we have all the info we need to open a database, allocate lmf_str */
+ if ((m_fptr = (struct lmf_str *)calloc(1,sizeof(struct lmf_str)))==NULL) {
+ fprintf(stderr," cannot allocate lmf_str (%ld) for %s\n",
+ sizeof(struct lmf_str),sname);
+ return NULL;
+ }
+
+ /* have our struct, initialize it */
+
+ m_fptr->lb_name = alloc_file_name(sname);
+
+ m_fptr->sascii = sascii;
+
+ m_fptr->sql_db = sql_db;
+ m_fptr->getlib = mysql_getlib;
+ m_fptr->ranlib = mysql_ranlib;
+ m_fptr->mm_flg = 0;
+ m_fptr->sql_reopen = 0;
+ m_fptr->lb_type = MYSQL_LIB;
+
+ /* now open the database, if necessary */
+ if ((m_fptr->mysql_conn=mysql_init(NULL))==NULL) {
+ fprintf(stderr,"*** Error - mysql_init\n");
+ goto error_r;
+ }
+
+ if (mysql_real_connect(m_fptr->mysql_conn,
+ sql_host,sql_user,sql_pass,
+ sql_dbname,
+ sql_port,
+ NULL,
+ 0)==NULL)
+ {
+ fprintf(stderr,"*** Error %u - could not open database:\n%s\n%s",
+ mysql_errno(m_fptr->mysql_conn),tmp_str,
+ mysql_error(m_fptr->mysql_conn));
+ goto error_r;
+ }
+#ifdef DEBUG
+ else {
+ fprintf(stderr," Database %s opened on %s\n",sql_dbname,sql_host);
+ }
+#endif
+
+ /* check for 'DO' command - copy to 'DO' string */
+ while (*bps == '-') { *bps++=' ';}
+ if (isspace(bps[-1]) && toupper(bps[0])=='D' &&
+ toupper(bps[1])=='O' && isspace(bps[2])) {
+ /* have some 'DO' commands */
+ /* check where the end of the last DO statement is */
+
+ sql_do_cnt = 1; /* count up the number of 'DO' statements for later */
+ bdp=bps+3;
+ while ((bp=strchr(bdp,';'))!=NULL) {
+ tp = bp+2; /* skip ;\n */
+ while (isspace(*tp) || *tp == '-') {*tp++ = ' ';}
+ if (toupper(*tp)=='D' && toupper(tp[1])=='O' && isspace(tp[2])) {
+ sql_do_cnt++; /* count the DO statements */
+ bdp = tp+3; /* move to the next DO statement */
+ }
+ else break;
+ }
+ if (bp != NULL) { /* end of the last DO, begin of select */
+ tchar = *(bp+1);
+ *(bp+1)='\0'; /* terminate DO strings */
+ if ((sql_do = calloc(strlen(bps)+1, sizeof(char)))==NULL) {
+ fprintf(stderr," cannot allocate %d for sql_do\n",(int)strlen(bps));
+ goto error_r;
+ }
+ else {
+ strcpy(sql_do,bps);
+ *(bp+1)=tchar; /* replace missing ';' */
+ }
+ bps = bp+1;
+ while (isspace(*bps)) bps++;
+ }
+ else {
+ fprintf(stderr," terminal ';' not found: %s\n",bps);
+ goto error_r;
+ }
+ /* all the DO commands are in m_fptr->sql_do in the form:
+ DO command1; DO command2; DO command3; */
+ bdp = sql_do;
+ while (sql_do_cnt-- && (bp=strchr(bdp,';'))!=NULL) {
+ /* do the mysql statement on bdp+3 */
+ /* check for error */
+ *bp='\0';
+ if (mysql_query(m_fptr->mysql_conn,bdp+3)) {
+ fprintf(stderr,"*** Error %u - query failed:\n%s\n%s\n",
+ mysql_errno(m_fptr->mysql_conn), bdp+3, mysql_error(m_fptr->mysql_conn));
+ goto error_r;
+ }
+ *bp=';';
+ bdp = bp+1;
+ while (isspace(*bdp)) bdp++;
+ }
+ }
+
+ /* copy 1st query field */
+ if ((bp=strchr(bps,';'))!=NULL) {
+ *bp='\0';
+ if ((m_fptr->sql_query=calloc(strlen(bps)+1,sizeof(char)))==NULL) {
+ fprintf(stderr, " cannot allocate space for query string [%d], %s\n",
+ (int)strlen(bps),bps);
+ goto error_r;
+ }
+ /* have query, copy it */
+ else {
+ strcpy(m_fptr->sql_query,bps);
+ *bp=';'; /* replace ; */
+ bps = bp+1;
+ while(isspace(*bps)) bps++;
+ }
+ }
+ else {
+ fprintf(stderr," cannot find database query field:\n%s\n",tmp_str);
+ goto error_r;
+ }
+
+ /* copy get_desc field */
+ if ((bp=strchr(bps,';'))!=NULL) {
+ *bp='\0';
+ if ((m_fptr->sql_getdesc=calloc(strlen(bps)+1,sizeof(char)))==NULL) {
+ fprintf(stderr, " cannot allocate space for database name [%d], %s\n",
+ (int)strlen(bps),bps);
+ goto error_r;
+ }
+ /* have get_desc, copy it */
+ else {
+ strcpy(m_fptr->sql_getdesc,bps);
+ *bp=';'; /* replace ; */
+ bps = bp+1;
+ while(isspace(*bps)) bps++;
+ }
+ }
+ else {
+ fprintf(stderr," cannot find getdesc field:\n%s\n",tmp_str);
+ goto error_r;
+ }
+
+ if ((bp=strchr(bps,';'))!=NULL) { *bp='\0';}
+
+ if ((m_fptr->sql_getseq=calloc(strlen(bps)+1,sizeof(char)))==NULL) {
+ fprintf(stderr, " cannot allocate space for database name [%d], %s\n",
+ (int)strlen(bps),bps);
+ goto error_r;
+ }
+
+ if (strlen(bps) > 0) {
+ strcpy(m_fptr->sql_getseq,bps);
+ bps = bp+1;
+ }
+ else {
+ fprintf(stderr," cannot find getseq field:\n%s\n",tmp_str);
+ return 0;
+ }
+ if (bp!=NULL) *bp=';';
+
+ /* check for close_table statement */
+ if ((bp=strchr(bps,';'))!=NULL) {
+ *bp='\0';
+ if ((m_fptr->sql_close_tables=calloc(strlen(bps)+1,sizeof(char)))==NULL) {
+ fprintf(stderr, " cannot allocate space for close_tables [%d], %s\n",
+ (int)strlen(bps),bps);
+ goto error_r;
+ }
+ /* have get_desc, copy it */
+ else {
+ strcpy(m_fptr->sql_close_tables,bps);
+ *bp=';'; /* replace ; */
+ bps = bp+1;
+ while(isspace(*bps)) bps++;
+ }
+ }
+
+ /* now do the query */
+
+ if (mysql_query(m_fptr->mysql_conn,m_fptr->sql_query)) {
+ fprintf(stderr,"*** Error %u - query failed:\n%s\n%s\n",
+ mysql_errno(m_fptr->mysql_conn), m_fptr->sql_query, mysql_error(m_fptr->mysql_conn));
+ goto error_r;
+ }
+
+ if ((m_fptr->mysql_res = mysql_use_result(m_fptr->mysql_conn)) == NULL) {
+ fprintf(stderr,"*** Error = use result failed\n%s\n",
+ mysql_error(m_fptr->mysql_conn));
+ goto error_r;
+ }
+ return m_fptr;
+
+ error_r:
+ if (m_fptr->sql_close_tables) free(m_fptr->sql_close_tables);
+ if (m_fptr->sql_getseq) free(m_fptr->sql_getseq);
+ if (m_fptr->sql_getdesc) free(m_fptr->sql_getdesc);
+ if (m_fptr->sql_query) free(m_fptr->sql_query);
+ free(m_fptr);
+ free(sql_db);
+ return NULL;
+}
+
+/* mysql_reopen() marks an already opened MySQL library for re-use by
+ setting sql_reopen; no connection or query is made here.
+ mysql_getlib() tests sql_reopen and, when it is set, frees the
+ current result set as soon as a sequence has been read completely.
+ Returns m_fptr itself. */
+struct lmf_str *
+mysql_reopen(struct lmf_str *m_fptr) {
+ m_fptr->sql_reopen = 1;
+ return m_fptr;
+}
+
+/* mysql_closelib() releases the MySQL resources held by m_fptr:
+   the pending result set is freed, the optional sql_close_tables
+   statement is run (a failure is reported on stderr but does not stop
+   the close), and the connection is closed.
+
+   mysql_res and mysql_conn are reset to NULL afterwards so that a
+   second call, or a later test of mysql_res, cannot free or use a
+   handle that has already been released.  A NULL m_fptr is ignored.
+*/
+void
+mysql_closelib(struct lmf_str *m_fptr) {
+
+  if (m_fptr == NULL) return;
+
+  if (m_fptr->mysql_res != NULL) {
+    mysql_free_result(m_fptr->mysql_res);
+    m_fptr->mysql_res = NULL;
+  }
+
+  if (m_fptr->mysql_conn != NULL) {
+    /* the close_tables statement needs a live connection */
+    if (m_fptr->sql_close_tables) {
+      if (mysql_query(m_fptr->mysql_conn,m_fptr->sql_close_tables)) {
+        fprintf(stderr,"*** Error %u - close_tables failed:\n%s\n%s\n",
+                mysql_errno(m_fptr->mysql_conn), m_fptr->sql_close_tables,
+                mysql_error(m_fptr->mysql_conn));
+      }
+    }
+    mysql_close(m_fptr->mysql_conn);
+    m_fptr->mysql_conn = NULL;
+  }
+  m_fptr->sql_reopen=0;
+}
+
+/*
+static char *sql_seq = NULL, *sql_seqp;
+static int sql_seq_len;
+static MYSQL_ROW sql_row;
+*/
+
+/* mysql_getlib() - read the next library sequence from the MySQL
+ result set lm_fd->mysql_res and encode it into seq[].
+
+ seq output buffer for the encoded residues
+ maxs size of seq[]
+ libstr receives column 0 of the row (the identifier) when
+ n_libstr <= MAX_UID; otherwise column 2 (the comment) if it
+ is not NULL, else column 0
+ n_libstr size of libstr[]
+ libpos set to atol() of column 0
+ lcont 0 on entry: fetch a new row; non-zero: continue with the
+ text left over from the previous call. On return it is
+ incremented if seq[] filled up before the end of the
+ sequence text, and reset to 0 otherwise
+ lm_fd library descriptor; lm_fd->sascii maps characters to codes
+ l_off set to 1, or to the number that follows "@C:" in column 2
+
+ Returns the number of codes stored in seq[] (seq[] is terminated
+ with EOSEQ), or -1 when mysql_fetch_row() has no more rows; in that
+ case the result set is freed and lm_fd->mysql_res is set to NULL.
+*/
+int
+mysql_getlib( unsigned char *seq,
+ int maxs,
+ char *libstr,
+ int n_libstr,
+ fseek_t *libpos,
+ int *lcont,
+ struct lmf_str *lm_fd,
+ long *l_off)
+{
+ register unsigned char *cp, *seqp;
+ register int *ap;
+ unsigned char *seqm, *seqm1;
+ char *bp;
+ /* int l_start, l_stop, len; */
+
+ seqp = seq;
+ /* the loop below only starts a pass while seqp < seqm1 (= seq+maxs-10),
+ so one pass of up to 10 stores plus the final EOSEQ stays inside
+ seq[0..maxs-1] */
+ seqm = &seq[maxs-9];
+ seqm1 = seqm-1;
+
+ ap = lm_fd->sascii;
+
+ if (*lcont==0) {
+ /* get a row, with UID, sequence */
+ *l_off = 1;
+ if ((lm_fd->mysql_row =mysql_fetch_row(lm_fd->mysql_res))!=NULL) {
+ *libpos=(fseek_t)atol(lm_fd->mysql_row[0]);
+
+ /* for @P:1-n removed */
+ /*
+ if ((bp=strchr(lm_fd->mysql_row[2],'@'))!=NULL &&
+ !strncmp(bp+1,"P:",2)) {
+ sscanf(bp+3,"%d-%d",&l_start,&l_stop)
+ l_start--;
+ if (l_start < 0) l_start=0;
+ if (l_stop > (len=strlen(lm_fd->mysql_row[1]))) l_stop= len-1;
+ lm_fd->sql_seqp = lm_fd->mysql_row[1];
+ lm_fd->sql_seqp[l_stop]='\0';
+ lm_fd->sql_seqp += l_start;
+ */
+
+ /* column 2 is optional; an "@C:<n>" tag in it supplies *l_off */
+ if (lm_fd->mysql_row[2] == NULL) {
+ fprintf(stderr," NULL comment at: [%s] %ld\n",
+ lm_fd->mysql_row[0],*libpos);
+ }
+ else if ((bp=strchr(lm_fd->mysql_row[2],'@'))!=NULL &&
+ !strncmp(bp+1,"C:",2)) sscanf(bp+3,"%ld",l_off);
+ else *l_off = 1;
+
+ /* NOTE(review): column 1 (the sequence text) is used without a
+ NULL check; a NULL value would be dereferenced in the loop below */
+ lm_fd->sql_seqp = lm_fd->mysql_row[1];
+
+ /* because of changes in mysql_ranlib(), it is essential that
+ libstr return the unique identifier; thus we must use
+ sql_row[0], not sql_row[2]. Using libstr as the UID allows
+ one to use any UID, not just numeric ones. *libpos is not
+ used for mysql libraries.
+ */
+
+ if (n_libstr <= MAX_UID) {
+ /* the normal case returns only GID/sequence */
+ strncpy(libstr,lm_fd->mysql_row[0],MAX_UID-1);
+ libstr[MAX_UID-1]='\0';
+ }
+ else {
+ /* here we do not use the UID in libstr, because we are not
+ going back into the db */
+ /* the PVM case also returns a long description */
+ if (lm_fd->mysql_row[2]!=NULL) {
+ strncpy(libstr,lm_fd->mysql_row[2],n_libstr-1);
+ }
+ else {
+ strncpy(libstr,lm_fd->mysql_row[0],n_libstr-1);
+ }
+ libstr[n_libstr-1]='\0';
+ }
+ }
+ else {
+ /* no more rows: release the result set and signal end of library */
+ mysql_free_result(lm_fd->mysql_res);
+ lm_fd->mysql_res=NULL;
+ *lcont = 0;
+ *seqp = EOSEQ;
+ return -1;
+ }
+ }
+
+ /* translate the sequence text through ap[], up to ten characters per
+ pass. The && chain stops at the first code that is not < NA; that
+ code is dropped again by --seqp, and the loop ends if the character
+ that produced it was the terminating NUL. This relies on
+ ap['\0'] not being < NA - presumably true for the sascii tables,
+ TODO confirm */
+ for (cp=(unsigned char *)lm_fd->sql_seqp; seqp<seqm1 && *cp; ) {
+ if ((*seqp++=ap[*cp++])<NA &&
+ (*seqp++=ap[*cp++])<NA &&
+ (*seqp++=ap[*cp++])<NA &&
+ (*seqp++=ap[*cp++])<NA &&
+ (*seqp++=ap[*cp++])<NA &&
+ (*seqp++=ap[*cp++])<NA &&
+ (*seqp++=ap[*cp++])<NA &&
+ (*seqp++=ap[*cp++])<NA &&
+ (*seqp++=ap[*cp++])<NA &&
+ (*seqp++=ap[*cp++])<NA) continue;
+ --seqp;
+ if (*(cp-1)==0) break;
+ }
+ /* remember where to resume if the sequence did not fit */
+ lm_fd->sql_seqp = (char *)cp;
+
+ /* buffer full: ask the caller to call again for the rest */
+ if (seqp>=seqm1) (*lcont)++;
+ else {
+ *lcont=0;
+ /* after mysql_reopen() the result set is dropped once the
+ sequence has been read completely */
+ if (lm_fd->sql_reopen) {
+ mysql_free_result(lm_fd->mysql_res);
+ lm_fd->mysql_res = NULL;
+ }
+ }
+
+ *seqp = EOSEQ;
+ /* if ((int)(seqp-seq)==0) return 1; */
+ return (int)(seqp-seq);
+}
+
+/* mysql_ranlib_uid() builds an SQL statement in query[q_len] (q_len
+   must be >= 1) from the template tmpl by replacing the first '#' in
+   tmpl with uid.  The result is always NUL-terminated; text that does
+   not fit is truncated, the buffer is never overrun.  tmpl itself is
+   not modified.
+   Returns 1 on success, 0 if tmpl contains no '#'. */
+static int
+mysql_ranlib_uid(char *query, size_t q_len, const char *tmpl, const char *uid)
+{
+  const char *hash_p;
+  size_t n_pre;
+
+  if ((hash_p=strchr(tmpl,'#'))==NULL) return 0;
+
+  /* the text in front of the '#' */
+  n_pre = (size_t)(hash_p - tmpl);
+  if (n_pre > q_len-1) n_pre = q_len-1;
+  strncpy(query,tmpl,n_pre);
+  query[n_pre]='\0';
+
+  /* the count given to strncat() is the room that is left, not the
+     size of the destination buffer */
+  strncat(query,uid,q_len-1-strlen(query));
+  strncat(query,hash_p+1,q_len-1-strlen(query));
+  return 1;
+}
+
+/* mysql_ranlib_err() stores "gi|<libpos> <msg>" in str[cnt],
+   truncated to fit (cnt must be >= 1). */
+static void
+mysql_ranlib_err(char *str, int cnt, fseek_t libpos, const char *msg)
+{
+  char tmp_msg[128];
+
+  sprintf(tmp_msg,"gi|%ld ",(long)libpos);
+  strncat(tmp_msg,msg,sizeof(tmp_msg)-1-strlen(tmp_msg));
+  strncpy(str,tmp_msg,cnt-1);
+  str[cnt-1]='\0';
+}
+
+/* mysql_ranlib() positions the library on the entry whose identifier
+   is libstr.
+
+   (1) lm_fd->sql_getdesc, with its '#' replaced by libstr, is run and
+       the description is returned in str[cnt]: column 1 of the first
+       row (column 0 if column 1 is NULL), followed by column 1 of any
+       further rows separated by blanks, cut at the first '\r' or '\n'.
+       If the query fails, str receives a "gi|<libpos> ***...***"
+       message instead.
+   (2) lm_fd->sql_getseq, with its '#' replaced by libstr, is run and
+       its result set is left in lm_fd->mysql_res so that the sequence
+       can be read from it afterwards.
+
+   On return lm_fd->mysql_res is either that result set or NULL.
+   libpos is only stored in lm_fd->lpos and used in error messages.
+*/
+void
+mysql_ranlib(char *str,
+             int cnt,
+             fseek_t libpos,
+             char *libstr,
+             struct lmf_str *lm_fd
+             )
+{
+  char tmp_query[1024];
+  char *bp, *desc_p;
+
+  str[0]='\0';
+
+  /* a pending result set has to be released before the connection
+     can take another query; do it first so that no path loses it */
+  if (lm_fd->mysql_res !=NULL) {
+    mysql_free_result(lm_fd->mysql_res);
+    lm_fd->mysql_res = NULL;
+  }
+
+  /* put the UID into the query string - cannot use sprintf because of
+     "%' etc */
+  if (!mysql_ranlib_uid(tmp_query,sizeof(tmp_query),lm_fd->sql_getdesc,libstr)) {
+    fprintf(stderr, "no GID position in %s\n",lm_fd->sql_getdesc);
+    goto get_seq;
+  }
+  lm_fd->lpos = libpos;
+
+  /* fprintf(stderr," requesting: %s\n",tmp_query); */
+
+  if (mysql_query(lm_fd->mysql_conn,tmp_query)) {
+    fprintf(stderr,"*** Error - query failed:\n%s\n%s\n",tmp_query,
+            mysql_error(lm_fd->mysql_conn));
+    mysql_ranlib_err(str,cnt,libpos,"***Error - query failed***");
+    goto get_seq;
+  }
+
+  if ((lm_fd->mysql_res = mysql_use_result(lm_fd->mysql_conn)) == NULL) {
+    mysql_ranlib_err(str,cnt,libpos,"***use result failed***");
+    goto get_seq;
+  }
+
+  /* have the description */
+  if ((lm_fd->mysql_row = mysql_fetch_row(lm_fd->mysql_res))==NULL) {
+    mysql_ranlib_err(str,cnt,libpos,"***cannot fetch description***");
+    goto free_desc;
+  }
+
+  desc_p = (lm_fd->mysql_row[1] != NULL) ? lm_fd->mysql_row[1] : lm_fd->mysql_row[0];
+  if (desc_p != NULL) strncpy(str,desc_p,cnt-1);
+  str[cnt-1]='\0';
+
+  /* append the descriptions of any further rows while there is room */
+  while (strlen(str) < (size_t)(cnt-1) &&
+         (lm_fd->mysql_row = mysql_fetch_row(lm_fd->mysql_res))!=NULL) {
+    strncat(str," ",cnt-2-strlen(str));
+    if (lm_fd->mysql_row[1]!=NULL)
+      strncat(str,lm_fd->mysql_row[1],cnt-2-strlen(str));
+    else break;
+  }
+
+  str[cnt-1]='\0';
+  if ((bp = strchr(str,'\r'))!=NULL) *bp='\0';
+  if ((bp = strchr(str,'\n'))!=NULL) *bp='\0';
+
+ free_desc:
+  mysql_free_result(lm_fd->mysql_res);
+  lm_fd->mysql_res = NULL;
+
+ get_seq:
+  /* get the sequence, set up for mysql_getseq() */
+  /* put the UID into the query string */
+
+  if (!mysql_ranlib_uid(tmp_query,sizeof(tmp_query),lm_fd->sql_getseq,libstr)) {
+    fprintf(stderr, "no GID position in %s\n",lm_fd->sql_getseq);
+    return;
+  }
+
+  if (mysql_query(lm_fd->mysql_conn,tmp_query)) {
+    fprintf(stderr,"*** Error - query failed:\n%s\n%s\n",tmp_query,
+            mysql_error(lm_fd->mysql_conn));
+    return;             /* no result set to pick up; mysql_res stays NULL */
+  }
+
+  if ((lm_fd->mysql_res = mysql_use_result(lm_fd->mysql_conn)) == NULL) {
+    fprintf(stderr,"*** Error = use result failed\n%s\n",
+            mysql_error(lm_fd->mysql_conn));
+  }
+}
diff --git a/src/ncbl2_head.h b/src/ncbl2_head.h
new file mode 100644
index 0000000..1da9293
--- /dev/null
+++ b/src/ncbl2_head.h
@@ -0,0 +1,35 @@
+/* ncbl2_head.h definitions for reading NCBI blast2.0 (formatdb) format files */
+
+/* $Id: ncbl2_head.h 625 2011-03-23 17:21:38Z wrp $ */
+/* $Revision: 625 $ */
+
+/* sequence type codes; ncbl2_mlib.c compares AAFORMAT/NTFORMAT with
+ the sequence type word read from the index (.pin/.nin) file */
+#define AMINO_ACID_SEQTYPE 1
+#define AA_SEQTYPE AMINO_ACID_SEQTYPE
+#define AAFORMAT AA_SEQTYPE
+
+#define NUCLEIC_ACID_SEQTYPE 0
+#define NT_SEQTYPE NUCLEIC_ACID_SEQTYPE
+#define NTFORMAT NT_SEQTYPE
+
+/* Filename extensions used by the two types of databases (a.a. and nt.) */
+#define AA_LIST_EXT "pal"
+#define AA_HEADER_EXT "phr"
+#define AA_INDEX_EXT "pin"
+#define AA_SEARCHSEQ_EXT "psq"
+
+#define NT_LIST_EXT "nal"
+#define NT_HEADER_EXT "nhr"
+#define NT_INDEX_EXT "nin"
+#define NT_SEARCHSEQ_EXT "nsq"
+
+/* formatdb versions; ncbl2_mlib.c accepts an index file only if its
+ version word is one of these */
+#define FORMATDBV3 3 /* formatdb version */
+#define FORMATDBV4 4 /* formatdb version */
+
+#define NULLB '\0' /* sentinel byte */
+
+#ifndef CHAR_BIT
+#define CHAR_BIT 8 /* these values should match blast */
+#endif
+
+/* NOTE(review): not used in this header; presumably the number of bits
+ per nucleotide in the packed sequence format and the number of
+ sentinel bytes around a sequence - confirm against the users */
+#define NBPN 2
+#define NSENTINELS 2
diff --git a/src/ncbl2_mlib.c b/src/ncbl2_mlib.c
new file mode 100644
index 0000000..16cc8da
--- /dev/null
+++ b/src/ncbl2_mlib.c
@@ -0,0 +1,2442 @@
+/* ncbl2_mlib.c functions to read ncbi-blast format files from
+ formatdb (blast2.0 format files)
+*/
+
+/* copyright (c) 2006, 2014 by William R. Pearson and
+ The Rector and Visitors of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+/* Updated 22-Aug-2014 to include
+
+ ambiguity-decoding code from
+
+ Ralf Jost, Dipl.-Inform.
+ Director, Technical Bioinformatics
+ Biomax Informatics AG
+ ralf.jost at biomax.com
+
+ using code from NCBI Blast distribution
+*/
+
+/* $Name: $ - $Id: ncbl2_mlib.c 1291 2014-08-28 18:32:58Z wrp $ */
+
+/* to turn on mmap()ing for Blast2 files: */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <limits.h>
+#include <fcntl.h>
+#ifdef UNIX
+#include <unistd.h>
+#endif
+#include <errno.h>
+
+
+/* ****************************************************************
+
+17-May-2006
+
+Modified to read NCBI .[np]al and .msk files. The .nal or .pal file
+provides a way to read sequences from a list of files. The .msk file
+provides a compact way of indicating the subset of sequences in a
+larger database (typically nr or nt) that comprise a smaller database
+(e.g. swissprot or pdbaa). A .pal file (e.g. swissprot.00.pal) that
+uses a .msk file has the form:
+
+ # Alias file generated by genmask
+ # Date created: Mon Apr 10 11:24:05 2006
+ #
+ TITLE Non-redundant SwissProt sequences
+ DBLIST nr.00
+ OIDLIST swissprot.00.msk
+ LENGTH 74351250
+ NSEQ 198346
+ MAXOID 2617347
+ MEMB_BIT 1
+ # end of the file
+
+To work with this file, we must first load the nr.00 file, and then
+read the swissprot.00.msk file, and then scan all the entries in the
+swissprot.00.msk file (which are packed 32 mask-bit to an int) to
+determine whether a specific libpos index entry is present in the
+subset database.
+
+**************************************************************** */
+
+
+/* ****************************************************************
+This code reads NCBI Blast2 format databases from formatdb version 3 and 4
+
+(From NCBI) This section describes the format of the databases.
+
+Formatdb creates three main files for proteins containing indices,
+sequences, and headers with the extensions, respectively, of pin, psq,
+and phr (for nucleotides these are nin, nsq, and nhr). A number of
+other ISAM indices are created, but these are described elsewhere.
+
+FORMAT OF THE INDEX FILE
+------------------------
+
+1.) formatdb version number [4 bytes].
+
+2.) protein dump flag (1 for a protein database, 0 for a nucleotide
+ database) [4 bytes].
+
+3.) length of the database title in bytes [4 bytes].
+4.) the database title [length given in 3.)].
+5.) length of the date/time string [4 bytes].
+6.) the date/time string [length given in 5.)].
+7.) the number of sequences in the database [4 bytes].
+8.) the total length of the database in residues/basepairs [4 bytes].
+9.) the length of the longest sequence in the database [4 bytes].
+
+10.) a list of the offsets for definitions (one for each sequence) in
+the header file. There are num_of_seq+1 of these, where num_of_seq is
+the number of sequences given in 7.).
+
+11.) a list of the offsets for sequences (one for each sequence) in
+the sequence file. There are num_of_seq+1 of these, where num_of_seq
+is the number of sequences given in 7.).
+
+12.) a list of the offsets for the ambiguity characters (one for each
+sequence) in the sequence file. This list is only present for
+nucleotide databases and, since the database is compressed 4/1 for
+nucleotides, allows the ambiguity characters to be restored when the
+sequence is generated. There are num_of_seq+1 of these, where
+num_of_seq is the number of sequences given in 7.).
+
+
+FORMAT OF THE SEQUENCE FILE
+---------------------------
+
+There are different formats for the protein and nucleotide sequence files.
+
+The protein sequence file is quite simple. The first byte in the
+file is a NULL byte, followed by the sequence in ncbistdaa format
+(described in the NCBI Software Development Toolkit documentation).
+Following the sequence is another NULL byte, followed by the next
+sequence. The file ends with a NULL byte, following the last
+sequence.
+
+The nucleotide sequence file contains the nucleotide sequence, with
+four basepairs compressed into one byte. The format used is NCBI2na,
+documented in the NCBI Software Development Toolkit manual. Any
+ambiguity characters present in the original sequence are replaced at
+random by A, C, G or T. The true values of ambiguity characters are
+stored at the end of each sequence to allow true reproduction of the
+original sequence.
+
+FORMAT OF THE HEADER FILE (formatdb version 3)
+-------------------------
+
+The format of the header file depends on whether or not the identifiers in the
+original file were parsed or not. For the case that they were not, then each
+entry has the format:
+
+gnl|BL_ORD_ID|entry_number my favorite yeast sequence...
+
+Here entry_number gives the ordinal number of the sequence in the
+database (with zero offset). The identifier
+gnl|BL_ORD_ID|entry_number is used by the BLAST software to identify
+the entry, if the user has not provided another identifier. If the
+identifier was parsed, then gnl|BL_ORD_ID|entry_number is replaced by
+the correct identifier, as described in
+ftp://ncbi.nlm.nih.gov/blast/db/README .
+
+There are no separators between these deflines.
+
+For formatdb version 4, the header file contains blast ASN.1 binary
+deflines, which can be parsed with parse_fastadl_asn().
+
+FORMAT OF THE .MSK FILE
+-----------------------
+
+The .msk file is simply a packed list of masks for formatdb "oids" for
+some other file (typically nr). The first value is the last oid
+available; the remainder are packed 32 oids/mask, so that the number
+of masks is 1/32 the number of sequences in the file.
+
+**************************************************************** */
+
+#ifdef USE_MMAP
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#ifdef IBM_AIX
+#include <fcntl.h>
+#else
+#include <sys/fcntl.h>
+#endif
+#endif
+
+#ifdef USE_MMAP
+#ifndef MAP_FILE
+#define MAP_FILE 0
+#endif
+#endif
+
+#ifdef UNIX
+#define RBSTR "r"
+#else
+#define RBSTR "rb"
+#endif
+
+#ifdef WIN32
+#define SLASH_CHAR '\\'
+#define SLASH_STR "\\"
+#else
+#define SLASH_CHAR '/'
+#define SLASH_STR "/"
+#endif
+
+#define XTERNAL
+#include "uascii.h"
+
+#define XTERNAL
+#include "upam.h"
+#include "ncbl2_head.h"
+
+#include "defs.h"
+#include "structs.h"
+#include "mm_file.h"
+
+#define MAX_FADL_ACC_LEN 64
+
+unsigned long adler32(unsigned long, const unsigned char *, unsigned int);
+unsigned int bl2_uint4_cvt(unsigned int);
+unsigned int bl2_long4_cvt(long);
+uint64_t bl2_long8_cvt(uint64_t);
+void src_int4_read(FILE *fd, int *valp);
+void src_uint4_read(FILE *fd, unsigned int *valp);
+void src_long4_read(FILE *fd, long *valp);
+void src_long8_read(FILE *fd, int64_t *val);
+void ncbi_long8_read(FILE *fd, int64_t *valp);
+void src_char_read(FILE *fd, char *valp);
+unsigned char *parse_fastadl_asn(unsigned char *asn_buff, unsigned char *asn_max,
+ int *gi_p, int *db, char *acc, size_t acc_len,
+ char *name, size_t name_len,
+ char *title, size_t t_len, int *taxid);
+
+/* nt_btoa maps from blast 2bit format to ascii characters */
+static char nt_btoa[5] = {"ACGT"};
+
+static char *aa_b2toa= "-ABCDEFGHIKLMNPQRSTVWXYZU*OJ"; /* NCBIstdaa */
+
+static int aa_btof[32]; /* maps to fasta alphabet */
+static int aa_btof_null = 0;
+
+static int dbtype, dbformat, amb_cnt;
+
+#define NCBIBL20 12
+
+struct lmf_str *load_ncbl2(struct lmf_str *m_fptr, FILE *ifile, int dbformat, int dbtype);
+
+int ncbl2_get_mmap_chain_o(struct seqr_chain *cur_seqr_chain,
+ struct lmf_str *m_fd, struct db_str *db);
+
+int ncbl2_get_mmap_chain(struct seqr_chain *cur_seqr_chain,
+ struct lmf_str *m_fd, struct db_str *db);
+
+int ncbl2_getliba(unsigned char *, int, char *, int, fseek_t *, int *, struct lmf_str *, long *);
+int ncbl2_getlibn(unsigned char *, int, char *, int, fseek_t *, int *, struct lmf_str *, long *);
+
+int ncbl2_getliba_o(unsigned char *, int, char *, int, fseek_t *, int *, struct lmf_str *, long *);
+int ncbl2_getlibn_o(unsigned char *, int, char *, int, fseek_t *, int *, struct lmf_str *, long *);
+
+void newname(char *, char *, char *, int);
+void parse_pal(char *, char *, int *, int *, FILE *);
+
+int readMFILE (void *buffer, size_t size, int nitems, struct lmf_str *m_fd);
+
+void ncbl2_ranlib(char *, int, fseek_t, char *, struct lmf_str *m_fd);
+
+/* ncbl2_openlib() opens a BLAST2.0 (formatdb version 3 or 4) format
+   library and returns an lmf_str that can be used to read it.
+
+   lib_p->file_name is the library name without extension; ldnaseq
+   selects the protein (SEQT_PROT) or the DNA file extensions.  If a
+   .pal/.nal list file can be opened, it is parsed with parse_pal(),
+   the file names are built from the name it supplies, and the .msk
+   OID mask is loaded; otherwise the names are built directly from
+   lib_p->file_name.  With USE_MMAP the sequence file is mmap()ed; if
+   that is not possible (or USE_MMAP is not defined) it is opened
+   with fopen().
+
+   Returns the value of load_ncbl2() on success.  Returns NULL on
+   failure, after closing every file and freeing every buffer that
+   was opened/allocated here.
+*/
+
+struct lmf_str *
+ncbl2_openlib(struct lib_struct *lib_p, int ldnaseq)
+{
+  char lname[256];      /* .pal, .nal file name */
+  char dname[256];      /* .pin, .nin file for files included from .msk files */
+  char msk_name[256];   /* .msk file name */
+  char hname[256];      /* .phr, .nhr */
+  char sname[256];      /* .psq, .nsq */
+  char tname[256];      /* .pin, .nin file */
+  char db_dir[256];     /* directory where all the files live */
+  int pref_db= -1;      /* right now, only swissprot.pal, pdbaa.pal
+                           are used for masked OID files */
+  char *bp;
+  int oid_seqs, max_oid, have_oid_list = 0;
+  int oid_cnt, oid_len;
+  unsigned int *oid_list = NULL, o_max;
+  int tmp;
+  int i;
+#ifdef USE_MMAP
+  struct stat statbuf;
+#endif
+  FILE *ifile = NULL;   /* index offsets, also DB info */
+  struct lmf_str *m_fptr = NULL;
+
+  /* this function should be reorganized
+     (1) check to see if there is a .nal/.pal file before doing things
+         with names, since the names might be wrong.
+     (2) if there is a .nal/.pal file, check to see if it has a DBLIST
+         (later), if it does, then modify the lib_p->next chain
+         if it does not, then generate the appropriate file names, and
+         read the oid list
+     (3) otherwise (no .nal/.pal file), generate the file names
+  */
+
+  if (ldnaseq==SEQT_PROT) {
+    newname(lname,lib_p->file_name,AA_LIST_EXT,(int)sizeof(lname));     /* .pal */
+  }
+  else {
+    newname(lname,lib_p->file_name,NT_LIST_EXT,(int)sizeof(lname));     /* .nal */
+  }
+
+  /* check for a .nal/.pal OID list file */
+  max_oid = oid_seqs = 0;
+
+  /* here, we check for a .pal/.nal file by trying to open it */
+  if ((ifile = fopen(lname,"r"))!=NULL) {
+
+    if ((bp = strrchr(lib_p->file_name,SLASH_CHAR))!=NULL) {
+      *bp = '\0';
+      SAFE_STRNCPY(db_dir,lib_p->file_name,sizeof(db_dir));
+      SAFE_STRNCAT(db_dir,SLASH_STR,sizeof(db_dir));
+      *bp = SLASH_CHAR;
+    }
+    else {
+      db_dir[0]='\0';
+    }
+
+    /* we have a list file, we need to parse it */
+    parse_pal(dname, msk_name, &oid_seqs, &max_oid, ifile);
+    fclose(ifile);
+    ifile = NULL;
+
+    pref_db = -1;
+
+    if (oid_seqs <= 0) {
+      /* a .pal/.nal file without OIDs is not handled here.  Stop in
+         every build; falling through would use have_oid_list and the
+         index/header/sequence file names without ever setting them */
+      fprintf(stderr," *** WARNING -- found .pal/.nal file %s with no OIDs\n",lname);
+      return NULL;
+    }
+
+    have_oid_list = 1;
+    /* get the pref_db before adding the directory */
+    if (strncmp(msk_name,"swissprot",9)==0) {
+      pref_db = 7;
+    }
+    else if (strncmp(msk_name,"pdbaa",5)==0) {
+      pref_db = 14;
+    }
+
+    /* need to add directory to both dname and msk_name */
+    SAFE_STRNCPY(tname,db_dir,sizeof(tname));
+    SAFE_STRNCAT(tname,msk_name, sizeof(tname));
+    SAFE_STRNCPY(msk_name, tname, sizeof(msk_name));
+
+    SAFE_STRNCPY(tname,db_dir,sizeof(tname));
+    SAFE_STRNCAT(tname,dname, sizeof(tname));
+    SAFE_STRNCPY(dname,tname,sizeof(dname));
+
+    if (ldnaseq == SEQT_PROT) {
+      newname(tname,dname,AA_INDEX_EXT,(int)sizeof(tname));
+      newname(hname,dname,AA_HEADER_EXT,(int)sizeof(hname));
+      newname(sname,dname,AA_SEARCHSEQ_EXT,(int)sizeof(sname));
+    }
+    else {      /* reading DNA library */
+      newname(tname,dname,NT_INDEX_EXT,(int)sizeof(tname));
+      newname(hname,dname,NT_HEADER_EXT,(int)sizeof(hname));
+      newname(sname,dname,NT_SEARCHSEQ_EXT,(int)sizeof(sname));
+    }
+
+    /* now load the oid file */
+    if ((ifile = fopen(msk_name,RBSTR))==NULL) {
+      fprintf(stderr,"error - cannot load %s file\n",msk_name);
+      goto error_r;
+    }
+    src_uint4_read(ifile,&o_max);
+    if (o_max != max_oid) {
+      fprintf(stderr," error - oid count mismatch %d != %d\n",max_oid, o_max);
+    }
+    /* the mask holds one bit per oid, packed 32 to an int */
+    oid_len = (max_oid/32+1);
+    if ((oid_list=(unsigned int *)calloc(oid_len,sizeof(int)))==NULL) {
+      fprintf(stderr," error - cannot allocate oid_list[%d]\n",oid_len);
+      goto error_r;
+    }
+    if ((oid_cnt=fread(oid_list,sizeof(int),oid_len,ifile))==0) {
+      fprintf(stderr," error - cannot read oid_list[%d]\n",oid_len);
+      goto error_r;
+    }
+    fclose(ifile);
+    ifile = NULL;
+  }
+  else {        /* no .pal/.nal file -- generate the names */
+    have_oid_list = 0;
+
+    /* initialize file names */
+    if (ldnaseq==SEQT_PROT) {   /* read a protein database */
+      newname(tname,lib_p->file_name,AA_INDEX_EXT,(int)sizeof(tname));         /* .pin */
+      newname(hname,lib_p->file_name,AA_HEADER_EXT,(int)sizeof(hname));        /* .phr */
+      newname(sname,lib_p->file_name,AA_SEARCHSEQ_EXT,(int)sizeof(sname));     /* .psq */
+    }
+    else {      /* reading DNA library */
+      newname(tname,lib_p->file_name,NT_INDEX_EXT,(int)sizeof(tname));         /* .nin */
+      newname(hname,lib_p->file_name,NT_HEADER_EXT,(int)sizeof(hname));        /* .nhr */
+      newname(sname,lib_p->file_name,NT_SEARCHSEQ_EXT,(int)sizeof(sname));     /* .nsq */
+    }
+  }
+
+  if (ldnaseq == SEQT_PROT) {
+    /* initialize map of BLAST2 amino acids to FASTA amino acids */
+    for (i=0; aa_b2toa[i]; i++) {
+      if ((tmp=aascii[aa_b2toa[i]])<NA) {
+        aa_btof[i]=tmp;
+        if (aa_b2toa[i] == 'O' || aa_b2toa[i] == 'o') aa_btof[i] = aascii['K'];
+        else if (aa_b2toa[i] == 'U' || aa_b2toa[i] == 'u') aa_btof[i] = aascii['C'];
+      }
+      else if (aa_b2toa[i]=='*') aa_btof[i]=aascii['X'];
+      else aa_btof[i]=0;
+/*    else aa_btof[i]=aascii['X']; */
+    }
+
+    /* check to see if aa_btof[] actually does anything interesting.
+       aa_b2toa is a pointer, so sizeof(aa_b2toa) is the size of the
+       pointer, not of the alphabet; walk the string to its end so
+       that every entry set above is examined */
+    aa_btof_null = 1;
+    for (i=0; aa_b2toa[i]; i++) {
+      if (i != aa_btof[i]) {
+#ifdef DEBUG
+        fprintf(stderr," difference at: i: %d [%c] != aa_btof[i]: %d [%c]\n",
+                i,aa_b2toa[i],aa_btof[i],NCBIstdaa[aa_btof[i]]);
+#endif
+        aa_btof_null = 0;
+      }
+    }
+  }     /* ldnaseq == SEQT_PROT */
+
+  /* now we have all the file names, open the files and read the data */
+  /* open the index/header file, and read the sequence type info */
+  if ((ifile = fopen(tname,RBSTR))==NULL) {
+    fprintf(stderr," cannot open %s (%s) INDEX file",tname,lib_p->file_name);
+    perror("...");
+    goto error_r;
+  }
+  src_uint4_read(ifile,(unsigned *)&dbformat);  /* get format DB version number */
+  src_uint4_read(ifile,(unsigned *)&dbtype);    /* get 1 for protein/0 DNA */
+
+  if (dbformat != FORMATDBV3 && dbformat!=FORMATDBV4) {
+    fprintf(stderr,"error - %s wrong formatdb version (%d/%d)\n",
+            tname,dbformat,FORMATDBV3);
+    goto error_r;
+  }
+
+  if ((ldnaseq==SEQT_PROT && dbtype != AAFORMAT) ||
+      (ldnaseq==SEQT_DNA && dbtype!=NTFORMAT)) {
+    fprintf(stderr,"error - %s wrong format (%d/%d)\n",
+            tname,dbtype,(ldnaseq ? NTFORMAT: AAFORMAT));
+    goto error_r;
+  }
+
+  /* the files are there - allocate lmf_str */
+  if ((m_fptr=(struct lmf_str *)calloc(1,sizeof(struct lmf_str)))==NULL) {
+    fprintf(stderr," cannot allocate lmf_str\n");
+    goto error_r;
+  }
+
+  m_fptr->lib_aa = (ldnaseq == 0);
+  m_fptr->tmp_buf_max = 4096;
+  if ((m_fptr->tmp_buf=
+       (char *)calloc(m_fptr->tmp_buf_max,sizeof(char)))==NULL) {
+    fprintf(stderr," cannot allocate lmf_str->tmp_buffer\n");
+    goto error_r;
+  }
+
+  /* load the oid info */
+  m_fptr->have_oid_list = have_oid_list;
+  m_fptr->max_oid = max_oid;
+  m_fptr->oid_seqs = oid_seqs;
+  m_fptr->oid_list = oid_list;
+  m_fptr->pref_db= pref_db;
+  m_fptr->get_mmap_chain = NULL;
+
+  /* open the header file */
+  if ((m_fptr->hfile = fopen(hname,RBSTR))==NULL) {
+    fprintf(stderr," cannot open %s header file\n",hname);
+    goto error_r;
+  }
+
+  /* ncbl2_ranlib is used for all BLAST2.0 access */
+  m_fptr->ranlib = ncbl2_ranlib;
+  m_fptr->bl_format_ver = dbformat;
+  m_fptr->lb_type = NCBIBL20;
+  m_fptr->libf = NULL;
+
+  /* the "_o" readers are used when an OID mask selects the entries */
+  if (ldnaseq==SEQT_DNA) {
+    if (oid_seqs > 0) {
+      m_fptr->getlib = ncbl2_getlibn_o;
+    }
+    else {
+      m_fptr->getlib = ncbl2_getlibn;
+    }
+    m_fptr->sascii = nascii;
+  }
+  else {
+    if (oid_seqs > 0) {
+      m_fptr->getlib = ncbl2_getliba_o;
+      m_fptr->get_mmap_chain = ncbl2_get_mmap_chain_o;
+    }
+    else {
+      m_fptr->getlib = ncbl2_getliba;
+      m_fptr->get_mmap_chain = ncbl2_get_mmap_chain;
+    }
+    m_fptr->sascii = aascii;
+  }
+  m_fptr->lb_name = lib_p->file_name;
+
+  /* open the sequence file */
+
+#if defined (USE_MMAP)
+  m_fptr->mm_flg=((m_fptr->mmap_fd=open(sname,O_RDONLY))>=0);
+  if (!m_fptr->mm_flg) {
+    fprintf(stderr," cannot open %s",sname);
+    perror("...");
+  }
+  else {
+    if(fstat(m_fptr->mmap_fd, &statbuf) < 0) {
+      fprintf(stderr," cannot fstat %s",sname);
+      perror("...");
+      m_fptr->mm_flg = 0;
+    }
+    else {
+      m_fptr->st_size = statbuf.st_size;
+      if((m_fptr->mmap_base =
+          mmap(NULL, m_fptr->st_size, PROT_READ,
+               MAP_FILE | MAP_SHARED, m_fptr->mmap_fd, 0)) == (char *) -1) {
+        fprintf(stderr," cannot mmap %s",sname);
+        perror("...");
+        m_fptr->mm_flg = 0;
+      }
+      else {
+        m_fptr->mmap_addr = m_fptr->mmap_base;
+        m_fptr->mm_flg = 1;
+      }
+    }
+    /* regardless, close the open()ed version */
+    close(m_fptr->mmap_fd);
+  }
+#else
+  m_fptr->mm_flg = 0;
+#endif
+
+  /* not mapped: read the sequence file through stdio instead */
+  if (!m_fptr->mm_flg) {
+    if ((m_fptr->libf = fopen(sname,RBSTR))==NULL) {
+      fprintf(stderr," cannot open %s sequence file",sname);
+      perror("...");
+      goto error_r;
+    }
+  }
+
+/* all files should be open -- the rest of the work can be done by a
+   function common to ncbl2_openlib() and ncbl2_reopen()
+*/
+
+  return load_ncbl2(m_fptr, ifile, dbformat, dbtype);
+
+ error_r:
+  /* release whatever was opened/allocated before the failure;
+     oid_list is freed once here, m_fptr->oid_list is the same pointer */
+  if (ifile != NULL) fclose(ifile);
+  if (m_fptr != NULL) {
+    if (m_fptr->hfile != NULL) fclose(m_fptr->hfile);
+    if (m_fptr->tmp_buf != NULL) free(m_fptr->tmp_buf);
+    free(m_fptr);
+  }
+  if (oid_list != NULL) free(oid_list);
+  return NULL;
+}
+
+
+/* **************************************************************** */
+/* re-open an already opened library file using information in m_fptr
+
+ a valid m_fptr guarantees that the necessary files exist, and we
+ know whether there is an OID file. m_fptr's are NEVER used for
+ file lists, only for files with sequence data
+*/
+/* **************************************************************** */
+
+struct lmf_str *ncbl2_reopen(struct lmf_str *m_fptr) {
+ char lname[256]; /* .pal, .nal file name */
+ char dname[256]; /* .pin, .nin file for files included from .msk files */
+ char msk_name[256]; /* .msk file name */
+ char hname[256]; /* .phr, .nhr */
+ char sname[256]; /* .psq, .nsq */
+ char tname[256]; /* .pin, .nin file */
+ char db_dir[256]; /* directory where all the files live */
+ int pref_db= -1; /* right now, only swissprot.pal, pdbaa.pal
+ are used for masked OID files */
+ char *bp;
+ int oid_seqs, max_oid, have_oid_list;
+ int oid_cnt, oid_len;
+ unsigned int *oid_list, o_max;
+#ifdef USE_MMAP
+ struct stat statbuf;
+#endif
+ FILE *ifile; /* index offsets, also DB info */
+
+ /* its not open, but its being re-used, so re-initialize things */
+ m_fptr->libf = NULL;
+ m_fptr->mmap_fd = -1;
+ m_fptr->mm_flg = 0;
+
+ /* if we have an oid list, open it, read it, and use it to get the file names */
+ /* check for a .nal/.pal OID list file */
+ max_oid = oid_seqs = 0;
+ oid_list = NULL;
+
+ if (m_fptr->have_oid_list) {
+ if (m_fptr->lib_aa==1) {
+ newname(lname,m_fptr->lb_name,AA_LIST_EXT,(int)sizeof(lname)); /* .pal */
+ }
+ else {
+ newname(lname,m_fptr->lb_name,NT_LIST_EXT,(int)sizeof(lname)); /* .nal */
+ }
+
+ ifile = fopen(lname,"r"); /* it has to open, it did before */
+ if ((bp = strrchr(m_fptr->lb_name,SLASH_CHAR))!=NULL) {
+ *bp = '\0';
+ SAFE_STRNCPY(db_dir,m_fptr->lb_name,sizeof(db_dir));
+ SAFE_STRNCAT(db_dir,SLASH_STR,sizeof(db_dir));
+ *bp = SLASH_CHAR;
+ }
+ else {
+ db_dir[0]='\0';
+ }
+
+ /* we have a list file, we need to parse it */
+ parse_pal(dname, msk_name, &oid_seqs, &max_oid, ifile);
+ fclose(ifile);
+
+ /* we have read the .pal/.nal file, now deal with the .msk file */
+
+ pref_db = -1;
+
+ /* get the pref_db before adding the directory */
+ if (strncmp(msk_name,"swissprot",9)==0) {
+ pref_db = 7;
+ }
+ else if (strncmp(msk_name,"pdbaa",5)==0) {
+ pref_db = 14;
+ }
+
+ /* need to add directory to both dname and msk_name */
+ SAFE_STRNCPY(tname,db_dir,sizeof(tname));
+ SAFE_STRNCAT(tname,msk_name, sizeof(tname));
+ SAFE_STRNCPY(msk_name, tname, sizeof(msk_name));
+
+ SAFE_STRNCPY(tname,db_dir,sizeof(tname));
+ SAFE_STRNCAT(tname,dname, sizeof(tname));
+ SAFE_STRNCPY(dname,tname,sizeof(dname));
+
+ if (m_fptr->lib_aa) {
+ newname(tname,dname,AA_INDEX_EXT,(int)sizeof(tname));
+ newname(hname,dname,AA_HEADER_EXT,(int)sizeof(hname));
+ newname(sname,dname,AA_SEARCHSEQ_EXT,(int)sizeof(sname));
+ }
+ else { /* reading DNA library */
+ newname(tname,dname,NT_INDEX_EXT,(int)sizeof(tname));
+ newname(hname,dname,NT_HEADER_EXT,(int)sizeof(hname));
+ newname(sname,dname,NT_SEARCHSEQ_EXT,(int)sizeof(sname));
+ }
+
+ ifile = fopen(msk_name,RBSTR);
+ src_uint4_read(ifile,&o_max);
+ oid_len = (max_oid/32+1);
+ if ((oid_list=(unsigned int *)calloc(oid_len,sizeof(int)))==NULL) {
+ fprintf(stderr," error - cannot allocate oid_list[%d]\n",oid_len);
+ return NULL;
+ }
+ if ((oid_cnt=fread(oid_list,sizeof(int),oid_len,ifile))==0) {
+ fprintf(stderr," error - cannot read oid_list[%d]\n",oid_len);
+ return NULL;
+ }
+ fclose(ifile);
+ }
+ else { /* else no OID/.msk file -- generate the names */
+ /* initialize file names */
+ if (m_fptr->lib_aa) { /* read a protein database */
+ newname(tname,m_fptr->lb_name,AA_INDEX_EXT,(int)sizeof(tname)); /* .pin */
+ newname(hname,m_fptr->lb_name,AA_HEADER_EXT,(int)sizeof(hname)); /* .phr */
+ newname(sname,m_fptr->lb_name,AA_SEARCHSEQ_EXT,(int)sizeof(sname)); /* .psq */
+
+ }
+ else { /* reading DNA library */
+ newname(tname,m_fptr->lb_name,NT_INDEX_EXT,(int)sizeof(tname)); /* .nin */
+ newname(hname,m_fptr->lb_name,NT_HEADER_EXT,(int)sizeof(hname)); /* .nhr */
+ newname(sname,m_fptr->lb_name,NT_SEARCHSEQ_EXT,(int)sizeof(sname)); /* .nsq */
+ }
+ }
+
+ /* now we have all the file names, open the files and read the data */
+ /* open the index/header file, and read the sequence type info */
+ if ((ifile = fopen(tname,RBSTR))==NULL) {
+ fprintf(stderr," cannot open %s (%s) INDEX file",tname,m_fptr->lb_name);
+ perror("...");
+ return NULL;
+ }
+ src_uint4_read(ifile,(unsigned *)&dbformat); /* get format DB version number */
+ src_uint4_read(ifile,(unsigned *)&dbtype); /* get 1 for protein/0 DNA */
+
+ m_fptr->tmp_buf_max = 4096;
+ if ((m_fptr->tmp_buf=
+ (char *)calloc(m_fptr->tmp_buf_max,sizeof(char)))==NULL) {
+ fprintf(stderr," cannot allocate lmf_str->tmp_buffer\n");
+ return NULL;
+ }
+
+ /* load the oid info */
+ m_fptr->max_oid = max_oid;
+ m_fptr->oid_seqs = oid_seqs;
+ m_fptr->oid_list = oid_list;
+ m_fptr->pref_db= pref_db;
+ m_fptr->get_mmap_chain = NULL;
+
+ /* open the header file */
+ m_fptr->hfile = fopen(hname,RBSTR);
+
+ /* ncbl2_ranlib is used for all BLAST2.0 access */
+ m_fptr->ranlib = ncbl2_ranlib;
+ m_fptr->bl_format_ver = dbformat;
+
+ if (!m_fptr->lib_aa) {
+ if (oid_seqs > 0) {
+ m_fptr->getlib = ncbl2_getlibn_o;
+ }
+ else {
+ m_fptr->getlib = ncbl2_getlibn;
+ }
+ m_fptr->sascii = nascii;
+ }
+ else {
+ if (oid_seqs > 0) { m_fptr->getlib = ncbl2_getliba_o; }
+ else { m_fptr->getlib = ncbl2_getliba; }
+ m_fptr->sascii = aascii;
+ }
+
+ /* open the sequence file */
+
+#if defined (USE_MMAP)
+ m_fptr->mm_flg=((m_fptr->mmap_fd=open(sname,O_RDONLY))>=0);
+ if(fstat(m_fptr->mmap_fd, &statbuf) < 0) {
+ fprintf(stderr," cannot fstat %s",sname);
+ perror("...");
+ m_fptr->mm_flg = 0;
+ }
+ else {
+ m_fptr->st_size = statbuf.st_size;
+ if((m_fptr->mmap_base =
+ mmap(NULL, m_fptr->st_size, PROT_READ,
+ MAP_FILE | MAP_SHARED, m_fptr->mmap_fd, 0)) == (char *) -1) {
+ fprintf(stderr," cannot mmap %s",sname);
+ perror("...");
+ m_fptr->mm_flg = 0;
+ }
+ else {
+ m_fptr->mmap_addr = m_fptr->mmap_base;
+ m_fptr->mm_flg = 1;
+ }
+ }
+ /* regardless, close the open()ed version */
+ close(m_fptr->mmap_fd);
+#else
+ m_fptr->mm_flg = 0;
+#endif
+
+ if (!m_fptr->mm_flg) {
+ m_fptr->libf = fopen(sname,RBSTR);
+ if (!m_fptr->libf) {
+ fprintf(stderr," cannot open %s\n",sname);
+ return NULL;
+ }
+ m_fptr->mm_flg = 0;
+ }
+
+ return load_ncbl2(m_fptr, ifile, dbformat, dbtype);
+}
+
+/* load_ncbl2() - read the rest of an already opened BLAST2 index file
+ *
+ * The caller has already consumed the first two 4-byte words of ifile
+ * and passes them in as dbformat (formatdb version) and dbtype.  This
+ * routine then reads, in file order:
+ *   - title length and title, date length and date (read, then discarded)
+ *   - max_cnt, tot_len (4 bytes when dbformat == FORMATDBV3, otherwise
+ *     8 bytes), and max_len
+ *   - max_cnt+1 header offsets   -> m_fptr->d_pos_arr
+ *   - max_cnt+1 sequence offsets -> m_fptr->s_pos_arr
+ *   - when dbtype == NTFORMAT, max_cnt+1 ambiguity offsets -> m_fptr->a_pos_arr
+ *
+ * Returns m_fptr on success; ifile is closed in that case.  On failure
+ * everything allocated here is released, ifile is closed, m_fptr is
+ * freed (as before) and NULL is returned.  Resources the caller stored
+ * in m_fptr earlier (tmp_buf, oid_list, hfile, libf/mmap) are not
+ * released here.
+ */
+struct lmf_str
+*load_ncbl2(struct lmf_str *m_fptr, FILE *ifile, int dbformat, int dbtype) {
+  int title_len;
+  char *title_str=NULL;
+  int date_len;
+  char *date_str=NULL;
+  long ltmp;
+  int64_t l8tmp;
+  int i, tmp;
+  size_t n_off;                       /* entries per offset table: max_cnt+1 */
+  unsigned int *f_pos_arr=NULL;       /* raw 4-byte offsets as stored in the file */
+  MM_OFF *d_pos=NULL, *s_pos=NULL, *a_pos=NULL;  /* tables allocated here, for cleanup */
+
+  src_uint4_read(ifile,(unsigned *)&title_len);
+
+  if (title_len > 0) {
+    if ((title_str = calloc((size_t)title_len+1,sizeof(char)))==NULL) {
+      fprintf(stderr," cannot allocate title string (%d)\n",title_len);
+      goto error_r;
+    }
+    fread(title_str,(size_t)1,(size_t)title_len,ifile);
+  }
+
+  src_uint4_read(ifile,(unsigned *)&date_len);
+
+  if (date_len > 0) {
+    if ((date_str = calloc((size_t)date_len+1,sizeof(char)))==NULL) {
+      fprintf(stderr," cannot allocate date string (%d)\n",date_len);
+      goto error_r;
+    }
+    fread(date_str,(size_t)1,(size_t)date_len,ifile);
+  }
+
+  m_fptr->lpos = 0;
+  src_uint4_read(ifile,(unsigned *)&m_fptr->max_cnt);
+
+  if (dbformat == FORMATDBV3) {
+    src_long4_read(ifile,&ltmp);
+    m_fptr->tot_len = ltmp;
+  }
+  else {
+    ncbi_long8_read(ifile,&l8tmp);
+    m_fptr->tot_len = l8tmp;
+  }
+
+  src_long4_read(ifile,&ltmp);
+  m_fptr->max_len = ltmp;
+
+  /* currently we are not using this information, but perhaps later */
+  /* pointers are cleared so the error path below cannot free them twice */
+  free(title_str); title_str = NULL;
+  free(date_str); date_str = NULL;
+
+#ifdef DEBUG
+  fprintf(stderr,"%s format: BL2 (%s) max_cnt: %d, totlen: %lld, maxlen %ld\n",
+          m_fptr->lb_name,m_fptr->mm_flg ? "mmap" : "fopen",
+          m_fptr->max_cnt,m_fptr->tot_len,m_fptr->max_len);
+#endif
+
+  n_off = (size_t)m_fptr->max_cnt+1;
+
+  /* allocate and read hdr indexes */
+  if ((f_pos_arr=(unsigned int *)calloc(n_off,sizeof(int)))==NULL) {
+    fprintf(stderr," cannot allocate tmp header pointers\n");
+    goto error_r;
+  }
+
+  if ((d_pos=(MM_OFF *)calloc(n_off,sizeof(MM_OFF)))==NULL) {
+    fprintf(stderr," cannot allocate header pointers\n");
+    goto error_r;
+  }
+  m_fptr->d_pos_arr = d_pos;
+
+  /* allocate and read sequence offsets */
+  if ((s_pos=(MM_OFF *)calloc(n_off,sizeof(MM_OFF)))==NULL) {
+    fprintf(stderr," cannot allocate sequence pointers\n");
+    goto error_r;
+  }
+  m_fptr->s_pos_arr = s_pos;
+
+  if (fread(f_pos_arr,(size_t)4,n_off,ifile)!=n_off) {
+    fprintf(stderr," error reading hdr offsets: %s\n",m_fptr->lb_name);
+    goto error_r;
+  }
+
+  for (i=0; i<=m_fptr->max_cnt; i++) {
+#ifdef IS_BIG_ENDIAN
+    m_fptr->d_pos_arr[i] = f_pos_arr[i];
+#else
+    m_fptr->d_pos_arr[i] = bl2_uint4_cvt(f_pos_arr[i]);
+#endif
+  }
+
+  if (fread(f_pos_arr,(size_t)4,n_off,ifile)!=n_off) {
+    fprintf(stderr," error reading seq offsets: %s\n",m_fptr->lb_name);
+    goto error_r;
+  }
+  for (i=0; i<=m_fptr->max_cnt; i++) {
+#ifdef IS_BIG_ENDIAN
+    m_fptr->s_pos_arr[i] = f_pos_arr[i];
+#else
+    m_fptr->s_pos_arr[i] = bl2_uint4_cvt(f_pos_arr[i]);
+#endif
+  }
+
+  if (dbtype == NTFORMAT) {
+    /* allocate and read ambiguity offsets */
+    if ((a_pos=(MM_OFF *)calloc(n_off,sizeof(MM_OFF)))==NULL) {
+      fprintf(stderr," cannot allocate sequence pointers\n");
+      goto error_r;
+    }
+    m_fptr->a_pos_arr = a_pos;
+
+    if (fread(f_pos_arr,(size_t)4,n_off,ifile)!=n_off) {
+      fprintf(stderr," error reading seq offsets: %s\n",m_fptr->lb_name);
+      goto error_r;
+    }
+    for (i=0; i<=m_fptr->max_cnt; i++) {
+#ifdef IS_BIG_ENDIAN
+      m_fptr->a_pos_arr[i] = f_pos_arr[i];
+#else
+      m_fptr->a_pos_arr[i] = bl2_uint4_cvt(f_pos_arr[i]);
+#endif
+    }
+  }
+
+  /* all done with ifile, close it */
+  fclose(ifile);
+  free(f_pos_arr);
+
+  /* when reading with stdio, the first byte of the sequence file is
+     expected to be NULLB; anything else is only reported */
+  if (!m_fptr->mm_flg) {
+    tmp = fgetc(m_fptr->libf);
+    if (tmp!=NULLB)
+      fprintf(stderr," phase error: %d:%d found\n",0,tmp);
+  }
+
+  m_fptr->bl_lib_pos = 1;
+  amb_cnt = 0;        /* amb_cnt is declared outside this function */
+
+  return m_fptr;
+
+ error_r:
+  /* here if failure after m_fptr allocated: release everything this
+     function acquired, then m_fptr itself */
+  free(title_str);
+  free(date_str);
+  free(f_pos_arr);
+  free(d_pos);
+  free(s_pos);
+  free(a_pos);
+  fclose(ifile);
+  free(m_fptr);
+  return NULL;
+}
+
+/* **************************************************************** */
+/* ncbl2_closelib() - release per-library buffers and close files
+ *
+ * Frees tmp_buf, s_pos_arr, a_pos_arr and oid_list, closes hfile (and
+ * frees d_pos_arr together with it), closes libf, and clears mm_flg.
+ * The lmf_str structure itself is not freed.
+ *
+ * NOTE(review): the munmap() section is guarded by "use_mmap" (lower
+ * case) while the open code tests USE_MMAP, so unless use_mmap is
+ * defined somewhere outside this view the mapping is never released
+ * here.  Confirm whether that is deliberate before changing it: the
+ * mmap chain readers hand out pointers directly into the mapping.
+ */
+/* **************************************************************** */
+
+void ncbl2_closelib(struct lmf_str *m_fptr)
+{
+  /* header-parsing scratch buffer */
+  if (m_fptr->tmp_buf) {
+    free(m_fptr->tmp_buf);
+    m_fptr->tmp_buf_max = 0;
+    m_fptr->tmp_buf = NULL;
+  }
+
+  /* sequence and ambiguity offset tables */
+  if (m_fptr->s_pos_arr) {
+    free(m_fptr->s_pos_arr);
+    m_fptr->s_pos_arr = NULL;
+  }
+  if (m_fptr->a_pos_arr) {
+    free(m_fptr->a_pos_arr);
+    m_fptr->a_pos_arr = NULL;
+  }
+
+  /* the header offset table is only released together with hfile */
+  if (m_fptr->hfile) {
+    fclose(m_fptr->hfile);
+    free(m_fptr->d_pos_arr);
+    m_fptr->d_pos_arr = NULL;
+    m_fptr->hfile = NULL;
+  }
+
+  /* OID mask and its counters */
+  if (m_fptr->oid_list) {
+    free(m_fptr->oid_list);
+    m_fptr->oid_list = NULL;
+    m_fptr->max_oid = 0;
+    m_fptr->oid_seqs = 0;
+  }
+
+#ifdef use_mmap
+  if (m_fptr->mm_flg) {
+    munmap(m_fptr->mmap_base,m_fptr->st_size);
+    m_fptr->mmap_fd = -1;
+  }
+  else
+#endif
+  if (m_fptr->libf) {
+    fclose(m_fptr->libf);
+    m_fptr->libf = NULL;
+  }
+
+  m_fptr->mm_flg = 0;
+}
+
+/* **************************************************************** */
+/* read a protein sequence using OID offsets
+ *
+ * Starting at m_fd->lpos, scans the OID bit mask (m_fd->oid_list, one
+ * bit per entry, most significant bit first within each 32-bit word
+ * after bl2_uint4_cvt()) for the next entry whose bit is set.  When one
+ * is found, lpos and bl_lib_pos are set to that entry (with an fseek()
+ * on libf when not memory mapped) and the read is delegated to
+ * ncbl2_getliba() with the same arguments.
+ *
+ * returns whatever ncbl2_getliba() returns, or -1 when no entry up to
+ * max_oid is selected.
+ */
+/* **************************************************************** */
+
+int
+ncbl2_getliba_o(unsigned char *seq,
+                int maxs,
+                char *libstr,
+                int n_libstr,
+                fseek_t *libpos,
+                int *lcont,
+                struct lmf_str *m_fd,
+                long *l_off)
+{
+  int tpos;
+  unsigned int t_mask, t_shift, oid_mask;
+
+  /* get to the next valid pointer */
+
+  for (tpos = m_fd->lpos; tpos <= m_fd->max_oid; tpos++) {
+    t_mask = tpos / 32;
+    t_shift = 31 - (tpos % 32);
+    /* an all-zero word selects nothing; skip the byte swap */
+    if ((oid_mask = m_fd->oid_list[t_mask])==0) { continue; }
+
+    /* unsigned constant: t_shift can be 31, and shifting a signed 1
+       into the sign bit is undefined */
+    if (bl2_uint4_cvt(oid_mask) & (0x1u << t_shift)) {
+      if (!m_fd->mm_flg) fseek(m_fd->libf,m_fd->s_pos_arr[tpos],0);
+      m_fd->lpos = tpos;        /* already bumped up */
+      m_fd->bl_lib_pos = m_fd->s_pos_arr[tpos];
+      return ncbl2_getliba(seq, maxs, libstr, n_libstr,
+                           libpos, lcont, m_fd, l_off);
+    }
+  }
+  return -1;
+}
+
+/* **************************************************************** */
+/* read a protein sequence
+ *
+ * Reads entry m_fd->lpos (or, when *lcont != 0, the next piece of the
+ * entry started by an earlier call) into seq[], either from the
+ * memory-mapped image or from m_fd->libf, mapping residues through
+ * aa_btof[] unless aa_btof_null is set.
+ *
+ * seq      destination buffer; seq[return value] is set to EOSEQ
+ * maxs     capacity used to decide whether the entry "fits"
+ * libstr   set to "" unless DEBUG or PCOMPLIB is defined, in which
+ *          case an identifier is read from m_fd->hfile
+ * libpos   set to m_fd->lpos on entry to the call
+ * lcont    0 on return when the entry is complete, incremented when
+ *          more of the entry remains
+ * l_off    always set to 1
+ *
+ * returns the number of residues delivered, or -1 when a new entry is
+ * requested with lpos >= max_cnt or when fread() comes up short.
+ */
+/* **************************************************************** */
+
+int
+ncbl2_getliba(unsigned char *seq,
+ int maxs,
+ char *libstr,
+ int n_libstr,
+ fseek_t *libpos,
+ int *lcont,
+ struct lmf_str *m_fd,
+ long *l_off)
+{
+ unsigned char *sptr, *dptr;
+ int s_chunk, d_len, lib_cnt;
+ long seqcnt;
+ long tmp;
+ static long seq_len; /* static: the remaining length must survive between continuation calls */
+#if defined(DEBUG) || defined(PCOMPLIB)
+ int gi, my_db, taxid;
+ char acc[MAX_FADL_ACC_LEN], title[MAX_UID], name[MAX_FADL_ACC_LEN];
+#endif
+
+ *l_off = 1;
+
+ lib_cnt = m_fd->lpos;
+ *libpos = (fseek_t)m_fd->lpos;
+
+ if (*lcont==0) { /* start of a new entry */
+ if (lib_cnt >= m_fd->max_cnt) return -1; /* no more sequences */
+ seq_len = m_fd->s_pos_arr[lib_cnt+1] - m_fd->s_pos_arr[lib_cnt]; /* value is +1 off to get the NULL */
+ if (m_fd->mm_flg) m_fd->mmap_addr = m_fd->mmap_base+m_fd->s_pos_arr[lib_cnt];
+#if !defined(DEBUG) && !defined(PCOMPLIB)
+ libstr[0]='\0';
+#else
+ /* get the name from the header file */
+ fseek(m_fd->hfile,m_fd->d_pos_arr[lib_cnt],0);
+
+ if (m_fd->bl_format_ver == FORMATDBV3) {
+ d_len = min(n_libstr-1,m_fd->d_pos_arr[lib_cnt+1]-m_fd->d_pos_arr[lib_cnt]-1);
+ fread(libstr,(size_t)1,(size_t)d_len,m_fd->hfile);
+ libstr[d_len]='\0';
+ }
+ else {
+ d_len = min(m_fd->tmp_buf_max,m_fd->d_pos_arr[lib_cnt+1]-m_fd->d_pos_arr[lib_cnt]-1);
+ fread(m_fd->tmp_buf,(size_t)1,(size_t)d_len,m_fd->hfile);
+ parse_fastadl_asn((unsigned char *)m_fd->tmp_buf, (unsigned char *)m_fd->tmp_buf+d_len,
+ &gi, &my_db, acc, sizeof(acc), name, sizeof(name), title, sizeof(title), &taxid);
+ sprintf(m_fd->tmp_buf,"gi|%d",gi); /* tmp_buf is reused for the formatted identifier */
+ SAFE_STRNCPY(libstr,m_fd->tmp_buf,n_libstr);
+ }
+ libstr[n_libstr-1]='\0';
+#endif
+ }
+ if (seq_len <= maxs) { /* sequence fits */
+ seqcnt = seq_len;
+ m_fd->lpos++;
+ *lcont = 0;
+ }
+ else { /* doesn't fit */
+ seqcnt = maxs-1;
+ (*lcont)++;
+ }
+
+ if (m_fd->mm_flg) sptr = (unsigned char *)m_fd->mmap_addr;
+ else {
+ /* NOTE(review): seq_len bytes are requested here even in the "doesn't fit" case (seq_len > maxs) - confirm seq[] is large enough */
+ if ((tmp=fread(seq,(size_t)1,(size_t)seq_len,m_fd->libf))!=(size_t)seq_len) {
+ fprintf(stderr," could not read sequence record: %lld %ld != %ld\n",
+ *libpos,tmp,seq_len);
+ goto error;
+ }
+ sptr = seq;
+ }
+ if (seq_len <= maxs) {seqcnt = --seq_len;} /* drop the trailing NULL counted in seq_len */
+
+ /* everything is ready, set up dst. pointer, seq_len */
+ if (aa_b2toa[sptr[seq_len-1]]=='*') seq_len--; /* NOTE(review): only seq_len is reduced here; seqcnt (the value returned) is not - confirm intended */
+ if (aa_btof_null) {
+ memcpy(seq,sptr,seq_len); /* NOTE(review): in the stdio path sptr == seq, so source and destination are the same buffer */
+ }
+ else {
+ dptr = seq;
+ s_chunk = seqcnt/16; /* translate 16 residues per pass, remainder handled below */
+ while (s_chunk-- > 0) {
+ *dptr++ = aa_btof[*sptr++];
+ *dptr++ = aa_btof[*sptr++];
+ *dptr++ = aa_btof[*sptr++];
+ *dptr++ = aa_btof[*sptr++];
+ *dptr++ = aa_btof[*sptr++];
+ *dptr++ = aa_btof[*sptr++];
+ *dptr++ = aa_btof[*sptr++];
+ *dptr++ = aa_btof[*sptr++];
+ *dptr++ = aa_btof[*sptr++];
+ *dptr++ = aa_btof[*sptr++];
+ *dptr++ = aa_btof[*sptr++];
+ *dptr++ = aa_btof[*sptr++];
+ *dptr++ = aa_btof[*sptr++];
+ *dptr++ = aa_btof[*sptr++];
+ *dptr++ = aa_btof[*sptr++];
+ *dptr++ = aa_btof[*sptr++];
+ }
+ while (dptr < seq+seqcnt) *dptr++ = aa_btof[*sptr++];
+ }
+
+ if (m_fd->mm_flg) m_fd->mmap_addr = (char *)sptr; /* remember where a continuation should resume */
+
+ /* we didn't get it all, so reset for more */
+ if (*lcont) seq_len -= seqcnt;
+
+ seq[seqcnt]= EOSEQ;
+ return (seqcnt);
+
+error: fprintf(stderr," error reading %s at %lld\n",libstr,*libpos);
+ fflush(stderr);
+ return (-1);
+}
+
+/* ncbl2_get_mmap_chain_o fills cur_seqr_chain with sequence pointers
+   from the memory mapped file at *m_fd, based on oid coordinates (not
+   contiguous)
+
+   Starting at m_fd->lpos, each entry whose bit is set in m_fd->oid_list
+   fills the next seq_record/mseq_record pair: the record points
+   directly into the mapping (no copy), db->entries/db->length are
+   updated, and the entry number is stored as index/lseek.  At most
+   cur_seqr_chain->max_chain_seqs records are filled.
+
+   A slot is counted only when an entry was actually found for it, so
+   cur_seq_cnt never includes an unfilled record.
+
+   returns the number of records filled, or EOF when no entry remains
+   (or when the number filled is >= m_fd->max_cnt, as before).
+
+   requires NCBIstdaa core encoding
+*/
+int
+ncbl2_get_mmap_chain_o(struct seqr_chain *cur_seqr_chain,
+                       struct lmf_str *m_fd, struct db_str *db)
+{
+  int i;
+  int found;
+  struct seq_record *seq_a, *seq_p;
+  struct mseq_record *mseq_a, *mseq_p;
+
+  int tpos;
+  unsigned int t_mask, t_shift, oid_mask;
+
+  tpos = m_fd->lpos;
+  if (tpos > m_fd->max_oid) return EOF;
+  seq_a = cur_seqr_chain->seqr_base;
+  mseq_a = cur_seqr_chain->mseqr_base;
+
+  for (i=0; i < cur_seqr_chain->max_chain_seqs; i++) {
+    if (tpos > m_fd->max_oid) {
+      break;
+    }
+    seq_p = &seq_a[i];
+    mseq_p = &mseq_a[i];
+    found = 0;
+
+    /* now get the next valid pointer */
+    while (tpos <= m_fd->max_oid) {
+      t_mask = tpos / 32;
+      t_shift = 31 - (tpos % 32);
+      if ((oid_mask = m_fd->oid_list[t_mask])==0) { tpos++; continue; }
+
+      /* have a valid entry */
+      /* unsigned constant: t_shift can be 31, and shifting a signed 1
+         into the sign bit is undefined */
+      if (bl2_uint4_cvt(oid_mask) & (0x1u << t_shift)) {
+        m_fd->bl_lib_pos = m_fd->s_pos_arr[tpos];
+        seq_p->n1 = m_fd->s_pos_arr[tpos+1] - m_fd->s_pos_arr[tpos]-1; /* value is +1 off to get the NULL */
+        seq_p->aa1b = (unsigned char *)(m_fd->mmap_base + m_fd->s_pos_arr[tpos]);
+        seq_p->l_offset = 0;
+        seq_p->l_off = 1;
+
+        db->entries++;
+        db->length += seq_p->n1;
+        if (db->length > LONG_MAX) {
+          db->length -= LONG_MAX; db->carry++;
+        }
+
+        mseq_p->m_file_p = m_fd;
+        mseq_p->n1tot_p=NULL;
+        mseq_p->cont = 0;
+        seq_p->index = mseq_p->index = mseq_p->lseek = tpos++;
+#ifndef DEBUG
+        mseq_p->libstr[0] = '\0';
+#else
+#endif
+#if DEBUG
+        seq_p->adler32_crc = mseq_p->adler32_crc = adler32(1L,seq_p->aa1b,seq_p->n1);
+#endif
+        found = 1;
+        break;
+      }
+      else {
+        tpos++;
+      }
+    }
+    /* the scan ran off the end of the mask without filling slot i:
+       stop here so the slot is not counted */
+    if (!found) break;
+  }
+  if (i==0 && tpos > m_fd->max_oid) return EOF;
+  m_fd->lpos = tpos;
+  cur_seqr_chain->cur_seq_cnt = i;
+  /* NOTE(review): EOF is also returned when a full chain is at least as
+     large as the whole library (i >= max_cnt), even though cur_seq_cnt
+     has been set - kept as is, confirm against the caller */
+  if (i >= m_fd->max_cnt) return EOF;
+  else return i;
+}
+
+/* ncbl2_get_mmap_chain fills cur_seqr_chain with sequence pointers
+   from the memory mapped file at *m_fd
+
+   Entries are taken consecutively starting at m_fd->lpos; each record
+   points straight into the mapping, and db->entries/db->length are
+   updated per entry.  Returns the number of records filled (also
+   stored in cur_seq_cnt), or EOF when lpos is already at max_cnt.
+
+   because the database is opened read-only, this code only works with
+   an amino acid mapping identical to that used by blastdbcmd, aa_b2toa[]
+
+*/
+int
+ncbl2_get_mmap_chain(struct seqr_chain *cur_seqr_chain,
+                     struct lmf_str *m_fd, struct db_str *db) {
+  int n_filled, entry;
+  struct seq_record *seq_base, *sq;
+  struct mseq_record *mseq_base, *msq;
+
+  entry = m_fd->lpos;
+  if (entry >= m_fd->max_cnt) return EOF;
+
+  seq_base = cur_seqr_chain->seqr_base;
+  mseq_base = cur_seqr_chain->mseqr_base;
+
+  n_filled = 0;
+  while (n_filled < cur_seqr_chain->max_chain_seqs && entry < m_fd->max_cnt) {
+    sq = seq_base + n_filled;
+    msq = mseq_base + n_filled;
+
+    /* value is +1 off to get the NULL */
+    sq->n1 = m_fd->s_pos_arr[entry+1] - m_fd->s_pos_arr[entry]-1;
+
+    db->entries++;
+    db->length += sq->n1;
+    if (db->length > LONG_MAX) {
+      db->length -= LONG_MAX;
+      db->carry++;
+    }
+
+    msq->m_file_p = m_fd;
+    msq->n1tot_p = NULL;
+    msq->cont = 0;
+    sq->index = msq->index = msq->lseek = entry;
+#ifndef DEBUG
+    msq->libstr[0] = '\0';
+#else
+#endif
+    sq->aa1b = (unsigned char *)(m_fd->mmap_base + m_fd->s_pos_arr[entry]);
+    sq->l_offset = 0;
+    sq->l_off = 1;
+#if DEBUG
+    sq->adler32_crc = msq->adler32_crc = adler32(1L,sq->aa1b,sq->n1);
+#endif
+    entry++;
+    n_filled++;
+  }
+
+  m_fd->lpos = entry;
+  cur_seqr_chain->cur_seq_cnt = n_filled;
+  return n_filled;
+}
+
+/* **************************************************************** */
+/* read a DNA sequence using OID offsets
+ *
+ * Starting at m_fd->lpos, scans the OID bit mask (m_fd->oid_list, one
+ * bit per entry, most significant bit first within each 32-bit word
+ * after bl2_uint4_cvt()) for the next entry whose bit is set.  When one
+ * is found, lpos and bl_lib_pos are set to that entry (with an fseek()
+ * on libf when not memory mapped) and the read is delegated to
+ * ncbl2_getlibn() with the same arguments.
+ *
+ * returns whatever ncbl2_getlibn() returns, or -1 when no entry up to
+ * max_oid is selected.
+ */
+/* **************************************************************** */
+
+int
+ncbl2_getlibn_o(unsigned char *seq,
+                int maxs,
+                char *libstr,
+                int n_libstr,
+                fseek_t *libpos,
+                int *lcont,
+                struct lmf_str *m_fd,
+                long *l_off)
+{
+  int tpos;
+  unsigned int t_mask, t_shift, oid_mask;
+
+  /* get to the next valid pointer */
+
+  for (tpos = m_fd->lpos; tpos <= m_fd->max_oid; tpos++) {
+    t_mask = tpos / 32;
+    t_shift = 31 - (tpos % 32);
+    /* an all-zero word selects nothing; skip the byte swap */
+    if ((oid_mask = m_fd->oid_list[t_mask])==0) { continue; }
+
+    /* unsigned constant: t_shift can be 31, and shifting a signed 1
+       into the sign bit is undefined */
+    if (bl2_uint4_cvt(oid_mask) & (0x1u << t_shift)) {
+      if (!m_fd->mm_flg) fseek(m_fd->libf,m_fd->s_pos_arr[tpos],0);
+      m_fd->lpos = tpos;        /* already bumped up */
+      m_fd->bl_lib_pos = m_fd->s_pos_arr[tpos];
+      return ncbl2_getlibn(seq, maxs, libstr, n_libstr,
+                           libpos, lcont, m_fd, l_off);
+    }
+  }
+  return -1;
+}
+
+static char tmp_amb[4096]; /* NOTE(review): no reference to tmp_amb is visible in this part of the file - confirm it is still needed */
+
+/* **************************************************************** */
+/* read a DNA sequence
+ *
+ * Reads entry m_fd->lpos (or, when *lcont != 0, the next chunk of the
+ * entry begun by an earlier call) from the 2-bit packed sequence data,
+ * expands each packed byte into four residue codes (1..4) in seq[],
+ * then overlays the entry's ambiguity table, if it has one, using
+ * nascii[] codes for "-ACMGRSVTWYHKDBN".
+ *
+ * seq      destination; terminated with EOSEQ at the returned length
+ * maxs     capacity of seq[] used to size each chunk
+ * libstr   set to "" unless DEBUG or PCOMPLIB is defined, in which
+ *          case an identifier is read from m_fd->hfile
+ * libpos   set to the entry number (m_fd->lpos on entry to the call)
+ * lcont    0 on return when the entry is complete, incremented when
+ *          more chunks remain
+ * l_off    always set to 1
+ *
+ * returns the number of residues placed in seq[], or -1 when a new
+ * entry is requested with lpos >= max_cnt or a sequence read fails.
+ *
+ * If the ambiguity table cannot be allocated or read completely, the
+ * problem is reported and the table is skipped; the unambiguous
+ * residues already decoded are still returned.
+ */
+/* **************************************************************** */
+
+int
+ncbl2_getlibn(unsigned char *seq,
+              int maxs,
+              char *libstr,
+              int n_libstr,
+              fseek_t *libpos,
+              int *lcont,
+              struct lmf_str *m_fd,
+              long *l_off)
+{
+  unsigned char *sptr, *tptr, stmp;
+  long seqcnt;
+  int s_chunk, lib_cnt;
+  size_t tmp;
+  /* static: chunk bookkeeping must survive between continuation calls */
+  static long seq_len;
+  static int c_len,c_pad;
+  int c_len_set, d_len;
+#if defined(DEBUG) || defined(PCOMPLIB)
+  int gi, my_db, taxid;
+  char acc[MAX_FADL_ACC_LEN], title[MAX_UID], name[MAX_FADL_ACC_LEN];
+#endif
+
+  /* ambiguity code adapted from NCBI Blast sources by:
+
+     Ralf Jost, Dipl.-Inform.
+     Director, Technical Bioinformatics
+     Biomax Informatics AG
+     ralf.jost at biomax.com
+  */
+
+  /* first residue position (within the entry) covered by this chunk */
+  int amb_lower = *lcont * maxs;
+  /* int amb_upper = (*lcont + 1) * maxs - 1; */ /* not used */
+
+  int index;
+
+  unsigned int amb_cnt = 0;
+  unsigned int large_amb = 0;
+
+  unsigned int end;
+  unsigned int amb_start;
+
+  const char str_4bit[] = "-ACMGRSVTWYHKDBN";
+
+  unsigned int *amb_ptr = NULL;
+  int amb_ok;
+  long filepos = 0;
+  char *mmap_pos = NULL;
+
+  unsigned int i;
+  unsigned char res;
+  int row_len;
+  int j, position = 0;
+
+  *l_off = 1;
+
+  lib_cnt = m_fd->lpos;
+  *libpos = (fseek_t)lib_cnt;
+  if (*lcont==0) {      /* not a continuation of previous */
+    if (lib_cnt >= m_fd->max_cnt) return (-1);
+    /* packed bytes for this entry: sequence start up to the ambiguity table */
+    c_len = m_fd->a_pos_arr[lib_cnt]- m_fd->s_pos_arr[lib_cnt];
+    if (!m_fd->mm_flg) {
+      /* fseek over amb_ray */
+      fseek(m_fd->libf,m_fd->s_pos_arr[lib_cnt],0);
+      m_fd->bl_lib_pos = m_fd->s_pos_arr[lib_cnt];
+    }
+    else m_fd->mmap_addr = m_fd->mmap_base + m_fd->s_pos_arr[lib_cnt];
+#if !defined(DEBUG) && !defined(PCOMPLIB)
+    libstr[0]='\0';
+#else
+    /* get the name from the header file */
+    fseek(m_fd->hfile,m_fd->d_pos_arr[lib_cnt],0);
+
+    if (m_fd->bl_format_ver == FORMATDBV3) {
+      d_len = min(n_libstr-1,m_fd->d_pos_arr[lib_cnt+1]-m_fd->d_pos_arr[lib_cnt]-1);
+      fread(libstr,(size_t)1,(size_t)d_len,m_fd->hfile);
+    }
+    else {
+      d_len = min(m_fd->tmp_buf_max,m_fd->d_pos_arr[lib_cnt+1]-m_fd->d_pos_arr[lib_cnt]-1);
+      fread(m_fd->tmp_buf,(size_t)1,(size_t)d_len,m_fd->hfile);
+      parse_fastadl_asn((unsigned char *)m_fd->tmp_buf, (unsigned char *)m_fd->tmp_buf+d_len,
+                        &gi, &my_db, acc, sizeof(acc), name, sizeof(name), title, sizeof(title), &taxid);
+      sprintf(m_fd->tmp_buf,"gi|%d",gi);
+      SAFE_STRNCPY(libstr,m_fd->tmp_buf,n_libstr);
+    }
+    libstr[n_libstr-1]='\0';
+#endif
+  }     /* end of *lcont==0 */
+
+  /* To avoid the situation where c_len <= 1; we must anticipate what
+     c_len will be after this pass.  If it will be <= 64, back off this
+     time so next time it will be > 64 */
+
+  seq_len = c_len*4;
+
+  if ((seq_len+4 > maxs) && (seq_len+4 - maxs <= 256)) {
+    /* we won't be done but we will have less than 256 to go */
+    c_len -= 64; seq_len -= 256; c_len_set = 1; maxs -= 256;}
+  else c_len_set = 0;
+
+  /*
+  fprintf(stderr," lib_cnt: %d %d %d %d\n",lib_cnt,c_len,seq_len,maxs);
+  */
+
+  /* does the rest of the sequence fit? */
+  if (seq_len <= maxs-4 && !c_len_set) {
+    seqcnt = c_len;
+    if (!m_fd->mm_flg) {
+      if ((tmp=fread(seq,(size_t)1,(size_t)seqcnt,m_fd->libf))!=(size_t)seqcnt) {
+        fprintf(stderr,
+                " could not read sequence record: %s %lld %ld != %ld: %d\n",
+                libstr,*libpos,tmp,seqcnt,*seq);
+        goto error;
+      }
+      m_fd->bl_lib_pos += tmp;
+      sptr = seq + seqcnt;
+    }
+    else sptr = (unsigned char *)(m_fd->mmap_addr+seqcnt);
+
+    *lcont = 0;         /* this is the last chunk */
+    // lib_cnt++; /* increment to the next sequence */
+    /* the last byte is either '0' (no remainder) or the last 1-3 chars and the remainder */
+    c_pad = *(sptr-1);
+    c_pad &= 0x3;       /* get the last (low) 2 bits */
+    seq_len -= (4 - c_pad);     /* if the last 2 bits are 0, its a NULL byte */
+  }
+  else {        /* get the next chunk, but more to come */
+    seqcnt = ((maxs+3)/4)-1;
+    if (!m_fd->mm_flg) {
+      if ((tmp=fread(seq,(size_t)1,(size_t)(seqcnt),m_fd->libf))!=(size_t)(seqcnt)) {
+        fprintf(stderr," could not read sequence record: %lld %ld/%ld\n",
+                *libpos,tmp,seqcnt);
+        goto error;
+      }
+      m_fd->bl_lib_pos += tmp;
+      sptr = seq + seqcnt;
+    }
+    else {
+      sptr = (unsigned char *)(m_fd->mmap_addr+seqcnt);
+      m_fd->mmap_addr += seqcnt;
+    }
+    seq_len = 4*seqcnt;
+    c_len -= seqcnt;
+/*  if (c_len_set) {c_len += 64; maxs += 256;} */
+    (*lcont)++;
+/*  hopefully we don't need this because of c_len -= 64. */
+/*
+    if (c_len == 1) {
+#if !defined (USE_MMAP)
+      c_pad = fgetc(m_fd->libf);
+      *sptr=c_pad;
+#else
+      c_pad = *m_fd->mmap_addr++;
+      sptr = m_fd->mmap_addr;
+#endif
+      c_pad &= 0x3;
+      seq_len += c_pad;
+      seqcnt++;
+      lib_cnt++;
+      *lcont = 0;
+    }
+*/
+  }
+
+  /* point to the last packed byte and to the end of the array
+     seqcnt is the exact number of bytes read
+     tptr points to the destination, use multiple of 4 to simplify math
+     sptr points to the source, note that the last byte will be read 4 cycles
+     before it is written
+  */
+
+  tptr = seq + 4*seqcnt;
+  s_chunk = seqcnt/8;   /* unrolled: 8 packed bytes (32 residues) per pass */
+  while (s_chunk-- > 0) {
+    stmp = *--sptr;
+    *--tptr = (stmp&3) +1;
+    *--tptr = ((stmp >>= 2)&3)+1;
+    *--tptr = ((stmp >>= 2)&3)+1;
+    *--tptr = ((stmp >>= 2)&3)+1;
+    stmp = *--sptr;
+    *--tptr = (stmp&3) +1;
+    *--tptr = ((stmp >>= 2)&3)+1;
+    *--tptr = ((stmp >>= 2)&3)+1;
+    *--tptr = ((stmp >>= 2)&3)+1;
+    stmp = *--sptr;
+    *--tptr = (stmp&3) +1;
+    *--tptr = ((stmp >>= 2)&3)+1;
+    *--tptr = ((stmp >>= 2)&3)+1;
+    *--tptr = ((stmp >>= 2)&3)+1;
+    stmp = *--sptr;
+    *--tptr = (stmp&3) +1;
+    *--tptr = ((stmp >>= 2)&3)+1;
+    *--tptr = ((stmp >>= 2)&3)+1;
+    *--tptr = ((stmp >>= 2)&3)+1;
+    stmp = *--sptr;
+    *--tptr = (stmp&3) +1;
+    *--tptr = ((stmp >>= 2)&3)+1;
+    *--tptr = ((stmp >>= 2)&3)+1;
+    *--tptr = ((stmp >>= 2)&3)+1;
+    stmp = *--sptr;
+    *--tptr = (stmp&3) +1;
+    *--tptr = ((stmp >>= 2)&3)+1;
+    *--tptr = ((stmp >>= 2)&3)+1;
+    *--tptr = ((stmp >>= 2)&3)+1;
+    stmp = *--sptr;
+    *--tptr = (stmp&3) +1;
+    *--tptr = ((stmp >>= 2)&3)+1;
+    *--tptr = ((stmp >>= 2)&3)+1;
+    *--tptr = ((stmp >>= 2)&3)+1;
+    stmp = *--sptr;
+    *--tptr = (stmp&3) +1;
+    *--tptr = ((stmp >>= 2)&3)+1;
+    *--tptr = ((stmp >>= 2)&3)+1;
+    *--tptr = ((stmp >>= 2)&3)+1;
+  }
+  while (tptr>seq) {
+    stmp = *--sptr;
+    *--tptr = (stmp&3) +1;
+    *--tptr = ((stmp >>= 2)&3)+1;
+    *--tptr = ((stmp >>= 2)&3)+1;
+    *--tptr = ((stmp >>= 2)&3)+1;
+  }
+
+  /*
+   * Ambiguity-Decoding code from
+
+   Ralf Jost, Dipl.-Inform.
+   Director, Technical Bioinformatics
+   Biomax Informatics AG
+   ralf.jost at biomax.com
+
+   using code from NCBI Blast distribution
+  */
+
+  amb_start = m_fd->a_pos_arr[lib_cnt];
+  end = m_fd->s_pos_arr[lib_cnt + 1];
+
+  /* the ambiguity table, if any, sits between the packed sequence and
+     the start of the next entry */
+  if (amb_start != end) {
+    if (!m_fd->mm_flg) {
+      filepos = ftell(m_fd->libf);
+      /* find the size of the ambiguity table */
+      if (fseek(m_fd->libf, amb_start, SEEK_SET) != 0) {
+        fprintf(stderr, "*** error [%s:%d] *** -- Seek amb start 0x%08x error %d\n",
+                __FILE__, __LINE__, amb_start, ferror(m_fd->libf));
+      }
+      if (fread(&amb_cnt, sizeof(unsigned int), 1, m_fd->libf) != 1) {
+        fprintf(stderr, "*** error [%s:%d] *** -- Read amb count error %d\n",
+                __FILE__, __LINE__, ferror(m_fd->libf));
+      }
+    } else {
+      mmap_pos = m_fd->mmap_addr;
+      m_fd->mmap_addr = m_fd->mmap_base + amb_start;
+      /* NOTE(review): the mmap error messages call ferror(m_fd->libf),
+         but libf is not opened by the visible open code when the file
+         is mapped - confirm it cannot be NULL here */
+      if (readMFILE((void *)&amb_cnt, sizeof(unsigned int), 1, m_fd) != 1) {
+        fprintf(stderr, "*** error [%s:%d] *** -- Read amb count error %d\n",
+                __FILE__, __LINE__, ferror(m_fd->libf));
+      }
+    }
+
+    amb_cnt = bl2_uint4_cvt(amb_cnt);
+
+    /* if the most significant bit is set on the count, then each
+     * correction will take two entries in the table.  the layout
+     * is described below.
+     */
+    large_amb = amb_cnt >> 31;
+    amb_cnt = amb_cnt & 0x7fffffff;
+
+    /* allocate enough space for the ambiguity table */
+    amb_ptr = (unsigned int *) malloc(amb_cnt * sizeof(unsigned int));
+    if (amb_ptr == NULL) {
+      /* report and skip the table rather than reading through NULL */
+      fprintf(stderr, "*** error [%s:%d] malloc amb table error size %ld\n",
+              __FILE__, __LINE__, (long)(amb_cnt * sizeof(unsigned int)));
+    }
+    else {
+      amb_ok = 1;
+
+      /* read the table */
+      if (!m_fd->mm_flg) {
+        if (fread((unsigned char *) amb_ptr, sizeof(unsigned int), amb_cnt, m_fd->libf)
+            != amb_cnt) {
+          fprintf(stderr, "*** error [%s:%d] *** -- Read amb table %d error %d\n",
+                  __FILE__, __LINE__, amb_cnt, ferror(m_fd->libf));
+          amb_ok = 0;
+        }
+      } else {
+        if (readMFILE((void *) amb_ptr, sizeof(unsigned int), amb_cnt, m_fd)
+            != amb_cnt) {
+          fprintf(stderr, "*** error [%s:%d] *** -- Read amb table %d error %d\n",
+                  __FILE__, __LINE__, amb_cnt, ferror(m_fd->libf));
+          amb_ok = 0;
+        }
+      }
+
+      /* a partially read table holds uninitialized words: do not apply it */
+      if (amb_ok) {
+        for (i = 0; i < amb_cnt; i++) {
+          amb_ptr[i] = bl2_uint4_cvt(amb_ptr[i]);
+        }
+
+        for (i = 0; i < amb_cnt; i++) {
+
+          if (large_amb) {
+            /* two words per correction; stop on an incomplete pair */
+            if (i + 1 >= amb_cnt) break;
+            res = (unsigned char) (amb_ptr[i] >> 28);
+            row_len = (int) (amb_ptr[i] >> 16) & 0xFFF;
+            position = amb_ptr[i + 1];
+          } else {
+            res = (unsigned char) (amb_ptr[i] >> 28);
+            row_len = (int) ((amb_ptr[i] >> 24) & 0xF);
+            position = amb_ptr[i] & 0xFFFFFF;
+          }
+          /* a run covers row_len+1 residues; only positions inside the
+             chunk currently in seq[] are patched */
+          for (index = position, j = 0; j <= row_len; j++) {
+            if ((index + j >= amb_lower) && (index + j < amb_lower + 4 * seqcnt))
+              seq[index + j - amb_lower] = nascii[str_4bit[res]];
+          }
+
+          if (large_amb)
+            i++;
+        }
+      }
+
+      free(amb_ptr);
+    }
+
+    /* put the read position back where sequence reading left it */
+    if (!m_fd->mm_flg) {
+      fseek(m_fd->libf, filepos, SEEK_SET);
+    } else {
+      m_fd->mmap_addr = mmap_pos;
+    }
+  }
+
+  /*
+   * End of ambiguity-decoding.
+   */
+
+  if ( *lcont == 0)
+    lib_cnt++;
+
+  /*
+  for (sptr=seq; sptr < seq+seq_len; sptr++) {
+    printf("%c",nt[*sptr]);
+    if ((int)(sptr-seq) % 60 == 59) printf("\n");
+  }
+  printf("\n");
+  */
+
+  m_fd->lpos = lib_cnt;
+  if (seqcnt*4 >= seq_len) {    /* there was enough room */
+    seq[seq_len]= EOSEQ;
+    /* printf("%d\n",seq_len); */
+    return seq_len;
+  }
+  else {                        /* not enough room */
+    seq[seqcnt*4]=EOSEQ;
+    seq_len -= 4*seqcnt;
+    return (4*seqcnt);
+  }
+
+error:  fprintf(stderr," error reading %s at %lld\n",libstr,*libpos);
+  fflush(stderr);
+  return (-1);
+}
+
+ /* db_type_arr[] - short database tags, indexed by the integer my_db
+  * value that ncbl2_ranlib() receives from parse_fastadl_asn().  The
+  * pref_db values chosen when a .msk list is opened use the same
+  * numbering (7 for "swissprot", 14 for "pdbaa").
+  *
+  * index:  0 lcl   1 gib   2 gim   3 gii   4 gb    5 emb   6 pir   7 sp
+  *         8 pat   9 ref  10 gnl  11 gi   12 dbj  13 prf  14 pdb  15 tpg
+  *        16 tpe  17 tpd
+  */
+static char
+*db_type_arr[] = {"lcl","gib","gim","gii","gb","emb","pir","sp",
+ "pat","ref","gnl","gi","dbj","prf","pdb","tpg",
+ "tpe","tpd"};
+
+/* **************************************************************** */
+/* read a description, position for sequence
+ *
+ * str     receives the description for entry libpos, always
+ *         NUL-terminated within cnt bytes; '\001' separators between
+ *         merged descriptions are replaced by spaces
+ * cnt     size of str[]; expected to be > 0
+ * libpos  entry number (index into d_pos_arr/s_pos_arr)
+ * libstr  not used by this routine
+ * m_fd    open library; on return lpos and bl_lib_pos (and the libf
+ *         file position when not memory mapped) refer to the entry's
+ *         sequence
+ *
+ * For FORMATDBV3 libraries the header text is copied as stored.
+ * Otherwise the header is parsed with parse_fastadl_asn(), one
+ * definition at a time: when m_fd->pref_db < 0, or a definition has
+ * my_db == 0, definitions are concatenated; when a definition matches
+ * pref_db it is used alone.  If nothing was selected, the last parsed
+ * identifier and title are used.
+ */
+/* **************************************************************** */
+void
+ncbl2_ranlib(char *str,
+             int cnt,
+             fseek_t libpos,
+             char *libstr,
+             struct lmf_str *m_fd)
+{
+  int llen, lib_cnt;
+  size_t n_read, d_used;
+  char *bp;
+  unsigned char *my_buff=NULL;
+  char descr[2048];
+  unsigned char *abp;
+  int gi, taxid;
+  int my_db;
+  const char *db_name;
+  char acc[MAX_FADL_ACC_LEN], name[MAX_FADL_ACC_LEN];
+  char title[2048];
+  int have_my_buff=0;
+  int have_descr = 0;
+  const int n_db_types = (int)(sizeof(db_type_arr)/sizeof(db_type_arr[0]));
+
+  lib_cnt = (int)libpos;
+  llen = m_fd->d_pos_arr[lib_cnt+1]-m_fd->d_pos_arr[lib_cnt];
+
+  fseek(m_fd->hfile,m_fd->d_pos_arr[libpos],0);
+
+  if (m_fd->bl_format_ver == FORMATDBV3) {
+    if (llen >= cnt) llen = cnt-1;
+    if (llen < 0) llen = 0;
+    n_read = fread(str,(size_t)1,(size_t)(llen),m_fd->hfile);
+    /* fread() does not terminate the text; the strchr() scan below needs it */
+    str[n_read] = '\0';
+  }
+  else {
+    if (llen < 0) llen = 0;
+    if (llen >= m_fd->tmp_buf_max) {
+      if ((my_buff=(unsigned char *)calloc(llen,sizeof(char)))==NULL) {
+        /* fall back to the fixed scratch buffer and a truncated header */
+        fprintf(stderr," cannot allocate ASN.1 buffer: %d\n",llen);
+        my_buff = (unsigned char *)m_fd->tmp_buf;
+        llen = m_fd->tmp_buf_max;
+      }
+      else have_my_buff = 1;
+    }
+    else {
+      my_buff = (unsigned char *)m_fd->tmp_buf;
+    }
+    abp = my_buff;
+    fread(my_buff,(size_t)1,llen,m_fd->hfile);
+
+    do {
+      abp = parse_fastadl_asn(abp, my_buff+llen,
+                              &gi, &my_db, acc, sizeof(acc), name, sizeof(name),
+                              title, sizeof(title), &taxid);
+
+      if (gi > 0) {
+        /* my_db comes from the file; never index db_type_arr[] with an
+           out-of-range value */
+        db_name = (my_db >= 0 && my_db < n_db_types) ? db_type_arr[my_db] : "unk";
+        snprintf(descr,sizeof(descr),"gi|%d|%s|%s|%s ",gi,db_name,acc,name);
+      }
+      else {
+        if (acc[0] != '\0') snprintf(descr,sizeof(descr),"%s ",acc);
+        else descr[0] = '\0';
+        if (name[0] != '\0' && strcmp(name,"BL_ORD_ID")!=0) {
+          d_used = strlen(descr);
+          snprintf(descr+d_used,sizeof(descr)-d_used,"%s ", name);
+        }
+      }
+      if (my_db == 0 || m_fd->pref_db < 0) {
+        if (!have_descr) {
+          SAFE_STRNCPY(str,descr,cnt);
+          have_descr = 1;
+        }
+        else {
+          SAFE_STRNCAT(str,"\001",cnt);
+          SAFE_STRNCAT(str,descr,cnt);
+        }
+        SAFE_STRNCAT(str,title,cnt);
+        if ((int)strlen(str) >= cnt-1) break;   /* str[] is full */
+      }
+      else if (m_fd->pref_db == my_db) {
+        have_descr = 1;
+        SAFE_STRNCPY(str,descr,cnt);
+        SAFE_STRNCAT(str,title,cnt);
+        break;
+      }
+    } while (abp);
+
+    /* nothing matched the preferred database: use the last definition
+       (identifier followed by its title) */
+    if (!have_descr) {
+      SAFE_STRNCPY(str,descr,cnt);
+      SAFE_STRNCAT(str,title,cnt);
+    }
+
+    if (have_my_buff) free(my_buff);
+  }
+
+  str[cnt-1]='\0';
+
+  bp = str;
+  while((bp=strchr(bp,'\001'))!=NULL) {*bp++=' ';}
+
+  if (!m_fd->mm_flg) fseek(m_fd->libf,m_fd->s_pos_arr[libpos],0);
+
+  m_fd->lpos = lib_cnt;
+  m_fd->bl_lib_pos = m_fd->s_pos_arr[lib_cnt];
+}
+
+/* bl2_uint4_cvt() - reverse the byte order of the low 32 bits of val.
+ * When IS_BIG_ENDIAN is defined the value is returned unchanged. */
+unsigned int bl2_uint4_cvt(unsigned int val)
+{
+#ifdef IS_BIG_ENDIAN
+  return val;
+#else /* it better be LITTLE_ENDIAN */
+  unsigned int b0 = val & 255;          /* least significant byte */
+  unsigned int b1 = (val >> 8) & 255;
+  unsigned int b2 = (val >> 16) & 255;
+  unsigned int b3 = (val >> 24) & 255;
+
+  return (((b0 << 8) | b1) << 16) | (b2 << 8) | b3;
+#endif
+}
+
+/* bl2_long4_cvt() - reverse the byte order of the low four bytes of
+ * val and return them as an unsigned int.  When IS_BIG_ENDIAN is
+ * defined, val is narrowed to an int and returned as is. */
+unsigned int bl2_long4_cvt(long val)
+{
+#ifdef IS_BIG_ENDIAN
+  int narrowed;
+
+  narrowed = val;
+  return narrowed;
+#else /* it better be LITTLE_ENDIAN */
+  unsigned int b0 = (unsigned int)(val & 255);          /* least significant byte */
+  unsigned int b1 = (unsigned int)((val >> 8) & 255);
+  unsigned int b2 = (unsigned int)((val >> 16) & 255);
+  unsigned int b3 = (unsigned int)((val >> 24) & 255);
+
+  return (((b0 << 8) | b1) << 16) | (b2 << 8) | b3;
+#endif
+}
+
+/* bl2_long8_cvt() - reverse the byte order of a 64-bit value.
+ * When IS_BIG_ENDIAN is defined the value is returned unchanged.
+ *
+ * The argument and result are uint64_t, so all eight bytes can be
+ * swapped regardless of the width of "long"; the conversion therefore
+ * no longer depends on BIG_LIB64 and no longer exits without it. */
+uint64_t bl2_long8_cvt(uint64_t val)
+{
+#ifdef IS_BIG_ENDIAN
+  return val;
+#else /* it better be LITTLE_ENDIAN */
+  uint64_t res = 0;
+  int i;
+
+  /* move bytes out of val low-end first, into res from the low end,
+     so the lowest byte of val ends up highest in res */
+  for (i = 0; i < 8; i++) {
+    res = (res << 8) | (val & 255);
+    val >>= 8;
+  }
+  return res;
+#endif
+}
+
+/* src_int4_read() - read a 4-byte big-endian integer from fd into *val.
+ * The bytes are combined in unsigned arithmetic (a first byte >= 0x80
+ * would overflow a signed shift) and then converted to int.  If fewer
+ * than four bytes can be read, *val is set to 0. */
+void src_int4_read(FILE *fd, int *val)
+{
+#ifdef IS_BIG_ENDIAN
+  if (fread((char *)val,(size_t)4,(size_t)1,fd) != 1) *val = 0;
+#else
+  unsigned char b[4];
+  unsigned int u;
+
+  if (fread((char *)&b[0],(size_t)1,(size_t)4,fd) != 4) {
+    *val = 0;
+    return;
+  }
+  u = ((unsigned int)b[0] << 24) | ((unsigned int)b[1] << 16)
+    | ((unsigned int)b[2] << 8) | (unsigned int)b[3];
+  *val = (int)u;
+#endif
+}
+
+/* src_long4_read() - read a 4-byte big-endian signed integer from fd
+ * and store it, sign-extended, in *valp.  The bytes are combined in
+ * unsigned arithmetic (a first byte >= 0x80 would overflow a signed
+ * shift).  If fewer than four bytes can be read, *valp is set to 0. */
+void src_long4_read(FILE *fd, long *valp)
+{
+  int val4;
+#ifdef IS_BIG_ENDIAN
+  if (fread(&val4,(size_t)4,(size_t)1,fd) != 1) val4 = 0;
+  *valp = val4;
+#else
+  unsigned char b[4];
+  unsigned int u;
+
+  if (fread((char *)&b[0],(size_t)1,(size_t)4,fd) != 4) {
+    *valp = 0;
+    return;
+  }
+  u = ((unsigned int)b[0] << 24) | ((unsigned int)b[1] << 16)
+    | ((unsigned int)b[2] << 8) | (unsigned int)b[3];
+  val4 = (int)u;
+  *valp = val4;
+#endif
+}
+
+/* src_uint4_read() - read a 4-byte big-endian unsigned integer from fd
+ * into *valp.  The bytes are combined in unsigned arithmetic (a first
+ * byte >= 0x80 would overflow a signed shift).  If fewer than four
+ * bytes can be read, *valp is set to 0. */
+void src_uint4_read(FILE *fd, unsigned int *valp)
+{
+#ifdef IS_BIG_ENDIAN
+  if (fread(valp,(size_t)4,(size_t)1,fd) != 1) *valp = 0;
+#else
+  unsigned char b[4];
+
+  if (fread((char *)&b[0],(size_t)1,(size_t)4,fd) != 4) {
+    *valp = 0;
+    return;
+  }
+  *valp = ((unsigned int)b[0] << 24) | ((unsigned int)b[1] << 16)
+        | ((unsigned int)b[2] << 8) | (unsigned int)b[3];
+#endif
+}
+
+/* src_long8_read() - read an 8-byte big-endian signed integer from fd
+ * into *val.  The bytes are accumulated in a uint64_t, which works for
+ * both 32-bit and 64-bit native ints and avoids shifting a byte into
+ * the sign bit of an int.  If fewer than eight bytes can be read,
+ * *val is set to 0. */
+void src_long8_read(FILE *fd, int64_t *val)
+{
+#ifdef IS_BIG_ENDIAN
+  if (fread((void *)val,(size_t)8,(size_t)1,fd) != 1) *val = 0;
+#else
+  unsigned char b[8];
+  uint64_t u = 0;
+  int i;
+
+  if (fread((char *)&b[0],(size_t)1,(size_t)8,fd) != 8) {
+    *val = 0;
+    return;
+  }
+
+  /* most significant byte first */
+  for (i = 0; i < 8; i++) {
+    u = (u << 8) | b[i];
+  }
+  *val = (int64_t)u;
+#endif
+}
+
+/* ncbi_long8_read() - read an 8-byte little-endian integer from fd
+ * into *val (byte 0 is least significant), independent of host byte
+ * order.  The bytes are accumulated in a uint64_t: combining them in
+ * int arithmetic loses the upper four bytes.  If fewer than eight
+ * bytes can be read, *val is set to 0. */
+void ncbi_long8_read(FILE *fd, int64_t *val)
+{
+  unsigned char b[8];
+  uint64_t u = 0;
+  int i;
+
+  if (fread((char *)&b[0],(size_t)1,(size_t)8,fd) != 8) {
+    *val = 0;
+    return;
+  }
+
+  /* start from the most significant byte, b[7] */
+  for (i = 7; i >= 0; i--) {
+    u = (u << 8) | b[i];
+  }
+  *val = (int64_t)u;
+}
+
+/* src_char_read() - read one byte from fd into *val.
+ * The fread() result is not checked or returned. */
+void src_char_read(FILE *fd, char *val)
+{
+ fread(val,(size_t)1,(size_t)1,fd);
+}
+
+/* src_fstr_read() - read one item of slen bytes from fd into val.
+ * No terminating NUL is added and the fread() result is not checked;
+ * val must have room for slen bytes. */
+void src_fstr_read(FILE *fd, char *val, int slen)
+{
+ fread(val,(size_t)slen,(size_t)1,fd);
+}
+
+/* newname() - build the file name "<oname>.<suff>" in nname.
+ *
+ *   nname - destination buffer
+ *   oname - base name, copied first
+ *   suff  - extension appended after a "."
+ *   maxn  - bound passed to each SAFE_STRN* call
+ *
+ * NOTE(review): SAFE_STRNCPY/SAFE_STRNCAT are defined elsewhere;
+ * presumably they keep nname NUL-terminated within maxn bytes - confirm.
+ */
+void
+newname(char *nname, char *oname, char *suff, int maxn)
+{
+ SAFE_STRNCPY(nname,oname,maxn);
+ SAFE_STRNCAT(nname,".",maxn);
+ SAFE_STRNCAT(nname,suff,maxn);
+}
+
+/* ASN.1 identifier bytes tested by the get_asn_*() parsers below */
+#define ASN_SEQ 0x30
+#define ASN_IS_BOOL 1
+#define ASN_IS_INT 2
+#define ASN_IS_STR 26
+/* low five bits of an identifier byte; used to derive db_type */
+#define ASN_TYPE_MASK 31
+
+/* get_asn_int() - decode an integer element at abp.
+ *
+ * Expects the ASN_IS_INT tag byte, a one-byte length, and that many
+ * value bytes (most significant first), then steps over two trailing
+ * bytes.  If the tag is not ASN_IS_INT an error is printed, *val is
+ * set to 0 and only the tag byte is consumed.
+ *
+ * Returns the position following whatever was consumed.
+ */
+unsigned char *
+get_asn_int(unsigned char *abp, int *val) {
+
+  int n_bytes, i;
+  int result = 0;
+  unsigned char tag = *abp++;
+
+  if (tag == ASN_IS_INT) {
+    n_bytes = *abp++;
+    for (i = 0; i < n_bytes; i++) {
+      result = result * 256 + *abp++;
+    }
+    abp += 2;	/* skip over null's */
+  }
+  else {
+    fprintf(stderr,"*** error [%s:%d] -- int missing\n",__FILE__, __LINE__);
+  }
+
+  *val = result;
+  return abp;
+}
+
+/* get_asn_text() - decode a string element at abp into text.
+ *
+ *   abp   - points at the ASN_IS_STR tag byte
+ *   text  - destination; always NUL-terminated (set to "" first)
+ *   t_len - size of text; at most t_len-1 characters are copied
+ *
+ * A length byte greater than 128 means its low seven bits give the
+ * number of following bytes that hold the real length.  The returned
+ * pointer is past the full string (even when the copy was truncated)
+ * plus two trailing bytes.  If the tag is missing an error is printed
+ * and only the tag byte is consumed.
+ */
+unsigned char *
+get_asn_text(unsigned char *abp, char *text, int t_len) {
+  int len_byte, at_len, n_copy;
+
+  text[0] = '\0';
+  if (*abp++ != ASN_IS_STR) {	/* check for str */
+    fprintf(stderr,"*** error [%s:%d] - str missing\n",__FILE__,__LINE__);
+    return abp;
+  }
+
+  len_byte = *abp++;
+  if (len_byte > 128) {		/* string length is in next bytes */
+    at_len = 0;
+    for (len_byte &= 0x7f; len_byte > 0; len_byte--) {
+      at_len = (at_len << 8) + *abp++;
+    }
+  }
+  else {
+    at_len = len_byte;
+  }
+
+  /* copy what fits, leaving room for the terminator */
+  n_copy = (at_len < t_len-1) ? at_len : t_len-1;
+  memcpy(text, abp, n_copy);
+  text[n_copy] = '\0';
+
+  return abp + at_len + 2;
+}
+
+/* get_asn_junk() - step over elements we are not interested in.
+ *
+ * Walks forward until a zero byte: SEQ headers are counted and skipped
+ * (2 bytes), BOOL/INT elements go through get_asn_int(), strings
+ * through get_asn_text(), and anything else is skipped 2 bytes at a
+ * time.  Finally 2 bytes are skipped for every SEQ header seen.
+ * The decoded values are discarded.
+ */
+unsigned char *
+get_asn_junk(unsigned char *abp) {
+
+  int n_seq = 0;
+  int ignored_int;
+  char ignored_str[256];
+
+  while (*abp) {
+    switch (*abp) {
+    case ASN_SEQ:
+      abp += 2;
+      n_seq++;
+      break;
+    case ASN_IS_BOOL:
+    case ASN_IS_INT:
+      abp = get_asn_int(abp, &ignored_int);
+      break;
+    case ASN_IS_STR:
+      abp = get_asn_text(abp, ignored_str, sizeof(ignored_str)-1);
+      break;
+    default:
+      abp += 2;
+    }
+  }
+
+  /* one 2-byte terminator per SEQ opened above */
+  abp += 2 * n_seq;
+  return abp;
+}
+
+/* identifier bytes dispatched on by parse_fastadl_asn() */
+#define ASN_FADL_TITLE 0xa0 /* \240 160 */
+#define ASN_FADL_SEQID 0xa1 /* \241 161 */
+#define ASN_FADL_TAXID 0xa2 /* \242 162 */
+#define ASN_FADL_MEMBERS 0xa3 /* \243 163 */
+#define ASN_FADL_LINKS 0xa4 /* \244 164 */
+#define ASN_FADL_OTHER 0xa5 /* \245 165 */
+#define ASN_FADL_GI 171
+
+/* identifier bytes tested inside get_asn_seqid(); note these share
+   values with ASN_FADL_LINKS / ASN_FADL_OTHER above */
+#define ASN_FADL_TEXTSEQ_ID 0xa4 /* \244 164 */
+#define ASN_FADL_OTHERSEQ_ID 0xa5 /* \245 165 */
+
+/* from seq.asn::Textseq-id/Textannot-id */
+#define ASN_TEXTSEQ_ID_NAME 0xa0 /* \240 160 */
+#define ASN_TEXTSEQ_ID_ACC 0xa1 /* \241 161 */
+#define ASN_TEXTSEQ_ID_REL 0xa2 /* \242 162 */
+#define ASN_TEXTSEQ_ID_VER 0xa3 /* \243 163 */
+
+/* get_asn_textseq_id() - decode a Textseq-id: name, accession, release
+ * and version fields, in whatever order they appear.
+ *
+ *   abp            - current position (an optional SEQ header is skipped)
+ *   name, name_len - receives the name field
+ *   acc, acc_len   - receives the accession; ".<version>" is appended
+ *                    when a version field was present
+ *
+ * acc must already hold a NUL-terminated string on entry, since the
+ * accession field may be absent.  The result in acc is limited to 19
+ * characters (as before) or acc_len-1, whichever is smaller.
+ *
+ * Fixes relative to the original: the version is formatted with
+ * snprintf() into a buffer large enough for any int, the strncat()
+ * bound leaves room for the terminator, and the final truncation never
+ * writes outside acc when acc_len < 20.
+ *
+ * Returns the position after the fields, plus 4 bytes when a SEQ
+ * header was skipped.
+ */
+unsigned char *
+get_asn_textseq_id(unsigned char *abp,
+                   char *name, size_t name_len, char *acc, size_t acc_len)
+{
+  char release[20], ver_str[16];
+  int version;
+  int seqcnt = 0;
+  size_t used, acc_max;
+
+  ver_str[0]='\0';
+
+  if (*abp == ASN_SEQ) { abp += 2; seqcnt++;}
+
+  while (*abp) {
+    switch (*abp) {
+    case ASN_TEXTSEQ_ID_NAME :
+      abp = get_asn_text(abp+2, name, name_len);
+      break;
+    case ASN_TEXTSEQ_ID_ACC :
+      abp = get_asn_text(abp+2, acc, acc_len);
+      break;
+    case ASN_TEXTSEQ_ID_REL :
+      /* release is parsed only to step over it */
+      abp = get_asn_text(abp+2, release, sizeof(release));
+      break;
+    case ASN_TEXTSEQ_ID_VER :
+      abp = get_asn_int(abp+2, &version);
+      snprintf(ver_str, sizeof(ver_str), ".%d", version);
+      break;
+    default: abp += 2;
+    }
+  }
+  while (seqcnt-- > 0) abp += 4;
+
+  if (acc_len > 0) {
+    used = strlen(acc);
+    if (used < acc_len - 1) {
+      strncat(acc, ver_str, acc_len - 1 - used);
+    }
+    /* keep the historical 19-character limit, but stay inside acc */
+    acc_max = (acc_len < 20 ? acc_len : 20) - 1;
+    acc[acc_max]='\0';
+  }
+  return abp;
+}
+
+/* get_asn_local_id() - decode a local identifier string into acc.
+ *
+ * An optional SEQ header is skipped, then the string that follows the
+ * next 2-byte header is read with get_asn_text().  acc is forcibly
+ * terminated at acc[acc_len-1].  The return value is 2 bytes past the
+ * string (6 bytes when a SEQ header was present).
+ */
+unsigned char *
+get_asn_local_id(unsigned char *abp, char *acc, size_t acc_len)
+{
+  int had_seq = (*abp == ASN_SEQ);
+
+  if (had_seq) { abp += 2; }
+
+  abp = get_asn_text(abp+2, acc, acc_len);
+
+  if (had_seq) { abp += 4; }
+  acc[acc_len-1]='\0';
+  return abp+2; /* skip 2 NULL's */
+}
+
+/* get_asn_dbtag() - decode a Dbtag: a database name followed by a tag
+ * that is either an integer or a string.
+ *
+ *   name, name_len - receives the db name (0xa0 field)
+ *   str, str_len   - receives the tag when it is a string
+ *   id_p           - receives the tag when it is an integer
+ *
+ * A missing field is reported on stderr and skipped as 2 bytes.
+ * Returns the position 2 bytes past the last field.
+ */
+unsigned char *
+get_asn_dbtag(unsigned char *abp, char *name, size_t name_len, char *str, size_t str_len, int *id_p) {
+
+  if (*abp == ASN_SEQ) { abp += 2;}
+
+  /* db */
+  if (*abp != 0xa0) {
+    fprintf(stderr,"*** error [%s:%d] - missing dbtag:db %d %d\n",__FILE__, __LINE__, abp[0],abp[1]);
+    abp += 2;
+  }
+  else {
+    abp = get_asn_text(abp+2, name, name_len);
+  }
+
+  /* tag */
+  if (*abp != 0xa1) {
+    fprintf(stderr,"*** error [%s:%d] - missing dbtag:tag %2x %2x\n",__FILE__, __LINE__, abp[0],abp[1]);
+    abp += 2;
+  }
+  else {
+    abp += 4;	/* field header, then the id header */
+    abp = (*abp == 2) ? get_asn_int(abp,id_p)
+                      : get_asn_text(abp, str, str_len);
+  }
+  return abp+2; /* skip 2 NULL's */
+}
+
+/* identifier bytes for a date: get_asn_date() selects between the
+   string and the structured form */
+#define ASN_DATE_STR 0xa0
+#define ASN_DATE_STD 0xa1
+/* fields of the structured form, read by get_asn_date_std() */
+#define ASN_DATE_STD_YR 0xa0
+#define ASN_DATE_STD_MO 0xa1
+#define ASN_DATE_STD_DAY 0xa2
+
+/* get_asn_date_std() - decode a structured date (year, month, day)
+ * and format it into date as "YY-MM-DD" ("%02d-%02d-%02d").
+ *
+ *   abp  - current position; leading SEQ headers are skipped
+ *   date - destination; no length is passed, so the caller must supply
+ *          a buffer large enough for three formatted ints (36 bytes
+ *          covers any int values)
+ *
+ * Fields that are absent are reported as 0.  An unrecognised field tag
+ * is reported and skipped as 2 bytes; the original did not advance
+ * past it and so looped forever.  The error message previously used
+ * the malformed conversion "%0x1"; it now prints the two bytes in hex.
+ *
+ * Returns the position after the fields, plus 4 bytes per SEQ header
+ * skipped, plus 2.
+ */
+unsigned char *
+get_asn_date_std(unsigned char *abp, char *date) {
+  int seq_cnt=0;
+  int year, month, day;
+
+  year = month = day = 0;
+
+  while (*abp == ASN_SEQ) { abp+=2; seq_cnt++;}
+
+  while (*abp) {
+    switch (*abp) {
+    case ASN_DATE_STD_YR :
+      abp = get_asn_int(abp+2, &year);
+      break;
+    case ASN_DATE_STD_MO :
+      abp = get_asn_int(abp+2, &month);
+      break;
+    case ASN_DATE_STD_DAY :
+      abp = get_asn_int(abp+2, &day);
+      break;
+    default:
+      fprintf(stderr, "*** error [%s:%d] - incorrect date-std code: %2x %2x\n",
+              __FILE__, __LINE__, abp[0], abp[1]);
+      abp += 2;	/* step past the unknown tag so the loop terminates */
+    }
+  }
+  sprintf(date, "%02d-%02d-%02d", year, month, day);
+
+  while (seq_cnt-- > 0) { abp += 4;}
+
+  return abp+2;
+}
+
+/* get_asn_date() - decode a date, which is either a plain string
+ * (ASN_DATE_STR) or a structured year/month/day (ASN_DATE_STD).
+ *
+ *   abp            - points at the ASN_DATE_STR / ASN_DATE_STD byte
+ *   date, date_len - receives the date text
+ *
+ * The string form now skips its 2-byte field header before calling
+ * get_asn_text(), as every other field in this file does; previously
+ * the header byte itself was handed to get_asn_text(), which always
+ * rejected it as "str missing".
+ * NOTE(review): this fix is inferred from the other callers, not from
+ * sample data - confirm against a database that carries a string date.
+ *
+ * Returns the position 2 bytes past what the selected parser consumed
+ * (or abp+2 after reporting an unrecognised code).
+ */
+unsigned char *
+get_asn_date(unsigned char *abp, char *date, size_t date_len) {
+
+  if (*abp == ASN_DATE_STR) {
+    abp = get_asn_text(abp+2, date, date_len);
+  }
+  else if (*abp == ASN_DATE_STD) {
+    abp = get_asn_date_std(abp+2, date);
+  }
+  else {
+    fprintf(stderr, "*** error [%s:%d] - incorrect date code: %2x %2x\n",
+            __FILE__, __LINE__, abp[0], abp[1]);
+  }
+  return abp+2;
+}
+
+/* get_asn_pdb_id() - decode a PDB identifier: molecule name, chain and
+ * (discarded) release date.
+ *
+ *   acc, acc_len     - receives the molecule name (0xa0 field), limited
+ *                      to 19 characters or acc_len-1 if that is smaller
+ *   chain, chain_len - receives the chain (0xa1 field) as a
+ *                      one-character string
+ *
+ * Fixes relative to the original: the molecule name is bounded by
+ * acc_len rather than a hard-coded 20, the chain is written only when
+ * chain_len has room for it, and the unreachable "case 0" inside the
+ * while (*abp) loop is gone.
+ *
+ * Returns the position after the fields, plus 4 bytes when a SEQ
+ * header was skipped.
+ */
+unsigned char *
+get_asn_pdb_id(unsigned char *abp, char *acc, size_t acc_len, char *chain, size_t chain_len)
+{
+  int ichain, seq_cnt=0;
+  int mol_len;
+  char dummy[40];
+
+  if (*abp == ASN_SEQ) { abp += 2; seq_cnt++;}
+
+  mol_len = (acc_len < 20) ? (int)acc_len : 20;
+
+  while (*abp) {
+    switch (*abp) {
+    case 0xa0:	/* mol */
+      if (mol_len > 0) {
+        abp = get_asn_text(abp+2, acc, mol_len);
+      }
+      else {	/* no room in acc: parse into scratch space to step over it */
+        abp = get_asn_text(abp+2, dummy, sizeof(dummy));
+      }
+      break;
+    case 0xa1:	/* chain */
+      abp = get_asn_int(abp+2, &ichain);
+      if (chain_len >= 2) {
+        chain[0] = ichain;
+        chain[1] = '\0';
+      }
+      else if (chain_len == 1) {
+        chain[0] = '\0';
+      }
+      break;
+    case 0xa2:	/* release */
+      abp = get_asn_date(abp+2, dummy, sizeof(dummy));
+      break;
+    default: abp+=2;
+    }
+  }
+  while (seq_cnt-- > 0) {abp += 4;}
+  return abp;
+}
+
+/* get_asn_seqid_ori() - decode a sequence identifier: an optional GI
+ * followed by one database-specific identifier.
+ *
+ *   gi_p           - receives the GI number (0 if none)
+ *   db             - receives the database type, taken from the low
+ *                    five bits of the identifier byte (values above 17
+ *                    are treated as 0)
+ *   acc, acc_len   - receives the accession / local id / PDB molecule
+ *   name, name_len - receives the name (or PDB chain)
+ *
+ * If abp does not start with a SEQ header an error is printed and abp
+ * is returned unchanged.
+ *
+ * Fix: case 10 (Dbtag) previously had no break and fell through into
+ * the PDB parser, which then re-interpreted the bytes that follow the
+ * Dbtag.
+ *
+ * Returns the position after the identifier plus 4 bytes for every SEQ
+ * header that was skipped.
+ */
+unsigned char *
+get_asn_seqid_ori(unsigned char *abp, int *gi_p, int *db, char *acc, size_t acc_len, char *name, size_t name_len)
+{
+  int db_type, itmp, seq_cnt=0;
+
+  *gi_p = 0;
+
+  if (*abp != ASN_SEQ) {
+    fprintf(stderr, "*** error [%s:%d] - seqid - missing SEQ 1: %2x %2x\n",
+            __FILE__, __LINE__, abp[0], abp[1]);
+    return abp;
+  }
+  else { abp += 2; seq_cnt++;}
+
+  db_type = (*abp & ASN_TYPE_MASK);
+
+  if (db_type == 11) { /* gi */
+    abp = get_asn_int(abp+2,gi_p);
+  }
+
+  while (*abp == ASN_SEQ) {abp += 2; seq_cnt++;}
+
+  db_type = (*abp & ASN_TYPE_MASK);
+  if (db_type > 17) {db_type = 0;}
+  *db = db_type;
+
+  switch(db_type) {
+  case 0:
+    abp = get_asn_local_id(abp, acc, acc_len);
+    break;
+  case 1:
+  case 2:
+    abp = get_asn_int(abp+2,&itmp);
+    abp += 2;
+    break;
+  case 11:
+    abp = get_asn_int(abp+2,&itmp);
+    break;
+  case 4:
+  case 5:
+  case 6:
+  case 7:
+  case 9:
+  case 12:
+  case 13:
+  case 15:
+  case 16:
+  case 17:
+    abp = get_asn_textseq_id(abp,name,name_len, acc, acc_len);
+    break;
+  case 10:
+    abp = get_asn_dbtag(abp+2,name,name_len, acc, acc_len, &itmp);
+    break;
+  case 14:
+    abp = get_asn_pdb_id(abp,acc,acc_len,name,name_len);
+    break;
+  default: abp += 2;
+  }
+
+  while (seq_cnt-- > 0) { abp += 4;}
+  return abp;
+}
+
+/* get_asn_db_info() - decode the identifier body for a known db_type.
+ *
+ *   db_type        - selects the parser (see the branches below)
+ *   gi_p           - receives the integer for db_type 1, 2 and 11
+ *   name, name_len - receives the name / Dbtag db / PDB chain
+ *   acc, acc_len   - receives the accession / local id / PDB molecule
+ *
+ * db_type 11 is handled up front and returns immediately.  For every
+ * other type, leading SEQ headers are skipped first and 4 bytes per
+ * skipped header are added to the returned position.  Types without a
+ * parser are skipped as 2 bytes.
+ */
+unsigned char *
+get_asn_db_info(unsigned char *abp, int db_type, int *gi_p, char *name, size_t name_len, char *acc, size_t acc_len) {
+  int n_seq = 0, itmp;
+
+  if (db_type == 11) {
+    return get_asn_int(abp, gi_p);
+  }
+
+  for ( ; *abp == ASN_SEQ; abp += 2) { n_seq++; }
+
+  if (db_type == 0) {
+    abp = get_asn_local_id(abp, acc, acc_len);
+  }
+  else if (db_type == 1 || db_type == 2) {
+    abp = get_asn_int(abp,gi_p);
+  }
+  else if (db_type == 10) {
+    abp = get_asn_dbtag(abp,name,name_len, acc, acc_len, &itmp);
+  }
+  else if (db_type == 14) {
+    abp = get_asn_pdb_id(abp,acc,acc_len,name,name_len);
+  }
+  else if ((db_type >= 4 && db_type <= 7) || db_type == 9 ||
+           db_type == 12 || db_type == 13 ||
+           (db_type >= 15 && db_type <= 17)) {
+    /* the Textseq-id based types */
+    abp = get_asn_textseq_id(abp,name,name_len, acc, acc_len);
+  }
+  else {
+    abp += 2;
+  }
+
+  return abp + 4 * n_seq;
+}
+
+
+/* get_asn_seqid() - decode the list of identifiers inside a SEQ.
+ *
+ *   gi_p           - reset to 0, then set if a GI identifier is found
+ *   db             - receives the db_type of the last identifier handled
+ *                    by get_asn_db_info(); not written otherwise
+ *   acc, acc_len   - receives the accession
+ *   name, name_len - receives the name
+ *
+ * Without a leading SEQ header an error is printed and abp is returned
+ * unchanged.  Inside the SEQ, 0xa4/0xa5 elements are parsed as
+ * Textseq-ids; any other element whose low five bits are below 17 goes
+ * to get_asn_db_info(); anything else is reported and parsing stops at
+ * that byte.  On normal completion the result is 4 bytes past the
+ * terminating zero byte position.
+ */
+unsigned char *
+get_asn_seqid(unsigned char *abp, int *gi_p, int *db, char *acc, size_t acc_len, char *name, size_t name_len)
+{
+  int db_type;
+
+  *gi_p = 0;
+
+  if (*abp != ASN_SEQ) {
+    fprintf(stderr, "*** error [%s:%d] - get_asn_seqid - missing SEQ 1: %2x %2x\n",
+            __FILE__, __LINE__, abp[0], abp[1]);
+    return abp;
+  }
+  abp += 2;
+
+  while (*abp) {
+    if (*abp == ASN_FADL_TEXTSEQ_ID || *abp == ASN_FADL_OTHERSEQ_ID) {
+      abp = get_asn_textseq_id(abp+2, name, name_len, acc, acc_len );
+      continue;
+    }
+
+    db_type = (*abp & ASN_TYPE_MASK);
+    if (db_type >= 17) {
+      fprintf(stderr,"*** error [%s:%d] -- get_asn_seqid not TEXTSEQ/not GI: %2x %2x\n",
+              __FILE__, __LINE__,abp[0], abp[1]);
+      return abp;
+    }
+
+    abp = get_asn_db_info(abp+2, db_type, gi_p, name, name_len, acc, acc_len);
+    *db = db_type;
+  }
+
+  /* one SEQ header was opened above */
+  return abp + 4;
+}
+
+/* get_asn_seqid_other() - decode an identifier that carries accession
+ * and name fields directly inside a SEQ.
+ *
+ *   gi_p           - reset to 0; set only by the ASN_FADL_GI check below
+ *   acc, acc_len   - reset to "", then receives the accession
+ *   name, name_len - reset to "", then receives the name
+ *
+ * Without a leading SEQ header an error is printed and abp is returned
+ * unchanged.  An unrecognised field is reported and parsing stops at
+ * that byte.
+ *
+ * Fix: the accession test was a stand-alone "if", so after an
+ * accession had been read the following if/else chain ran on the next
+ * byte as well and took the error return whenever that byte was not a
+ * version, name or integer (including the terminating zero).  The
+ * tests now form a single else-if chain.
+ *
+ * NOTE(review): the version field is read with get_asn_text() into
+ * acc, overwriting the accession; elsewhere in this file the version
+ * is an integer.  Left as is - confirm the intended handling.
+ * NOTE(review): the ASN_FADL_GI test after the loop can only be
+ * reached with *abp == 0, so it never fires as written.
+ */
+unsigned char *
+get_asn_seqid_other(unsigned char *abp, int *gi_p, char *acc, size_t acc_len, char *name, size_t name_len)
+{
+  int itmp, seq_cnt=0;
+
+  *gi_p = 0;
+  name[0] = acc[0] = '\0';
+
+  if (*abp != ASN_SEQ) {
+    fprintf(stderr, "*** error [%s:%d] - get_asn_seqid - missing SEQ 1: %2x %2x\n",
+            __FILE__, __LINE__, abp[0], abp[1]);
+    return abp;
+  }
+  else { abp += 2; seq_cnt++;}
+
+  while (*abp) {
+    if (*abp == ASN_TEXTSEQ_ID_ACC) {
+      abp = get_asn_text(abp+2, acc, acc_len );
+    }
+    else if (*abp == ASN_TEXTSEQ_ID_VER) {
+      abp = get_asn_text(abp+2, acc, acc_len );
+    }
+    else if (*abp == ASN_TEXTSEQ_ID_NAME) {
+      abp = get_asn_text(abp+2, name, name_len );
+    }
+    else if (*abp == ASN_IS_INT) {
+      abp = get_asn_int(abp, &itmp );
+    }
+    else {
+      fprintf(stderr,"*** error [%s:%d] -- get_asn_seqid not SEQID_ACC: %2x %2x\n",
+              __FILE__, __LINE__,abp[0], abp[1]);
+      return abp;
+    }
+  }
+
+  if (*abp == ASN_FADL_GI) {
+    abp = get_asn_int(abp+2, gi_p);
+  }
+
+  while (seq_cnt-- > 0) { abp += 4;}
+  return abp;
+}
+
+
+/* parse_fastadl_asn() - decode one definition-line record from an
+ * ASN.1 header buffer.
+ *
+ *   asn_buff, asn_max - start and end (exclusive) of the buffer
+ *   gi_p, db          - receive GI and database type from the seqid
+ *   acc, name         - receive accession and name
+ *   title, t_len      - receives the title text
+ *   taxid_p           - receives the taxon id
+ *
+ * acc, name, title and *db are cleared first.  Elements are dispatched
+ * on their identifier byte until a zero byte or asn_max is reached.
+ * A members or links element, or any unrecognised byte, is handed to
+ * get_asn_junk() and ends the scan.  Trailing zero bytes are then
+ * skipped.
+ *
+ * Returns the position of the next non-zero byte, or NULL when the end
+ * of the buffer was reached.
+ */
+unsigned char *
+parse_fastadl_asn(unsigned char *asn_buff, unsigned char *asn_max,
+                  int *gi_p, int *db, char *acc, size_t acc_len,
+                  char *name, size_t name_len,
+                  char *title, size_t t_len, int *taxid_p) {
+  unsigned char *abp = asn_buff;
+  int done = 0;
+
+  acc[0] = '\0';
+  name[0] = '\0';
+  title[0] = '\0';
+  db[0] = 0;
+
+  while (!done && abp < asn_max && *abp) {
+    switch (*abp) {
+    case ASN_SEQ:
+      abp += 2;
+      break;
+    case ASN_FADL_TITLE:
+      abp = get_asn_text(abp+2, title, t_len);
+      break;
+    case ASN_FADL_SEQID:
+      abp = get_asn_seqid(abp+2, gi_p, db, acc, acc_len, name, name_len);
+      break;
+    case ASN_FADL_TAXID:
+      abp = get_asn_int(abp+2, taxid_p);
+      break;
+    case ASN_FADL_MEMBERS:
+    case ASN_FADL_LINKS:
+      abp = get_asn_junk(abp+2);
+      done = 1;
+      break;
+    case ASN_FADL_OTHER:	/* possibly here for seqid without name */
+      abp = get_asn_seqid_other(abp+2, gi_p, acc, acc_len, name, name_len);
+      break;
+    default:
+      /* something is broken, give up */
+      abp = get_asn_junk(abp);
+      done = 1;
+      break;
+    }
+  }
+
+  while (abp < asn_max && *abp == '\0' ) abp++;
+
+  return (abp >= asn_max) ? NULL : abp;
+}
+
+
+/* parse_pal() - read keyword lines from an alias file.
+ *
+ * Lines starting with '#' are ignored.  The recognised keywords and
+ * the argument each one fills are:
+ *   DBLIST  -> dname     (first whitespace-delimited word)
+ *   OIDLIST -> msk_name  (first whitespace-delimited word)
+ *   NSEQ    -> *oid_seqs (decimal integer)
+ *   MAXOID  -> *max_oid  (decimal integer)
+ * The value is scanned starting one character after the keyword.
+ * Arguments whose keyword does not appear are left untouched.  The
+ * string arguments are written without a length limit, so the caller
+ * must supply buffers large enough for any word in the file.
+ */
+void
+parse_pal(char *dname, char *msk_name,
+          int *oid_seqs, int *max_oid,
+          FILE *fd) {
+
+  char buf[MAX_STR];
+
+  for (;;) {
+    if (fgets(buf,sizeof(buf),fd) == NULL) break;
+    if (buf[0] == '#') continue;
+
+    if (!strncmp(buf, "DBLIST", 6)) {
+      sscanf(buf+7,"%s",dname);
+      continue;
+    }
+    if (!strncmp(buf, "OIDLIST", 7)) {
+      sscanf(buf+8,"%s",msk_name);
+      continue;
+    }
+    if (!strncmp(buf, "NSEQ", 4)) {
+      sscanf(buf+5,"%d",oid_seqs);
+      continue;
+    }
+    if (!strncmp(buf, "MAXOID", 6)) {
+      sscanf(buf+7,"%d",max_oid);
+    }
+  }
+}
+
+/* part of new ambiguity code */
+/* readMFILE() - fread()-like read from a library that is either
+ * memory mapped or an ordinary stream.
+ *
+ *   buffer - destination for size*nitems bytes
+ *   size   - size of one item
+ *   nitems - number of items wanted
+ *   m_fd   - library descriptor; NULL yields 0
+ *
+ * When m_fd->mm_flg is set the bytes are copied from m_fd->mmap_addr,
+ * which is then advanced, and nitems is returned without any bounds
+ * check.  Otherwise the fread() result on m_fd->libf is returned.
+ */
+int readMFILE (void *buffer, size_t size, int nitems, struct lmf_str *m_fd) {
+  size_t n_bytes;
+
+  if (m_fd == NULL) {
+    return 0;
+  }
+
+  if (!m_fd->mm_flg) {
+    return fread((unsigned char *)buffer, size, nitems, m_fd->libf);
+  }
+
+  n_bytes = size * nitems;
+  memcpy((void *) buffer, (void *) m_fd->mmap_addr, n_bytes);
+  m_fd->mmap_addr += n_bytes;
+  return nitems;
+}
diff --git a/src/ncbl_head.h b/src/ncbl_head.h
new file mode 100644
index 0000000..786d431
--- /dev/null
+++ b/src/ncbl_head.h
@@ -0,0 +1,33 @@
+/* ncbl_head.h - constants used by ncbl_lib.c to read blast 1.3
+   (setdb) format database files */
+
+/* $Id: ncbl_head.h 625 2011-03-23 17:21:38Z wrp $ */
+/* $Revision: 625 $ */
+
+#define AMINO_ACID_SEQTYPE 1
+#define AA_SEQTYPE AMINO_ACID_SEQTYPE
+#define NUCLEIC_ACID_SEQTYPE 2
+#define NT_SEQTYPE NUCLEIC_ACID_SEQTYPE
+
+/* Filename extensions used by the two types of databases (a.a. and nt.):
+   header (description) file, offset table file, sequence file */
+#define AA_HEADER_EXT "ahd"
+#define AA_TABLE_EXT "atb"
+#define AA_SEARCHSEQ_EXT "bsq"
+#define NT_HEADER_EXT "nhd"
+#define NT_TABLE_EXT "ntb"
+#define NT_SEARCHSEQ_EXT "csq"
+
+/* compared with the first value read from the table file */
+#define DB_TYPE_PRO 0x78857a4f /* Magic # for a protein sequence database */
+#define DB_TYPE_NUC 0x788325f8 /* Magic # for a nt. sequence database */
+
+/* compared with the second value read from the table file */
+#define AAFORMAT 3 /* Latest a.a. database format ID number */
+#define NTFORMAT 6 /* Latest nt. database format ID number */
+
+#define NULLB '\0' /* sentinel byte */
+#define NT_MAGIC_BYTE 0xfc /* Magic byte at end of compressed nt db */
+
+#ifndef CHAR_BIT
+#define CHAR_BIT 8 /* these values should match blast */
+#endif
+
+/* ncbl_lib.c divides by CHAR_BIT/NBPN to turn a residue offset into a
+   byte offset; presumably NBPN is the number of bits per nucleotide */
+#define NBPN 2
+/* subtracted from the byte length of each compressed nt entry */
+#define NSENTINELS 2
diff --git a/src/ncbl_lib.c b/src/ncbl_lib.c
new file mode 100644
index 0000000..4a11659
--- /dev/null
+++ b/src/ncbl_lib.c
@@ -0,0 +1,491 @@
+/* ncbl_lib.c functions to read ncbi-blast format files from
+ setdb (blastp 1.3.2) format files */
+
+/* $Id: ncbl_lib.c 625 2011-03-23 17:21:38Z wrp $ */
+/* $Revision: 625 $ */
+
+/* copyright (c) 1996, 1997, 1998, 1999, 2014 by William R. Pearson and
+ The Rector and Visitors of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifndef WIN32
+#define RBSTR "r"
+#else
+#define RBSTR "rb"
+#endif
+
+#define XTERNAL
+#include "uascii.h"
+#include "upam.h"
+#include "ncbl_head.h"
+#include "mm_file.h"
+
+/* sequence readers for protein (a) and nucleotide (n) databases */
+int ncbl_getliba(char *, int, char *, int, fseek_t *, int *, struct lmf_str *, long *);
+int ncbl_getlibn(char *, int, char *, int, fseek_t *, int *, struct lmf_str *, long *);
+
+void src_ulong_read();
+
+/* the helpers are file-local unless NCBL13_ONLY is defined */
+#ifndef NCBL13_ONLY
+static void src_char_read();
+static void newname(char *, char *, char *, int);
+#else
+void src_char_read();
+void newname(char *, char *, char *, int);
+#endif
+
+/* nt_btoa maps from blast 2bit format to ascii characters */
+static char nt_btoa[5] = {"ACGT"};
+
+/* aa_btoa maps a residue code read from the file to its letter */
+static char aa_btoa[27]= {"-ARNDCQEGHILKMFPSTWYVBZX*"};
+static int aa_btof[32]; /* maps to fasta alphabet */
+
+static FILE *tfile=NULL, /* table of offsets, also DB info */
+ *hfile=NULL, /* description lines */
+ *sfile=NULL; /* binary sequence data */
+
+/* lib_cnt: index of the current entry; max_cnt: number of entries;
+   totlen, mxlen, dbline_len: values read from the table file */
+static unsigned long lib_cnt, max_cnt, totlen, mxlen, dbline_len;
+/* per-entry offsets into the sequence and header files (max_cnt+1 each) */
+static unsigned long *seq_beg, *hdr_beg;
+/* one bit per entry, tested in ncbl_ranlib() for nt databases */
+static unsigned char *ambiguity_ray;
+static long seq_format, dbtype, dbformat;
+static char dline[512];
+
+#define NCBIBL13 11
+
+/* ncbl_openlib() - open the table, header and sequence files of a
+ * blast 1.3 format database and load its offset tables.
+ *
+ *   name    - database base name; the extensions are added by newname()
+ *   ldnaseq - 1 selects the nucleotide files, anything else the
+ *             protein files
+ *
+ * On success the file-scope state (tfile/hfile/sfile, max_cnt, totlen,
+ * mxlen, seq_beg, hdr_beg and, for nucleotides, ambiguity_ray) is
+ * filled in and lib_cnt is reset to 0.
+ *
+ * NOTE(review): the function is declared to return struct lmf_str *
+ * but every return statement yields an int (-1 on failure, 1 on
+ * success); no struct lmf_str is created here.  Confirm what callers
+ * expect before relying on the return value.
+ * NOTE(review): files opened before a later failure are not closed on
+ * the error paths shown here.
+ */
+struct lmf_str *
+ncbl_openlib(char *name, int ldnaseq)
+{
+ char hname[256];
+ char sname[256];
+ char tname[256];
+ long title_len;
+ char *title_str;
+ int rdtmp;
+ int i;
+ unsigned long line_len, c_len, clean_count;
+
+ /* open the table file and remember which format we expect */
+ if (ldnaseq!=1) {
+ newname(tname,name,AA_TABLE_EXT,(int)sizeof(tname));
+ if ((tfile = fopen(tname,RBSTR))==NULL) {
+ /* NOTE(review): this protein branch reports NT_TABLE_EXT */
+ fprintf(stderr," cannot open %s (%s.%s) table file\n",
+ name,tname,NT_TABLE_EXT);
+ return (-1);
+ }
+ seq_format = AAFORMAT;
+ }
+ else {
+ newname(tname,name,NT_TABLE_EXT,(int)sizeof(tname));
+ if ((tfile = fopen(tname,RBSTR))==NULL) {
+ fprintf(stderr," cannot open %s (%s.%s) table file\n",
+ name,tname,NT_TABLE_EXT);
+ return (-1);
+ }
+ seq_format = NTFORMAT;
+ }
+
+ /* the table file starts with the database type and format numbers */
+ src_ulong_read(tfile,&dbtype);
+ src_ulong_read(tfile,&dbformat);
+
+ if (seq_format == AAFORMAT && (dbformat != seq_format || dbtype !=
+ DB_TYPE_PRO)) {
+ fprintf(stderr,"error - %s wrong type (%ld/%d) or format (%ld/%ld)\n",
+ tname,dbtype,DB_TYPE_PRO,dbformat,seq_format);
+ return (-1);
+ }
+ else if (seq_format == NTFORMAT && (dbformat != seq_format || dbtype !=
+ DB_TYPE_NUC)) {
+ fprintf(stderr,"error - %s wrong type (%ld/%d) or format (%ld/%ld)\n",
+ tname,dbtype,DB_TYPE_NUC,dbformat,seq_format);
+ return (-1);
+ }
+
+ /* open the matching header and sequence files */
+ if (seq_format == AAFORMAT) {
+ newname(hname,name,AA_HEADER_EXT,(int)sizeof(hname));
+ if ((hfile = fopen(hname,RBSTR))==NULL) {
+ fprintf(stderr," cannot open %s header file\n",hname);
+ return (-1);
+ }
+ newname(sname,name,AA_SEARCHSEQ_EXT,(int)sizeof(sname));
+ if ((sfile = fopen(sname,RBSTR))==NULL) {
+ fprintf(stderr," cannot open %s sequence file\n",sname);
+ return (-1);
+ }
+ }
+ else {
+ newname(hname,name,NT_HEADER_EXT,(int)sizeof(hname));
+ if ((hfile = fopen(hname,RBSTR))==NULL) {
+ fprintf(stderr," cannot open %s header file\n",hname);
+ return (-1);
+ }
+ newname(sname,name,NT_SEARCHSEQ_EXT,(int)sizeof(sname));
+ if ((sfile = fopen(sname,RBSTR))==NULL) {
+ fprintf(stderr," cannot open %s sequence file\n",sname);
+ return (-1);
+ }
+ }
+
+/* all files should be open */
+
+ /* the title is stored padded to a multiple of 4 bytes; it is read
+ only to advance the file position.
+ NOTE(review): title_str is not freed or used in this function. */
+ src_ulong_read(tfile,&title_len);
+ rdtmp = title_len + ((title_len%4 !=0 ) ? 4-(title_len%4) : 0);
+ if ((title_str = calloc((size_t)rdtmp,sizeof(char)))==NULL) {
+ fprintf(stderr," cannot allocate title string (%d)\n",rdtmp);
+ return(-1);
+ }
+ fread(title_str,(size_t)1,(size_t)rdtmp,tfile);
+
+ lib_cnt = 0;
+ if (seq_format == AAFORMAT) {
+ src_ulong_read(tfile,&max_cnt);
+ src_ulong_read(tfile,&totlen);
+ src_ulong_read(tfile,&mxlen);
+
+ /* fprintf(stderr," max_cnt: %d, totlen: %d\n",max_cnt,totlen); */
+
+ if ((seq_beg=(unsigned long *)calloc((size_t)max_cnt+1,sizeof(long)))==NULL) {
+ fprintf(stderr," cannot allocate sequence pointers\n");
+ return -1;
+ }
+ if ((hdr_beg=(unsigned long *)calloc((size_t)max_cnt+1,sizeof(long)))==NULL) {
+ fprintf(stderr," cannot allocate header pointers\n");
+ return -1;
+ }
+ for (i=0; i<max_cnt+1; i++) src_ulong_read(tfile,&seq_beg[i]);
+ for (i=0; i<max_cnt+1; i++) src_ulong_read(tfile,&hdr_beg[i]);
+
+ /* build the file-code to fasta-alphabet table; letters that
+ aascii[] maps to NA or above become 'X' */
+ for (i=0; i<sizeof(aa_btoa); i++) {
+ if ((rdtmp=aascii[aa_btoa[i]])<NA) aa_btof[i]=rdtmp;
+ else aa_btof[i]=aascii['X'];
+ }
+ }
+ else if (seq_format == NTFORMAT) {
+ src_ulong_read(tfile,&dbline_len); /* length of uncompress DB lines */
+ src_ulong_read(tfile,&max_cnt); /* number of entries */
+ src_ulong_read(tfile,&mxlen); /* maximum length sequence */
+ src_ulong_read(tfile,&totlen); /* total count */
+ src_ulong_read(tfile,&c_len); /* compressed db length */
+ src_ulong_read(tfile,&clean_count); /* count of nt's cleaned */
+
+ fseek(tfile,(size_t)((clean_count)*4),1);
+ /* seek over clean_count */
+ if ((seq_beg=(unsigned long *)calloc((size_t)max_cnt+1,sizeof(long)))==NULL) {
+ fprintf(stderr," cannot allocate sequence pointers\n");
+ return -1;
+ }
+ if ((hdr_beg=(unsigned long *)calloc((size_t)max_cnt+1,sizeof(long)))==NULL) {
+ fprintf(stderr," cannot allocate header pointers\n");
+ return -1;
+ }
+ if ((ambiguity_ray=
+ (unsigned char *)calloc((size_t)max_cnt/CHAR_BIT+1,sizeof(char)))==NULL) {
+ fprintf(stderr," cannot allocate ambiguity_ray\n");
+ return -1;
+ }
+
+ /* seq_beg is read, then a second table of the same size is
+ skipped before hdr_beg */
+ for (i=0; i<max_cnt+1; i++) src_ulong_read(tfile,&seq_beg[i]);
+ fseek(tfile,(size_t)((max_cnt+1)*4),1);
+ /* seek over seq_beg */
+ for (i=0; i<max_cnt+1; i++) src_ulong_read(tfile,&hdr_beg[i]);
+ for (i=0; i<max_cnt/CHAR_BIT+1; i++)
+ src_char_read(tfile,&ambiguity_ray[i]);
+ }
+ return 1;
+}
+
+/* ncbl_closelib() - close whichever of the table, header and sequence
+ * files are open and reset their handles to NULL, so the call can be
+ * repeated safely. */
+void ncbl_closelib()
+{
+  FILE **handles[3];
+  int i;
+
+  handles[0] = &tfile;
+  handles[1] = &hfile;
+  handles[2] = &sfile;
+
+  for (i = 0; i < 3; i++) {
+    if (*handles[i] == NULL) continue;
+    fclose(*handles[i]);
+    *handles[i] = NULL;
+  }
+}
+
+/* ncbl_getliba() - read the next protein sequence (or the next piece
+ * of a long one) from the sequence file.
+ *
+ *   seq, maxs    - destination buffer and its size
+ *   libstr       - description buffer; set to "" at the start of an
+ *                  entry (n_libstr is not used)
+ *   libpos       - receives the index of the entry being read
+ *   lcont        - in: 0 to start a new entry, non-zero to continue the
+ *                  current one; out: 0 when the entry is complete,
+ *                  otherwise incremented
+ *   m_fd, l_off  - not used; present so that the definition agrees with
+ *                  the prototype declared at the top of this file
+ *
+ * Residue codes are translated through aa_btof[] and the result is
+ * terminated with EOSEQ.  A trailing '*' residue is dropped.
+ *
+ * Returns the number of residues stored, or -1 at the end of the
+ * database or on a read error.
+ *
+ * Fixes relative to the original: lcont was declared "int" but
+ * dereferenced; the final error message printed the libstr pointer
+ * with %ld; the residue codes used to index aa_btoa[] and aa_btof[]
+ * were unchecked (and possibly negative) chars, and an empty entry
+ * read seq[-1].
+ * NOTE(review): the two added trailing parameters follow the in-file
+ * prototype - confirm no caller still passes six arguments.
+ */
+int
+ncbl_getliba(char *seq, int maxs,
+             char *libstr, int n_libstr,
+             fseek_t *libpos,
+             int *lcont,
+             struct lmf_str *m_fd, long *l_off)
+{
+  char *sptr;
+  long seqcnt;
+  long tmp;
+  unsigned char code;
+  static long seq_len;
+
+  *libpos = lib_cnt;
+  if (*lcont==0) {
+    if (lib_cnt >= max_cnt) return -1;
+    seq_len = seq_beg[lib_cnt+1] - seq_beg[lib_cnt] -1;
+    tmp=(long)fgetc(sfile);	/* skip the null byte */
+    if (tmp!=NULLB)
+      fprintf(stderr," phase error: %ld:%ld found\n",(long)lib_cnt,tmp);
+    libstr[0]='\0';
+  }
+
+  if (seq_len < maxs) {
+    if ((tmp=(long)fread(seq,(size_t)1,(size_t)seq_len,sfile))!=seq_len) {
+      fprintf(stderr," could not read sequence record: %ld %ld != %ld\n",
+              (long)*libpos,tmp,seq_len);
+      goto error;
+    }
+    /* drop a trailing '*' (terminator residue) if there is one */
+    seqcnt = seq_len;
+    if (seq_len > 0) {
+      code = (unsigned char)seq[seq_len-1];
+      if (code < sizeof(aa_btoa) && aa_btoa[code]=='*') seqcnt = seq_len-1;
+    }
+    lib_cnt++;
+    *lcont = 0;
+  }
+  else {
+    if (fread(seq,(size_t)1,(size_t)(maxs-1),sfile)!=(size_t)(maxs-1)) {
+      fprintf(stderr," could not read sequence record: %ld %ld\n",
+              (long)*libpos,seq_len);
+      goto error;
+    }
+    (*lcont)++;
+    seqcnt = maxs-1;
+    seq_len -= seqcnt;
+  }
+
+  /* translate file codes to the fasta alphabet; codes outside the
+     table become 'X' */
+  for (sptr = seq; sptr < seq+seqcnt; sptr++) {
+    code = (unsigned char)*sptr;
+    if (code < sizeof(aa_btof)/sizeof(aa_btof[0])) *sptr = aa_btof[code];
+    else *sptr = aascii['X'];
+  }
+
+  seq[seqcnt]= EOSEQ;
+  return (seqcnt);
+
+error:	fprintf(stderr," error reading %s at %ld\n",libstr,(long)*libpos);
+  fflush(stderr);
+  return (-1);
+}
+
+/* ncbl_getlibn() - read the next nucleotide sequence (or the next
+ * piece of a long one) from the 2-bit packed sequence file.
+ *
+ *   seq, maxs    - destination buffer and its size
+ *   libstr       - description buffer; set to "" at the start of an
+ *                  entry (n_libstr is not used)
+ *   libpos       - receives the index of the entry being read
+ *   lcont        - in: 0 to start a new entry, non-zero to continue the
+ *                  current one; out: 0 when the entry is complete,
+ *                  otherwise incremented
+ *   m_fd, l_off  - not used; present so that the definition agrees with
+ *                  the prototype declared at the top of this file
+ *
+ * Each packed byte holds four bases; they are expanded in place to the
+ * values 1..4 and the result is terminated with EOSEQ.  Every entry is
+ * expected to be bracketed by NT_MAGIC_BYTE.
+ *
+ * Returns the number of bases stored, or -1 at the end of the database
+ * or on a read/phase error.
+ *
+ * Fixes relative to the original: the definition took six parameters
+ * while its prototype declares eight, and the final error message
+ * printed the libstr pointer with %ld.
+ * NOTE(review): the two added trailing parameters follow the in-file
+ * prototype - confirm no caller still passes six arguments.
+ */
+int
+ncbl_getlibn(char *seq, int maxs,
+             char *libstr, int n_libstr,
+             fseek_t *libpos, int *lcont,
+             struct lmf_str *m_fd, long *l_off)
+{
+  char *sptr, *tptr;
+  unsigned char stmp;
+  long seqcnt;
+  long tmp;
+  static long seq_len;
+  static int c_len,c_pad;
+
+  *libpos = lib_cnt;
+  if (*lcont==0) {
+    if (lib_cnt >= max_cnt) return -1;
+    /* byte length of the packed entry, without the sentinel bytes */
+    c_len = seq_beg[lib_cnt+1]/(CHAR_BIT/NBPN)
+      - seq_beg[lib_cnt]/(CHAR_BIT/NBPN);
+    c_len -= NSENTINELS;
+
+    seq_len = c_len*(CHAR_BIT/NBPN);
+    c_pad = seq_beg[lib_cnt] & ((CHAR_BIT/NBPN)-1);
+    if (c_pad != 0) seq_len -= ((CHAR_BIT/NBPN) - c_pad);
+
+    tmp=fgetc(sfile);	/* skip the null byte */
+    if (tmp!=NT_MAGIC_BYTE) {
+      fprintf(stderr," phase error: %ld:%ld (%ld/%d) found\n",
+              (long)lib_cnt,seq_len,tmp,NT_MAGIC_BYTE);
+      goto error;
+    }
+    libstr[0]='\0';
+  }
+
+  if (seq_len < maxs-3) {
+    seqcnt=(seq_len+3)/4;
+    if (seqcnt==0) seqcnt++;
+    if ((tmp=(long)fread(seq,(size_t)1,(size_t)seqcnt,sfile))
+        !=seqcnt) {
+      fprintf(stderr,
+              " could not read sequence record: %s %ld %ld != %ld: %d\n",
+              libstr,(long)*libpos,tmp,seqcnt,*seq);
+      goto error;
+    }
+    tmp=fgetc(sfile);	/* skip the null byte */
+    if (tmp!=(unsigned char)NT_MAGIC_BYTE) {
+      fprintf(stderr," phase2 error: %ld:%ld (%ld/%d) next ",
+              (long)lib_cnt,seqcnt,tmp,NT_MAGIC_BYTE);
+
+      goto error;
+    }
+    *lcont = 0;
+    lib_cnt++;
+  }
+  else {
+    seqcnt = ((maxs+3)/4)-1;
+    if (fread(seq,(size_t)1,(size_t)(seqcnt),sfile)!=(size_t)(seqcnt)) {
+      fprintf(stderr," could not read sequence record: %s %ld %ld\n",
+              libstr,(long)*libpos,seqcnt);
+      goto error;
+    }
+    (*lcont)++;
+  }
+
+  /* point to the last packed byte and to the end of the array
+     seqcnt is the exact number of bytes read
+     tptr points to the destination, use multiple of 4 to simplify math
+     sptr points to the source, note that the last byte will be read 4 cycles
+     before it is written
+  */
+
+  sptr = seq + seqcnt;
+  tptr = seq + 4*seqcnt;
+  while (sptr>seq) {
+    stmp = (unsigned char)*--sptr;
+    *--tptr = (stmp&3) +1;
+    *--tptr = ((stmp >>= 2)&3)+1;
+    *--tptr = ((stmp >>= 2)&3)+1;
+    *--tptr = ((stmp >>= 2)&3)+1;
+  }
+
+  if (seqcnt*4 >= seq_len) {	/* there was enough room */
+    seq[seq_len]= EOSEQ;
+    return seq_len;
+  }
+  else {			/* not enough room */
+    seq[seqcnt*4]=EOSEQ;
+    seq_len -= 4*seqcnt;
+    return (4*seqcnt);
+  }
+
+error:	fprintf(stderr," error reading %s at %ld\n",libstr,(long)*libpos);
+  fflush(stderr);
+  return (-1);
+}
+
+/* ncbl_ranlib() - position the library at entry libpos and build its
+ * description line.
+ *
+ *   str, cnt - destination buffer and its size; always NUL-terminated
+ *              when cnt > 0
+ *   libpos   - index of the entry; becomes the current entry (lib_cnt)
+ *
+ * The header is read from the header file.  When it has a '|' at
+ * offset 9 or 10, the text between the first '|' found from offset 3
+ * and the next blank is used as a 10-column identifier followed by the
+ * rest of the line; otherwise the header is copied as is.  Entries
+ * flagged in ambiguity_ray (nt databases only) get a leading '*'.
+ * Finally the sequence file is positioned one byte before the entry.
+ *
+ * Fixes relative to the original: a header without a blank after the
+ * identifier dereferenced NULL; the text was built with sprintf() and
+ * strncat(), which could write past cnt; a header length of 0 made the
+ * read length wrap around; hline[9]/hline[10] could be tested without
+ * having been read.
+ */
+void
+ncbl_ranlib(str,cnt,libpos)
+     char *str; int cnt;
+     long libpos;
+{
+  char hline[256], *bp, *bp0;
+  char *descr;
+  int llen, is_ambig;
+  long spos;
+
+  lib_cnt = libpos;
+  llen = hdr_beg[lib_cnt+1]-hdr_beg[lib_cnt];
+  if (llen > (int)sizeof(hline)) llen = sizeof(hline);
+  if (llen < 1) llen = 1;
+  fseek(hfile,hdr_beg[lib_cnt]+1,0);
+
+  memset(hline, 0, sizeof(hline));
+  fread(hline,(size_t)1,(size_t)(llen-1),hfile);
+  hline[llen-1]='\0';
+
+  is_ambig = (dbformat == NTFORMAT &&
+              (ambiguity_ray[lib_cnt/CHAR_BIT]&(1<<lib_cnt%CHAR_BIT)));
+
+  if (cnt > 0) {
+    bp0 = NULL;
+    if (hline[9]=='|' || hline[10]=='|') bp0 = strchr(hline+3,'|');
+
+    if (bp0 != NULL) {
+      /* split "db|id description" at the first blank after the id */
+      descr = "";
+      if ((bp=strchr(bp0+1,' '))!=NULL) { *bp='\0'; descr = bp+1; }
+
+      if (is_ambig) snprintf(str,(size_t)cnt,"*%-9s %s",bp0+1,descr);
+      else snprintf(str,(size_t)cnt,"%-10s %s",bp0+1,descr);
+    }
+    else {
+      if (is_ambig) snprintf(str,(size_t)cnt,"*%s",hline);
+      else snprintf(str,(size_t)cnt,"%s",hline);
+    }
+    str[cnt-1]='\0';
+  }
+
+  if (dbformat == AAFORMAT)
+    fseek(sfile,seq_beg[lib_cnt]-1,0);
+  else {
+    spos = (seq_beg[lib_cnt])/(CHAR_BIT/NBPN);
+    fseek(sfile,spos-1,0);
+  }
+}
+
+/* src_ulong_read() - read a 4-byte big-endian unsigned value from fd
+ * into *val.
+ *
+ * The bytes are always combined explicitly.  The former IS_BIG_ENDIAN
+ * shortcut read 4 bytes straight into an unsigned long, which is wrong
+ * wherever long is wider than 4 bytes.  A short read leaves the
+ * missing bytes as zero; the fread() result is not reported.
+ */
+void src_ulong_read(FILE *fd, unsigned long *val)
+{
+  unsigned char b[4] = {0, 0, 0, 0};
+
+  fread((char *)&b[0],(size_t)1,(size_t)4,fd);
+  *val = ((unsigned long)b[0] << 24) | ((unsigned long)b[1] << 16)
+       | ((unsigned long)b[2] << 8)  |  (unsigned long)b[3];
+}
+
+/* src_long_read() - read a 4-byte big-endian value from fd into *val.
+ *
+ * The bytes are always combined explicitly, in unsigned arithmetic,
+ * and the result is then converted to long.  The former IS_BIG_ENDIAN
+ * shortcut read 4 bytes straight into a long, which is wrong wherever
+ * long is wider than 4 bytes.  A short read leaves the missing bytes
+ * as zero; the fread() result is not reported.
+ */
+void src_long_read(FILE *fd, long *val)
+{
+  unsigned char b[4] = {0, 0, 0, 0};
+  unsigned long u;
+
+  fread((char *)&b[0],(size_t)1,(size_t)4,fd);
+  u = ((unsigned long)b[0] << 24) | ((unsigned long)b[1] << 16)
+    | ((unsigned long)b[2] << 8)  |  (unsigned long)b[3];
+  *val = (long)u;
+}
+
+/* src_char_read() - read one byte from fd into *val.
+ * File-local unless NCBL13_ONLY is defined.  The fread() result is not
+ * checked or returned. */
+#ifndef NCBL13_ONLY
+static void
+#else
+void
+#endif
+src_char_read(fd, val)
+ FILE *fd;
+ char *val;
+{
+ fread(val,(size_t)1,(size_t)1,fd);
+}
+
+/* src_fstr_read() - read one item of slen bytes from fd into val.
+ * File-local unless NCBL13_ONLY is defined.  No terminating NUL is
+ * added and the fread() result is not checked; val must have room for
+ * slen bytes. */
+#ifndef NCBL13_ONLY
+static void
+#else
+void
+#endif
+src_fstr_read(fd, val, slen)
+ FILE *fd;
+ char *val;
+ long slen;
+{
+ fread(val,(size_t)slen,(size_t)1,fd);
+}
+
+/* newname() - build a database file name by replacing the extension of
+ * oname with suff.
+ *
+ *   nname - destination buffer of maxn bytes; always NUL-terminated
+ *           when maxn > 0
+ *   oname - base name; a leading '@' is dropped
+ *   suff  - new extension, without the '.'
+ *   maxn  - size of nname
+ *
+ * Leading '.' characters (as in "../db") are kept.  The name is then
+ * cut at the next '.', or left whole if there is none, and ".suff" is
+ * appended.  File-local unless NCBL13_ONLY is defined.
+ *
+ * Fixes relative to the original: strncpy() could leave nname without
+ * a terminator, the '.' could be written past the end of the buffer,
+ * and strncat() was bounded by the whole buffer size rather than the
+ * space remaining.  Output that does not fit is now truncated.
+ */
+#ifndef NCBL13_ONLY
+static void
+#else
+void
+#endif
+newname(char *nname, char *oname, char *suff, int maxn)
+{
+  char *tptr;
+  size_t used;
+
+  if (maxn <= 0) return;
+
+  if (oname[0]=='@') oname++;
+  strncpy(nname,oname,(size_t)maxn-1);
+  nname[maxn-1]='\0';
+
+  for (tptr=nname; *tptr=='.'; tptr++);		/* keep leading dots */
+  for (; *tptr!='.'&& *tptr; tptr++);		/* get to '.' or EOS */
+
+  /* used <= maxn-1, so at least the terminator always fits */
+  used = (size_t)(tptr - nname);
+  snprintf(tptr,(size_t)maxn-used,".%s",suff);
+}
+
diff --git a/src/nmgetlib.c b/src/nmgetlib.c
new file mode 100644
index 0000000..3b96da4
--- /dev/null
+++ b/src/nmgetlib.c
@@ -0,0 +1,2254 @@
+/* $Id: nmgetlib.c 1251 2014-01-24 21:34:05Z wrp $ */
+/* $Revision: 1251 $ */
+
+/* copyright (c) 1987, 1988, 1989, 1992, 1995, 2000, 2014 by
+ William R. Pearson and The Rector & Visitors of the University of
+ Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+/* May, June 1987 - modified for rapid read of database
+
+ revised (split) version of nmgetaa.c -> renamed nmgetlib.c
+
+ This version seeks to be a thread safe, no global, library
+ reading program. While adjusting the routines in this file
+ should be relatively easy, ncbl2_mlib.c and mysql_lib.c may be
+ more difficult.
+
+ nmgetlib.c and mmgetaa.c are used together. nmgetlib.c provides
+ the same functions as nxgetaa.c if memory mapping is not used,
+ mmgetaa.c provides the database reading functions if memory
+ mapping is used. The decision to use memory mapping is made on
+ a file-by-file basis.
+
+ June 2, 1987 - added TFASTA
+ March 30, 1988 - combined ffgetaa, fgetgb;
+ April 8, 1988 - added PIRLIB format for unix
+ Feb 4, 1989 - added universal subroutines for libraries
+ December, 1995 - added range option file.name:1-1000
+ September, 1999 - added option for mmap()ed files using ".xin" */
+
+
+/*
+ February 4, 1988 - this starts a major revision of the getaa
+ routines. The goal is to be able to search the following format
+ libraries:
+
+ 0 - normal FASTA format
+ 1 - full Genbank flatfile format
+ 2 - NBRF/PIR CODATA format
+ 3 - EMBL/Swiss-prot format
+ 4 - Intelligentics format
+ 5 - NBRF/PIR VMS format
+ 6 - GCG 2bit format
+
+ 10 - list of gi/acc's
+ 11 - NCBI setdb/blastp (1.3.2) AA/NT
+ 12 - NCBI setdb/blastp (2.0) AA/NT
+ 16 - mySQL queries
+
+ see file altlib.h to confirm numbers
+
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+
+#include "defs.h"
+#include "structs.h"
+
+#ifndef SFCHAR
+#define SFCHAR ':'
+#endif
+
+#define EOSEQ 0
+
+#include "uascii.h"
+/* #include "upam.h" */
+
+#define LFCHAR '\015' /* for MWC 5.5 */
+
+#include "altlib.h"
+
+#include <fcntl.h>
+#ifndef O_RAW
+#ifdef O_BINARY
+#define O_RAW O_BINARY
+#else
+#define O_RAW 0
+#endif /* O_BINARY */
+#endif /* O_RAW */
+
+#ifdef WIN32
+#define RBSTR "rb" /* read file in binary mode */
+#else
+#define RBSTR "r"
+#endif
+
+char *alloc_file_name(char *f_name);
+struct lib_struct *get_lnames(char *tname, struct lib_struct *cur_lib_p);
+struct lmf_str *load_mmap(FILE *, char *, int, int, struct lmf_str *);
+struct lmf_str *ncbl2_openlib(struct lib_struct *, int ldnaseq);
+struct lmf_str *ncbl2_reopen(struct lmf_str *);
+
+static struct lmf_str *last_m_fptr=NULL;
+
+int sel_acc_libstr(char *libstr, int gi, void *ptr);
+void *sel_acc_libstr_init(FILE *libf, int *acc_off, char fmt_term);
+
+int sel_acc_gi(char *libstr, int gi, void *ptr);
+void *sel_acc_gi_init(FILE *libf, int *acc_off, char fmt_term);
+
+int sel_hacc_libstr(char *libstr, int gi, void *ptr);
+void *sel_hacc_libstr_init(FILE *libf, int *acc_off, char fmt_term);
+
+int sel_hacc_gi(char *libstr, int gi, void *ptr);
+void *sel_hacc_gi_init(FILE *libf, int *acc_off, char fmt_term);
+
+/* highest valid index into sel_acc_arr[] / sel_acc_init[] */
+#define MAX_ACC_TYPE 4
+/* accession-selection functions; open_lib() indexes this table with the
+   acc_ltype value parsed from an ACC_LIST file and stores the result in
+   lmf_str.sel_acc_p.  Entry 0 is NULL. */
+int (*sel_acc_arr[MAX_ACC_TYPE+1])(char *libstr, int gi, void *ptr) = {
+ NULL, sel_acc_libstr, sel_acc_gi, sel_hacc_libstr, sel_hacc_gi
+};
+
+/* matching initializers, in the same order as sel_acc_arr[]; open_lib()
+   calls the selected entry with the open ACC_LIST file and stores the
+   returned pointer in lmf_str.sel_local.  Entry 0 is NULL. */
+void *(*sel_acc_init[MAX_ACC_TYPE+1])(FILE *libf, int *acc_off, char fmt_term) = {
+ NULL, sel_acc_libstr_init, sel_acc_gi_init, sel_hacc_libstr_init, sel_hacc_gi_init
+};
+
+unsigned int hash_func(char *key);
+unsigned int fast_hash32 (unsigned int data);
+
+#ifdef MYSQL_DB
+struct lmf_str *mysql_openlib(char *, int, int *);
+struct lmf_str *mysql_reopen(struct lmf_str *);
+#endif
+
+#ifdef PGSQL_DB
+struct lmf_str *pgsql_openlib(char *, int, int *);
+struct lmf_str *pgsql_reopen(struct lmf_str *);
+#endif
+
+extern int can_mmap(int lib_type);
+
+int closelib(struct lmf_str *m_fptr, int force);
+extern void newname(char *nname, char *oname, char *suff, int maxn);
+
+/* a file name for openlib may include a library type suffix */
+
+/* open_lib() - open the sequence library described by lib_p and return
+   the lmf_str used to read it, or NULL on failure.
+
+   lib_p   - library entry; lib_p->file_name may carry a ":text" option
+             suffix and a " <type>" library-type suffix, both of which
+             are cut off the name in place
+   ldnaseq - sequence type (compared with SEQT_PROT to set lib_aa)
+   sascii  - character mapping table stored in the returned lmf_str
+   outtty  - if non-zero, prompt on stderr/stdin for a new file name
+             when the open fails (at most 10 retries)
+
+   An already memory-mapped lib_p->m_file_p is rewound and returned
+   directly.  For text libraries (lib_type <= LASTTXT) an existing
+   lib_p->m_file_p is re-used, otherwise a new lmf_str is allocated,
+   and the first line of the file is read into its lline buffer.  For
+   ACC_LIST libraries the first line of the list file names the real
+   library (and optionally ":acc_ltype acc_off fmt_term"); that library
+   is opened recursively and the accession selector is attached to it.
+*/
+struct lmf_str *
+open_lib(struct lib_struct *lib_p, int ldnaseq, int *sascii, int outtty)
+{
+  struct lmf_str *om_fptr;
+  char iname[MAX_FN];		/* name of ".xin" index (USE_MMAP only) */
+  char *bp;
+  char opt_text[MAX_FN];	/* save text after ':' */
+  char f_line[MAX_STR];
+  int wcnt, opnflg;
+  int lib_type;
+  int acc_ltype = 1;	/* def type is 1, not zero, so that the acc is read */
+  struct lmf_str *acc_fptr;	/* file of subset accessions */
+  FILE *libi=NULL;
+  FILE *libf=NULL;
+  int use_stdin;
+  struct lmf_str *m_fptr=NULL;
+  int acc_off=0;
+  char fmt_term=' ';		/* blank terminator is the default */
+  struct lib_struct *this_lib_p, *tmp_lib_p;
+
+  om_fptr = lib_p->m_file_p;
+
+  if (om_fptr != NULL && om_fptr->mm_flg) {
+    om_fptr->lpos = 0;
+    return om_fptr;
+  }
+
+  wcnt = 0;	/* number of times to ask for file name */
+
+  /* check to see if there is a file option ":1-100" */
+#ifndef WIN32
+  if ((bp=strchr(lib_p->file_name,':'))!=NULL && *(bp+1)!='\0') {
+#else
+  if ((bp=strchr(lib_p->file_name+3,':'))!=NULL && *(bp+1)!='\0') {
+#endif
+    strncpy(opt_text,bp+1,sizeof(opt_text));
+    opt_text[sizeof(opt_text)-1]='\0';
+    *bp = '\0';
+  }
+  else opt_text[0]='\0';
+
+  if (lib_p->file_name[0] == '-' || lib_p->file_name[0] == '@') {
+    use_stdin = 1;
+  }
+  else use_stdin=0;
+
+  /* check for library type */
+  if ((bp=strchr(lib_p->file_name,' '))!=NULL) {
+    *bp='\0';
+    /* an unparsable suffix is treated like an out-of-range type rather
+       than leaving lib_type uninitialized */
+    if (sscanf(bp+1,"%d",&lib_type)!=1) lib_type = -1;
+    if (lib_type<0 || lib_type >= LASTLIB) {
+      fprintf(stderr,"\n invalid library type: %d (>%d)- resetting\n%s\n",
+	      lib_type,LASTLIB,lib_p->file_name);
+      lib_type=0;
+    }
+    else {
+      lib_p->lib_type = lib_type;
+    }
+  }
+  else lib_type = lib_p->lib_type;
+
+  if (use_stdin && lib_type !=0 ) {
+    fprintf(stderr,"\n @/- STDIN libraries must be in FASTA format\n");
+    return NULL;
+  }
+
+  /* check to see if file can be open()ed? */
+
+ l1:
+  opnflg = 0;
+  if (lib_type<=LASTTXT) {
+    if (!use_stdin) {
+      opnflg=((libf=fopen(lib_p->file_name,RBSTR))!=NULL);
+    }
+    else {
+      libf=stdin;
+      lib_p->file_name = alloc_file_name("STDIN");
+      opnflg=1;
+    }
+  }
+  else if (lib_type==ACC_LIST) {
+    /* if we have already processed the acc_list file,
+       open the file, modify the acc_list stuff, and return it
+    */
+    if (lib_p->acc_file_p != NULL) {
+      if ((acc_fptr = open_lib(lib_p, ldnaseq, sascii, outtty))==NULL) {
+	fprintf(stderr, "Cannot open %s library for ACC_LIST\n",lib_p->file_name);
+	return NULL;
+      }
+      else {
+	/* note that sel_acc_arr[0] must be NULL */
+	acc_fptr->sel_acc_p = lib_p->acc_file_p->sel_acc_p;
+	acc_fptr->acc_off = lib_p->acc_file_p->acc_off;
+	return acc_fptr;
+      }
+    }
+
+    /* open the file, read the first line, do an openlib on the first line */
+    if (!use_stdin) {
+      opnflg=((libf=fopen(lib_p->file_name,RBSTR))!=NULL);
+    }
+    else {
+      libf=stdin;
+      lib_p->file_name = alloc_file_name("STDIN");
+      opnflg=1;
+    }
+
+    if (!opnflg) {
+      fprintf(stderr, "Cannot open %s library\n",lib_p->file_name);
+      return NULL;
+    }
+    else {
+      /* read in the file line */
+      if (fgets(f_line, sizeof(f_line), libf)==NULL) {
+	fprintf(stderr, "Cannot read ACC_LIST file line\n");
+	if (libf != stdin) fclose(libf);
+	return NULL;
+      }
+      /* else parse the file line */
+      if (f_line[0] != '<') {
+	fprintf(stderr, "missing < - %s\n",f_line);
+	if (libf != stdin) fclose(libf);
+	return NULL;
+      }
+      if ((bp=strchr(f_line+1,'\r'))!=NULL) {*bp = '\0';}
+      if ((bp=strchr(f_line+1,'\n'))!=NULL) {*bp = '\0';}
+
+      /* check for accession format */
+      if ((bp=strchr(f_line+1,':'))!=NULL) {
+	*bp = '\0';
+	/* access string should be %d %d%c - acc_ltype, acc_off, fmt_term;
+	   fields that are absent keep their defaults */
+	sscanf(bp+1,"%d %d%c",&acc_ltype, &acc_off, &fmt_term);
+	/* blank terminator is default */
+	if (acc_off == 0) acc_off = 1;	/* always skip the '>' */
+	if (fmt_term < ' ' || fmt_term > '~') fmt_term = ' ';
+	if (acc_ltype > MAX_ACC_TYPE) {acc_ltype = MAX_ACC_TYPE;}
+	/* a negative type would index before the selector tables */
+	if (acc_ltype < 0) {acc_ltype = 1;}
+      }
+
+      this_lib_p = get_lnames(f_line+1, NULL);
+      if (this_lib_p == NULL) {
+	fprintf(stderr, "Cannot open %s library for ACC_LIST\n",f_line+1);
+	if (libf != stdin) fclose(libf);
+	return NULL;
+      }
+
+      /* this_lib_p now has the list of files specified by the
+	 <@?acc_list_file.  If there is only one file, then this
+	 information, and the associated m_file_p, should be put into
+	 lib_p.  If there is more than one file, then the first should
+	 be put in lib_p, and the subsequent lib_struct's should be
+	 linked from lib_p->next, and the end of list needs to point
+	 to lib_p->next.
+      */
+
+      /* done whether list or not */
+      lib_p->file_name = this_lib_p->file_name;
+      /* deal with other list issues after we have a acc_fptr */
+
+      /* check that we can open the library file */
+      if ((acc_fptr = open_lib(this_lib_p, ldnaseq, sascii, outtty))==NULL) {
+	fprintf(stderr, "Cannot open %s library for ACC_LIST\n",f_line+1);
+	free(this_lib_p);
+	if (libf != stdin) fclose(libf);
+	return NULL;
+      }
+      else {
+	/* set up the auxiliary information for the current open file */
+	this_lib_p->m_file_p = acc_fptr;
+	/* note that sel_acc_arr[0] must be NULL */
+	acc_fptr->sel_acc_p = sel_acc_arr[acc_ltype];
+	acc_fptr->acc_off = acc_off;
+	/* read in the data; table entry 0 has no initializer to call */
+	if (sel_acc_init[acc_ltype] != NULL) {
+	  acc_fptr->sel_local = sel_acc_init[acc_ltype](libf, &acc_fptr->acc_off, fmt_term);
+	}
+	else {
+	  acc_fptr->sel_local = NULL;
+	}
+
+	/* now handle the rest of the this_lib_p list */
+
+	tmp_lib_p = this_lib_p->next;	/* skip over the first entry in this_lib_p */
+	/* fill in the information up to the next to last entry in the chain */
+	while (tmp_lib_p && tmp_lib_p->next) {
+	  tmp_lib_p->acc_file_p = acc_fptr;
+	  tmp_lib_p = tmp_lib_p->next;
+	}
+	if (tmp_lib_p) {
+	  tmp_lib_p->acc_file_p = acc_fptr;	/* fill in last entry */
+	  tmp_lib_p->next = lib_p->next;	/* continue the chain */
+	}
+	lib_p->next = this_lib_p->next;		/* insert the chain */
+	return acc_fptr;
+      }
+    }
+  }
+#ifdef NCBIBL13
+  else if (lib_type==NCBIBL13) opnflg=(ncbl_openlib(lib_p->file_name,ldnaseq)!= -1);
+#endif
+#ifdef NCBIBL20
+  else if (lib_type==NCBIBL20) {
+    opnflg=((m_fptr=ncbl2_openlib(lib_p,ldnaseq))!=NULL);
+  }
+#endif
+
+#ifdef MYSQL_DB
+  /* a mySQL filename contains mySQL commands, not sequences */
+  else if (lib_type==MYSQL_LIB) {
+    opnflg=((m_fptr=mysql_openlib(lib_p->file_name,ldnaseq,sascii))!=NULL);
+    if (m_fptr != NULL) m_fptr->get_mmap_chain = NULL;
+  }
+#endif
+#ifdef PGSQL_DB
+  /* a PGSQL filename contains SQL commands, not sequences */
+  else if (lib_type==PGSQL_LIB) {
+    opnflg=((m_fptr=pgsql_openlib(lib_p->file_name,ldnaseq,sascii))!=NULL);
+    if (m_fptr != NULL) m_fptr->get_mmap_chain = NULL;
+  }
+#endif
+
+  if (!opnflg) {	/* here if open failed */
+    if (outtty) {
+      fprintf(stderr,"\n cannot open %s library\n",lib_p->file_name);
+      fprintf(stderr," enter new file name or <RET> to quit ");
+      fflush(stderr);
+      if (fgets(f_line,sizeof(f_line),stdin)==NULL) return NULL;
+      if ((bp=strchr(f_line,'\n'))!=0) *bp='\0';
+      if (strlen(f_line)==0) return NULL;
+      if (++wcnt > 10) return NULL;
+      lib_p->file_name = alloc_file_name(f_line);
+      goto l1;
+    }
+    else return NULL;
+  }	/* !openflg */
+
+  if (lib_type <= LASTTXT) {
+    /* modify to re-use the om_fptr if it exists */
+    if (om_fptr != NULL) {
+      m_fptr = om_fptr;
+    }
+    else {
+      if ((m_fptr = calloc(1,sizeof(struct lmf_str)))==NULL) {
+	fprintf(stderr,"\n *** cannot allocate lmf_str (%ld) for %s\n",
+		(long)sizeof(struct lmf_str),lib_p->file_name);
+	if (libf != stdin) fclose(libf);
+	return NULL;
+      }
+      if ((m_fptr->lline = calloc(MAX_STR,sizeof(char)))==NULL) {
+	fprintf(stderr,"\n *** cannot allocate lline (%d) for %s\n",
+		MAX_STR,lib_p->file_name);
+	free(m_fptr);
+	if (libf != stdin) fclose(libf);
+	return NULL;
+      }
+    }
+
+    m_fptr->lb_name = lib_p->file_name;
+    strncpy(m_fptr->opt_text,opt_text,MAX_FN);
+    m_fptr->opt_text[MAX_FN-1]='\0';
+    m_fptr->sascii = sascii;
+    m_fptr->get_mmap_chain = NULL;
+
+    m_fptr->libf = libf;
+    m_fptr->lb_type = lib_type;
+    m_fptr->acc_off = 1;	/* default for FASTA format */
+    m_fptr->getlib = getliba[lib_type];
+    m_fptr->ranlib = ranliba[lib_type];
+    m_fptr->sel_acc_p = NULL;
+    m_fptr->mm_flg = 0;
+    m_fptr->tot_len = 0;
+    m_fptr->max_len = 0;
+    m_fptr->lib_aa = (ldnaseq==SEQT_PROT);
+  }
+  last_m_fptr = m_fptr;
+
+#ifdef USE_MMAP
+  /* check for possible mmap()ed files */
+  if (!use_stdin && (lib_type <= LASTTXT) && can_mmap(lib_type)) {
+    /* this is a file we can mmap() */
+    /* look for .xin file */
+    newname(iname,lib_p->file_name,"xin",sizeof(iname));
+    if ((libi=fopen(iname,"r"))!=NULL) {	/* have a *.xin file, use mmap */
+      if (load_mmap(libi,lib_p->file_name,lib_type,ldnaseq,m_fptr)!=NULL) {
+	fclose(libi);	/* close index file */
+	return m_fptr;
+      }
+      fclose(libi);	/* memory mapping failed, but still must close file */
+    }
+  }
+#endif
+
+  if (lib_type <= LASTTXT) {
+    m_fptr->lpos = 0;
+    if (fgets(m_fptr->lline,MAX_STR,libf)==NULL) return NULL;
+  }
+  return m_fptr;
+}
+
+/* closelib() - release the file resources behind m_fptr.
+
+   Returns 0 when there was nothing to do (m_fptr is NULL, or - with
+   USE_MMAP - the library is memory mapped and force is 0, in which
+   case it is only rewound), and 1 after the library has been closed.
+   The lline buffer is deliberately kept so it can be re-used when the
+   library is re-opened.
+*/
+int
+closelib(struct lmf_str *m_fptr,int force) {
+  FILE *fp;
+
+  if (m_fptr == NULL) return 0;
+
+#ifdef USE_MMAP
+  /* memory mapped files stay open unless the caller insists */
+  if (m_fptr->mm_flg && !force) {
+    m_fptr->lpos = 0;
+    return 0;
+  }
+#endif
+
+  fp = m_fptr->libf;
+  if (!(fp == NULL || fp == stdin)) {
+    fclose(fp);
+    m_fptr->libf = NULL;
+    m_fptr->mm_flg = 0;
+  }
+
+  /* format-specific close routines */
+#ifdef NCBIBL13
+  if (m_fptr->lb_type == NCBIBL13) ncbl_closelib(m_fptr);
+#endif
+#ifdef NCBIBL20
+  if (m_fptr->lb_type == NCBIBL20) ncbl2_closelib(m_fptr);
+#endif
+#ifdef MYSQL_DB
+  if (m_fptr->lb_type == MYSQL_LIB) mysql_closelib(m_fptr);
+#endif
+
+  /* forget the "currently open" library if this was it */
+  if (last_m_fptr == m_fptr) last_m_fptr = NULL;
+
+  return 1;
+}
+
+/* re_openlib() - make om_fptr the currently open library again.
+
+   om_fptr - library to re-open; NULL yields NULL
+   lib_p   - not used by this routine
+   outtty  - not used by this routine
+
+   If om_fptr is already the current library (last_m_fptr), is memory
+   mapped, or (MYSQL_DB) is a mySQL library, it is returned without
+   touching any file.  Otherwise the previously current library is
+   force-closed and om_fptr's file is opened again, re-using its
+   existing buffers.  Returns the open library, or NULL (with
+   last_m_fptr cleared) if it could not be re-opened.
+*/
+struct lmf_str *
+re_openlib(struct lmf_str *om_fptr, struct lib_struct *lib_p, int outtty)
+{
+  int opnflg;
+  char *lb_name;
+
+  if (om_fptr == NULL) return NULL;
+
+  /* if its already open, return it */
+  if (om_fptr == last_m_fptr) {
+    return om_fptr;
+  }
+
+  if (om_fptr->mm_flg) {
+    last_m_fptr = om_fptr;
+    om_fptr->lpos = 0;
+    return om_fptr;
+  }
+#ifdef MYSQL_DB
+  /* if this is a mysql database - use it and return */
+  if (om_fptr->lb_type == MYSQL_LIB) {
+    return om_fptr;
+  }
+#endif
+
+  closelib(last_m_fptr,1);
+  last_m_fptr = om_fptr;
+
+  /* remember the name: om_fptr may be replaced by NULL below */
+  lb_name = om_fptr->lb_name;
+
+  /* data is available, but file is closed or not memory mapped, open it */
+  /* no longer check to memory map - because we could not do it before */
+
+  opnflg = 1;
+  if (om_fptr->lb_type<=LASTTXT && om_fptr->libf==NULL)
+    opnflg=((om_fptr->libf=fopen(om_fptr->lb_name,RBSTR))!=NULL);
+#ifdef NCBIBL20
+  else if (om_fptr->lb_type==NCBIBL20) {
+    opnflg=((om_fptr=ncbl2_reopen(om_fptr))!=NULL);
+  }
+#endif
+#ifdef MYSQL_DB
+  /* a mySQL filename contains mySQL commands, not sequences */
+  else if (om_fptr->lb_type==MYSQL_LIB)
+    opnflg=(mysql_reopen(om_fptr)!=NULL);
+#endif
+
+  if (!opnflg) {
+    /* nothing is open now; do not report this library as current */
+    last_m_fptr = NULL;
+    fprintf(stderr,"*** could not re_open %s\n",lb_name);
+    return NULL;
+  }
+
+  last_m_fptr = om_fptr;
+
+  /* use the old buffer for the opened text file */
+  return om_fptr;
+}
+
+void sf_sort(int *, int);
+
+/* agetlib() - read the next sequence (or the next piece of a long
+   sequence) from a FASTA-style library whose entries start with '>'
+   or ';'.
+
+   seq      - buffer that receives the encoded residues
+   maxs     - size of seq
+   libstr   - receives the header text (from column acc_off of the
+              header line) when a new entry is started
+   n_libstr - size of libstr
+   libpos   - receives lm_fd->lpos, the recorded position of the header
+   lcont    - 0 on entry: start a new entry; otherwise continue the
+              current one.  Incremented on exit if seq was filled
+              before the entry ended, reset to 0 otherwise
+   lm_fd    - open library; lm_fd->lline holds the pending header line
+   l_off    - receives the coordinate given by "@C:<n>" in the header,
+              or 1
+
+   Returns the number of residues stored in seq, or -1 when no further
+   header can be read or the accession selector returns a negative
+   value.
+*/
+int
+agetlib(unsigned char *seq, int maxs,
+	char *libstr, int n_libstr,
+	fseek_t *libpos,
+	int *lcont,
+	struct lmf_str *lm_fd,
+	long *l_off)
+{
+  register unsigned char *cp, *seqp, *seqb;
+  register int *ap;
+  unsigned char *seqm, *seqm1;
+  char *bp, *tp;
+  int sel_status;
+  size_t llen;
+
+  seqp = seqb = seq;
+  seqm = &seq[maxs-9];	/* slack for the 9-way unrolled copy below */
+  seqm1 = seqm-1;
+
+  ap = lm_fd->sascii;
+
+  *l_off = 1;
+  if (*lcont==0) {
+
+  start_seq:
+    while (lm_fd->lline[0]!='>' && lm_fd->lline[0]!=';') {
+      if (lm_fd->libf != stdin) lm_fd->lpos = FTELL(lm_fd->libf);
+      if (fgets(lm_fd->lline,MAX_STR,lm_fd->libf)==NULL) return (-1);
+    }
+
+    /* get l_off coordinate from @C:123 */
+    if ((bp=strchr(lm_fd->lline,'@'))!=NULL && !strncmp(bp+1,"C:",2)) {
+      sscanf(bp+3,"%ld",l_off);
+    }
+
+    strncpy(libstr,lm_fd->lline+lm_fd->acc_off,n_libstr-1);
+    libstr[n_libstr-1]='\0';
+
+    /* entry not selected: skip the rest of its header line, load the
+       following line, and look for the next header */
+    if ((lm_fd->sel_acc_p != NULL) &&
+	(sel_status = (lm_fd->sel_acc_p)(libstr, 0, lm_fd->sel_local)) <= 0) {
+      if (sel_status < 0) return (-1);
+      while (strchr((char *)lm_fd->lline,'\n')==NULL) {
+	if (fgets(lm_fd->lline,MAX_STR,lm_fd->libf)==NULL) return (-1);
+      }
+      if (fgets(lm_fd->lline,MAX_STR,lm_fd->libf)==NULL) return (-1);
+      goto start_seq;
+    }
+
+    if ((bp=strchr(libstr,'\r'))!=NULL) *bp='\0';
+    if ((bp=strchr(libstr,'\n'))!=NULL) *bp='\0';
+
+    if (n_libstr > MAX_UID) {
+      /* start at libstr[0]; do not test the terminator */
+      for (tp = libstr; *tp; tp++) {
+	if (*tp == '\001' || *tp== '\t') *tp = ' ';
+      }
+    }
+
+    *libpos = lm_fd->lpos;
+
+    /* make certain we have the end of the line; the tail of a very
+       long header is read over the second half of lline.  Stop at EOF
+       instead of looping forever on a final header without '\n' */
+    while (strchr((char *)lm_fd->lline,'\n')==NULL) {
+      llen = strlen(lm_fd->lline);
+      if (llen > (size_t)(MAX_STR/2)) llen = (size_t)(MAX_STR/2);
+      if (fgets(&lm_fd->lline[llen],MAX_STR/2,lm_fd->libf)==NULL) break;
+    }
+    lm_fd->lline[MAX_STR-1]='\0';
+  }
+
+  lm_fd->lline[0]='\0';
+  while (seqb<seqm1 && fgets((char *)seqb,(size_t)(seqm-seqb),lm_fd->libf)!=NULL) {
+    if (*seqb=='>') goto new;
+    if (*seqb==';') {
+      if (strchr((char *)seqb,'\n')==NULL) goto cont;
+      continue;
+    }
+
+    /* encode the line in place: codes < NA are kept, a code equal to
+       NA is dropped, a code > NA ends the line */
+    seqp = seqb;
+    for (cp=seqp; seqp<seqm1; ) {
+      if ((*seqp++=ap[*cp++])<NA &&
+	  (*seqp++=ap[*cp++])<NA &&
+	  (*seqp++=ap[*cp++])<NA &&
+	  (*seqp++=ap[*cp++])<NA &&
+	  (*seqp++=ap[*cp++])<NA &&
+	  (*seqp++=ap[*cp++])<NA &&
+	  (*seqp++=ap[*cp++])<NA &&
+	  (*seqp++=ap[*cp++])<NA &&
+	  (*seqp++=ap[*cp++])<NA) continue;
+      if (*(--seqp)>NA) break;
+    }
+    seqb = seqp;
+    if (*seqp==ES) goto done;
+    if (lm_fd->libf != stdin) lm_fd->lpos = FTELL(lm_fd->libf);
+  }
+  goto done;
+ new:
+  /* the line just read is the next entry's header: save it in lline */
+  strncpy(lm_fd->lline,(char *)seqp,MAX_STR);
+  lm_fd->lline[MAX_STR-1]='\0';
+  /* be certain to get complete line, if possible */
+  if (strchr(lm_fd->lline,'\n')==NULL)
+    fgets(&lm_fd->lline[strlen(lm_fd->lline)],MAX_STR-strlen(lm_fd->lline),lm_fd->libf);
+  lm_fd->lline[MAX_STR-1]='\0';
+  if (strchr(lm_fd->lline,'\n')==NULL && strchr((char *)seqp,'\n')!=NULL)
+    lm_fd->lline[strlen(lm_fd->lline)-1]='\n';
+  goto done;
+
+ cont:
+  /* a ';' comment longer than the buffer: read on and force the
+     "continue" state below */
+  fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
+  seqm1 = seqp;
+ done:
+  if (seqp>=seqm1) (*lcont)++;
+  else {
+    *lcont=0;
+  }
+
+  *seqp = EOSEQ;
+  /* if ((int)(seqp-seq)==0) return 1; */
+  return (int)(seqp-seq);
+}
+
+/* aranlib() - fetch the description of the FASTA entry whose header
+   starts at file offset seek.
+
+   str    - receives the header text (from column acc_off), with the
+            line end removed and '\001' / tab replaced by blanks
+   cnt    - size of str; nothing is written if cnt <= 0
+   seek   - offset of the header line in the library file
+   libstr - not used by this routine
+   lm_fd  - open library; its lline buffer is overwritten
+
+   str is set to "" when the library is stdin (which cannot be
+   repositioned), when the line cannot be read, or when the line at
+   seek does not start with '>' or ';'.
+*/
+void
+aranlib(char *str, int cnt, fseek_t seek, char *libstr, struct lmf_str *lm_fd)
+{
+  char *bp;
+
+  if (cnt <= 0) return;
+  str[0]='\0';
+
+  if (lm_fd->libf == stdin) return;
+
+  FSEEK(lm_fd->libf, seek, 0);
+  /* do not parse a stale lline if the read fails */
+  if (fgets(lm_fd->lline,MAX_STR,lm_fd->libf)==NULL) return;
+
+  if (lm_fd->lline[0]!='>' && lm_fd->lline[0]!=';') return;
+
+  strncpy(str,lm_fd->lline+lm_fd->acc_off,cnt);
+  str[cnt-1]='\0';
+  if ((bp = strchr(str,'\r'))!=NULL) *bp='\0';
+  if ((bp = strchr(str,'\n'))!=NULL) *bp='\0';
+
+  /* start at str[0]; do not test the terminator */
+  for (bp = str; *bp; bp++) {
+    if (*bp=='\001' || *bp=='\t') *bp=' ';
+  }
+}
+
+/* qgetlib() - read the next sequence (or the next piece of a long
+   sequence) from a library whose entries start with an '@' header
+   line and whose sequence text is ended by a line starting with '+'.
+
+   seq      - buffer that receives the encoded residues
+   maxs     - size of seq
+   libstr   - receives the header text (from column acc_off of the
+              header line) when a new entry is started
+   n_libstr - size of libstr
+   libpos   - receives lm_fd->lpos, the recorded position of the header
+   lcont    - 0 on entry: start a new entry; otherwise continue the
+              current one.  Incremented on exit if seq was filled
+              before the entry ended, reset to 0 otherwise
+   lm_fd    - open library; lm_fd->lline holds the pending line
+   l_off    - always set to 1
+
+   Returns the number of residues stored in seq, or -1 when no further
+   '@' line can be read.
+*/
+int
+qgetlib(unsigned char *seq, int maxs,
+	char *libstr, int n_libstr,
+	fseek_t *libpos,
+	int *lcont,
+	struct lmf_str *lm_fd,
+	long *l_off)
+{
+  register unsigned char *cp, *seqp, *seqb;
+  register int *ap;
+  unsigned char *seqm, *seqm1;
+  char *bp;
+  size_t llen;
+
+  seqp = seqb = seq;
+  seqm = &seq[maxs-9];	/* slack for the 9-way unrolled copy below */
+  seqm1 = seqm-1;
+
+  ap = lm_fd->sascii;
+
+  *l_off = 1;
+  if (*lcont==0) {
+
+    while (lm_fd->lline[0]!='@') {
+      if (lm_fd->libf != stdin) lm_fd->lpos = FTELL(lm_fd->libf);
+      if (fgets(lm_fd->lline,MAX_STR,lm_fd->libf)==NULL) return (-1);
+    }
+
+    strncpy(libstr,lm_fd->lline+lm_fd->acc_off,n_libstr-1);
+    libstr[n_libstr-1]='\0';
+
+    if ((bp=strchr(libstr,'\r'))!=NULL) *bp='\0';
+    if ((bp=strchr(libstr,'\n'))!=NULL) *bp='\0';
+
+    *libpos = lm_fd->lpos;
+
+    /* make certain we have the end of the line; the tail of a very
+       long header is read over the second half of lline.  Stop at EOF
+       instead of looping forever on a final header without '\n' */
+    while (strchr((char *)lm_fd->lline,'\n')==NULL) {
+      llen = strlen(lm_fd->lline);
+      if (llen > (size_t)(MAX_STR/2)) llen = (size_t)(MAX_STR/2);
+      if (fgets(&lm_fd->lline[llen],MAX_STR/2,lm_fd->libf)==NULL) break;
+    }
+    lm_fd->lline[MAX_STR-1]='\0';
+  }
+
+  lm_fd->lline[0]='\0';
+  while (seqb<seqm1 && fgets((char *)seqb,(size_t)(seqm-seqb),lm_fd->libf)!=NULL) {
+    if (*seqb=='+') goto new;
+
+    /* encode the line in place: codes < NA are kept, a code equal to
+       NA is dropped, a code > NA ends the line */
+    seqp = seqb;
+    for (cp=seqp; seqp<seqm1; ) {
+      if ((*seqp++=ap[*cp++])<NA &&
+	  (*seqp++=ap[*cp++])<NA &&
+	  (*seqp++=ap[*cp++])<NA &&
+	  (*seqp++=ap[*cp++])<NA &&
+	  (*seqp++=ap[*cp++])<NA &&
+	  (*seqp++=ap[*cp++])<NA &&
+	  (*seqp++=ap[*cp++])<NA &&
+	  (*seqp++=ap[*cp++])<NA &&
+	  (*seqp++=ap[*cp++])<NA) continue;
+      if (*(--seqp)>NA) break;
+    }
+    seqb = seqp;
+    if (*seqp==ES) goto done;
+    if (lm_fd->libf != stdin) lm_fd->lpos = FTELL(lm_fd->libf);
+  }
+  goto done;
+ new:
+  /* save the '+' line in lline so the next call starts after it */
+  strncpy(lm_fd->lline,(char *)seqp,MAX_STR);
+  lm_fd->lline[MAX_STR-1]='\0';
+  /* be certain to get complete line, if possible */
+  if (strchr(lm_fd->lline,'\n')==NULL)
+    fgets(&lm_fd->lline[strlen(lm_fd->lline)],MAX_STR-strlen(lm_fd->lline),lm_fd->libf);
+  lm_fd->lline[MAX_STR-1]='\0';
+  if (strchr(lm_fd->lline,'\n')==NULL && strchr((char *)seqp,'\n')!=NULL)
+    lm_fd->lline[strlen(lm_fd->lline)-1]='\n';
+
+ done:
+  if (seqp>=seqm1) (*lcont)++;
+  else {
+    *lcont=0;
+  }
+
+  *seqp = EOSEQ;
+  /* if ((int)(seqp-seq)==0) return 1; */
+  return (int)(seqp-seq);
+}
+
+/* qranlib() - fetch the description of the '@'-headed entry whose
+   header starts at file offset seek.
+
+   str    - receives the header text (from column acc_off) with the
+            line end removed
+   cnt    - size of str; nothing is written if cnt <= 0
+   seek   - offset of the header line in the library file
+   libstr - not used by this routine
+   lm_fd  - open library; its lline buffer is overwritten
+
+   str is set to "" when the library is stdin (which cannot be
+   repositioned), when the line cannot be read, or when the line at
+   seek does not start with '@'.
+*/
+void
+qranlib(char *str, int cnt, fseek_t seek, char *libstr, struct lmf_str *lm_fd)
+{
+  char *bp;
+
+  if (cnt <= 0) return;
+  str[0]='\0';
+
+  if (lm_fd->libf == stdin) return;
+
+  FSEEK(lm_fd->libf, seek, 0);
+  /* do not parse a stale lline if the read fails */
+  if (fgets(lm_fd->lline,MAX_STR,lm_fd->libf)==NULL) return;
+
+  if (lm_fd->lline[0]!='@') return;
+
+  strncpy(str,lm_fd->lline+lm_fd->acc_off,cnt);
+  str[cnt-1]='\0';
+  if ((bp = strchr(str,'\r'))!=NULL) *bp='\0';
+  if ((bp = strchr(str,'\n'))!=NULL) *bp='\0';
+}
+
+void lget_ann(struct lmf_str *, char *, int);
+
+/* lgetlib() - read the next sequence (or the next piece of a long
+   sequence) from a library whose entries start with a "LOCUS" line,
+   whose sequence follows an "ORIGIN" line, and whose sequence is ended
+   by a line starting with '/'.
+
+   On a fresh entry (*lcont==0) libstr is filled either with 12
+   characters taken from column 12 of the LOCUS line (n_libstr <= 21)
+   or by lget_ann(); *libpos receives the position recorded before the
+   LOCUS line was read.  Returns the number of residues stored in seq,
+   or -1 if LOCUS/ORIGIN cannot be found before the file ends.  *lcont
+   is incremented when seq filled up before the entry ended (the
+   position in lline is kept in lm_fd->cpsave), and reset to 0
+   otherwise. */
+int
+lgetlib(unsigned char *seq, int maxs,
+ char *libstr,
+ int n_libstr,
+ fseek_t *libpos,
+ int *lcont,
+ struct lmf_str *lm_fd,
+ long *l_off)
+{
+ register unsigned char *cp, *seqp;
+ register int *ap;
+ unsigned char *seqm, *seqm1;
+ char *bp, *bp_gid;
+
+ *l_off = 1;
+
+ seqp = seq;
+ /* leave slack at the end of seq for the 11-way unrolled copy below */
+ seqm = &seq[maxs-11];
+ seqm1 = seqm-1;
+
+ ap = lm_fd->sascii;
+
+ if (*lcont==0) {
+ while (lm_fd->lline[0]!='L' || lm_fd->lline[1]!='O' ||
+ strncmp(lm_fd->lline,"LOCUS",5)) { /* find LOCUS */
+ lm_fd->lpos = FTELL(lm_fd->libf);
+ if (fgets(lm_fd->lline,MAX_STR,lm_fd->libf)==NULL) return (-1);
+ /* lfflag: consume one extra character after each line */
+ if (lm_fd->lfflag) getc(lm_fd->libf);
+ }
+ *libpos= lm_fd->lpos;
+
+ if (n_libstr <= 21) {
+ strncpy(libstr,&lm_fd->lline[12],12);
+ libstr[12]='\0';
+ }
+ else {
+ /* lget_ann() reads further lines from the file to build libstr */
+ lget_ann(lm_fd,libstr,n_libstr);
+ fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
+ }
+
+ while (lm_fd->lline[0]!='O' || lm_fd->lline[1]!='R' ||
+ strncmp(lm_fd->lline,"ORIGIN",6)) { /* find ORIGIN */
+ if (fgets(lm_fd->lline,MAX_STR,lm_fd->libf)==NULL) return (-1);
+ if (lm_fd->lfflag) getc(lm_fd->libf);
+ }
+ }
+ else {
+ /* continuing: finish the line that was being encoded when seq filled up */
+ for (cp= lm_fd->cpsave; seqp<seqm1; ) {
+ if ((*seqp++=ap[*cp++])<NA) continue;
+ if (*(--seqp)>NA) break;
+ }
+ }
+
+ lm_fd->lline[0]='\0';
+ while (seqp<seqm1 && fgets(lm_fd->lline,MAX_STR,lm_fd->libf)!=NULL) {
+ if (lm_fd->lfflag) getc(lm_fd->libf);
+ if (lm_fd->lline[0]=='/') goto new;
+ /* encode from column 10 of the line: codes < NA are kept, a code
+ equal to NA is dropped, a code > NA ends the line */
+ for (cp= (unsigned char *)&lm_fd->lline[10]; seqp<seqm1; ) {
+ if ((*seqp++=ap[*cp++])<NA &&
+ (*seqp++=ap[*cp++])<NA &&
+ (*seqp++=ap[*cp++])<NA &&
+ (*seqp++=ap[*cp++])<NA &&
+ (*seqp++=ap[*cp++])<NA &&
+ (*seqp++=ap[*cp++])<NA &&
+ (*seqp++=ap[*cp++])<NA &&
+ (*seqp++=ap[*cp++])<NA &&
+ (*seqp++=ap[*cp++])<NA &&
+ (*seqp++=ap[*cp++])<NA &&
+ (*seqp++=ap[*cp++])<NA) continue;
+ if (*(--seqp)>NA) break;
+ }
+ }
+ goto done;
+new:
+ /* end of entry: record the position and load the line that follows */
+ lm_fd->lpos = FTELL(lm_fd->libf);
+ fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
+ if (lm_fd->lfflag) getc(lm_fd->libf);
+
+done:
+ if (seqp>=seqm1) {
+ /* buffer full: remember where to resume in lline */
+ lm_fd->cpsave = cp;
+ (*lcont)++;
+ }
+ else *lcont=0;
+
+ *seqp = EOSEQ;
+ /* if ((int)(seqp-seq)==0) return 1; */
+ return (int)(seqp-seq);
+}
+
+/* gb_field() - text of line starting at column off, or the empty
+   string at the end of line when the line is shorter than off */
+static char *
+gb_field(char *line, size_t off)
+{
+  size_t len = strlen(line);
+
+  return line + (len < off ? len : off);
+}
+
+/* gb_find_line() - read lines from libf into buf until one starts
+   with key.  Returns 1 when such a line is in buf.  Returns 0 when an
+   "ORIGIN" line was reached first (only if stop_at_origin; buf then
+   holds that line) or when the file ended (buf is then ""). */
+static int
+gb_find_line(FILE *libf, char *buf, int n_buf, const char *key,
+	     int stop_at_origin)
+{
+  size_t klen = strlen(key);
+
+  while (fgets(buf,n_buf,libf)!=NULL) {
+    if (strncmp(buf,key,klen)==0) return 1;
+    if (stop_at_origin && strncmp(buf,"ORIGIN",6)==0) return 0;
+  }
+  buf[0]='\0';
+  return 0;
+}
+
+/* gb_append() - append src to dst without writing past dst[n_dst-1] */
+static void
+gb_append(char *dst, int n_dst, const char *src)
+{
+  size_t used = strlen(dst);
+
+  if (used + 1 < (size_t)n_dst) strncat(dst,src,(size_t)n_dst - 1 - used);
+}
+
+/* lget_ann() - build a FASTA-like description for the entry whose
+   LOCUS line is in lm_fd->lline, reading the DEFINITION, ACCESSION
+   and VERSION lines that follow it in the file.
+
+   lm_fd    - open library, positioned just after the LOCUS line
+   libstr   - receives "gi|<gi>|gb|<version>|<locus> <definition>";
+              the gi part is present only if the VERSION line holds a
+              ':' field, and "<accession> " replaces "<version>|" when
+              there is no VERSION line
+   n_libstr - size of libstr; nothing is written if n_libstr <= 0
+
+   Every search stops at end of file, and the ACCESSION / VERSION
+   searches also stop at the ORIGIN line; once ORIGIN has been reached
+   no further lines are read.
+
+   NOTE(review): when a search stops at ORIGIN that line has been
+   consumed, so a caller that afterwards scans forward for "ORIGIN"
+   will not find it in this entry - confirm how such entries should be
+   handled.
+*/
+void
+lget_ann(struct lmf_str *lm_fd, char *libstr, int n_libstr) {
+  char *bp, *bp_gid, *fld;
+  char locus[120], desc[120], acc[120], ver[120];
+  int have_acc, have_ver;
+
+  if (n_libstr <= 0) return;
+  libstr[0]='\0';
+
+  /* copy in locus from lm_fd->lline; keep one blank after the name */
+  strncpy(locus,gb_field(lm_fd->lline,12),sizeof(locus)-1);
+  locus[sizeof(locus)-1]='\0';
+  if ((bp=strchr(locus,' '))!=NULL) *(bp+1) = '\0';
+
+  /* get description */
+  if (gb_find_line(lm_fd->libf,desc,(int)sizeof(desc),"DEFINITION",0)) {
+    if ((bp = strchr(gb_field(desc,12),'\n'))!=NULL) *bp='\0';
+  }
+
+  /* get accession */
+  have_acc = gb_find_line(lm_fd->libf,acc,(int)sizeof(acc),"ACCESSION",1);
+  if (have_acc) {
+    fld = gb_field(acc,12);
+    if ((bp = strchr(fld,'\n'))!=NULL) *bp='\0';
+    if ((bp = strchr(fld,' '))!=NULL) *bp='\0';
+  }
+
+  /* get version - only if the accession search did not already run
+     into ORIGIN or the end of the file */
+  ver[0]='\0';
+  have_ver = 0;
+  if (have_acc) {
+    have_ver = gb_find_line(lm_fd->libf,ver,(int)sizeof(ver),"VERSION",1);
+  }
+
+  bp_gid = NULL;
+  if (have_ver) {
+    fld = gb_field(ver,12);
+    if ((bp = strchr(fld,'\n'))!=NULL) *bp='\0';
+
+    /* extract gi:123456 from version line */
+    bp_gid = strchr(fld,':');
+    if (bp_gid != NULL) {
+      if ((bp=strchr(bp_gid+1,' '))!=NULL) *bp='\0';
+      bp_gid++;
+    }
+    /* cut the version field itself at its first blank */
+    if ((bp = strchr(fld,' '))!=NULL) *bp='\0';
+  }
+
+  /* build up FASTA header line */
+  if (bp_gid != NULL) {
+    gb_append(libstr,n_libstr,"gi|");
+    gb_append(libstr,n_libstr,bp_gid);
+    gb_append(libstr,n_libstr,"|gb|");
+  }
+
+  /* if we have a version number, use it, otherwise accession,
+     otherwise locus/description */
+
+  if (have_ver) {
+    gb_append(libstr,n_libstr,gb_field(ver,12));
+    gb_append(libstr,n_libstr,"|");
+  }
+  else if (have_acc) {
+    gb_append(libstr,n_libstr,gb_field(acc,12));
+    gb_append(libstr,n_libstr," ");
+  }
+
+  gb_append(libstr,n_libstr,locus);
+  gb_append(libstr,n_libstr,gb_field(desc,11));
+  libstr[n_libstr-1]='\0';
+}
+
+/* this code seeks to provide both the various accession numbers
+ necessary to identify the sequence, and also some description.
+
+ Unfortunately, the various contributors to Genbank use three
+ slightly different formats for including the accession number.
+
+(1)LOCUS HSJ214M20 107422 bp DNA HTG 16-JUN-2000
+ DEFINITION Homo sapiens chromosome 6 clone RP1-214M20 map p12.1-12.3, ***
+ SEQUENCING IN PROGRESS ***, in unordered pieces.
+ ACCESSION AL121969
+
+(2)LOCUS AL359201 117444 bp DNA HTG 15-JUN-2000
+ DEFINITION Homo sapiens chromosome 1 clone RP4-671C13 map p13.2-21.1, ***
+ SEQUENCING IN PROGRESS ***, in unordered pieces.
+ ACCESSION AL359201
+
+(3)LOCUS BB067000 280 bp mRNA EST 19-JUN-2000
+ DEFINITION BB067000 RIKEN full-length enriched, 15 days embryo male testis Mus
+ musculus cDNA clone 8030456L01 3', mRNA sequence.
+ ACCESSION BB067000
+
+This makes it more difficult to both provide the accession number in a
+standard location and to conserve definition space
+*/
+
+/* lranlib() - fetch the description of the entry whose LOCUS line
+   starts at file offset seek.
+
+   str    - receives the description built by lget_ann()
+   cnt    - size of str; str is left untouched if cnt <= 0
+   seek   - offset of the LOCUS line in the library file
+   libstr - not used by this routine
+   lm_fd  - open library; its lline buffer is overwritten
+
+   On return the file is positioned just after the LOCUS line again
+   and lline holds that line.  If the line at seek cannot be read, str
+   is set to "".
+*/
+void
+lranlib(char *str,
+	int cnt,
+	fseek_t seek,
+	char *libstr,
+	struct lmf_str *lm_fd)
+{
+  FSEEK(lm_fd->libf, seek, 0);
+  if (fgets(lm_fd->lline,MAX_STR,lm_fd->libf)==NULL) {
+    /* nothing to describe; do not hand a stale lline to lget_ann() */
+    if (cnt > 0) str[0]='\0';
+    return;
+  }
+  if (lm_fd->lfflag) getc(lm_fd->libf);
+
+  if (cnt > 0) {
+    lget_ann(lm_fd, str, cnt);
+    str[cnt-1]='\0';
+  }
+
+  /* lget_ann() read ahead; go back to just after the LOCUS line */
+  FSEEK(lm_fd->libf,seek,0);
+  fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
+  if (lm_fd->lfflag) getc(lm_fd->libf);
+}
+
+/* pgetlib() - read the next sequence (or the next piece of a long
+   sequence) from a library whose entries start with an "ENTRY" line,
+   whose sequence follows a "SEQUENCE" line plus one extra line, and
+   whose sequence is ended by a line starting with '/'.
+
+   On a fresh entry (*lcont==0) libstr receives 8 characters taken from
+   column 16 of the ENTRY line and *libpos the position recorded before
+   that line was read.  Returns the number of residues stored in seq,
+   or -1 if ENTRY/SEQUENCE cannot be found before the file ends.
+   *lcont is incremented when seq filled up before the entry ended (the
+   position in lline is kept in lm_fd->cpsave), and reset to 0
+   otherwise. */
+int
+pgetlib(unsigned char *seq, int maxs,
+ char *libstr,
+ int n_libstr,
+ fseek_t *libpos,
+ int *lcont,
+ struct lmf_str *lm_fd,
+ long *l_off)
+{
+ int ic;
+ register unsigned char *cp, *seqp;
+ register int *ap;
+ unsigned char *seqm, *seqm1;
+
+ *l_off = 1;
+
+ seqp = seq;
+ /* stop filling 11 bytes (seqm) / 12 bytes (seqm1) before the end of seq */
+ seqm = &seq[maxs-11];
+ seqm1 = seqm-1;
+
+ ap = lm_fd->sascii;
+
+ if (*lcont==0) {
+ while (lm_fd->lline[0]!='E' || lm_fd->lline[1]!='N' || strncmp(lm_fd->lline,"ENTRY",5))
+ { /* find ENTRY */
+ lm_fd->lpos = FTELL(lm_fd->libf);
+ if (fgets(lm_fd->lline,MAX_STR,lm_fd->libf)==NULL) return (-1);
+ }
+ strncpy(libstr,&lm_fd->lline[16],8);
+ libstr[8]='\0';
+ *libpos = lm_fd->lpos;
+ while (lm_fd->lline[2]!='Q' || lm_fd->lline[0]!='S' || strncmp(lm_fd->lline,"SEQUENCE",8))
+ { /* find SEQUENCE */
+ if (fgets(lm_fd->lline,MAX_STR,lm_fd->libf)==NULL) return (-1);
+ }
+ fgets(lm_fd->lline,MAX_STR,lm_fd->libf); /* get the extra line */
+ }
+ else {
+ /* continuing: finish the line that was being encoded when seq filled up */
+ for (cp= lm_fd->cpsave; seqp<seqm1; ) {
+ if ((*seqp++=ap[*cp++])<NA) continue;
+ if (*(--seqp)>NA) break;
+ }
+ if (*seqp==ES) goto done;
+ }
+
+ lm_fd->lline[0]='\0';
+ while (seqp<seqm1 && fgets(lm_fd->lline,MAX_STR,lm_fd->libf)!=NULL) {
+ if (lm_fd->lline[0]=='/') goto new;
+ /* encode from column 8 of the line: codes < NA are kept, a code
+ equal to NA is dropped, a code > NA ends the line */
+ for (cp= (unsigned char *)&lm_fd->lline[8]; seqp<seqm1; ) {
+ if ((*seqp++=ap[*cp++])<NA) continue;
+ if (*(--seqp)>NA) break;
+ };
+ if (*seqp==ES) goto done;
+ }
+ goto done;
+new:
+ /* end of entry: record the position and load the line that follows */
+ lm_fd->lpos = FTELL(lm_fd->libf);
+ fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
+
+done:
+ if (seqp>=seqm1) {
+ /* buffer full: remember where to resume in lline */
+ lm_fd->cpsave = cp;
+ (*lcont)++;
+ }
+ else *lcont=0;
+
+ *seqp = EOSEQ;
+ /* if ((int)(seqp-seq)==0) return 1; */
+ return (int)(seqp-seq);
+}
+
+/* pranlib() - fetch the description of the entry whose "ENTRY" line
+   starts at file offset seek.
+
+   str    - receives 8 characters from column 16 of the ENTRY line,
+            followed (from str[8] on) by the text from column 16 of
+            the following "TITLE" line, cut at the first newline
+   cnt    - size of str; nothing is written if cnt <= 0, and no more
+            than cnt bytes are ever written
+   seek   - offset of the ENTRY line in the library file
+   libstr - not used by this routine
+   lm_fd  - open library; its lline buffer is overwritten
+
+   The search for TITLE stops at end of file.  On return the file is
+   positioned just after the ENTRY line again.
+
+   NOTE(review): if the ENTRY field is shorter than 8 characters the
+   padding written by strncpy() ends the string before str[8], hiding
+   the title - this layout is kept from the previous implementation;
+   confirm it is intended.
+*/
+void
+pranlib(char *str,
+	int cnt,
+	fseek_t seek,
+	char *libstr,
+	struct lmf_str *lm_fd)
+{
+  char *bp;
+  size_t llen;
+  int n_id, have_title;
+
+  if (cnt <= 0) return;
+  str[0]='\0';
+
+  FSEEK(lm_fd->libf, seek, 0);
+  if (fgets(lm_fd->lline,MAX_STR,lm_fd->libf)==NULL) return;
+
+  /* entry name: at most 8 characters, never more than str can hold */
+  n_id = (cnt-1 < 8) ? cnt-1 : 8;
+  llen = strlen(lm_fd->lline);
+  strncpy(str,&lm_fd->lline[llen < 16 ? llen : 16],n_id);
+  str[n_id]='\0';
+
+  /* find TITLE, giving up at end of file */
+  have_title = 0;
+  while (fgets(lm_fd->lline,MAX_STR,lm_fd->libf)!=NULL) {
+    if (strncmp(lm_fd->lline,"TITLE",5)==0) { have_title = 1; break; }
+  }
+
+  if (have_title && cnt > 9) {
+    llen = strlen(lm_fd->lline);
+    strncpy(&str[8],&lm_fd->lline[llen < 16 ? llen : 16],cnt-9);
+    str[cnt-1]='\0';	/* terminate at the end of the buffer */
+  }
+  if ((bp = strchr(str,'\n'))!=NULL) *bp='\0';
+
+  /* go back to just after the ENTRY line */
+  FSEEK(lm_fd->libf,seek,0);
+  fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
+}
+
+/* egetlib() - read the next sequence (or the next piece of a long
+   sequence) from a library whose entries start with an "ID" line,
+   whose sequence follows an "SQ" line, and whose sequence is ended by
+   a line starting with '/'.
+
+   seq      - buffer that receives the encoded residues
+   maxs     - size of seq
+   libstr   - on a fresh entry, receives the identifier from the ID
+              line, blank-padded / cut to 12 characters (13 bytes are
+              written)
+   n_libstr - not used by this routine
+   libpos   - receives the position recorded before the ID line
+   lcont    - 0 on entry: start a new entry; otherwise continue the
+              current one.  Incremented on exit if seq was filled
+              before the entry ended, reset to 0 otherwise
+   lm_fd    - open library; gcg_len is set from the SQ line and
+              reduced by the amount returned while continuing
+   l_off    - always set to 1
+
+   Returns the number of residues stored in seq, or -1 when ID/SQ
+   cannot be found before the file ends or the accession selector
+   returns a negative value.
+*/
+int
+egetlib(unsigned char *seq, int maxs,
+	char *libstr,
+	int n_libstr,
+	fseek_t *libpos,
+	int *lcont,
+	struct lmf_str *lm_fd,
+	long *l_off)
+{
+  register unsigned char *cp, *seqp;
+  register int *ap;
+  unsigned char *seqm, *seqm1;
+  int sel_status;
+  char id[13];	/* Holds Identifier: 12 characters + NUL */
+
+  *l_off=1;
+
+  seqp = seq;
+  seqm = &seq[maxs-11];	/* slack for the 11-way unrolled copy below */
+  seqm1 = seqm-1;
+
+  ap = lm_fd->sascii;
+
+  if (*lcont==0) {
+  start_seq:
+    while (lm_fd->lline[0]!='I' || lm_fd->lline[1]!='D') { /* find ID */
+      lm_fd->lpos = FTELL(lm_fd->libf);
+      if (fgets(lm_fd->lline,MAX_STR,lm_fd->libf)==NULL) return (-1);
+      if (lm_fd->lfflag) getc(lm_fd->libf);
+    }
+    /* bounded read: only 12 characters are used below, and an
+       unbounded %s would overrun id on a long identifier */
+    id[0]='\0';
+    sscanf(&lm_fd->lline[5],"%12s",id);
+    sprintf(libstr,"%-12.12s",id);
+    libstr[12]='\0';
+
+    if ((lm_fd->sel_acc_p != NULL) &&
+	(sel_status = (lm_fd->sel_acc_p)(libstr, 0, lm_fd->sel_local)) <= 0) {
+      if (sel_status < 0) return (-1);
+      if (fgets(lm_fd->lline,MAX_STR,lm_fd->libf)==NULL) return (-1);
+      goto start_seq;
+    }
+
+    *libpos = lm_fd->lpos;
+    while (lm_fd->lline[0]!='S' || lm_fd->lline[1]!='Q') { /* find SQ */
+      if (fgets(lm_fd->lline,MAX_STR,lm_fd->libf)==NULL) return (-1);
+      if (lm_fd->lfflag) getc(lm_fd->libf);
+    }
+    sscanf(&lm_fd->lline[14],"%ld",&lm_fd->gcg_len);
+  }
+  else {
+    /* continuing: finish the line that was being encoded when seq
+       filled up */
+    for (cp= lm_fd->cpsave; seqp<seqm1; ) {
+      if ((*seqp++=ap[*cp++])<NA) continue;
+      if (*(--seqp)>NA) break;
+    }
+    if (*seqp==ES) goto done;
+  }
+
+  lm_fd->lline[0]='\0';
+  while (seqp<seqm1 && fgets(lm_fd->lline,MAX_STR,lm_fd->libf)!=NULL) {
+    if (lm_fd->lfflag) getc(lm_fd->libf);
+    if (lm_fd->lline[0]=='/') goto new;
+    lm_fd->lline[70]='\0';	/* ignore anything beyond column 70 */
+    /* encode from column 5: codes < NA are kept, a code equal to NA is
+       dropped, a code > NA ends the line */
+    for (cp= (unsigned char *)&lm_fd->lline[5]; seqp<seqm1; ) {
+      if ((*seqp++=ap[*cp++])<NA &&
+	  (*seqp++=ap[*cp++])<NA &&
+	  (*seqp++=ap[*cp++])<NA &&
+	  (*seqp++=ap[*cp++])<NA &&
+	  (*seqp++=ap[*cp++])<NA &&
+	  (*seqp++=ap[*cp++])<NA &&
+	  (*seqp++=ap[*cp++])<NA &&
+	  (*seqp++=ap[*cp++])<NA &&
+	  (*seqp++=ap[*cp++])<NA &&
+	  (*seqp++=ap[*cp++])<NA &&
+	  (*seqp++=ap[*cp++])<NA) continue;
+      if (*(--seqp)>NA) break;
+    }
+    if (*seqp==ES) goto done;
+  }
+  goto done;
+ new:
+  /* end of entry: record the position and load the line that follows */
+  lm_fd->lpos = FTELL(lm_fd->libf);
+  fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
+  if (lm_fd->lfflag) getc(lm_fd->libf);
+  goto done;
+
+ done:
+  if (seqp>=seqm1) {
+    /* buffer full: remember where to resume in lline */
+    lm_fd->cpsave = cp;
+    (*lcont)++;
+    lm_fd->gcg_len -= (long)(seqp-seq);
+  }
+  else *lcont=0;
+
+  *seqp = EOSEQ;
+  /* if ((int)(seqp-seq)==0) return 1; */
+  /* if (*lcont==0 && (long)(seqp-seq)!=lm_fd->gcg_len)
+     printf("%s read %d of %d\n",libstr,(int)(seqp-seq),lm_fd->gcg_len);
+  */
+  return (int)(seqp-seq);
+}
+
+/* eranlib() - random-access description lookup for the "ID"/"DE"
+   style library read by egetlib().
+
+   Seeks to `seek`, builds in str the identifier (padded to 12
+   characters) followed by the text of the first "DE" line, then seeks
+   back to `seek` and reloads lm_fd->lline so that sequential reading
+   can resume.  libstr is not used.
+
+   NOTE(review): the result is cut at str[cnt-11], as in the original
+   code; str[cnt-1] may have been intended - confirm with callers. */
+void
+eranlib(char *str,
+        int cnt,
+        fseek_t seek,
+        char *libstr,
+        struct lmf_str *lm_fd)
+{
+  char *bp;
+  char id[14];			/* Holds Identifier */
+  int have_de;
+
+  FSEEK(lm_fd->libf, seek, 0);
+  fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
+  if (lm_fd->lfflag) getc(lm_fd->libf);
+
+  /* bounded scan: a long token must not overrun id[] */
+  id[0] = '\0';
+  sscanf(&lm_fd->lline[5],"%13s",id);
+  sprintf(str,"%-12.12s ",id);
+  fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
+  if (lm_fd->lfflag) getc(lm_fd->libf);
+
+  /* look for the "DE" line; stop at end of file instead of spinning
+     forever on an unchanged buffer */
+  have_de = 1;
+  while (lm_fd->lline[0]!='D' || lm_fd->lline[1]!='E') {
+    if (fgets(lm_fd->lline,MAX_STR,lm_fd->libf)==NULL) {
+      have_de = 0;
+      break;
+    }
+  }
+
+  /* copy at most up to str[cnt-1]; the original length of cnt-11
+     starting at str+12 also wrote str[cnt] */
+  if (have_de && cnt > 12) strncpy(str+12,&lm_fd->lline[5],cnt-12);
+  if (cnt >= 11) str[cnt-11]='\0';
+  if ((bp = strchr(str,'\r'))!=NULL) *bp='\0';
+  if ((bp = strchr(str,'\n'))!=NULL) *bp='\0';
+
+  FSEEK(lm_fd->libf,seek,0);
+  fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
+  if (lm_fd->lfflag) getc(lm_fd->libf);
+}
+
+/* igetlib() - read the next entry, or the next chunk of a long entry,
+   from a library in which an entry starts with one or more ';'
+   comment lines, followed by a name line and then sequence lines.
+
+   seq, maxs - output buffer and size; text is read straight into seq
+               and converted in place through lm_fd->sascii
+   libstr    - receives up to 12 characters of the name line, taken
+               from its second character onward (n_libstr is unused)
+   libpos    - receives the file offset recorded for the entry
+   lcont     - 0 on entry: find a new entry; incremented on return if
+               the buffer filled, otherwise reset to 0
+   l_off     - always set to 1
+
+   Returns the number of residues stored, or -1 at end of file. */
+int
+igetlib(unsigned char *seq, int maxs,
+        char *libstr,
+        int n_libstr,
+        fseek_t *libpos,
+        int *lcont,
+        struct lmf_str *lm_fd,
+        long *l_off)
+{
+  register unsigned char *cp, *seqp;
+  register int *ap;
+  unsigned char *seqm, *seqm1;
+  char *bp;
+
+  *l_off = 1;
+
+  seqp = seq;
+  seqm = &seq[maxs-9];
+  seqm1 = seqm-1;
+
+  ap = lm_fd->sascii;
+
+  if (*lcont==0) {
+    while (lm_fd->lline[0]!=';') {
+      lm_fd->lpos = FTELL(lm_fd->libf);
+      if (fgets(lm_fd->lline,MAX_STR,lm_fd->libf)==NULL) return (-1);
+    }
+    *libpos = lm_fd->lpos;
+    /* skip the comment lines; an entry truncated by end of file must
+       end the read rather than loop on the unchanged buffer */
+    while (lm_fd->lline[0]==';') {
+      if (fgets(lm_fd->lline,MAX_STR,lm_fd->libf)==NULL) return (-1);
+    }
+    strncpy(libstr,lm_fd->lline+1,12);
+    libstr[12]='\0';
+    if((bp=strchr(libstr,'\n'))!=NULL) *bp='\0';
+  }
+
+  lm_fd->lline[0]='\0';
+  while (seqp<seqm1 && fgets((char *)seqp,(size_t)(seqm-seqp),lm_fd->libf)!=NULL) {
+    if (*seqp=='>') goto new;
+    if (*seqp==';') {
+      /* comment inside the entry: skip it; if it did not fit in the
+         remaining space, drain the rest of the line at cont: */
+      if (strchr((char *)seqp,'\n')==NULL) goto cont;
+      continue;
+    }
+    /* in-place conversion: codes below NA are kept, a code equal to
+       NA is dropped, a code above NA ends the line */
+    for (cp=seqp; seqp<seqm1; ) {
+      if ((*seqp++=ap[*cp++])<NA &&
+          (*seqp++=ap[*cp++])<NA &&
+          (*seqp++=ap[*cp++])<NA &&
+          (*seqp++=ap[*cp++])<NA &&
+          (*seqp++=ap[*cp++])<NA &&
+          (*seqp++=ap[*cp++])<NA &&
+          (*seqp++=ap[*cp++])<NA &&
+          (*seqp++=ap[*cp++])<NA &&
+          (*seqp++=ap[*cp++])<NA) continue;
+      if (*(--seqp)>NA) break;
+    }
+    if (*seqp==ES) goto done;
+    lm_fd->lpos = FTELL(lm_fd->libf);
+  }
+  goto done;
+
+ new:
+  /* a '>' line was read into seq: hand it to lline for the next call.
+     NOTE(review): unlike vgetlib(), the follow-up fgets() restarts at
+     lline[0] instead of appending - confirm this is intended. */
+  strncpy(lm_fd->lline,(char *)seqp,MAX_STR);
+  lm_fd->lline[MAX_STR-1]='\0';
+  if (strchr((char *)seqp,'\n')==NULL)
+    fgets(lm_fd->lline,MAX_STR-strlen(lm_fd->lline),lm_fd->libf);
+  goto done;
+
+ cont:
+  fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
+  seqm1 = seqp;			/* forces the "buffer full" branch below */
+
+ done:
+  if (seqp>=seqm1) {
+    (*lcont)++;
+  }
+  else {
+    *lcont=0;
+  }
+
+  *seqp = EOSEQ;
+  return (int)(seqp-seq);
+}
+
+/* iranlib() - random-access description lookup for the ';'-comment
+   style library read by igetlib().
+
+   Seeks to `seek` and builds in str (capacity cnt) the first word of
+   the first line that does not start with ';', a space, and then the
+   first line at `seek` minus its leading '>' or ';'.  Finally seeks
+   back and reloads lm_fd->lline.  libstr is not used. */
+void
+iranlib(char *str,
+        int cnt,
+        fseek_t seek,
+        char *libstr,
+        struct lmf_str *lm_fd)
+{
+  char *bp;
+  char tline[MAX_FN];
+
+  FSEEK(lm_fd->libf, seek, 0);
+  fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
+
+  if (lm_fd->lline[0]=='>' || lm_fd->lline[0]==';') {
+    strncpy(tline,lm_fd->lline+1,sizeof(tline));
+    tline[sizeof(tline)-1]='\0';
+    if ((bp = strchr(tline,'\n'))!=NULL) *bp='\0';
+  }
+  else {
+    tline[0]='\0';
+  }
+
+  /* skip comment lines; at end of file give up with an empty name
+     instead of looping on the unchanged buffer */
+  while (lm_fd->lline[0]==';') {
+    if (fgets(lm_fd->lline,MAX_STR,lm_fd->libf)==NULL) {
+      lm_fd->lline[0]='\0';
+      break;
+    }
+  }
+  if ((bp=strchr(lm_fd->lline,'\n'))!=NULL) *bp=0;
+  if ((bp=strchr(lm_fd->lline,' '))!=NULL) *bp=0;
+  strncpy(str,lm_fd->lline,cnt);
+  str[cnt-1]='\0';
+  strncat(str,"  ",cnt-strlen(str)-1);
+  strncat(str,tline,cnt-strlen(str)-1);
+
+  FSEEK(lm_fd->libf,seek,0);
+  fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
+}
+
+/* vgetlib() - read the next entry, or the next chunk of a long entry,
+   from a library whose header line starts with '>' or ';' and carries
+   the identifier from column 4 onward; the line after the header is
+   used as the description.
+
+   seq, maxs - output buffer and size; text is read straight into seq
+   and converted in place through lm_fd->sascii
+   libstr - receives up to 12 characters of identifier and, when
+   n_libstr > 21, a space plus the description line
+   libpos - receives the file offset recorded for the header line
+   lcont - 0 on entry: find a new entry; incremented on return if the
+   buffer filled, otherwise reset to 0
+   l_off - always set to 1
+
+   Returns the number of residues stored, or -1 at end of file. */
+int
+vgetlib(unsigned char *seq, int maxs,
+ char *libstr,
+ int n_libstr,
+ fseek_t *libpos,
+ int *lcont,
+ struct lmf_str *lm_fd,
+ long *l_off)
+{
+ int i, ich;
+ register unsigned char *cp, *seqp;
+ register int *ap;
+ unsigned char *seqm, *seqm1;
+ char *bp, *tp;
+
+ *l_off = 1;
+
+ seqp = seq;
+ seqm = &seq[maxs-9];
+ seqm1 = seqm-1;
+
+ ap = lm_fd->sascii;
+
+ if (*lcont==0) {
+ /* advance to the next header line, remembering where it starts */
+ while (lm_fd->lline[0]!='>' && lm_fd->lline[0]!=';') {
+ lm_fd->lpos = FTELL(lm_fd->libf);
+ if (fgets(lm_fd->lline,MAX_STR,lm_fd->libf)==NULL) return (-1);
+ if (lm_fd->lfflag) getc(lm_fd->libf);
+ }
+
+ if ((bp=strchr(lm_fd->lline,'\n'))!=NULL) *bp='\0';
+ strncpy(libstr,&lm_fd->lline[4],12);
+ libstr[12]='\0';
+ if ((bp=strchr(libstr,' '))!=NULL) *bp='\0';
+ if ((bp=strchr(libstr,'\n'))!=NULL) *bp='\0';
+
+ /* second line of the entry: the description */
+ /* NOTE(review): the return value is not checked; at end of file lline
+ still holds the header line */
+ fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
+ if (lm_fd->lfflag) getc(lm_fd->libf);
+
+ if (n_libstr > 21) {
+ strcat(libstr," ");
+ strncat(libstr,lm_fd->lline,n_libstr-1-strlen(libstr));
+ if ((bp=strchr(libstr,'\n'))!=NULL) *bp='\0';
+ libstr[n_libstr-1]='\0';
+ }
+ *libpos = lm_fd->lpos;
+ }
+
+ lm_fd->lline[0]='\0';
+ while (seqp<seqm1 && fgets((char *)seqp,(size_t)(seqm-seqp),lm_fd->libf)!=NULL) {
+ /* with lfflag set, swallow one following LFCHAR; anything else is pushed back */
+ if (lm_fd->lfflag && (ich=getc(lm_fd->libf))!=LFCHAR) ungetc(ich,lm_fd->libf);
+ if (*seqp=='>') goto new;
+ if (*seqp==';') {
+ /* comment line: skip it, or drain the remainder at cont: if it did not fit */
+ if (strchr((char *)seqp,'\n')==NULL) goto cont;
+ continue;
+ }
+ /* in-place conversion: codes below NA are kept, a code equal to NA is
+ dropped, a code above NA ends the line */
+ for (cp=seqp; seqp<seqm1; ) {
+ if ((*seqp++=ap[*cp++])<NA &&
+ (*seqp++=ap[*cp++])<NA &&
+ (*seqp++=ap[*cp++])<NA &&
+ (*seqp++=ap[*cp++])<NA &&
+ (*seqp++=ap[*cp++])<NA &&
+ (*seqp++=ap[*cp++])<NA &&
+ (*seqp++=ap[*cp++])<NA &&
+ (*seqp++=ap[*cp++])<NA &&
+ (*seqp++=ap[*cp++])<NA) continue;
+ if (*(--seqp)>NA) break;
+ }
+ if (*seqp==ES) goto done;
+ lm_fd->lpos = FTELL(lm_fd->libf);
+ }
+ goto done;
+new:
+ /* the next entry's header was read into seq: move it to lline, appending
+ the rest of the line if it did not fit in the sequence buffer */
+ strncpy(lm_fd->lline,(char *)seqp,MAX_STR);
+ lm_fd->lline[MAX_STR-1]='\0';
+ if (strchr((char *)seqp,'\n')==NULL) {
+ fgets(&lm_fd->lline[strlen(lm_fd->lline)],MAX_STR-strlen(lm_fd->lline),lm_fd->libf);
+ if (lm_fd->lfflag && (ich=getc(lm_fd->libf))!=LFCHAR) ungetc(ich,lm_fd->libf);
+ }
+ goto done;
+
+cont:
+ fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
+ if (lm_fd->lfflag && (ich=getc(lm_fd->libf))!=LFCHAR) ungetc(ich,lm_fd->libf);
+ /* forces the "buffer full" branch below so the caller asks for more */
+ seqm1 = seqp;
+
+done:
+ if (seqp>=seqm1) {
+ (*lcont)++;
+ }
+ else {
+ *lcont=0;
+ }
+
+ *seqp = EOSEQ;
+ /* if ((int)(seqp-seq)==0) return 1;*/
+ return (int)(seqp-seq);
+}
+
+/* vranlib() - random-access description lookup for the library format
+   read by vgetlib().
+
+   Seeks to `seek`.  If the line there starts with '>' and has ';' or
+   '>' in column 3, str (capacity cnt) receives the identifier from
+   column 4, a space, and the following line; when that line repeats at
+   least 5 leading characters of the identifier the repeated prefix is
+   skipped.  Otherwise str is set to "".  The file is then
+   repositioned at `seek` and lm_fd->lline reloaded.  libstr is unused. */
+void
+vranlib(char *str,
+        int cnt,
+        fseek_t seek,
+        char *libstr,
+        struct lmf_str *lm_fd)
+{
+  char *bp, *llp;
+  size_t room;
+
+  FSEEK(lm_fd->libf, seek, 0);
+  fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
+  if (lm_fd->lfflag) getc(lm_fd->libf);
+
+  if (lm_fd->lline[0]=='>'&&(lm_fd->lline[3]==';'||
+                             lm_fd->lline[3]=='>') /* GCG ascii */
+      ) {
+    strncpy(str,&lm_fd->lline[4],cnt-1);
+    str[cnt-1]='\0';
+
+    if (lm_fd->lline[3]=='>' && (bp = strchr(str,' '))!=NULL) *bp='\0';
+
+    if ((bp = strchr(str,':'))!=NULL) *bp='\0';
+    if ((bp=strchr(str,'\r'))!=NULL) *bp='\0';
+    else if ((bp = strchr(str,'\n'))!=NULL) *bp='\0';
+
+    /* at end of file there is no description line to append */
+    if (fgets(lm_fd->lline,MAX_STR,lm_fd->libf)==NULL) lm_fd->lline[0]='\0';
+    if (lm_fd->lfflag) getc(lm_fd->libf);
+
+    /* skip over redundant stuff; stop at the terminator so identical
+       strings cannot walk the pointers past the end of both buffers */
+    for (llp=lm_fd->lline,bp=str; *llp!='\0' && *llp==*bp; llp++,bp++);
+    if ((int)(llp-lm_fd->lline)<5) llp = lm_fd->lline;
+
+    if ((bp=strchr(llp,'\r'))!=NULL) *bp=' ';
+    if ((bp=strchr(llp,'\n'))!=NULL) *bp='\0';
+
+    /* append only what fits: str holds at most cnt-1 characters, so an
+       identifier that already fills it gets neither the separator
+       (which used to write str[cnt]) nor the description */
+    if (strlen(str) < (size_t)(cnt-1)) strncat(str," ",(size_t)1);
+    room = (size_t)(cnt-1) - strlen(str);
+    if (room > 0) strncat(str,llp,room);
+  }
+  else {
+    str[0]='\0';
+  }
+
+  FSEEK(lm_fd->libf,seek,0);
+  fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
+  if (lm_fd->lfflag) getc(lm_fd->libf);
+}
+
+/* maps a 2-bit packed base code to the value stored in the sequence */
+static int gcg_bton[4]={2,4,1,3};
+
+/* gcg_getlib() - read the next entry, or the next chunk of a long
+   entry, from a GCG-style library: a header line holding (from column
+   4) the name, two further tokens, a placeholder and the sequence
+   length, then a description line, then the sequence as one block.
+   The first character of the third header token selects the encoding:
+   '2' means four 2-bit bases per byte, anything else one character per
+   residue converted through lm_fd->sascii.
+
+   seq, maxs - output buffer and size
+   libstr    - receives the name and, when n_libstr > 21, a space plus
+               the description line
+   libpos    - receives the file offset recorded for the header line
+   lcont     - 0 on entry: find a new entry; incremented on return when
+               more of the sequence remains, otherwise reset to 0
+   l_off     - always set to 1
+
+   Returns the number of residues stored, or -1 at end of file.
+
+   NOTE(review): the name is still scanned into libstr without a field
+   width; the size of the caller's buffer is not visible here.
+   NOTE(review): the "sequence finished" test uses 4*r_block for both
+   encodings - confirm that is intended for the one-byte-per-residue
+   case.  The fread() result is not checked. */
+int
+gcg_getlib(unsigned char *seq, int maxs,
+           char *libstr,
+           int n_libstr,
+           fseek_t *libpos,
+           int *lcont,
+           struct lmf_str *lm_fd,
+           long *l_off)
+{
+  char dummy[20];
+  char gcg_date[10];
+  register unsigned char *cp, *seqp, stmp;
+  register int *ap;
+  char gcg_type[10];
+  unsigned char *seqm, *seqm1;
+  long r_block, b_block;
+  char *bp;
+
+  *l_off = 1;
+
+  seqp = seq;
+  seqm = &seq[maxs-9];
+  seqm1 = seqm-1;
+
+  ap = lm_fd->sascii;
+
+  if (*lcont==0) {
+    while (lm_fd->lline[0]!='>' && lm_fd->lline[0]!=';') {
+      lm_fd->lpos = FTELL(lm_fd->libf);
+      if (fgets(lm_fd->lline,MAX_STR,lm_fd->libf)==NULL) return (-1);
+    }
+    /* field widths keep over-long tokens inside the local buffers, and
+       the buffers start empty so a short header line cannot leave
+       gcg_type[0] undefined */
+    dummy[0] = gcg_date[0] = gcg_type[0] = '\0';
+    sscanf(&lm_fd->lline[4],"%s %9s %9s %19s %ld",
+           libstr,gcg_date,gcg_type,dummy,&(lm_fd->gcg_len));
+
+    lm_fd->gcg_binary = (gcg_type[0]=='2');
+
+    /* read the description line, reusing the second half of lline for
+       the overflow of a very long line; stop at end of file instead of
+       looping on a buffer that will never gain a newline */
+    fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
+    while (strchr((char *)lm_fd->lline,'\n')==NULL) {
+      if (strlen(lm_fd->lline)<MAX_STR/2) {
+        if (fgets(&lm_fd->lline[strlen(lm_fd->lline)],MAX_STR/2,lm_fd->libf)==NULL) break;
+      }
+      else {
+        if (fgets(&lm_fd->lline[strlen(lm_fd->lline)-MAX_STR/2],MAX_STR/2,lm_fd->libf)==NULL) break;
+      }
+    }
+    lm_fd->lline[MAX_STR-1]='\0';
+    if (n_libstr <= 21) {
+      libstr[12]='\0';
+    }
+    else {
+      strncat(libstr," ",1);
+      strncat(libstr,lm_fd->lline,n_libstr-1-strlen(libstr));
+      if ((bp = strchr(libstr,'\n'))!=NULL) *bp='\0';
+      libstr[n_libstr-1]='\0';
+    }
+    *libpos = lm_fd->lpos;
+  }
+
+  lm_fd->lline[0]='\0';
+
+  /* b_block: residues wanted in this call; r_block: bytes to read */
+  r_block = b_block = min((size_t)(seqm-seqp),lm_fd->gcg_len);
+  if (lm_fd->gcg_binary) { r_block = (r_block+3)/4; }
+
+  fread((char *)seqp,(size_t)r_block,(size_t)1,lm_fd->libf);
+  if (!lm_fd->gcg_binary)
+    for (cp=seqp; seqp<seq+r_block; ) *seqp++ = ap[*cp++];
+  else {
+    /* expand in place from the end so that unread packed bytes are not
+       overwritten by their own expansion */
+    seqp = seq + r_block;
+    cp = seq + 4*r_block;
+    while (seqp > seq) {
+      stmp = *--seqp;
+      *--cp = gcg_bton[stmp&3];
+      *--cp = gcg_bton[(stmp >>= 2)&3];
+      *--cp = gcg_bton[(stmp >>= 2)&3];
+      *--cp = gcg_bton[(stmp >>= 2)&3];
+    }
+  }
+  if (4 * r_block >= lm_fd->gcg_len) {
+    fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
+    *lcont = 0;
+  }
+  else {
+    if (lm_fd->gcg_binary) b_block = 4*r_block;
+    lm_fd->gcg_len -= b_block;
+    (*lcont)++;
+  }
+
+  seq[b_block] = EOSEQ;
+  return b_block;
+}
+
+/* gcg_ranlib() - random-access description lookup for the library
+   format read by gcg_getlib().
+
+   Seeks to `seek`.  If the line there starts with '>' and has ';' or
+   '>' in column 3, str (capacity cnt) receives the first word from
+   column 4, a space, and the following line; when that line repeats
+   at least 5 leading characters of the name the repeated prefix is
+   skipped.  Otherwise str is set to "".  The file is then
+   repositioned at `seek` and lm_fd->lline reloaded.  libstr is unused. */
+void
+gcg_ranlib(char *str,
+           int cnt,
+           fseek_t seek,
+           char *libstr,
+           struct lmf_str *lm_fd)
+{
+  char *bp, *llp;
+  size_t room;
+
+  FSEEK(lm_fd->libf, seek, 0);
+  fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
+
+  if (lm_fd->lline[0]=='>'&&(lm_fd->lline[3]==';'||lm_fd->lline[3]=='>')) {
+    strncpy(str,&lm_fd->lline[4],cnt-1);
+    str[cnt-1]='\0';
+    if ((bp = strchr(str,' '))!=NULL) *bp='\0';
+    else if ((bp=strchr(str,'\r'))!=NULL) *bp='\0';
+    else if ((bp = strchr(str,'\n'))!=NULL) *bp='\0';
+
+    /* at end of file there is no description line to append */
+    if (fgets(lm_fd->lline,MAX_STR,lm_fd->libf)==NULL) lm_fd->lline[0]='\0';
+
+    /* check whether the beginning of the line duplicates the name;
+       stop at the terminator so identical strings cannot run the
+       pointers past the end of both buffers */
+    for (llp=lm_fd->lline,bp=str; *llp!='\0' && *llp == *bp; llp++,bp++);
+    if ((int)(llp-lm_fd->lline)<5) llp = lm_fd->lline;
+
+    if ((bp=strchr(llp,'\r'))!=NULL) *bp='\0';
+    if ((bp=strchr(llp,'\n'))!=NULL) *bp='\0';
+
+    /* append only what fits.  strncat() writes a terminator after the
+       characters it copies, so the limit is cnt-1-strlen(str); the
+       original limit of cnt-strlen(str) could write str[cnt] */
+    if (strlen(str) < (size_t)(cnt-1)) strncat(str," ",(size_t)1);
+    room = (size_t)(cnt-1) - strlen(str);
+    if (room > 0) strncat(str,llp,room);
+  }
+  else {
+    str[0]='\0';
+  }
+
+  FSEEK(lm_fd->libf,seek,0);
+  fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
+}
+
+/* **************************************************************** */
+/* the following section contains the functions used to initialize,
+ read, hash, and lookup acc_lists, either from a sorted list, or a
+ hash table.
+
+ The functions are:
+
+ void *sel_acc_libstr_init(FILE *libf, int *acc_off, char fmt_term)
+ -- allocate space, read the accessions from an already open file of
+ accessions
+
+ int sel_acc_libstr(char *libstr, int gi, void *ptr)
+ -- compare libstr to current accession, returning 1 and
+ incrementing cur_entry if match is found.
+ Requires sorted list; does not access gi
+
+ int sel_acc_gi(char *libstr, int gi, void *ptr)
+ -- compare gi to gi_list, returning 1 and incrementing if match.
+ if (gi <= 0), get gi from libstr
+
+ int sel_hacc_libstr(char *libstr, int gi, void *ptr)
+ -- check to see whether libstr is in hash table
+
+ int sel_hacc_gi(char *libstr, int gi, void *ptr)
+ -- check to see if gi in hash32 table
+*/
+/* **************************************************************** */
+
+/* state shared by the sel_*_init() constructors and the matching
+   sel_*() selector callbacks; which members are filled in depends on
+   the constructor used */
+struct sel_acc_str {
+ /* index of the next list entry to match; advanced by the sorted-list selectors */
+ int curr_entry;
+ /* number of entries loaded into acc_list[] or gi_list[] */
+ int max_entry;
+ /* libstr is cut at the first occurrence of this character before comparing */
+ char fmt_term;
+ /* contents of the accession file, with line ends replaced by NULs */
+ char *acc_buff;
+ /* acc_list[i] points at the i-th accession inside acc_buff */
+ char **acc_list;
+ /* integer identifiers parsed with atoi(), one per input line */
+ int *gi_list;
+ /* hash buckets: 1-based index of the first entry in the chain, 0 if empty */
+ int *acc_hash;
+ /* acc_hash_link[i] is the 1-based index of the next entry in i's chain, 0 at the end */
+ int *acc_hash_link;
+ /* table size minus one; the table size is a power of two */
+ int hash_mask;
+};
+
+/* sel_acc_libstr_init() - allocate a sel_acc_str and load the list of
+   accessions, one per line, from the already open file libf.
+
+   The whole file is read into one buffer, line ends are replaced by
+   NULs and acc_list[] is pointed at the start of every line.  A
+   trailing '\r' is stripped, and a last line without a newline is
+   kept.  libf is closed before returning.  acc_off is not used.
+
+   Returns the new structure (as void *), or NULL if an allocation
+   fails. */
+
+void *sel_acc_libstr_init(FILE *libf, int *acc_off, char fmt_term) {
+  struct sel_acc_str *sel_acc_ptr;
+  char *bp, *bp1;
+  char *acc_buff;
+  char *acc_buff_max;	/* end of the data that was read */
+  char *new_buff;	/* result of realloc() */
+  char *acc_buff_p;
+  char *next_p;
+  char **acc_list;
+  int acc_cnt, i;
+  int new_buff_siz;	/* bytes requested per fread() */
+  int abuff_siz;	/* bytes available for data */
+  int buff_siz;		/* bytes returned by the last fread() */
+
+  if ((sel_acc_ptr = (struct sel_acc_str *)calloc(1,sizeof(struct sel_acc_str)))==NULL) {
+    fprintf(stderr, "Cannot allocate struct sel_acc_str\n");
+    return NULL;
+  }
+
+  sel_acc_ptr->fmt_term = fmt_term;
+
+  /* allocate some space for the ACC's; the allocation is always one
+     byte larger than abuff_siz so the data can be NUL-terminated */
+
+  abuff_siz = new_buff_siz = 640000;
+
+  if ((acc_buff = (char *)calloc(abuff_siz+1, sizeof(char)))==NULL) {
+    fprintf(stderr, "Cannot allocate acc buff %d\n",abuff_siz+1);
+    free(sel_acc_ptr);
+    return NULL;
+  }
+
+  /* iteratively read and reallocate space for buffer until its all read */
+
+  acc_buff_p = acc_buff;
+  while ((buff_siz = fread(acc_buff_p, sizeof(char), new_buff_siz, libf))==new_buff_siz) {
+    if ((new_buff = realloc(acc_buff, (size_t)(abuff_siz+new_buff_siz+1)))==NULL) {
+      fprintf(stderr, " cannot reallocate for acc_buf[%d]\n",abuff_siz);
+      break;
+    }
+    else {
+      acc_buff = new_buff;
+      acc_buff_p = acc_buff + abuff_siz;
+      abuff_siz += new_buff_siz;
+    }
+  }
+  fclose(libf);
+
+  /* memory added by realloc() is not zeroed, so terminate the data
+     explicitly before scanning it with strchr() */
+  acc_buff_max = acc_buff_p + buff_siz;
+  *acc_buff_max = '\0';
+
+  /* convert all the ACC lines (with \n) to null-terminated and
+     count the number of acc's */
+
+  acc_cnt = 0;
+  acc_buff_p = acc_buff;
+  while (acc_buff_p < acc_buff_max && (bp = strchr(acc_buff_p,'\n'))!=NULL) {
+    *bp = '\0';
+    acc_cnt++;
+    acc_buff_p = bp+1;
+  }
+  /* a final line without a trailing newline is still an entry */
+  if (acc_buff_p < acc_buff_max && *acc_buff_p != '\0') acc_cnt++;
+
+  /* allocate the acc_list */
+  if ((acc_list=(char **)calloc(acc_cnt+1, sizeof(char *)))==NULL) {
+    fprintf(stderr," cannot allocate acc_list[%d]\n",acc_cnt+1);
+    free(acc_buff);
+    free(sel_acc_ptr);
+    return NULL;
+  }
+
+  /* now load acc_list[].  The start of the next line is computed
+     before '\r' is removed: stripping it first shortens the string
+     and would leave the walk pointing at the old '\n' position,
+     producing an empty entry after every line of a CRLF file */
+  for (i=0, acc_buff_p=acc_buff; i<acc_cnt; i++) {
+    acc_list[i] = acc_buff_p;
+    next_p = acc_buff_p + strlen(acc_buff_p) + 1;
+    if ((bp1=strchr(acc_buff_p,'\r'))!=NULL) {*bp1 = '\0';}
+    acc_buff_p = next_p;
+  }
+
+  /* finally put everything in the structure to be returned */
+  sel_acc_ptr->acc_buff = acc_buff;
+  sel_acc_ptr->acc_list = acc_list;
+  sel_acc_ptr->curr_entry = 0;
+  sel_acc_ptr->max_entry = acc_cnt;
+  return (void *)sel_acc_ptr;
+}
+
+/* sel_acc_libstr() - sorted-list accession selector.
+
+   libstr is first cut, in place, at the first occurrence of the
+   fmt_term character.  It is then compared (up to MAX_UID characters)
+   with the current entry of the accession list.
+
+   Returns 1 on a match, after advancing to the next entry; 0 when
+   libstr is not the current entry; -1 once every entry has been
+   matched.  gi is not used. */
+int sel_acc_libstr(char *libstr, int gi, void *ptr) {
+  struct sel_acc_str *sel = (struct sel_acc_str *)ptr;
+  char *wanted;
+  char *term_p;
+
+  if (sel->curr_entry >= sel->max_entry) {
+    return -1;
+  }
+
+  term_p = strchr(libstr,sel->fmt_term);
+  if (term_p != NULL) {
+    *term_p = '\0';
+  }
+
+  wanted = sel->acc_list[sel->curr_entry];
+
+  /* two cheap character tests before the full comparison */
+  if (libstr[2] != wanted[2]) return 0;
+  if (libstr[1] != wanted[1]) return 0;
+  if (strncmp(libstr,wanted,MAX_UID) != 0) return 0;
+
+  sel->curr_entry++;
+  return 1;
+}
+
+/* sel_acc_gi_init() - allocate a sel_acc_str and load a list of
+   integer identifiers, one per line, from the already open file libf.
+   Each line is converted with atoi().  The list grows in steps of
+   64000 entries; if it cannot grow, reading stops with what has been
+   loaded so far.  libf is closed before returning.  acc_off is not
+   used.
+
+   Returns the new structure (as void *), or NULL if the structure or
+   the initial list cannot be allocated. */
+void *sel_acc_gi_init(FILE *libf, int *acc_off, char fmt_term) {
+  struct sel_acc_str *sel;
+  char line[MAX_STR];
+  int *ids;
+  int *grown;
+  int n_ids;
+  int chunk;		/* entries added per reallocation */
+  int capacity;		/* entries currently allocated */
+
+  sel = (struct sel_acc_str *)calloc(1,sizeof(struct sel_acc_str));
+  if (sel == NULL) {
+    fprintf(stderr, "Cannot allocate struct sel_acc_str\n");
+    return NULL;
+  }
+
+  sel->fmt_term = fmt_term;
+
+  capacity = chunk = 64000;
+
+  ids = (int *)calloc(capacity, sizeof(int));
+  if (ids == NULL) {
+    fprintf(stderr, "Cannot allocate acc buff %d\n",capacity);
+    free(sel);
+    return NULL;
+  }
+
+  /* read one identifier per line, growing the list whenever it fills */
+  n_ids = 0;
+  for (;;) {
+    if (fgets(line, sizeof(line), libf) == NULL) break;
+    ids[n_ids] = atoi(line);
+    n_ids++;
+    if (n_ids < capacity) continue;
+
+    grown = realloc(ids,(capacity + chunk)*sizeof(int));
+    if (grown == NULL) {
+      fprintf(stderr,"cannot realloc gi_list[%d]\n",capacity+chunk);
+      break;
+    }
+    capacity += chunk;
+    ids = grown;
+  }
+  fclose(libf);
+
+  sel->gi_list = ids;
+  sel->curr_entry = 0;
+  sel->max_entry = n_ids;
+  return (void *)sel;
+}
+
+/* sel_acc_gi() - sorted-list integer-identifier selector.
+
+   When gi is not positive and libstr is non-NULL, the identifier is
+   taken from libstr with atoi().
+
+   Returns 1 if the identifier equals the current list entry, after
+   advancing to the next entry; 0 if it does not; -1 once every entry
+   has been matched. */
+int sel_acc_gi(char *libstr, int gi, void *ptr) {
+  struct sel_acc_str *sel = (struct sel_acc_str *)ptr;
+
+  if (sel->curr_entry >= sel->max_entry) {
+    return -1;
+  }
+
+  if (gi <= 0 && libstr) {
+    gi = atoi(libstr);
+  }
+
+  if (gi != sel->gi_list[sel->curr_entry]) {
+    return 0;
+  }
+
+  sel->curr_entry++;
+  return 1;
+}
+
+/* this version of the selection algorithm does not require sorted
+ lists. It hashes the initial list, and uses the hash table to
+ lookup the library sequences.
+
+*/
+
+#define HASH_TABLE_MULT 8
+
+/* sel_hacc_libstr_init() - allocate a sel_acc_str, load the list of
+   accessions (one per line) from the already open file libf, and
+   build a chained hash table over them so that sel_hacc_libstr() can
+   look accessions up in any order.
+
+   A trailing '\r' is stripped from every line and a last line without
+   a newline is kept.  libf is closed before returning.  acc_off is
+   not used.
+
+   Returns the new structure (as void *), or NULL if an allocation
+   fails; nothing allocated here is left behind in that case. */
+void *sel_hacc_libstr_init(FILE *libf, int *acc_off, char fmt_term) {
+  struct sel_acc_str *sel_acc_ptr;
+  char *bp, *bp1;
+  char *acc_buff;
+  char *acc_buff_max;	/* end of the data that was read */
+  char *new_buff;	/* result of realloc() */
+  char *acc_buff_p;
+  char *next_p;
+  char **acc_list;
+  int acc_cnt, i;
+  int new_buff_siz;	/* bytes requested per fread() */
+  int abuff_siz;	/* bytes available for data */
+  int buff_siz;		/* bytes returned by the last fread() */
+  int hash_mask, hash_max;
+  int hash_val, *acc_hash, *acc_hash_link;
+  int link_save;
+
+  if ((sel_acc_ptr = (struct sel_acc_str *)calloc(1,sizeof(struct sel_acc_str)))==NULL) {
+    fprintf(stderr, "Cannot allocate struct sel_acc_str\n");
+    return NULL;
+  }
+
+  sel_acc_ptr->fmt_term = fmt_term;
+
+  /* now allocate some space for the ACC's; the allocation is always
+     one byte larger than abuff_siz so the data can be NUL-terminated */
+
+  abuff_siz = new_buff_siz = 640000;
+
+  if ((acc_buff = (char *)calloc(abuff_siz+1, sizeof(char)))==NULL) {
+    fprintf(stderr, "Cannot allocate acc buff %d\n",abuff_siz);
+    free(sel_acc_ptr);
+    return NULL;
+  }
+
+  /* now iteratively read and reallocate space for buffer until its all read */
+
+  acc_buff_p = acc_buff;
+  while ((buff_siz = fread(acc_buff_p, sizeof(char), new_buff_siz, libf))==new_buff_siz) {
+    if ((new_buff = realloc(acc_buff, (size_t)(abuff_siz+new_buff_siz+1)))==NULL) {
+      fprintf(stderr, " cannot reallocate for acc_buf[%d]\n",abuff_siz);
+      break;
+    }
+    else {
+      acc_buff = new_buff;
+      acc_buff_p = acc_buff + abuff_siz;
+      abuff_siz += new_buff_siz;
+    }
+  }
+  fclose(libf);
+
+  /* memory added by realloc() is not zeroed, so terminate the data
+     explicitly before scanning it with strchr() */
+  acc_buff_max = acc_buff_p + buff_siz;
+  *acc_buff_max = '\0';
+
+  /* now convert all the ACC lines (with \n) to null-terminated and
+     count the number of acc's */
+
+  acc_cnt = 0;
+  acc_buff_p = acc_buff;
+  while (acc_buff_p < acc_buff_max && (bp = strchr(acc_buff_p,'\n'))!=NULL) {
+    *bp = '\0';
+    acc_cnt++;
+    acc_buff_p = bp+1;
+  }
+  /* a final line without a trailing newline is still an entry */
+  if (acc_buff_p < acc_buff_max && *acc_buff_p != '\0') acc_cnt++;
+
+  /* allocate the acc_list */
+  if ((acc_list=(char **)calloc(acc_cnt+1, sizeof(char *)))==NULL) {
+    fprintf(stderr," cannot allocate acc_list[%d]\n",acc_cnt+1);
+    free(acc_buff);
+    free(sel_acc_ptr);
+    return NULL;
+  }
+
+  /* now load acc_list[].  The start of the next line is computed
+     before '\r' is removed: stripping it first shortens the string
+     and would leave the walk pointing at the old '\n' position,
+     producing an empty entry after every line of a CRLF file */
+  for (i=0, acc_buff_p=acc_buff; i<acc_cnt; i++) {
+    acc_list[i] = acc_buff_p;
+    next_p = acc_buff_p + strlen(acc_buff_p) + 1;
+    if ((bp1=strchr(acc_buff_p,'\r'))!=NULL) {*bp1 = '\0';}
+    acc_buff_p = next_p;
+  }
+
+  /* allocate the hash for the acc_list - the table size is the first
+     power of 2 (starting at 4096) above HASH_TABLE_MULT * acc_cnt */
+  for (hash_max = 4096; hash_max <= HASH_TABLE_MULT * acc_cnt; hash_max *= 2);
+  hash_mask = hash_max - 1;
+
+  if ((acc_hash = (int *)calloc(hash_max, sizeof(int)))==NULL) {
+    fprintf(stderr, "cannot allocate acc_hash[%ld]\n",(long)(hash_max*sizeof(int)));
+    free(acc_list);
+    free(acc_buff);
+    free(sel_acc_ptr);
+    return NULL;
+  }
+
+  /* allocate the acc_list link table: one int per entry, 1-based */
+  if ((acc_hash_link = (int *)calloc(acc_cnt+1,sizeof(int)))==NULL) {
+    fprintf(stderr, "cannot allocate acc_hash_link[%ld]\n",(long)((acc_cnt+1)*sizeof(int)));
+    free(acc_hash);
+    free(acc_list);
+    free(acc_buff);
+    free(sel_acc_ptr);
+    return NULL;
+  }
+
+  /* insert every entry at the head of its bucket's chain; indices are
+     stored 1-based so that 0 can mean "empty" / "end of chain" */
+  for (i=0; i<acc_cnt; i++) {
+    hash_val = hash_func(acc_list[i]) & hash_mask;
+    if ((link_save = acc_hash[hash_val]) != 0) {
+      acc_hash_link[i+1]=link_save;
+    }
+    acc_hash[hash_val] = i+1;
+  }
+
+  /* finally put everything in the structure to be returned */
+  sel_acc_ptr->acc_buff = acc_buff;
+  sel_acc_ptr->acc_list = acc_list;
+  sel_acc_ptr->curr_entry = 0;
+  sel_acc_ptr->max_entry = acc_cnt;
+  /* hash stuff */
+  sel_acc_ptr->acc_hash = acc_hash;
+  sel_acc_ptr->acc_hash_link = acc_hash_link;
+  sel_acc_ptr->hash_mask = hash_mask;
+
+  return (void *)sel_acc_ptr;
+}
+
+/* sel_hacc_libstr() - hash-table accession selector.
+
+   libstr is first cut, in place, at the first occurrence of the
+   fmt_term character, then hashed and compared (up to MAX_UID
+   characters) with every accession in the matching bucket's chain.
+
+   Returns 1 if libstr is in the table, 0 if it is not, and -1 when
+   curr_entry >= max_entry (curr_entry is never advanced here).  gi is
+   not used. */
+int sel_hacc_libstr(char *libstr, int gi, void *ptr) {
+  struct sel_acc_str *sel = (struct sel_acc_str *)ptr;
+  char **names;
+  char *term_p;
+  char *cand;
+  int bucket;
+  int slot;		/* 1-based index into names[]; 0 ends the chain */
+
+  if (sel->curr_entry >= sel->max_entry) {
+    return -1;
+  }
+
+  term_p = strchr(libstr,sel->fmt_term);
+  if (term_p != NULL) {
+    *term_p = '\0';
+  }
+
+  bucket = hash_func(libstr) & sel->hash_mask;
+  names = sel->acc_list;
+
+  slot = sel->acc_hash[bucket];
+  while (slot > 0) {
+    cand = names[slot-1];
+    /* two cheap character tests before the full comparison */
+    if (libstr[2] == cand[2] && libstr[1] == cand[1] &&
+        strncmp(libstr,cand,MAX_UID)==0) {
+      return 1;
+    }
+    slot = sel->acc_hash_link[slot];
+  }
+  return 0;
+}
+
+/* sel_hacc_gi_init() - allocate a sel_acc_str, load a list of integer
+   identifiers (one per line, converted with atoi()) from the already
+   open file libf, and build a chained hash table over them so that
+   sel_hacc_gi() can look identifiers up in any order.
+
+   The list grows in steps of 64000 entries; if it cannot grow,
+   reading stops with what has been loaded so far.  libf is closed
+   before returning.  acc_off is not used.
+
+   Returns the new structure (as void *), or NULL if an allocation
+   fails; nothing allocated here is left behind in that case. */
+void *sel_hacc_gi_init(FILE *libf, int *acc_off, char fmt_term) {
+  struct sel_acc_str *sel_acc_ptr;
+  char acc_line[MAX_STR];
+  int *gi_list;
+  int *new_buff;	/* result of realloc() */
+  int acc_cnt, i;
+  int new_buff_siz;	/* entries added per reallocation */
+  int abuff_siz;	/* entries currently allocated */
+  int hash_val, hash_max, hash_mask, link_save;
+  int *gi_hash, *gi_hash_link;
+
+  if ((sel_acc_ptr = (struct sel_acc_str *)calloc(1,sizeof(struct sel_acc_str)))==NULL) {
+    fprintf(stderr, "Cannot allocate struct sel_acc_str\n");
+    return NULL;
+  }
+
+  sel_acc_ptr->fmt_term = fmt_term;
+
+  /* now allocate some space for the ACC's */
+
+  abuff_siz = new_buff_siz = 64000;
+
+  if ((gi_list = (int *)calloc(abuff_siz, sizeof(int)))==NULL) {
+    fprintf(stderr, "Cannot allocate acc buff %d\n",abuff_siz);
+    free(sel_acc_ptr);
+    return NULL;
+  }
+
+  /* now iteratively read and reallocate space for buffer until its all read */
+
+  acc_cnt = 0;
+  while (fgets(acc_line, sizeof(acc_line), libf)!=NULL) {
+    gi_list[acc_cnt++] = atoi(acc_line);
+    if (acc_cnt >= abuff_siz) {
+      if ((new_buff = realloc(gi_list,(abuff_siz + new_buff_siz)*sizeof(int)))==NULL) {
+        fprintf(stderr,"cannot realloc gi_list[%d]\n",abuff_siz+new_buff_siz);
+        break;
+      }
+      else {
+        abuff_siz += new_buff_siz;
+        gi_list = new_buff;
+      }
+    }
+  }
+  fclose(libf);
+
+  /* allocate the hash for the gi_list - the table size is the first
+     power of 2 (starting at 4096) above HASH_TABLE_MULT * acc_cnt */
+  for (hash_max = 4096; hash_max <= HASH_TABLE_MULT * acc_cnt; hash_max *= 2);
+  hash_mask = hash_max - 1;
+
+  if ((gi_hash = (int *)calloc(hash_max, sizeof(int)))==NULL) {
+    fprintf(stderr, "cannot allocate gi_hash[%ld]\n",(long)(hash_max*sizeof(int)));
+    free(gi_list);
+    free(sel_acc_ptr);
+    return NULL;
+  }
+
+  /* allocate the gi_list link table: one int per entry, 1-based */
+  if ((gi_hash_link = (int *)calloc(acc_cnt+1,sizeof(int)))==NULL) {
+    fprintf(stderr, "cannot allocate gi_hash_link[%ld]\n",(long)((acc_cnt+1)*sizeof(int)));
+    free(gi_hash);
+    free(gi_list);
+    free(sel_acc_ptr);
+    return NULL;
+  }
+
+  /* insert every entry at the head of its bucket's chain; indices are
+     stored 1-based so that 0 can mean "empty" / "end of chain" */
+  for (i=0; i<acc_cnt; i++) {
+    hash_val = fast_hash32(gi_list[i]) & hash_mask;
+    if (gi_hash[hash_val] != 0) {
+      link_save = gi_hash[hash_val];
+      gi_hash_link[i+1]=link_save;
+    }
+    gi_hash[hash_val] = i+1;
+  }
+
+  /* finally put everything in the structure to be returned */
+  sel_acc_ptr->gi_list = gi_list;
+  sel_acc_ptr->curr_entry = 0;
+  sel_acc_ptr->max_entry = acc_cnt;
+
+  sel_acc_ptr->acc_hash = gi_hash;
+  sel_acc_ptr->acc_hash_link = gi_hash_link;
+  sel_acc_ptr->hash_mask = hash_mask;
+
+  return (void *)sel_acc_ptr;
+}
+
+/* sel_hacc_gi() - hash-table integer-identifier selector.
+
+   When gi is not positive and libstr is non-NULL, the identifier is
+   taken from libstr with atoi().
+
+   Returns 1 if the identifier is in the table, 0 if it is not, and -1
+   when curr_entry >= max_entry (curr_entry is never advanced here). */
+int sel_hacc_gi(char *libstr, int gi, void *ptr) {
+  struct sel_acc_str *sel = (struct sel_acc_str *)ptr;
+  int *ids;
+  int bucket;
+  int slot;		/* 1-based index into ids[]; 0 ends the chain */
+
+  if (sel->curr_entry >= sel->max_entry) {
+    return -1;
+  }
+
+  if (gi <= 0 && libstr) {
+    gi = atoi(libstr);
+  }
+
+  bucket = fast_hash32(gi) & sel->hash_mask;
+  ids = sel->gi_list;
+
+  slot = sel->acc_hash[bucket];
+  while (slot > 0) {
+    if (ids[slot-1] == gi) {
+      return 1;
+    }
+    slot = sel->acc_hash_link[slot];
+  }
+  return 0;
+}
+
+
+/* hash_func() - hash a NUL-terminated string to an unsigned int.
+   Every character is added to the running value and mixed with a
+   shift-add / shift-xor pair; three further mixing steps are applied
+   at the end.  The empty string hashes to 0.
+   adapted from http://burtleburtle.net/bob/hash/doobs.html */
+unsigned int
+hash_func(char *key)
+{
+  unsigned int h = 0;
+  char *p;
+
+  for (p = key; *p != '\0'; p++) {
+    h += *p;
+    h += (h << 10);
+    h ^= (h >> 6);
+  }
+
+  /* final mixing */
+  h += (h << 3);
+  h ^= (h >> 11);
+  h += (h << 15);
+
+  return h;
+}
+
+/* fast_hash32() - mix the bits of a 32-bit value into a hash.
+
+   All arithmetic is done on unsigned int: with the signed
+   intermediates used previously the left shifts could overflow, which
+   is undefined, and the right shifts of negative values were
+   implementation-defined.  The function is used both to build and to
+   probe the in-memory gi hash table, so only consistency within one
+   run matters.  fast_hash32(0) is 0. */
+unsigned int
+fast_hash32 (unsigned int data) {
+  unsigned int tmp, hash;
+
+  hash = data >> 16;
+  tmp = ((data & 0xFFFF) << 11) ^ hash;
+  hash = (hash << 16) ^ tmp;
+  hash += hash >> 11;
+
+  /* Force "avalanching" of final 127 bits */
+  hash ^= hash << 3;
+  hash += hash >> 5;
+  hash ^= hash << 4;
+  hash += hash >> 17;
+  hash ^= hash << 25;
+  hash += hash >> 6;
+
+  return hash;
+}
+
+/* alloc_file_name() - return a freshly allocated copy of the file
+   name string f_name.  The process exits with status 1, after
+   printing a message to stderr, if the copy cannot be allocated. */
+char *alloc_file_name(char *f_name) {
+  int fn_len = strlen(f_name);
+  char *alloc_f_name = calloc(fn_len+1,sizeof(char));
+
+  if (alloc_f_name != NULL) {
+    /* the buffer is zero-filled, so the copy is already terminated */
+    strncpy(alloc_f_name,f_name,fn_len);
+    return alloc_f_name;
+  }
+
+  fprintf(stderr, "Cannot allocate %d space for %s\n",
+          fn_len+1, f_name);
+  exit(1);
+}
diff --git a/src/param.h b/src/param.h
new file mode 100644
index 0000000..e4deca5
--- /dev/null
+++ b/src/param.h
@@ -0,0 +1,251 @@
+
+/* $Id: param.h 1233 2013-10-08 18:26:31Z wrp $ */
+/* $Revision: 1233 $ */
+
+#include <sys/types.h>
+
+#ifndef P_STRUCT
+#define P_STRUCT
+
+#define MAXSQ 60
+
+/* Concurrent read version */
+
+struct fastr { /* one alternative of the pstruct.param_u union (member fa) */
+ int ktup;
+ int cgap;
+ int pgap;
+ int pamfact;
+ int scfact;
+ /* these values will soon be abandoned */
+ int bestoff;
+ int bestscale;
+ int bkfact;
+ int bktup;
+ int bestmax;
+ /* statistics based scaling values */
+ int use_E_thresholds;
+ double E_join, E_band_opt;
+ int altflag;
+ int optflag;
+ int iniflag;
+ int optcut;
+ int optcut_set; /* NOTE(review): presumably non-zero once optcut was set explicitly - confirm */
+ int optwid;
+ int optwid_set; /* NOTE(review): presumably non-zero once optwid was set explicitly - confirm */
+};
+
+struct prostr { /* one alternative of the pstruct.param_u union (member pr) */
+ int gopen;
+ int gextend;
+ int width;
+};
+
+/* this definition must stay identical to the one in thr_bufs.h */
+struct score_count_s {
+ long s_cnt[3];
+ long tot_scores;
+};
+
+struct pstruct /* parameters */
+{
+ int n0; /* length of query sequence, used for statistics */
+ int gdelval; /* value gap open (-10) */
+ int ggapval; /* value for additional residues in gap (-2) */
+ int gshift; /* frameshift for fastx, fasty */
+ int gsubs; /* nt substitution in fasty */
+ int p_d_mat; /* dna match penalty */
+ int p_d_mis; /* dna mismatch penalty */
+ int p_d_set; /* using match/mismatch */
+ int n1_low;
+ int n1_high; /* sequence length limits */
+ int score_ix; /* index to sorted score */
+ int show_ident; /* flag - show identical lalign alignment */
+ int nseq; /* number of different sequences (for lalign) */
+ int zsflag; /* use scalebest() */
+ int zsflag2; /* statistics for best shuffle */
+ int zsflag_f; /* use scalebest() */
+ int zs_win; /* window shuffle size */
+ int shuffle_dna3; /* shuffle dna as codons */
+ int histint; /* histogram interval */
+ unsigned char sq[MAXSQ+1];
+ int hsq[MAXSQ+1];
+ int nsq; /* length of normal sq */
+ /* int pamh1[MAXSQ+1]; */ /* identical match score (diagonal scores) */
+ /* int *pamh2[MAXSQ+1]; */ /* ktup match score */
+ int ext_sq_set; /* flag for using extended alphabet */
+ unsigned char sqx[MAXSQ+1];
+ int hsqx[MAXSQ+1];
+ int c_nt[MAXSQ+1];
+ int nsqx; /* length of extended sq */
+ int nsq_e; /* effective nsq */
+ int dnaseq; /* -1 = not set (protein); 0 = protein; 1 = DNA; 2 = other, 3 RNA */
+ int nt_align; /* DNA/RNA alignment = 1 */
+ int debug_lib;
+ int tr_type; /* codon table */
+ int sw_flag;
+ char pamfile[MAX_FN]; /* pam file name */
+ char pamfile_save[MAX_FN]; /* original pam file */
+ char pam_name[MAX_FN];
+ char pgpfile[MAX_FN];
+ int pgpfile_type;
+ float pamscale; /* ln(2)/3 or ln(2)/2 */
+ float ulambda; /* ungapped lambda */
+ float entropy; /* bits/position */
+ float tfract_id; /* target fraction id */
+ int pam_pssm;
+ int pam_set;
+ int pam_variable;
+ int have_pam2;
+ int **pam2[2]; /* set of 2D scoring matrices; [0] lower-case 'x', [1] upper/lower case */
+ int **pam2p[2];
+ int pamoff; /* offset for pam values */
+ int pam_l, pam_h, pam_xx, pam_xm; /* lowest, highest pam value */
+ int pam_x_set;
+ int pam_x_id_sim; /* =0 -> 'N,X' identical but not similar;
+ =1 -> 'N,X' identical+similar;
+ <0 -> 'N,X' not identical, not similar */
+ int pam_ms; /* use a Mass Spec pam matrix */
+ void *fp_struct; /* function specific parameters based on algorithm/scoring matrix */
+ int LK_set;
+ double pLambda, pK, pH; /* Karlin-Altschul parameters */
+ int maxlen;
+ int max_repeat; /* used for repeat count in ssearch34/lalign */
+ int repeat_thresh;
+ char *other_info;
+ double e_cut; /* cutoff for scores */
+ double e_cut_r; /* cutoff for multiple local alignments */
+ double zs_off; /* z-score offset from sampling */
+ int do_rep; /* enable multiple alignments */
+ int can_pre_align; /* flag for have_ares & 0x1 pre-alignments */
+ long zdb_size; /* force database size */
+ int zdb_size_set; /* flag for user -Z */
+ int pgm_id;
+ int pseudocts;
+ int shuff_node;
+ union { /* algorithm-specific parameters: fa or pr */
+ struct fastr fa;
+ struct prostr pr;
+ } param_u;
+};
+
+#include "rstruct.h"
+
+/* the seq_record has all the invariant data about a sequence -
+ sequence length, libstr, sequence itself, etc.
+ it does not have the results information
+ we can have 1, 2, or 6 (obsolete tfasta) results records for a sequence,
+ but there will still be only one sequence record.
+*/
+
+struct annot_str {
+ /* information for conventional annotations */
+ unsigned char *aa1_ann; /* annotation string */
+ /* information for "rich" annotations */
+ int n_annot; /* length of annot_arr_p array */
+ int n_domains; /* NOTE(review): said to be the length of domain_arr_p, which is not a member here - confirm */
+ struct annot_entry *annot_arr_p; /* array[n_annot] of annot_entry's for all annotations */
+ struct annot_entry **s_annot_arr_p; /* sorted version of annots */
+};
+
+/* annot_entry keeps information on one "rich" annotation: position, type, value */
+struct annot_entry {
+ long pos;
+ long end;
+ char label; /* currently -V *#%!@ symbols, plus 'V' for variant */
+ unsigned char value; /* must be amino acid residue, binary encoded */
+ char *comment;
+ int target; /* 0 for query/ 1 for library */
+};
+
+/* domfeat_data is a linked-list node tying an annot_entry to its positions and per-region scores */
+struct domfeat_data {
+ struct annot_entry *annot_entry_p;
+ struct domfeat_data *next;
+ long pos; /* annotation position */
+ long a_pos; /* aligned annotation position */
+ long end_pos; /* domain annotation end */
+ int score; /* score of current region */
+ int n_ident; /* count for percent id */
+ int n_alen; /* align len for percent id */
+ int n_gaplen; /* number of gap residues in alignment */
+};
+
+/* seq_record has the data required to do a calculation */
+struct seq_record {
+ unsigned char *aa1b; /* sequence buffer */
+ struct annot_str *annot_p; /* annotations attached to this sequence */
+ int n1; /* NOTE(review): presumably the sequence length - confirm */
+ long l_offset; /* q_offset/l_offset set outside getlib() based on chunks; 0-based */
+ long l_off; /* q_off/l_off comes from @C:123, and is 1-based */
+ int index; /* index in search */
+#ifdef DEBUG
+ long adler32_crc; /* only present in DEBUG builds */
+#endif
+};
+
+/* mseq_record has meta data not required to calculate score or alignment */
+struct mseq_record {
+ int *n1tot_p;
+#ifdef USE_FSEEKO
+ off_t lseek; /* type of the offset depends on USE_FSEEKO */
+#else
+ long lseek;
+#endif
+ struct lmf_str *m_file_p; /* library file descriptor structure */
+ int cont;
+ char libstr[MAX_UID];
+ char *bline;
+ int bline_max; /* NOTE(review): presumably the capacity of bline - confirm */
+ int annot_req_flag;
+ int index /* index in search */;
+#ifdef DEBUG
+ long adler32_crc; /* only present in DEBUG builds */
+#endif
+};
+
+struct seqr_chain { /* one node of a singly linked list (see next) */
+ struct seq_record *seqr_base;
+ struct mseq_record *mseqr_base;
+ struct seqr_chain *next; /* following node in the chain */
+ /* struct lib_seq_info *ldb_info; */
+ int max_chain_seqs;
+ int cur_seq_cnt;
+ unsigned char *aa1b_base;
+ int aa1b_size;
+ int aa1b_next;
+ int contiguous;
+};
+
+struct getlib_str {
+ int lcont; /* lcont save */
+ int ocont; /* ocont save */
+ int eof; /* done with this file */
+#ifdef USE_FSEEKO
+ off_t lseek; /* type of the offset depends on USE_FSEEKO */
+#else
+ long lseek;
+#endif
+ long loffset; /* loffset save */
+ char libstr[MAX_UID]; /* repository for libstr */
+ int n_libstr; /* length of libstr */
+ unsigned char *aa1save; /* overlapping sequence save */
+ struct lib_struct *lib_list_p;
+ int *n1tot_ptr, *n1tot_cur;
+ int n1tot_cnt;
+ int n1tot_v;
+ long tot_memK; /* cumulative amount of memory allocated for aa1b;
+ used to limit memory use */
+ long max_memK; /* allow separate memory limits for main,link
+ searches */
+ long lost_memK; /* check for waste */
+ struct seqr_chain *start_seqr_chain;
+ struct seqr_chain *cur_seqr_chain;
+ int use_memory;
+};
+
+#endif /* P_STRUCT */
+
+#ifndef A_STRUCT
+#include "aln_structs.h"
+#endif
diff --git a/src/pcomp_bufs.h b/src/pcomp_bufs.h
new file mode 100644
index 0000000..4d147fa
--- /dev/null
+++ b/src/pcomp_bufs.h
@@ -0,0 +1,33 @@
+
+/***************************************/
+/* thread global variable declarations */
+/***************************************/
+
+/* $Id: pcomp_bufs.h 625 2011-03-23 17:21:38Z wrp $ */
+/* $Revision: 625 $ */
+
+/* this file serves the same purpose as thr_bufs2.h in the threaded version
+
+ thr_bufs2.h has two separate lists of buffers, reader_buf[] and
+ worker_buf[], so that the threads can be asynchronous.
+
+ pcomp workers are asynchronous as well, but, initially, there is an
+ array that has an entry for each worker (worker_buf[]), and a queue
+ that captures which buffers are ready (work_q[]).
+ */
+
+#ifndef MAX_WORKERS
+#define MAX_WORKERS 2 /* default, used only when MAX_WORKERS is not already defined */
+#endif
+
+#ifndef XTERNAL /* XTERNAL undefined: this translation unit defines the storage */
+struct buf_head **worker_buf; /* pointers to buffers of sequences/results; pcomp_subs2.c indexes it by node-FIRSTNODE */
+int *work_q; /* next worker available; entries are worker node numbers */
+int max_worker_q; /* number of entries allocated for work_q[] (set in init_thr()) */
+int num_reader_bufs; /* NOTE(review): not referenced by pcomp_subs2.c in this patch -- confirm use */
+#else /* XTERNAL defined: declarations only */
+extern struct buf_head **worker_buf;
+extern int *work_q;
+extern int max_worker_q;
+extern int num_reader_bufs;
+#endif
diff --git a/src/pcomp_subs2.c b/src/pcomp_subs2.c
new file mode 100644
index 0000000..9752d72
--- /dev/null
+++ b/src/pcomp_subs2.c
@@ -0,0 +1,686 @@
+/* $Id: pcomp_subs2.c $ */
+
+/* copyright (c) 1996, 1997, 1998, 1999, 2014 by William R. Pearson and
+ The Rector & Visitors of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+/* modified to do more initialization of work_info here, rather than
+ in main() */
+
+/* this file provides the same functions for PCOMPLIB as pthr_subs2.c does for COMP_THR */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <sys/types.h>
+#ifdef UNIX
+#include <unistd.h>
+#endif
+#include <signal.h>
+
+#include "defs.h"
+#include "structs.h" /* mngmsg, libstruct */
+#include "param.h" /* pstruct, thr_str, buf_head, rstruct */
+#include "thr_buf_structs.h"
+
+#ifdef MPI_SRC
+#include "mpi.h"
+#endif
+
+#include "msg.h"
+#include "pcomp_bufs.h"
+#define XTERNAL
+#include "uascii.h"
+#undef XTERNAL
+#include "pthr_subs.h"
+
+#ifdef DEBUG
+unsigned long adler32(unsigned long, const unsigned char *, unsigned int);
+#endif
+
+static int next_worker_idx, num_workers_idle; /* work_q[] slot used by the next get_rbuf(); count of workers without outstanding work */
+extern int g_worker; /* defined elsewhere; NOTE(review): not referenced by the functions visible in this file section */
+
+/* used for debugging */
+/*
+int check_seq_range(unsigned char *aa1b, int n1, int nsq, char *);
+*/
+
+/* start the workers, nworkers == number of workers, not nodes; aa0==NULL sends only an all-zero STARTTYPE0 message */
+void
+init_thr(int nworkers, char *info_lib_range_p, const struct mngmsg *m_msp, struct pstruct *ppst,
+ unsigned char *aa0, struct mng_thr *m_bufi_p)
+{
+#ifdef MPI_SRC
+ MPI_Status mpi_status;
+ int int_msg_b[4]; /* general purpose buffer for integers */
+#endif
+ int node, snode;
+
+ /* start the worker processes */
+
+ if (work_q == NULL) {
+ if ((work_q=(int *)calloc(nworkers, sizeof(int)))==NULL) {
+ fprintf(stderr, " cannot allocate work_q[%d] structure\n",
+ nworkers);
+ exit(1);
+ }
+ else {max_worker_q = nworkers;}
+ }
+ num_workers_idle = 0;
+
+ /* setup thread buffer info */
+ if (aa0 == NULL) {
+ int_msg_b[0] = int_msg_b[1] = int_msg_b[2] = int_msg_b[3] = 0; /* all 4 ints are sent below, so set all 4 */
+ }
+ else {
+ int_msg_b[0] = nworkers;
+ int_msg_b[1] = m_bufi_p->max_buf2_res;
+ int_msg_b[2] = m_bufi_p->max_chain_seqs;
+ int_msg_b[3] = m_bufi_p->seq_buf_size;
+ }
+
+ /* send thread info */
+ for (node=FIRSTNODE; node < nworkers+FIRSTNODE; node++) {
+
+ MPI_Send(int_msg_b, 4, MPI_INT, node, STARTTYPE0, MPI_COMM_WORLD);
+
+ if (aa0 == NULL) { continue;} /* no query: the zeroed message is all this worker gets */
+
+ /* send mngmsg */
+ MPI_Send((void *)m_msp, sizeof(struct mngmsg), MPI_BYTE, node,
+ STARTTYPE1, MPI_COMM_WORLD);
+
+ MPI_Send(ppst, sizeof(struct pstruct), MPI_BYTE, node,
+ STARTTYPE2, MPI_COMM_WORLD);
+
+ /* send both pam2 matrices (pam2[0], pam2[1]), pamd1*pamd2 ints each */
+ MPI_Send(&ppst->pam2[0][0][0],m_msp->pamd1*m_msp->pamd2,MPI_INT,node,STARTTYPE3,
+ MPI_COMM_WORLD);
+ MPI_Send(&ppst->pam2[1][0][0],m_msp->pamd1*m_msp->pamd2,MPI_INT,node,STARTTYPE3,
+ MPI_COMM_WORLD);
+
+ /* send pascii (only for fasty/tfasty) */
+ MPI_Send(pascii, sizeof(aascii), MPI_BYTE, node, STARTTYPE4, MPI_COMM_WORLD);
+ }
+
+ if (aa0 == NULL) {
+ /* all done */
+ free(work_q); work_q = NULL; /* NULL so the work_q == NULL test above re-allocates on a later call */
+ return;
+ }
+
+ /* wait for returned status results */
+ while (num_workers_idle < max_worker_q) {
+ MPI_Recv(&node, 1, MPI_INT, MPI_ANY_SOURCE,MSEQTYPE0,
+ MPI_COMM_WORLD, &mpi_status);
+ snode= mpi_status.MPI_SOURCE;
+ if (snode == FIRSTNODE) { /* only the first worker also returns the MAX_FN-byte library range string */
+ MPI_Recv(info_lib_range_p, MAX_FN, MPI_BYTE, snode,MSEQTYPE0,
+ MPI_COMM_WORLD, &mpi_status);
+ }
+
+ if (snode != node) { /* the node number in the message should equal the MPI source */
+ fprintf(stderr, " initial node mismatch [%d!=%d]\n",node, snode);
+ }
+ worker_buf[snode-FIRSTNODE]->hdr.have_data = 0;
+ worker_buf[snode-FIRSTNODE]->hdr.have_results = 0;
+ worker_buf[snode-FIRSTNODE]->hdr.worker_idx = snode;
+ work_q[num_workers_idle++] = snode; /* workers are queued in the order they check in */
+ }
+ next_worker_idx = 0;
+
+ /* send query sequence info to workers */
+ for (node=FIRSTNODE; node < nworkers+FIRSTNODE; node++) {
+ /* send thread buffer info */
+ int_msg_b[0] = m_msp->n0;
+ int_msg_b[1] = m_msp->nm0;
+ MPI_Send(int_msg_b, 2, MPI_INT, node, QSEQTYPE0, MPI_COMM_WORLD);
+ MPI_Send(aa0, m_msp->n0+1, MPI_BYTE, node, QSEQTYPE1, MPI_COMM_WORLD);
+ if (m_msp->ann_flg && m_msp->aa0a) { /* query annotation, n0+2 bytes, only when present */
+ MPI_Send(m_msp->aa0a, m_msp->n0+2, MPI_BYTE, node, QSEQTYPE1, MPI_COMM_WORLD);
+ }
+ }
+}
+
+/* get_rbuf() provides buffers containing sequences to the main
+ program. max_work_q buffers are available, with each
+ buffer tied to a worker.
+
+ As the main program runs, it calls get_rbuf() to get a worker
+ buffer (reader buffers are not used with PCOMPLIB), fills it with
+ sequences, and sends it to a worker with put_rbuf().
+
+ At the same time, the worker programs call get_wbuf(), to get a
+ filled worker buffer sent by put_rbuf(), takes the sequences from
+ the buffer and does the comparisons, and sends the results back to
+ the manager by calling put_wbuf().
+*/
+
+/* wait for results from any worker; stores the sending node in *snode and returns that worker's buffer */
+struct buf_head *
+next_work_result(int *snode) {
+ int this_node, buf2_cnt;
+ int int_msg_b[4]; /* general purpose int buffer */
+ int i;
+ struct buf2_hdr_s buf2_head;
+ struct buf_head *this_buf_p, tmp_buf_head;
+ struct seq_record *seq_b_save;
+ struct mseq_record *mseq_b_save;
+ unsigned char *aa1b_start_save;
+ struct a_res_str *new_ares_p, *prev_ares_p;
+#ifdef MPI_SRC
+ MPI_Status mpi_status;
+#endif
+
+ /* wait for a returned result */
+ MPI_Recv(&tmp_buf_head, sizeof(struct buf_head), MPI_BYTE, MPI_ANY_SOURCE,RES_TYPE0,
+ MPI_COMM_WORLD, &mpi_status);
+ this_node = mpi_status.MPI_SOURCE;
+ buf2_cnt = tmp_buf_head.hdr.buf2_cnt;
+
+#ifdef DEBUG
+ /*
+ fprintf(stderr," %d: %d results\n", this_node, buf2_cnt);
+ */
+#endif
+
+ this_buf_p = worker_buf[this_node-FIRSTNODE];
+ /* move things selectively to avoid over-writing pointers to res, a_res arrays */
+
+ aa1b_start_save = this_buf_p->hdr.aa1b_start; /* the three hdr pointers are local addresses; the copy below would replace them */
+ seq_b_save = this_buf_p->hdr.seq_b;
+ mseq_b_save = this_buf_p->hdr.mseq_b;
+
+ memcpy(&this_buf_p->hdr,&tmp_buf_head.hdr,sizeof(struct buf2_hdr_s));
+
+ this_buf_p->hdr.aa1b_start = aa1b_start_save;
+ this_buf_p->hdr.seq_b = seq_b_save;
+ this_buf_p->hdr.mseq_b =mseq_b_save;
+
+ memcpy(&this_buf_p->s_cnt_info,&tmp_buf_head.s_cnt_info,sizeof(struct score_count_s));
+
+ if (this_buf_p->hdr.have_results) {
+ if (this_buf_p->hdr.buf2_type & (BUF2_DOWORK + BUF2_DOSHUF + BUF2_DOOPT)) {
+ MPI_Recv(this_buf_p->buf2_res, sizeof(struct buf2_res_s)*buf2_cnt,
+ MPI_BYTE, this_node, RES_TYPE1, MPI_COMM_WORLD, &mpi_status);
+ /*
+ for (i=0; i < buf2_cnt; i++) {
+ if (this_buf_p->buf2_res[i].rst.score[2] > 200) {
+ fprintf(stderr, "HS[%d:%d,%d]: %d (%d:%d)\n",i,this_node, tmp_buf_head.hdr.worker_idx, this_buf_p->buf2_res[i].rst.score[2],
+ this_buf_p->buf2_data[i].seq->index,this_buf_p->buf2_data[i].seq->n1);
+ }
+ }
+ */
+ }
+
+ if (this_buf_p->hdr.buf2_type & BUF2_DOALIGN) {
+ /* (1) get a message that has "have_ares"
+ (2) allocate space for each a_res and receive it individually
+ (3) reset the ->next pointers for the a_res chain
+ */
+
+ for (i = 0; i < buf2_cnt; i++) {
+ MPI_Recv(int_msg_b, 1, MPI_INT, this_node, ALN_TYPE0, MPI_COMM_WORLD, &mpi_status);
+ this_buf_p->buf2_ares[i].have_ares = int_msg_b[0];
+ this_buf_p->buf2_ares[i].a_res = NULL; /* pre-initialize */
+
+ if (this_buf_p->buf2_ares[i].have_ares) {
+ /* allocate space to receive it */
+ if ((new_ares_p = (struct a_res_str *)calloc(1,sizeof(struct a_res_str)))==NULL) {
+ fprintf(stderr, "cannot allocate a_res from %d\n",this_node);
+ exit(1);
+ }
+ /* save the head of the ares_chain */
+ this_buf_p->buf2_ares[i].a_res = new_ares_p;
+
+ /* get the first a_res */
+ MPI_Recv(new_ares_p, sizeof(struct a_res_str), MPI_BYTE, this_node,
+ ALN_TYPE1, MPI_COMM_WORLD, &mpi_status);
+ /* get the associated res[nres] */
+ if ((new_ares_p->res = (int *)calloc(new_ares_p->nres,sizeof(int)))==NULL) {
+ fprintf(stderr, "cannot allocate res for a_res from %d\n",this_node);
+ exit(1);
+ }
+ MPI_Recv(new_ares_p->res, new_ares_p->nres, MPI_INT, this_node,
+ ALN_TYPE2, MPI_COMM_WORLD, &mpi_status);
+
+ /* now get alignment encodings if available */
+ if (new_ares_p->aln_code) { /* value came from the sender as raw bytes: used only as a "present" flag, then replaced */
+ if ((new_ares_p->aln_code = (char *)calloc(new_ares_p->aln_code_n+1,sizeof(char)))==NULL) {
+ fprintf(stderr, "cannot allocate aln_code for a_res from %d\n",this_node);
+ exit(1);
+ }
+ MPI_Recv(new_ares_p->aln_code, new_ares_p->aln_code_n+1, MPI_BYTE, this_node,
+ ALN_TYPE3, MPI_COMM_WORLD, &mpi_status);
+ }
+ if (new_ares_p->ann_code) { /* same flag-then-replace handling as aln_code */
+ if ((new_ares_p->ann_code = (char *)calloc(new_ares_p->ann_code_n+1,sizeof(char)))==NULL) {
+ fprintf(stderr, "cannot allocate ann_code for a_res from %d\n",this_node);
+ exit(1);
+ }
+ MPI_Recv(new_ares_p->ann_code, new_ares_p->ann_code_n+1, MPI_BYTE, this_node,
+ ALN_TYPE3, MPI_COMM_WORLD, &mpi_status);
+
+ }
+
+ while (new_ares_p->next) { /* while the chain continues (->next still holds the sender's value here) */
+ prev_ares_p = new_ares_p; /* save pointer to previous a_res to fix prev_ares->next */
+ if ((new_ares_p = (struct a_res_str *)calloc(1,sizeof(struct a_res_str)))==NULL) {
+ fprintf(stderr, "cannot allocate a_res from %d\n",this_node);
+ exit(1);
+ }
+ prev_ares_p->next = new_ares_p;
+ MPI_Recv(new_ares_p, sizeof(struct a_res_str), MPI_BYTE, this_node,
+ ALN_TYPE1, MPI_COMM_WORLD, &mpi_status);
+ if ((new_ares_p->res = (int *)calloc(new_ares_p->nres,sizeof(int)))==NULL) {
+ fprintf(stderr, "cannot allocate res for a_res from %d\n",this_node);
+ exit(1);
+ }
+ MPI_Recv(new_ares_p->res, new_ares_p->nres, MPI_INT, this_node,
+ ALN_TYPE2, MPI_COMM_WORLD, &mpi_status);
+ /* now get alignment encodings if available */
+ if (new_ares_p->aln_code) {
+ if ((new_ares_p->aln_code = (char *)calloc(new_ares_p->aln_code_n+1,sizeof(char)))==NULL) {
+ fprintf(stderr, "cannot allocate aln_code for a_res from %d\n",this_node);
+ exit(1);
+ }
+ MPI_Recv(new_ares_p->aln_code, new_ares_p->aln_code_n+1, MPI_BYTE, this_node,
+ ALN_TYPE3, MPI_COMM_WORLD, &mpi_status);
+ }
+ if (new_ares_p->ann_code) {
+ if ((new_ares_p->ann_code = (char *)calloc(new_ares_p->ann_code_n+1,sizeof(char)))==NULL) {
+ fprintf(stderr, "cannot allocate ann_code for a_res from %d\n",this_node);
+ exit(1);
+ }
+ MPI_Recv(new_ares_p->ann_code, new_ares_p->ann_code_n+1, MPI_BYTE, this_node,
+ ALN_TYPE3, MPI_COMM_WORLD, &mpi_status);
+
+ }
+ } /* finished with the ares_chain */
+ } /* done with have_ares */
+ else {
+#ifdef DEBUG
+ fprintf(stderr, " getting alignment with no have_ares[%d]: %d/%d",
+ this_buf_p->hdr.worker_idx,i,this_buf_p->buf2_ares[i].best_idx);
+#endif
+ }
+ } /* done with buf2_ares[buf2_cnt] */
+ } /* done with BUF2_DOALIGN */
+ } /* done with have_results */
+ *snode = this_node;
+ return this_buf_p;
+}
+
+/* hand back the buffer of the next available worker in *cur_buf, blocking for a result when none is idle */
+void get_rbuf(struct buf_head **cur_buf, int max_work_buf)
+{
+ int node, done_node;
+ int i_msg_b[2], nresults;
+ struct buf_head *ready_buf_p;
+
+#ifdef MPI_SRC
+ MPI_Status mpi_status;
+#endif
+
+ if (num_workers_idle != 0) {
+ /* an idle worker is already recorded in this queue slot */
+ ready_buf_p = worker_buf[work_q[next_worker_idx]-FIRSTNODE];
+ }
+ else {
+ /* every worker is busy: block for the next result and reuse that worker */
+ ready_buf_p = next_work_result(&done_node);
+ work_q[next_worker_idx] = done_node;
+ num_workers_idle += 1;
+ }
+
+ *cur_buf = ready_buf_p;
+
+ next_worker_idx = (next_worker_idx+1)%(max_work_buf); /* advance the circular queue index */
+}
+
+/* put_rbuf() takes a buffer filled with sequences to be compared and
+ sends it to the worker named in cur_buf->hdr.worker_idx; get_wbuf() receives the same messages in the same order */
+
+void put_rbuf(struct buf_head *cur_buf, int max_work_buf)
+{
+#ifdef MPI_SRC
+ MPI_Status mpi_status;
+#endif
+ struct seq_record *cur_seq_p, *tmp_seq_p;
+ int i, j, snode, buf2_cnt, seqr_cnt;
+ int cur_aa1b_size, max_aa1b_size;
+
+ /* do not send msg if no data */
+ if (!cur_buf->hdr.have_data || !(cur_buf->hdr.buf2_cnt > 0)) {return;}
+
+ /* here, since we have a buffer, we have a worker, just send the info */
+ snode = cur_buf->hdr.worker_idx;
+ buf2_cnt = cur_buf->hdr.buf2_cnt;
+ seqr_cnt = cur_buf->hdr.seqr_cnt;
+ max_aa1b_size = cur_buf->hdr.aa1b_size;
+
+#ifdef DEBUG
+ /* fprintf(stderr," sending %d/%d seqs to %d\n", buf2_cnt, seqr_cnt, snode); */
+#endif
+ /* send header */
+ MPI_Send(&cur_buf->hdr, sizeof(struct buf2_hdr_s), MPI_BYTE, snode,
+ MSEQTYPE0, MPI_COMM_WORLD);
+
+ /* send data */
+ MPI_Send(cur_buf->buf2_data, sizeof(struct buf2_data_s)*buf2_cnt,
+ MPI_BYTE, snode, MSEQTYPE1, MPI_COMM_WORLD);
+
+ /* before sending sequence records, we need to check to see if we
+ need to transfer to a continuous location (or send lots of short
+ records) */
+
+#ifdef DEBUG
+ cur_aa1b_size = 0;
+ for (i=0; i < buf2_cnt; i++) {
+ cur_seq_p = cur_buf->buf2_data[i].seq;
+ if (!cur_buf->buf2_data[i].seq_dup) { /* duplicates share a record, so count their bytes once */
+ cur_aa1b_size += cur_seq_p->n1+1;
+ }
+ if (check_seq_range(cur_seq_p->aa1b, cur_seq_p->n1, 50, "put_rbuf()")) { /* NOTE(review): the only prototype in this file is commented out */
+ fprintf(stderr, "[put_rbuf] range error at: %d\n", i);
+ }
+ }
+
+ if (cur_aa1b_size != cur_buf->hdr.aa1b_used) {
+ fprintf(stderr,"[put_rbuf:%d] aa1b_used size mismatch: %d != %d\n",
+ snode, cur_aa1b_size, cur_buf->hdr.aa1b_used);
+ }
+#endif
+
+ if (cur_buf->hdr.seq_record_continuous) {
+ /* send sequence records associated with data in one message */
+ MPI_Send(cur_buf->hdr.seq_b, sizeof(struct seq_record)*seqr_cnt,
+ MPI_BYTE, snode, MSEQTYPE2, MPI_COMM_WORLD);
+ MPI_Send(cur_buf->hdr.aa1b_start, cur_buf->hdr.aa1b_used+1,
+ MPI_BYTE, snode, MSEQTYPE3, MPI_COMM_WORLD);
+ }
+ else {
+ /* send individual sequence records */
+ cur_aa1b_size = 0;
+ for (i=0; i < buf2_cnt; i++) {
+ cur_seq_p = cur_buf->buf2_data[i].seq;
+ if (!cur_buf->buf2_data[i].seq_dup) { /* don't send sequence if its a duplicate */
+ MPI_Send(cur_seq_p, sizeof(struct seq_record),
+ MPI_BYTE, snode, MSEQTYPE4, MPI_COMM_WORLD);
+ MPI_Send(cur_seq_p->aa1b, cur_seq_p->n1+1,
+ MPI_BYTE, snode, MSEQTYPE5, MPI_COMM_WORLD);
+ }
+ }
+ }
+
+ /* reduce the number of idle workers */
+ num_workers_idle--;
+}
+
+/* wait_rbuf() -- collect outstanding results until every worker is idle,
+ then reset work_q[] to the worker nodes in order (used_reader_bufs is not used).
+*/
+void wait_rbuf(int used_reader_bufs) {
+ int done_node, q_idx;
+
+ /* each result received means one more idle worker */
+ for ( ; num_workers_idle < max_worker_q; num_workers_idle++) {
+ next_work_result(&done_node);
+ }
+
+ /* all workers are idle, re-initialize work_q */
+ for (q_idx = max_worker_q - 1; q_idx >= 0; q_idx--) {
+ work_q[q_idx] = FIRSTNODE + q_idx;
+ }
+}
+
+/* rbuf_done() -- send every worker (nodes FIRSTNODE .. FIRSTNODE+nthreads-1) a header with
+ stop_work=1 and buf2_cnt=0; get_wbuf() returns 0 when it receives such a header */
+void rbuf_done(int nthreads)
+{
+ int i;
+ /* use a dummy buf_head to send buf2_cnt=0, stop_work=1 */
+ struct buf2_hdr_s tmp_buf2_hdr;
+
+ memset(&tmp_buf2_hdr, 0, sizeof(tmp_buf2_hdr)); /* the whole struct is transmitted: no uninitialized bytes */
+ tmp_buf2_hdr.stop_work = 1;
+ tmp_buf2_hdr.buf2_cnt = 0;
+
+ /* send a message to all the workers waiting for get_wbuf()
+ to quit
+ */
+
+ for (i=FIRSTNODE; i < nthreads+FIRSTNODE; i++) {
+ MPI_Send(&tmp_buf2_hdr, sizeof(struct buf2_hdr_s), MPI_BYTE, i,
+ MSEQTYPE0, MPI_COMM_WORLD);
+ }
+
+}
+
+/* get_wbuf() -- called in workers
+ get a buffer full of sequences to be compared from the main program;
+ returns 0 on a stop_work or empty (buf2_cnt <= 0) header, otherwise 1
+ this function should follow put_rbuf() message for message
+
+ In the PCOMPLIB version, there is no queue of buffers to be read,
+ but we must have space to put the messages in as we receive them,
+ and we must fix the pointers in the seq_records
+ */
+
+int get_wbuf(struct buf_head **cur_buf, int max_work_buf)
+{
+#ifdef MPI_SRC
+ MPI_Status mpi_status;
+#endif
+
+ /* we need to preserve some sequence pointer information so it is not
+ over-written by the messages */
+
+ struct seq_record *seq_base, *cur_seq_p, *prev_seq_p, *old_host_seq_p, *host_seq_p;
+ struct buf2_data_s *cur_buf2_dp;
+ unsigned char *aa1b_start_save, *old_aa1b_start, *cur_aa1b;
+ unsigned char *host_aa1b, *old_host_aa1b;
+ int buf2_cnt, i, j, cur_n1, seqr_cnt;
+ int max_aa1b_size, aa1b_size_save;
+ int cur_aa1b_size;
+ int snode;
+
+ snode = (*cur_buf)->hdr.worker_idx;
+ seq_base = (*cur_buf)->hdr.seq_b;
+ aa1b_start_save = (*cur_buf)->hdr.aa1b_start;
+ max_aa1b_size = aa1b_size_save = (*cur_buf)->hdr.aa1b_size;
+
+ /* put invalid bytes in aa1b to check for transmission errors */
+ memset(aa1b_start_save, 127, aa1b_size_save);
+
+ MPI_Recv(&(*cur_buf)->hdr, sizeof(struct buf2_hdr_s), MPI_BYTE, 0,
+ MSEQTYPE0, MPI_COMM_WORLD, &mpi_status);
+
+ buf2_cnt = (*cur_buf)->hdr.buf2_cnt;
+ seqr_cnt = (*cur_buf)->hdr.seqr_cnt;
+
+ if (buf2_cnt <= 0 || (*cur_buf)->hdr.stop_work) { return 0; }
+
+ /* get the buf2_data array, which has seq_dup and ->seq records */
+ MPI_Recv((*cur_buf)->buf2_data, sizeof(struct buf2_data_s)*buf2_cnt,
+ MPI_BYTE, 0, MSEQTYPE1, MPI_COMM_WORLD, &mpi_status);
+
+#ifdef DEBUG
+ /* fprintf(stderr,"[%d/get_wbuf] receiving %d/%d sequences\n",snode, buf2_cnt, seqr_cnt); */
+#endif
+
+ /* get seq_records (but not mseq_records, don't need them) */
+ if ((*cur_buf)->hdr.seq_record_continuous) {
+ MPI_Recv(seq_base, sizeof(struct seq_record)*seqr_cnt,
+ MPI_BYTE, 0, MSEQTYPE2, MPI_COMM_WORLD, &mpi_status);
+
+ /* now get the sequence data */
+ MPI_Recv(aa1b_start_save, (*cur_buf)->hdr.aa1b_used+1,
+ MPI_BYTE, 0, MSEQTYPE3, MPI_COMM_WORLD, &mpi_status);
+
+ /* map the seq records back into buf2_data */
+ /* must check for duplicate sequence records, initialize buf2_data[i]->seq
+ AND seq.aa1b in the same pass */
+
+ cur_buf2_dp = (*cur_buf)->buf2_data;
+ cur_seq_p = prev_seq_p = seq_base;
+
+ cur_aa1b = aa1b_start_save;
+ cur_aa1b_size = 0;
+
+ for (i=0; i < buf2_cnt; i++, cur_buf2_dp++) {
+ if (!cur_buf2_dp->seq_dup) { /* not a duplicate */
+ cur_seq_p->aa1b = cur_aa1b;
+ cur_aa1b += cur_seq_p->n1 + 1;
+ cur_aa1b_size += cur_seq_p->n1 + 1;
+ prev_seq_p = cur_seq_p; /* duplicates that follow share this record (same rule as the branch below) */
+ cur_buf2_dp->seq = cur_seq_p++;
+ }
+ else { /* duplicate */
+ cur_buf2_dp->seq = prev_seq_p; /* point to the previous value */
+ }
+ }
+
+ if (cur_aa1b_size != (*cur_buf)->hdr.aa1b_used) {
+ fprintf(stderr, "[%d] incorrect cur_aa1b_size: %d != %d\n",
+ snode, cur_aa1b_size, (*cur_buf)->hdr.aa1b_used);
+ }
+ }
+ else { /* not continuous, get seq_records one at a time */
+ cur_seq_p = prev_seq_p = seq_base; /* prev_seq_p initialized so a leading duplicate never reads an unset pointer */
+ cur_aa1b = aa1b_start_save;
+ cur_buf2_dp = (*cur_buf)->buf2_data;
+ cur_aa1b_size = 0;
+ for (i=0; i < buf2_cnt; i++) {
+ /* get a seq record */
+ if (!(*cur_buf)->buf2_data[i].seq_dup) { /* not a duplicate, so get it */
+ MPI_Recv(cur_seq_p, sizeof(struct seq_record),
+ MPI_BYTE, 0, MSEQTYPE4, MPI_COMM_WORLD, &mpi_status);
+ /* get the sequence itself */
+ prev_seq_p = cur_seq_p;
+ cur_n1 = cur_seq_p->n1;
+ cur_aa1b_size += cur_n1+1;
+ if (cur_aa1b_size >= max_aa1b_size) { /* check before receiving so the sequence cannot overrun aa1b */
+ fprintf(stderr,"[get_wbuf:%d] -- receive buffer too small %d > %d\n",
+ (*cur_buf)->hdr.worker_idx, cur_aa1b_size, max_aa1b_size);
+ exit(1);
+ }
+
+ MPI_Recv(cur_aa1b, cur_n1+1, MPI_BYTE, 0, MSEQTYPE5, MPI_COMM_WORLD, &mpi_status);
+ cur_seq_p->aa1b = cur_aa1b; /* replace the sender's pointer with the local address */
+#ifdef DEBUG
+ if (cur_seq_p->adler32_crc != adler32(1L,cur_aa1b,cur_n1)) {
+ fprintf(stderr," [get_wbuf:%d] -- adler32 mismatch; n1: %d\n",
+ (*cur_buf)->hdr.worker_idx, cur_n1);
+ }
+#endif
+
+ cur_buf2_dp->seq = cur_seq_p++;
+ cur_aa1b += cur_n1+1;
+ }
+ else { /* its a duplicate, so point to the original version */
+ cur_buf2_dp->seq = prev_seq_p;
+ }
+ cur_buf2_dp++;
+ }
+ }
+
+ /* restore the seq_b, aa1b_start that were over-written */
+ (*cur_buf)->hdr.seq_b = seq_base;
+ (*cur_buf)->hdr.aa1b_start = aa1b_start_save;
+ (*cur_buf)->hdr.aa1b_size = aa1b_size_save;
+
+ /*
+ for (i=0; i < buf2_cnt; i++) {
+ cur_seq_p = (*cur_buf)->buf2_data[i].seq;
+ if (check_seq_range(cur_seq_p->aa1b, cur_seq_p->n1, 50, "get_wbuf()")) {
+ fprintf(stderr, "[%d] (get_wbuf) range error at: %d/%d (seqr_cnt: %d)\n",
+ (*cur_buf)->hdr.worker_idx, i, buf2_cnt, seqr_cnt);
+ }
+ }
+ */
+
+ return 1;
+}
+
+/* put_wbuf() -- called in workers
+
+ In the PCOMPLIB version, there is no queue of buffers to be read,
+ so just send the buffer to the manager (rank 0); next_work_result() receives these messages in the same order
+ */
+void put_wbuf(struct buf_head *cur_buf, int max_work_buf)
+{
+ int int_msg_b[4], i;
+ struct buf2_ares_s *buf2_ares_p;
+ struct a_res_str *cur_ares_p, *next_ares_p;
+#ifdef MPI_SRC
+ MPI_Status mpi_status;
+#endif
+
+ MPI_Send(&cur_buf->hdr, sizeof(struct buf_head), MPI_BYTE, 0, /* NOTE(review): sizeof(struct buf_head) bytes from &hdr -- assumes hdr is the first member; confirm */
+ RES_TYPE0, MPI_COMM_WORLD);
+
+ if (!cur_buf->hdr.have_results) { return;}
+
+ /* have buf2_res type results */
+ if (cur_buf->hdr.buf2_type & (BUF2_DOWORK + BUF2_DOSHUF+BUF2_DOOPT)) {
+ MPI_Send(cur_buf->buf2_res, sizeof(struct buf2_res_s)*cur_buf->hdr.buf2_cnt, MPI_BYTE, 0,
+ RES_TYPE1, MPI_COMM_WORLD);
+ }
+
+ /* have buf2_ares type results */
+ if (cur_buf->hdr.buf2_type & BUF2_DOALIGN) {
+ /* buf2_ares does not have much useful information, except have_ares and a chain of *a_res pointers.
+ so we need to:
+ (1) send have_ares
+ (2) send each part of the a_res chain individually
+ */
+
+ buf2_ares_p = cur_buf->buf2_ares;
+ for (i=0; i < cur_buf->hdr.buf2_cnt; i++) {
+ int_msg_b[0] = buf2_ares_p->have_ares;
+ MPI_Send(int_msg_b, 1, MPI_INT, 0, ALN_TYPE0, MPI_COMM_WORLD);
+ if (buf2_ares_p->have_ares) {
+ /* send every a_res in the chain: the struct, then res[nres], then aln_code/ann_code when non-NULL */
+ for (cur_ares_p = buf2_ares_p->a_res; cur_ares_p; cur_ares_p = cur_ares_p->next) {
+ MPI_Send(cur_ares_p, sizeof(struct a_res_str), MPI_BYTE, 0, ALN_TYPE1, MPI_COMM_WORLD);
+ MPI_Send(cur_ares_p->res, cur_ares_p->nres ,MPI_INT, 0, ALN_TYPE2, MPI_COMM_WORLD);
+ if (cur_ares_p->aln_code) {
+ MPI_Send(cur_ares_p->aln_code, cur_ares_p->aln_code_n+1 ,MPI_BYTE, 0, ALN_TYPE3, MPI_COMM_WORLD);
+ }
+ if (cur_ares_p->ann_code) {
+ MPI_Send(cur_ares_p->ann_code, cur_ares_p->ann_code_n+1 ,MPI_BYTE, 0, ALN_TYPE3, MPI_COMM_WORLD);
+ }
+ } /* done with a_res chain */
+
+ /* free the chain */
+ cur_ares_p = buf2_ares_p->a_res;
+ while (cur_ares_p) {
+ if (cur_ares_p->aln_code) free(cur_ares_p->aln_code);
+ if (cur_ares_p->ann_code) free(cur_ares_p->ann_code);
+ if ((buf2_ares_p->have_ares & 0x1) && cur_ares_p->res) free(cur_ares_p->res); /* res is freed only when bit 0 of have_ares is set */
+ next_ares_p = cur_ares_p->next; /* read ->next before the node is freed */
+ free(cur_ares_p);
+ cur_ares_p = next_ares_p;
+ }
+ buf2_ares_p->a_res = NULL;
+ } /* done with have_ares */
+ buf2_ares_p->have_ares = 0; /* must be zero-ed out for later use */
+ buf2_ares_p++;
+ } /* done with buf2_ares[buf2_cnt] */
+ } /* done with BUF2_DOALIGN */
+} /* done with put_wbuf() */
diff --git a/src/pgsql_lib.c b/src/pgsql_lib.c
new file mode 100644
index 0000000..912fa06
--- /dev/null
+++ b/src/pgsql_lib.c
@@ -0,0 +1,631 @@
+/* $Id: pgsql_lib.c 781 2011-06-20 10:31:40Z wrp $ */
+
+/* pgsql_lib.c copyright (c) 2004, 2014 by William R. Pearson and
+ The Rector & Visitors of the University of Virginia */
+/*
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+/* functions for opening, reading, seeking a pgsql database */
+
+/*
+ For the moment, this interface assumes that the file to be searched will
+ be specified in a single, long, string with 4 parts:
+
+ (1) a database open string. This string has four fields, separated by
+ whitespace (' \t'):
+ hostname:port dbname user password
+
+ '--' dashes at the beginning of lines are ignored -
+ thus the first line could be:
+ -- hostname:port dbname user password
+
+ (2) a database query string that will return a unique ID (not
+ necessarily numeric, but it must be < 12 characters as libstr[12]
+ is used) and a sequence string
+
+ (2a) a series of pgsql commands that do not generate results
+ starting with 'DO', followed by a select() statement.
+
+ (3) a database select string that will return a description
+ given a unique ID
+
+ (4) a database select string that will return a sequence given a
+ unique ID
+
+ Lines (3) and (4) are not required for pv34comp* libraries, but
+ line (2) must generate a complete description as well as a sequence.
+
+
+ 18-July-2001
+ Additional syntax has been added to support multiline SQL queries.
+
+ If the host line begins with '+', then the SQL is opened on the same
+ connection as the previous SQL file.
+
+ If the host line contains '-' just before the terminal ';', then
+ the file will not produce any output.
+
+ This string can contain "\n". ";" are used to separate the four
+ functions, which must be specified in the order shown above.
+ The last (fourth) query must terminate with a ';'
+
+ 19-July-2004
+
+ This file is designed for PostgreSQL, which uses a different syntax
+ for getting rows of data. Specifically, a select statement must be
+ associated with a "cursor", so that one can fetch a single row.
+
+ This can be simply done with the statement:
+
+ DECLARE next_seq CURSOR FOR "select statement ..."
+
+ The need for a CURSOR complicates the getlib()/ranlib() design, which
+ assumes that ranlib() can set something up that getlib() can read.
+ This can be avoided by setting up an otherwise unnecessary cursor for
+ the ranlib statement that gets a sequence.
+
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+
+#include <libpq-fe.h>
+#define PGSQL_LIB 17
+
+#include "defs.h"
+#include "mm_file.h"
+
+#define XTERNAL
+#include "uascii.h"
+#define EOSEQ 0
+/* #include "upam.h" */
+
+char *alloc_file_name(char *f_name);
+int pgsql_getlib(unsigned char *, int, char *, int, fseek_t *, int *, struct lmf_str *, long *);
+void pgsql_ranlib(char *, int, fseek_t, char *, struct lmf_str *m_fd);
+
+#define PGSQL_BUF 4096
+
+struct lmf_str *
+pgsql_openlib(char *sname, int ldnaseq, int *sascii) {
+ FILE *sql_file;
+ PGconn *conn;
+ PGresult *res;
+ char *tmp_str, *ttmp_str;
+ int tmp_str_len;
+ char *bp, *bps, *bdp, *tp, tchar;
+ int i, qs_len, qqs_len;
+ char *sql_db, *sql_host, *sql_dbname, *sql_user, *sql_pass;
+ char *sql_port;
+ char *sql_do;
+ int sql_do_cnt;
+ struct lmf_str *m_fptr;
+
+ /* if (sql_reopen) return NULL; - should not be called for re-open */
+
+ tmp_str_len = PGSQL_BUF;
+ if ((tmp_str=(char *)calloc(tmp_str_len,sizeof(char)))==NULL) {
+ fprintf(stderr,"cannot allocate %d for pgSQL buffer\n",tmp_str_len);
+ return NULL;
+ }
+
+ if (sname[0] == '%') {
+ strncpy(tmp_str,sname+1,tmp_str_len);
+ tmp_str[sizeof(tmp_str)-1]='\0';
+ }
+ else {
+ if ((sql_file=fopen(sname,"r"))==NULL) {
+ fprintf(stderr," cannot open pgSQL file: %s\n",sname);
+ return NULL;
+ }
+
+ if ((qs_len=fread(tmp_str,sizeof(char),tmp_str_len-1,sql_file))<=0) {
+ fprintf(stderr," cannot read pgSQL file: %s\n",sname);
+ return NULL;
+ }
+ else {
+ tmp_str[qs_len]='\0';
+ qqs_len = qs_len;
+ while (qqs_len >= tmp_str_len-1) {
+ tmp_str_len += PGSQL_BUF;
+ if ((tmp_str=(char *)realloc(tmp_str,tmp_str_len))==NULL) {
+ fprintf(stderr,
+ " cannot reallocate %d for pgSQL buffer\n",tmp_str_len);
+ return NULL;
+ }
+ ttmp_str = &tmp_str[qqs_len];
+ if ((qs_len=fread(ttmp_str,sizeof(char),PGSQL_BUF,sql_file))<0) {
+ fprintf(stderr," cannot read pgSQL file: %s\n",sname);
+ return NULL;
+ }
+ ttmp_str[qs_len]='\0';
+ qqs_len += qs_len;
+ }
+ }
+ fclose(sql_file);
+ }
+
+ bps = tmp_str;
+ if ((bp=strchr(bps,';'))!=NULL) {
+ *bp='\0';
+ if ((sql_db=calloc(strlen(bps)+1,sizeof(char)))==NULL) {
+ fprintf(stderr, " cannot allocate space for database name [%d], %s\n",
+ (int)strlen(bps),bps);
+ return NULL;
+ }
+ /* have database name, parse the fields */
+ else {
+ strcpy(sql_db,bps); /* strcpy OK because allocated strlen(bps) */
+ bps = bp+1; /* points to next char after ';' */
+ while (isspace(*bps)) bps++;
+ *bp=';'; /* replace ; */
+ bp = sql_db;
+ while (*bp=='-') {*bp++ = ' ';}
+ sql_host = strtok(bp," \t\n");
+ if (sql_host[0]=='@') sql_host="";
+ sql_dbname = strtok(NULL," \t\n");
+ sql_user = strtok(NULL," \t\n");
+ if (sql_user[0]=='@') sql_user="";
+ sql_pass = strtok(NULL," \t\n");
+ if (sql_pass[0]=='@') sql_pass="";
+ if ((tp=strchr(sql_host,':'))!=NULL) {
+ sql_port = tp+1;
+ *tp='\0';
+ }
+ else sql_port = "";
+ }
+ }
+ else {
+ fprintf(stderr," cannot find database fields:\n%s\n",tmp_str);
+ return NULL;
+ }
+
+ /* we have all the info we need to open a database, allocate lmf_str */
+ if ((m_fptr = (struct lmf_str *)calloc(1,sizeof(struct lmf_str)))==NULL) {
+ fprintf(stderr," cannot allocate lmf_str (%ld) for %s\n",
+ sizeof(struct lmf_str),sname);
+ return NULL;
+ }
+
+ /* have our struct, initialize it */
+
+ m_fptr->lb_name = alloc_file_name(sname);
+
+ m_fptr->sascii = sascii;
+
+ m_fptr->sql_db = sql_db;
+ m_fptr->getlib = pgsql_getlib;
+ m_fptr->ranlib = pgsql_ranlib;
+ m_fptr->mm_flg = 0;
+ m_fptr->sql_reopen = 0;
+ m_fptr->lb_type = PGSQL_LIB;
+
+ /* now open the database, if necessary */
+ conn = PQsetdbLogin(sql_host,
+ sql_port,
+ NULL,
+ NULL,
+ sql_dbname,
+ sql_user,
+ sql_pass);
+
+ if (PQstatus(conn) != CONNECTION_OK) {
+ fprintf(stderr, "Connection to database '%s' failed.\n", PQdb(conn));
+ fprintf(stderr, "%s", PQerrorMessage(conn));
+ PQfinish(conn);
+ goto error_r;
+ }
+ else {
+ m_fptr->pgsql_conn = conn;
+ fprintf(stderr," Database %s opened on %s\n",sql_dbname,sql_host);
+ }
+
+ /* check for 'DO' command - copy to 'DO' string */
+ while (*bps == '-') { *bps++=' ';}
+ if (isspace(bps[-1]) && toupper(bps[0])=='D' &&
+ toupper(bps[1])=='O' && isspace(bps[2])) {
+ /* have some 'DO' commands */
+ /* check where the end of the last DO statement is */
+
+ sql_do_cnt = 1; /* count up the number of 'DO' statements for later */
+ bdp=bps+3;
+ while ((bp=strchr(bdp,';'))!=NULL) {
+ tp = bp+2; /* skip ;\n */
+ while (isspace(*tp) || *tp == '-') {*tp++ = ' ';}
+ if (toupper(*tp)=='D' && toupper(tp[1])=='O' && isspace(tp[2])) {
+ sql_do_cnt++; /* count the DO statements */
+ bdp = tp+3; /* move to the next DO statement */
+ }
+ else break;
+ }
+ if (bp != NULL) { /* end of the last DO, begin of select */
+ tchar = *(bp+1);
+ *(bp+1)='\0'; /* terminate DO strings */
+ if ((sql_do = calloc(strlen(bps)+1, sizeof(char)))==NULL) {
+ fprintf(stderr," cannot allocate %d for sql_do\n",(int)strlen(bps));
+ goto error_r;
+ }
+ else {
+ strcpy(sql_do,bps);
+ *(bp+1)=tchar; /* replace missing ';' */
+ }
+ bps = bp+1;
+ while (isspace(*bps)) bps++;
+ }
+ else {
+ fprintf(stderr," terminal ';' not found: %s\n",bps);
+ goto error_r;
+ }
+ /* all the DO commands are in m_fptr->sql_do in the form:
+ DO command1; DO command2; DO command3; */
+ bdp = sql_do;
+ while (sql_do_cnt-- && (bp=strchr(bdp,';'))!=NULL) {
+ /* do the pgsql statement on bdp+3 */
+ /* check for error */
+ *bp='\0';
+ res = PQexec(m_fptr->pgsql_conn,bdp+3);
+ if (PQresultStatus(res) != PGRES_COMMAND_OK) {
+ fprintf(stderr,"*** Error %s - query failed:\n%s\n",
+ PQerrorMessage(m_fptr->pgsql_conn), bdp+3);
+ PQclear(res);
+ goto error_r;
+ }
+ PQclear(res);
+
+ *bp=';';
+ bdp = bp+1;
+ while (isspace(*bdp)) bdp++;
+ }
+ }
+
+ /* copy 1st query field */
+ if ((bp=strchr(bps,';'))!=NULL) {
+ *bp='\0';
+ if ((m_fptr->sql_query=calloc(strlen(bps)+41,sizeof(char)))==NULL) {
+ fprintf(stderr, " cannot allocate space for query string [%d], %s\n",
+ (int)strlen(bps),bps);
+ goto error_r;
+ }
+ /* have query, copy it */
+ else {
+ strncpy(m_fptr->sql_query,"DECLARE next_seq CURSOR FOR ",40);
+ strcat(m_fptr->sql_query,bps);
+ *bp=';'; /* replace ; */
+ bps = bp+1;
+ while(isspace(*bps)) bps++;
+ }
+ }
+ else {
+ fprintf(stderr," cannot find database query field:\n%s\n",tmp_str);
+ goto error_r;
+ }
+
+ /* copy get_desc field */
+ if ((bp=strchr(bps,';'))!=NULL) {
+ *bp='\0';
+ if ((m_fptr->sql_getdesc=calloc(strlen(bps)+1,sizeof(char)))==NULL) {
+ fprintf(stderr, " cannot allocate space for database name [%d], %s\n",
+ (int)strlen(bps),bps);
+ goto error_r;
+ }
+ /* have get_desc, copy it */
+ else {
+ strcpy(m_fptr->sql_getdesc,bps);
+ *bp=';'; /* replace ; */
+ bps = bp+1;
+ while(isspace(*bps)) bps++;
+ }
+ }
+ else {
+ fprintf(stderr," cannot find getdesc field:\n%s\n",tmp_str);
+ goto error_r;
+ }
+
+ if ((bp=strchr(bps,';'))!=NULL) { *bp='\0';}
+
+ if ((m_fptr->sql_getseq=calloc(strlen(bps)+1,sizeof(char)))==NULL) {
+ fprintf(stderr, " cannot allocate space for database name [%d], %s\n",
+ (int)strlen(bps),bps);
+ goto error_r;
+ }
+
+ if (strlen(bps) > 0) {
+ strcpy(m_fptr->sql_getseq,bps);
+ }
+ else {
+ fprintf(stderr," cannot find getseq field:\n%s\n",tmp_str);
+ return NULL;
+ }
+ if (bp!=NULL) *bp=';';
+
+ /* now do the fetch */
+
+ res = PQexec(m_fptr->pgsql_conn,"BEGIN;");
+ if (PQresultStatus(res) != PGRES_COMMAND_OK) {
+ fprintf(stderr,"*** Error %s - BEGIN failed:\n",
+ PQerrorMessage(conn));
+ PQclear(res);
+ goto error_r;
+ }
+ PQclear(res);
+
+ res = PQexec(m_fptr->pgsql_conn, m_fptr->sql_query);
+ if (PQresultStatus(res) != PGRES_COMMAND_OK) {
+ fprintf(stderr,"*** Error %d:%s - query failed:\n%s\n",
+ PQresultStatus(res),PQerrorMessage(conn), m_fptr->sql_query);
+ PQclear(res);
+ goto error_r;
+ }
+ PQclear(res);
+ m_fptr->pgsql_res=NULL;
+
+ return m_fptr;
+
+ error_r:
+ free(m_fptr->sql_getseq);
+ free(m_fptr->sql_getdesc);
+ free(m_fptr->sql_query);
+ free(m_fptr);
+ free(sql_db);
+ return NULL;
+}
+
+/* pgsql_reopen(): mark an already-open PostgreSQL library handle as
+   "reopened" by setting sql_reopen, and hand the same handle back.
+   No new connection is made here. */
+struct lmf_str *
+pgsql_reopen(struct lmf_str *m_fptr)
+{
+  m_fptr->sql_reopen = 1;
+
+  return m_fptr;
+}
+
+/* pgsql_closelib(): release the libpq resources attached to m_fptr.
+
+   Any pending result is cleared and the connection is finished.  Both
+   pointers are reset to NULL afterwards so that a second call, or a
+   later pgsql_getlib()/pgsql_ranlib() on the same handle, cannot touch
+   memory that libpq has already freed.  m_fptr itself is not freed.
+   A NULL m_fptr is ignored. */
+void
+pgsql_closelib(struct lmf_str *m_fptr) {
+
+  if (m_fptr == NULL) return;
+
+  if (m_fptr->pgsql_res != NULL) {
+    PQclear(m_fptr->pgsql_res);
+    m_fptr->pgsql_res = NULL;
+  }
+
+  if (m_fptr->pgsql_conn != NULL) {
+    PQfinish(m_fptr->pgsql_conn);
+    m_fptr->pgsql_conn = NULL;
+  }
+
+  m_fptr->sql_reopen=0;
+}
+
+/*
+static char *sql_seq = NULL, *sql_seqp;
+static int sql_seq_len;
+*/
+
+/* pgsql_getlib(): read the next library sequence (or the next piece of
+   a long one) from the "next_seq" cursor into seq[].
+
+   seq, maxs	- output buffer and its size; residues are mapped through
+		  lm_fd->sascii and the buffer is terminated with EOSEQ
+   libstr	- receives the unique identifier (column 0); when
+		  n_libstr > MAX_UID it receives column 2 (if present)
+   libpos	- receives atol() of column 0
+   lcont	- 0 on entry means "start a new sequence"; incremented on
+		  return when the buffer filled and more residues remain,
+		  reset to 0 when the sequence is complete
+   l_off	- sequence offset; 1 unless column 2 contains "@C:<n>"
+
+   Returns the number of residues stored, or -1 when the FETCH fails or
+   returns no rows.
+
+   A result left in lm_fd->pgsql_res (pgsql_ranlib() stores its sequence
+   query there) is used instead of issuing a new FETCH. */
+int
+pgsql_getlib( unsigned char *seq,
+              int maxs,
+              char *libstr,
+              int n_libstr,
+              fseek_t *libpos,
+              int *lcont,
+              struct lmf_str *lm_fd,
+              long *l_off)
+{
+  register unsigned char *cp, *seqp;
+  register int *ap;
+  unsigned char *seqm, *seqm1;
+  PGresult *res;
+
+  char *bp;
+  /* int l_start, l_stop, len; */
+
+  seqp = seq;
+  seqm = &seq[maxs-9];	/* leave room for the unrolled copy loop below */
+  seqm1 = seqm-1;
+
+  ap = lm_fd->sascii;
+
+  if (*lcont==0) {
+    /* get a row, with UID, sequence */
+    *l_off = 1;
+
+    /* check to see if we already have a valid result */
+    if (lm_fd->pgsql_res==NULL) {
+      res = PQexec(lm_fd->pgsql_conn,"FETCH next_seq");
+      if (PQresultStatus(res) != PGRES_TUPLES_OK) {
+        fprintf(stderr,"*** Error %s - getlib FETCH failed:\n%s\n",
+                PQerrorMessage(lm_fd->pgsql_conn), lm_fd->sql_query);
+        PQclear(res);
+        lm_fd->pgsql_res = NULL;
+        *lcont = 0;
+        *seqp = EOSEQ;
+        return -1;
+      }
+    }
+    else {res = lm_fd->pgsql_res;}
+
+    if (PQntuples(res)>0) {
+      lm_fd->pgsql_res = res;
+      *libpos=(fseek_t)atol(PQgetvalue(res,0,0));
+
+      *l_off = 1;
+      if (PQnfields(res) > 2 && (bp=strchr(PQgetvalue(res,0,2),'@'))!=NULL &&
+          !strncmp(bp+1,"C:",2)) sscanf(bp+3,"%ld",l_off);
+
+      lm_fd->sql_seqp = PQgetvalue(res,0,1);
+
+      /* because of changes in pgsql_ranlib(), it is essential that
+         libstr return the unique identifier; thus we must use
+         column 0, not column 2.  Using libstr as the UID allows
+         one to use any UID, not just numeric ones.  *libpos is not
+         used for pgsql libraries.
+      */
+
+      if (n_libstr <= MAX_UID) {
+        /* the normal case returns only GID/sequence */
+        strncpy(libstr,PQgetvalue(res,0,0),MAX_UID-1);
+        libstr[MAX_UID-1]='\0';
+      }
+      else {
+        /* here we do not use the UID in libstr, because we are not
+           going back into the db */
+        /* the PVM case also returns a long description */
+        if (PQnfields(res)>2) {
+          strncpy(libstr,PQgetvalue(res,0,2),n_libstr-1);
+        }
+        else {
+          strncpy(libstr,PQgetvalue(res,0,0),n_libstr-1);
+        }
+        libstr[n_libstr-1]='\0';
+      }
+    }
+    else {
+      /* no more rows: release the result we are actually holding.
+         After a fresh FETCH lm_fd->pgsql_res is still NULL, so clearing
+         that field (as was done before) leaked res. */
+      PQclear(res);
+      lm_fd->pgsql_res=NULL;
+      *lcont = 0;
+      *seqp = EOSEQ;
+      return -1;
+    }
+  }
+
+  /* copy residues ten at a time; the chain stops at the first mapped
+     value that is not < NA, and that value is then dropped from seq[] */
+  for (cp=(unsigned char *)lm_fd->sql_seqp; seqp<seqm1 && *cp; ) {
+    if ((*seqp++=ap[*cp++])<NA &&
+        (*seqp++=ap[*cp++])<NA &&
+        (*seqp++=ap[*cp++])<NA &&
+        (*seqp++=ap[*cp++])<NA &&
+        (*seqp++=ap[*cp++])<NA &&
+        (*seqp++=ap[*cp++])<NA &&
+        (*seqp++=ap[*cp++])<NA &&
+        (*seqp++=ap[*cp++])<NA &&
+        (*seqp++=ap[*cp++])<NA &&
+        (*seqp++=ap[*cp++])<NA) continue;
+    --seqp;
+    if (*(cp-1)==0) break;	/* consumed the terminating NUL */
+  }
+  lm_fd->sql_seqp = (char *)cp;
+
+  if (seqp>=seqm1) (*lcont)++;	/* buffer full: caller must come back */
+  else {
+    *lcont=0;
+    PQclear(lm_fd->pgsql_res);
+    lm_fd->pgsql_res = NULL;
+  }
+
+  *seqp = EOSEQ;
+  /* if ((int)(seqp-seq)==0) return 1; */
+  return (int)(seqp-seq);
+}
+
+/* pgsql_fill_key(): expand a query template into dst[dst_len].
+   The text before the first '#' in tmpl, then key, then the text after
+   the '#' are concatenated; the result is truncated to fit and always
+   NUL-terminated.  tmpl is not modified.
+   Returns 1 on success, 0 if tmpl contains no '#' (dst is untouched). */
+static int
+pgsql_fill_key(char *dst, size_t dst_len, const char *tmpl, const char *key)
+{
+  const char *hash;
+  size_t pre_len;
+
+  if ((hash = strchr(tmpl,'#'))==NULL) return 0;
+
+  pre_len = (size_t)(hash - tmpl);
+  if (pre_len >= dst_len) pre_len = dst_len - 1;
+  memcpy(dst, tmpl, pre_len);
+  dst[pre_len] = '\0';
+
+  /* strncat()'s bound is the number of characters to append, NOT the
+     size of the destination, so pass the space that is still free */
+  strncat(dst, key, dst_len - strlen(dst) - 1);
+  strncat(dst, hash+1, dst_len - strlen(dst) - 1);
+  return 1;
+}
+
+/* pgsql_ranlib(): random access to one library entry.
+
+   str, cnt	- receives the description line (at most cnt-1 characters,
+		  cut at the first '\r' or '\n')
+   libpos	- stored in lm_fd->lpos and used in the error descriptions
+   libstr	- unique identifier substituted for '#' in the
+		  sql_getdesc and sql_getseq templates
+
+   The description is taken from column 1 of the getdesc result when
+   the result has more than one column, otherwise from column 0.  The
+   result of the getseq query is left in lm_fd->pgsql_res, where
+   pgsql_getlib() picks it up.  A failed sequence query is fatal
+   (exit(1)). */
+void
+pgsql_ranlib(char *str,
+             int cnt,
+             fseek_t libpos,
+             char *libstr,
+             struct lmf_str *lm_fd
+             )
+{
+  char tmp_query[1024];
+  PGresult *res;
+  char *bp;
+
+  str[0]='\0';
+
+  /* drop any result left over from a previous call before it is
+     overwritten below (all paths end up replacing lm_fd->pgsql_res) */
+  if (lm_fd->pgsql_res !=NULL) {
+    PQclear(lm_fd->pgsql_res);
+    lm_fd->pgsql_res = NULL;
+  }
+
+  /* put the UID into the query string - cannot use sprintf because of
+     "%' etc */
+
+  if (!pgsql_fill_key(tmp_query, sizeof(tmp_query), lm_fd->sql_getdesc, libstr)) {
+    fprintf(stderr, "no KEY position in %s\n",lm_fd->sql_getdesc);
+    goto next1;
+  }
+  lm_fd->lpos = libpos;
+
+  /*  fprintf(stderr," requesting: %s\n",tmp_query); */
+
+  res = PQexec(lm_fd->pgsql_conn,tmp_query);
+  if (PQresultStatus(res) != PGRES_TUPLES_OK) {
+    lm_fd->pgsql_res = NULL;
+
+    snprintf(str,(size_t)cnt,"gi|%ld ***Error - query failed***",(long)libpos);
+    fprintf(stderr,"*** Error %s - ranlib DESC failed:\n%s\n",
+            PQerrorMessage(lm_fd->pgsql_conn), tmp_query);
+    PQclear(res);
+    goto next1;
+  }
+
+  if (PQntuples(res)<=0) {
+    snprintf(str,(size_t)cnt,"gi|%ld ***use result failed***",(long)libpos);
+    goto next0;
+  }
+
+  /* ask libpq how many columns there are instead of probing column 1
+     with an out-of-range PQgetvalue() */
+  if (PQnfields(res) > 1) strncpy(str,PQgetvalue(res,0,1),cnt-1);
+  else strncpy(str,PQgetvalue(res,0,0),cnt-1);
+  str[cnt-1]='\0';
+  /* change this later to support multiple row returns */
+
+  if ((bp = strchr(str,'\r'))!=NULL) *bp='\0';
+  if ((bp = strchr(str,'\n'))!=NULL) *bp='\0';
+
+ next0:
+  PQclear(res);
+ next1:
+  lm_fd->pgsql_res = NULL;
+
+  /* get the sequence, set up for pgsql_getlib() */
+  /* put the UID into the query string */
+
+  if (!pgsql_fill_key(tmp_query, sizeof(tmp_query), lm_fd->sql_getseq, libstr)) {
+    fprintf(stderr, "no GID position in %s\n",lm_fd->sql_getseq);
+    return;
+  }
+
+  res = PQexec(lm_fd->pgsql_conn,tmp_query);
+  if (PQresultStatus(res) != PGRES_TUPLES_OK) {
+    PQclear(res);
+    lm_fd->pgsql_res = NULL;
+    fprintf(stderr,"*** Error - ranlib SEQ failed:\n%s\n%s\n",tmp_query,
+            PQerrorMessage(lm_fd->pgsql_conn));
+    exit(1);
+  }
+  else {
+    lm_fd->pgsql_res = res;
+  }
+}
diff --git a/src/print_pssm.c b/src/print_pssm.c
new file mode 100644
index 0000000..f2db3d5
--- /dev/null
+++ b/src/print_pssm.c
@@ -0,0 +1,793 @@
+/* print_pssm.c - 21-Jan-2005 */
+
+/* $Id: print_pssm.c 1111 2013-01-09 18:46:57Z wrp $ */
+/* $Revision: 1111 $ */
+
+/* copyright (c) 2005, 2014 - William R. Pearson and The Rector &
+ Visitors of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+/*
+ read a binary PSSM checkpoint file from blastpgp, and produce an ascii
+ formatted file
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <math.h>
+#include <string.h>
+#ifdef UNIX
+#include <getopt.h>
+#else
+extern int optind; /* used by getopt() */
+extern char *optarg;
+#endif
+
+#include "defs.h"
+#include "param.h"
+
+#include "uascii.h"
+#include "upam.h"
+
+void initenv(int, char **, struct pstruct *, char *);
+void read_pssm(unsigned char *aa0, int n0, int nsq, double pamscale,
+ FILE *fp, int pgpf_type, struct pstruct *ppst);
+int
+read_asn_pssm(unsigned char *aa0, int n0, int nsq,
+ double pamscale, FILE *fp, struct pstruct *ppst);
+void alloc_pam();
+int **alloc_pam2p();
+void initpam2();
+void init_ascii0(int *xascii, char *sq_map, int n_sq_map, struct pstruct *ppst);
+void fill_pam();
+double get_lambda(int **pam2p, int n0, int nsq, char *aa0);
+
+int standard_pam(char *smstr, struct pstruct *ppst, int del_set, int gap_set);
+int getseq(char *filen, int *qascii, unsigned char *seq, int maxs, char *libstr,
+ int n_libstr, long *sq0off);
+int initpam (char *mfname, struct pstruct *ppst);
+int karlin(int , int, double *, double *, double *);
+
+/* print_pssm: read a PSSM file named with -P "file type" together with
+   a query sequence file, and print the PSSM in ascii form.
+
+   The file type selects the reader: types 0 and 1 go to read_pssm()
+   (opened "rb" and "r" respectively), type 2 goes to read_asn_pssm().
+
+   Returns 0 on success, 1 on any error. */
+int
+main(int argc, char **argv) {
+
+  unsigned char *aa0;
+  char libstr[MAX_FN];
+  char qname[MAX_FN];
+  long sq0off;
+  int i, n0;
+  FILE *fp;
+  struct pstruct pst, *ppst;
+
+  /* pst lives on the stack: clear it so that flags which are only set
+     by command line options (pam_pssm, pgpfile_type, pam_set, ...) have
+     a defined value when they are tested below */
+  memset(&pst, 0, sizeof(pst));
+  qname[0] = '\0';	/* initenv() only fills this in if a query is given */
+
+  /* stuff from initfa.c/h_init() */
+
+  memcpy(qascii,aascii,sizeof(qascii));
+
+  /* initialize a pam matrix */
+  ppst = &pst;
+  strncpy(ppst->pamfile,"BP62",MAX_FN);
+  standard_pam(ppst->pamfile,ppst,0,0);
+
+  /* this is always protein by default */
+  ppst->nsq = naa;
+  ppst->nsqx = naax;
+  for (i=0; i<=ppst->nsqx; i++) {
+    ppst->sq[i] = NCBIstdaa[i];
+  }
+  ppst->sq[ppst->nsqx+1] = ppst->sqx[ppst->nsqx+1] = '\0';
+
+  if ((aa0 = calloc(MAXTST,sizeof(char)))==NULL) {
+    fprintf(stderr,"Cannot allocate aa0\n");
+    exit(1);
+  }
+
+  initenv(argc, argv, &pst, qname);
+
+  /* argc is never < 1, so test what actually matters: a query file */
+  if (qname[0] == '\0') {
+    fprintf(stderr,"usage -- print_pssm -P \"pssm.asn 2\" query_file\n");
+    exit(1);
+  }
+
+  alloc_pam(pst.nsq+1,pst.nsq+1, &pst);
+  initpam2(&pst);
+  init_ascii0(qascii, NCBIstdaa_ext, NCBIstdaa_ext_n, &pst);
+
+  n0 = getseq (qname, qascii, aa0, MAXTST, libstr, sizeof(libstr), &sq0off);
+
+  if (!pst.pam_pssm) {
+    fprintf(stderr," ** ERROR ** No -P PSSM provided\n");
+    return 1;
+  }
+
+  pst.pam2p[0] = alloc_pam2p(n0,pst.nsq);
+  pst.pam2p[1] = alloc_pam2p(n0,pst.nsq);
+  if (pst.pam2p[0] == NULL || pst.pam2p[1] == NULL) {
+    /* alloc_pam2p() has already reported the failure */
+    return 1;
+  }
+
+  if (pst.pgpfile_type < 0 || pst.pgpfile_type > 2) {
+    fprintf(stderr," Unknown PSSM file type: %d\n",pst.pgpfile_type);
+    return 1;
+  }
+
+  /* only type 1 is opened in text mode */
+  if ((fp=fopen(pst.pgpfile,(pst.pgpfile_type == 1) ? "r" : "rb"))==NULL) {
+    fprintf(stderr," Cannot open PSSM file: %s\n",pst.pgpfile);
+    return 1;
+  }
+
+  if (pst.pgpfile_type == 2) {
+    /* the error exit belongs to the failure branch only; without the
+       braces the program exited with status 1 after a successful parse */
+    if (read_asn_pssm(aa0, n0, pst.nsq, pst.pamscale, fp, &pst)<=0) {
+      fprintf(stderr," Could not parse PSSM file: %s\n",pst.pgpfile);
+      pst.pam_pssm = 0;
+      fclose(fp);
+      return 1;
+    }
+  }
+  else {
+    read_pssm(aa0, n0, pst.nsq, pst.pamscale, fp, pst.pgpfile_type, &pst);
+  }
+
+  fclose(fp);
+  return 0;
+}
+
+/* initenv(): parse the print_pssm command line.
+
+   -P "file [type]"	PSSM file name, optionally followed by a blank
+			and a numeric file type (0 when absent); sets
+			ppst->pgpfile, ppst->pgpfile_type, ppst->pam_pssm
+   -s matrix		scoring matrix: tried first as a standard_pam()
+			name, otherwise loaded with initpam(); sets
+			ppst->pam_set
+
+   The first non-option argument, if any, is copied to qname (a buffer
+   of MAX_FN characters); qname is left untouched when there is none. */
+void
+initenv(int argc, char **argv, struct pstruct *ppst, char *qname) {
+  int copt;	/* must be int: getopt() returns -1 at the end, which a
+		   plain char cannot represent where char is unsigned */
+  char *bp;
+
+  pascii = aascii;
+
+  while ((copt = getopt(argc, argv, "P:s:")) != -1) {
+    switch (copt) {
+    case 'P':
+      if ((bp=(strchr(optarg,' ')))!=NULL) {
+        *bp = '\0';
+        ppst->pgpfile_type = atoi(bp+1);
+      }
+      else {
+        /* no type given: do not leave the field unset */
+        ppst->pgpfile_type = 0;
+      }
+      strncpy(ppst->pgpfile,optarg,MAX_FN);
+      ppst->pgpfile[MAX_FN-1]='\0';
+      ppst->pam_pssm = 1;
+      break;
+
+    case 's':
+      strncpy (ppst->pamfile, optarg, 120);
+      ppst->pamfile[120-1]='\0';
+      if (!standard_pam(ppst->pamfile,ppst,0, 0)) {
+        initpam (ppst->pamfile, ppst);
+      }
+      ppst->pam_set=1;
+      break;
+    }
+  }
+  optind--;
+
+  /* after the decrement, argv[optind+1] is the first non-option word */
+  if (argc - optind > 1) {
+    strncpy(qname, argv[optind+1], MAX_FN);
+    qname[MAX_FN-1]='\0';	/* strncpy() does not terminate on truncation */
+  }
+}
+
+
+/*
+ *aa0 - query sequence
+ n0 - length
+ pamscale - scaling for pam matrix - provided by apam.c, either
+ 0.346574 = ln(2)/2 (P120, BL62) or
+ 0.231049 = ln(2)/3 (P250, BL50)
+*/
+
+#define N_EFFECT 20
+
+/* read_pssm(): read a binary checkpoint profile from fp, print the
+   log-odds table to stdout, and fill ppst->pam2p[0] with integer scores.
+
+   File layout as read here: an int length (must equal n0), then that
+   many query characters, then N_EFFECT doubles per query position.
+
+   Each frequency > 1e-12 is converted to log(freq/background)/pamscale,
+   with the background taken from rrcounts[]/rrtotal.  The table is then
+   multiplied by a scale factor chosen so that the lambda of the profile
+   (get_lambda()) matches the lambda of ppst->pam2[0]: the scale is
+   first bracketed in growing steps and then bisected ten times.
+   Diagnostics and the final integer profile go to stderr.
+
+   nsq and pgf_type are not used in this function.  Any read or
+   allocation failure is fatal (exit(1)). */
+void
+read_pssm(unsigned char *aa0, int n0, int nsq, double pamscale, FILE *fp, int pgf_type, struct pstruct *ppst) {
+  int i, len;
+  int qi, rj;
+  int **pam2p;
+  int first, too_high;
+  char *query;
+  double freq, **freq2d, lambda, new_lambda;
+  double scale, scale_high, scale_low;
+
+  pam2p = ppst->pam2p[0];
+
+  len = 0;	/* so the message below never prints an indeterminate value */
+  if(1 != fread(&len, sizeof(int), 1, fp)) {
+    fprintf(stderr, "error reading from checkpoint file: %d\n", len);
+    exit(1);
+  }
+
+  if(len != n0) {
+    fprintf(stderr, "profile length (%d) and query length (%d) don't match!\n",
+            len,n0);
+    exit(1);
+  }
+
+  /* read over query sequence stored in BLAST profile; one extra
+     (zeroed) byte keeps it NUL-terminated for the %s prints below */
+  if(NULL == (query = (char *) calloc(len+1, sizeof(char)))) {
+    fprintf(stderr, "Couldn't allocate memory for query!\n");
+    exit(1);
+  }
+
+  if(len != fread(query, sizeof(char), len, fp)) {
+    fprintf(stderr, "Couldn't read query sequence from profile: %s\n", query);
+    exit(1);
+  }
+
+  printf("%d\n%s\n",len,query);
+
+  /* currently we don't do anything with query; ideally, we should
+     check to see that it actually matches aa0 ... */
+
+  /* quick 2d array alloc: */
+  if((freq2d = (double **) calloc(n0, sizeof(double *))) == NULL) {
+    fprintf(stderr, "Couldn't allocate memory for frequencies!\n");
+    exit(1);
+  }
+
+  if((freq2d[0] = (double *) calloc(n0 * N_EFFECT, sizeof(double))) == NULL) {
+    fprintf(stderr, "Couldn't allocate memory for frequencies!\n");
+    exit(1);
+  }
+
+  /* a little pointer arithmetic to fill out 2d array: */
+  for (qi = 1 ; qi < n0 ; qi++) {
+    freq2d[qi] = freq2d[0] + (N_EFFECT * qi);
+  }
+
+  printf(" rrtotal: %ld; pamscale=%0.5f\n", rrtotal, pamscale);
+
+  printf(" ");
+  for (rj = 1; rj <= N_EFFECT; rj++ ) {
+    printf(" %c ",NCBIstdaa[rj]);
+  }
+  printf("\n");
+
+  for (qi = 0 ; qi < n0 ; qi++) {
+    printf("%c",query[qi]);
+    for (rj = 0 ; rj < N_EFFECT ; rj++) {
+      if(1 != fread(&freq, sizeof(double), 1, fp)) {
+        fprintf(stderr, "Error while reading frequencies!\n");
+        exit(1);
+      }
+      /* printf(" %8.7g",freq*10.0); */
+
+      if (freq > 1e-12) {
+        freq = log(freq /((double) (rrcounts[rj+1])/(double) rrtotal));
+        freq /= pamscale; /* this gets us close to original pam scores */
+        freq2d[qi][rj] = freq;
+      }
+      else {freq2d[qi][rj] = freq;}
+      printf(" %5.2f",freq);
+    }
+    printf("\n");
+  }
+
+
+  /* now figure out the right scale */
+  scale = 1.0;
+  lambda = get_lambda(ppst->pam2[0], 20, 20, "\0ARNDCQEGHILKMFPSTWYV");
+
+  /* should be near 1.0 because of our initial scaling by ppst->pamscale */
+  fprintf(stderr, "real_lambda: %g\n", lambda);
+
+  /* get initial high/low scale values: the first pass decides the
+     search direction, later passes widen the bracket until the
+     comparison against lambda flips */
+  first = 1;
+  while (1) {
+    fill_pam(pam2p, n0, 20, freq2d, scale);
+    new_lambda = get_lambda(pam2p, n0, 20, query);
+    fprintf(stderr, "new_lambda: %g; scale: %g\n",new_lambda,scale);
+
+    if (new_lambda > lambda) {
+      if (first) {
+        first = 0;
+        scale = scale_high = 1.0 + 0.05;
+        scale_low = 1.0;
+        too_high = 1;
+      } else {
+        if (!too_high) break;
+        scale = (scale_high += scale_high - 1.0);
+      }
+    } else if (new_lambda > 0) {
+      if (first) {
+        first = 0;
+        scale_high = 1.0;
+        scale = scale_low = 1.0 - 0.05;
+        too_high = 0;
+      } else {
+        if (too_high) break;
+        scale = (scale_low += scale_low - 1.0);
+      }
+    } else { /* new_lambda <= 0 */
+      fprintf(stderr, "new_lambda (%g) <= 0; matrix has positive average score", new_lambda);
+      exit(1);
+    }
+  }
+
+  /* now do binary search between low and high */
+  for (i = 0 ; i < 10 ; i++) {
+    scale = 0.5 * (scale_high + scale_low);
+    fill_pam(pam2p, n0, 20, freq2d, scale);
+    new_lambda = get_lambda(pam2p, n0, 20, query);
+    fprintf(stderr, "it: %d - new_lambda: %g; scale: %g\n",i,new_lambda,scale);
+
+    if (new_lambda > lambda) scale_low = scale;
+    else scale_high = scale;
+  }
+
+  scale = 0.5 * (scale_high + scale_low);
+  fill_pam(pam2p, n0, 20, freq2d, scale);
+
+  fprintf(stderr, "final scale: %g\n", scale);
+
+  fprintf(stderr," ");
+  for (rj = 1; rj <= N_EFFECT+3; rj++ ) {
+    fprintf(stderr," %c",NCBIstdaa[rj]);
+  }
+  fprintf(stderr,"\n");
+
+  for (qi = 0 ; qi < n0 ; qi++) {
+    fprintf(stderr, "%4d %c: ", qi+1, query[qi]);
+    for (rj = 1 ; rj <= N_EFFECT+3 ; rj++) {
+      fprintf(stderr, "%4d", pam2p[qi][rj]);
+    }
+    fprintf(stderr, "\n");
+  }
+
+  free(freq2d[0]);
+  free(freq2d);
+
+  free(query);
+}
+
+/*
+ * alloc_pam(): allocate the two d1 x d2 integer matrices ppst->pam2[0]
+ * and ppst->pam2[1].  Each matrix is a vector of d1 row pointers into
+ * one contiguous block of d1*d2 ints.  Any allocation failure is fatal.
+ */
+void
+alloc_pam (int d1, int d2, struct pstruct *ppst)
+{
+  int row, which;
+  int *cells;
+
+  /* the two row-pointer vectors are allocated first */
+  for (which = 0; which < 2; which++) {
+    ppst->pam2[which] = (int **) malloc (d1 * sizeof (int *));
+    if (ppst->pam2[which] == NULL) {
+      fprintf(stderr,"Cannot allocate 2D pam matrix: %d",d1);
+      exit(1);
+    }
+  }
+
+  /* storage for pam2[0], then hook up its rows */
+  cells = (int *) malloc (d1 * d2 * sizeof (int));
+  if (cells == NULL) {
+    fprintf(stderr,"Cannot allocate 2D pam matrix: %d",d1);
+    exit(1);
+  }
+  for (row = 0; row < d1; row++) {
+    ppst->pam2[0][row] = cells;
+    cells += d2;
+  }
+
+  /* storage for pam2[1], then hook up its rows */
+  cells = (int *) malloc (d1 * d2 * sizeof (int));
+  if (cells == NULL) {
+    fprintf(stderr,"Cannot allocate 2d pam matrix: %d",d2);
+    exit(1);
+  }
+  for (row = 0; row < d1; row++) {
+    ppst->pam2[1][row] = cells;
+    cells += d2;
+  }
+}
+
+/* fill_pam(): convert the scaled frequency table into integer scores.
+   For every profile column (0..nsq-1) and every query position
+   (0..n0-1), scale*freq2d[pos][col] is rounded half away from zero and
+   stored in pam2p[pos][k], where k = qascii[pssm_aa[col+1]]. */
+void
+fill_pam(int **pam2p, int n0, int nsq, double **freq2d, double scale) {
+  int pos, col, dest;
+  double scaled;
+
+  for (col = 0; col < nsq; col++) {
+    /* residue index that this profile column is stored under */
+    dest = qascii[pssm_aa[col+1]];
+
+    for (pos = 0; pos < n0; pos++) {
+      scaled = scale * freq2d[pos][col];
+      /* +/-0.5 before truncation rounds half away from zero */
+      if (scaled < 0.0) { scaled -= 0.5; }
+      else { scaled += 0.5; }
+      pam2p[pos][dest] = (int)scaled;
+    }
+  }
+}
+
+/*
+ * initpam2(struct pstruct *ppst): expand the 1-D lower-triangle pam[]
+ * array into the symmetric 2-D matrix ppst->pam2[0], subtracting
+ * ppst->pamoff from every score.  Row 0 and column 0 are set to
+ * -BIGNUM, and the lowest/highest score seen is recorded in
+ * ppst->pam_l / ppst->pam_h.  The 'X' and '*' rows and columns are
+ * then either overridden from pam_xm/pam_xx (when pam_x_set) or used
+ * to fill in pam_xx/pam_xm.
+ */
+void initpam2 (struct pstruct *ppst)
+{
+  int row, col, next, n_res;
+  int ix_x, ix_term, score;
+  int **mat;
+
+  mat = ppst->pam2[0];
+  n_res = ppst->nsq;
+  ix_x = pascii['X'];
+  ix_term = pascii['*'];
+
+  mat[0][0] = -BIGNUM;
+  ppst->pam_h = -1;
+  ppst->pam_l = 1;
+
+  /* pam[] holds the lower triangle row by row */
+  next = 0;
+  for (row = 1; row <= n_res; row++) {
+    mat[row][0] = -BIGNUM;
+    mat[0][row] = -BIGNUM;
+    for (col = 1; col <= row; col++) {
+      score = pam[next++] - ppst->pamoff;
+      mat[row][col] = score;
+      mat[col][row] = score;
+      if (ppst->pam_l > score) ppst->pam_l = score;
+      if (ppst->pam_h < score) ppst->pam_h = score;
+    }
+  }
+
+  ppst->nt_align = (ppst->dnaseq== SEQT_DNA || ppst->dnaseq == SEQT_RNA);
+
+  if (ppst->dnaseq == SEQT_RNA) {
+    /* A:G, C:T and C:U score one less than the G:G identity */
+    score = mat[nascii['G']][nascii['G']] - 1;
+    mat[nascii['C']][nascii['U']] = score;
+    mat[nascii['C']][nascii['T']] = score;
+    mat[nascii['A']][nascii['G']] = score;
+  }
+
+  if (!ppst->pam_x_set) {
+    /* no override given: remember what the matrix itself uses */
+    ppst->pam_xx = mat[ix_x][ix_x];
+    ppst->pam_xm = mat[1][ix_x];
+    return;
+  }
+
+  for (row = 1; row <= n_res; row++) {
+    mat[ix_x][row] = mat[row][ix_x] = ppst->pam_xm;
+    mat[ix_term][row] = mat[row][ix_term] = ppst->pam_xm;
+  }
+  mat[ix_x][ix_x] = ppst->pam_xx;
+  mat[ix_term][ix_term] = ppst->pam_xm;
+}
+
+/* get_lambda(): estimate the Karlin-Altschul lambda of a scoring table.
+
+   pam2p	- n0 rows of scores; columns 1..nsq are used.  If
+		  pam2p[0][1] is -BIGNUM, row 0 is treated as a dummy row
+		  and rows 1..n0 are used instead.
+   aa0	- one residue per row: values below 'A' are used directly
+		  as indices into rrcounts[], anything else is mapped
+		  through aascii[].
+
+   A score distribution is built by weighting every cell with
+   rrcounts[row residue]*rrcounts[column], normalised to sum to 1, and
+   handed to karlin().
+
+   Returns lambda.  If karlin() fails, a message is printed and the
+   value returned is whatever karlin() left in lambda, or 0.0 if it did
+   not set it (it was indeterminate before). */
+double
+get_lambda(int **pam2p, int n0, int nsq, char *aa0) {
+  double lambda, H;
+  double *pr, tot, sum;
+  int aa0i;
+  int i, ioff, j, min, max;
+
+  /* defined values in case karlin() fails without setting them */
+  lambda = 0.0;
+  H = 0.0;
+
+  /* get min and max scores */
+  min = BIGNUM;
+  max = -BIGNUM;
+  if(pam2p[0][1] == -BIGNUM) {
+    ioff = 1;
+    n0++;
+  } else {
+    ioff = 0;
+  }
+
+  for (i = ioff ; i < n0 ; i++) {
+    for (j = 1; j <= nsq ; j++) {
+      if (min > pam2p[i][j])
+        min = pam2p[i][j];
+      if (max < pam2p[i][j])
+        max = pam2p[i][j];
+    }
+  }
+
+  fprintf(stderr, "min: %d\tmax:%d\n", min, max);
+
+  /* one probability slot per possible score in [min, max] */
+  if ((pr = (double *) calloc(max - min + 1, sizeof(double))) == NULL) {
+    fprintf(stderr, "Couldn't allocate memory for score probabilities: %d\n", max - min + 1);
+    exit(1);
+  }
+
+  tot = (double) rrtotal * (double) rrtotal * (double) n0;
+  for (i = ioff ; i < n0 ; i++) {
+    if (aa0[i] < 'A') {aa0i = aa0[i];}
+    else {aa0i = aascii[aa0[i]];}
+    for (j = 1; j <= nsq ; j++) {
+      pr[pam2p[i][j] - min] +=
+        (double) ((double) rrcounts[aa0i] * (double) rrcounts[j]) / tot;
+    }
+  }
+
+  sum = 0.0;
+  for(i = 0 ; i <= max-min ; i++) {
+    sum += pr[i];
+    /* fprintf(stderr, "%3d: %g %g\n", i+min, pr[i], sum); */
+  }
+  /* fprintf(stderr, "pr[] sum: %g\n", sum); */
+
+  for(i = 0 ; i <= max-min ; i++) { pr[i] /= sum; }
+
+  if (!karlin(min, max, pr, &lambda, &H)) {
+    fprintf(stderr, "Karlin lambda estimation failed\n");
+  }
+
+  /* fprintf(stderr, "lambda: %g\n", lambda); */
+  free(pr);
+
+  return lambda;
+}
+
+/* alloc_pam2p(): allocate a zero-filled profile matrix of len rows by
+   (nsq+1) columns.  The result is a vector of len row pointers into one
+   contiguous block, to be released with free_pam2p().
+
+   Returns NULL, after printing a message, if len <= 0, nsq < 0, or
+   memory cannot be obtained. */
+int **
+alloc_pam2p(int len, int nsq) {
+  int i;
+  int **pam2p;
+  size_t n_cols;
+
+  /* a zero-length request would otherwise store pam2p[0] into a
+     zero-sized allocation */
+  if (len <= 0 || nsq < 0) {
+    fprintf(stderr," Cannot allocate pam2p: %d\n",len);
+    return NULL;
+  }
+
+  n_cols = (size_t)nsq + 1;
+
+  if ((pam2p = (int **)calloc((size_t)len,sizeof(int *)))==NULL) {
+    fprintf(stderr," Cannot allocate pam2p: %d\n",len);
+    return NULL;
+  }
+
+  /* do the size arithmetic in size_t rather than int */
+  if((pam2p[0] = (int *)calloc(n_cols*(size_t)len,sizeof(int)))==NULL) {
+    fprintf(stderr, "Cannot allocate pam2p[0]: %d\n", (nsq+1)*len);
+    free(pam2p);
+    return NULL;
+  }
+
+  for (i=1; i<len; i++) {
+    pam2p[i] = pam2p[0] + ((size_t)i*n_cols);
+  }
+
+  return pam2p;
+}
+
+/* free_pam2p(): release a matrix whose rows all live in the single
+   block pointed to by pam2p[0] (the layout alloc_pam2p() builds).
+   A NULL pam2p is ignored. */
+void free_pam2p(int **pam2p) {
+  if (pam2p == NULL) return;
+
+  free(pam2p[0]);	/* the contiguous cell storage */
+  free(pam2p);		/* the row-pointer vector */
+}
+
+int
+parse_pssm_asn_fa(FILE *afd, int *n_rows, int *n_cols,
+ unsigned char **query, double ***wfreqs,double ***freqs, int ***iscores,
+ char *matrix, int *gap_open, int *gap_extend,
+ double *lambda);
+
+/* the ASN.1 pssm includes information about the scoring matrix used
+ (though not the gap penalty in the current version PSSM:2) The PSSM
+ scoring matrix and gap penalties should become the default if they
+ have not been set explicitly.
+*/
+
+/* read the PSSM from an open FILE *fp - but nothing has been read
+ from *fp */
+
+/* read_asn_pssm(): read an ASN.1 PSSM from fp with parse_pssm_asn_fa(),
+   print the log-odds table to stdout, and fill ppst->pam2p[0] with
+   integer scores.
+
+   Gap penalties reported by the parser are stored (as negative values)
+   in ppst->gdelval / ppst->ggapval; when the parser reports none and
+   the matrix is BLOSUM62, -11 / -1 are used.  For a BLOSUM62 PSSM with
+   no matrix set by the user (pam_set == 0), BP62 is installed.
+
+   Frequencies > 1e-12 are converted to log(freq/background)/pamscale
+   and the table is rescaled so that its lambda matches that of
+   ppst->pam2[0], first bracketing the scale and then bisecting ten
+   times.  Diagnostics go to stderr.
+
+   nsq is not used in this function.
+
+   Returns 1 on success, -1 if the file cannot be parsed; exits if the
+   PSSM has fewer columns than the query has residues or the lambda
+   search fails. */
+int
+read_asn_pssm(unsigned char *aa0, int n0, int nsq,
+              double pamscale, FILE *fp, struct pstruct *ppst) {
+
+  int i, itmp;
+  int qi, rj;	/* qi - index query; rj - index residues (1-20) */
+  int **pam2p;
+  int first, too_high;
+  int query_owned;	/* non-zero if query came from the parser */
+  char *query;
+  char matrix[MAX_SSTR];
+  double psi2_lambda;
+  double freq, **wfreq2d, **freq2d, lambda, new_lambda;
+  double scale, scale_high, scale_low;
+  int **iscores2d;
+  int gap_open, gap_extend;
+  int n_rows, n_cols;
+
+  pam2p = ppst->pam2p[0];
+
+  if (parse_pssm_asn_fa(fp, &n_rows, &n_cols, (unsigned char **)&query, &wfreq2d, &freq2d, &iscores2d,
+                        matrix, &gap_open, &gap_extend, &psi2_lambda)<=0) {
+    return -1;
+  }
+
+  /* if the PSSM carries no query, fall back on the caller's sequence;
+     remember that so it is not passed to free() at the end */
+  query_owned = (query != NULL);
+  if (!query) { query = (char *)aa0;}
+
+  if (gap_open) {
+    if (gap_open > 0) {gap_open = -gap_open;}
+    ppst->gdelval = gap_open;
+  }
+  else if (strncmp(matrix,"BLOSUM62",8)==0) {
+    ppst->gdelval = -11;
+  }
+
+  if (gap_extend) {
+    if (gap_extend > 0) {gap_extend = -gap_extend;}
+    ppst->ggapval = gap_extend;
+  }
+  else if (strncmp(matrix,"BLOSUM62",8)==0) {
+    ppst->ggapval = -1;
+  }
+
+  if (strncmp(matrix, "BLOSUM62", 8)== 0 && !ppst->pam_set) {
+    strncpy(ppst->pamfile, "BP62", 120);
+    strncpy(ppst->pamfile_save, ppst->pamfile, 120);
+    standard_pam(ppst->pamfile,ppst,0, 0);
+    if (!ppst->have_pam2) {
+      alloc_pam (MAXSQ, MAXSQ, ppst);
+    }
+    initpam2(ppst);
+    ppst->pam_set = 1;
+  }
+
+  if (n_cols < n0) {
+    fprintf(stderr, " query length: %d != n_cols: %d\n",n0, n_cols);
+    exit(1);
+  }
+
+  printf(" rrtotal: %ld; pamscale=%0.5f\n", rrtotal, pamscale);
+
+  printf(" ");
+  for (rj = 1; rj <= N_EFFECT; rj++ ) {
+    printf(" %c ",NCBIstdaa[rj]);
+  }
+  printf("\n");
+  for (qi = 0 ; qi < n0 ; qi++) {
+    printf("%3d %c",qi+1, NCBIstdaa[aa0[qi]]);
+    for (rj = 0 ; rj < N_EFFECT ; rj++) {
+      freq = freq2d[qi][rj];
+/*    printf(" %8.7g",freq*10.0); */
+
+      if (freq > 1e-12) {
+        freq = log(freq /((double) (rrcounts[rj+1])/(double) rrtotal));
+        freq /= pamscale; /* this gets us close to original pam scores */
+        freq2d[qi][rj] = freq;
+      }
+      printf(" %5.2f",freq);
+    }
+    printf("\n");
+  }
+
+  /* now figure out the right scale */
+  scale = 1.0;
+  lambda = get_lambda(ppst->pam2[0], 20, 20, "\0ARNDCQEGHILKMFPSTWYV");
+
+  /* should be near 1.0 because of our initial scaling by ppst->pamscale */
+  fprintf(stderr, "real_lambda: %g\n", lambda);
+
+  /* get initial high/low scale values: the first pass decides the
+     search direction, later passes widen the bracket until the
+     comparison against lambda flips */
+  first = 1;
+  while (1) {
+    fill_pam(pam2p, n0, 20, freq2d, scale);
+    new_lambda = get_lambda(pam2p, n0, 20, query);
+    fprintf(stderr, " new_lambda: %g; scale: %g\n",new_lambda,scale);
+
+    if (new_lambda > lambda) {
+      if (first) {
+        first = 0;
+        scale = scale_high = 1.0 + 0.05;
+        scale_low = 1.0;
+        too_high = 1;
+      } else {
+        if (!too_high) break;
+        scale = (scale_high += scale_high - 1.0);
+      }
+    } else if (new_lambda > 0) {
+      if (first) {
+        first = 0;
+        scale_high = 1.0;
+        scale = scale_low = 1.0 - 0.05;
+        too_high = 0;
+      } else {
+        if (too_high) break;
+        scale = (scale_low += scale_low - 1.0);
+      }
+    } else {
+      fprintf(stderr, "new_lambda (%g) <= 0; matrix has positive average score", new_lambda);
+      exit(1);
+    }
+  }
+
+  /* now do binary search between low and high */
+  for (i = 0 ; i < 10 ; i++) {
+    scale = 0.5 * (scale_high + scale_low);
+    fill_pam(pam2p, n0, 20, freq2d, scale);
+    new_lambda = get_lambda(pam2p, n0, 20, query);
+    fprintf(stderr, "it: %d - new_lambda: %g; scale: %g\n",i,new_lambda,scale);
+
+    if (new_lambda > lambda) scale_low = scale;
+    else scale_high = scale;
+  }
+
+  scale = 0.5 * (scale_high + scale_low);
+  fill_pam(pam2p, n0, 20, freq2d, scale);
+
+  fprintf(stderr, "final scale: %g\n", scale);
+
+  fprintf(stderr," ");
+  for (rj = 1; rj <= N_EFFECT+3; rj++ ) {
+    fprintf(stderr," %c",NCBIstdaa[rj]);
+  }
+  fprintf(stderr,"\n");
+
+  for (qi = 0 ; qi < n0 ; qi++) {
+    fprintf(stderr, "%4d %c: ", qi+1, NCBIstdaa[aa0[qi]]);
+    for (rj = 1 ; rj <= N_EFFECT+3 ; rj++) {
+      fprintf(stderr, "%3d", pam2p[qi][rj]);
+    }
+    fprintf(stderr, "\n");
+  }
+
+  /* integer scores stored in the file, if any, for comparison */
+  if (iscores2d != NULL) {
+    fprintf(stderr," ");
+    for (rj = 1; rj <= N_EFFECT+3; rj++ ) {
+      fprintf(stderr," %c ",NCBIstdaa[rj]);
+    }
+    fprintf(stderr,"\n");
+    for (qi = 0 ; qi < n0 ; qi++) {
+      fprintf(stderr, "%4d %c: ", qi+1, NCBIstdaa[aa0[qi]]);
+      for (rj = 1 ; rj <= N_EFFECT+3 ; rj++) {
+        itmp = iscores2d[qi][rj];
+        if (itmp < -256) itmp=0;
+        fprintf(stderr, "%3d", itmp );
+      }
+      fprintf(stderr, "\n");
+    }
+    free(iscores2d[0]);
+    free(iscores2d);
+  }
+
+  if (wfreq2d != NULL) {
+    free(wfreq2d[0]);
+    free(wfreq2d);
+  }
+
+  if (freq2d != NULL) {
+    free(freq2d[0]);
+    free(freq2d);
+  }
+
+  /* aa0 belongs to the caller: only free a query the parser gave us */
+  if (query_owned) free(query);
+  return 1;
+}
+
+/* altpam_copy_residue(): make residue ix_dst score exactly like ix_src
+   in mat: the gap row/column entries become -BIGNUM, the identity is
+   copied, and every pairing with a residue of pam_sq[1..pam_sq_n-1] is
+   copied in both directions (the matrix is not assumed symmetric). */
+static void
+altpam_copy_residue(int **mat, int ix_src, int ix_dst)
+{
+  int n, other;
+
+  mat[ix_dst][0] = mat[0][ix_dst] = -BIGNUM;
+  mat[ix_dst][ix_dst] = mat[ix_src][ix_src];
+
+  for (n = 1; n < pam_sq_n; n++) {
+    other = pascii[pam_sq[n]];
+    mat[ix_dst][other] = mat[ix_src][other];
+    mat[other][ix_dst] = mat[other][ix_src];
+  }
+}
+
+/* init_altpam(): add scores to ppst->pam2[0] for the residues 'J', 'O'
+   and 'U', which are not present in the 1-D matrices:
+     J (I/L) takes the larger of the I and L scores, unless pam_sq
+             already contains 'J';
+     O scores like K, U scores like C, when their index is below
+             ppst->nsq; otherwise pascii[] is changed so that O/o map
+             to K/k and U/u map to C/c. */
+void
+init_altpam(struct pstruct *ppst) {
+  int ile, leu, alt, other, n;
+  int **mat;
+
+  mat = ppst->pam2[0];
+
+  /* 'J': best of isoleucine and leucine */
+  ile = pascii['I'];
+  leu = pascii['L'];
+  alt = pascii['J'];
+  if (strchr(pam_sq,'J')==NULL) {
+    mat[alt][0] = mat[0][alt] = -BIGNUM;
+    /* get the identities */
+    mat[alt][alt] = max(mat[ile][ile],mat[leu][leu]);
+    for (n = 1; n < pam_sq_n; n++) {
+      other = pascii[pam_sq[n]];
+      /* do not assume symmetric matrices */
+      mat[alt][other] = max(mat[ile][other],mat[leu][other]);
+      mat[other][alt] = max(mat[other][ile],mat[other][leu]);
+    }
+  }
+
+  /* 'O' behaves like 'K' */
+  alt = pascii['O'];
+  if (alt < ppst->nsq) {	/* is it in the NCBIstdaa alphabet ? */
+    altpam_copy_residue(mat, pascii['K'], alt);
+  }
+  else {
+    pascii['O'] = pascii['K'];
+    pascii['o'] = pascii['k'];
+  }
+
+  /* 'U' behaves like 'C' */
+  alt = pascii['U'];
+  if (alt < ppst->nsq) {	/* is it in the NCBIstdaa alphabet */
+    altpam_copy_residue(mat, pascii['C'], alt);
+  }
+  else {
+    pascii['U'] = pascii['C'];
+    pascii['u'] = pascii['c'];
+  }
+}
+
diff --git a/src/pssm_asn_subs.c b/src/pssm_asn_subs.c
new file mode 100644
index 0000000..50b7844
--- /dev/null
+++ b/src/pssm_asn_subs.c
@@ -0,0 +1,1756 @@
+/* $Id: pssm_asn_subs.c 1265 2014-06-30 16:13:49Z wrp $ */
+
+/* copyright (C) 2005, 2014 by William R. Pearson and The Rector &
+ Visitors of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+/* read_asn_dest modified 26-Jul-2007 to skip over text/bytes if dest is NULL */
+
+/* this code is designed to parse the ASN.1 binary encoded scoremat
+ object produced by blastpgp -C file.ckpt_asn -u 2 */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "defs.h"
+
+int parse_pssm_asn();
+int parse_pssm2_asn();
+
+int
+parse_pssm_asn_fa(FILE *afd, int *n_rows, int *n_cols,
+ unsigned char **query, double ***wfreqs, double ***freqs, int ***iscores,
+ char *matrix, int *gap_open, int *gap_extend,
+ double *lambda);
+
+#define COMPO_NUM_TRUE_AA 20
+
+/**positions of true characters in protein alphabet*/
+/*
+static int trueCharPositions[COMPO_NUM_TRUE_AA] = {
+ 1,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,22
+};
+*/
+
+#define COMPO_LARGEST_ALPHABET 28
+
+/*
+static char ncbieaatoa[COMPO_LARGEST_ALPHABET] = {"-ABCDEFGHIJKLMNOPQRSTUVWXYZ"};
+
+static int alphaConvert[COMPO_LARGEST_ALPHABET] = {
+ (-1), 0, (-1), 4, 3, 6, 13, 7, 8, 9, 11, 10, 12, 2, 14, 5, 1, 15,
+ 16, 19, 17, (-1), 18, (-1), (-1), (-1), (-1), (-1)
+};
+*/
+
+/* pssm_aa_order[] holds 20 integer codes, one per standard amino acid,
+   listed in the order A R N D C Q E G H I L K M F P S T W Y V (see the
+   per-entry comments).
+   NOTE(review): the values look like NCBIstdaa residue codes -- confirm
+   against the code that indexes this table. */
+int pssm_aa_order[20] = { 1, /*A*/
+ 16, /*R*/
+ 13, /*N*/
+ 4, /*D*/
+ 3, /*C*/
+ 15, /*Q*/
+ 5, /*E*/
+ 7, /*G*/
+ 8, /*H*/
+ 9, /*I*/
+ 11, /*L*/
+ 10, /*K*/
+ 12, /*M*/
+ 6, /*F*/
+ 14, /*P*/
+ 17, /*S*/
+ 18, /*T*/
+ 20, /*W*/
+ 22, /*Y*/
+ 19}; /*V*/
+
+#define ABP *asnp->abp
+#define ABPP asnp->abp
+#define ABP_INC2 asnp->abp += 2
+
+#define ASN_SEQ 48
+#define ASN_SET 48
+#define ASN_SEQOF 49
+#define ASN_SETOF 49
+
+#define ASN_PSSM_QUERY 166
+#define ASN_PSSM2_VERSION 160
+#define ASN_PSSM2_QUERY 161
+#define ASN_PSSM2_MATRIX 162
+
+#define ASN_PSSM_IS_PROT 160
+#define ASN_PSSM_NROWS 162
+#define ASN_PSSM_NCOLS 163
+
+#define ASN_PSSM_BYCOL 165
+#define ASN_PSSM_INTERMED_DATA 167
+#define ASN_PSSM_INTERMED_RES_FREQS 160
+#define ASN_PSSM_INTERMED_WRES_FREQS 161
+#define ASN_PSSM_INTERMED_FREQ_RATIOS 162
+#define ASN_PSSM_INTERMED_INFO_CONTENT 163
+#define ASN_PSSM_INTERMED_GAPL_COLWTS 164
+#define ASN_PSSM_INTERMED_SIGMA 165
+#define ASN_PSSM_INTERMED_INTVAL_SIZE 166
+#define ASN_PSSM_INTERMED_NUM_MATCH_SEQ 167
+
+#define ASN_PSSM_FREQS 162
+
+#define ASN_PSSM_FINAL_DATA 168 /* Sequence */
+#define ASN_PSSM_FINAL_DATA_SCORES 160 /* sequence of integer */
+#define ASN_PSSM_FINAL_DATA_LAMBDA 161 /* real */
+#define ASN_PSSM_FINAL_DATA_KAPPA 162 /* real */
+#define ASN_PSSM_FINAL_DATA_H 163 /* real */
+#define ASN_PSSM_FINAL_DATA_SCALEF 164 /* integer */
+#define ASN_PSSM_FINAL_DATA_ULAMBDA 165 /* real */
+#define ASN_PSSM_FINAL_DATA_UKAPPA 166 /* real */
+#define ASN_PSSM_FINAL_DATA_UH 167 /* real */
+
+#define ASN_PSSM2_IS_PROTEIN 160
+#define ASN_PSSM2_MATRIX_NAME 161
+#define ASN_PSSM2_MATRIX_COMMENT 162 /* not used */
+#define ASN_PSSM2_NCOLS 163
+#define ASN_PSSM2_NROWS 164
+#define ASN_PSSM2_SCORES 165
+#define ASN_PSSM2_KARLIN_K 166
+#define ASN_PSSM2_FREQS 167
+
+#define ASN_IS_STR 26
+#define ASN_IS_SSTR 65
+#define ASN_IS_INT 2
+#define ASN_IS_BOOL 1
+#define ASN_IS_OCTSTR 4
+#define ASN_IS_OCTSSTR 65
+#define ASN_IS_REAL 9
+#define ASN_IS_ENUM 10
+#define ASN_IS_ENUM0 1
+
+#define ASN_OBJ_INT 160
+#define ASN_OBJ_STR 161
+
+/* buffered-reader state passed to the ASN.1 parsing functions in this file */
+struct asn_bstruct {
+ FILE *fd; /* input stream the buffer is refilled from with fread() */
+ unsigned char *buf; /* start of the read buffer */
+ unsigned char *abp; /* current parse position inside buf */
+ unsigned char *buf_max; /* one past the last valid byte currently in buf */
+ int len; /* byte count used to set buf_max when the buffer is refilled */
+};
+
+#define ASN_BUF 4096
+
+/* allocate a zeroed asn_bstruct together with a zeroed buf_siz-byte read
+   buffer; the program exits if either allocation fails */
+void *
+new_asn_bstruct(int buf_siz) {
+
+  struct asn_bstruct *bs_p;
+
+  bs_p = calloc(1,sizeof(struct asn_bstruct));
+  if (bs_p == NULL) {
+    fprintf(stderr, "cannot allocate asn_bstruct\n");
+    exit(1);
+  }
+
+  bs_p->buf = (unsigned char *)calloc(buf_siz, sizeof(char));
+  if (bs_p->buf == NULL) {
+    fprintf(stderr, " cannot allocate asn_buf (%d)\n",buf_siz);
+    exit(1);
+  }
+
+  return bs_p;
+}
+
+/* release an asn_bstruct and its read buffer; a NULL argument is ignored */
+void
+free_asn_bstruct(struct asn_bstruct *asnp) {
+
+  if (asnp != NULL) {
+    if (asnp->buf != NULL) { free(asnp->buf); }
+    free(asnp);
+  }
+}
+
+/* make sure at least v bytes are available at the current position.
+   If they are not, the unread tail is moved to the start of the buffer
+   and the rest of the buffer is refilled from asnp->fd.  Exits if v is
+   larger than ASN_BUF or if v bytes cannot be obtained.  Returns the
+   (possibly relocated) current position. */
+unsigned char *
+chk_asn_buf(struct asn_bstruct *asnp, int v) {
+  int n_read;
+
+  if (v > ASN_BUF) {
+    fprintf(stderr," attempt to read %d bytes ASN.1 data > buffer size (%d)\n",
+            v, ASN_BUF);
+    exit(1);
+  }
+
+  /* v bytes are already in the buffer */
+  if (asnp->abp + v <= asnp->buf_max) {
+    return asnp->abp;
+  }
+
+  /* slide the unread bytes down to the start of the buffer */
+  asnp->len = asnp->buf_max - asnp->abp;
+  memmove(asnp->buf, asnp->abp, asnp->len);
+  asnp->abp = asnp->buf;
+
+  /* top up the remainder of the buffer from the file, if there is one */
+  if (asnp->fd && !feof(asnp->fd)) {
+    n_read = fread(asnp->buf + asnp->len, sizeof(char), ASN_BUF - asnp->len, asnp->fd);
+    if (n_read != 0) {
+      asnp->len += n_read;
+    }
+  }
+
+  asnp->buf_max = asnp->buf + asnp->len;
+
+  if (asnp->len < v) {
+    fprintf(stderr, " Unable to read %d bytes\n",v);
+    exit(1);
+  }
+
+  return asnp->abp;
+}
+
+/* report a parse problem on stderr: the function name, the expected
+   token and its value in hex, followed by the next len bytes at the
+   current position (hex).  Returns the current position unchanged. */
+unsigned char *
+asn_error(char *func, char *token, int tval,
+          struct asn_bstruct *asnp, int len) {
+  int k = 0;
+
+  fprintf(stderr," %s %s [%0x]:",func, token, tval);
+  while (k < len) {
+    fprintf(stderr," %0x",asnp->abp[k]);
+    k++;
+  }
+  fprintf(stderr,"\n");
+
+  return asnp->abp;
+}
+
+/*
+   read_asn_dest() consumes v bytes from the ASN.1 stream, copying them
+   into oct_str when oct_str is not NULL.  Unlike chk_asn_buf(), it can
+   handle data entities longer than ASN_BUF (4096), refilling the
+   buffer as often as needed.
+
+   asnp    - buffered reader state
+   v       - number of bytes to consume
+   oct_str - destination buffer, or NULL to skip over the bytes
+   o_len   - size of oct_str; the program exits if v > o_len
+
+   Returns the position just after the consumed bytes.  When the bytes
+   were already in the buffer asnp->abp is NOT updated (callers assign
+   the return value); when a refill was needed asnp->abp is updated as
+   well.
+
+   Exits if the input ends before v bytes have been read, in the same
+   way chk_asn_buf() does.
+*/
+unsigned char *
+read_asn_dest(struct asn_bstruct *asnp, int v, unsigned char *oct_str, int o_len) {
+  int n_read;
+  unsigned char *oct_ptr;
+
+  if (oct_str != NULL && v > o_len) {
+    fprintf(stderr, " read_asn_dest - cannot read %d bytes into %d buffer\n",
+            v, o_len);
+    exit(1);
+  }
+
+  /* everything requested is already buffered */
+  if (asnp->abp + v <= asnp->buf_max) {
+    if (oct_str != NULL) memmove(oct_str, asnp->abp, v);
+    return asnp->abp+v;
+  }
+
+  /* use up what is left in the buffer */
+  asnp->len = asnp->buf_max - asnp->abp;
+
+  oct_ptr = NULL;
+  if (oct_str != NULL) {
+    memmove(oct_str, asnp->abp, asnp->len);
+    oct_ptr = oct_str + asnp->len;
+  }
+  v -= asnp->len;
+
+  asnp->abp = asnp->buf;
+
+  /* refill the buffer until the remaining v bytes have been seen */
+  while (asnp->fd != NULL &&
+         (n_read = fread(asnp->buf, sizeof(char), ASN_BUF, asnp->fd)) != 0) {
+    asnp->len = n_read;
+    asnp->buf_max = asnp->buf + asnp->len;
+
+    if (v <= n_read) { /* we have it all this time */
+      if (oct_str != NULL) memmove(oct_ptr, asnp->buf, v);
+      asnp->len -= v;
+      asnp->abp = asnp->buf + v;
+      return asnp->abp;
+    }
+
+    /* we need to read some more; advance the destination so the next
+       chunk is appended instead of overwriting this one */
+    if (oct_str != NULL) {
+      memmove(oct_ptr, asnp->buf, n_read);
+      oct_ptr += n_read;
+    }
+    v -= n_read;
+  }
+
+  /* input ended (or there is no file) before the data was complete */
+  fprintf(stderr, " Unable to read %d bytes\n",v);
+  exit(1);
+
+  return NULL; /* not reached */
+}
+
+/* read a boolean (tag byte 1, length byte, value byte) into *val.
+   If the tag is missing a message is printed and *val is 0; a length
+   other than 1 is reported but one value byte is still read.
+   Returns the new position. */
+unsigned char *
+get_astr_bool(struct asn_bstruct *asnp, int *val) {
+
+  int n_bytes;
+  int b_val = 0;
+
+  asnp->abp = chk_asn_buf(asnp,16);
+
+  if (*asnp->abp++ == 1) {
+    n_bytes = *asnp->abp++;
+    if (n_bytes != 1) {
+      fprintf(stderr, "boolean length != 1 : %d\n", n_bytes);
+    }
+    b_val = *asnp->abp++;
+  }
+  else {
+    fprintf(stderr," bool missing\n");
+  }
+
+  *val = b_val;
+  return asnp->abp;
+}
+
+/* get_astr_int() reads an integer (tag ASN_IS_INT, one length byte,
+   then big-endian content bytes) at the current position and stores it
+   in *val.
+
+   The value is accumulated in an unsigned long and sign-extended from
+   the top bit of the first content byte, so negative values of any
+   encoded length (not only 1 or 2 bytes) and values wider than an int
+   decode correctly as long as they fit in a long.
+
+   Returns the position after the integer.  If the tag is not
+   ASN_IS_INT the problem is reported with asn_error(), *val is set to
+   0, and the position just past the unexpected tag byte is returned. */
+unsigned char *
+get_astr_int(struct asn_bstruct *asnp, long *val) {
+
+  int i_len, v_len, is_neg;
+  unsigned long uv;
+  long v;
+
+  asnp->abp = chk_asn_buf(asnp,32);
+
+  if (*asnp->abp++ != ASN_IS_INT) { /* check for int */
+    *val = 0; /* do not leave the caller's variable unset */
+    return asn_error("get_astr_int", "ASN_IS_INT", ASN_IS_INT, asnp, 4);
+  }
+
+  i_len = v_len = *asnp->abp++;
+
+  /* the sign is carried by the top bit of the first content byte */
+  is_neg = (v_len > 0 && (*asnp->abp & 0x80) != 0);
+
+  uv = 0;
+  while (i_len-- > 0) {
+    uv = (uv << 8) | *asnp->abp++;
+  }
+
+  v = (long)uv;
+  /* sign-extend encodings shorter than a long; a full-width encoding
+     already has its sign bit in place after the conversion above */
+  if (is_neg && v_len < (int)sizeof(long)) {
+    v -= (1L << (8 * v_len));
+  }
+
+  *val = v;
+  return asnp->abp;
+}
+
+/* read a text-encoded real (tag ASN_IS_REAL, length byte, a leading
+   '\0' byte, then the number as characters) into *val with sscanf().
+   A missing tag or missing leading '\0' is reported on stderr and the
+   position at which the problem was seen is returned.  Otherwise the
+   position after the value is returned. */
+unsigned char *
+get_astr_real(struct asn_bstruct *asnp,
+              double *val) {
+
+  int n_bytes;
+
+  asnp->abp = chk_asn_buf(asnp,32);
+
+  if (asnp->abp[0] != ASN_IS_REAL) {
+    fprintf(stderr," real missing\n");
+    return asnp->abp;
+  }
+
+  /* step over the tag and the length byte */
+  n_bytes = asnp->abp[1];
+  asnp->abp += 2;
+
+  *val = 0.0;
+  if (asnp->abp[0] != '\0') {
+    fprintf(stderr," float missing\n");
+    return asnp->abp;
+  }
+
+  sscanf((char *)asnp->abp+1,"%lg",val);
+  asnp->abp += n_bytes;
+
+  return asnp->abp;
+}
+
+/* read an enumerated value (tag ASN_IS_ENUM, length byte, big-endian
+   content) into *val; if the tag is missing a message is printed and
+   *val is 0.  Returns the new position. */
+unsigned char *
+get_astr_enum(struct asn_bstruct *asnp, int *val) {
+
+  int n_bytes;
+  int e_val = 0;
+
+  asnp->abp = chk_asn_buf(asnp,16);
+
+  if (*asnp->abp++ == ASN_IS_ENUM) {
+    for (n_bytes = *asnp->abp++; n_bytes > 0; n_bytes--) {
+      e_val = e_val * 256 + *asnp->abp++;
+    }
+  }
+  else {
+    fprintf(stderr," enum missing\n");
+  }
+
+  *val = e_val;
+  return asnp->abp;
+}
+
+/* get_astr_packedreal() reads one text-encoded real (tag ASN_IS_REAL,
+   length byte v_len, a leading '\0' byte, then v_len-1 characters) and
+   stores it in *d_val_p.  l_val_p is unused; it is present so this
+   function has the same signature as get_astr_packedint().
+
+   The characters are copied into a 64-byte scratch buffer; the copy is
+   clamped to the buffer size so an over-long or zero length byte can
+   no longer write outside it.  An over-long value is still reported
+   and the full v_len bytes are still skipped in the input.
+
+   A zero-length value (no content bytes) is returned as 0.0.
+   On a missing tag *d_val_p is 0; on a missing leading '\0' it is -1.0.
+   Returns the new position. */
+unsigned char *
+get_astr_packedreal(struct asn_bstruct *asnp, long *l_val_p, double *d_val_p) {
+
+  int v_len, n_copy;
+  char tmp_str[64];
+
+  asnp->abp = chk_asn_buf(asnp,32);
+
+  if (*asnp->abp++ != ASN_IS_REAL) { /* check for packed float */
+    fprintf(stderr,"*** error [%s:%d] - float missing\n",__FILE__,__LINE__);
+    *d_val_p = 0;
+    return asnp->abp;
+  }
+
+  v_len = *asnp->abp++;
+
+  if (v_len == 0) { /* no content bytes: nothing to scan */
+    *d_val_p = 0.0;
+    return asnp->abp;
+  }
+
+  if (v_len > 63) {
+    fprintf(stderr,"*** error [%s:%d] - real string too long: %d\n",__FILE__,__LINE__,v_len);
+  }
+
+  asnp->abp = chk_asn_buf(asnp,v_len+16);
+
+  if (v_len == 2 && *asnp->abp == '\0' && *(asnp->abp+1)=='0') {
+    ABP_INC2;
+    *d_val_p = 0.0;
+    return asnp->abp;
+  }
+
+  /* copy and scan it */
+  if (*asnp->abp != '\0') {
+    fprintf(stderr, "*** error [%s:%d] - packedreal - expected 0, got %d\n", __FILE__,__LINE__,*asnp->abp);
+    *d_val_p = -1.0;
+    return asnp->abp;
+  }
+  asnp->abp++;
+
+  /* v_len counts the leading '\0', so the text is v_len-1 characters;
+     never copy more than the scratch buffer can hold */
+  n_copy = v_len - 1;
+  if (n_copy > (int)sizeof(tmp_str) - 1) { n_copy = (int)sizeof(tmp_str) - 1; }
+  strncpy(tmp_str, (char *)asnp->abp, n_copy);
+  tmp_str[n_copy] = '\0';
+
+  sscanf(tmp_str,"%lg", d_val_p);
+  asnp->abp += v_len-1;
+
+  return asnp->abp;
+}
+
+/* read one integer into *l_val_p via get_astr_int(); d_val_p is unused
+   (the signature matches get_astr_packedreal()).  Returns the new
+   position. */
+unsigned char *
+get_astr_packedint(struct asn_bstruct *asnp, long *l_val_p, double *d_val_p) {
+
+  asnp->abp = chk_asn_buf(asnp,32);
+  asnp->abp = get_astr_int(asnp, l_val_p);
+
+  return asnp->abp;
+}
+
+/* get_astr_str() reads a string (tag ASN_IS_STR or ASN_IS_SSTR) at the
+   current position.
+
+   asnp  - buffered reader state
+   text  - destination buffer, or NULL to skip the string
+   t_len - size of text in bytes, including room for the terminator
+
+   At most t_len-1 bytes are stored and text is always '\0'-terminated
+   inside the buffer; any remaining bytes of the string are skipped.
+   (Previously a string of t_len or more bytes put the terminator at
+   text[t_len], one past the end of a t_len-byte buffer.)
+
+   Returns the position after the string.  If the tag is not a string
+   tag, the problem is reported with asn_error() and the unchanged
+   position is returned. */
+unsigned char *
+get_astr_str(struct asn_bstruct *asnp, char *text, int t_len) {
+
+  int v_len, tv_len, n_keep;
+
+  asnp->abp = chk_asn_buf(asnp,32);
+
+  if (text != NULL && t_len > 0) text[0] = '\0';
+
+  if (ABP != ASN_IS_STR && ABP != ASN_IS_SSTR) { /* check for str */
+    return asn_error("get_astr_str", "ASN_IS_STR", ASN_IS_STR, asnp, 4);
+  }
+  asnp->abp++;
+
+  v_len = *asnp->abp++;
+  if (v_len > 128) { /* need to read the length from the next bytes */
+    tv_len = v_len &0x7f;
+
+    asnp->abp = chk_asn_buf(asnp,tv_len+32);
+
+    for (v_len =0; tv_len; tv_len--) { v_len = (v_len << 8) + *asnp->abp++; }
+  }
+
+  /* number of bytes that can be stored while leaving room for '\0' */
+  n_keep = 0;
+  if (text != NULL && t_len > 0) {
+    n_keep = (v_len < t_len) ? v_len : t_len - 1;
+  }
+
+  /* store what fits ... */
+  if (n_keep > 0) {
+    asnp->abp = read_asn_dest(asnp, n_keep, (unsigned char *)text, t_len);
+  }
+  /* ... and skip whatever does not */
+  if (v_len > n_keep) {
+    asnp->abp = read_asn_dest(asnp, v_len - n_keep, NULL, 0);
+  }
+
+  if (text != NULL && t_len > 0) { text[n_keep]='\0'; }
+  return asnp->abp;
+}
+
+/* get_astr_octstr() reads an octet string (tag ASN_IS_OCTSTR or
+   ASN_IS_OCTSSTR) at the current position.
+
+   asnp    - buffered reader state
+   oct_str - destination buffer, or NULL to skip the data
+   o_len   - size of oct_str in bytes, including room for a trailing '\0'
+
+   At most o_len-1 bytes are stored, followed by a '\0' that is always
+   inside the buffer; the rest of the octet string is skipped.
+   (Previously data of o_len or more bytes put the '\0' at
+   oct_str[o_len], one past the end of an o_len-byte buffer.)
+
+   If the tag is not an octet-string tag nothing is consumed.
+   Returns the new position. */
+unsigned char *
+get_astr_octstr(struct asn_bstruct *asnp,
+                unsigned char *oct_str,
+                int o_len) {
+
+  int q_len, v_len, n_keep;
+
+  asnp->abp = chk_asn_buf(asnp,32);
+
+  if (ABP != ASN_IS_OCTSTR && ABP != ASN_IS_OCTSSTR) {
+    return asnp->abp;
+  }
+
+  ABPP++;
+  /* get length of length */
+  if (ABP > 128) {
+    v_len = *asnp->abp++ & 0x7f;
+
+    asnp->abp = chk_asn_buf(asnp,v_len+32);
+
+    q_len = 0;
+    while (v_len-- > 0) {
+      q_len *= 256;
+      q_len += *asnp->abp++;
+    }
+  }
+  else {
+    q_len = *asnp->abp++ & 0x7f;
+  }
+
+  /* number of bytes that can be stored while leaving room for '\0' */
+  n_keep = 0;
+  if (oct_str != NULL && o_len > 0) {
+    n_keep = (q_len < o_len) ? q_len : o_len - 1;
+  }
+
+  /* store what fits ... */
+  if (n_keep > 0) {
+    asnp->abp = read_asn_dest(asnp, n_keep, oct_str, o_len);
+  }
+  /* ... and skip whatever does not */
+  if (q_len > n_keep) {
+    asnp->abp = read_asn_dest(asnp, q_len - n_keep, NULL, 0);
+  }
+
+  if (oct_str != NULL && o_len > 0) oct_str[n_keep]='\0';
+
+  return asnp->abp;
+}
+
+/* something to try to skip over stuff we don't want.
+
+   Walks the input until a zero byte is seen, stepping over sequence
+   openers and over boolean/integer/string items, then steps over one
+   two-byte terminator per sequence that was opened.
+
+   A non-zero tag that is none of the handled kinds is reported with
+   asn_error() and the function returns at that position; none of the
+   branches would advance past it, so the loop would otherwise never
+   end.  Returns the new position. */
+unsigned char *
+get_astr_junk(struct asn_bstruct *asnp) {
+
+  int seq_cnt = 0;
+  long tmp;
+  char string[256];
+
+  while (ABP) {
+    if ( ABP == ASN_SEQ) { ABP_INC2; seq_cnt++;}
+    else if ( ABP == ASN_IS_BOOL || ABP == ASN_IS_INT ) {
+      ABP_INC2;
+      ABPP = get_astr_int(asnp, &tmp) + 2;
+    }
+    else if ( ABP == ASN_IS_STR ) {
+      ABP_INC2;
+      ABPP = get_astr_str(asnp, string, sizeof(string)-1) + 2;
+    }
+    else { /* unknown tag: give up instead of spinning forever */
+      return asn_error("get_astr_junk", "", -1, asnp, 4);
+    }
+  }
+
+  while (seq_cnt-- > 0) ABP_INC2;
+  return asnp->abp;
+}
+
+#define ASN_SEQINST_NCBIEAA 167
+#define ASN_SEQINST_NCBISTDAA 169
+#define ASN_SEQINST_IUPACAA 161
+
+/* read the sequence data into query (buffer size nq), choosing the
+   reader from the encoding tag: NCBIeaa and IUPACaa are read as
+   strings, NCBIstdaa as an octet string.  The returned pointer is two
+   bytes past the data; an unknown tag is reported with asn_error(). */
+unsigned char *
+get_astr_iseqd(struct asn_bstruct *asnp,
+               unsigned char *query,
+               int nq) {
+
+  asnp->abp = chk_asn_buf(asnp,32);
+
+  switch (*asnp->abp) {
+  case ASN_SEQINST_NCBIEAA:
+  case ASN_SEQINST_IUPACAA:
+    /* character encodings */
+    asnp->abp += 2;
+    return get_astr_str(asnp, (char *)query, nq) + 2;
+
+  case ASN_SEQINST_NCBISTDAA:
+    /* binary encoding */
+    asnp->abp += 2;
+    return get_astr_octstr(asnp, query, nq) + 2;
+
+  default:
+    return asn_error("get_astr_iseqd","",-1,asnp,4);
+  }
+}
+
+/* read an object id that is either an integer or a string.
+   *type is set to 1 (integer, stored in *val when val is not NULL),
+   2 (string, stored in text), or left 0 when neither tag is present,
+   in which case the problem is reported with asn_error().
+   Returns the new position. */
+unsigned char *
+get_astr_objid(struct asn_bstruct *asnp, int *type, int *val, char *text, int t_len) {
+
+  long id_num;
+
+  asnp->abp = chk_asn_buf(asnp,32);
+
+  /* start from "nothing found" */
+  if (text != NULL) { text[0] = '\0'; }
+  if (val != NULL) { *val = 0; }
+  *type = 0;
+
+  switch (*asnp->abp) {
+  case ASN_OBJ_INT:
+    asnp->abp += 2;
+    asnp->abp = get_astr_int(asnp, &id_num)+2;
+    if (val != NULL) { *val = id_num; }
+    *type = 1;
+    break;
+
+  case ASN_OBJ_STR:
+    asnp->abp += 2;
+    asnp->abp = get_astr_str(asnp, text, t_len)+2;
+    *type = 2;
+    break;
+
+  default:
+    return asn_error("get_astr_objid","ASN_OBJ_STR",ASN_OBJ_STR,asnp,4);
+  }
+
+  return asnp->abp;
+}
+
+#define ASN_BIOSEQ_SEQ 160
+#define ASN_BIOSEQ_ID_VAL 160
+#define ASN_BIOSEQ_ID_OBJ 160
+#define ASN_BIOSEQ_ID_LOCAL 161
+#define ASN_BIOSEQ_ID_GIBBSQ 162
+#define ASN_BIOSEQ_ID_GIBBMT 163
+#define ASN_BIOSEQ_ID_GB 164
+#define ASN_BIOSEQ_ID_EMBL 165
+#define ASN_BIOSEQ_ID_PIR 166
+#define ASN_BIOSEQ_ID_SP 167
+#define ASN_BIOSEQ_ID_PATENT 168
+#define ASN_BIOSEQ_ID_OTHER 169
+#define ASN_BIOSEQ_ID_GEN 170
+#define ASN_BIOSEQ_ID_GI 171
+#define ASN_BIOSEQ_ID_DDBJ 172
+#define ASN_BIOSEQ_ID_PDB 173
+#define ASN_BIOSEQ_ID_TPG 174
+#define ASN_BIOSEQ_ID_TPE 175
+#define ASN_BIOSEQ_ID_TPD 176
+
+#define ASN_BIOSEQ_TEXTID_NAME 160
+#define ASN_BIOSEQ_TEXTID_ACC 161
+#define ASN_BIOSEQ_TEXTID_REL 162
+#define ASN_BIOSEQ_TEXTID_VER 163
+
+#define ASN_BIOSEQ_ID 160
+#define ASN_BIOSEQ_DESCR 161
+#define ASN_BIOSEQ_INST 162
+#define ASN_BIOSEQ_ANNOT 163
+
+#define ASN_BIOSEQ_D_NAME 163
+#define ASN_BIOSEQ_D_TITLE 164
+#define ASN_BIOSEQ_D_PIR 169
+#define ASN_BIOSEQ_D_GB 170
+#define ASN_BIOSEQ_D_USER 173
+#define ASN_BIOSEQ_D_SP 174
+
+#define ASN_BIOSEQ_INST_REPR 160
+#define ASN_BIOSEQ_INST_MOL 161
+#define ASN_BIOSEQ_INST_LEN 162
+#define ASN_BIOSEQ_INST_SEQD 166
+#define ASN_BIOSEQ_INST_HIST 168
+
+#define ASN_USERFLD_D_STR 160
+#define ASN_USERFLD_D_INT 161
+#define ASN_USERFLD_D_REAL 162
+#define ASN_USERFLD_D_BOOL 163
+#define ASN_USERFLD_D_OS 164
+#define ASN_USERFLD_D_USER 165
+#define ASN_USERFLD_D_STRS 166
+#define ASN_USERFLD_D_INTS 167
+#define ASN_USERFLD_D_REALS 168
+#define ASN_USERFLD_D_OSS 169
+#define ASN_USERFLD_D_FIELDS 170
+#define ASN_USERFLD_D_OBJS 171
+
+/* parse and discard the data part of a user field.  The tag selects a
+   string, integer, real, boolean, octet string, or a wrapped octet
+   string; any other tag is reported with asn_error().
+   Returns the new position. */
+unsigned char *
+get_astr_userfld_data(struct asn_bstruct *asnp) {
+  double d_tmp;
+  long l_tmp;
+  int b_tmp;
+  int fld_tag;
+
+  asnp->abp = chk_asn_buf(asnp, 32);
+  fld_tag = *asnp->abp;
+
+  if (fld_tag == ASN_USERFLD_D_STR) {
+    asnp->abp += 2;
+    asnp->abp = get_astr_str(asnp, NULL, 0) + 2;
+  }
+  else if (fld_tag == ASN_USERFLD_D_INT) {
+    asnp->abp += 2;
+    asnp->abp = get_astr_int(asnp, &l_tmp) + 2;
+  }
+  else if (fld_tag == ASN_USERFLD_D_REAL) {
+    asnp->abp += 2;
+    asnp->abp = get_astr_real(asnp, &d_tmp) + 2;
+  }
+  else if (fld_tag == ASN_USERFLD_D_BOOL) {
+    asnp->abp += 2;
+    asnp->abp = get_astr_bool(asnp, &b_tmp) + 2;
+  }
+  else if (fld_tag == ASN_USERFLD_D_OS) {
+    asnp->abp += 2;
+    asnp->abp = get_astr_octstr(asnp, NULL, 0)+2;
+  }
+  else if (fld_tag == ASN_USERFLD_D_OSS) {
+    /* two levels of wrapping on each side */
+    asnp->abp += 4;
+    asnp->abp = get_astr_octstr(asnp, NULL, 0)+4;
+  }
+  else {
+    return asn_error("get_astr_userfld_data","",0,asnp,4);
+  }
+
+  return asnp->abp;
+}
+
+#define ASN_USERFLD_LABEL 160
+#define ASN_USERFLD_NUM 161
+#define ASN_USERFLD_DATA 162
+
+/* parse and discard one user field: an optional sequence wrapper, a
+   required label (object id), an optional number and the required
+   data.  A missing label or data tag is reported with asn_error().
+   Returns the new position. */
+unsigned char *
+get_astr_userfld(struct asn_bstruct *asnp) {
+
+  long fld_num;
+  int label_type;
+  int wrapped = 0;
+
+  asnp->abp = chk_asn_buf(asnp, 32);
+
+  if (*asnp->abp == ASN_SEQ) {
+    wrapped = 1;
+    asnp->abp += 2;
+  }
+
+  /* label */
+  if (*asnp->abp != ASN_USERFLD_LABEL) {
+    return asn_error("get_astr_userfld", "ASN_USERFLD_LABEL", ASN_USERFLD_LABEL, asnp, 4);
+  }
+  asnp->abp += 2;
+  asnp->abp = get_astr_objid(asnp, &label_type, NULL, NULL, 0)+2;
+
+  /* optional number */
+  if (*asnp->abp == ASN_USERFLD_NUM) {
+    asnp->abp += 2;
+    asnp->abp = get_astr_int(asnp, &fld_num)+2;
+  }
+
+  /* data */
+  if (*asnp->abp != ASN_USERFLD_DATA) {
+    return asn_error("get_astr_userfld", "ASN_USERFLD_DATA", ASN_USERFLD_DATA, asnp, 4);
+  }
+  asnp->abp += 2;
+  asnp->abp = get_astr_userfld_data(asnp)+2;
+
+  asnp->abp = chk_asn_buf(asnp,8);
+  if (wrapped) { asnp->abp += 2; }
+
+  return asnp->abp;
+}
+
+#define ASN_USER_CLASS 160
+#define ASN_USER_TYPE 161
+#define ASN_USER_DATA 162
+
+/* parse and discard a user object: an optional class string, a
+   required type (object id) and the required data, which is read as a
+   single user field.  A missing type or data tag is reported with
+   asn_error().  Returns the new position. */
+unsigned char *
+get_astr_user(struct asn_bstruct *asnp) {
+  int obj_type;
+
+  asnp->abp = chk_asn_buf(asnp,32);
+
+  asnp->abp += 2; /* skip SEQ */
+
+  /* optional class */
+  if (*asnp->abp == ASN_USER_CLASS) {
+    asnp->abp += 2;
+    asnp->abp = get_astr_str(asnp, NULL, 0) + 2;
+  }
+
+  /* type */
+  if (*asnp->abp != ASN_USER_TYPE) {
+    return asn_error("get_astr_user", "ASN_USER_TYPE", ASN_USER_TYPE, asnp, 4);
+  }
+  asnp->abp += 2;
+  asnp->abp = get_astr_objid(asnp, &obj_type, NULL, NULL, 0) + 2;
+
+  /* data */
+  if (*asnp->abp != ASN_USER_DATA) {
+    return asn_error("get_astr_user","ASN_USER_DATA", ASN_USER_DATA, asnp, 4);
+  }
+  asnp->abp += 4; /* skip over, data, SEQ */
+  asnp->abp = chk_asn_buf(asnp,32);
+  asnp->abp = get_astr_userfld(asnp);
+  asnp->abp += 4;
+
+  return asnp->abp;
+}
+
+/* parse a sequence description: an opening ASN_SEQOF, then title and
+   user entries until a zero byte.  The title is copied into descr;
+   user entries are parsed and discarded.  Any other entry is reported
+   and parsing stops at that position.  Returns the new position. */
+unsigned char *
+get_astr_seqdescr(struct asn_bstruct *asnp,
+                  char *descr) {
+
+  int opened=0;
+
+  asnp->abp = chk_asn_buf(asnp,16);
+
+  if (*asnp->abp != ASN_SEQOF) {
+    fprintf(stderr, "*** error [%s:%d] - missing ASN_SEQOF '1': %0x %0x\n",__FILE__, __LINE__,ABP, asnp->abp[1]);
+  }
+  else {
+    opened++;
+    asnp->abp += 2;
+  }
+
+  while (*asnp->abp != '\0') {
+    switch (*asnp->abp) {
+    case ASN_BIOSEQ_D_TITLE:
+      asnp->abp += 2; /* skip token */
+      asnp->abp = get_astr_str(asnp, descr, MAX_STR) + 2;
+      break;
+    case ASN_BIOSEQ_D_USER:
+      asnp->abp += 2;
+      asnp->abp = get_astr_user(asnp);
+      break;
+    default:
+      fprintf(stderr, "*** error [%s:%d] - Un-parsed Seq-descr: %x %x\n",__FILE__,__LINE__,asnp->abp[0],asnp->abp[1]);
+      return asnp->abp;
+    }
+  }
+
+  asnp->abp = chk_asn_buf(asnp,8);
+
+  /* step over the terminator of the opening ASN_SEQOF */
+  if (opened) { asnp->abp += 2; }
+
+  return asnp->abp;
+}
+
+/* get_astr_seqinst() parses a sequence instance: representation and
+   molecule enums (read and discarded), the sequence length (stored in
+   *nq), and the sequence data, which is read into a newly allocated
+   buffer of *nq + 1 bytes returned through *query.
+
+   Missing representation/molecule tags are reported and parsing
+   continues.  A missing length tag is reported and the function
+   returns without touching *query.  A missing sequence-data tag is
+   reported, the buffer is freed and *query is set to NULL.  If the
+   buffer cannot be allocated a message is printed and *query is NULL
+   while the data is still consumed.  A history entry cannot be parsed
+   and ends the program.
+
+   Returns the new position. */
+unsigned char *
+get_astr_seqinst(struct asn_bstruct *asnp,
+                 unsigned char **query,
+                 int *nq) {
+
+  int end_seq=0, tmp;
+  long l_val;
+
+  /* get sequence '0' */
+  /* get 160/128/10/len/val - repr enum raw val */
+  /* get 161/128/10/len/val - mol enum aa val */
+  /* get 162/128/02/len/val - length int val */
+  /* get 166/128 - topology (empty) */
+  /* get 167/128 - seq-data */
+  /* get 65/len+128/len/octet_string */
+  /* pop nulls */
+
+  asnp->abp = chk_asn_buf(asnp,32);
+
+  if (ABP == ASN_SEQ) {
+    end_seq++;
+    ABP_INC2;
+  }
+  else {
+    fprintf(stderr, "*** error [%s:%d] - missing ASN_SEQ '0': %0x %0x\n",__FILE__, __LINE__, ABP, asnp->abp[1]);
+  }
+
+  if (ABP == ASN_BIOSEQ_INST_REPR && *(asnp->abp+1) == 128) {
+    ABP_INC2;
+    asnp->abp = get_astr_enum(asnp, &tmp)+2;
+  }
+  else {
+    fprintf(stderr, "*** error [%s:%d] - missing ASN_BIOSEQ_INST_REPR 160: %0x %0x\n",__FILE__,__LINE__,ABP, asnp->abp[1]);
+  }
+
+  if (ABP == ASN_BIOSEQ_INST_MOL && *(asnp->abp+1) == 128) {
+    ABP_INC2;
+    asnp->abp = get_astr_enum(asnp, &tmp)+2;
+  }
+  else {
+    fprintf(stderr, "*** error [%s:%d] - missing ASN_BIOSEQ_INST_MOL 161: %0x %0x\n",__FILE__,__LINE__,ABP, asnp->abp[1]);
+  }
+
+  if (ABP == ASN_BIOSEQ_INST_LEN) {
+    ABP_INC2;
+    asnp->abp = get_astr_int(asnp, &l_val)+2;
+    *nq = l_val;
+  }
+  else {
+    /* ASN_BIOSEQ_INST_LEN is 162; the message used to say 161 */
+    fprintf(stderr, "*** error [%s:%d] - missing ASN_BIOSEQ_INST_LEN 162: %0x %0x\n",__FILE__, __LINE__, ABP, asnp->abp[1]);
+    return asnp->abp;
+  }
+
+  /* one extra byte for the terminator written by the readers */
+  if ((*query = (unsigned char *)calloc(*nq + 1, sizeof(char)))==NULL) {
+    fprintf(stderr, " cannot allocate %d char query\n", *nq+1);
+  }
+
+  if (ABP == ASN_BIOSEQ_INST_SEQD) {
+    ABP_INC2;
+    asnp->abp = get_astr_iseqd(asnp, *query, *nq+1 ) + 2;
+  }
+  else {
+    fprintf(stderr, "*** error [%s:%d] - missing ASN_BIOSEQ_INST_SEQD 166: %0x %0x\n",__FILE__, __LINE__, ABP, asnp->abp[1]);
+    free(*query);
+    *query = NULL;
+    return asnp->abp;
+  }
+
+  if (ABP == ASN_BIOSEQ_INST_HIST ) {
+    fprintf(stderr, "*** error [%s:%d] - Cannot parse bioseq inst history\n",__FILE__,__LINE__);
+    exit(1);
+  }
+
+  if (end_seq) ABP_INC2;
+
+  return asnp->abp;
+}
+
+
+/* parse a text sequence id: optional name, accession, release and
+   version fields inside a sequence wrapper.  name and acc are cleared
+   first and filled when present; release and version are read and
+   discarded.  A missing wrapper is reported but parsing continues.
+   Returns the new position. */
+unsigned char *
+get_astr_textid( struct asn_bstruct *asnp,
+                 char *name,
+                 char *acc) {
+  int wrapped = 0;
+  long version;
+  char this_func[]="get_astr_textid";
+
+  chk_asn_buf(asnp,32);
+
+  if (*asnp->abp == ASN_SEQ) {
+    asnp->abp += 2;
+    wrapped++;
+  }
+  else {
+    fprintf(stderr, "*** error [%s:%d] - %s - Expected ASN_SEQ: %0x %0x\n",__FILE__,__LINE__,this_func,ABP, asnp->abp[1]);
+  }
+
+  acc[0] = '\0';
+  name[0] = '\0';
+
+  if (*asnp->abp == ASN_BIOSEQ_TEXTID_NAME) {
+    asnp->abp += 2;
+    asnp->abp = get_astr_str(asnp, name, MAX_SSTR) + 2;
+  }
+
+  if (*asnp->abp == ASN_BIOSEQ_TEXTID_ACC) {
+    asnp->abp += 2;
+    asnp->abp = get_astr_str(asnp, acc, MAX_SSTR) + 2;
+  }
+
+  /* release: skipped */
+  if (*asnp->abp == ASN_BIOSEQ_TEXTID_REL) {
+    asnp->abp += 2;
+    asnp->abp = get_astr_str(asnp, NULL, 0) + 2;
+  }
+
+  /* version: read but not returned */
+  if (*asnp->abp == ASN_BIOSEQ_TEXTID_VER) {
+    asnp->abp += 2;
+    asnp->abp = get_astr_int(asnp, &version)+2;
+  }
+
+  if (wrapped) { asnp->abp += 2; }
+  return asnp->abp;
+}
+
+/* parse sequence ids until a zero byte.  *gi and acc are cleared
+   first.  An object or local id goes into name, a gi number into *gi,
+   and the text-id kinds (GB, EMBL, PIR, SP, OTHER) fill name and acc
+   through get_astr_textid().  Any other tag is reported with
+   asn_error().  Returns the new position. */
+unsigned char *
+get_astr_seqid (struct asn_bstruct *asnp,
+                long *gi,
+                char *name,
+                char *acc) {
+  int obj_type;
+  int obj_val;
+  int id_tag;
+
+  *gi = 0;
+  acc[0] = '\0';
+
+  for (id_tag = *asnp->abp; id_tag != '\0'; id_tag = *asnp->abp) {
+
+    if (id_tag == ASN_BIOSEQ_ID_OBJ) {
+      asnp->abp += 2;
+      asnp->abp = get_astr_objid(asnp, &obj_type, &obj_val, name, MAX_SSTR) + 2;
+    }
+    else if (id_tag == ASN_BIOSEQ_ID_LOCAL) {
+      asnp->abp += 2;
+      asnp->abp = get_astr_str(asnp, name, MAX_SSTR) + 2;
+    }
+    else if (id_tag == ASN_BIOSEQ_ID_GI) {
+      asnp->abp += 2;
+      asnp->abp = get_astr_int(asnp, gi) + 2;
+    }
+    else if (id_tag == ASN_BIOSEQ_ID_GB ||
+             id_tag == ASN_BIOSEQ_ID_EMBL ||
+             id_tag == ASN_BIOSEQ_ID_PIR ||
+             id_tag == ASN_BIOSEQ_ID_SP ||
+             id_tag == ASN_BIOSEQ_ID_OTHER) {
+      asnp->abp += 2;
+      asnp->abp = get_astr_textid(asnp, name, acc) + 2;
+    }
+    else {
+      return asn_error("get_atr_seqid", "", -1, asnp,4);
+    }
+  }
+
+  return asnp->abp;
+}
+
+/*
+Bioseq ::= SEQUENCE {
+ id SET OF Seq-id , -- equivalent identifiers
+ descr Seq-descr OPTIONAL , -- descriptors
+ inst Seq-inst, -- the sequence data
+ annot SET OF Seq-annot OPTIONAL }
+*/
+
+/* modified 8-Nov-2009 to allow additional information after the inst */
+
+/* get_astr_bioseq() parses one Bioseq: the set of sequence ids (into
+   *gi, name, acc), an optional description (into descr, which is
+   cleared when absent), and the sequence instance (length into *nq,
+   residues into a newly allocated *query).
+
+   A missing id tag or a missing instance tag is reported on stderr and
+   parsing stops at that position; an id tag that is not followed by
+   ASN_SETOF is reported with asn_error().
+
+   Returns the new position. */
+unsigned char *
+get_astr_bioseq(struct asn_bstruct *asnp,
+                long *gi,
+                char *name,
+                char *acc,
+                char *descr,
+                unsigned char **query,
+                int *nq
+                ) {
+
+  int end_seq = 0;
+
+  asnp->abp = chk_asn_buf(asnp,64);
+
+  if (ABP == ASN_SEQ) {
+    end_seq++;
+    ABP_INC2;
+  }
+
+  if (ABP != ASN_BIOSEQ_ID) {
+    fprintf(stderr, "*** error [%s:%d] - Bioseq - missing ID tag: %2x %2x\n",__FILE__,__LINE__,ABP, asnp->abp[1]);
+    return asnp->abp;
+  }
+  else {
+    /* skip over bioseq-id tag */
+    ABP_INC2;
+    if (ABP == ASN_SETOF) { /* jump over ASN_SETOF */
+      ABP_INC2;
+      asnp->abp = get_astr_seqid(asnp, gi, name, acc);
+      ABP_INC2; /* close ASN_SETOF */
+    }
+    else {
+      return asn_error("get_astr_bioseq","ASN_SEQOF", ASN_SEQOF, asnp, 4);
+    }
+    ABP_INC2; /* jump over seq-id tag end */
+  }
+
+  if (ABP == ASN_BIOSEQ_DESCR) {
+    ABP_INC2;
+    asnp->abp = get_astr_seqdescr(asnp, descr);
+    ABP_INC2; /* skip nulls */
+  }
+  else { descr[0] = '\0';}
+
+  while (ABP == '\0') { ABP_INC2;}
+
+  if (ABP != ASN_BIOSEQ_INST) {
+    /* this check is for the instance tag, so say so (the message used
+       to repeat "missing ID tag") */
+    fprintf(stderr, "*** error [%s:%d] - Bioseq - missing INST tag: %2x %2x\n",__FILE__,__LINE__,ABP, asnp->abp[1]);
+    return asnp->abp;
+  }
+  else {
+    ABP_INC2;
+    asnp->abp = get_astr_seqinst(asnp, query, nq);
+    ABP_INC2; /* skip nulls */
+  }
+
+  if (end_seq--) {
+    ABP_INC2;
+  }
+
+  return asnp->abp;
+}
+
+/*
+   get_pssm_intermed_null() reads an n_rows x n_cols array of values
+   and throws them away.  Instead of one function per datatype, the
+   caller supplies get_data_func(), which reads one value into
+   *l_val_p or *d_val_p; each value simply overwrites the previous one.
+   by_row only selects which dimension is the outer loop.
+*/
+unsigned char *
+get_pssm_intermed_null(struct asn_bstruct *asnp,
+                       int n_rows,
+                       int n_cols,
+                       int by_row,
+                       unsigned char *(*get_data_func)(struct asn_bstruct *, long *, double *),
+                       long *l_val_p,
+                       double *d_val_p
+                       ) {
+
+  int i_outer, i_inner;
+  int n_outer, n_inner;
+  int wrapped = 0;
+
+  asnp->abp = chk_asn_buf(asnp,32);
+
+  if (*asnp->abp == ASN_SEQ) {
+    asnp->abp += 2;
+    wrapped = 1;
+  }
+
+  if (by_row) { n_outer = n_rows; n_inner = n_cols; }
+  else { n_outer = n_cols; n_inner = n_rows; }
+
+  for (i_outer = 0; i_outer < n_outer; i_outer++) {
+    for (i_inner = 0; i_inner < n_inner; i_inner++) {
+      asnp->abp = (*get_data_func)(asnp, l_val_p, d_val_p);
+    }
+  }
+
+  asnp->abp = chk_asn_buf(asnp,32);
+  /* skip nulls: two bytes, plus two more when a sequence was opened */
+  asnp->abp += (wrapped ? 4 : 2);
+  return asnp->abp;
+}
+
+/* get_pssm_freqs() reads n_rows * n_cols text-encoded reals into freqs.
+
+   freqs is indexed freqs[column][row]: the callers in this file
+   allocate n_cols row pointers, each with n_rows entries.  by_row only
+   describes the order in which the values appear in the input (row by
+   row instead of column by column); the storage layout is the same in
+   both cases.  (The by-row branch used to store freqs[row][column],
+   which transposed the indices relative to the allocation.)
+
+   Returns the new position. */
+unsigned char *
+get_pssm_freqs(struct asn_bstruct *asnp,
+               double **freqs,
+               int n_rows,
+               int n_cols,
+               int by_row) {
+
+  int i_rows, i_cols;
+  int in_seq = 0;
+  long l_val;
+  double f_val;
+
+  asnp->abp = chk_asn_buf(asnp,64);
+
+  if (ABP == ASN_SEQ) {
+    ABP_INC2;
+    in_seq = 1;
+  }
+
+  if (!by_row) {
+    for (i_cols = 0; i_cols < n_cols; i_cols++) {
+      for (i_rows = 0; i_rows < n_rows; i_rows++) {
+        asnp->abp = get_astr_packedreal(asnp, &l_val, &f_val);
+        freqs[i_cols][i_rows] = f_val;
+      }
+    }
+  }
+  else {
+    for (i_rows = 0; i_rows < n_rows; i_rows++) {
+      for (i_cols = 0; i_cols < n_cols; i_cols++) {
+        asnp->abp = get_astr_packedreal(asnp, &l_val, &f_val);
+        freqs[i_cols][i_rows] = f_val; /* same [col][row] layout */
+      }
+    }
+  }
+
+  asnp->abp = chk_asn_buf(asnp,32);
+  if (in_seq) {asnp->abp +=2;} /* skip nulls */
+  ABP_INC2;
+  return asnp->abp;
+}
+
+/* get_pssm_intermed() parses the intermediate-data section of a PSSM.
+   Inside an opening ASN_SEQ it looks, in order, for these optional
+   parts: residue frequencies (read and discarded), weighted residue
+   frequencies (stored in a newly allocated *wfreqs), frequency ratios
+   (stored in a newly allocated *freqs), and then information content,
+   gapless column weights, sigma, interval sizes and number of matching
+   sequences (all read and discarded).
+
+   *wfreqs and *freqs are each allocated as n_cols row pointers into one
+   contiguous block of n_cols * n_rows doubles; they are only allocated
+   when the corresponding tag is present.  The program exits if an
+   allocation fails.  Returns the new position. */
+unsigned char *
+get_pssm_intermed(struct asn_bstruct *asnp,
+ double ***wfreqs,
+ double ***freqs,
+ int n_rows,
+ int n_cols,
+ int by_row) {
+
+ long long_data;
+ double real_data;
+ int i;
+
+ asnp->abp = chk_asn_buf(asnp,32);
+
+ if (ABP == ASN_SEQ) {
+ ABP_INC2;
+ /* residue frequencies: integers, discarded */
+ if (ABP == ASN_PSSM_INTERMED_RES_FREQS) {
+ ABP_INC2;
+ asnp->abp = get_pssm_intermed_null(asnp, n_rows, n_cols, by_row,
+ &get_astr_packedint, &long_data, &real_data);
+ }
+
+ /* weighted residue frequencies: kept in *wfreqs */
+ if (ABP == ASN_PSSM_INTERMED_WRES_FREQS) {
+ if (((*wfreqs) = (double **)calloc(n_cols, sizeof(double *)))==NULL) {
+ fprintf(stderr, "*** error [%s:%d] - cannot allocate wfreq cols - %d\n", __FILE__, __LINE__, n_cols);
+ exit(1);
+ }
+
+ if (((*wfreqs)[0] = (double *) calloc(n_cols * n_rows, sizeof(double)))==NULL) {
+ fprintf(stderr, "*** error [%s:%d] - cannot allocate freq rows * cols - %d * %d\n", __FILE__, __LINE__, n_rows, n_cols);
+ exit(1);
+ }
+
+ /* point each column at its n_rows-long slice of the block */
+ for (i=1; i < n_cols; i++) {
+ (*wfreqs)[i] = (*wfreqs)[i-1] + n_rows;
+ }
+
+ ABP_INC2;
+ asnp->abp = get_pssm_freqs(asnp, *wfreqs, n_rows, n_cols, by_row);
+ }
+
+ /* frequency ratios: kept in *freqs */
+ if (ABP == ASN_PSSM_INTERMED_FREQ_RATIOS) {
+ if ((*freqs = (double **) calloc(n_cols, sizeof(double *)))==NULL) {
+ fprintf(stderr, "*** error [%s:%d] - cannot allocate wfreq cols - %d\n", __FILE__, __LINE__, n_cols);
+ exit(1);
+ }
+
+ if (((*freqs)[0] = (double *) calloc(n_cols * n_rows, sizeof(double)))==NULL) {
+ fprintf(stderr, "*** error [%s:%d] - cannot allocate freq rows * cols - %d * %d\n", __FILE__, __LINE__, n_rows, n_cols);
+ exit(1);
+ }
+
+ /* point each column at its n_rows-long slice of the block */
+ for (i=1; i < n_cols; i++) {
+ (*freqs)[i] = (*freqs)[i-1] + n_rows;
+ }
+
+ ABP_INC2;
+ asnp->abp = get_pssm_freqs(asnp, *freqs, n_rows, n_cols, by_row);
+ }
+
+ /* the remaining parts are one value per column and are discarded */
+ if (ABP == ASN_PSSM_INTERMED_INFO_CONTENT) {
+ ABP_INC2;
+ asnp->abp = get_pssm_intermed_null(asnp, 1, n_cols, by_row,
+ &get_astr_packedreal, &long_data, &real_data);
+ }
+
+ if (ABP == ASN_PSSM_INTERMED_GAPL_COLWTS) {
+ ABP_INC2;
+ asnp->abp = get_pssm_intermed_null(asnp, 1, n_cols, by_row,
+ &get_astr_packedreal, &long_data, &real_data);
+ }
+
+ if (ABP == ASN_PSSM_INTERMED_SIGMA) {
+ ABP_INC2;
+ asnp->abp = get_pssm_intermed_null(asnp, 1, n_cols, by_row,
+ &get_astr_packedreal, &long_data, &real_data);
+ }
+
+ if (ABP == ASN_PSSM_INTERMED_INTVAL_SIZE) {
+ ABP_INC2;
+ asnp->abp = get_pssm_intermed_null(asnp, 1, n_cols, by_row,
+ &get_astr_packedint, &long_data, &real_data);
+ }
+
+ if (ABP == ASN_PSSM_INTERMED_NUM_MATCH_SEQ) {
+ ABP_INC2;
+ asnp->abp = get_pssm_intermed_null(asnp, 1, n_cols, by_row,
+ &get_astr_packedint, &long_data, &real_data);
+ }
+
+ asnp->abp +=2; /* skip nulls */
+ }
+ ABP_INC2;
+ return asnp->abp;
+}
+
+
+#define ASN_PSSM_PARAMS 161
+#define ASN_PSSM_PARAMS_PSEUDOCNT 160
+#define ASN_PSSM_PARAMS_RPSPARAMS 161
+#define ASN_PSSM_RPSPARAMS_MATRIX 160
+#define ASN_PSSM_RPSPARAMS_GAPOPEN 161
+#define ASN_PSSM_RPSPARAMS_GAPEXT 162
+
+/* parse the RPS parameters: matrix name, gap-open and gap-extend
+   penalties, each optional.  Missing values get the defaults
+   "BLOSUM62", -11 and -1.  Returns the new position. */
+unsigned char *
+get_pssm_rpsparams(struct asn_bstruct *asnp,
+                   char *matrix,
+                   int *gap_open_p,
+                   int *gap_ext_p) {
+
+  int n_open=0;
+  long l_val;
+
+  asnp->abp = chk_asn_buf(asnp,32);
+
+  if (*asnp->abp == ASN_SEQ) {
+    asnp->abp += 2;
+    n_open++;
+  }
+
+  /* matrix name */
+  asnp->abp = chk_asn_buf(asnp,32);
+  if (*asnp->abp != ASN_PSSM_RPSPARAMS_MATRIX) {
+    strncpy(matrix,"BLOSUM62", MAX_SSTR);
+  }
+  else {
+    asnp->abp += 2;
+    asnp->abp = get_astr_str(asnp, matrix, MAX_SSTR) + 2;
+  }
+
+  /* gap open */
+  asnp->abp = chk_asn_buf(asnp,16);
+  if (*asnp->abp != ASN_PSSM_RPSPARAMS_GAPOPEN) {
+    *gap_open_p = -11;
+  }
+  else {
+    asnp->abp += 2;
+    asnp->abp = get_astr_int(asnp, &l_val)+2;
+    *gap_open_p = l_val;
+  }
+
+  /* gap extend */
+  asnp->abp = chk_asn_buf(asnp,16);
+  if (*asnp->abp != ASN_PSSM_RPSPARAMS_GAPEXT) {
+    *gap_ext_p = -1;
+  }
+  else {
+    asnp->abp += 2;
+    asnp->abp = get_astr_int(asnp, &l_val)+2;
+    *gap_ext_p = l_val;
+  }
+
+  /* step over the terminator of each sequence that was opened */
+  if (n_open) {
+    chk_asn_buf(asnp,(n_open * 2)+16);
+    asnp->abp += 2 * n_open;
+  }
+
+  return asnp->abp;
+}
+
+/* this routine reads the final integer scores and keeps them: it
+   allocates *iscores as n_cols row pointers into one contiguous block
+   of n_cols * n_rows ints, then fills it with values read by
+   get_astr_int().  Both the by-column and the by-row input order store
+   a value at (*iscores)[column][row].  The program exits if an
+   allocation fails.  Returns the new position. */
+unsigned char *
+get_pssm_final_scores(struct asn_bstruct *asnp, int ***iscores, int n_rows, int n_cols, int by_row) {
+
+ int i_rows, i_cols, i;
+ int in_seq = 0;
+ long l_val;
+
+ if (ABP == ASN_SEQ) { ABP_INC2; in_seq=1;}
+
+ if (((*iscores) = (int **) calloc(n_cols, sizeof(int *)))==NULL) {
+ fprintf(stderr, "*** error [%s:%d] - cannot allocate wfreq cols - %d\n", __FILE__, __LINE__, n_cols);
+ exit(1);
+ }
+
+ if (((*iscores)[0] = (int *) calloc(n_cols * n_rows, sizeof(int)))==NULL) {
+ fprintf(stderr, "*** error [%s:%d] - cannot allocate freq rows * cols - %d * %d\n", __FILE__, __LINE__, n_rows, n_cols);
+ exit(1);
+ }
+
+ /* point each column at its n_rows-long slice of the block */
+ for (i=1; i < n_cols; i++) {
+ (*iscores)[i] = (*iscores)[i-1] + n_rows;
+ }
+
+ if (!by_row) {
+ for (i_cols = 0; i_cols < n_cols; i_cols++) {
+ for (i_rows = 0; i_rows < n_rows; i_rows++) {
+ asnp->abp = get_astr_int(asnp, &l_val);
+ (*iscores)[i_cols][i_rows] = l_val;
+ }
+ }
+ }
+ else {
+ for (i_rows = 0; i_rows < n_rows; i_rows++) {
+ for (i_cols = 0; i_cols < n_cols; i_cols++) {
+ asnp->abp = get_astr_int(asnp, &l_val);
+ (*iscores)[i_cols][i_rows] = l_val;
+ }
+ }
+ }
+
+ asnp->abp = chk_asn_buf(asnp,16);
+ if (in_seq) {asnp->abp +=2;} /* skip nulls */
+ ABP_INC2;
+ return asnp->abp;
+}
+
+/* parse the PSSM parameters: an optional pseudo-count (stored in
+   *pseudo_cnts when present) and optional RPS parameters.  When the
+   RPS parameters are absent the defaults -11, -1 and "BLOSUM62" are
+   used.  Returns the new position. */
+unsigned char *
+get_pssm_params(struct asn_bstruct *asnp,
+                int *pseudo_cnts,
+                char *matrix,
+                int *gap_open_p,
+                int *gap_ext_p) {
+
+  int n_open=0;
+  long l_val;
+
+  asnp->abp = chk_asn_buf(asnp,16);
+
+  if (*asnp->abp == ASN_SEQ) {
+    asnp->abp += 2;
+    n_open++;
+  }
+
+  if (*asnp->abp == ASN_PSSM_PARAMS_PSEUDOCNT) {
+    asnp->abp += 2;
+    asnp->abp = get_astr_int(asnp, &l_val)+2;
+    *pseudo_cnts = l_val;
+  }
+
+  if (*asnp->abp != ASN_PSSM_PARAMS_RPSPARAMS) {
+    /* no RPS parameters: fall back to the defaults */
+    *gap_open_p = -11;
+    *gap_ext_p = -1;
+    strncpy(matrix,"BLOSUM62",MAX_SSTR);
+  }
+  else {
+    asnp->abp += 2;
+    asnp->abp = get_pssm_rpsparams(asnp, matrix, gap_open_p, gap_ext_p);
+    asnp->abp += 2;
+  }
+
+  /* step over the terminator of each sequence that was opened */
+  for ( ; n_open > 0; n_open--) { asnp->abp += 2; }
+
+  return asnp->abp;
+}
+
+/* get_pssm2_scores() -- look at the version 2 scores field.
+
+   Sets *have_scores (when have_scores is not NULL) to 1 if the field
+   is not empty and to 0 otherwise.  An empty field that was opened
+   with ASN_SEQ is stepped over; a non-empty field is left unread.
+   Returns the current buffer position. */
+unsigned char *
+get_pssm2_scores(struct asn_bstruct *asnp,
+                 int *have_scores
+                 ) {
+
+  int in_seq = 0;
+
+  if (have_scores) *have_scores = 0;
+
+  if (ABP == ASN_SEQ) {
+    in_seq = 1;
+    ABP_INC2;
+  }
+
+  if (ABP != '\0') {
+    /* something follows: report it, but do not parse it here */
+    if (have_scores) *have_scores = 1;
+  }
+  else {	/* no scores */
+    if (in_seq) ABP_INC2;
+  }
+
+  return asnp->abp;
+}
+
+/* get_pssm2_intermed() -- allocate the frequency matrices for a
+   version 2 PSSM and read the frequencies.
+
+   *freqs and *wfreqs are each set to n_cols row pointers into a
+   contiguous, zero-filled block of n_cols * n_rows doubles.  Only
+   the freqs matrix is passed to get_pssm_freqs(); the wfreqs matrix
+   is returned still zero-filled.
+
+   Exits on bad dimensions or allocation failure.  Returns the buffer
+   position returned by get_pssm_freqs(). */
+unsigned char *
+get_pssm2_intermed(struct asn_bstruct *asnp,
+                   double ***wfreqs,
+                   double ***freqs,
+                   int n_rows,
+                   int n_cols) {
+
+  int i;
+  double **my_freqs, **my_wfreqs;
+
+  /* my_freqs[0]/my_wfreqs[0] are written below, so both dimensions
+     must be positive */
+  if (n_rows <= 0 || n_cols <= 0) {
+    fprintf(stderr, "*** error [%s:%d] - bad freq dimensions - %d * %d\n", __FILE__, __LINE__, n_rows, n_cols);
+    exit(1);
+  }
+
+  if ((my_freqs = (double **) calloc(n_cols, sizeof(double *)))==NULL) {
+    fprintf(stderr, "*** error [%s:%d] - cannot allocate freq cols - %d\n", __FILE__, __LINE__, n_cols);
+    exit(1);
+  }
+
+  if ((my_wfreqs = (double **) calloc(n_cols, sizeof(double *)))==NULL) {
+    fprintf(stderr, "*** error [%s:%d] - cannot allocate wfreq cols - %d\n", __FILE__, __LINE__, n_cols);
+    exit(1);
+  }
+
+  /* multiply as size_t so that large dimensions cannot overflow int */
+  if ((my_freqs[0] = (double *) calloc((size_t)n_cols * (size_t)n_rows, sizeof(double)))==NULL) {
+    fprintf(stderr, "*** error [%s:%d] - cannot allocate freq rows * cols - %d * %d\n", __FILE__, __LINE__, n_rows, n_cols);
+    exit(1);
+  }
+
+  if ((my_wfreqs[0] = (double *) calloc((size_t)n_cols * (size_t)n_rows, sizeof(double)))==NULL) {
+    fprintf(stderr, "*** error [%s:%d] - cannot allocate wfreq rows * cols - %d * %d\n", __FILE__, __LINE__, n_rows, n_cols);
+    exit(1);
+  }
+
+  /* each column pointer addresses its own n_rows slice of the block */
+  for (i=1; i < n_cols; i++) {
+    my_freqs[i] = my_freqs[i-1] + n_rows;
+    my_wfreqs[i] = my_wfreqs[i-1] + n_rows;
+  }
+
+  *wfreqs = my_wfreqs;
+  *freqs = my_freqs;
+
+  chk_asn_buf(asnp, 16);
+
+  return get_pssm_freqs(asnp, my_freqs, n_rows, n_cols, 0);
+}
+
+/* parse_pssm2_asn() -- parse the body of a version 2 PSSM.
+
+   Reads the query bioseq (gi, name, acc, descr, query, nq), then the
+   optional matrix fields: is-protein flag, matrix name, *n_cols,
+   *n_rows, scores, Karlin value (into *lambda_p) and the
+   frequencies (into *freqs; *wfreqs is allocated alongside it).
+   iscores and pseudo_cnts are accepted but not written here.
+
+   Returns  1 when parsing reached the end of the matrix section,
+            0 when a non-empty scores field was found (not parsed),
+           -1 on a malformed header, or when frequencies are present
+              but a positive row/column count was not read first. */
+int
+parse_pssm2_asn(struct asn_bstruct *asnp,
+                long *gi,
+                char *name,
+                char *acc,
+                char *descr,
+                unsigned char **query,
+                int *nq,
+                int *n_rows,
+                int *n_cols,
+                double ***wfreqs,
+                double ***freqs,
+                int ***iscores,
+                int *pseudo_cnts,
+                char *matrix,
+                double *lambda_p) {
+
+  int is_protein;
+  int have_rows=0, have_cols=0;
+  long l_val;
+  int have_scores=0;
+
+  chk_asn_buf(asnp, 32);
+
+  /* first get the query */
+
+  if (memcmp(asnp->abp, "\241\2000\200",4) != 0) {
+    asn_error("parse_pssm2_asn","ASN_PSSM2_QUERY",ASN_PSSM2_QUERY,asnp,4);
+    return -1;
+  }
+  else {
+    asnp->abp+=4;
+    asnp->abp = get_astr_bioseq(asnp, gi, name, acc, descr, query, nq) + 4;
+  }
+
+  /* finish up the nulls */
+  /* perhaps we have parsed correctly and do not need this */
+  /* while (ABP == '\0') { ABP_INC2;} */
+
+  if (memcmp(asnp->abp, "\242\2000\200",4) != 0) {
+    asn_error("parse_pssm2_asn","ASN_PSSM2_MATRIX",ASN_PSSM2_MATRIX,asnp,4);
+    return -1;
+  }
+  else {
+    asnp->abp+=4;
+
+    if (ABP == ASN_PSSM_IS_PROT) {
+      ABP_INC2;
+      asnp->abp = get_astr_bool(asnp, &is_protein)+2;
+    }
+
+    if (ABP == ASN_PSSM2_MATRIX_NAME) {
+      ABP_INC2;
+      asnp->abp = get_astr_str(asnp, matrix, MAX_SSTR) + 2;
+    }
+
+    if (ABP == ASN_PSSM2_NCOLS) {
+      ABP_INC2;
+      asnp->abp = get_astr_int(asnp, &l_val)+2;
+      *n_cols = l_val;
+      have_cols = (*n_cols > 0);
+    }
+
+    if (ABP == ASN_PSSM2_NROWS) {
+      ABP_INC2;
+      asnp->abp = get_astr_int(asnp, &l_val)+2;
+      *n_rows = l_val;
+      have_rows = (*n_rows > 0);
+    }
+
+    if (ABP == ASN_PSSM2_SCORES) {
+      /* right now, this is always empty */
+      ABP_INC2;
+      asnp->abp = get_pssm2_scores(asnp, &have_scores) + 2;
+      if (have_scores) return 0;
+    }
+
+    if (ABP == ASN_PSSM2_KARLIN_K) {
+      ABP_INC2;
+      asnp->abp = get_astr_packedreal(asnp, &l_val, lambda_p) + 2;
+    }
+
+    if (ABP == ASN_PSSM2_FREQS) {
+      /* the matrices are sized from *n_rows / *n_cols, so both must
+         have been read (and be positive) before allocating */
+      if (!have_rows || !have_cols) {
+        fprintf(stderr, "*** error [%s:%d] - cannot allocate freq - missing rows/cols - %d/%d\n",
+                __FILE__,__LINE__, have_rows, have_cols);
+        return -1;
+      }
+      asnp->abp += 4;
+      asnp->abp = get_pssm2_intermed(asnp, wfreqs, freqs, *n_rows, *n_cols) + 4;
+    }
+  }
+
+  return 1;
+}
+
+/* parse_pssm_asn() -- parse a binary ASN.1 PSSM read from afd.
+
+   *wfreqs, *freqs and *iscores are set to NULL on entry and are
+   replaced only when the corresponding section is present.  A file
+   whose header is followed by an integer version is handed to
+   parse_pssm2_asn() (only version 2 is accepted; the gap penalties
+   are set to 0 on that path).  Otherwise the fields are read in
+   order: is-protein, n_rows, n_cols, by-column flag, query,
+   intermediate data (frequencies), final data (scores; the
+   statistical values are read and discarded) and parameters.
+   Without a parameters section the defaults BLOSUM62 / -11 / -1 are
+   stored.
+
+   The asn_bstruct allocated here is released on every return path.
+
+   Returns 1 on success, -1 on a bad header, unsupported version or
+   missing row/column counts, or the value of parse_pssm2_asn() for
+   version 2 files.  Exits on a non-positive row or column count. */
+int
+parse_pssm_asn(FILE *afd,
+               long *gi,
+               char *name,
+               char *acc,
+               char *descr,
+               unsigned char **query,
+               int *nq,
+               int *n_rows,
+               int *n_cols,
+               double ***wfreqs,
+               double ***freqs,
+               int ***iscores,
+               int *pseudo_cnts,
+               char *matrix,
+               int *gap_open_p,
+               int *gap_ext_p,
+               double *lambda_p) {
+
+  int is_protein;
+  int pssm_version;
+  int ret_val;
+  long l_val;
+  long itmp;
+  int have_rows=0, have_cols=0, by_col=0;
+  double **my_freqs=NULL, **my_wfreqs=NULL, dtmp;
+  struct asn_bstruct *asnp;
+
+  *wfreqs = NULL;
+  *freqs = NULL;
+  *iscores = NULL;
+
+  asnp = new_asn_bstruct(ASN_BUF);
+
+  asnp->fd = afd;
+  asnp->len = ASN_BUF;
+  asnp->abp = asnp->buf_max = asnp->buf + ASN_BUF;
+
+  chk_asn_buf(asnp, 32);
+
+  if (memcmp(asnp->abp, "0\200\240\200",4) != 0) {
+    fprintf(stderr, "*** error [%s:%d] - improper PSSM header\n",__FILE__,__LINE__);
+    free_asn_bstruct(asnp);
+    return -1;
+  }
+  else {asnp->abp+=4;}
+
+  if (ABP == ASN_IS_INT) {
+    asnp->abp = get_astr_int(asnp, &l_val)+2;
+    pssm_version = l_val;
+    if (pssm_version != 2) {
+      fprintf(stderr, "*** error [%s:%d] - PSSM2 version mismatch: %d\n",__FILE__,__LINE__,pssm_version);
+      free_asn_bstruct(asnp);
+      return -1;
+    }
+    *gap_open_p = *gap_ext_p = 0;
+    ret_val = parse_pssm2_asn(asnp, gi, name, acc, descr,
+                              query, nq,
+                              n_rows, n_cols,
+                              wfreqs, freqs, iscores,
+                              pseudo_cnts, matrix,
+                              lambda_p);
+    /* release the parse buffer here as well, as on the version 1 path */
+    free_asn_bstruct(asnp);
+    return ret_val;
+  }
+
+  if (ABP == ASN_SEQ) { asnp->abp += 2; }
+
+  if (ABP == ASN_PSSM_IS_PROT ) {
+    ABP_INC2;
+    asnp->abp = get_astr_bool(asnp, &is_protein)+2;
+  }
+
+  if (ABP == ASN_PSSM_NROWS ) {
+    ABP_INC2;
+    asnp->abp = get_astr_int(asnp, &l_val)+2;
+    *n_rows = l_val;
+
+    if (*n_rows > 0) { have_rows = 1; }
+    else {
+      fprintf(stderr, "*** error [%s:%d] - bad n_row count\n",__FILE__,__LINE__);
+      exit(1);
+    }
+  }
+
+  if (ABP == ASN_PSSM_NCOLS ) {
+    ABP_INC2;
+    asnp->abp = get_astr_int(asnp, &l_val)+2;
+    *n_cols = l_val;
+    if (*n_cols > 0) {
+      have_cols = 1;
+    }
+    else {
+      fprintf(stderr, "*** error [%s:%d] - bad n_col count\n",__FILE__,__LINE__);
+      exit(1);
+    }
+  }
+
+  if (ABP == ASN_PSSM_BYCOL ) {
+    ABP_INC2;
+    asnp->abp = get_astr_bool(asnp, &by_col)+2;
+  }
+
+  /* we have read everything up to the query
+
+     n_cols gives us the query length, which we can allocate;
+  */
+
+  if (ABP == ASN_PSSM_QUERY ) {
+    asnp->abp+=4;	/* skip token and CHOICE */
+    asnp->abp = get_astr_bioseq(asnp, gi, name, acc, descr, query, nq) + 4;
+    *nq = *n_cols;
+  }
+
+  /* finish up the nulls */
+
+  while (ABP == '\0') { asnp->abp += 2;}
+
+  if (ABP == ASN_PSSM_INTERMED_DATA) {
+
+    if (!have_rows || !have_cols) {
+      fprintf(stderr, "*** error [%s:%d] - cannot allocate freq - missing rows/cols - %d/%d\n",
+              __FILE__,__LINE__, have_rows, have_cols);
+      free_asn_bstruct(asnp);
+      return -1;
+    }
+
+    ABP_INC2;
+    asnp->abp = get_pssm_intermed(asnp, &my_wfreqs, &my_freqs, *n_rows, *n_cols, by_col);
+    *wfreqs = my_wfreqs;
+    *freqs = my_freqs;
+  }
+
+  if (ABP == ASN_PSSM_FINAL_DATA) {
+    ABP_INC2;
+    if (ABP == ASN_SEQ) { asnp->abp += 2; }
+    if (ABP == ASN_PSSM_FINAL_DATA_SCORES) {
+
+      /* the score matrix is sized from *n_rows / *n_cols as well */
+      if (!have_rows || !have_cols) {
+        fprintf(stderr, "*** error [%s:%d] - cannot allocate scores - missing rows/cols - %d/%d\n",
+                __FILE__,__LINE__, have_rows, have_cols);
+        free_asn_bstruct(asnp);
+        return -1;
+      }
+
+      ABP_INC2;
+
+      asnp->abp = get_pssm_final_scores(asnp, iscores, *n_rows, *n_cols, by_col) + 2;
+
+      ABP_INC2;
+    }
+    /* the remaining final-data values are read only to step past them */
+    if (ABP == ASN_PSSM_FINAL_DATA_LAMBDA) {
+      ABP_INC2;
+      ABPP = get_astr_real(asnp, &dtmp) + 2;
+    }
+    if (ABP == ASN_PSSM_FINAL_DATA_KAPPA) {
+      ABP_INC2;
+      ABPP = get_astr_real(asnp, &dtmp) + 2;
+    }
+    if (ABP == ASN_PSSM_FINAL_DATA_H) {
+      ABP_INC2;
+      ABPP = get_astr_real(asnp, &dtmp) + 2;
+    }
+    if (ABP == ASN_PSSM_FINAL_DATA_SCALEF) {
+      ABP_INC2;
+      ABPP = get_astr_int(asnp, &itmp) + 2;
+    }
+    if (ABP == ASN_PSSM_FINAL_DATA_ULAMBDA) {
+      ABP_INC2;
+      ABPP = get_astr_real(asnp, &dtmp) + 2;
+    }
+    if (ABP == ASN_PSSM_FINAL_DATA_UKAPPA) {
+      ABP_INC2;
+      ABPP = get_astr_real(asnp, &dtmp) + 2;
+    }
+    if (ABP == ASN_PSSM_FINAL_DATA_UH) {
+      ABP_INC2;
+      ABPP = get_astr_real(asnp, &dtmp) + 2;
+    }
+    asnp->abp += 8;
+  }
+
+  if (ABP == ASN_PSSM_PARAMS ) {
+    ABP_INC2;
+    asnp->abp = get_pssm_params(asnp, pseudo_cnts, matrix, gap_open_p, gap_ext_p) + 2;
+  }
+  else {
+    *gap_open_p = -11;
+    *gap_ext_p = -1;
+    strncpy(matrix,"BLOSUM62",MAX_SSTR);
+    if (ABP == 0) {ABP_INC2;}
+  }
+
+  free_asn_bstruct(asnp);
+
+  return 1;
+}
+
+/* parse_pssm_asn_fa() -- read a binary ASN.1 PSSM from fd with
+   parse_pssm_asn() and re-order the first COMPO_NUM_TRUE_AA entries
+   of every column of (*freq2d) through pssm_aa_order[].
+
+   Returns the (<= 0) value of parse_pssm_asn() when parsing did not
+   succeed, -1 when the row count does not fit the re-ordering
+   buffer, and the (positive) parse result otherwise.  When the file
+   held no frequency data (*freq2d is still NULL) nothing is
+   re-ordered. */
+int
+parse_pssm_asn_fa( FILE *fd,
+                   int *n_rows_p, int *n_cols_p,
+                   unsigned char **query,
+                   double ***wfreq2d,
+                   double ***freq2d,
+                   int ***iscores2d,
+                   char *matrix,
+                   int *gap_open_p,
+                   int *gap_extend_p,
+                   double *lambda_p
+                   ) {
+
+  int qi, rj;
+  long gi;
+  double tmp_freqs[COMPO_LARGEST_ALPHABET];
+  char name[MAX_SSTR], acc[MAX_SSTR], descr[MAX_STR];
+  int nq;
+  int pseudo_cnts;
+  int ret_val;
+
+  /* parse the file */
+
+  ret_val = parse_pssm_asn(fd, &gi, name, acc, descr, query, &nq,
+                           n_rows_p, n_cols_p, wfreq2d, freq2d, iscores2d,
+                           &pseudo_cnts, matrix, gap_open_p, gap_extend_p,
+                           lambda_p);
+
+  if (ret_val <=0) return ret_val;
+
+  /* parse_pssm_asn() leaves *freq2d NULL when the file had no
+     intermediate (frequency) data; there is nothing to re-order */
+  if (*freq2d == NULL) return ret_val;
+
+  /* every column is copied into tmp_freqs[], so it must fit */
+  if (*n_rows_p > COMPO_LARGEST_ALPHABET) {
+    fprintf(stderr, "*** error [%s:%d] - too many PSSM rows: %d > %d\n",
+            __FILE__, __LINE__, *n_rows_p, COMPO_LARGEST_ALPHABET);
+    return -1;
+  }
+
+  for (qi = 0; qi < *n_cols_p; qi++) {
+    for (rj = 0; rj < *n_rows_p; rj++) { tmp_freqs[rj] = (*freq2d)[qi][rj];}
+
+    for (rj = 0; rj < COMPO_NUM_TRUE_AA; rj++) {
+      (*freq2d)[qi][rj] = tmp_freqs[pssm_aa_order[rj]];
+    }
+  }
+
+  return 1;
+}
diff --git a/src/pthr_subs.h b/src/pthr_subs.h
new file mode 100644
index 0000000..60469c0
--- /dev/null
+++ b/src/pthr_subs.h
@@ -0,0 +1,49 @@
+/* $Id: pthr_subs.h 625 2011-03-23 17:21:38Z wrp $ */
+
+#include <pthread.h>
+
+/* error macro for thread calls */
+
+#define check(status,string) \
+ if (status != 0) {fprintf(stderr,string); \
+ fprintf(stderr,"%s\n",strerror(status)); } /* error macro */
+
+/*
+#define check(status,string) \
+ if (status == -1) perror(string) */ /* error macro for thread calls */
+
+
+#ifndef XTERNAL
+pthread_t *fa_threads=NULL;
+
+/* reader stuff */
+
+pthread_mutex_t reader_mutex; /* empty buffer pointer structure lock */
+pthread_cond_t reader_cond_var; /* condition variable for reader */
+
+pthread_mutex_t worker_mutex; /* full buffer pointer structure lock */
+pthread_cond_t worker_cond_var; /* condition variable for workers */
+
+/* condition variable stuff */
+
+pthread_mutex_t start_mutex; /* start-up synchronisation lock */
+pthread_cond_t start_cond_var; /* start-up synchronisation condition variable */
+
+#else
+extern pthread_t *fa_threads;
+
+/* mutex stuff */
+
+extern pthread_mutex_t reader_mutex;
+extern pthread_mutex_t worker_mutex;
+
+/* condition variable stuff */
+
+extern pthread_cond_t reader_cond_var;
+extern pthread_cond_t worker_cond_var;
+
+extern pthread_mutex_t start_mutex;
+extern pthread_cond_t start_cond_var;
+extern int start_thread;
+
+#endif
diff --git a/src/pthr_subs2.c b/src/pthr_subs2.c
new file mode 100644
index 0000000..e64d81c
--- /dev/null
+++ b/src/pthr_subs2.c
@@ -0,0 +1,377 @@
+/* $Id: pthr_subs2.c 625 2011-03-23 17:21:38Z wrp $ */
+
+/* copyright (c) 1996, 1997, 1998, 1999, 2014 by William R. Pearson and
+ The Rector & Visitors of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+/* modified to do more initialization of work_info here, rather than in main() */
+
+/* this file isolates the pthreads calls from the main program */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <sys/types.h>
+#ifdef UNIX
+#include <unistd.h>
+#endif
+#include <signal.h>
+
+#include "defs.h"
+#include "structs.h" /* mngmsg, libstruct */
+#include "param.h" /* pstruct, thr_str, buf_head, rstruct */
+#include "thr_buf_structs.h"
+
+#include <pthread.h>
+#define XTERNAL
+#include "thr_bufs2.h"
+#undef XTERNAL
+#include "pthr_subs.h"
+
+extern void work_thread (struct thr_str *);
+
+/* init_thr() -- start the threads working.
+
+   Fills in work_info[0..nthreads-1] from m_msp, ppst, aa0 and
+   m_bufi_p, initialises the reader/worker/start mutexes and
+   condition variables, and creates nthreads threads running
+   work_thread(), each given its own work_info[] entry.
+
+   fa_threads is allocated here if it is still NULL (the program
+   exits if that fails).  Errors from the pthread calls are reported
+   on stderr by check(); they do not stop initialisation. */
+
+void init_thr(int nthreads, struct thr_str *work_info,
+              const struct mngmsg *m_msp, struct pstruct *ppst,
+              unsigned char *aa0, struct mng_thr *m_bufi_p)
+{
+  int status, i;
+  pthread_attr_t thread_attr;
+
+  if (fa_threads == NULL && (fa_threads=(pthread_t *)calloc(nthreads, sizeof(pthread_t)))==NULL) {
+    fprintf(stderr, "Cannot allocate %d pthread_t\n",nthreads);
+    exit(1);
+  }
+
+  /* set up work_info[] structure, set parameters */
+
+  for (i=0; i<nthreads; i++) {
+    work_info[i].m_msp = m_msp;
+    work_info[i].n0 = m_msp->n0;
+    work_info[i].nm0 = m_msp->nm0;
+    work_info[i].qframe = m_msp->qframe;
+    work_info[i].qshuffle = m_msp->qshuffle;
+    work_info[i].ppst = ppst;
+    work_info[i].aa0 = aa0;
+    work_info[i].max_work_buf=m_bufi_p->max_work_buf;
+    work_info[i].worker=i;
+    work_info[i].max_tot=m_msp->max_tot;
+  }
+
+  /* mutex and condition variable initialisation */
+
+  status = pthread_mutex_init(&reader_mutex, NULL);
+  check(status,"Reader_mutex init bad status\n");
+
+  status = pthread_mutex_init(&worker_mutex, NULL);
+  check(status,"Worker_mutex init bad status\n");
+
+  status = pthread_cond_init(&reader_cond_var, NULL);
+  check(status,"Reader_cond_var init bad status\n");
+
+  status = pthread_cond_init(&worker_cond_var, NULL);
+  check(status,"Worker_cond_var init bad status\n");
+
+  status = pthread_mutex_init(&start_mutex, NULL);
+  check(status,"Start_mutex init bad status\n");
+
+  status = pthread_cond_init(&start_cond_var, NULL);
+  check(status,"Start_cond_var init bad status\n");
+
+  /* thread attributes (only the contention scope is changed, and
+     only when IRIX or FASTA_setscope is defined) */
+
+  status = pthread_attr_init( &thread_attr );
+  check(status,"attribute create bad status\n");
+
+#ifdef IRIX
+  /* pthread_attr_setscope() returns an int error code: compare it
+     with 0, not with the pointer constant NULL */
+  if (pthread_attr_setscope( &thread_attr, 2) != 0)
+    status = pthread_attr_setscope( &thread_attr,PTHREAD_SCOPE_PROCESS);
+  check(status,"set scope on IRIX bad status\n");
+#endif
+
+#ifdef FASTA_setscope
+  status = pthread_attr_setscope( &thread_attr, PTHREAD_SCOPE_SYSTEM);
+  check(status,"set scope bad status\n");
+#endif
+
+  /* start the worker threads */
+
+  for (i=0; i < nthreads; i++) {
+    status=pthread_create(&fa_threads[i],&thread_attr,
+                          (void *(*)(void *))&work_thread,&work_info[i]);
+    check(status,"Pthread_create failed\n");
+  }
+
+  /* the attribute object is not needed once the threads have been
+     created; destroying it does not affect them */
+  status = pthread_attr_destroy( &thread_attr );
+  check(status,"attribute destroy bad status\n");
+}
+
+/* start_thr() -- clear start_thread and wake every thread waiting on
+   start_cond_var.
+
+   start_mutex/start_cond_var provide exclusive access to
+   extern int start_thread */
+
+void start_thr()
+{
+  int status;
+
+  /* tell threads to proceed */
+
+  status = pthread_mutex_lock(&start_mutex);
+  check(status,"Start_mutex lock bad status in main\n");
+
+  start_thread = 0;  /* lower predicate */
+
+  /* report a failed broadcast instead of silently overwriting status */
+  status = pthread_cond_broadcast(&start_cond_var);
+  check(status,"Start_cond_var broadcast bad status in main\n");
+
+  status = pthread_mutex_unlock(&start_mutex);
+  check(status,"Start_mutex unlock bad status in main\n");
+}
+
+/* get_rbuf() -- called by the main program to obtain a buffer that it
+   can fill with sequences.
+
+   Buffers cycle between two queues.  The main program takes a buffer
+   with get_rbuf(), fills it with sequences, and queues it with
+   put_rbuf().  A worker takes a filled buffer with get_wbuf(), does
+   the comparisons, stores the results in that same buffer, and hands
+   it back with put_wbuf().
+
+   Runs under reader_mutex: clears reader_wait, waits on
+   reader_cond_var until num_reader_bufs is non-zero, then takes
+   reader_buf[reader_buf_readp], advances reader_buf_readp modulo
+   max_work_buf and decrements num_reader_bufs.
+*/
+
+void get_rbuf(struct buf_head **cur_buf, int max_work_buf)
+{
+  int rc;
+
+  rc = pthread_mutex_lock(&reader_mutex);	/* lock reader_buf structure */
+  reader_wait = 0;
+  check(rc,"Reader_mutex lock in master bad status\n");
+
+  /* nothing to hand out yet: sleep until a worker returns a buffer */
+  while (!num_reader_bufs) {
+    pthread_cond_wait(&reader_cond_var,&reader_mutex);
+  }
+
+  /* take the buffer at the read index, then step the index */
+  *cur_buf = reader_buf[reader_buf_readp];
+  reader_buf_readp = (reader_buf_readp + 1) % max_work_buf;
+  num_reader_bufs -= 1;
+
+  /* fprintf(stderr, " rb: %3d consumed by %lld\n",num_reader_bufs,pthread_self()); */
+
+  rc = pthread_mutex_unlock(&reader_mutex);	/* unlock structure */
+  check(rc,"Reader_mutex unlock in master bad status\n");
+}
+
+/* put_rbuf() -- queue a buffer that has been filled with sequences so
+   that a worker thread can compare them.
+
+   Runs under worker_mutex: stores cur_buf at
+   worker_buf[worker_buf_readp], advances worker_buf_readp modulo
+   max_work_buf, increments num_worker_bufs and signals
+   worker_cond_var.
+ */
+
+void put_rbuf(struct buf_head *cur_buf, int max_work_buf)
+{
+  int rc;
+
+  rc = pthread_mutex_lock(&worker_mutex);	/* lock worker_buf_structure */
+  check(rc,"Worker_mutex lock in master bad status\n");
+
+  /* append the buffer to the list available to the workers */
+  worker_buf[worker_buf_readp] = cur_buf;
+  worker_buf_readp = (worker_buf_readp + 1) % max_work_buf;
+  num_worker_bufs += 1;
+
+  /* wake one waiting worker; the result of the signal is not examined */
+  (void)pthread_cond_signal(&worker_cond_var);
+
+  rc = pthread_mutex_unlock(&worker_mutex);
+  check(rc,"Worker_mutex unlock in master bad status\n");
+}
+
+/* put_rbuf_done() -- queue a last filled buffer, mark the reader as
+   finished, wake all workers and join the nthreads worker threads.
+
+   this function is not currently used */
+void put_rbuf_done(int nthreads, struct buf_head *cur_buf, int max_work_buf)
+{
+  int rc, t;
+  void *thr_result;
+
+  rc = pthread_mutex_lock(&worker_mutex);	/* lock worker_buf_structure */
+  check(rc,"Worker_mutex lock in master bad status\n");
+
+  /* append the buffer to the list available to the workers */
+  worker_buf[worker_buf_readp] = cur_buf;
+  worker_buf_readp = (worker_buf_readp + 1) % max_work_buf;
+  num_worker_bufs += 1;
+
+  /* once the queue is empty, get_wbuf() returns 0 while reader_done
+     is set, which lets each worker leave its main loop */
+  reader_done = 1;
+
+  /* wake every waiting worker; the result is not examined */
+  (void)pthread_cond_broadcast(&worker_cond_var);
+
+  rc = pthread_mutex_unlock(&worker_mutex);
+  check(rc,"Worker_mutex unlock in master bad status\n");
+
+  /* wait for each worker thread to terminate */
+  t = 0;
+  while (t < nthreads) {
+    rc = pthread_join( fa_threads[t], &thr_result);
+    check(rc,"Pthread_join bad status\n");
+    t++;
+  }
+}
+
+/* wait_rbuf() -- block until the worker threads have returned the
+   sequence buffers that are in use, i.e. until num_reader_bufs has
+   reached used_reader_bufs.  reader_wait is set so that put_wbuf()
+   signals for every buffer it returns.
+*/
+void wait_rbuf(int used_reader_bufs) {
+  int rc;
+
+  rc = pthread_mutex_lock(&reader_mutex);
+  check(rc,"wait_rbuf reader_mutex lock in worker bad status\n");
+
+  reader_wait = 1;
+
+  /* some buffers are still with the workers: sleep until they return */
+  for ( ; num_reader_bufs < used_reader_bufs; ) {
+    pthread_cond_wait(&reader_cond_var,&reader_mutex);
+  }
+
+  rc = pthread_mutex_unlock(&reader_mutex);
+  check(rc,"wait_rbuf reader_mutex unlock in worker bad status\n");
+}
+
+/*
+  rbuf_done() -- called once at the end of each search.
+
+  Sets reader_done under worker_mutex, wakes all waiting workers, and
+  joins the nthreads worker threads.
+ */
+
+void rbuf_done(int nthreads)
+{
+  int rc, t;
+  void *thr_result;
+
+  rc = pthread_mutex_lock(&worker_mutex);	/* lock worker_buf_structure */
+  check(rc,"Worker_mutex lock in master bad status\n");
+
+  /* once the queue is empty, get_wbuf() returns 0 while reader_done
+     is set, which lets each worker leave its main loop */
+  reader_done = 1;
+
+  /* wake every waiting worker; the result is not examined */
+  (void)pthread_cond_broadcast(&worker_cond_var);
+
+  rc = pthread_mutex_unlock(&worker_mutex);
+  check(rc,"Worker_mutex unlock in master bad status\n");
+
+  /* wait for each worker thread to terminate */
+  t = 0;
+  while (t < nthreads) {
+    rc = pthread_join( fa_threads[t], &thr_result);
+    check(rc,"Pthread_join bad status\n");
+    t++;
+  }
+}
+
+/* wait_thr() -- block the calling thread until extern int
+   start_thread becomes 0 (see start_thr()) */
+
+void wait_thr()
+{
+  int rc;
+
+  rc = pthread_mutex_lock(&start_mutex);
+  check(rc,"Start_mutex lock bad status in worker\n");
+
+  /* re-test the predicate after every wake-up */
+  for ( ; start_thread; ) {
+    rc = pthread_cond_wait(&start_cond_var, &start_mutex);
+    check(rc,"Start_cond_wait bad status in worker\n");
+  }
+
+  rc = pthread_mutex_unlock(&start_mutex);
+  check(rc,"Start_mutex unlock bad status in worker\n");
+}
+
+/* get_wbuf() -- used in worker threads
+   take the next buffer of sequences queued by the main program
+
+   Runs under worker_mutex.  Returns 1 with *cur_buf set after
+   advancing worker_buf_workp (modulo max_work_buf) and decrementing
+   num_worker_bufs; returns 0, leaving *cur_buf untouched, when the
+   queue is empty and reader_done is set.
+ */
+
+int get_wbuf(struct buf_head **cur_buf, int max_work_buf)
+{
+  int rc;
+
+  rc = pthread_mutex_lock(&worker_mutex);
+  check(rc,"First worker_mutex lock in worker bad status\n");
+
+  for (;;) {
+    /* a buffer is queued: go and take it */
+    if (num_worker_bufs != 0) break;
+
+    /* queue empty and the reader has finished: nothing more to do */
+    if (reader_done) {
+      pthread_mutex_unlock(&worker_mutex);
+      return 0;
+    }
+
+    /* queue empty: sleep until the reader queues a buffer or finishes */
+    pthread_cond_wait(&worker_cond_var,&worker_mutex);
+  }
+
+  /* take the buffer at the work index, then step the index */
+  *cur_buf = worker_buf[worker_buf_workp];
+  worker_buf_workp = (worker_buf_workp + 1) % max_work_buf;
+  num_worker_bufs -= 1;
+
+  rc = pthread_mutex_unlock(&worker_mutex);
+  check(rc,"First worker_mutex unlock in worker bad status\n");
+  return 1;
+}
+
+/* put_wbuf() -- called in worker threads
+   hand a buffer, now holding scores, back to the main program
+
+   Runs under reader_mutex: stores cur_buf at
+   reader_buf[reader_buf_workp], advances reader_buf_workp modulo
+   max_work_buf and increments num_reader_bufs.  reader_cond_var is
+   signalled when this is the only buffer on the list or when the
+   main program has set reader_wait.
+ */
+void put_wbuf(struct buf_head *cur_buf, int max_work_buf)
+{
+  int rc;
+  int wake_reader;
+
+  rc = pthread_mutex_lock(&reader_mutex);
+  check(rc,"Reader_mutex lock in worker bad status\n");
+
+  /* put buffer back on list for reader */
+  reader_buf[reader_buf_workp] = cur_buf;
+  reader_buf_workp = (reader_buf_workp + 1) % max_work_buf;
+  num_reader_bufs += 1;
+
+  /* fprintf(stderr, " rb: %3d produced\n",num_reader_bufs); */
+
+  /* we used to do this only when num_reader_bufs==1 */
+  wake_reader = !(num_reader_bufs != 1 && reader_wait != 1);
+  if (wake_reader) {
+    pthread_cond_signal(&reader_cond_var);
+  }
+
+  rc = pthread_mutex_unlock(&reader_mutex);
+  check(rc,"Reader_mutex unlock in worker bad status\n");
+}
diff --git a/src/randtest.c b/src/randtest.c
new file mode 100644
index 0000000..2c7cf7e
--- /dev/null
+++ b/src/randtest.c
@@ -0,0 +1,38 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/time.h>
+
+void *my_srand(int);
+
+/* randtest -- print n values returned by my_nrand(n, state), one per
+   line, using the state returned by my_srand(0).  n is taken from
+   the first command line argument and defaults to 10.
+
+   NOTE(review): my_nrand() has no prototype in this file; declare it
+   next to my_srand() once its signature has been confirmed. */
+int
+main(int argc, char **argv)
+{
+  int i, n, s;
+  struct timeval t;	/* referenced only by the disabled seed code below */
+  void *my_rand_state;
+
+  if (argc < 2) n = 10;
+  else n = atoi(argv[1]);
+
+  /*
+  gettimeofday(&t,NULL);
+  printf(" seed: %d\n",t.tv_usec);
+  */
+
+  my_rand_state = my_srand(0);
+
+  for (i=0; i<n; i++) {
+    s = my_nrand(n,my_rand_state);
+    printf("%d\n",s);
+  }
+
+  /*
+  for (i=0; i< 9999; i++) {
+
+  }
+  n = my_nrand(2147483648,my_rand_state);
+  printf("number 10000: %d\n",n);
+  n = my_nrand(2147483648,my_rand_state);
+  printf("number 10001: %d\n",n);
+  */
+
+  return 0;
+}
diff --git a/src/re_getlib.c b/src/re_getlib.c
new file mode 100644
index 0000000..a25e350
--- /dev/null
+++ b/src/re_getlib.c
@@ -0,0 +1,146 @@
+/* re_getlib.c - re-acquire a sequence given lseek, lcont */
+
+/* $Id: re_getlib.c 1227 2013-09-26 19:19:28Z wrp $ */
+
+/* copyright (C) 2005, 2008, 2014 by William R. Pearson and The Rector &
+ Visitors of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "defs.h"
+#include "structs.h"
+#include "mm_file.h"
+#define XTERNAL
+#include "uascii.h"
+
+#define GETLIB (m_fptr->getlib)
+
+/* modified Feb, 2008 to provide aa1a - annotation string */
+extern int ann_scan(unsigned char *aa0, int n0, unsigned char **aa0a_p, int seqtype);
+/* re_getlib() -- re-read a library sequence, piece by piece, into aa1.
+   The first lcont-1 pieces are read with the library's current sascii
+   table; the final piece is read through l_ann_ascii and passed to
+   ann_scan() when annot_p is non-NULL and the library is not memory
+   mapped.  An optional terminator (term_code) and an optional
+   sub-range (m_fptr->opt_text) are then applied to the final piece.
+   Returns n1, the resulting length of the final piece, plus loff when
+   that piece was stored after an overlap region at the start of aa1. */
+int
+re_getlib(unsigned char *aa1,
+ struct annot_str **annot_p,
+ int maxn, /* longest aa1 */
+ int maxt3, /* alternate maxn */
+ int loff, /* overlap */
+ int lcont, /* piece count: lcont-1 pieces are read before the one that is returned */
+ int term_code, /* if non-zero, appended to lib_aa sequences that do not already end with it */
+ long *loffset, /* offset from real start of sequence */
+ long *l_off_p, /* coordinate of sequence start */
+ struct lmf_str *m_fptr) {
+
+ unsigned char *aa1ptr;
+ int *sascii_save;
+ int icont, maxt, ccont, n1;
+ char libstr[MAX_UID];
+ fseek_t lmark;
+ int sstart, sstop, is, id;
+
+ aa1ptr = aa1;
+ icont=0;
+
+ /* no longer do selection */
+ m_fptr->sel_acc_p = NULL;
+
+ *loffset = 0l;
+ maxt = maxn;
+ n1 = -1;
+
+ /* to process sequences in pieces properly, if lcont > 0, then we
+ must read all but the last sequence using the scanning sascii,
+ and then read the last piece using the ann_ascii */
+
+ if (lcont > 1) {
+ for (ccont=0; ccont<lcont-1; ccont++) {
+
+ n1= GETLIB(aa1ptr,maxt,libstr,sizeof(libstr),&lmark,&icont,m_fptr,l_off_p);
+
+ if (term_code && m_fptr->lib_aa && aa1ptr[n1-1]!=term_code) { /* NOTE(review): reads aa1ptr[n1-1] without checking n1 > 0 -- confirm GETLIB cannot return <= 0 here */
+ aa1ptr[n1++]=term_code;
+ aa1ptr[n1]=0;
+ }
+
+ if (aa1ptr!=aa1) n1 += loff; /* this piece was stored after loff overlap residues */
+
+ if (icont) { /* GETLIB set icont: the sequence continues in another piece */
+ maxt = maxt3;
+ memcpy(aa1,&aa1[n1-loff],loff); /* move the last loff residues to the front as overlap */
+ aa1ptr= &aa1[loff];
+ *loffset += n1 - loff;
+ }
+ else {
+ maxt = maxn;
+ aa1ptr=aa1;
+ }
+ }
+ }
+
+ /* for the last one, replace m_fptr->sascii with ann_ascii[], and
+ read the sequence */
+
+ /* change sascii matrix only if there are annotations - otherwise
+ l_ann_ascii is not initialized */
+ /* cannot scan for annotations in memory mapped files */
+ if (annot_p != NULL && !m_fptr->mm_flg) {
+ if (*annot_p || (*annot_p = (struct annot_str *)calloc(1,sizeof(struct annot_str)))!=NULL) { /* use the caller's annot_str, or allocate a zeroed one */
+ sascii_save = m_fptr->sascii;
+ m_fptr->sascii = l_ann_ascii;
+ n1= GETLIB(aa1ptr,maxt,libstr,sizeof(libstr),&lmark,&icont,m_fptr,l_off_p);
+ m_fptr->sascii = sascii_save; /* restore the scanning table */
+ n1 = ann_scan(aa1ptr,n1,&((*annot_p)->aa1_ann),0);
+ }
+ else {
+ fprintf(stderr,"re_getlib.c: cannot allocate annot_p\n");
+ n1= GETLIB(aa1ptr,maxt,libstr,sizeof(libstr),&lmark,&icont,m_fptr,l_off_p);
+ }
+ }
+ else {
+ n1= GETLIB(aa1ptr,maxt,libstr,sizeof(libstr),&lmark,&icont,m_fptr,l_off_p);
+ }
+
+ if (term_code && m_fptr->lib_aa && aa1ptr[n1-1]!=term_code) { /* NOTE(review): same unchecked aa1ptr[n1-1] access as in the loop above */
+ aa1ptr[n1++]=term_code;
+ aa1ptr[n1]=0;
+ }
+
+ /* check for subset */
+ if (m_fptr->opt_text[0]!='\0') {
+ if (m_fptr->opt_text[0]=='-') { /* "-stop" form: keep residues from the start up to stop */
+ sstart=0; sscanf(&m_fptr->opt_text[1],"%d",&sstop); /* NOTE(review): sstop stays uninitialized if sscanf() converts nothing */
+ }
+ else { /* "start-stop" form */
+ sstart = 0; sstop = -1;
+ sscanf(&m_fptr->opt_text[0],"%d-%d",&sstart,&sstop);
+ sstart--; /* convert the 1-based start to a 0-based index */
+ if (sstop <= 0 ) sstop = BIGNUM;
+ }
+
+ n1 = min(n1, sstop);
+ for (id=0,is=sstart; is<n1; ) { /* shift the selected range down to the start of aa1ptr */
+ aa1ptr[id++]=aa1ptr[is++];
+ }
+ aa1ptr[id]='\0';
+ n1 -= sstart;
+ *l_off_p += sstart;
+ }
+
+ if (aa1ptr!=aa1) n1 += loff; /* include the overlap region in the returned length */
+
+ return n1;
+}
diff --git a/src/res_stats.c b/src/res_stats.c
new file mode 100644
index 0000000..95271e5
--- /dev/null
+++ b/src/res_stats.c
@@ -0,0 +1,703 @@
+/* $Id: res_stats.c 1227 2013-09-26 19:19:28Z wrp $ */
+
+/* copyright (C) 2005, 2014 by William R. Pearson and The Rector &
+ Visitors of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+/* calculate stats from results file using scalesws.c */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <limits.h>
+#include <math.h>
+
+#define MAX_LLEN 200
+
+#define LN_FACT 10.0
+
+#include "defs.h"
+#include "structs.h"
+#include "param.h"
+
+struct beststr { /* one entry in the list of best-scoring results */
+ int score; /* smith-waterman score */
+ int sscore; /* duplicate for compatibility with fasta */
+ double comp; /* comp value read from the results line; passed to find_z() */
+ double H; /* H value read from the results line */
+ double zscore; /* value from find_z() once statistics exist; the raw score before that */
+ double escore;
+ int n1; /* n1 field of the results line; also the length passed to find_z() */
+#ifndef USE_FTELLO
+ long lseek; /* position in library file */
+#else
+ off_t lseek;
+#endif
+ int cont; /* offset into sequence */
+ int frame;
+ int lib; /* set from the lsfnum field of the results line */
+ char libstr[13]; /* identifier: at most 12 characters plus the terminating NUL */
+} *bbp, *bestptr, **bptr, *best;
+
+struct stat_str { /* one sample kept for process_hist() */
+ int score;
+ int n1;
+ double comp;
+ double H;
+};
+
+static struct db_str qtt = {0l, 0l, 0};
+
+char info_gstring2[MAX_STR]; /* string for label */
+char info_gstring3[MAX_STR];
+char info_hstring1[MAX_STR];
+
+FILE *outfd;
+
+int nbest; /* number of sequences better than bestcut in best */
+int bestcut=1; /* cut off for getting into MAX_BEST */
+int bestfull;
+
+int dohist = 0;
+int zsflag = 1;
+int outtty=1;
+int llen=40;
+
+/* statistics functions */
+extern void
+process_hist(struct stat_str *sptr, int nstat, struct pstruct pst,
+ struct hist_str *hist, void **);
+extern void addhistz(double, struct hist_str *); /* scaleswn.c */
+void selectbestz(struct beststr **, int, int );
+
+extern double zs_to_E(double, int, int, long, struct db_str);
+extern double zs_to_Ec(double zs, long entries);
+
+extern double find_z(int score, int length, double comp, void *);
+
+void prhist(FILE *, struct mngmsg, struct pstruct, struct hist_str,
+ int, struct db_str, char *);
+
+int nshow=20, mshow=50, ashow= -1;
+double e_cut=10.0;
+
+main(argc, argv)
+ int argc; char **argv;
+{
+ FILE *fin;
+ char line[512];
+ int max, icol, iarg, i, qsfnum, lsfnum, n0, n1, s[3], frame;
+ double comp, H;
+ int idup, ndup, max_s;
+ char libstr[MAX_UID], *bp;
+ char bin_file[80];
+ FILE *bout=NULL;
+ struct mngmsg m_msg; /* Message from host to manager */
+ struct pstruct pst;
+ struct stat_str *stats;
+ int nstats;
+ double zscor, mu, var;
+
+#if defined(UNIX)
+ outtty = isatty(1);
+#else
+ outtty = 1;
+#endif
+
+ if (argc < 2 ) {
+ fprintf(stderr," useage - res_stats -c col -r bin_file file\n");
+ exit(1);
+ }
+
+ m_msg.db.length = qtt.length = 0l;
+ m_msg.db.entries = m_msg.db.carry = qtt.entries = qtt.carry = 0;
+ m_msg.pstat_void = NULL;
+ m_msg.hist.hist_a = NULL;
+ m_msg.nohist = 0;
+ m_msg.markx = 0;
+
+ pst.n0 = 200; /* sensible dummy value */
+ pst.zsflag = 1;
+ pst.dnaseq = 0;
+ pst.histint = 2;
+
+ bin_file[0]='\0';
+ icol = 1;
+ iarg = 1;
+ ndup = 1;
+ while (1) {
+ if (argv[iarg][0]=='-' && argv[iarg][1]=='c') {
+ sscanf(argv[iarg+1],"%d",&icol);
+ iarg += 2;
+ }
+ else if (argv[iarg][0]=='-' && argv[iarg][1]=='r') {
+ strncpy(bin_file,argv[iarg+1],sizeof(bin_file));
+ iarg += 2;
+ }
+ else if (argv[iarg][0]=='-' && argv[iarg][1]=='z') {
+ sscanf(argv[iarg+1],"%d",&pst.zsflag);
+ iarg += 2;
+ }
+ else if (argv[iarg][0]=='-' && argv[iarg][1]=='n') {
+ pst.dnaseq = 1;
+ iarg += 1;
+ }
+ else if (argv[iarg][0]=='-' && argv[iarg][1]=='s') {
+ sscanf(argv[iarg+1],"%d",&ndup);
+ iarg += 2;
+ }
+ else if (argv[iarg][0]=='-' && argv[iarg][1]=='q') {
+ outtty = 0;
+ iarg += 1;
+ }
+ else break;
+ }
+
+ icol--;
+
+ if ((fin=fopen(argv[iarg],"r"))==NULL) {
+ fprintf(stderr," cannot open %s\n",argv[1]);
+ exit(1);
+ }
+
+ if (bin_file[0]!='\0' && ((bout=fopen(bin_file,"w"))==NULL)) {
+ fprintf(stderr,"cannot open %s for output\n",bin_file);
+ }
+
+ if ((stats =
+ (struct stat_str *)malloc((MAX_STATS)*sizeof(struct stat_str)))==NULL)
+ s_abort ("Cannot allocate stats struct","");
+ nstats = 0;
+
+ initbest(MAX_BEST+1); /* +1 required for select() */
+
+ for (nbest=0; nbest<MAX_BEST+1; nbest++)
+ bptr[nbest] = &best[nbest];
+ bptr++; best++;
+ best[-1].score= BIGNUM;
+
+ nbest = 0;
+
+ pst.Lambda=0.232;
+ pst.K = 0.11;
+ pst.H = 0.34;
+
+ /* read the best scores from the results file */
+
+ max_s = -1;
+ idup = 0;
+
+ /* get first line with sequence length */
+ fgets(line,sizeof(line),fin);
+ sscanf(line,"%d",&n0);
+ if (n0 > 0) pst.n0 = n0;
+
+ while (fgets(line,sizeof(line),fin)!=NULL) {
+ if (line[0]=='/' && line[1]=='*') {
+ fputs(line,stdout);
+ strncpy(info_gstring2,line,sizeof(info_gstring2));
+ if ((bp=strchr(info_gstring2,'\n'))!=NULL) *bp = '\0';
+ break;
+ }
+ if (line[0]==';') {
+ if ((bp=strchr(line,'|'))!=NULL) qsfnum = atoi(bp+1);
+ else continue;
+ if ((bp=strchr(line,'('))!=NULL) {
+ n0 = atoi(bp+1);
+ pst.n0 = n0;
+ }
+ else {
+ fprintf(stderr, "cannot find n0:\n %s\n",line);
+ continue;
+ }
+ }
+ else {
+ sscanf(line,"%s %d %d %d %lf %lf %d %d %d",
+ libstr,&lsfnum,&n1,&frame,&comp, &H, &s[0],&s[1],&s[2]);
+ if (lsfnum==0 && n1==0) {
+ fputs(line,stderr);
+ continue;
+ }
+ if (n1 < 10 || s[icol]<=0) fputs(line,stderr);
+ idup++;
+
+ if (s[icol] > max_s) max_s = s[icol];
+ if (idup < ndup) continue;
+
+ m_msg.db.entries++;
+ m_msg.db.length += n1;
+
+ if (dohist) addhistz(zscor=find_z(max_s,n1,comp,m_msg.pstat_void),
+ &m_msg.hist);
+ else zscor = (double)max_s;
+
+ if (nstats < MAX_STATS) {
+ stats[nstats].n1 = n1;
+ stats[nstats].comp = comp;
+ stats[nstats].H = H;
+ stats[nstats++].score = max_s;
+ }
+
+ else if (!dohist) {
+ /* do_bout(bout,stats,nstats); */
+ process_hist(stats,nstats,pst,&m_msg.hist, &m_msg.pstat_void);
+ for (i=0; i<nbest; i++)
+ bptr[i]->zscore =
+ find_z(bptr[i]->score,bptr[i]->n1,bptr[i]->comp,
+ m_msg.pstat_void);
+ dohist = 1;
+ }
+
+ if (dohist) {
+ zscor =find_z(max_s,n1,comp,m_msg.pstat_void);
+ addhistz(zscor,&m_msg.hist);
+ }
+ else zscor = (double)max_s;
+
+ if (nbest >= MAX_BEST) {
+ bestfull = nbest-MAX_BEST/4;
+ selectz(bestfull-1,nbest);
+ bestcut = (int)(bptr[bestfull-1]->zscore+0.5);
+ nbest = bestfull;
+ }
+ bestptr = bptr[nbest];
+ bestptr->score = max_s;
+ bestptr->sscore = max_s;
+ bestptr->n1 = n1;
+ bestptr->comp = comp;
+ bestptr->H = H;
+ bestptr->lib = lsfnum;
+ bestptr->zscore = zscor;
+ strncpy(bestptr->libstr,libstr,12);
+ bestptr->libstr[12]='\0';
+ nbest++;
+
+ max_s = -1;
+ idup = 0;
+ }
+ } /* done with reading results */
+
+ if (!dohist) {
+ if (nbest < 20) {
+ zsflag = 0;
+ }
+ else {
+ /* do_bout(bout,stats,nstats); */
+ process_hist(stats,nstats,pst,&m_msg.hist,&m_msg.pstat_void);
+ for (i=0; i<nbest; i++)
+ bptr[i]->zscore =
+ find_z(bptr[i]->score,bptr[i]->n1,bptr[i]->comp,m_msg.pstat_void);
+ dohist = 1;
+ }
+ }
+
+ printf(" using n0: %d\n",pst.n0);
+
+ /* print histogram, statistics */
+
+ m_msg.nbr_seq = m_msg.db.entries;
+ pst.zdb_size = m_msg.db.entries;
+ /* get_param(&pst, info_gstring2,info_gstring3); */
+
+ prhist(stdout,m_msg,pst,m_msg.hist,nstats,m_msg.db,info_gstring2);
+
+ if (!zsflag) sortbest();
+ else {
+ sortbestz(bptr,nbest);
+ for (i=0; i<nbest; i++)
+ bptr[i]->escore = zs_to_E(bptr[i]->zscore,bptr[i]->n1,pst.dnaseq,
+ pst.zdb_size, m_msg.db);
+ }
+
+ outfd = stdout;
+ showbest(m_msg.db); /* display best matches */
+}
+
+initbest(nbest) /* allocate the zero-filled best[] and bptr[] arrays, nbest slots each */
+  int nbest;
+{
+  size_t n_slots = (size_t)nbest;
+
+  /* storage for the hit records themselves */
+  best = (struct beststr *)calloc(n_slots,sizeof(struct beststr));
+  if (best == NULL) {
+    fprintf(stderr,"cannot allocate best struct\n");
+    exit(1);
+  }
+
+  /* pointer array that the sort/select routines permute */
+  bptr = (struct beststr **)calloc(n_slots,sizeof(struct beststr *));
+  if (bptr == NULL) {
+    fprintf(stderr,"cannot allocate bptr\n");
+    exit(1);
+  }
+}
+
+/* prhist() -- print the score histogram and the library/statistics
+   summary to fd.
+
+   fd            output stream
+   m_msg         only nohist and markx are read here
+   pst           only zsflag is read here
+   hist          histogram bins (hist_a[], maxh, min_hist, histint),
+                 entries and the stat_info text
+   nstats        number of scores used for statistics; with nstats <= 10
+                 (or pst.zsflag < 0) only the totals line and
+                 info_gstring2 are printed
+   ntt           library totals (length, entries, carry)
+   info_gstring2 text line printed last
+
+   When MX_M10FORM is set in m_msg.markx the statistics summary is also
+   formatted into the global info_hstring1.
+*/
+void
+prhist(FILE *fd, struct mngmsg m_msg,
+ struct pstruct pst,
+ struct hist_str hist,
+ int nstats,
+ struct db_str ntt,
+ char *info_gstring2)
+{
+ int i,j,hl,hll, el, ell, ev;
+ char hline[80], pch, *bp;
+ int mh1, mht;
+ int maxval, maxvalt, dotsiz, ddotsiz,doinset;
+ double cur_e, prev_e, f_int;
+ double max_dev, x_tmp;
+ double db_tt;
+ /* these three are read in the summary section below even when the
+    histogram loop is skipped (m_msg.nohist set, or hist.maxh <= 1) or
+    when no bin qualifies for the K-S test, so they must start defined */
+ int n_chi_sq = 0, cum_hl = 0, max_i = 0;
+
+
+ fprintf(fd,"\n");
+
+ /* too few scores, or statistics disabled: totals only */
+ if (pst.zsflag < 0 || nstats <= 10) {
+ fprintf(fd, "%7ld residues in %5ld sequences\n", ntt.length,ntt.entries);
+ fprintf(fd,"\n%s\n",info_gstring2);
+ return;
+ }
+
+ max_dev = 0.0;
+ mh1 = hist.maxh-1;
+ mht = (3*hist.maxh-3)/4 - 1; /* first bin shown in the magnified inset */
+
+ if (!m_msg.nohist && mh1 > 0) {
+ /* largest bin overall (maxval) and within the inset range (maxvalt) */
+ for (i=0,maxval=0,maxvalt=0; i<hist.maxh; i++) {
+ if (hist.hist_a[i] > maxval) maxval = hist.hist_a[i];
+ if (i >= mht && hist.hist_a[i]>maxvalt) maxvalt = hist.hist_a[i];
+ }
+ n_chi_sq = 0;
+ cum_hl = -hist.hist_a[0];
+ /* sequences represented by one '=' in the main plot / in the inset */
+ dotsiz = (maxval-1)/60+1;
+ ddotsiz = (maxvalt-1)/50+1;
+ doinset = (ddotsiz < dotsiz && dotsiz > 2);
+
+ if (pst.zsflag>=0)
+ fprintf(fd," opt E()\n");
+ else
+ fprintf(fd," opt\n");
+
+ prev_e = zs_to_Ec((double)(hist.min_hist-hist.histint/2),hist.entries);
+ for (i=0; i<=mh1; i++) {
+ pch = (i==mh1) ? '>' : ' ';
+ pch = (i==0) ? '<' : pch;
+ hll = hl = hist.hist_a[i];
+ if (pst.zsflag>=0) {
+ cum_hl += hl;
+ f_int = (double)(i*hist.histint+hist.min_hist)+(double)hist.histint/2.0;
+ cur_e = (double)zs_to_Ec(f_int,hist.entries);
+ ev = el = ell = (int)(cur_e - prev_e + 0.5);
+ /* track the largest |observed - expected| cumulative count */
+ if (hl > 0 && i > 5 && i < (90-hist.min_hist)/hist.histint) {
+ x_tmp = fabs(cum_hl - cur_e);
+ if ( x_tmp > max_dev) {
+ max_dev = x_tmp;
+ max_i = i;
+ }
+ n_chi_sq++;
+ }
+ if ((el=(el+dotsiz-1)/dotsiz) > 60) el = 60;
+ if ((ell=(ell+ddotsiz-1)/ddotsiz) > 40) ell = 40;
+ fprintf(fd,"%c%3d %5d %5d:",
+ pch,(i<mh1)?(i)*hist.histint+hist.min_hist :
+ mh1*hist.histint+hist.min_hist,hl,ev);
+ }
+ else fprintf(fd,"%c%3d %5d :",
+ pch,(i<mh1)?(i)*hist.histint+hist.min_hist :
+ mh1*hist.histint+hist.min_hist,hl);
+
+ /* scale the observed count to at most 60 (main) / 40 (inset) columns */
+ if ((hl=(hl+dotsiz-1)/dotsiz) > 60) hl = 60;
+ if ((hll=(hll+ddotsiz-1)/ddotsiz) > 40) hll = 40;
+ for (j=0; j<hl; j++) hline[j]='=';
+ if (pst.zsflag>=0) {
+ /* '*' marks the expected count on the same scale */
+ if (el <= hl ) {
+ if (el > 0) hline[el-1]='*';
+ hline[hl]='\0';
+ }
+ else {
+ for (j = hl; j < el; j++) hline[j]=' ';
+ hline[el-1]='*';
+ hline[hl=el]='\0';
+ }
+ }
+ else hline[hl] = 0;
+ /* legend for the main plot goes on the second row */
+ if (i==1) {
+ for (j=hl; j<10; j++) hline[j]=' ';
+ sprintf(&hline[10]," one = represents %d library sequences",dotsiz);
+ }
+ /* legend for the inset goes two rows above its first bin */
+ if (doinset && i == mht-2) {
+ for (j = hl; j < 10; j++) hline[j]=' ';
+ sprintf(&hline[10]," inset = represents %d library sequences",ddotsiz);
+ }
+ /* inset: redraw the bin after a ':' in column 10 using ddotsiz */
+ if (i >= mht&& doinset ) {
+ for (j = hl; j < 10; j++) hline[j]=' ';
+ hline[10]=':';
+ for (j = 11; j<11+hll; j++) hline[j]='=';
+ hline[11+hll]='\0';
+ if (pst.zsflag>=0) {
+ if (ell <= hll) hline[10+ell]='*';
+ else {
+ for (j = 11+hll; j < 10+ell; j++) hline[j]=' ';
+ hline[10+ell] = '*';
+ hline[11+ell] = '\0';
+ }
+ }
+ }
+
+ fprintf(fd,"%s\n",hline);
+ prev_e = cur_e;
+ }
+ }
+
+ /* library totals; carry counts overflows of the long residue total */
+ if (ntt.carry==0) {
+ fprintf(fd, "%7ld residues in %5ld sequences\n", ntt.length, ntt.entries);
+ }
+ else {
+ db_tt = (double)ntt.carry*(double)LONG_MAX + (double)ntt.length;
+ fprintf(fd, "%.0f residues in %5ld library sequences\n", db_tt, ntt.entries);
+ }
+
+ if (pst.zsflag>=0) {
+ if (MAX_STATS < hist.entries)
+ fprintf(fd," statistics extrapolated from %d to %ld sequences\n",
+ MAX_STATS,hist.entries);
+ /* summ_stats(stat_info); */
+ fprintf(fd," %s\n",hist.stat_info);
+ if (!m_msg.nohist && cum_hl > 0)
+ fprintf(fd," Kolmogorov-Smirnov statistic: %6.4f (N=%d) at %3d\n",
+ max_dev/(double)cum_hl, n_chi_sq,max_i*hist.histint+hist.min_hist);
+ if (m_msg.markx & MX_M10FORM) {
+ /* stat_info is flattened to one line for the "; mp_stats:" record */
+ while ((bp=strchr(hist.stat_info,'\n'))!=NULL) *bp=' ';
+ if (cum_hl <= 0) cum_hl = -1;
+ sprintf(info_hstring1,"; mp_extrap: %d %ld\n; mp_stats: %s\n; mp_KS: %6.4f (N=%d) at %3d\n",
+ MAX_STATS,hist.entries,hist.stat_info,max_dev/(double)cum_hl, n_chi_sq,max_i*hist.histint+hist.min_hist);
+ }
+ }
+ fprintf(fd,"\n%s\n",info_gstring2);
+ fflush(fd);
+}
+
+/* showbest() -- list the best-scoring library sequences on outfd, and
+   also on stdout when outfd is a different stream.
+
+   ntt: library totals; only ntt.entries is used, in the E() heading.
+
+   Reads the globals bptr[], nbest, zsflag, e_cut, outtty, outfd and
+   llen; sets nshow, mshow and bbp.  With outtty set the user is asked
+   how many scores to show and may ask for more; otherwise the list
+   stops at the first hit whose escore exceeds e_cut and is extended by
+   10 at a time while the last hit shown is still below e_cut.
+*/
+showbest(struct db_str ntt)
+ {
+ int ib, istart, istop;
+ char bline[200], fmt[40], pad[200];
+ char rline[20];
+ int ntmp;
+
+ sprintf(fmt,"%%-%ds (%%3d)",llen-10);
+
+ nshow = min(20,nbest);
+ mshow = min(20,nbest);
+
+ if (outtty) {
+ printf(" How many scores would you like to see? [%d] ",nshow);
+ fflush(stdout);
+ if (fgets(rline,sizeof(rline),stdin)==NULL) exit(0);
+ if (rline[0]!='\n' && rline[0]!=0) sscanf(rline,"%d",&nshow);
+ if (nshow<=0) nshow = min(20,nbest);
+ }
+ else nshow=mshow;
+
+ /* blank padding that right-aligns the column headings */
+ memset(pad,' ',llen-10);
+ pad[llen-31]='\0';
+ if (zsflag)
+ fprintf(outfd,"The best scores are:%s s-w Z-score E(%ld)\n",pad,ntt.entries);
+ else
+ fprintf(outfd,"The best scores are:%s s-w\n",pad);
+
+ if (outfd != stdout) {
+ if (zsflag)
+ fprintf(stdout,"The best scores are:%s s-w Z-score E(%ld)\n",pad,ntt.entries);
+ else
+ fprintf(stdout,"The best scores are:%s s-w\n",pad);
+ }
+
+ istart = 0;
+ l1: istop = min(nbest,nshow);
+ for (ib=istart; ib<istop; ib++) {
+ bbp = bptr[ib];
+
+ if (!outtty && zsflag && bbp->escore > e_cut) {
+ nshow = ib;
+ goto done;
+ }
+
+ /* label is "<libstr> <lib>", cut to 13 characters */
+ sprintf(bline,"%-12s %d",bbp->libstr,bbp->lib);
+ bline[13]='\0';
+
+ fprintf(outfd,fmt,bline,bbp->n1);
+
+ if (zsflag)
+ fprintf(outfd,"%4d %4.1f %6.2g\n",
+ bbp->score,bbp->zscore,
+ bbp->escore);
+ else
+ fprintf(outfd,"%4d\n",bbp->score);
+
+ if (outfd!=stdout) {
+ fprintf(stdout,fmt,bline,bbp->n1);
+ if (zsflag)
+ printf("%4d %4.1f %6.2g\n",
+ bbp->score,bbp->zscore,
+ bbp->escore);
+ else
+ printf("%4d\n",bbp->score);
+ }
+ }
+
+ fflush(outfd); if (outfd!=stdout) fflush(stdout);
+
+ if (outtty) {
+ printf(" More scores? [0] ");
+ fflush(stdout);
+ if (fgets(rline,sizeof(rline),stdin)==NULL) exit(0);
+ ntmp = 0;
+ if (rline[0]!='\n' && rline[0]!=0) sscanf(rline,"%d",&ntmp);
+ if (ntmp<=0) ntmp = 0;
+ if (ntmp>0) {
+ istart = istop;
+ nshow += ntmp;
+ mshow += ntmp;
+ goto l1;
+ }
+ }
+ /* Extend the list only while unseen hits remain.  Without the
+    istop < nbest test, once every stored hit had been printed and the
+    last one was still below e_cut the loop above no longer ran, bbp
+    never changed, and this branch jumped back to l1 forever; with
+    nbest == 0 it also dereferenced a bbp this function never set. */
+ else if (zsflag && istop < nbest && bbp->escore < e_cut) {
+ istart=istop;
+ nshow += 10;
+ goto l1;
+ }
+
+ done:
+ if (outfd!=stdout) fprintf(outfd,"\n");
+}
+
+selectz(k,n) /* k is rank in array; n is the number of bptr[] entries considered.
+ Partially reorders bptr[0..n-1] in place, larger zscore towards the
+ front, repeating the partition step on the side that contains index k
+ until the range is empty. Nothing is returned. */
+ int k,n;
+{
+ int t, i, j, l, r; /* l,r: bounds of the range still being partitioned; t is never used */
+ double v; /* pivot z-score */
+ struct beststr *tmptr;
+
+ l=0; r=n-1;
+
+ while ( r > l ) {
+ i = l-1;
+ j = r;
+ v = bptr[r]->zscore; /* pivot: right-most entry of the current range */
+ do {
+ while (bptr[++i]->zscore > v ) ; /* stops at the pivot at the latest */
+ while (bptr[--j]->zscore < v ) ; /* no explicit lower bound on j -- NOTE(review): appears to rely on a sentinel entry below index 0; confirm against the caller's setup */
+ tmptr = bptr[i]; bptr[i]=bptr[j]; bptr[j]=tmptr;
+ } while (j > i);
+ bptr[j]=bptr[i]; bptr[i]=bptr[r]; bptr[r]=tmptr; /* undo the last (crossed) swap and move the pivot into slot i */
+ if (i>=k) r = i-1; /* continue in the left part */
+ if (i<=k) l = i+1; /* continue in the right part; when i==k both tests fire and the loop ends */
+ }
+}
+
+sortbest() /* order bptr[0..nbest-1] with the raw-score comparator cmps() */
+{
+  int cmps(); /* defined below; larger ->score sorts first */
+
+  ksort(bptr,nbest,cmps);
+}
+
+sortbeste() /* order bptr[0..nbest-1] with the E()-value comparator cmpe() */
+{
+  int cmpe(); /* defined below; smaller ->escore sorts first */
+
+  ksort(bptr,nbest,cmpe);
+}
+
+sortbestz() /* order bptr[0..nbest-1] with the z-score comparator cmpz() */
+{
+  int cmpz(); /* defined below; larger ->zscore sorts first */
+
+  ksort(bptr,nbest,cmpz);
+}
+
+cmps(ptr1,ptr2) /* ksort() comparator on ->score, larger score first */
+  struct beststr *ptr1, *ptr2;
+{
+  /* 1 when ptr1 has the smaller score, -1 when the larger, 0 on a tie */
+  return (ptr1->score < ptr2->score) - (ptr1->score > ptr2->score);
+}
+
+cmpe(ptr1,ptr2) /* ksort() comparator on ->escore, smaller E() first */
+  struct beststr *ptr1, *ptr2;
+{
+  /* -1 when ptr1 has the smaller escore, 1 when the larger, 0 on a tie */
+  return (ptr1->escore > ptr2->escore) - (ptr1->escore < ptr2->escore);
+}
+
+cmpz(ptr1,ptr2) /* ksort() comparator on ->zscore, larger z-score first */
+  struct beststr *ptr1, *ptr2;
+{
+  /* 1 when ptr1 has the smaller zscore, -1 when the larger, 0 on a tie */
+  return (ptr1->zscore < ptr2->zscore) - (ptr1->zscore > ptr2->zscore);
+}
+
+ksort(v,n,comp) /* Shell sort of the pointer array v[0..n-1]; comp(a,b) > 0 means a belongs after b */
+  char *v[]; int n, (*comp)();
+{
+  int step, hi, lo;
+  char *held;
+
+  step = n/2;
+  while (step > 0) {
+    for (hi = step; hi < n; hi++) {
+      /* sink v[hi] down its step-spaced chain while it is out of order */
+      lo = hi - step;
+      while (lo >= 0 && (*comp)(v[lo],v[lo+step]) > 0) {
+        held = v[lo];
+        v[lo] = v[lo+step];
+        v[lo+step] = held;
+        lo -= step;
+      }
+    }
+    step /= 2;
+  }
+}
+
+/*
+do_bout(FILE *bout,struct stat_str **bptr, int nbest)
+{
+ int i, min_hist, max_hist;
+ double mu, var;
+
+ if (bout==NULL) return;
+
+ inithist();
+ for (i = 0; i<nbest; i++)
+ addhist(bptr[i]->score,bptr[i]->n1);
+
+ for (i=0; i<MAX_LLEN; i++)
+ if (llen_hist[i]>0) {
+ min_hist=i;
+ break;
+ }
+
+ for (i=MAX_LLEN-1; i>=0; i--)
+ if (llen_hist[i]>0) {
+ max_hist=i;
+ break;
+ }
+
+ for (i=min_hist; i<=max_hist; i++) {
+ mu=(double)score_sums[i]/(double)llen_hist[i];
+ if (llen_hist[i]>1) {
+ var = ((double)score2_sums[i]-(double)llen_hist[i]*mu*mu)/
+ (double)(llen_hist[i]-1);
+
+ fprintf(bout,"%d\t%d\t%.1f\t%.1f\t%.1f\t%.4f\t%.4f\n",
+ i,llen_hist[i],exp(((double)(i))/LN_FACT),
+ score_sums[i],score2_sums[i],mu,var);
+ }
+ }
+ free_hist();
+ fclose(bout);
+}
+*/
+
+s_abort()
+{
+ exit(1);
+}
diff --git a/src/rstruct.h b/src/rstruct.h
new file mode 100644
index 0000000..94a9814
--- /dev/null
+++ b/src/rstruct.h
@@ -0,0 +1,16 @@
+/* $Id: rstruct.h 625 2011-03-23 17:21:38Z wrp $ */
+
+#ifndef RSTRUCT
+#define RSTRUCT
+struct rstruct /* result record -- NOTE(review): nothing in this header shows how the fields are filled; the notes below are read off the names and need confirming against the code that uses them */
+{
+ int score[3]; /* three integer scores; which score goes in which slot is not shown here */
+ int valid_stat; /* presumably a flag saying the scores may be used for statistics -- TODO confirm */
+ int alg_info; /* algorithm-specific value; meaning not shown here */
+ double comp; /* presumably a composition value, as in the comp arguments of find_z() -- TODO confirm */
+ double H; /* presumably an entropy value -- TODO confirm */
+ double escore; /* presumably an expectation value -- TODO confirm */
+ int segnum; /* presumably a segment number -- TODO confirm */
+ int seglen; /* presumably a segment length -- TODO confirm */
+};
+#endif
diff --git a/src/sc_to_e.c b/src/sc_to_e.c
new file mode 100644
index 0000000..5df14e8
--- /dev/null
+++ b/src/sc_to_e.c
@@ -0,0 +1,71 @@
+/* $Id: sc_to_e.c 625 2011-03-23 17:21:38Z wrp $ */
+
+/* copyright (c) 1996, 1997, 1998, 1999, 2014 by William R. Pearson and
+ The Rector & Visitors of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+/* sc_to_e uses statistical parameters from search and
+ score, length, and database size to calculate E()
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+double mean_var, mu, rho;
+
+/* main() -- interactive score -> E() calculator.
+
+   The regression parameters rho, mu and mean_var come from the three
+   command-line arguments, or from one line of stdin when the argument
+   count is not 4.  Each following input line "score length [db_size]"
+   is converted to a z-value with s_to_zv() and printed with its
+   expectation from zv_to_E().  An empty line or end of input ends the
+   program.
+*/
+int
+main(argc, argv)
+  int argc; char **argv;
+{
+  char line[128];
+  int score, length, n_read;
+  /* starts at 0 so that a first line without db_size gets the 50000
+     default instead of an indeterminate value; later lines keep the
+     last db_size entered, as before */
+  int db_size = 0;
+  double z_val, s_to_zv(), zv_to_E();
+
+  if (argc == 4) {
+    if (sscanf(argv[1],"%lf",&rho) != 1 ||
+        sscanf(argv[2],"%lf",&mu) != 1 ||
+        sscanf(argv[3],"%lf",&mean_var) != 1) {
+      fprintf(stderr," cannot parse rho mu mean_var\n");
+      exit(1);
+    }
+  }
+  else {
+    fprintf(stderr," enter rho mu mean_var: ");
+    /* line[] is indeterminate if fgets() fails, so stop here on EOF */
+    if (fgets(line,sizeof(line),stdin)==NULL) exit(0);
+    if (sscanf(line,"%lf %lf %lf",&rho, &mu, &mean_var) != 3) {
+      fprintf(stderr," cannot parse rho mu mean_var\n");
+      exit(1);
+    }
+  }
+
+  while (1) {
+    fprintf(stderr," enter score length db_size: ");
+    if (fgets(line,sizeof(line),stdin)==NULL) exit(0);
+    if (line[0]=='\n') exit(0);
+    n_read = sscanf(line,"%d %d %d",&score, &length, &db_size);
+    /* score and length are required; never compute from unset values */
+    if (n_read < 2) {
+      fprintf(stderr," expected: score length [db_size]\n");
+      continue;
+    }
+    if (db_size < 1) db_size = 50000;
+
+    z_val = s_to_zv(score, length);
+
+    printf(" s: %d (%d) E(%d): %4.2g\n",score,length,db_size,zv_to_E(z_val,db_size));
+  }
+}
+
+/* s_to_zv() -- turn a raw score into a z-value using the global
+   regression parameters: subtract the length-dependent expectation
+   rho*ln(length) + mu and divide by sqrt(mean_var). */
+double s_to_zv(int score, int length)
+{
+  double len_term, excess;
+
+  len_term = rho * log((double)length);
+  excess = (double)score - len_term - mu;
+  return excess/sqrt(mean_var);
+}
+
+/* zv_to_E() -- convert a z-value to the expected number of library
+   sequences, out of db_size, scoring at least as high.
+
+   zv       z-value from s_to_zv()
+   db_size  number of sequences in the library
+
+   The two constants are pi/sqrt(6) and Euler's constant, written out
+   in full.  The previous literal 1.282554983 had a stray digit
+   (pi/sqrt(6) is 1.28254983...) and .577216 was truncated.
+*/
+double zv_to_E(double zv, int db_size)
+{
+  const double pi_sqrt6 = 1.28254983016186409554;
+  const double euler_g = 0.57721566490153286060;
+  double e;
+
+  e = exp(-pi_sqrt6 * zv - euler_g);
+  /* for small e, 1 - exp(-e) is e to first order, so e itself is used */
+  return (double)db_size * (e > .01 ? 1.0 - exp(-e) : e);
+}
diff --git a/src/scaleswn.c b/src/scaleswn.c
new file mode 100644
index 0000000..cae7962
--- /dev/null
+++ b/src/scaleswn.c
@@ -0,0 +1,3136 @@
+/* scaleswn.c */
+
+/* $Id: scaleswn.c 1245 2013-12-18 18:19:38Z wrp $ */
+
+/* copyright (c) 1995, 1996, 2000, 2014 by William R. Pearson and The
+ Rector & Visitors of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+/* as of 24 Sept, 2000 - scaleswn uses no global variables */
+
+/* Provide statistical estimates using an extreme value distribution
+
+ This code provides multiple methods for scaling sequence
+ similarity scores to correct for length effects.
+
+ Currently, six methods are available:
+
+ ppst->zsflag = 0 - no scaling (AVE_STATS)
+ ppst->zsflag = 1 - regression-scaled scores (REG_STATS)
+ ppst->zsflag = 2 - (revised) MLE Lambda/K scaled scores (MLE_STATS)
+ ppst->zsflag = 3 - scaling using Altschul's parameters (AG_STATS)
+ ppst->zsflag = 4 - regression-scaled with iterative outlier removal (REGI_STATS)
+ ppst->zsflag = 5 = like 1, but length scaled variance (REG2_STATS)
+ ppst->zsflag = 6 = like 2, but uses lambda composition/scale (MLE2_STATS)
+ ppst->zsflag = 11 = 10 + 1 - use random shuffles, method 1
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+
+#include <limits.h>
+
+#include "defs.h"
+#include "param.h"
+#include "structs.h"
+#include "best_stats.h"
+
+#define MAXHIST 50
+#define MAX_LLEN 200
+#define LHISTC 5
+#define VHISTC 5
+#define MAX_SSCORE 300
+
+#define LENGTH_CUTOFF 10 /* minimum database sequence length allowed, for fitting */
+
+#define LN_FACT 10.0
+#ifndef M_LN2
+#define M_LN2 0.69314718055994530942
+#endif
+#define EULER_G 0.57721566490153286060
+#define PI_SQRT6 1.28254983016186409554
+
+#ifndef M_SQRT2
+#define M_SQRT2 1.41421356237
+#endif
+#define LN200 5.2983173666
+#define ZS_MAX 400.0 /* used to prevent underflow on some machines */
+#define TOLERANCE 1.0e-12
+#define TINY 1.0e-6
+
+/* used by AVE_STATS, REG_STATS, REGI_STATS, REG2_STATS*/
+struct rstat_str { /* regression-fit parameters; held in pstat_str.r_u.rg */
+ double rho, rho_e, mu, mu_e, mean_var, var_e; /* rho: slope, mu: intercept; rho_e, mu_e, var_e are printed as sqrt() after "+/-" in the fit summary; var_e:std. error of var */
+ double mean_var_sqrt; /* sqrt(mean_var) - used frequently */
+/* used by REG2_STATS */
+ double rho2, mu2, var_cutoff; /* rho2, mu2: the same fit for the variance */
+ int n_trimmed; /* scores pruned by the z-trim (z < 20.0 or z > zstrim in proc_hist_r()/proc_hist_ri()) */
+ int n1_trimmed, nb_trimmed, nb_tot; /* excluded because of bin; reported as "B-trim: n1_trimmed in nb_trimmed/nb_tot" */
+};
+
+/* used by AG_STATS, MLE_STATS */
+struct ag_stat_str { /* held in pstat_str.r_u.ag */
+ double K, Lambda, H, a_n0f, a_n0; /* K, Lambda, H: the parameters looked up by look_p(); a_n0f, a_n0: presumably query-length terms -- TODO confirm where they are set */
+};
+
+/* used by MLE2_STATS */
+struct mle2_stat_str { /* held in pstat_str.r_u.m2 */
+ double a_n0; /* presumably a query-length term, as in ag_stat_str -- TODO confirm */
+ double mle2_a0, mle2_a1, mle2_a2, mle2_b1; /* fitted coefficients; their roles are set by proc_hist_ml2(), which is not shown here */
+ double ave_comp, max_comp, ave_H; /* presumably average/maximum composition and average H over the sampled scores -- TODO confirm */
+};
+
+struct pstat_str { /* statistics state filled in by process_hist() and handed to the find_z*() functions */
+ int zsflag; /* method actually used: the value returned by the proc_hist_*() routine that process_hist() called */
+ double ngLambda, ngK, ngH; /* copied from m_msp->Lambda, ->K, ->H by process_hist() */
+ double ave_n1;
+ double sample_fract; /* min(1.0, s_cnt[score_ix]/tot_scores), or 1.0 when tot_scores <= 10 */
+ double zs_off; /* z-score offset from sampling: -log(sample_fract)/PI_SQRT6 when 0 < sample_fract < 1, else 0.0 */
+ union { /* one member per family of methods, see the "used by" notes on each struct */
+ struct rstat_str rg;
+ struct ag_stat_str ag;
+ struct mle2_stat_str m2;
+ } r_u;
+};
+
+double find_z(int score, double escore, int len, double comp, struct pstat_str *);
+
+#define AVE_STATS 0 /* no length effect, only mean/variance */
+double find_zn(int score, double escore, int len, double comp, struct pstat_str *);
+
+static void st_sort (struct stat_str *v, int n);
+
+static int
+calc_thresh(struct pstruct *ppst, struct stat_str *sptr, int nstats,
+ struct score_count_s s_info,
+ double Lambda, double K, double H, double *zstrim);
+
+int proc_hist_n(struct stat_str *sptr, int nstats, struct score_count_s s_info,
+ struct pstruct *ppst, struct hist_str *histp, int do_trim,
+ struct pstat_str *);
+
+#define REG_STATS 1 /* length-regression scaled */
+#define REGI_STATS 4 /* length regression, iterative */
+double find_zr(int score, double escore, int len, double comp, struct pstat_str *);
+int proc_hist_r(struct stat_str *sptr, int nstats, struct score_count_s s_info,
+ struct pstruct *ppst, struct hist_str *histp,
+ int do_trim, struct pstat_str *pu);
+
+#define MLE_STATS 2 /* MLE for lambda, K */
+double find_ze(int score, double escore, int len, double comp, struct pstat_str *);
+int proc_hist_ml(struct stat_str *sptr, int nstats, struct score_count_s s_info,
+ struct pstruct *ppst, struct hist_str *histp, int do_trim,
+ struct pstat_str *);
+
+#define AG_STATS 3 /* Altschul-Gish parameters */
+double find_za(int score, double escore, int len, double comp, struct pstat_str *);
+int proc_hist_a(struct stat_str *sptr, int nstats, struct score_count_s s_info,
+ struct pstruct *ppst, struct hist_str *histp, int do_trim,
+ struct pstat_str *);
+
+#ifdef NORMAL_DIST
+int proc_hist_anorm(struct stat_str *sptr, int nstats, struct score_count_s s_info,
+ struct pstruct *ppst, struct hist_str *histp, int do_trim,
+ struct pstat_str *);
+#endif
+
+#define REG2_STATS 5 /* length regression on mean + variance */
+double find_zr2(int score, double escore, int len, double comp, struct pstat_str *);
+int proc_hist_ri(struct stat_str *sptr, int nstats, struct score_count_s s_info,
+ struct pstruct *ppst, struct hist_str *histp, int do_trim,
+ struct pstat_str *);
+
+#define MLE2_STATS 6 /* MLE stats using comp(lambda) */
+double find_ze2(int score, double escore, int length, double comp, struct pstat_str *);
+int proc_hist_ml2(struct stat_str *sptr, int nstats, struct score_count_s s_info,
+ struct pstruct *ppst, struct hist_str *histp, int do_trim,
+ struct pstat_str *);
+
+#ifdef USE_LNSTATS
+#define LN_STATS 2
+double find_zl(int score, double escore, int len, double comp, struct pstat_str *);
+int proc_hist_ln(struct stat_str *sptr, int nstats, struct score_count_s s_info,
+ struct pstruct *ppst, struct hist_str *histp, int do_trim,
+ struct pstat_str *);
+#endif
+
+double (*find_z_arr[7])(int score, double escore, int len, double comp, struct pstat_str *) = { /* z-score function for each zsflag value 0..6; process_hist() re-assigns slot 2 at run time */
+ find_zn, /* AVE_STATS zsflag ==0 */
+ find_zr, /* REG_STATS zsflag ==1 */
+ find_ze, /* MLE_STATS, zsflag==2 */
+#ifndef NORMAL_DIST
+ find_za, /* AG_STATS, zsflag==3 */
+#else
+ find_zn, /* AG_STATS slot, zsflag==3: find_zn is used under NORMAL_DIST */
+#endif
+ find_zr, /* REGI_STATS, zsflag==4 */
+ find_zr2, /* REG2_STATS, zsflag==5 */
+ find_ze2, /* MLE2_STATS, zsflag==6 */
+};
+
+/* print out all pstat_str info for independent calculation */
+void
+pstat_info(char *info_str, int info_str_n, char *comment, struct pstat_str *pu);
+
+/* scaleswn.c local variables that belong in their own structure */
+
+struct llen_str { /* working state for the length-binned fit; passed to inithist(), addhist(), prune_hist(), fit_llen*() and free_hist() */
+ int min, max;
+ int max_score, min_score; /* proc_hist_r() gives up on the regression when max_score - min_score < 10 */
+ int *hist; /* callers set this to NULL before calling inithist() */
+ double *score_sums, *score2_sums;
+ double *score_var;
+ int max_length, min_length, zero_s; /* zero_s is reported as "0's" in the fit summary */
+ int fit_flag; /* set to 1 before fitting; found 0 afterwards when the fit failed */
+};
+
+/* llen bins */
+static void inithist(struct llen_str *, struct pstruct *, int);
+static void free_hist( struct llen_str *);
+static void addhist(struct llen_str *, int, int, int);
+static void prune_hist(struct llen_str *, int, int, int, long *);
+/* final display histogram */
+void inithistz(int, struct hist_str *histp, double zs_off);
+void addhistz(double zs, struct hist_str *histp);
+void addhistzp(double zs, struct hist_str *histp);
+
+/* calculate rho (slope), mu (intercept) REG_STATS,
+ mean_var, rho2, mu2 for variance (REG2_STATS),
+ minimal iterative exclusion
+*/
+static void fit_llen(struct llen_str *, struct rstat_str *);
+/* calculate rho (slope), mu (intercept) REG_STATS,
+ mean_var, rho2, mu2 for variance (REG2_STATS),
+
+*/
+static void fit_llen2(struct llen_str *, struct rstat_str *);
+
+/* calculate rho (slope), mu (intercept) REG_STATS,
+ mean_var, rho2, mu2 for variance (REG2_STATS) */
+static void fit_llens(struct llen_str *, struct rstat_str *);
+
+extern void sortbeste(struct beststr **bptr, int nbest);
+extern void sortbestz(struct beststr **bptr, int nbest);
+
+/* void set_db_size(int, struct db_str *, struct hist_str *); */
+
+#ifdef DEBUG
+FILE *tmpf;
+#endif
+
+/* process_hist() -- estimate the statistical parameters for a search
+   and, when requested, build the display histogram.
+
+   sptr, nstats  the sampled scores and how many there are
+   m_msp         supplies Lambda, K, H, n0, db.entries and s_info
+   ppst          zsflag/zsflag2 select the method; zs_off and, unless
+                 zdb_size_set, zdb_size are written back
+   histp         histogram and stat_info text to fill in
+   ps_sp         in/out: *ps_sp is allocated when NULL, otherwise
+                 cleared and reused; set to NULL when ppst->zsflag < 0
+   s_info        score counts used for the sampling fraction
+   do_hist       when 0 (and histp != NULL) the histogram is rebuilt
+                 here from the nstats sampled scores
+
+   Returns ppst->zsflag unchanged when it is negative, otherwise the
+   value returned by the proc_hist_*() routine that was run, which is
+   also stored in (*ps_sp)->zsflag.
+*/
+int
+process_hist(struct stat_str *sptr, int nstats,
+ const struct mngmsg *m_msp,
+ struct pstruct *ppst,
+ struct hist_str *histp,
+ struct pstat_str **ps_sp,
+ struct score_count_s *s_info,
+ int do_hist)
+{
+ int zsflag, r_zsflag, do_trim, i;
+ struct pstat_str *ps_s;
+
+ if (ppst->zsflag < 0) { /* no statistics */
+ *ps_sp = NULL;
+ return ppst->zsflag;
+ }
+
+ /* need a pstat_str if its NULL */
+ if (*ps_sp == NULL) {
+ if ((ps_s=(struct pstat_str *)calloc(1,sizeof(struct pstat_str)))==NULL) {
+ /* sizeof yields size_t; cast so the argument matches %ld */
+ fprintf(stderr," cannot allocate pstat_union: %ld\n",(long)sizeof(struct pstat_str));
+ exit(1);
+ }
+ else {
+ *ps_sp = ps_s;
+ }
+ }
+ else {
+ ps_s = *ps_sp;
+ memset(ps_s,0,sizeof(struct pstat_str));
+ }
+
+ /* fraction of all scores that were sampled, and the matching z-score shift */
+ if (s_info->tot_scores > 10) {
+ ps_s->sample_fract = min(1.0, (double)s_info->s_cnt[ppst->score_ix]/(double)s_info->tot_scores);
+ if (ps_s->sample_fract > 0.0 && ps_s -> sample_fract < 1.0) {
+ ps_s->zs_off = -log(ps_s->sample_fract)/PI_SQRT6;
+ }
+ else {
+ ps_s->zs_off = 0.0;
+ }
+ }
+ else {
+ ps_s->sample_fract = 1.0;
+ ps_s->zs_off = 0.0;
+ }
+ ppst->zs_off = ps_s->zs_off;
+
+ ps_s->ngLambda = m_msp->Lambda;
+ ps_s->ngK = m_msp->K;
+ ps_s->ngH = m_msp->H;
+
+ zsflag = ppst->zsflag;
+ /* too few scores to fit anything: use the tabulated parameters */
+ if (nstats < 10) zsflag = AG_STATS;
+
+ /* very short queries: the regression methods are replaced by MLE */
+ if (m_msp->n0 <= LENGTH_CUTOFF && (zsflag == REG_STATS || zsflag == REGI_STATS || zsflag == REG2_STATS)) {
+ zsflag = MLE_STATS;
+ }
+
+/*
+#ifdef DEBUG
+ if (ppst->debug_lib) {
+ tmpf=fopen("tmp_stats.res","w+");
+ for (i=0; i<nstats; i++) fprintf(tmpf,"%d\t%d\n",sptr[i].score,sptr[i].n1);
+ fclose(tmpf);
+ }
+#endif
+*/
+
+#ifdef DEBUG
+ st_sort(sptr, nstats);
+#endif
+
+ /* zsflag >= 10 selects shuffle-based statistics; outliers are not trimmed then */
+ if (zsflag >= 20) { /* working with shuffled top sequences */
+ if (ppst->zsflag2 == AG_STATS) ppst->zsflag2 = MLE_STATS;
+ zsflag = ppst->zsflag2;
+ do_trim = 0;
+ }
+ else if (zsflag >= 10) {
+ zsflag -= 10;
+ do_trim = 0;
+ }
+ else do_trim = 1;
+
+ if (!ppst->zdb_size_set) ppst->zdb_size = m_msp->db.entries;
+
+#ifdef USE_LNSTATS
+ if (zsflag==LN_STATS) { /* -z 2 */
+ find_z_arr[LN_STATS] = &find_zl;
+ /* ppst added: the proc_hist_ln() prototype takes it like every other proc_hist_*() */
+ ps_s->zsflag=r_zsflag = proc_hist_ln(sptr, nstats, m_msp->s_info, ppst, histp, do_trim, ps_s);
+ }
+#else /* -z 2 */
+ if (zsflag==MLE_STATS) {
+ find_z_arr[MLE_STATS] = &find_ze;
+ ps_s->zsflag=r_zsflag = proc_hist_ml(sptr, nstats, m_msp->s_info, ppst, histp, do_trim, ps_s);
+ }
+#endif
+ else if (zsflag==REG_STATS) { /* -z 1 */
+ ps_s->zsflag=r_zsflag = proc_hist_r(sptr, nstats, m_msp->s_info, ppst, histp, do_trim, ps_s);
+ }
+ else if (zsflag==AG_STATS) { /* -z 3 */
+#ifndef NORMAL_DIST
+ ps_s->zsflag=r_zsflag = proc_hist_a(sptr, nstats, m_msp->s_info, ppst, histp, do_trim, ps_s);
+#else
+ ps_s->zsflag=r_zsflag = proc_hist_anorm(sptr, nstats, m_msp->s_info, ppst, histp, do_trim, ps_s);
+#endif
+
+ }
+ else if (zsflag==REGI_STATS) { /* -z 4, iterated outlier removal */
+ ps_s->zsflag=r_zsflag = proc_hist_ri(sptr,nstats, m_msp->s_info, ppst, histp, do_trim, ps_s);
+ }
+ else if (zsflag==REG2_STATS) { /* -z 5, length-scaled variance */
+ /* same routine as -z 1; it checks ppst->zsflag for REG2_STATS itself */
+ ps_s->zsflag=r_zsflag = proc_hist_r(sptr,nstats, m_msp->s_info, ppst, histp, do_trim, ps_s);
+ }
+#if !defined(TFAST) && !defined(FASTX)
+ else if (zsflag == MLE2_STATS) { /* -z 6, MLE + composition */
+ ps_s->zsflag=r_zsflag = proc_hist_ml2(sptr, nstats, m_msp->s_info, ppst, histp, do_trim, ps_s);
+ }
+#endif
+ else { /* AVE_STATS */
+ ps_s->zsflag=r_zsflag = proc_hist_n(sptr,nstats, m_msp->s_info, ppst, histp, do_trim, ps_s);
+ }
+
+ if (!do_hist && histp != NULL) {
+ histp->entries = nstats; /* db->entries = 0; */
+ inithistz(MAXHIST, histp, ps_s->zs_off);
+ for (i = 0; i < nstats; i++) {
+ /* proc_hist_ri() marks pruned scores with a negated n1; restore it */
+ if (sptr[i].n1 < 0) sptr[i].n1 = -sptr[i].n1;
+ addhistz(find_z(sptr[i].score,sptr[i].escore,sptr[i].n1,sptr[i].comp,ps_s),
+ histp);
+ }
+ }
+ return r_zsflag;
+}
+
+/* calc_thresh() sets the threshold used to exclude high scores from
+ regression calculations
+
+ Returns the raw-score threshold (max_hscore) and stores the z-score
+ trimming limit in *zstrim; *zstrim is -1.0 when the limit computed
+ from the score counts is below 3.0, which the callers treat as "too
+ few scores to z-trim".
+
+ ppst: dnaseq, n0 and score_ix are read. sptr/nstats: only the n1
+ values are used, for the mean library length. s_info: tot_scores
+ and s_cnt[]. Lambda, K: parameters used for the score threshold.
+ H is accepted but not used. With NORMAL_DIST defined both results
+ are fixed constants.
+*/
+static int
+calc_thresh(struct pstruct *ppst, struct stat_str *sptr, int nstats,
+ struct score_count_s s_info,
+ double Lambda, double K, double H, double *zstrim)
+{
+ int max_hscore;
+ int i;
+ double ave_n1, tmp_score, z, l_fact;
+
+ ave_n1 = 0.0; /* mean n1 over the sampled scores */
+ for (i=0; i<nstats; i++) {
+ ave_n1 += (double)sptr[i].n1;
+ }
+ if (nstats >= 1) ave_n1 /= nstats;
+
+ if (ppst->dnaseq == SEQT_DNA || ppst->dnaseq == SEQT_RNA) {
+ l_fact = 1.0; /* lambda is used as given for DNA/RNA */
+ }
+ else {
+ l_fact = 0.7; /* scale lambda down otherwise, see the note below */
+ }
+
+/* max_hscore = MAX_SSCORE; */
+/* mean expected for ppst->n0 * 400 for protein, 5000 for DNA */
+/* we want a number of offsets that is appropriate for the database size so
+ far (nstats)
+*/
+
+/*
+ the calculation below sets a high-score threshold using an
+ ungapped lambda, but errs towards the high-score side by using
+ E()=0.01 and calculating with 0.70*lambda, which is the correct for
+ going from ungapped to -12/-2 gapped lambda with BLOSUM50
+*/
+
+#ifndef NORMAL_DIST
+ if (K < 1.0e-50) K = 0.02; /* prevent floating pt underflow with K=0 */
+ tmp_score = 0.01/((double)nstats*K*(double)ppst->n0*ave_n1); /* NOTE(review): nstats == 0 or ave_n1 == 0 divides by zero here */
+ tmp_score = -log(tmp_score)/(Lambda*l_fact);
+ max_hscore = (int)(tmp_score+0.5); /* round to nearest */
+
+ /* z = 1.0/(double)nstats; */
+ z = (double)s_info.tot_scores/((double)nstats*(double)s_info.s_cnt[ppst->score_ix]);
+ z = (log(z)+EULER_G)/(- PI_SQRT6);
+ if (z < 3.0) { *zstrim = -1.0; } /* signals "no z-trim possible" */
+ else { *zstrim = 10.0*z+50.0;} /* same 10*z+50 scaling as the NORMAL_DIST branch */
+#else
+ max_hscore = 100;
+ z = 5.0;
+ *zstrim = 10.0*z+50.0;
+#endif
+ return max_hscore;
+}
+
+int
+proc_hist_r(struct stat_str *sptr, int nstats, struct score_count_s s_info,
+ struct pstruct *ppst, struct hist_str *histp,
+ int do_trim, struct pstat_str *pu)
+{ /* proc_hist_r() -- length-regression statistics; process_hist() calls
+ it for both REG_STATS and REG2_STATS.
+
+ The scores are binned with addhist(), fit with fit_llen(), and, when
+ do_trim is set, scores with z < 20.0 or z > zstrim are pruned before
+ fit_llens() runs. A one-paragraph summary of the fit is written to
+ histp->stat_info.
+
+ Returns REG_STATS, or the result of the routine it falls back to:
+ proc_hist_a() when no z-trim limit could be set, proc_hist_n() when
+ the scores span fewer than 10 values, proc_hist_ml() when the fit
+ fails or fewer than 50 scores survive trimming. */
+ int i, max_hscore;
+ double zs, zstrim;
+ char s_string[128];
+ char rho_str[32]; /* used for simplifying output label */
+ double rho_val;
+ struct llen_str llen;
+ char *f_string;
+ llen.fit_flag=1;
+ llen.hist=NULL;
+
+ max_hscore = calc_thresh(ppst, sptr, nstats, s_info, pu->ngLambda,
+ pu->ngK, pu->ngH, &zstrim);
+
+
+ if (do_trim && zstrim < 0.0) {
+ /* too few scores to do a z-trim, shift to Karlin-Altschul */
+ return proc_hist_a(sptr, nstats, s_info, ppst, histp, do_trim, pu);
+ }
+
+ inithist(&llen, ppst,max_hscore);
+ f_string = histp->stat_info; /* the summary text is written here */
+
+ for (i = 0; i<nstats; i++)
+ addhist(&llen,sptr[i].score,sptr[i].n1, max_hscore);
+
+ if ((llen.max_score - llen.min_score) < 10) { /* score range too narrow for a regression */
+ free_hist(&llen);
+ llen.fit_flag = 0;
+ return proc_hist_n(sptr, nstats, s_info, ppst, histp, do_trim, pu);
+ }
+
+ fit_llen(&llen, &(pu->r_u.rg)); /* get parameters for REG_STATS */
+
+ if (!llen.fit_flag) { /* the fit failed, fall back to proc_hist_ml */
+ free_hist(&llen);
+ return proc_hist_ml(sptr,nstats, s_info, ppst, histp, do_trim, pu);
+ }
+
+ pu->r_u.rg.n_trimmed= pu->r_u.rg.n1_trimmed = pu->r_u.rg.nb_trimmed = 0;
+
+ if (do_trim) {
+ for (i = 0; i < nstats; i++) {
+ zs = find_zr(sptr[i].score,sptr[i].escore,sptr[i].n1,sptr[i].comp, pu);
+ if (zs < 20.0 || zs > zstrim) { /* outlier on either side: remove it from the bins */
+ pu->r_u.rg.n_trimmed++;
+ prune_hist(&llen,sptr[i].score,sptr[i].n1, max_hscore,
+ &(histp->entries));
+ }
+ }
+
+ if ((nstats - pu->r_u.rg.n_trimmed) < 50) { /* too few scores left after trimming */
+ llen.fit_flag = 0;
+ free_hist(&llen);
+ return proc_hist_ml(sptr,nstats, s_info, ppst, histp, do_trim, pu);
+ }
+
+ /* fprintf(stderr,"Z-trimmed %d entries with z > 5.0\n",
+ pu->r_u.rg.n_trimmed); */
+
+ /* calculate parameters for REG2_STATS; fit_llens() is always
+ called, but the parameters are only used for REG2_STATS
+ (find_zr2) */
+
+ fit_llens(&llen, &(pu->r_u.rg));
+
+ /* fprintf(stderr,"Bin-trimmed %d entries in %d bins\n",
+ pu->r_u.rg.n1_trimmed,pu->r_u.rg.nb_trimmed); */
+ }
+
+ free_hist(&llen);
+
+ /* put all the scores in the histogram */
+
+ if (ppst->zsflag < 10) s_string[0]='\0'; /* the "(shuffled ...)" prefix is only used for zsflag >= 10 */
+ else if (ppst->zs_win > 0) {sprintf(s_string,"(shuffled [%d], win: %d)",nstats,ppst->zs_win);}
+ else if (ppst->shuffle_dna3) { sprintf(s_string,"(shuffled3 [%d])",nstats);}
+ else { sprintf(s_string,"(shuffled [%d])",nstats);}
+
+ /* #ifdef LOCAL_SCORE */
+ strncpy(rho_str,"ln(x)",sizeof(rho_str));
+ rho_val = pu->r_u.rg.rho*LN_FACT; /* rho is reported multiplied by LN_FACT */
+ /*
+#else
+ strncpy(rho_str,"x",sizeof(rho_str));
+ rho_val = pu->r_u.rg.rho;
+#endif
+ */
+
+ if (ppst->zsflag == REG2_STATS || ppst->zsflag == 10+REG2_STATS ||
+ ppst->zsflag == 20+REG2_STATS) {
+ sprintf(f_string,"%s Expectation_v fit: rho(%s)= %6.4f+/-%6.3g; mu= %6.4f+/-%6.3f;\n rho2=%6.2f; mu2= %6.2f, 0's: %d Z-trim: %d B-trim: %d in %d/%d",
+ s_string, rho_str, rho_val ,sqrt(pu->r_u.rg.rho_e),pu->r_u.rg.mu,sqrt(pu->r_u.rg.mu_e),
+ pu->r_u.rg.rho2,pu->r_u.rg.mu2,llen.zero_s,
+ pu->r_u.rg.n_trimmed, pu->r_u.rg.n1_trimmed, pu->r_u.rg.nb_trimmed, pu->r_u.rg.nb_tot);
+ }
+ else {
+ sprintf(f_string,"%s Expectation_n fit: rho(%s)= %6.4f+/-%6.3g; mu= %6.4f+/-%6.3f\n mean_var=%6.4f+/-%6.3f, 0's: %d Z-trim(%.1f): %d B-trim: %d in %d/%d\n Lambda= %8.6f",
+ s_string, rho_str, rho_val,sqrt(pu->r_u.rg.rho_e),pu->r_u.rg.mu,sqrt(pu->r_u.rg.mu_e), pu->r_u.rg.mean_var,sqrt(pu->r_u.rg.var_e),
+ llen.zero_s, zstrim, pu->r_u.rg.n_trimmed, pu->r_u.rg.n1_trimmed, pu->r_u.rg.nb_trimmed, pu->r_u.rg.nb_tot,
+ PI_SQRT6/sqrt(pu->r_u.rg.mean_var));
+ }
+ return REG_STATS; /* returned for the REG2_STATS case as well */
+}
+
+/* proc_hist_ri() -- iterative removal of outliers before regression of mean vs log(len)
+
+ With do_trim set, up to 5 rounds of fit_llen2() followed by pruning of
+ scores with z < 20.0 or z > zstrim are run; the rounds stop early when
+ a fit fails or fewer than LHISTC scores were pruned. Pruned scores
+ are marked by negating their n1 in sptr[]. A final fit_llen() gives
+ the reported parameters, and a summary is written to
+ histp->stat_info.
+
+ Returns REGI_STATS, or the result of proc_hist_a() when no z-trim
+ limit could be set, or of proc_hist_ml() when the final fit failed.
+*/
+int
+proc_hist_ri(struct stat_str *sptr, int nstats, struct score_count_s s_info,
+ struct pstruct *ppst, struct hist_str *histp,
+ int do_trim, struct pstat_str *pu)
+{
+ int i, nit, nprune, max_hscore;
+ double zs, zstrim;
+ char s_string[128];
+ char *f_string;
+ struct llen_str llen;
+
+ llen.fit_flag=1;
+ llen.hist=NULL;
+
+ max_hscore = calc_thresh(ppst, sptr, nstats, s_info, pu->ngLambda,
+ pu->ngK, pu->ngH, &zstrim);
+
+ if (do_trim && zstrim < 0.0) {
+ /* too few scores to do a z-trim, shift to Karlin-Altschul */
+ return proc_hist_a(sptr, nstats, s_info, ppst, histp, do_trim, pu);
+ }
+
+ inithist(&llen, ppst,max_hscore);
+ f_string = histp->stat_info; /* the summary text is written here */
+
+ for (i = 0; i<nstats; i++)
+ addhist(&llen,sptr[i].score,sptr[i].n1,max_hscore);
+
+ pu->r_u.rg.n_trimmed= pu->r_u.rg.n1_trimmed = pu->r_u.rg.nb_trimmed = 0;
+ if (do_trim) nit = 5; /* at most 5 trimming rounds */
+ else nit = 0;
+
+ while (nit-- > 0) {
+ nprune = 0;
+ fit_llen2(&llen, &(pu->r_u.rg));
+ if (llen.fit_flag == 0) break;
+
+ for (i = 0; i < nstats; i++) {
+ if (sptr[i].n1 < 0) continue; /* already pruned in an earlier round */
+ zs = find_zr(sptr[i].score,sptr[i].escore,sptr[i].n1,sptr[i].comp,pu);
+ if (zs < 20.0 || zs > zstrim ) {
+ nprune++;
+ pu->r_u.rg.n_trimmed++;
+ prune_hist(&llen,sptr[i].score,sptr[i].n1,max_hscore,
+ &(histp->entries));
+ sptr[i].n1 = -sptr[i].n1; /* negative n1 marks the score as pruned */
+ }
+ }
+ /* fprintf(stderr," %d Z-trimmed at %d\n",nprune,nit); */
+ if (nprune < LHISTC) { break; } /* little was removed this round: stop iterating */
+ }
+
+ fit_llen(&llen, &(pu->r_u.rg));
+ free_hist(&llen);
+
+ if (!llen.fit_flag) { /* the fit failed, fall back to proc_hist_ml */
+ return proc_hist_ml(sptr,nstats, s_info, ppst, histp, do_trim, pu);
+ }
+
+ if (ppst->zsflag < 10) s_string[0]='\0'; /* the "(shuffled ...)" prefix is only used for zsflag >= 10 */
+ else if (ppst->zs_win > 0)
+ sprintf(s_string,"(shuffled [%d], win: %d)",nstats,ppst->zs_win);
+ else
+ sprintf(s_string,"(shuffled [%d])",nstats);
+
+ /* NOTE(review): the "N-it" value printed is nit as the loop left it (rounds remaining, -1 when the loop ran out), not the number of rounds performed */
+ sprintf(f_string,"%s Expectation_i fit: rho(ln(x))= %6.4f+/-%6.3g; mu= %6.4f+/-%6.3f;\n mean_var=%6.4f+/-%6.3f 0's: %d Z-trim: %d N-it: %d\n Lambda= %8.6f",
+ s_string,
+ pu->r_u.rg.rho*LN_FACT,sqrt(pu->r_u.rg.rho_e),pu->r_u.rg.mu,sqrt(pu->r_u.rg.mu_e),
+ pu->r_u.rg.mean_var,sqrt(pu->r_u.rg.var_e),llen.zero_s,pu->r_u.rg.n_trimmed, nit,
+ PI_SQRT6/sqrt(pu->r_u.rg.mean_var));
+ return REGI_STATS;
+}
+
+/* this procedure implements Altschul's pre-calculated values for lambda, K */
+
+#include "alt_parms.h"
+
+int
+look_p(struct alt_p parm[], int gap, int ext,
+ double *K, double *Lambda, double *H);
+
+/* proc_hist_a() -- use Altschul/Gish pre-calculated Lambda, K, H
+ *
+ * Selects the parameter table that matches ppst->pam_name (protein) or
+ * the match/mismatch scores (DNA), looks up the entry for the current
+ * gap penalties with look_p() and stores Lambda, K, H, a_n0 and a_n0f
+ * in pu->r_u.ag.  A one-line summary is written to histp->stat_info.
+ *
+ * If no entry is available the BLOSUM62 table is used instead.  The
+ * values actually used are always stored in pu->r_u.ag, so that they
+ * agree with a_n0f and with the summary line; previously the fallback
+ * values were only kept in local variables, and a failed fallback left
+ * them uninitialized.
+ *
+ * sptr, nstats, s_info and do_trim are unused; they are present so that
+ * all proc_hist_*() functions share one signature.
+ *
+ * Returns AG_STATS.
+ */
+int
+proc_hist_a(struct stat_str *sptr, int nstats, struct score_count_s s_info,
+	    struct pstruct *ppst, struct hist_str *histp,
+	    int do_trim, struct pstat_str *pu)
+{
+  double Lambda, K, H;
+  char *f_string;
+  int r_v;
+  int t_gdelval, t_ggapval;
+
+  /* the tables are indexed by the cost of the first gap residue */
+#ifdef OLD_FASTA_GAP
+  t_gdelval = ppst->gdelval;
+  t_ggapval = ppst->ggapval;
+#else
+  t_gdelval = ppst->gdelval+ppst->ggapval;
+  t_ggapval = ppst->ggapval;
+#endif
+
+  f_string = histp->stat_info;
+
+  if (ppst->dnaseq==0) {
+    if (strcmp(ppst->pam_name,"BL50")==0 || strcmp(ppst->pam_name,"BLOSUM50")==0) {
+      r_v = look_p(bl50_p,t_gdelval,t_ggapval,&K,&Lambda,&H);
+    }
+    else if (strcmp(ppst->pam_name,"BL62")==0 || strcmp(ppst->pam_name,"BLOSUM62")==0) {
+      r_v = look_p(bl62_p,t_gdelval,t_ggapval,&K,&Lambda,&H);
+    }
+    else if (strcmp(ppst->pam_name,"BL80")==0 || strcmp(ppst->pam_name,"BLOSUM80")==0) {
+      r_v = look_p(bl80_p,t_gdelval,t_ggapval,&K,&Lambda,&H);
+    }
+    else if (strcmp(ppst->pam_name,"PAM250")==0) {
+      r_v = look_p(p250_p,t_gdelval,t_ggapval,&K,&Lambda,&H);
+    }
+    else if ((strcmp(ppst->pam_name,"PAM120")==0) || (strcmp(ppst->pam_name,"VT120")==0)) {
+      r_v = look_p(p120_p,t_gdelval,t_ggapval,&K,&Lambda,&H);
+    }
+    else if ((strcmp(ppst->pam_name,"MD10")==0) || (strcmp(ppst->pam_name,"VT10")==0)) {
+      r_v = look_p(md10_p,t_gdelval,t_ggapval,&K,&Lambda,&H);
+    }
+    else if ((strcmp(ppst->pam_name,"MD20")==0) || (strcmp(ppst->pam_name,"VT20")==0)) {
+      r_v = look_p(md20_p,t_gdelval,t_ggapval,&K,&Lambda,&H);
+    }
+    else if ((strcmp(ppst->pam_name,"MD40")==0) || (strcmp(ppst->pam_name,"VT40")==0)) {
+      r_v = look_p(md40_p,t_gdelval,t_ggapval,&K,&Lambda,&H);
+    }
+    else if (strcmp(ppst->pam_name,"OPTIMA5")==0) {
+      r_v = look_p(opt5_p,t_gdelval,t_ggapval,&K,&Lambda,&H);
+    }
+    else r_v = 0;
+  }
+  else {
+    /* +3/-2 is the default DNA table unless a more specific one matches */
+    r_v = look_p(nt32_p,t_gdelval,t_ggapval,&K,&Lambda,&H);
+    if (strcmp(ppst->pam_name,"DNA")==0 || (ppst->pam_h==5 && ppst->pam_l == -4)) {
+      r_v = look_p(nt54_p,t_gdelval,t_ggapval,&K,&Lambda,&H);
+    }
+    else if (strcmp(ppst->pam_name,"+3/-2")==0 || (ppst->pam_h==3 && ppst->pam_l == -2)) {
+      r_v = look_p(nt32_p,t_gdelval,t_ggapval,&K,&Lambda,&H);
+    }
+    else if (strcmp(ppst->pam_name,"+1/-3")==0 || (ppst->pam_h==1 && ppst->pam_l == -3)) {
+      r_v = look_p(nt13_p,t_gdelval,t_ggapval,&K,&Lambda,&H);
+    }
+  }
+
+  if (r_v != 1) {
+    /* no entry for this matrix/gap combination: fall back to BLOSUM62 */
+    r_v = look_p(bl62_p,t_gdelval,t_ggapval,&K,&Lambda,&H);
+#ifdef DEBUG
+    fprintf(stderr,"+++ Warning : [%s:%d] Parameters not available for: %s: %d/%d -- using BL62\n",
+	    __FILE__, __LINE__, ppst->pam_name,t_gdelval-t_ggapval,t_ggapval);
+#endif
+    if (r_v != 1) {
+      /* the gap penalties are not in the BLOSUM62 table either; use its
+	 first entry (the one look_p() returns for gap penalties beyond
+	 the table) rather than uninitialized values */
+      K = bl62_p[0].K;
+      Lambda = bl62_p[0].Lambda;
+      H = bl62_p[0].H;
+    }
+  }
+
+  /* store what is actually used, for the fallback cases as well */
+  pu->r_u.ag.Lambda = Lambda;
+  pu->r_u.ag.K = K;
+  pu->r_u.ag.H = H;
+
+  /*
+  fprintf(stderr," the parameters are: Lambda: %5.3f K: %5.3f H: %5.3f\n",
+	  Lambda, K, H);
+  */
+
+  pu->r_u.ag.a_n0 = (double)ppst->n0;
+  pu->r_u.ag.a_n0f = log (K * pu->r_u.ag.a_n0)/H;
+
+  sprintf(f_string,"Altschul/Gish params: n0: %d Lambda: %5.3f K: %5.3f H: %5.3f",
+	  ppst->n0,Lambda, K, H);
+  return AG_STATS;
+}
+
+/* ag_parm() -- look up Altschul/Gish Lambda, K, H by matrix name
+ *
+ * pam_name          scoring matrix name ("BL50", "BL62", "P250", "P120",
+ *                   "MD10", "MD20", "MD40", "DNA", "+5/-4", "+3/-2", "+1/-3")
+ * gdelval, ggapval  gap penalties, passed unchanged to look_p()
+ * pu                pu->r_u.ag.{K,Lambda,H} are set when an entry is found
+ *
+ * Returns 1 when parameters were found and stored, 0 otherwise.  On 0
+ * pu is left untouched (previously uninitialized locals were copied
+ * into it).
+ */
+int
+ag_parm(char *pam_name, int gdelval, int ggapval, struct pstat_str *pu)
+{
+  double Lambda, K, H;
+  int r_v;
+
+  if (strcmp(pam_name,"BL50")==0)
+    r_v = look_p(bl50_p,gdelval,ggapval,&K,&Lambda,&H);
+  else if (strcmp(pam_name,"BL62")==0)
+    r_v = look_p(bl62_p,gdelval,ggapval,&K,&Lambda,&H);
+  else if (strcmp(pam_name,"P250")==0)
+    r_v = look_p(p250_p,gdelval,ggapval,&K,&Lambda,&H);
+  else if (strcmp(pam_name,"P120")==0)
+    r_v = look_p(p120_p,gdelval,ggapval,&K,&Lambda,&H);
+  else if (strcmp(pam_name,"MD10")==0)
+    r_v = look_p(md10_p,gdelval,ggapval,&K,&Lambda,&H);
+  else if (strcmp(pam_name,"MD20")==0)
+    r_v = look_p(md20_p,gdelval,ggapval,&K,&Lambda,&H);
+  else if (strcmp(pam_name,"MD40")==0)
+    r_v = look_p(md40_p,gdelval,ggapval,&K,&Lambda,&H);
+  else if (strcmp(pam_name,"DNA")==0 || strcmp(pam_name,"+5/-4")==0)
+    r_v = look_p(nt54_p,gdelval,ggapval, &K,&Lambda,&H);
+  else if (strcmp(pam_name,"+3/-2")==0)
+    r_v = look_p(nt32_p,gdelval,ggapval, &K,&Lambda,&H);
+  else if (strcmp(pam_name,"+1/-3")==0)
+    r_v = look_p(nt13_p,gdelval,ggapval, &K,&Lambda,&H);
+  else r_v = 0;
+
+  /* K, Lambda and H are only defined when look_p() reported a hit */
+  if (r_v == 1) {
+    pu->r_u.ag.K = K;
+    pu->r_u.ag.Lambda = Lambda;
+    pu->r_u.ag.H = H;
+  }
+
+  /*
+  if (r_v == 0) {
+    fprintf(stderr,"Parameters not available for: %s: %d/%d\n",
+	    pam_name,gdelval,ggapval);
+  }
+  */
+  return r_v;
+}
+
+/* look_p() -- find the K, Lambda, H entry for a gap-open/gap-extend pair
+ *
+ * parm[]    parameter table; parm[0] is the entry used when the gap
+ *           penalty is larger than any in the table, parm[1..] are
+ *           scanned until an entry with gap <= 0 is reached
+ * gap, ext  penalties as (negative) scores; they are negated before
+ *           being compared with the positive values in the table
+ * K, Lambda, H  (out) set only when 1 is returned
+ *
+ * Returns 1 if an entry was found, 0 otherwise.
+ */
+int
+look_p(struct alt_p parm[], int gap, int ext,
+       double *K, double *Lambda, double *H)
+{
+  struct alt_p *row = NULL;
+  int want_gap, want_ext;
+  int ix;
+
+  want_gap = -gap;
+  want_ext = -ext;
+
+  if (want_gap > parm[1].gap) {
+    row = &parm[0];
+  }
+  else {
+    for (ix = 1; parm[ix].gap > 0; ix++) {
+      if (parm[ix].gap > want_gap) continue;	/* not there yet */
+      if (parm[ix].gap < want_gap) break;	/* went past it */
+      /* same gap penalty: now look at the extension penalty */
+      if (parm[ix].ext > want_ext) continue;
+      if (parm[ix].ext == want_ext) row = &parm[ix];
+      break;
+    }
+  }
+
+  if (row == NULL) return 0;
+
+  *K = row->K;
+  *Lambda = row->Lambda;
+  *H = row->H;
+  return 1;
+}
+
+#ifdef NORMAL_DIST
+/* proc_hist_anorm() -- normal-distribution statistics from median and IQR
+ *
+ * Sorts sptr[] by score (st_sort()), takes the median score as mu and
+ * the inter-quartile range divided by 1.35 as the standard deviation.
+ * Results go to pu->r_u.rg; a summary line goes to histp->stat_info.
+ * No scores are trimmed.  Returns AG_STATS.
+ */
+int
+proc_hist_anorm(struct stat_str *sptr, int nstats, struct score_count_s s_info,
+		struct pstruct *ppst, struct hist_str *histp,
+		int do_trim, struct pstat_str *pu)
+{
+  char *f_string = histp->stat_info;
+  char s_string[128];
+  int mid_ix, quarter, lower_ix, upper_ix;
+  double iqr_sd;
+
+  /* label for shuffled-sequence statistics, empty otherwise */
+  s_string[0] = '\0';
+  if (ppst->zsflag >= 10) {
+    if (ppst->zs_win > 0) {
+      sprintf(s_string,"(shuffled [%d], win: %d)",nstats,ppst->zs_win);
+    }
+    else {
+      sprintf(s_string,"(shuffled [%d])",nstats);
+    }
+  }
+
+  /* positions of the median and of the lower/upper quartile */
+  mid_ix = nstats/2;
+  quarter = mid_ix/2;
+  lower_ix = mid_ix - quarter;
+  upper_ix = mid_ix + quarter;
+
+  st_sort(sptr,nstats);
+
+  pu->r_u.rg.mu = sptr[mid_ix].score;
+
+  iqr_sd = (sptr[upper_ix].score - sptr[lower_ix].score)/1.35;
+
+  pu->r_u.rg.mean_var_sqrt = iqr_sd;
+  pu->r_u.rg.mean_var = iqr_sd * iqr_sd;
+  pu->r_u.rg.n_trimmed = 0;
+
+  sprintf(f_string,
+	  "%s Unscaled normal (median, IQR) statistics: mu= %6.4f var=%6.4f Ztrim: %d",
+	  s_string, pu->r_u.rg.mu,pu->r_u.rg.mean_var, pu->r_u.rg.n_trimmed);
+  return AG_STATS;
+}
+#endif
+
+/* uncensored and censored maximum likelihood estimates developed
+ by Aaron Mackey based on a preprint from Sean Eddy */
+
+int mle_cen (struct stat_str *, int, int, double, double *, double *);
+
+/* proc_hist_ml() -- maximum likelihood estimate of Lambda and K
+ *
+ * Calls mle_cen() on sptr[], uncensored when do_trim is 0, otherwise
+ * with a censoring fraction f_cen derived below.  Lambda and K are
+ * stored in pu->r_u.ag and a summary line in histp->stat_info.
+ *
+ * Returns MLE_STATS, or the result of proc_hist_n() when mle_cen()
+ * returns -1.
+ */
+int
+proc_hist_ml(struct stat_str *sptr, int nstats, struct score_count_s s_info,
+	     struct pstruct *ppst, struct hist_str *histp,
+	     int do_trim, struct pstat_str *pu)
+{
+  char s_string[128];
+  char *f_string = histp->stat_info;
+  double f_cen, opt_correct;
+
+  pu->r_u.ag.a_n0 = (double)ppst->n0;
+
+  /* label for shuffled-sequence statistics (those use do_trim = 0) */
+  s_string[0] = '\0';
+  if (ppst->zsflag >= 10) {
+    if (ppst->zs_win > 0) {
+      sprintf(s_string,"(shuffled [%d], win: %d)",nstats,ppst->zs_win);
+    }
+    else {
+      sprintf(s_string,"(shuffled [%d])",nstats);
+    }
+  }
+
+  if (do_trim) {
+    /* Choose the fraction of scores to censor.  Ideally 5% are
+       censored, but when statistical thresholds restrict optimization
+       to the highest scoring library sequences there may be no low
+       scores and a large excess of high scores, so more must be
+       censored.  opt_correct is the fraction of all scores that is
+       available for the score type in use
+       (s_info.s_cnt[ppst->score_ix] / s_info.tot_scores). */
+
+    opt_correct = (double)s_info.s_cnt[ppst->score_ix]/(double)s_info.tot_scores;
+
+    f_cen = ((double)nstats/20.0 > 1000.0)
+      ? 1000.0/((double)nstats*opt_correct)
+      : 0.05/opt_correct;
+
+    if (f_cen > 0.4) f_cen = 0.2;
+
+    if (mle_cen(sptr, nstats, ppst->n0, f_cen, &pu->r_u.ag.Lambda, &pu->r_u.ag.K) == -1) {
+      return proc_hist_n(sptr, nstats, s_info, ppst, histp, do_trim, pu);
+    }
+
+    sprintf(f_string, "MLE_cen statistics: Lambda= %6.4f; K=%6.4g (cen=%d)",
+	    pu->r_u.ag.Lambda,pu->r_u.ag.K,(int)((double)nstats*f_cen));
+    return MLE_STATS;
+  }
+
+  /* no censoring */
+  if (mle_cen(sptr, nstats, ppst->n0, 0.0, &pu->r_u.ag.Lambda, &pu->r_u.ag.K) == -1) {
+    return proc_hist_n(sptr, nstats, s_info, ppst, histp, do_trim, pu);
+  }
+
+  sprintf(f_string,"%s MLE statistics: Lambda= %6.4f; K=%6.4g",
+	  s_string,pu->r_u.ag.Lambda,pu->r_u.ag.K);
+  return MLE_STATS;
+}
+
+int
+mle_cen2 (struct stat_str *, int, int, double, double *, double *, double *, double *);
+
+
+/* proc_hist_ml2() -- composition-dependent maximum likelihood fit
+ *
+ * Computes the geometric mean of the positive sptr[].comp values
+ * (pu->r_u.m2.ave_comp) and their maximum (max_comp), replaces every
+ * non-positive composition by that mean, and fits a0, a1, a2, b1 with
+ * mle_cen2() (uncensored when do_trim is 0, otherwise censoring 5% of
+ * the scores, at most 500).  A summary goes to histp->stat_info.
+ *
+ * Returns MLE2_STATS, or the result of proc_hist_n() when there is no
+ * positive composition at all or mle_cen2() returns -1.
+ *
+ * Changes from the earlier version: a composition of exactly 0.0 is
+ * now replaced as well (mle2_func() divides by it); the no-positive-
+ * composition case no longer produces NaN; the "Lamdba" spelling in
+ * the uncensored summary line is corrected; the unused ex_string was
+ * removed.
+ */
+int
+proc_hist_ml2(struct stat_str *sptr, int nstats, struct score_count_s s_info,
+	      struct pstruct *ppst, struct hist_str *histp,
+	      int do_trim, struct pstat_str *pu)
+{
+  int i, ns;
+  double f_cen, ave_lambda;
+  char s_string[128];
+  char *f_string;
+
+  f_string = histp->stat_info;
+  pu->r_u.m2.a_n0 = (double)ppst->n0;
+
+  if (ppst->zsflag < 10) s_string[0]='\0';
+  else if (ppst->zs_win > 0)
+    sprintf(s_string,"(shuffled [%d], win: %d)",nstats,ppst->zs_win);
+  else
+    sprintf(s_string,"(shuffled [%d])",nstats);
+
+  pu->r_u.m2.ave_comp = 0.0;
+  pu->r_u.m2.max_comp = -1.0;
+
+  /* sum of log(comp) over the entries that have a usable composition */
+  ns = 0;
+  for (i=0; i<nstats; i++) {
+    if (sptr[i].comp > pu->r_u.m2.max_comp) pu->r_u.m2.max_comp = sptr[i].comp;
+    if (sptr[i].comp > 0.0) {
+      pu->r_u.m2.ave_comp += log(sptr[i].comp);
+      ns++;
+    }
+  }
+
+  /* without a single usable composition the mean would be 0/0 */
+  if (ns == 0) goto bad_mle2;
+
+  pu->r_u.m2.ave_comp /= (double)ns;
+  pu->r_u.m2.ave_comp = exp(pu->r_u.m2.ave_comp);
+
+  /* entries without a usable composition get the geometric mean */
+  for (i=0; i<nstats; i++) {
+    if (sptr[i].comp <= 0.0) sptr[i].comp = pu->r_u.m2.ave_comp;
+  }
+
+  if (!do_trim) {
+    if (mle_cen2(sptr, nstats, ppst->n0, 0.0,
+		 &pu->r_u.m2.mle2_a0, &pu->r_u.m2.mle2_a1,
+		 &pu->r_u.m2.mle2_a2, &pu->r_u.m2.mle2_b1) == -1) goto bad_mle2;
+    ave_lambda = 1.0/(pu->r_u.m2.ave_comp*pu->r_u.m2.mle2_b1);
+
+    sprintf(f_string,"%s MLE-2 statistics: a0= %6.4f; a1=%6.4f; a2=%6.4f; b1=%6.4f\n ave Lambda: %6.4f",
+	    s_string, pu->r_u.m2.mle2_a0, pu->r_u.m2.mle2_a1, pu->r_u.m2.mle2_a2, pu->r_u.m2.mle2_b1,ave_lambda);
+  }
+  else {
+    /* censor 5% of the scores, but no more than 500 */
+    if (nstats/20 > 500) f_cen = 500.0/(double)nstats;
+    else f_cen = 0.05;
+    if (mle_cen2(sptr, nstats, ppst->n0, f_cen,
+		 &pu->r_u.m2.mle2_a0, &pu->r_u.m2.mle2_a1,
+		 &pu->r_u.m2.mle2_a2, &pu->r_u.m2.mle2_b1) == -1) goto bad_mle2;
+
+    ave_lambda = 1.0/(pu->r_u.m2.ave_comp*pu->r_u.m2.mle2_b1);
+
+    sprintf(f_string,"%s MLE-2-cen statistics: a0= %6.4f; a1=%6.4f; a2=%6.4f; b1=%6.4f (cen=%d)\n ave Lambda:%6.4f",
+	    s_string, pu->r_u.m2.mle2_a0, pu->r_u.m2.mle2_a1, pu->r_u.m2.mle2_a2, pu->r_u.m2.mle2_b1, (int)((double)nstats*f_cen),ave_lambda);
+  }
+
+  return MLE2_STATS;
+ bad_mle2:
+  return proc_hist_n(sptr, nstats, s_info, ppst, histp, do_trim, pu);
+}
+
+double first_deriv_cen(double lambda, struct stat_str *sptr,
+ int start, int stop,
+ double sumlenL, double cenL,
+ double sumlenH, double cenH);
+
+double second_deriv_cen(double lambda, struct stat_str *sptr,
+ int start, int stop,
+ double sumlenL, double cenL,
+ double sumlenH, double cenH);
+
+/* st_sort() -- sort v[0..n-1] by increasing score (shell sort)
+ *
+ * Whole records are exchanged, so that every field of a struct stat_str
+ * stays with its score.  The earlier version exchanged score, n1, comp
+ * and H only, which left other fields (such as escore, which find_zr()
+ * and find_zn() are called with) attached to the wrong record after
+ * sorting.
+ */
+static void
+st_sort (struct stat_str *v, int n) {
+  int gap, i, j;
+  struct stat_str tmp;
+
+  /* largest increment of the 1, 4, 13, 40, ... sequence to start with */
+  for (gap = 1; gap < n/3; gap = 3*gap +1) ;
+
+  for (; gap > 0; gap = (gap-1)/3) {
+    for (i = gap; i < n; i++) {
+      for (j = i - gap; j >= 0; j -= gap) {
+	if (v[j].score <= v[j + gap].score) break;
+
+	tmp = v[j];
+	v[j] = v[j + gap];
+	v[j + gap] = tmp;
+      }
+    }
+  }
+}
+
+
+/* mle_cen() -- censored maximum likelihood estimate of Lambda and K
+ *
+ *  sptr[]   scores (sptr[].score) and lengths (sptr[].n1); sorted by
+ *           score in place by this function
+ *  n        total number of samples
+ *  M        length of query
+ *  fc       fraction of scores to be censored, fc/2.0 from the top and
+ *           fc/2.0 from the bottom
+ *  Lambda   (out) Lambda estimate
+ *  K        (out) K estimate
+ *
+ * Lambda is found by Newton-Raphson iteration on first_deriv_cen() /
+ * second_deriv_cen(), starting from PI_SQRT6/sqrt(variance of the
+ * uncensored scores).
+ *
+ * Returns 0 on success; -1 when fewer than two uncensored scores are
+ * available, when the censoring limits do not satisfy cenL < cenH, or
+ * when the iteration has not converged after MAX_NIT rounds.  The
+ * outputs are not written when -1 is returned.
+ */
+
+#define MAX_NIT 100
+
+int
+mle_cen(struct stat_str *sptr, int n, int M, double fc,
+	double *Lambda, double *K) {
+
+  double sumlenL, sumlenH, cenL, cenH;
+  double sum_s, sum2_s, mean_s, var_s, dtmp, dev;
+  int start, stop;
+  int i, nf;
+  int nit = 0;
+  double deriv, deriv2, lambda, old_lambda, sum = 0.0;
+
+  /*
+    sumlenL, sumlenH = summed lengths of the censored low (Left) and
+                       high (High) scoring sequences
+    cenL, cenH       = censoring score low, high
+  */
+
+  nf = (fc/2.0) * n;
+  start = nf;
+  stop = n - nf;
+
+  /* a variance needs two scores; this also covers n <= 0, where
+     sptr[start] would not exist */
+  if (stop - start < 2) return -1;
+
+  st_sort(sptr,n);
+
+  sum_s = sum2_s = 0.0;
+  for (i=start; i<stop; i++) {
+    sum_s += sptr[i].score;
+  }
+  dtmp = (double)(stop-start);
+  mean_s = sum_s/dtmp;
+
+  /* sample variance: squared deviations from the mean (the mean was
+     computed but not subtracted before, so var_s was the raw second
+     moment and the starting lambda far too small) */
+  for (i=start; i<stop; i++) {
+    dev = (double)sptr[i].score - mean_s;
+    sum2_s += dev * dev;
+  }
+  var_s = sum2_s/(dtmp-1.0);
+
+  sumlenL = sumlenH = 0.0;
+  for (i=0; i<start; i++) sumlenL += (double)sptr[i].n1;
+  for (i=stop; i<n; i++) sumlenH += (double)sptr[i].n1;
+
+  if (nf > 0) {
+    cenL = (double)sptr[start].score;
+    cenH = (double)sptr[stop].score;
+  }
+  else {
+    /* nothing censored: sumlenL and sumlenH are 0.0, the limits only
+       have to pass the test below */
+    cenL = (double)sptr[start].score/2.0;
+    cenH = (double)sptr[start].score*2.0;
+  }
+
+  if (cenL >= cenH) return -1;
+
+  /* initial guess for lambda is 0.2 - this does not work for matrices
+     with very different scales */
+  /* lambda = 0.2; */
+  lambda = (var_s > 0.0) ? PI_SQRT6/sqrt(var_s) : 0.0;
+  if (lambda <= 0.0 || lambda > 1.0) {
+    fprintf(stderr," Lambda initial estimate error: lambda: %6.4g; var_s: %6.4g\n",lambda,var_s);
+    lambda = 0.2;
+  }
+
+  do {
+    deriv = first_deriv_cen(lambda, sptr, start, stop,
+			    sumlenL, cenL, sumlenH, cenH);
+    deriv2 = second_deriv_cen(lambda, sptr, start, stop,
+			      sumlenL, cenL, sumlenH, cenH);
+
+    old_lambda = lambda;
+    /* Newton-Raphson step; halve lambda if the step would make it
+       non-positive */
+    if (lambda - deriv/deriv2 > 0.0) lambda = lambda - deriv/deriv2;
+    else lambda = lambda/2.0;
+    nit++;
+  } while (fabs((lambda - old_lambda)/lambda) > TINY && nit < MAX_NIT);
+
+  /*  fprintf(stderr," mle_cen nit: %d\n",nit); */
+
+  if (nit >= MAX_NIT) return -1;
+
+  for(i = start; i < stop ; i++) {
+    sum += (double) sptr[i].n1 * exp(- lambda * (double)sptr[i].score);
+  }
+
+  *Lambda = lambda;
+  /*
+  *K = (double)(stop-start)/((double)M*sum);
+  */
+  *K = (double)n/((double)M*
+		  (sum+sumlenL*exp(-lambda*cenL)-sumlenH*exp(-lambda*cenH)));
+  return 0;
+}
+
+/*
+double
+first_deriv(double lambda, struct stat_str *sptr, int n) {
+
+ int i;
+ double sum = 0.0, sum1 = 0.0, sum2 = 0.0;
+ double s, l, es;
+
+ for(i = 0 ; i < n ; i++) {
+ s = (double)sptr[i].score;
+ l = (double)sptr[i].n1;
+ es = exp(-lambda * s );
+ sum += s;
+ sum2 += l * es;
+ sum1 += s * l * es;
+ }
+
+ return (1.0/lambda) - (sum/(double)n) + (sum1/sum2);
+}
+*/
+
+/*
+double
+second_deriv(double lambda, struct stat_str *sptr, int n) {
+ double sum1 = 0.0, sum2 = 0.0, sum3 = 0.0;
+ double s, l, es;
+ int i;
+
+ for(i = 0 ; i < n ; i++) {
+ l = (double)sptr[i].n1;
+ s = (double)sptr[i].score;
+ es = exp(-lambda * s);
+ sum2 += l * es;
+ sum1 += l * s * es;
+ sum3 += l * s * s * es;
+ }
+
+ return ((sum1*sum1)/(sum2*sum2)) - (sum3/sum2) - (1.0/(lambda*lambda));
+}
+*/
+
+/* first_deriv_cen() -- first derivative used by the Newton-Raphson
+ * iteration in mle_cen()
+ *
+ * lambda          current estimate
+ * sptr[start..stop-1]  uncensored scores (score) and lengths (n1)
+ * sumlenL, cenL   summed length and censoring score of the low end
+ * sumlenH, cenH   summed length and censoring score of the high end
+ */
+double
+first_deriv_cen(double lambda, struct stat_str *sptr, int start, int stop,
+		double sumlenL, double cenL, double sumlenH, double cenH) {
+  double score_tot = 0.0;	/* sum of scores */
+  double sle_tot = 0.0;		/* sum of score * length * exp(-lambda*score) */
+  double le_tot = 0.0;		/* sum of length * exp(-lambda*score) */
+  double sc, len, e_sc, e_lo, e_hi;
+  int k;
+
+  for (k = start; k < stop; k++) {
+    sc = (double)sptr[k].score;
+    len = (double)sptr[k].n1;
+    e_sc = exp(-lambda * sc);
+    score_tot += sc;
+    le_tot += len * e_sc;
+    sle_tot += sc * len * e_sc;
+  }
+
+  /* contribution of the censored scores */
+  e_lo = exp(-lambda*cenL);
+  e_hi = exp(-lambda*cenH);
+  sle_tot += sumlenL*cenL*e_lo - sumlenH*cenH*e_hi;
+  le_tot += sumlenL*e_lo - sumlenH*e_hi;
+
+  return (1.0 / lambda) - (score_tot /(double)(stop-start)) + (sle_tot / le_tot);
+}
+
+/* second_deriv_cen() -- second derivative used by the Newton-Raphson
+ * iteration in mle_cen(); arguments as for first_deriv_cen()
+ */
+double
+second_deriv_cen(double lambda, struct stat_str *sptr, int start, int stop,
+		 double sumlenL, double cenL, double sumlenH, double cenH) {
+
+  double le_tot = 0.0;		/* sum of length * exp(-lambda*score) */
+  double lse_tot = 0.0;		/* sum of length * score * exp() */
+  double lsse_tot = 0.0;	/* sum of length * score^2 * exp() */
+  double sc, len, e_sc, e_lo, e_hi;
+  int k;
+
+  for (k = start; k < stop; k++) {
+    sc = (double)sptr[k].score;
+    len = (double)sptr[k].n1;
+    e_sc = exp(-lambda * sc);
+    le_tot += len * e_sc;
+    lse_tot += len * sc * e_sc;
+    lsse_tot += len * sc * sc * e_sc;
+  }
+
+  /* contribution of the censored scores */
+  e_lo = exp(-lambda * cenL);
+  e_hi = exp(-lambda * cenH);
+  lse_tot += sumlenL*cenL*e_lo - sumlenH*cenH*e_hi;
+  le_tot += sumlenL*e_lo - sumlenH*e_hi;
+  lsse_tot += sumlenL*cenL*cenL * e_lo -
+    sumlenH*cenH*cenH * e_hi;
+
+  return ((lse_tot * lse_tot) / (le_tot * le_tot)) - (lsse_tot / le_tot)
+    - (1.0 / (lambda * lambda));
+}
+
+double mle2_func(double *params,
+ double *consts,
+ struct stat_str *values,
+ int n, int start, int stop);
+
+void simplex(double *fitparams,
+ double *lambda,
+ int nparam,
+ double (*minfunc) (double *tryparams, double *consts,
+ struct stat_str *data, int ndata,
+ int start, int stop),
+ double *consts,
+ void *data,
+ int ndata, int start, int stop
+ );
+
+/* mle_cen2() -- censored maximum likelihood fit of a0, a1, a2, b1
+ *
+ * Sorts sptr[] by score, censors fc/2.0 of the n scores at each end,
+ * summarizes each censored group by its mean length and (arithmetic)
+ * mean composition, and minimizes mle2_func() with simplex().
+ *
+ * M is the query length; a0, a1, a2, b1 receive the fitted parameters.
+ * Returns 0, or -1 when censoring is requested and the low censoring
+ * score is not below the high one (outputs are then not written).
+ */
+int
+mle_cen2(struct stat_str *sptr, int n, int M, double fc,
+	 double *a0, double *a1, double *a2, double *b1) {
+
+  double params[4], lambdas[4], consts[9];
+  double lenL = 0.0, lenH = 0.0;	/* mean length of censored low/high group */
+  double compL = 1.0, compH = 1.0;	/* mean composition of those groups */
+  double cenL = 0.0, cenH = 0.0;	/* censoring scores */
+  int n_cen, start, stop, k;
+
+  n_cen = (fc/2.0) * n;
+  start = n_cen;
+  stop = n - n_cen;
+
+  st_sort(sptr,n);
+
+  if (n_cen > 0) {
+    cenL = (double)sptr[start].score;
+    cenH = (double)sptr[stop].score;
+    if (cenL >= cenH) return -1;
+
+    /* arithmetic means; a geometric mean of the compositions would be
+       the alternative */
+    compL = compH = 0.0;
+
+    for (k = 0; k < start; k++) {
+      lenL += (double)sptr[k].n1;
+      compL += (double)sptr[k].comp;
+    }
+    lenL /= (double) start;
+    compL /= (double) start;
+
+    for (k = stop; k < n; k++) {
+      lenH += (double)sptr[k].n1;
+      compH += (double)sptr[k].comp;
+    }
+    lenH /= (double) (n - stop);
+    compH /= (double) (n - stop);
+  }
+
+  /* starting point and initial step sizes for the simplex */
+  params[0] = 10.0;   lambdas[0] = 1.0;
+  params[1] = -10.0;  lambdas[1] = 0.5;
+  params[2] = 1.0;    lambdas[2] = 0.1;
+  params[3] = 1.0;    lambdas[3] = 0.01;
+
+  /* constants handed through to mle2_func() */
+  consts[0] = M;
+  consts[1] = (double) start;
+  consts[2] = (double) stop;
+  consts[3] = cenL;
+  consts[4] = cenH;
+  consts[5] = lenL;
+  consts[6] = lenH;
+  consts[7] = compL;
+  consts[8] = compH;
+
+  simplex(params, lambdas, 4,
+	  (double (*) (double *, double *, struct stat_str *, int, int, int) )mle2_func,
+	  consts, sptr, n, start, stop);
+
+  *a0 = params[0];
+  *a1 = params[1];
+  *a2 = params[2];
+  *b1 = params[3];
+
+  return 0;
+}
+
+/* mle2_func() -- function minimized by simplex() for mle_cen2()
+ *
+ * params[0..3]  a0, a1, a2, b1
+ * consts[]      as filled in by mle_cen2(): [0] query length M,
+ *               [3],[4] low/high censoring score, [5],[6] mean length and
+ *               [7],[8] mean composition of the low/high censored group
+ *               ([1],[2] hold start/stop but the arguments are used)
+ * values[]      sorted scores; values[start..stop-1] are uncensored
+ * n             total number of entries in values[]
+ */
+double mle2_func(double *params,
+		 double *consts,
+		 struct stat_str *values,
+		 int n, int start, int stop
+		 ) {
+
+  const double a0 = params[0];
+  const double a1 = params[1];
+  const double a2 = params[2];
+  const double b1 = params[3];
+
+  const double M = consts[0];
+  const double cenL = consts[3];
+  const double cenH = consts[4];
+  const double avglenL = consts[5];
+  const double avglenH = consts[6];
+  const double avgcompL = consts[7];
+  const double avgcompH = consts[8];
+
+  double total = 0;
+  double y = 0;
+  double score, length, comp;
+  int k;
+
+  /* censored low-scoring group */
+  if (start > 0) {
+    y = -(cenL - (a0 + a1*avgcompL +a2*avgcompL*log(M*avglenL)))/(b1*avgcompL);
+    total += (double) start * exp(y);
+  }
+
+  /* uncensored scores */
+  for (k = start; k < stop; k++) {
+    score = (double) values[k].score;
+    length = (double) values[k].n1;
+    comp = (double) values[k].comp;
+
+    y = - (score - (a0 + a1*comp + a2 * comp * log(M*length))) / (b1*comp);
+
+    total += -y + exp(y) + log(b1 * comp);
+  }
+
+  /* censored high-scoring group */
+  if (stop < n) {
+    y = -(cenH -(a0 + a1*avgcompH + a2*avgcompH*log(M*avglenH)))/(b1*avgcompH);
+    total -= (double) (n - stop) * exp(y);
+  }
+  return total;
+}
+
+/* Begin Nelder-Mead simplex code: */
+
+double evalfunc(double **param,
+ double *vals,
+ double *psums,
+ double *ptry,
+ int nparam,
+ double (*minfunc) (double *params, double *consts,
+ struct stat_str *data, int ndata,
+ int start, int stop),
+ double *consts,
+ void *data,
+ int ndata, int start, int stop,
+ int ihi,
+ double factor);
+
+/* simplex() -- Nelder-Mead downhill simplex minimization
+ *
+ * fitparams[nparam]  in: starting point; out: best point found
+ * lambda[nparam]     initial step along each parameter
+ * minfunc            function to minimize; called as
+ *                    minfunc(point, consts, data, ndata, start, stop)
+ * consts, data, ndata, start, stop   passed through to minfunc unchanged
+ *
+ * The search ends when the relative spread between the highest and the
+ * lowest vertex value drops below TOLERANCE, or after a fixed number of
+ * iterations.  The iteration limit is new: without it the loop never
+ * ended when minfunc produced NaN, because no comparison in the loop
+ * is then true.  If the work space cannot be allocated a message is
+ * printed and fitparams is returned unchanged.
+ */
+void simplex(double *fitparams,
+	     double *lambda,
+	     int nparam,
+	     double (*minfunc) (double *tryparams, double *consts,
+				struct stat_str *data, int ndata,
+				int start, int stop),
+	     double *consts,
+	     void *data,
+	     int ndata,
+	     int start,
+	     int stop
+	     )
+{
+  const long max_iter = 10000;	/* far beyond what a 4-parameter fit needs */
+  long iter;
+  int i, j, ilo, ihi, inhi;
+  double rtol, sum, tmp, ysave, ytry;
+  double *psum, *vals, *ptry, **param;
+
+  psum = (double *) calloc(nparam, sizeof(double));
+  ptry = (double *) calloc(nparam, sizeof(double));
+
+  vals = (double *) calloc(nparam + 1, sizeof(double));
+
+  /* the nparam+1 vertices share one allocation, param[i] points into it */
+  param = (double **) calloc(nparam + 1, sizeof(double *));
+  if (param != NULL) {
+    param[0] = (double *) calloc((nparam + 1) * nparam, sizeof(double));
+  }
+
+  if (psum == NULL || ptry == NULL || vals == NULL ||
+      param == NULL || param[0] == NULL) {
+    fprintf(stderr," cannot allocate simplex work space\n");
+    if (param != NULL) free(param[0]);
+    free(param);
+    free(vals);
+    free(ptry);
+    free(psum);
+    return;
+  }
+
+  for( i = 1 ; i < (nparam + 1) ; i++ ) {
+    param[i] = param[0] + i * nparam;
+  }
+
+  /* Get our N+1 initial parameter values for the simplex */
+
+  for( i = 0 ; i < nparam ; i++ ) {
+    param[0][i] = fitparams[i];
+  }
+
+  /* vertex i is the starting point moved by lambda[i-1] along axis i-1 */
+  for( i = 1 ; i < (nparam + 1) ; i++ ) {
+    for( j = 0 ; j < nparam ; j++ ) {
+      param[i][j] = fitparams[j] + lambda[j] * ( (i - 1) == j ? 1 : 0 );
+    }
+  }
+
+  /* calculate initial values at the simplex nodes */
+
+  for( i = 0 ; i < (nparam + 1) ; i++ ) {
+    vals[i] = minfunc(param[i], consts, data, ndata, start, stop);
+  }
+
+  /* Begin Nelder-Mead simplex algorithm from Numerical Recipes in C */
+
+  /* psum[j] is the sum of coordinate j over all vertices */
+  for( j = 0 ; j < nparam ; j++ ) {
+    for( sum = 0.0, i = 0 ; i < nparam + 1 ; i++ ) {
+      sum += param[i][j];
+    }
+    psum[j] = sum;
+  }
+
+  for( iter = 0 ; ; iter++ ) {
+    /*
+      determine which point is highest (ihi), next highest (inhi) and
+      lowest (ilo) by looping over the points in the simplex
+    */
+    ilo = 0;
+
+    if(vals[0] > vals[1]) { ihi = 0; inhi = 1; }
+    else { ihi = 1; inhi = 0; }
+
+    for( i = 0 ; i < nparam + 1 ; i++) {
+      if( vals[i] <= vals[ilo] ) ilo = i;
+      if( vals[i] > vals[ihi] ) {
+	inhi = ihi;
+	ihi = i;
+      } else if ( vals[i] > vals[inhi] && i != ihi ) inhi = i;
+    }
+
+    /* Are we finished? */
+
+    rtol = 2.0 * fabs(vals[ihi] - vals[ilo]) /
+      (fabs(vals[ihi]) + fabs(vals[ilo]) + TINY);
+
+    if( rtol < TOLERANCE || iter >= max_iter ) {
+
+      /* put the best value and best parameters into the first index */
+
+      tmp = vals[0];
+      vals[0] = vals[ilo];
+      vals[ilo] = tmp;
+
+      for( i = 0 ; i < nparam ; i++ ) {
+	tmp = param[0][i];
+	param[0][i] = param[ilo][i];
+	param[ilo][i] = tmp;
+      }
+
+      break;
+    }
+
+    /* Begin a new iteration */
+
+    /* first, extrapolate by -1 through the face of the simplex across from ihi */
+
+    ytry = evalfunc(param, vals, psum, ptry, nparam, minfunc, consts,
+		    data, ndata, start, stop, ihi, -1.0);
+
+    if( ytry <= vals[ilo] ) {
+
+      /* Good result, try additional extrapolation by 2 */
+
+      ytry = evalfunc(param, vals, psum, ptry, nparam, minfunc, consts,
+		      data, ndata, start, stop, ihi, 2.0);
+
+    } else if ( ytry >= vals[inhi] ) {
+
+      /* no good, look for an intermediate lower point by contracting */
+
+      ysave = vals[ihi];
+      ytry = evalfunc(param, vals, psum, ptry, nparam, minfunc, consts,
+		      data, ndata, start, stop, ihi, 0.5);
+
+      if( ytry >= ysave ) {
+
+	/* Still no good.  Contract around lowest (best) point. */
+
+	for( i = 0 ; i < nparam + 1 ; i++ ) {
+	  if( i != ilo ) {
+	    /* psum serves as scratch space here, it is rebuilt below */
+	    for ( j = 0 ; j < nparam ; j++ ) {
+	      param[i][j] = psum[j] = 0.5 * (param[i][j] + param[ilo][j]);
+	    }
+	    vals[i] = minfunc(psum, consts, data, ndata, start, stop);
+	  }
+	}
+
+	for( j = 0 ; j < nparam ; j++ ) {
+	  for( sum = 0.0, i = 0 ; i < nparam + 1 ; i++ ) {
+	    sum += param[i][j];
+	  }
+	  psum[j] = sum;
+	}
+      }
+    }
+  }
+
+  for( i = 0 ; i < nparam ; i++ ) {
+    fitparams[i] = param[0][i];
+  }
+
+  free(ptry);
+  free(param[0]);
+  free(param);
+  free(vals);
+  free(psum);
+}
+
+
+/* evalfunc() -- try moving the highest simplex vertex
+ *
+ * Builds the trial point ptry from the vertex sums psum and the vertex
+ * param[ihi] using the given factor, and evaluates minfunc there.  If
+ * the trial value is lower than vals[ihi], the trial point replaces
+ * vertex ihi and vals[ihi] and psum are updated.
+ *
+ * Returns the value of minfunc at the trial point.
+ */
+double evalfunc(double **param,
+		double *vals,
+		double *psum,
+		double *ptry,
+		int nparam,
+		double (*minfunc)(double *tryparam, double *consts,
+				  struct stat_str *data, int ndata,
+				  int start, int stop),
+		double *consts,
+		void *data,
+		int ndata, int start, int stop,
+		int ihi,
+		double factor) {
+
+  double *worst = param[ihi];
+  double w_sum, w_worst, y_new;
+  int k;
+
+  w_sum = (1.0 - factor) / nparam;
+  w_worst = w_sum - factor;
+
+  for (k = 0; k < nparam; k++) {
+    ptry[k] = psum[k] * w_sum - worst[k] * w_worst;
+  }
+
+  y_new = minfunc(ptry, consts, data, ndata, start, stop);
+
+  if (!(y_new < vals[ihi])) return y_new;	/* no improvement, keep the vertex */
+
+  vals[ihi] = y_new;
+  for (k = 0; k < nparam; k++) {
+    psum[k] += ptry[k] - worst[k];
+    worst[k] = ptry[k];
+  }
+
+  return y_new;
+}
+
+/* end of Nelder-Mead simplex code */
+
+/* proc_hist_n() -- unscaled statistics: mean and variance of the scores
+ *
+ * Computes mean and variance of the scores that do not exceed the
+ * threshold from calc_thresh() (entries above it are excluded by
+ * negating sptr[].n1), then, unless trimming is not possible, repeats
+ * up to 5 rounds of removing entries whose find_zn() value is above
+ * zstrim or below the lower limit and recomputing mean and variance
+ * from the entries that remain.  Results go to pu->r_u.rg, a summary
+ * line to histp->stat_info.
+ *
+ * Returns AVE_STATS, or the result of proc_hist_a() when the initial
+ * variance is below 0.01.
+ *
+ * Changes from the earlier version:
+ *  - scores removed in one round stay removed from the running sum and
+ *    count in later rounds (they used to be added back), and the
+ *    variance is divided by the number of remaining scores - 1;
+ *  - mean_var_sqrt follows mean_var when the latter is replaced;
+ *  - the reported Lambda is PI_SQRT6/sqrt(variance); the square root
+ *    was taken twice;
+ *  - no preprocessor directives inside the sprintf() argument list.
+ */
+int
+proc_hist_n(struct stat_str *sptr, int nstats, struct score_count_s s_info,
+	    struct pstruct *ppst, struct hist_str *histp,
+	    int do_trim, struct pstat_str *pu)
+{
+  int i, j, t_j;
+  double s_score, s2_score, ssd, zstrim, mean, var;
+  double t_score;
+  int nit, max_hscore;
+  char s_string[128];
+  char *f_string;
+  int no_trim = 0;
+  /* entries with a find_zn() value below this are removed as well */
+#ifndef NORMAL_DIST
+  const double z_floor = 20.0;
+#else
+  const double z_floor = 0.0;
+#endif
+
+  f_string = histp->stat_info;
+
+  max_hscore = calc_thresh(ppst, sptr, nstats, s_info, pu->ngLambda,
+			   pu->ngK, pu->ngH, &zstrim);
+
+  if (do_trim && zstrim < 0.0) {
+    /* too few scores to do a z-trim */
+    /* cannot use proc_hist_a(), because proc_hist_a() calls proc_hist_n() */
+    no_trim = 1;
+  }
+
+  s_score = s2_score = 0.0;
+
+  /* calculate mean; j counts the scores that are included */
+  for (j=0, i=0; i < nstats; i++) {
+    if ( sptr[i].score <= max_hscore) {
+      s_score += (double)sptr[i].score;
+      j++;
+    }
+    else { sptr[i].n1 = -sptr[i].n1;}	/* negative n1 marks an excluded entry */
+  }
+
+  if (j > 1) {
+    pu->r_u.rg.mu = mean = s_score/(double)j;
+
+    /* calculate variance */
+    for (i=0; i < nstats; i++) {
+      if (sptr[i].n1 < 0) continue;
+      ssd = (double)sptr[i].score - mean;
+      s2_score += ssd * ssd;
+    }
+
+    var = pu->r_u.rg.mean_var = s2_score/(double)(j-1);
+    pu->r_u.rg.mean_var_sqrt = sqrt(var);
+  }
+  else {
+    pu->r_u.rg.mu = 50.0;
+    pu->r_u.rg.mean_var = 10.0;
+    pu->r_u.rg.mean_var_sqrt = sqrt(10.0);
+  }
+
+  if (pu->r_u.rg.mean_var < 0.01) {
+    return proc_hist_a(sptr, nstats, s_info, ppst, histp, do_trim, pu);
+  }
+
+  /* now remove some scores */
+  if (no_trim==0) {
+    nit = 5;
+    pu->r_u.rg.n_trimmed = 0;
+    while (nit-- > 0) {
+      t_score = 0.0;	/* sum and count of the scores removed in this round */
+      t_j = 0;
+      for (i=0; i< nstats; i++) {
+	if (sptr[i].n1 < 0) continue;
+	ssd = find_zn(sptr[i].score,sptr[i].escore,sptr[i].n1,sptr[i].comp, pu);
+	if (ssd > zstrim || ssd < z_floor) {
+	  /* fprintf(stderr,"removing %3d %3d %4.1f\n",
+	     sptr[i].score, sptr[i].n1,ssd); */
+	  t_score += (double)sptr[i].score;
+	  t_j++;
+	  pu->r_u.rg.n_trimmed++;
+	  histp->entries--;
+	  sptr[i].n1 = -sptr[i].n1;
+	}
+      }
+
+      /* s_score and j now describe the scores that remain */
+      s_score -= t_score;
+      j -= t_j;
+
+      if (j > 1 ) {
+	mean = pu->r_u.rg.mu = s_score/(double)j;
+
+	/* calculate variance */
+	s2_score = 0.0;
+	for (i=0; i < nstats; i++) {
+	  if ( sptr[i].n1 > 0 && sptr[i].score <= max_hscore) {
+	    ssd = (double)sptr[i].score - mean;
+	    s2_score += ssd * ssd;
+	  }
+	}
+	var = pu->r_u.rg.mean_var = s2_score/(double)(j-1);
+	pu->r_u.rg.mean_var_sqrt = sqrt(var);
+      }
+      else {
+	pu->r_u.rg.mu = 50.0;
+	pu->r_u.rg.mean_var = 10.0;
+	pu->r_u.rg.mean_var_sqrt = sqrt(10.0);
+      }
+
+      if (pu->r_u.rg.mean_var < 0.01) {
+	pu->r_u.rg.mean_var = (pu->r_u.rg.mu > 1.0) ? pu->r_u.rg.mu: 1.0;
+	pu->r_u.rg.mean_var_sqrt = sqrt(pu->r_u.rg.mean_var);
+      }
+
+      if (pu->r_u.rg.n_trimmed < LHISTC) {
+	break;
+      }
+    }
+  }
+
+  if (ppst->zsflag < 10) s_string[0]='\0';
+  else if (ppst->zs_win > 0) {
+    sprintf(s_string,"(shuffled [%d], win: %d)",nstats,ppst->zs_win);
+  }
+  else {
+    sprintf(s_string,"(shuffled [%d])",nstats);
+  }
+
+#ifndef NORMAL_DIST
+  sprintf(f_string,
+	  "%s Unscaled statistics: mu= %6.4f var=%6.4f; Lambda= %6.4f",
+	  s_string, pu->r_u.rg.mu,pu->r_u.rg.mean_var,
+	  PI_SQRT6/sqrt(pu->r_u.rg.mean_var));
+#else
+  sprintf(f_string,
+	  "%s Unscaled normal statistics: mu= %6.4f var=%6.4f Ztrim: %d",
+	  s_string, pu->r_u.rg.mu,pu->r_u.rg.mean_var, pu->r_u.rg.n_trimmed);
+#endif
+  return AVE_STATS;
+}
+
+/*
+This routine calculates the maximum likelihood estimates for the
+extreme value distribution exp(-exp(-(x-a)/b)) using the formula
+
+ <lambda> = x_m - sum{ x[i] * exp (-x[i]<lambda>)}/sum{exp (-x[i]<lambda>)}
+ <a> = -<1/lambda> log ( (1/nlib) sum { exp(-x[i]/<lambda> } )
+
+ The <a> parameter can be transformed into the K
+ of the formula: 1 - exp ( - K m n exp ( - lambda S ))
+ using the transformation: 1 - exp ( -exp -(lambda S + log(K m n) ))
+ 1 - exp ( -exp( - lambda ( S + log(K m n) / lambda))
+
+ a = log(K m n) / lambda
+ a lambda = log (K m n)
+ exp(a lambda) = K m n
+ but from above: a lambda = log (1/nlib sum{exp( -x[i]*lambda)})
+ so: K m n = (1/n sum{ exp( -x[i] *lambda)})
+ K = sum{}/(nlib m n )
+
+*/
+
+/* alloc_hist() -- allocate (if necessary) and clear the length histogram
+ *
+ * When llen->hist is NULL the four arrays (hist, score_sums,
+ * score2_sums, score_var) are allocated with llen->max+1 entries each;
+ * in every case all entries are then set to zero.
+ */
+void
+alloc_hist(struct llen_str *llen)
+{
+  const int n_bins = llen->max + 1;
+  int k;
+
+  if (llen->hist == NULL) {
+    llen->hist = (int *)calloc((size_t)n_bins,sizeof(int));
+    llen->score_sums = (double *)calloc((size_t)n_bins,sizeof(double));
+    llen->score2_sums =(double *)calloc((size_t)n_bins,sizeof(double));
+    llen->score_var = (double *)calloc((size_t)n_bins,sizeof(double));
+  }
+
+  /* needed when existing arrays are reused */
+  for (k = 0; k < n_bins; k++) {
+    llen->hist[k] = 0;
+    llen->score2_sums[k] = 0.0;
+    llen->score_sums[k] = 0.0;
+    llen->score_var[k] = 0.0;
+  }
+}
+
+void
+free_hist(struct llen_str *llen)
+{
+  /* nothing to release when the histogram was never allocated;
+     hist == NULL doubles as the "not allocated" flag for all four arrays */
+  if (llen->hist == NULL) return;
+
+  free(llen->score_var);
+  free(llen->score2_sums);
+  free(llen->score_sums);
+  free(llen->hist);
+  llen->hist = NULL;
+}
+
+/* inithist() -- prepare llen for a new set of scores
+ *
+ * Sets the number of bins to MAX_LLEN, allocates/clears the arrays via
+ * alloc_hist() and resets the score and length ranges and the count of
+ * short sequences.  ppst and max_hscore are not used.
+ */
+void
+inithist(struct llen_str *llen, struct pstruct *ppst, int max_hscore)
+{
+  /* alloc_hist() sizes the arrays from llen->max */
+  llen->max = MAX_LLEN;
+  alloc_hist(llen);
+
+  /* ranges start out inverted, addhist() narrows them */
+  llen->min_score = 10000;
+  llen->max_score = -1;
+  llen->min_length = 10000;
+  llen->max_length = 0;
+
+  llen->zero_s = 0;
+}
+
+/* addhist() -- add one score/length pair to the length histogram
+ *
+ * Sequences shorter than LENGTH_CUTOFF are only counted in
+ * llen->zero_s (and set llen->min_score to 0).  Otherwise the score
+ * and length ranges are updated, the score is limited to max_hscore,
+ * and count, score sum and squared-score sum of the bin
+ * (int)(LN_FACT*log(length)+0.5), limited to 0..llen->max, are updated.
+ */
+void
+addhist(struct llen_str *llen, int score, int length, int max_hscore)
+{
+  int bin;
+  double sc;
+
+  if (length < LENGTH_CUTOFF) {
+    llen->min_score = 0;
+    llen->zero_s++;
+    return;
+  }
+
+  /* ranges are taken from the score before it is capped */
+  if (score < llen->min_score) llen->min_score = score;
+  if (score > llen->max_score) llen->max_score = score;
+
+  if (length > llen->max_length) llen->max_length = length;
+  if (length < llen->min_length) llen->min_length = length;
+
+  if (score > max_hscore) score = max_hscore;
+
+  /* the same log(length) binning is used with and without LOCAL_SCORE */
+  bin = (int)(LN_FACT*log((double)length)+0.5);
+
+  if (bin < 0) bin = 0;
+  if (bin > llen->max) bin = llen->max;
+
+  sc = (double)score;
+  llen->hist[bin]++;
+  llen->score_sums[bin] += sc;
+  llen->score2_sums[bin] += sc * sc;
+}
+
+/* histogram will go from z-scores of 20 .. 100 with mean 50 and z=10 */
+
+/* inithistz() -- set up the z-score histogram in histp
+ *
+ * mh      requested number of histogram lines
+ * histp   histogram to initialize; hist_a is allocated when it is NULL,
+ *         otherwise its first maxh entries are cleared
+ * zs_off  offset; rounded to an integer, it shifts min_hist (20) and
+ *         max_hist (120) by 10 per unit
+ *
+ * The bin width histint is the score range divided by mh, rounded, and
+ * maxh is the resulting number of bins.  mh and histint are kept at 1
+ * or more, so that a large (or non-positive) mh no longer leads to a
+ * division by zero.
+ */
+void
+inithistz(int mh, struct hist_str *histp, double zs_off)
+{
+  int i, izs_off;
+  double span;
+
+  izs_off = (int)(zs_off + 0.5);
+
+  histp->z_calls = 0;
+
+  histp->min_hist = 20 + izs_off*10;
+  histp->max_hist = 120 + izs_off * 10;
+
+  span = (double)(histp->max_hist - histp->min_hist + 2);
+
+  if (mh < 1) mh = 1;
+
+  histp->histint = (int)(span/(double)mh+0.5);
+  if (histp->histint < 1) histp->histint = 1;	/* happens when mh > 2*span */
+
+  histp->maxh = (int)(span/(double)histp->histint+0.5);
+
+  if (histp->hist_a==NULL) {
+    if ((histp->hist_a=(int *)calloc((size_t)histp->maxh,sizeof(int)))==
+	NULL) {
+      fprintf(stderr," cannot allocate %d for histogram\n",histp->maxh);
+      histp->histflg = 0;
+    }
+    else histp->histflg = 1;
+  }
+  else {
+    /* NOTE(review): an existing hist_a is assumed to hold at least maxh
+       entries; its allocated size is not recorded here -- confirm that
+       mh does not grow between calls */
+    for (i=0; i<histp->maxh; i++) histp->hist_a[i]=0;
+  }
+  histp->entries = 0;
+}
+
+/* Table of 100 fixed pseudo-random offsets; addhistz() adds one of
+   them (cycling through the table) to each z-score before it is
+   rounded to an integer bin.  Each row holds five values. */
+static double nrv[100]={
+ 0.3098900570,-0.0313400923, 0.1131975903,-0.2832547606, 0.0073672659,
+ 0.2914489107, 0.4209306311,-0.4630181404, 0.3326537896, 0.0050140359,
+ -0.1117435426,-0.2835630301, 0.2302997065,-0.3102716394, 0.0819894916,
+ -0.1676455701,-0.3782225018,-0.3204509938,-0.3594969187,-0.0308950398,
+ 0.2922813812, 0.1337170751, 0.4666577031,-0.2917784349,-0.2438179916,
+ 0.3002301394, 0.0231147123, 0.5687927366,-0.2318208709,-0.1476839273,
+ -0.0385043851,-0.1213476523, 0.1486341995, 0.1027917167, 0.1409192644,
+ -0.3280652579, 0.4232041455, 0.0775993309, 0.1159071787, 0.2769424442,
+ 0.3197284751, 0.1507346903, 0.0028580909, 0.4825103412,-0.0496843610,
+ -0.2754357656, 0.6021881753,-0.0816123956,-0.0899148991, 0.4847183201,
+ 0.2151621865,-0.4542246220, 0.0690709102, 0.2461894193, 0.2126042295,
+ -0.0767060668, 0.4819746149, 0.3323031326, 0.0177600676, 0.1143185210,
+ 0.2653977455, 0.0921872958,-0.1330986718, 0.0412287716,-0.1691604748,
+ -0.0529679078,-0.0194157955,-0.6117493924, 0.1199067932, 0.0210243193,
+ -0.5832259838,-0.1685528664, 0.0008591271,-0.1120347822, 0.0839125069,
+ -0.2787486831,-0.1937017962,-0.1915733940,-0.7888453635,-0.3316745163,
+ 0.1180885226,-0.3347001067,-0.2477492636,-0.2445697600, 0.0001342482,
+ -0.0015759812,-0.1516473992,-0.5202267615, 0.2136975210, 0.2500423188,
+ -0.2402926401,-0.1094186280,-0.0618869933,-0.0815221188, 0.2623337275,
+ 0.0219427302,-0.1774469919, 0.0828245026,-0.3271952808,-0.0632898028};
+
+/* addhistz() - add one z-score to the histogram in *histp.
+
+   zs     z-score to record
+   histp  histogram state prepared by inithistz(); nothing is done
+          when histp is NULL or its hist_a array was never allocated
+
+   A small offset from the nrv[] table is added before rounding.
+   entries is incremented only for rounded values in 0..120; the value
+   is then clamped to min_hist..max_hist and its bin is incremented.
+*/
+void
+addhistz(double zs, struct hist_str *histp)
+{
+  int ih, zi;
+  double rv;
+
+  /* no histogram to update (inithistz() leaves hist_a NULL when the
+     allocation fails) */
+  if (histp == NULL || histp->hist_a == NULL) return;
+
+  rv = nrv[histp->z_calls++ % 100];
+  zi = (int)(zs + 0.5+rv );
+
+  if ((zi >= 0) && (zi <= 120)) histp->entries++;
+
+  if (zi < histp->min_hist) zi = histp->min_hist;
+  if (zi > histp->max_hist) zi = histp->max_hist;
+
+  ih = (zi - histp->min_hist)/histp->histint;
+
+  /* maxh is a rounded quotient, so for some bin widths the top value
+     maps one past the last allocated bin; keep it in range */
+  if (ih >= histp->maxh) ih = histp->maxh - 1;
+
+  histp->hist_a[ih]++;
+}
+
+/* addhistzp() does not increase histp->entries since addhist did it already */
+/*
+void
+addhistzp(double zs, struct hist_str *histp)
+{
+ int ih, zi;
+ double rv;
+
+ rv = nrv[histp->z_calls++ %100];
+ zi = (int)(zs + 0.5 + rv);
+
+ if (zi < histp->min_hist) zi = histp->min_hist;
+ if (zi > histp->max_hist) zi = histp->max_hist;
+
+ ih = (zi - histp->min_hist)/histp->histint;
+
+ histp->hist_a[ih]++;
+}
+*/
+
+/* Remove one (score, length) observation from the length-binned
+   histogram *llen: the inverse of the per-bin updates done by
+   addhist().  The score is capped at max_hscore first.  With
+   LOCAL_SCORE defined, non-positive scores and lengths below
+   LENGTH_CUTOFF are ignored.  *entries is not modified. */
+void
+prune_hist(struct llen_str *llen, int score, int length, int max_hscore,
+           long *entries)
+{
+  int bin;
+  double s;
+
+#ifdef LOCAL_SCORE
+  if (score <= 0 || length < LENGTH_CUTOFF) return;
+#endif
+
+  s = (double)((score > max_hscore) ? max_hscore : score);
+
+  /* same bin formula for LOCAL_SCORE and non-LOCAL_SCORE builds
+     (a plain "llength = length" alternative was disabled) */
+  bin = (int)(LN_FACT*log((double)length)+0.5);
+
+  /* clamp to the valid bin range 0 .. llen->max */
+  if (bin < 0) bin = 0;
+  if (bin > llen->max) bin = llen->max;
+
+  llen->hist[bin]--;
+  llen->score_sums[bin] -= s;
+  llen->score2_sums[bin] -= s * s;
+
+/* (*entries)--; histp->entries is not yet initialized */
+}
+
+/* find_zr uses rg.rho, rg.mu, and rg.mean_var_sqrt */
+/* find_zr2 uses rho, mu, but calculates var from regression using rho2, mu2 */
+
+/* fit_llen: no trimming
+ (1) regress scores vs log(n) using weighted variance
+ (2) calculate mean variance after length regression
+ (3) regress residual variance vs log(n), calculate REG2_STATS parameters
+ (4) set variance cutoff for bin exclusion
+*/
+
+/* fit_llen() - regress score on length bin and derive variance terms.
+
+   Reads:  llen->hist[], llen->score_sums[], llen->score2_sums[],
+           llen->max, llen->max_length
+   Writes: llen->min, llen->score_var[], llen->fit_flag, pr->nb_tot,
+           pr->rho, pr->mu and, when the fit succeeds, pr->rho_e,
+           pr->mu_e, pr->mean_var, pr->mean_var_sqrt, pr->var_e,
+           pr->rho2, pr->mu2, pr->var_cutoff
+
+   The x value of bin j is j + 0.5.  Only bins holding more than one
+   score are used.
+
+   Returns early with fit_flag = 0, rho = 0 and mu = sum_y/n_w when
+   fewer than 5 bins are usable or the determinant is <= 0.001; the
+   variance outputs are not written on that path.
+*/
+void
+fit_llen(struct llen_str *llen, struct rstat_str *pr)
+{
+ int j;
+ int n;
+ int n_size;
+ double x, y2, u, z;
+ double mean_x, mean_y, var_x, var_y, covar_xy;
+ double mean_y2, covar_xy2, var_y2, dllj;
+
+ double sum_x, sum_y, sum_x2, sum_xy, sum_v, det, n_w;
+
+ /* now fit scores to best linear function of log(n), using
+ simple linear regression */
+
+ /* find the shortest bin with data */
+ for (llen->min=0; llen->min < llen->max; llen->min++)
+ if (llen->hist[llen->min]) break;
+
+ /* calculate the mean/variance for each length interval */
+ /* score_var[j] is the bin variance divided by (count - 1), floored at 0.1;
+ n_size counts the usable bins */
+ for (n_size=0,j = llen->min; j < llen->max; j++) {
+ if (llen->hist[j] > 1) {
+ dllj = (double)llen->hist[j];
+ llen->score_var[j] = llen->score2_sums[j]/dllj
+ - (llen->score_sums[j]/dllj)*(llen->score_sums[j]/dllj);
+ llen->score_var[j] /= (double)(llen->hist[j]-1);
+ if (llen->score_var[j] <= 0.1 ) llen->score_var[j] = 0.1;
+ n_size++;
+ }
+ }
+
+ pr->nb_tot = n_size;
+
+ /* accumulate the weighted sums for the regression; every term is
+ divided by score_var[j] */
+ n_w = 0.0;
+ sum_x = sum_y = sum_x2 = sum_xy = sum_v = 0;
+ for (j = llen->min; j < llen->max; j++) {
+ if (llen->hist[j] > 1) {
+ x = j + 0.5;
+ dllj = (double)llen->hist[j];
+ n_w += dllj/llen->score_var[j];
+ sum_x += dllj * x / llen->score_var[j] ;
+ sum_y += llen->score_sums[j] / llen->score_var[j];
+ sum_x2 += dllj * x * x /llen->score_var[j];
+ sum_xy += x * llen->score_sums[j]/llen->score_var[j];
+ }
+ }
+
+ /* NOTE(review): if no bin has more than one score, n_w is 0 and
+ sum_y/n_w below is 0/0 -- confirm callers never reach this */
+ if (n_size < 5 ) {
+ llen->fit_flag=0;
+ pr->rho = 0;
+ pr->mu = sum_y/n_w;
+ return;
+ }
+ else {
+ det = n_w * sum_x2 - sum_x * sum_x;
+ if (det > 0.001) {
+ llen->fit_flag = 1;
+ pr->rho = (n_w * sum_xy - sum_x * sum_y)/det;
+ pr->rho_e = n_w/det;
+ pr->mu = (sum_x2 * sum_y - sum_x * sum_xy)/det;
+ pr->mu_e = sum_x2/det;
+ }
+ else {
+ llen->fit_flag = 0;
+ pr->rho = 0;
+ pr->mu = sum_y/n_w;
+ return;
+ }
+ }
+
+ /* this code duplicates the calculation above
+ det = n_w * sum_x2 - sum_x * sum_x;
+ pr->rho = (n_w * sum_xy - sum_x * sum_y)/det;
+ pr->mu = (sum_x2 * sum_y - sum_x * sum_xy)/det;
+ */
+
+ n = 0;
+ mean_x = mean_y = mean_y2 = 0.0;
+ var_x = var_y = 0.0;
+ covar_xy = covar_xy2 = 0.0;
+
+ /* unweighted moments over the usable bins; note this loop and the
+ next one include bin llen->max, unlike the loops above */
+ for (j = llen->min; j <= llen->max; j++) {
+ if (llen->hist[j] > 1 ) {
+ n += llen->hist[j];
+ x = (double)j + 0.5;
+ mean_x += (double)llen->hist[j] * x;
+ mean_y += llen->score_sums[j];
+ var_x += (double)llen->hist[j] * x * x;
+ var_y += llen->score2_sums[j];
+ covar_xy += x * llen->score_sums[j];
+ }
+ }
+
+ mean_x /= n; mean_y /= n;
+ var_x = var_x / n - mean_x * mean_x;
+ var_y = var_y / n - mean_y * mean_y;
+
+ covar_xy = covar_xy / n - mean_x * mean_y;
+/*
+ pr->rho = covar_xy / var_x;
+ pr->mu = mean_y - pr->rho * mean_x;
+*/
+
+ /* y2 is the bin's sum of squared residuals about the fitted line
+ u = rho * x + mu */
+ mean_y2 = covar_xy2 = var_y2 = 0.0;
+ for (j = llen->min; j <= llen->max; j++) {
+ if (llen->hist[j] > 1) {
+ x = (double)j + 0.5;
+ u = pr->rho * x + pr->mu;
+ y2 = llen->score2_sums[j] - 2.0 * llen->score_sums[j] * u + llen->hist[j] * u * u;
+ mean_y2 += y2;
+ var_y2 += y2 * y2;
+ covar_xy2 += x * y2;
+ }
+ }
+
+ pr->mean_var = mean_y2 /= (double)n;
+ pr->mean_var_sqrt = sqrt(pr->mean_var);
+ covar_xy2 = covar_xy2 / (double)n - mean_x * mean_y2;
+
+ /* fallback for a near-zero (or negative) mean variance; note that
+ mean_var_sqrt, computed above, is not recomputed here */
+ if (pr->mean_var <= 0.01) {
+ llen->fit_flag = 0;
+ pr->mean_var = (pr->mu > 1.0) ? pr->mu: 1.0;
+ }
+
+ /*
+ fprintf(stderr," rho1/mu1: %.4f/%.4f mean_var %.4f\n",
+ pr->rho*LN_FACT,pr->mu,pr->mean_var);
+ */
+ if (n > 1) pr->var_e = (var_y2/n - mean_y2 * mean_y2)/(n-1);
+ else pr->var_e = 0.0;
+
+ /* second regression: residual variance against bin position */
+ if (llen->fit_flag) {
+ pr->rho2 = covar_xy2 / var_x;
+ pr->mu2 = pr->mean_var - pr->rho2 * mean_x;
+ }
+ else {
+ pr->rho2 = 0;
+ pr->mu2 = pr->mean_var;
+ }
+
+ /* z is the length at which the variance cutoff is evaluated; it is
+ never allowed below 2*LENGTH_CUTOFF */
+ if (pr->rho2 < 0.0 )
+ z = (pr->rho2 * LN_FACT*log((double)llen->max_length) + pr->mu2 > 0.0) ? llen->max_length : exp((-1.0 - pr->mu2 / pr->rho2)/LN_FACT);
+ else z = pr->rho2 ? exp((1.0 - pr->mu2 / pr->rho2)/LN_FACT) : LENGTH_CUTOFF;
+ if (z < 2*LENGTH_CUTOFF) z = 2*LENGTH_CUTOFF;
+
+ pr->var_cutoff = pr->rho2 * LN_FACT*log(z) + pr->mu2;
+}
+
+/* used to get parameters for REG2_STATS
+
+ fit_llens: trim high variance bins
+ (1) regress scores vs log(n) using weighted variance
+ (2) regress residuals vs log(n)
+ (3) remove high variance bins
+ (4) calculate mean variance after length regression
+*/
+
+/* fit_llens() - fit with trimming of high-variance bins.
+
+   Performs the weighted regression of score on bin position, regresses
+   the residual variance on bin position (pr->rho2, pr->mu2,
+   pr->var_cutoff), then zeroes llen->hist[j] for every bin whose
+   excess residual is more than three times the mean, adding the
+   removed bins/scores to pr->nb_trimmed and pr->n1_trimmed (these
+   counters are incremented, not reset, here).  Finally calls
+   fit_llen() on what is left.
+
+   Returns early with fit_flag = 0, rho = 0 and mu = sum_y/n_w when
+   fewer than 5 bins are usable or the determinant is <= 0.001.
+*/
+void
+fit_llens(struct llen_str *llen, struct rstat_str *pr)
+{
+ int j;
+ int n, n_u2, n_size;
+ double x, y, y2, u, u2, v, z;
+ double mean_x, mean_y, var_x, var_y, covar_xy;
+ double mean_y2, covar_xy2;
+ double mean_u2, mean_3u2, dllj;
+ double sum_x, sum_y, sum_x2, sum_xy, sum_v, det, n_w;
+
+ /* now fit scores to best linear function of log(n), using
+ simple linear regression */
+
+ /* find the shortest bin with data */
+ /* (here "with data" means more than one score in the bin) */
+ for (llen->min=0; llen->min < llen->max; llen->min++)
+ if (llen->hist[llen->min] > 1) break;
+
+ /* calculate the mean/variance for each length interval */
+ /* score_var[j] is the bin variance divided by (count - 1), floored at 1.0 */
+ for (j = llen->min; j < llen->max; j++) {
+ if (llen->hist[j] > 1) {
+ dllj = (double)llen->hist[j];
+ llen->score_var[j] = (double)llen->score2_sums[j]/dllj
+ - (llen->score_sums[j]/dllj)*(llen->score_sums[j]/dllj);
+ llen->score_var[j] /= (double)(llen->hist[j]-1);
+ if (llen->score_var[j] <= 1.0 ) llen->score_var[j] = 1.0;
+ }
+ }
+
+ /* accumulate the weighted sums for the regression; every term is
+ divided by score_var[j]; n_size counts the usable bins */
+ n_w = 0.0;
+ sum_x = sum_y = sum_x2 = sum_xy = sum_v = 0;
+ for (n_size = 0, j = llen->min; j < llen->max; j++) {
+ if (llen->hist[j] > 1) {
+ x = j + 0.5;
+ dllj = (double)llen->hist[j];
+ n_w += dllj/llen->score_var[j];
+ sum_x += dllj * x / llen->score_var[j] ;
+ sum_y += llen->score_sums[j] / llen->score_var[j];
+ sum_x2 += dllj * x * x /llen->score_var[j];
+ sum_xy += x * llen->score_sums[j]/llen->score_var[j];
+ n_size++;
+ }
+ }
+
+ pr->nb_tot = n_size;
+
+ /* NOTE(review): if no bin has more than one score, n_w is 0 and
+ sum_y/n_w below is 0/0 -- confirm callers never reach this */
+ if (n_size < 5 ) {
+ llen->fit_flag=0;
+ pr->rho = 0;
+ pr->mu = sum_y/n_w;
+ return;
+ }
+ else {
+ det = n_w * sum_x2 - sum_x * sum_x;
+ if (det > 0.001) {
+ llen->fit_flag = 1;
+ pr->rho = (n_w * sum_xy - sum_x * sum_y)/det;
+ pr->rho_e = n_w/det;
+ pr->mu = (sum_x2 * sum_y - sum_x * sum_xy)/det;
+ pr->mu_e = sum_x2/det;
+ }
+ else {
+ llen->fit_flag = 0;
+ pr->rho = 0;
+ pr->mu = sum_y/n_w;
+ return;
+ }
+ }
+
+ n = 0;
+ mean_x = mean_y = mean_y2 = 0.0;
+ var_x = var_y = 0.0;
+ covar_xy = covar_xy2 = 0.0;
+
+ /* unweighted moments over the usable bins; this loop includes bin
+ llen->max, unlike the loops above */
+ for (j = llen->min; j <= llen->max; j++)
+ if (llen->hist[j] > 1 ) {
+ n += llen->hist[j];
+ x = (double)j + 0.5;
+ dllj = (double)llen->hist[j];
+ mean_x += dllj * x;
+ mean_y += llen->score_sums[j];
+ var_x += dllj * x * x;
+ var_y += llen->score2_sums[j];
+ covar_xy += x * llen->score_sums[j];
+ }
+ mean_x /= n; mean_y /= n;
+ var_x = var_x / n - mean_x * mean_x;
+ var_y = var_y / n - mean_y * mean_y;
+
+ covar_xy = covar_xy / n - mean_x * mean_y;
+
+ /* to this point the calculation follows fit_llen() and fit_llen2();
+ the variance floor and the first-bin test differ slightly */
+
+ /* now regress on the residual variance */
+ mean_y2 = covar_xy2 = 0.0;
+ for (j = llen->min; j <= llen->max; j++)
+ if (llen->hist[j] > 1) {
+ x = (double)j + 0.5;
+ u = pr->rho * x + pr->mu;
+ y2 = llen->score2_sums[j] - 2 * llen->score_sums[j] * u + llen->hist[j] * u * u;
+ mean_y2 += y2;
+ covar_xy2 += x * y2;
+ }
+
+ mean_y2 /= n;
+ covar_xy2 = covar_xy2 / n - mean_x * mean_y2;
+ /* calculate parameters for variance regression */
+ pr->rho2 = covar_xy2 / var_x;
+ pr->mu2 = mean_y2 - pr->rho2 * mean_x;
+
+ /* z is the length at which the variance cutoff is evaluated; it is
+ never allowed below 2*LENGTH_CUTOFF */
+ if (pr->rho2 < 0.0 )
+ z = (pr->rho2 * LN_FACT*log((double)llen->max_length) + pr->mu2 > 0.0) ? llen->max_length : exp((-1.0 - pr->mu2 / pr->rho2)/LN_FACT);
+ else z = pr->rho2 ? exp((1.0 - pr->mu2 / pr->rho2)/LN_FACT) : LENGTH_CUTOFF;
+ if (z < 2* LENGTH_CUTOFF) z = 2*LENGTH_CUTOFF;
+
+ pr->var_cutoff = pr->rho2*LN_FACT*log(z) + pr->mu2;
+
+/* fprintf(stderr,"\nminimum allowed predicted variance (%0.2f) at n = %.0f\n",
+ pr->var_cutoff,z);
+*/
+ /* per bin: x is the predicted score, v the predicted variance
+ (floored at var_cutoff), u2 the excess of the summed squared
+ residuals over v * count.  score_var[j] is overwritten with
+ u2*u2/(count-1), or -1.0 for bins that are not usable */
+ mean_u2 = 0.0;
+ n_u2 = 0;
+ for ( j = llen->min; j < llen->max; j++) {
+ y = j+0.5;
+ dllj = (double)llen->hist[j];
+ x = pr->rho * y + pr->mu;
+ v = pr->rho2 * y + pr->mu2;
+ if (v < pr->var_cutoff) v = pr->var_cutoff;
+ if (llen->hist[j]> 1) {
+ u2 = (llen->score2_sums[j] - 2 * x * llen->score_sums[j] + dllj * x * x) - v*dllj;
+ mean_u2 += llen->score_var[j] = u2*u2/(llen->hist[j]-1);
+ n_u2++;
+ }
+ else llen->score_var[j] = -1.0;
+ }
+
+ /* mean residual variance after both length and variance with length
+ considered */
+ mean_u2 = sqrt(mean_u2/(double)n_u2);
+ mean_3u2 = mean_u2*3.0;
+
+ /* trim all bins with variance > mean_3u2 */
+ /* trimming zeroes the bin count in place, so the bin is ignored by
+ the fit_llen() call below */
+ for (j = llen->min; j < llen->max; j++) {
+ if (llen->hist[j] <= 1) continue;
+ if (sqrt(llen->score_var[j]) > mean_3u2) {
+ /* fprintf(stderr," removing %d %d %.2f\n",
+ j, (int)(exp((double)j/LN_FACT)-0.5),
+ sqrt(llen->score_var[j]));
+ */
+ pr->nb_trimmed++;
+ pr->n1_trimmed += llen->hist[j];
+ llen->hist[j] = 0;
+ }
+ }
+ /* fit what is left */
+ fit_llen(llen, pr);
+}
+
+/* s2str is used to regress against residual variance */
+/* fit_llen2() fills one entry per length bin: s is the bin's summed
+   squared residual about the fitted line, n the number of scores in
+   the bin */
+struct s2str {double s; int n;};
+/* sorts the n entries at sptr in place, by decreasing s */
+void s2_sort ( struct s2str *sptr, int n);
+
+/* fit_llen2 - used by REGI_STATS
+
+ (1) does the normal fit_llen() regression fitting
+ (2) sorts the residual variance
+ (3) excludes 5% lowest, highest bins by residual variance
+ (4) recalculates mean_var without excluded bins
+*/
+/* fit_llen2() - regression fit followed by a trimmed mean variance.
+
+   Reads:  llen->hist[], llen->score_sums[], llen->score2_sums[]
+   Writes: llen->min, llen->max (lowered to the longest bin with
+           data), llen->score_var[], llen->fit_flag, pr->nb_tot,
+           pr->rho, pr->mu and, when the fit succeeds, pr->rho_e,
+           pr->mu_e, pr->mean_var, pr->mean_var_sqrt, pr->var_e;
+           pr->n1_trimmed and pr->nb_trimmed are incremented.
+
+   The x value of bin j is j + 0.5.  The regression weights each bin
+   by count/score_var, as fit_llen() and fit_llens() do.
+
+   Returns early with fit_flag = 0 when fewer than 5 bins are usable,
+   when the determinant is <= 0.001, or when the scratch array cannot
+   be allocated.
+*/
+void
+fit_llen2(struct llen_str *llen, struct rstat_str *pr)
+{
+ int j;
+ int n, n_y2, llen_delta, llen_del05;
+ int n_size;
+ double x, y2, u;
+ double mean_x, mean_y, var_x, var_y, covar_xy;
+ double mean_y2, covar_xy2, dllj;
+ struct s2str *ss2;
+
+ double sum_x, sum_y, sum_x2, sum_xy, sum_v, det, n_w;
+
+ /* now fit scores to best linear function of log(n), using
+ simple linear regression */
+
+ /* find the shortest bin with data */
+ for (llen->min=0; llen->min < llen->max; llen->min++)
+ if (llen->hist[llen->min]) break;
+
+ /* find the longest bin with data */
+ for ( ; llen->max > llen->min; llen->max--)
+ if (llen->hist[llen->max]) break;
+
+ /* calculate the mean/variance for each length interval */
+ /* score_var[j] is the bin variance divided by (count - 1), floored at 1.0;
+ n_size counts the usable bins */
+ for (n_size=0,j = llen->min; j < llen->max; j++) {
+ if (llen->hist[j] > 1) {
+ dllj = (double)llen->hist[j];
+ llen->score_var[j] = llen->score2_sums[j]/dllj
+ - (llen->score_sums[j]/dllj) * (llen->score_sums[j]/dllj);
+ llen->score_var[j] /= (double)(llen->hist[j]-1);
+ if (llen->score_var[j] <= 1.0 ) llen->score_var[j] = 1.0;
+ n_size++;
+ }
+ }
+
+ pr->nb_tot = n_size;
+
+ /* accumulate the weighted sums for the regression; every term,
+ including the weight total n_w, is divided by score_var[j] */
+ n_w = 0.0;
+ sum_x = sum_y = sum_x2 = sum_xy = sum_v = 0;
+ for (j = llen->min; j < llen->max; j++) {
+ if (llen->hist[j] > 1) {
+ x = j + 0.5;
+ dllj = (double)llen->hist[j];
+ /* was hist[j]/dllj, i.e. 1 per bin, which did not match the
+ weights used for the other four sums */
+ n_w += dllj/llen->score_var[j];
+ sum_x += (double)llen->hist[j] * x / llen->score_var[j] ;
+ sum_y += llen->score_sums[j] / llen->score_var[j];
+ sum_x2 += dllj * x * x /llen->score_var[j];
+ sum_xy += x * llen->score_sums[j]/llen->score_var[j];
+ }
+ }
+
+ if (n_size < 5 ) {
+ llen->fit_flag=0;
+ pr->rho = 0;
+ pr->mu = sum_y/n_w;
+ return;
+ }
+ else {
+ det = n_w * sum_x2 - sum_x * sum_x;
+ if (det > 0.001) {
+ llen->fit_flag = 1;
+ pr->rho = (n_w * sum_xy - sum_x * sum_y)/det;
+ pr->rho_e = n_w/det;
+ pr->mu = (sum_x2 * sum_y - sum_x * sum_xy)/det;
+ pr->mu_e = sum_x2/det;
+ }
+ else {
+ llen->fit_flag = 0;
+ pr->rho = 0;
+ pr->mu = sum_y/n_w;
+ return;
+ }
+ }
+
+ /*
+ det = n_w * sum_x2 - sum_x * sum_x;
+ pr->rho = (n_w * sum_xy - sum_x * sum_y)/det;
+ pr->mu = (sum_x2 * sum_y - sum_x * sum_xy)/det;
+ */
+/* fprintf(stderr," rho1/mu1: %.2f/%.2f\n",pr->rho*LN_FACT,pr->mu); */
+
+ n = 0;
+ mean_x = mean_y = mean_y2 = 0.0;
+ var_x = var_y = 0.0;
+ covar_xy = covar_xy2 = 0.0;
+
+ /* unweighted moments over the usable bins; this loop includes bin
+ llen->max, unlike the loops above */
+ for (j = llen->min; j <= llen->max; j++)
+ if (llen->hist[j] > 1 ) {
+ n += llen->hist[j];
+ x = (double)j + 0.5;
+ mean_x += (double)llen->hist[j] * x;
+ mean_y += llen->score_sums[j];
+ var_x += (double)llen->hist[j] * x * x;
+ var_y += llen->score2_sums[j];
+ covar_xy += x * llen->score_sums[j];
+ }
+ mean_x /= n; mean_y /= n;
+ var_x = var_x / n - mean_x * mean_x;
+ var_y = var_y / n - mean_y * mean_y;
+
+ covar_xy = covar_xy / n - mean_x * mean_y;
+
+ /* to this point the calculation follows fit_llen() and fit_llens() */
+
+ /* now prepare to exclude some bins because of excess variance */
+
+ /* allocate space for s2str */
+ if ((ss2=(struct s2str *)calloc(llen->max+1,sizeof(struct s2str)))==NULL) {
+ llen->fit_flag = 0;
+ fprintf(stderr," cannot allocate ss2\n");
+ return;
+ }
+
+ /* calculate residual variance after fit, bin into ss2[] */
+ /* only bins with more than VHISTC scores are entered; the others
+ keep the zeroes written by calloc() */
+ mean_y2 = 0.0;
+ n_y2 = n = 0;
+ for (j = llen->min; j <= llen->max; j++) {
+ if (llen->hist[j] > VHISTC) {
+ n++;
+ n_y2 += ss2[j].n = llen->hist[j];
+ x = (double)j + 0.5;
+ u = pr->rho * x + pr->mu;
+ ss2[j].s = y2 = llen->score2_sums[j] - 2*llen->score_sums[j]*u + llen->hist[j]*u*u;
+ mean_y2 += y2;
+ }
+ }
+
+ /* untrimmed estimate; replaced below when the trimmed one is usable */
+ pr->mean_var = mean_y2/(double)n_y2;
+ pr->mean_var_sqrt = sqrt(pr->mean_var);
+
+ /* sort the ss2[.squared residuals, n] by decreasing squared residuals */
+ s2_sort(ss2+llen->min,llen->max-llen->min+1);
+
+ /* fprintf(stderr,"llen->min: %d, max: %d\n",llen->min,llen->max); */
+ /* llen_delta has the number of bins with data */
+ llen_delta = 0;
+ for (j=llen->min; j<=llen->max; j++) {
+ if (ss2[j].n > 1) { llen_delta++;}
+ }
+
+ llen_del05 = llen_delta/20; /* top 5% of bins */
+
+ /* exclude the first 5% of the sorted entries (after the decreasing
+ sort these are the largest squared residuals) */
+ for (j = llen->min; j<llen->min+llen_del05; j++) {
+ pr->n1_trimmed += ss2[j].n;
+ pr->nb_trimmed++;
+ }
+
+ /* calculate mean_y2 using middle 90% */
+ mean_y2 = 0.0;
+ n_y2 = 0;
+ for (j = llen->min+llen_del05; j <= llen->min+llen_delta-llen_del05; j++)
+ if (ss2[j].n > 1) {
+ mean_y2 += ss2[j].s;
+ n_y2 += ss2[j].n;
+ }
+
+ /* exclude the remaining tail of the sorted entries */
+ for (j = llen->min+llen_delta-llen_del05+1; j< llen->max; j++) {
+ pr->n1_trimmed += ss2[j].n;
+ pr->nb_trimmed++;
+ }
+
+ free(ss2);
+
+ /* return mean_var from middle 90% */
+ if (n_y2 > 1) {
+ pr->mean_var = mean_y2/(double)n_y2;
+ pr->mean_var_sqrt = sqrt(pr->mean_var);
+ }
+ else {
+ llen->fit_flag = 0;
+ }
+
+ /* fprintf(stderr," rho1/mu1: %.4f/%.4f mean_var: %.4f/%d\n",
+ pr->rho*LN_FACT,pr->mu,pr->mean_var,n); */
+ pr->var_e = 0.0;
+}
+
+/* Dispatch to the z-score routine selected by pu->zsflag through the
+   find_z_arr[] table; yields 0.0 when no statistics are available. */
+double find_z(int score, double escore, int length, double comp, struct pstat_str *pu) {
+  return (pu != NULL)
+    ? find_z_arr[pu->zsflag](score, escore, length, comp, pu)
+    : 0.0;
+}
+
+
+/* REG_STATS - Z() from rho/mu/mean_var */
+/* Scaled z-score (50 + 10*z) from the length regression in
+   pu->r_u.rg: the residual of score about rho*LN_FACT*log(length) + mu
+   is divided by mean_var_sqrt and shifted by zs_off.
+   Returns the raw score when pu is NULL, and 0 for lengths below
+   LENGTH_CUTOFF (and, with LOCAL_SCORE, for non-positive scores).
+   escore and comp are not used. */
+double find_zr(int score, double escore, int length, double comp, struct pstat_str *pu)
+{
+  double scaled_len, z;
+
+  if (pu == NULL) return score;
+
+#ifdef LOCAL_SCORE
+  if (score <= 0) return 0;
+#endif
+  if (length < LENGTH_CUTOFF) return 0;
+
+  /* same length scaling with or without LOCAL_SCORE */
+  scaled_len = LN_FACT*log((double)(length));
+
+  z = pu->zs_off + ((double)score - pu->r_u.rg.rho * scaled_len - pu->r_u.rg.mu) / pu->r_u.rg.mean_var_sqrt;
+
+  return (50.0 + z*10.0);
+}
+
+/* REG2_STATS Z() from rho/mu, rho2/mu2 */
+/* Scaled z-score (50 + 10*z) where the residual about the length
+   regression is divided by the square root of a length-dependent
+   variance, rho2 * log_len + mu2.  Returns 0 for lengths below
+   LENGTH_CUTOFF.  pu is dereferenced without a NULL check; escore
+   and comp are not used. */
+double find_zr2(int score, double escore, int length, double comp, struct pstat_str *pu)
+{
+ double log_len, var;
+ double z;
+
+ if ( length < LENGTH_CUTOFF) return 0;
+
+#ifdef LOCAL_SCORE
+ log_len = LN_FACT*log((double)(length));
+#else
+ /* log_len = length; */
+ log_len = LN_FACT*log((double)(length));
+#endif
+
+ var = pu->r_u.rg.rho2 * log_len + pu->r_u.rg.mu2;
+ /* NOTE(review): below the cutoff the variance is replaced by
+ mean_var, not by var_cutoff itself -- confirm this is intended */
+ if (var < pu->r_u.rg.var_cutoff) var = pu->r_u.rg.mean_var;
+
+ z = pu->zs_off + ((double)score - pu->r_u.rg.rho * log_len - pu->r_u.rg.mu) / sqrt(var);
+
+ return (50.0 + z*10.0);
+}
+
+#ifdef USE_LNSTATS
+/* LN_STATS - ln()-scaled mu, mean_var */
+/* Only compiled with USE_LNSTATS.  The score is scaled by
+   LN200/log(length) before the z calculation; zs_off is not applied.
+   Note the parameter list has no escore argument, unlike the other
+   find_z*() routines in this file.  comp is not used. */
+double find_zl(int score, int length, double comp, struct pstat_str *pu)
+{
+ double ls, z;
+
+ ls = (double)score*LN200/log((double)length);
+
+ z = (ls - pu->r_u.rg.mu) / pu->r_u.rg.mean_var_sqrt;
+
+ return (50.0 + z*10.0);
+}
+#endif
+
+/* MLE_STATS - Z() from MLE for lambda, K */
+/* Scaled z-score (50 + 10*z) from pu->r_u.ag Lambda and K, using
+   the raw lengths (each floored at 1.0); zs_off is added.
+   escore and comp are not used. */
+double
+find_ze(int score, double escore, int length, double comp, struct pstat_str *pu)
+{
+  double z, q_len, l_len;
+
+  /* library sequence length, then the a_n0 length from the fit */
+  l_len = (double)length;
+  if (l_len < 1.0) l_len = 1.0;
+
+  q_len = pu->r_u.ag.a_n0;
+  if (q_len < 1.0) q_len = 1.0;
+
+  z = pu->r_u.ag.Lambda * score - log(pu->r_u.ag.K * l_len * q_len);
+
+  /* convert to a standardized deviate */
+  z = (-z + EULER_G) / (- PI_SQRT6);
+  z += pu->zs_off;
+
+  return (50.0 + z*10.0);
+}
+
+/* MLE2_STATS - Z() from MLE for mle_a0..2, mle_b1, length, comp */
+/* Scaled z-score (50 + 10*z) from the pu->r_u.m2 parameters
+   (mle2_a0, mle2_a1, mle2_a2, mle2_b1), the two lengths (each
+   floored at 1.0) and comp; a non-positive comp is replaced by
+   ave_comp.  zs_off is added.  escore is not used. */
+double
+find_ze2(int score, double escore, int length, double comp, struct pstat_str *pu)
+{
+  double z, q_len, l_len;
+
+  if (comp <= 0.0) comp = pu->r_u.m2.ave_comp;
+
+  /* avoid very biased comp estimates */
+  /* comp = exp((4.0*log(comp)+log(pu->r_u.m2.ave_comp))/5.0); */
+
+  l_len = (double)length;
+  if (l_len < 1.0) l_len = 1.0;
+
+  q_len = pu->r_u.m2.a_n0;
+  if (q_len < 1.0) q_len = 1.0;
+
+  z = (-(pu->r_u.m2.mle2_a0 + pu->r_u.m2.mle2_a1 * comp + pu->r_u.m2.mle2_a2 * comp * log(l_len * q_len)) + score) / (pu->r_u.m2.mle2_b1 * comp);
+
+  /* convert to a standardized deviate */
+  z = (-z + EULER_G) / (- PI_SQRT6);
+  z += pu->zs_off;
+
+  return (50.0 + z*10.0);
+}
+
+/* AG_STATS - Altschul-Gish Lamdba, K */
+/* Scaled z-score (50 + 10*z) from pu->r_u.ag Lambda, K and H.
+   Both lengths are reduced by a_n0f and by log(length)/H, then
+   floored at 1.0.  zs_off is not applied here.
+   escore and comp are not used. */
+double
+find_za(int score, double escore, int length, double comp, struct pstat_str *pu)
+{
+  double z, q_eff, l_eff, l_len, l_corr;
+
+  l_len = (double)length;
+  l_corr = log(l_len)/pu->r_u.ag.H;
+
+  /* corrected lengths */
+  q_eff = pu->r_u.ag.a_n0 - pu->r_u.ag.a_n0f - l_corr;
+  l_eff = l_len - pu->r_u.ag.a_n0f - l_corr;
+
+  if (l_eff < 1.0) l_eff = 1.0;
+  if (q_eff < 1.0) q_eff = 1.0;
+
+  z = pu->r_u.ag.Lambda * score - log(pu->r_u.ag.K * l_eff * q_eff);
+
+  /* convert to a standardized deviate */
+  z = (-z + EULER_G) / (- PI_SQRT6);
+
+  return (50.0 + z*10.0);
+}
+
+/* Scaled z-score (50 + 10*z) with no length term: the score's
+   distance from pu->r_u.rg.mu is divided by mean_var_sqrt and
+   shifted by zs_off.  escore, length and comp are not used. */
+double find_zn(int score, double escore, int length, double comp, struct pstat_str *pu)
+{
+  double resid, z;
+
+  resid = (double)score - pu->r_u.rg.mu;
+  z = pu->zs_off + resid / pu->r_u.rg.mean_var_sqrt;
+
+  return (50.0 + z*10.0);
+}
+
+/* computes E value for a given z value, assuming extreme value distribution */
+/* zs is used directly as the deviate (no (zs-50)/10 rescaling).
+   The count multiplied in is entries, or db.entries when entries
+   is below 1.  Returns 0.0 for zs above ZS_MAX. */
+double
+z_to_E(double zs, long entries, struct db_str db)
+{
+  double e, n;
+
+  /* if (db->entries < 5) return (double)db.entries; */
+  n = entries;
+  if (entries < 1) { n = db.entries; }
+
+  if (zs > ZS_MAX) return 0.0;
+
+#ifdef NORMAL_DIST
+  return n * erfc(zs/M_SQRT2)/2.0;
+#else
+  e = exp(- PI_SQRT6 * zs - EULER_G);
+  /* small e: 1 - exp(-e) is replaced by e itself */
+  return n * (e > .01 ? 1.0 - exp(-e) : e);
+#endif
+}
+
+/* Convert a scaled z-score (mean 50, 10 per unit) to a probability.
+   Returns 0.0 when the rescaled deviate exceeds ZS_MAX. */
+double
+zs_to_p(double zs)
+{
+  double e, dev;
+
+  /* if (db.entries < 5) return 0.0; */
+
+  dev = (zs - 50.0)/10.0;
+
+  if (dev > ZS_MAX) return 0.0;
+
+#ifdef NORMAL_DIST
+  return erfc(dev/M_SQRT2)/2.0;
+#else
+  e = exp(- PI_SQRT6 * dev - EULER_G);
+  /* small e: 1 - exp(-e) is replaced by e itself */
+  return (e > .01 ? 1.0 - exp(-e) : e);
+#endif
+}
+
+/* Convert a scaled z-score (mean 50, 10 per unit) and the two
+   sequence lengths n0, n1 to a bit score. */
+double
+zs_to_bit(double zs, int n0, int n1)
+{
+  double dev, len0, len1;
+
+  len0 = (double)n0;
+  len1 = (double)n1;
+  dev = (zs - 50.0)/10.0;
+
+  return (PI_SQRT6 * dev + EULER_G + log(len0*len1))/M_LN2 ;
+}
+
+/* computes E-value for a given z value, assuming extreme value distribution */
+/* zs is a scaled z-score (mean 50, 10 per unit).  The multiplier k is
+   entries (db.entries when entries < 1) or, for DNA/RNA, the
+   database length (including db.carry * LONG_MAX) divided by n1;
+   k is never less than 1.  Returns 0.0 when the rescaled deviate
+   exceeds ZS_MAX. */
+double
+zs_to_E(double zs,int n1, int dnaseq, long entries, struct db_str db)
+{
+  double e, dev, k;
+
+  /* if (db->entries < 5) return 0.0; */
+
+  dev = (zs - 50.0)/10.0;
+
+  if (dev > ZS_MAX ) return 0.0;
+
+  if (entries < 1) entries = db.entries;
+
+  k = (double)entries;
+  if (dnaseq == SEQT_DNA || dnaseq == SEQT_RNA) {
+    /* nucleotide searches scale by residues, not by sequence count */
+    k = (double)db.length /(double)n1;
+    if (db.carry > 0) {
+      k += ((double)db.carry * (double)LONG_MAX)/(double)n1;
+    }
+  }
+
+  if (k < 1.0) k = 1.0;
+
+#ifdef NORMAL_DIST
+  return k * erfc(dev/M_SQRT2)/2.0;
+#else
+  dev *= PI_SQRT6;
+  dev += EULER_G;
+  e = exp(-dev);
+  /* small e: 1 - exp(-e) is replaced by e itself */
+  return k * (e > .01 ? 1.0 - exp(-e) : e);
+#endif
+}
+
+#ifdef NORMAL_DIST
+/* forward declaration: np_to_z() is defined later in this file and is
+   called by E_to_zs() in NORMAL_DIST builds */
+double np_to_z(double, int *);
+#endif
+
+/* compute z-score for given E()-value, assuming normal or
+ extreme-value dist */
+/* E is divided by entries and converted to a scaled z-score
+   (z*10 + 50).  In NORMAL_DIST builds the conversion goes through
+   np_to_z() and 0.0 is returned when it reports an error.
+   NOTE(review): entries == 0 divides by zero -- confirm callers
+   always pass a positive count. */
+double
+E_to_zs(double E, long entries)
+{
+ double e, z;
+ int error;
+
+ e = E/(double)entries;
+
+#ifndef NORMAL_DIST
+ z = (log(e)+EULER_G)/(- PI_SQRT6);
+ return z*10.0 + 50.0;
+#else
+ /* this formula does not work for E() >= 1 */
+ if (e >= 1.0) e = 0.99;
+ z = np_to_z(1.0-e,&error);
+ if (!error) return z*10.0 + 50.0;
+ else return 0.0;
+#endif
+}
+
+/* computes 1.0 - E value for a given z value, assuming extreme value
+ distribution */
+/* zs is a scaled z-score (mean 50, 10 per unit).  Returns 0.0 when
+   entries is below 5 and 1.0 when the rescaled deviate exceeds
+   ZS_MAX. */
+double
+zs_to_Ec(double zs, long entries)
+{
+  double e, dev;
+
+  if (entries < 5) return 0.0;
+
+  dev = (zs - 50.0)/10.0;
+
+  if (dev > ZS_MAX) return 1.0;
+
+#ifdef NORMAL_DIST
+  return (double)entries*erf(dev/M_SQRT2)/2.0;
+#else
+  e = exp(- PI_SQRT6 * dev - EULER_G);
+  /* small e: exp(-e) is replaced by 1 - e */
+  return (double)entries * (e > .01 ? exp(-e) : 1.0 - e);
+#endif
+}
+
+/* Convert an E()-value to a score for sequence lengths n0, n1 and
+   the parameters Lambda, K, H.  Both lengths are reduced by
+   log(K*n0*n1)/H and floored at 1.0; the result is truncated to int. */
+int
+ELK_to_s(double e_val, int n0, int n1,
+         double Lambda, double K, double H) {
+
+  double len0, len1, len_corr;
+
+  len0 = (double)n0;
+  len1 = (double)n1;
+
+  /* the correction is computed from the uncorrected lengths */
+  len_corr = log(K * len0 * len1)/H;
+
+  len0 = len0 - len_corr;
+  len1 = len1 - len_corr;
+
+  if (len1 < 1.0) len1 = 1.0;
+  if (len0 < 1.0) len0 = 1.0;
+
+  return (int)((log(K * len0 * len1/e_val))/Lambda);
+}
+
+/* calculate a threshold score, given an E() value and Lambda,K,H */
+
+/* E1_to_s() - score threshold for the E()-value e_val.
+
+   e_val    target E()-value
+   n0, n1   the two sequence lengths
+   db_size  number of entries; used as the divisor for e_val
+   pu       statistics; pu->zsflag selects the formula
+
+   Returns BIGNUM when zsflag < 0 or either length is below
+   LENGTH_CUTOFF.  Handles AVE_STATS, REG_STATS, REGI_STATS,
+   MLE_STATS and AG_STATS; any other zsflag writes a message to
+   stderr and yields 999.  Unless NORMAL_DIST is defined, negative
+   results are raised to 0.
+*/
+int
+E1_to_s(double e_val, int n0, int n1, int db_size,
+ struct pstat_str *pu) {
+ double mp, np, a_n0, a_n0f, a_n1;
+ double zs, log_len, p_val;
+ int score;
+
+ if (pu->zsflag < 0 || n0 < LENGTH_CUTOFF || n1 < LENGTH_CUTOFF) return BIGNUM;
+
+ a_n0 = (double)n0;
+ a_n1 = (double)n1;
+ zs = E_to_zs(e_val, db_size);
+ /* convert from "zscore" to "zvalue" */
+ zs = (zs - 50.0)/10.0;
+ p_val = e_val / db_size;
+
+ switch (pu->zsflag) {
+
+ case AVE_STATS:
+ /* the double result is truncated by the assignment to int */
+ score = zs * pu->r_u.rg.mean_var_sqrt + pu->r_u.rg.mu;
+ break;
+
+ case REG_STATS:
+ case REGI_STATS:
+#ifdef LOCAL_SCORE
+ log_len = LN_FACT * log(a_n1);
+#else
+ /* log_len = a_n1; */
+ log_len = LN_FACT * log(a_n1);
+#endif
+ /* the double result is truncated by the assignment to int */
+ score = zs * pu->r_u.rg.mean_var_sqrt + pu->r_u.rg.rho * log_len + pu->r_u.rg.mu;
+ break;
+
+ case MLE_STATS:
+ /* uses the raw lengths; rounded by the + 0.5 */
+ score = (int)((log( pu->r_u.ag.K * a_n0 * a_n1) - log(p_val))/pu->r_u.ag.Lambda +0.5);
+ break;
+
+ case AG_STATS:
+ /* both lengths are reduced by log(K*n0*n1)/H and floored at 1.0 */
+ a_n0f = log(pu->r_u.ag.K * a_n0 * a_n1)/pu->r_u.ag.H;
+ mp = a_n0 - a_n0f;
+ np = a_n1 - a_n0f;
+
+ if (np < 1.0) np = 1.0;
+ if (mp < 1.0) mp = 1.0;
+
+ score = (int)((log( pu->r_u.ag.K * mp * np) - log(p_val))/pu->r_u.ag.Lambda +0.5);
+ break;
+
+ default:
+ fprintf(stderr,"\n*** statistics method: %d not yet supported ***\n", pu->zsflag);
+ score = 999;
+ }
+
+#ifndef NORMAL_DIST
+ if (score < 0) score = 0;
+#endif
+ return score;
+}
+
+/* calculate a bit score from a raw score and the two sequence
+   lengths, using the statistics selected by pu->zsflag.
+
+   Returns -1.0 when either length is below LENGTH_CUTOFF or when
+   zsflag is not one of AVE_STATS, REG_STATS, REGI_STATS, MLE_STATS,
+   AG_STATS (a message is written to stderr in the latter case). */
+double
+s_to_bit(int score, int n0, int n1, struct pstat_str *pu) {
+
+ double bit;
+ double a_n0, a_n1;
+ double z, log_len;
+
+ if (n0 < LENGTH_CUTOFF || n1 < LENGTH_CUTOFF) return -1.0;
+
+ a_n0 = (double)n0;
+ a_n1 = (double)n1;
+
+ switch (pu->zsflag) {
+
+ case AVE_STATS:
+ /* z from mu and mean_var_sqrt only; zs_off is not applied */
+ z = ((double)score - pu->r_u.rg.mu)/pu->r_u.rg.mean_var_sqrt;
+ bit = (PI_SQRT6 * z + EULER_G + log(a_n0*a_n1))/M_LN2 ;
+ break;
+
+ case REG_STATS:
+ case REGI_STATS:
+ /* z from the length regression; zs_off is not applied */
+ log_len = LN_FACT * log(a_n1);
+ z = ((double)score - pu->r_u.rg.rho * log_len - pu->r_u.rg.mu);
+ z /= pu->r_u.rg.mean_var_sqrt ;
+ bit = (PI_SQRT6 * z + EULER_G + log(a_n0*a_n1))/M_LN2 ;
+ break;
+
+ case MLE_STATS:
+ case AG_STATS:
+ /* the lengths do not enter this formula */
+ bit = ((double)score * pu->r_u.ag.Lambda - log(pu->r_u.ag.K))/M_LN2;
+ break;
+
+ default:
+ fprintf(stderr,"\n*** s_to_bit -- statistics method: %d not yet supported ***\n", pu->zsflag);
+ bit = -1.0;
+ }
+
+ return bit;
+}
+
+/* Convert a bit score to an E()-value for sequence lengths n0, n1
+   and db_size entries.  For AG_STATS both lengths are first reduced
+   by log(K*n0*n1)/H and floored at 1.0. */
+double
+bit_to_E (double bit, int n0, int n1, long db_size,
+          struct pstat_str *pu)
+{
+  double len0, len1, len_corr, p_val;
+
+  len0 = (double)n0;
+  len1 = (double)n1;
+
+  if (pu->zsflag == AG_STATS) {
+    /* the correction is computed from the uncorrected lengths */
+    len_corr = log(pu->r_u.ag.K * len0 * len1)/pu->r_u.ag.H;
+
+    len0 -= len_corr;
+    if (len0 < 1.0) len0 = 1.0;
+
+    len1 -= len_corr;
+    if (len1 < 1.0) len1 = 1.0;
+  }
+
+  p_val = len0 * len1 / pow(2.0, bit);
+  /* small values are used as-is; larger ones become 1 - exp(-p) */
+  if (p_val > 0.01) p_val = 1.0 - exp(-p_val);
+
+  return (double)db_size * p_val;
+}
+
+
+/* Sort the n entries at ptr in place by decreasing .s, using a shell
+   sort with the 1, 4, 13, ... gap sequence. */
+void s2_sort (struct s2str *ptr, int n)
+{
+  int step, hi, lo;
+  struct s2str held;
+
+  /* grow the gap until it reaches n/3 */
+  step = 1;
+  while (step < n/3) step = 3*step + 1;
+
+  /* one gapped insertion pass per gap, shrinking the gap each time */
+  while (step > 0) {
+    for (hi = step; hi < n; hi++) {
+      lo = hi - step;
+      /* swap while the pair is not already in decreasing order */
+      while (lo >= 0 && !(ptr[lo].s >= ptr[lo + step].s)) {
+        held = ptr[lo];
+        ptr[lo] = ptr[lo + step];
+        ptr[lo + step] = held;
+        lo -= step;
+      }
+    }
+    step = (step - 1)/3;
+  }
+}
+
+/* intentionally empty: this file has nothing to do in last_stats() */
+void last_stats() {}
+
+/* Assign a zscore to each of the nbest entries in bptr[] and sort.
+
+   When either ppst->zsflag or ppst->zsflag_f is negative the raw
+   score rst.score[ppst->score_ix] is used as the zscore and the
+   list is sorted with sortbestz().  Otherwise find_z() supplies the
+   zscore, rst.escore is replaced by zs_to_E() of it, and the list
+   is sorted with sortbeste(). */
+void
+scale_scores(struct beststr **bptr, int nbest, struct db_str db,
+             struct pstruct *ppst, struct pstat_str *rs)
+{
+  int k, ix;
+  double zscore;
+  struct beststr *bp;
+
+  ix = ppst->score_ix;
+
+  if (ppst->zsflag < 0 || ppst->zsflag_f < 0) {
+    /* even though no statistics, we need to sort the scores */
+    for (k = 0; k < nbest; k++) {
+      bptr[k]->zscore = bptr[k]->rst.score[ix];
+    }
+    sortbestz(bptr,nbest);
+    return;
+  }
+
+  for (k = 0; k < nbest; k++) {
+    bp = bptr[k];
+    zscore = find_z(bp->rst.score[ppst->score_ix], bp->rst.escore,
+                    bp->seq->n1, bp->rst.comp, rs);
+    bp->zscore = zscore;
+    bp->rst.escore
+      = zs_to_E(zscore, bp->seq->n1, ppst->dnaseq, ppst->zdb_size, db);
+  }
+  sortbeste(bptr,nbest);
+}
+
+#ifdef NORMAL_DIST
+/* ALGORITHM AS241 APPL. STATIST. (1988) VOL. 37, NO. 3
+
+ Produces the normal deviate Z corresponding to a given lower
+ tail area of P; Z is accurate to about 1 part in 10**16.
+
+ The hash sums below are the sums of the mantissas of the
+ coefficients. They are included for use in checking
+ transcription.
+*/
+
+/* np_to_z() - only compiled with NORMAL_DIST.
+
+   p      lower tail area
+   fault  set to 0 on success, or to 1 (with a return value of 0)
+          when p is in a tail and min(p, 1-p) is not positive
+
+   Three rational approximations are used: one for |p - 0.5| <= 0.425,
+   and two for the tails, selected by r = sqrt(-log(min(p, 1-p)))
+   being at most 5 or above it.
+*/
+double np_to_z(double p, int *fault) {
+
+ double q, r, ppnd16;
+
+ double zero = 0.0, one = 1.0, half = 0.5;
+ double split1 = 0.425, split2 = 5.0;
+ double const1 = 0.180625, const2 = 1.6;
+
+/* Coefficients for P close to 0.5 */
+
+ double a0 = 3.3871328727963666080e0;
+ double a1 = 1.3314166789178437745e+2;
+ double a2 = 1.9715909503065514427e+3;
+ double a3 = 1.3731693765509461125e+4;
+ double a4 = 4.5921953931549871457e+4;
+ double a5 = 6.7265770927008700853e+4;
+ double a6 = 3.3430575583588128105e+4;
+ double a7 = 2.5090809287301226727e+3;
+ double b1 = 4.2313330701600911252e+1;
+ double b2 = 6.8718700749205790830e+2;
+ double b3 = 5.3941960214247511077e+3;
+ double b4 = 2.1213794301586595867e+4;
+ double b5 = 3.9307895800092710610e+4;
+ double b6 = 2.8729085735721942674e+4;
+ double b7 = 5.2264952788528545610e+3;
+
+ /* sum_ab, sum_cd, sum_ef and sum_tmp are referenced only by the
+ disabled transcription checks further down */
+ double sum_ab= 55.8831928806149014439;
+/*
+ Coefficients for P not close to 0, 0.5 or 1.
+*/
+
+ double c0 = 1.42343711074968357734;
+ double c1 = 4.63033784615654529590;
+ double c2 = 5.76949722146069140550;
+ double c3 = 3.64784832476320460504;
+ double c4 = 1.27045825245236838258;
+ double c5 = 2.41780725177450611770e-1;
+ double c6 = 2.27238449892691845833e-2;
+ double c7 = 7.74545014278341407640e-4;
+ double d1 = 2.05319162663775882187;
+ double d2 = 1.67638483018380384940;
+ double d3 = 6.89767334985100004550e-1;
+ double d4 = 1.48103976427480074590e-1;
+ double d5 = 1.51986665636164571966e-2;
+ double d6 = 5.47593808499534494600e-4;
+ double d7 = 1.05075007164441684324e-9;
+
+ double sum_cd=49.33206503301610289036;
+/*
+ Coefficients for P near 0 or 1.
+*/
+ double e0 = 6.65790464350110377720e0;
+ double e1 = 5.46378491116411436990e0;
+ double e2 = 1.78482653991729133580e0;
+ double e3 = 2.96560571828504891230e-1;
+ double e4 = 2.65321895265761230930e-2;
+ double e5 = 1.24266094738807843860e-3;
+ double e6 = 2.71155556874348757815e-5;
+ double e7 = 2.01033439929228813265e-7;
+ double f1 = 5.99832206555887937690e-1;
+ double f2 = 1.36929880922735805310e-1;
+ double f3 = 1.48753612908506148525e-2;
+ double f4 = 7.86869131145613259100e-4;
+ double f5 = 1.84631831751005468180e-5;
+ double f6 = 1.42151175831644588870e-7;
+ double f7 = 2.04426310338993978564e-15;
+
+ double sum_ef=47.52583317549289671629;
+
+ double sum_tmp = 0.0;
+
+ /*
+ sum_tmp = a0+a1+a2+a3+a4+a5+a6+a7+b1+b2+b3+b4+b5+b6+b7;
+ if (fabs(sum_tmp - sum_ab) > 1e-12) {
+ fprintf (stderr," sum_ab error: %lg %lg\n",sum_tmp,sum_ab);
+ *fault = 1;
+ return zero;
+ }
+
+ sum_tmp = c0+c1+c2+c3+c4+c5+c6+c7+d1+d2+d3+d4+d5+d6+d7;
+ if (fabs(sum_tmp - sum_cd) > 1e-12) {
+ fprintf (stderr," sum_cd error: %lg %lg\n",sum_tmp,sum_cd);
+ *fault = 1;
+ return zero;
+ }
+ sum_tmp = e0+e1+e2+e3+e4+e5+e6+e7+f1+f2+f3+f4+f5+f6+f7;
+ if (fabs(sum_tmp - sum_ef) > 1e-12) {
+ fprintf (stderr," sum_ef error: %lg %lg\n",sum_tmp,sum_ef);
+ *fault = 1;
+ return zero;
+ }
+ */
+
+ *fault = 0;
+ q = p - half;
+ /* central region: rational function of r = const1 - q*q, times q */
+ if (fabs(q) <= split1) {
+ r = const1 - q * q;
+ return q * (((((((a7 * r + a6) * r + a5) * r + a4) * r + a3)
+ * r + a2) * r + a1) * r + a0) /
+ (((((((b7 * r + b6) * r + b5) * r + b4) * r + b3)
+ * r + b2) * r + b1) * r + one);
+ }
+ else {
+ /* tails: r starts as the smaller of p and 1-p */
+ r = (q < zero) ? p : one - p;
+ if (r <= zero) {
+ *fault = 1;
+ return zero;
+ }
+ r = sqrt(-log(r));
+ if (r <= split2) {
+ r -= const2;
+ ppnd16 = (((((((c7 * r + c6) * r + c5) * r + c4) * r + c3)
+ * r + c2) * r + c1) * r + c0) /
+ (((((((d7 * r + d6) * r + d5) * r + d4) * r + d3)
+ * r + d2) * r + d1) * r + one);
+ }
+ else {
+ r -= split2;
+ ppnd16 = (((((((e7 * r + e6) * r + e5) * r + e4) * r + e3)
+ * r + e2) * r + e1) * r + e0) /
+ (((((((f7 * r + f6) * r + f5) * r + f4) * r + f3)
+ * r + f2) * r + f1) * r + one);
+ }
+ /* the tail value is computed as a magnitude; restore the sign */
+ if (q < zero) return -ppnd16;
+ else return ppnd16;
+ }
+}
+#endif
+
+/* print out all pstat_str info for independent calculation */
+/* pstat_info() - format the contents of *pu as text in info_str.
+
+   info_str    destination buffer; overwritten, not appended to
+   info_str_n  size limit passed to SAFE_STRNCPY / SAFE_STRNCAT
+   comment     prefix written at the start of every line
+   pu          statistics to report
+
+   The common lines (zsflag, ngLambda/ngK/ngH, ave_n1/sample_fract/
+   zs_off) come first, followed by the r_u member selected by zsflag:
+   m2 for MLE2_STATS, ag for AG_STATS or MLE_STATS, rg otherwise.
+*/
+void
+pstat_info(char *info_str, int info_str_n, char *comment, struct pstat_str *pu) {
+  char pstat_buf[MAX_STR];
+
+  sprintf(pstat_buf,"%s zsflag: %d\n",comment,pu->zsflag);
+  SAFE_STRNCPY(info_str,pstat_buf,info_str_n);
+  sprintf(pstat_buf,"%s ngLambda: %g; ngK: %g; ngH: %g\n",comment,pu->ngLambda,pu->ngK,pu->ngH);
+  SAFE_STRNCAT(info_str,pstat_buf,info_str_n);
+  sprintf(pstat_buf,"%s ave_n1: %g; sample_fract: %g; zs_off: %g\n",comment,
+          pu->ave_n1,pu->sample_fract,pu->zs_off);
+  /* this line was formatted but never appended, so it was lost when
+     pstat_buf was reused below */
+  SAFE_STRNCAT(info_str,pstat_buf,info_str_n);
+  if (pu->zsflag == MLE2_STATS) { /* print r_u.m2 */
+    sprintf(pstat_buf,"%s mle2_stat_str: {\n",comment);
+    SAFE_STRNCAT(info_str,pstat_buf,info_str_n);
+    sprintf(pstat_buf,"%s a_n0 %g;\n",comment,pu->r_u.m2.a_n0);
+    SAFE_STRNCAT(info_str,pstat_buf,info_str_n);
+    sprintf(pstat_buf,"%s mle2_a0: %g; mle2_a1: %g; mle2_a2: %g; mle2_b1: %g\n",comment,
+            pu->r_u.m2.mle2_a0,pu->r_u.m2.mle2_a1,pu->r_u.m2.mle2_a2,pu->r_u.m2.mle2_b1);
+    SAFE_STRNCAT(info_str,pstat_buf,info_str_n);
+    sprintf(pstat_buf,"%s ave_comp: %g; max_comp: %g; ave_H: %g }\n",comment,
+            pu->r_u.m2.ave_comp,pu->r_u.m2.max_comp,pu->r_u.m2.ave_H);
+    SAFE_STRNCAT(info_str,pstat_buf,info_str_n);
+  }
+  else if (pu->zsflag == AG_STATS || pu->zsflag == MLE_STATS) {
+    sprintf(pstat_buf,"%s ag_stat_str: {\n",comment);
+    SAFE_STRNCAT(info_str,pstat_buf,info_str_n);
+    sprintf(pstat_buf,"%s K: %g; Lambda: %g; a_n0f: %g; a_n0: %g }\n",comment,
+            pu->r_u.ag.K,pu->r_u.ag.Lambda,pu->r_u.ag.a_n0f,pu->r_u.ag.a_n0);
+    SAFE_STRNCAT(info_str,pstat_buf,info_str_n);
+  }
+  else {
+    sprintf(pstat_buf,"%s rstat_str (LN_FACT: %.1f): {\n",comment,LN_FACT);
+    SAFE_STRNCAT(info_str,pstat_buf,info_str_n);
+    sprintf(pstat_buf,"%s rho: %g; rho_e: %g; mu: %g; mu_e: %g;\n",comment,
+            pu->r_u.rg.rho,pu->r_u.rg.rho_e,pu->r_u.rg.mu,pu->r_u.rg.mu_e);
+    SAFE_STRNCAT(info_str,pstat_buf,info_str_n);
+    sprintf(pstat_buf,"%s mean_var: %g; var_e: %g; mean_var_sqrt: %g\n",comment,
+            pu->r_u.rg.mean_var, pu->r_u.rg.var_e,pu->r_u.rg.mean_var_sqrt);
+    SAFE_STRNCAT(info_str,pstat_buf,info_str_n);
+    sprintf(pstat_buf,"%s rho2: %g; mu2: %g; var_cutoff: %g\n",comment,
+            pu->r_u.rg.rho2, pu->r_u.rg.mu2,pu->r_u.rg.var_cutoff);
+    SAFE_STRNCAT(info_str,pstat_buf,info_str_n);
+    sprintf(pstat_buf,"%s n_trimmed: %d; n1_trimmed: %d; nb_trimmed: %d; nb_tot: %d }\n",comment,
+            pu->r_u.rg.n_trimmed, pu->r_u.rg.n1_trimmed,pu->r_u.rg.nb_trimmed, pu->r_u.rg.nb_tot);
+    SAFE_STRNCAT(info_str,pstat_buf,info_str_n);
+  }
+}
diff --git a/src/scaleswt.c b/src/scaleswt.c
new file mode 100644
index 0000000..2a59bd8
--- /dev/null
+++ b/src/scaleswt.c
@@ -0,0 +1,1566 @@
+/* $Id: scaleswt.c 714 2011-05-05 00:33:40Z wrp $ */
+
+/* copyright (c) 1995, 1996, 2000, 2002, 2014 by William R. Pearson and The
+ Rectors & Visitors of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+/* as of 24 Sept, 2000 - scaleswn uses no global variables */
+
+/*
+ This version is designed for fasts/f, which used Tatusov
+ probabilities for statistical estimates, but still needs a
+ quick-and-dirty linear regression fit to rank things
+
+ For comparisons that obey tatusov statistics, we try whenever
+ possible to provide accurate e_scores, rather than raw scores. As a
+ result, no lambda/K fitting is required; and process_hist() can be
+ called at the very beginning of the search to initialize some of the
+ statistics structures and find_zp().
+
+ find_z() must still return a valid z_score surrogate, as
+ comp_lib.c/p2_complib.c continue to use z_score's to rank hits, save
+ the best, etc.
+
+ If e_score's cannot be calculated, the process_hist() provides
+ linear regression fitting for conventional z_score estimates.
+
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <limits.h>
+#include <float.h>
+#include <math.h>
+
+#include <limits.h>
+
+#include "defs.h"
+#include "param.h"
+#include "structs.h"
+#include "best_stats.h"
+
+#define MAXHIST 50 /* bin count passed to inithistz() */
+#define MAX_LLEN 200 /* highest length-bin index; inithist() stores it in llen->max */
+#define LHISTC 5 /* proc_hist_n() stops re-trimming once a pass removes fewer scores than this */
+#define VHISTC 5 /* NOTE(review): not referenced in the part of the file reviewed here */
+#define MAX_SSCORE 300 /* NOTE(review): only seen in a commented-out line of calc_thresh() in the part reviewed */
+
+#define LENGTH_CUTOFF 10 /* minimum database sequence length allowed, for fitting */
+
+#define LN_FACT 10.0 /* scale applied to ln(length): length bin = (int)(LN_FACT*ln(length)+0.5) */
+#ifndef M_LN2
+#define M_LN2 0.69314718055994530942 /* ln(2), used when <math.h> does not provide it */
+#endif
+
+#define EULER_G 0.57721566490153286060 /* Euler-Mascheroni constant */
+#define PI_SQRT6 1.28254983016186409554 /* pi/sqrt(6) */
+
+#ifndef M_SQRT2
+#define M_SQRT2 1.41421356237 /* sqrt(2), used when <math.h> does not provide it */
+#endif
+#define LN200 5.2983173666 /* ln(200) */
+#define ZS_MAX 400.0 /* used to prevent underflow on some machines */
+#define TOLERANCE 1.0e-12
+#define TINY 1.0e-6
+
+/* used by AVE_STATS, REG_STATS, REGI_STATS, REG2_STATS*/
+struct pstat_str {
+ int zsflag; /* copied from ppst->zsflag in process_hist(), then overwritten
+                with the value proc_hist_r() returns */
+ double ngLambda, ngK, ngH; /* copied from m_msg->Lambda, m_msg->K, m_msg->H in process_hist() */
+ double rho, rho_e, mu, mu_e, mean_var, var_e; /* ?_e:std. error of ? */
+/* used by REG2_STATS */
+ double rho2, mu2, var_cutoff; /* slope/intercept of the residual-variance fit and its floor
+                                  (written by fit_llen()/fit_llens()) */
+ int n_trimmed; /* excluded because of high z-score */
+ int n1_trimmed, nb_trimmed, nb_tot; /* excluded because of bin */
+ double tat_a, tat_b, tat_c, spacefactor; /* tat_a/b/c: coefficients fitted by linreg() in scale_tat();
+                                             spacefactor: result of calc_spacefactor() in last_stats() */
+ int have_tat; /* set to 1 by last_stats() when scale_tat() was run, else 0 */
+ int tie_j; /* set to 0.005 * db_entries by scale_tat() */
+ int eval_is_pval; /* set to 1 by process_hist() when m_msg->escore_flg is set;
+                      when 0, find_zt() divides the escore by zdb_size */
+ long zdb_size; /* copied from ppst->zdb_size in process_hist() */
+};
+
+#define AVE_STATS 0 /* no length effect, only mean/variance */
+double find_zt(int score, double escore, int len, double comp, struct pstat_str *);
+
+double find_zn(int score, double escore, int len, double comp, struct pstat_str *);
+
+double power(double, int);
+
+void sortbesto(double *, int );
+extern void sortbeste(struct beststr **bptr, int nbest);
+
+int proc_hist_n(struct stat_str *sptr, int n,
+ struct pstruct *ppst, struct hist_str *histp, int do_trim,
+ struct pstat_str *);
+
+#define REG_STATS 1 /* length-regression scaled */
+double find_zr(int score, double escore, int len, double comp, struct pstat_str *);
+
+int proc_hist_r(struct stat_str *sptr, int n,
+ struct pstruct *ppst, struct hist_str *histp,
+ int do_trim, struct pstat_str *rs);
+
+static double (*find_zp)(int score, double escore, int len, double comp,
+ struct pstat_str *) = &find_zr;
+
+double find_z(int score, double escore, int len, double comp, struct pstat_str *);
+
+/* print out all pstat_str info for independent calculation */
+void
+pstat_info(char *info_str, int info_str_n, char *comment, struct pstat_str *pu);
+
+struct llen_str {
+ int min, max; /* bin index range: max is set to MAX_LLEN by inithist();
+                  min is located by fit_llen()/fit_llens() */
+ int max_score, min_score; /* score extremes seen by addhist(); min_score is forced
+                              to 0 as soon as addhist() rejects an entry */
+ int *hist; /* per-bin count; bin = (int)(LN_FACT*ln(length)+0.5), clamped to 0..max */
+ double *score_sums, *score2_sums; /* per-bin sum of scores and sum of squared scores */
+ double *score_var; /* per-bin variance term, filled in by fit_llen()/fit_llens() */
+ int max_length, min_length, zero_s; /* length extremes of accepted entries; zero_s counts
+                                        entries addhist() rejected (score <= 0 or too short) */
+ int fit_flag; /* cleared (0) when the length regression cannot be used */
+};
+
+static void inithist(struct llen_str *, struct pstruct *, int);
+static void free_hist( struct llen_str *);
+static void addhist(struct llen_str *, int, int, int);
+static void prune_hist(struct llen_str *, int, int, int, long *);
+void inithistz(int, struct hist_str *histp);
+void addhistz(double zs, struct hist_str *histp);
+
+static void fit_llen(struct llen_str *, struct pstat_str *);
+static void fit_llens(struct llen_str *, struct pstat_str *);
+
+void linreg(double *lny, double *x, double *lnx, int n,
+ double *a, double *b, double *c, int start);
+
+double calc_spacefactor(const unsigned char *, int, int, int);
+
+double det(double a11, double a12, double a13,
+ double a21, double a22, double a23,
+ double a31, double a32, double a33);
+
+double factorial (int a, int b);
+
+/* void set_db_size(int, struct db_str *, struct hist_str *); */
+
+#ifdef DEBUG
+FILE *tmpf;
+#endif
+
+/* process_hist() - allocate or reset the pstat_str state for a search and
+ * select the z-score function that find_z() dispatches to.
+ *
+ *   sptr, nstats - sampled scores; handed on to proc_hist_r()
+ *   m_msg        - supplies escore_flg and the Lambda/K/H copied into *rs_sp
+ *   ppst         - supplies zsflag and zdb_size; ppst->zs_off is reset to 0.0
+ *   histp        - histogram / summary structure filled in by the helpers
+ *   rs_sp        - in/out: *rs_sp is allocated when NULL, otherwise zeroed and
+ *                  reused; it is set to NULL when no statistics are produced
+ *   do_hist      - not used by this version
+ *
+ * Returns ppst->zsflag unchanged when it is negative, 1 when escores are
+ * available (find_zp becomes find_zt), -1 when fewer than 20 scores were
+ * sampled (the state is freed), otherwise the code returned by
+ * proc_hist_r(), which is also stored in (*rs_sp)->zsflag.
+ */
+int
+process_hist(struct stat_str *sptr, int nstats,
+             const struct mngmsg *m_msg,
+             struct pstruct *ppst,
+             struct hist_str *histp,
+             struct pstat_str **rs_sp,
+             int do_hist
+             )
+{
+  int do_trim;
+  struct pstat_str *rs_s;
+
+  if (ppst->zsflag < 0) {
+    *rs_sp = NULL;
+    return ppst->zsflag;
+  }
+
+  ppst->zs_off = 0.0;
+
+  if (*rs_sp == NULL) {
+    if ((rs_s=(struct pstat_str *)calloc(1,sizeof(struct pstat_str)))==NULL) {
+      /* sizeof yields a size_t; cast it so the argument really matches %ld */
+      fprintf(stderr," cannot allocate rs_snion: %ld\n",(long)sizeof(struct pstat_str));
+      exit(1);
+    }
+    *rs_sp = rs_s;
+  }
+  else {
+    rs_s = *rs_sp;
+    memset(rs_s,0,sizeof(struct pstat_str));
+  }
+
+  rs_s->zsflag = ppst->zsflag;
+  rs_s->zdb_size = ppst->zdb_size;
+
+  if (m_msg->escore_flg) {
+    /* escores are already probabilities: no fitting is needed */
+    find_zp = &find_zt;
+    inithistz(MAXHIST,histp);
+    rs_s->eval_is_pval = 1;
+    return 1;
+  }
+
+  if (nstats < 20) {
+    fprintf(stderr," too few sequences for sampling: %d\n",nstats);
+    free(rs_s);
+    *rs_sp = NULL;
+    return -1;
+  }
+
+  rs_s->ngLambda = m_msg->Lambda;
+  rs_s->ngK = m_msg->K;
+  rs_s->ngH = m_msg->H;
+
+  /* zsflag values of 10 and above disable trimming of outlying scores */
+  do_trim = (ppst->zsflag < 10) ? 1 : 0;
+
+  rs_s->eval_is_pval = 0;
+  find_zp = &find_zr;
+
+  return rs_s->zsflag = proc_hist_r(sptr, nstats, ppst, histp, do_trim, rs_s);
+}
+
+/* calc_thresh() - pick the score cap used while accumulating statistics
+ * and the upper z-score limit used for trimming.
+ *
+ * Unless NORMAL_DIST is defined, the cap is the score whose expectation is
+ * 0.01 for nstats sequences of a typical length (400 for protein, 5000 for
+ * DNA/RNA) against a query of length ppst->n0, using K and a Lambda that
+ * is scaled by 0.7 for protein.  The trimming limit is the scaled z-score
+ * that corresponds to 1/nstats.  H is accepted but not used.
+ *
+ * Returns the score cap; *zstrim receives the z-score limit.
+ */
+int
+calc_thresh(struct pstruct *ppst, int nstats,
+            double Lambda, double K, double H, double *zstrim)
+{
+  int score_cap;
+  double typ_len, lambda_scale, work, z_lim;
+  int is_nt = (ppst->dnaseq == SEQT_DNA || ppst->dnaseq == SEQT_RNA);
+
+  typ_len = is_nt ? 5000.0 : 400.0;
+  lambda_scale = is_nt ? 1.0 : 0.7;
+
+#ifdef NORMAL_DIST
+  score_cap = 100;
+  z_lim = 5.0;
+#else
+  work = 0.01/((double)nstats*K*(double)ppst->n0*typ_len);
+  work = -log(work)/(Lambda*lambda_scale);
+  score_cap = (int)(work+0.5);
+
+  z_lim = 1.0/(double)nstats;
+  z_lim = (log(z_lim)+EULER_G)/(-PI_SQRT6);
+#endif
+
+  *zstrim = 10.0*z_lim+50.0;
+  return score_cap;
+}
+
+/* proc_hist_r() - length-regression (REG_STATS) statistics.
+ *
+ * Bins the sampled scores by log(length), fits score against log(length)
+ * with fit_llen() and, when do_trim is set, removes scores whose z-score
+ * is below 20 or above the calc_thresh() limit before refitting with
+ * fit_llens().  When the score range is narrower than 10 or the fit is
+ * flagged unusable, find_zp is switched to find_zn and the work is handed
+ * to proc_hist_n().
+ *
+ * Writes the fit summary into histp->stat_info and returns REG_STATS, or
+ * whatever proc_hist_n() returns on the fallback path.
+ */
+int
+proc_hist_r(struct stat_str *sptr, int nstats,
+            struct pstruct *ppst, struct hist_str *histp,
+            int do_trim, struct pstat_str *rs)
+{
+  int idx, score_cap;
+  double zval, z_limit;
+  char shuf_note[128];
+  struct llen_str bins;
+  char *summary = &(histp->stat_info[0]);
+
+  bins.fit_flag = 1;
+  bins.hist = NULL;
+
+  score_cap = calc_thresh(ppst, nstats, rs->ngLambda,
+                          rs->ngK, rs->ngH, &z_limit);
+
+  inithist(&bins, ppst, score_cap);
+
+  for (idx = 0; idx < nstats; idx++) {
+    addhist(&bins, sptr[idx].score, sptr[idx].n1, score_cap);
+  }
+  histp->entries = nstats - bins.zero_s;
+
+  /* too narrow a score range counts as a failed fit */
+  if ((bins.max_score - bins.min_score) < 10) {
+    bins.fit_flag = 0;
+  }
+  else {
+    fit_llen(&bins, rs);        /* sets rho, mu, rho2, mu2, mean_var */
+  }
+
+  if (!bins.fit_flag) {         /* fall back to unscaled statistics */
+    free_hist(&bins);
+    find_zp = &find_zn;
+    return proc_hist_n(sptr, nstats, ppst, histp, do_trim, rs);
+  }
+
+  rs->n_trimmed = rs->n1_trimmed = rs->nb_trimmed = 0;
+
+  if (do_trim) {
+    /* the fit is known to be usable here */
+    for (idx = 0; idx < nstats; idx++) {
+      zval = find_zr(sptr[idx].score, sptr[idx].escore,
+                     sptr[idx].n1, sptr[idx].comp, rs);
+      if (zval < 20.0 || zval > z_limit) {
+        rs->n_trimmed++;
+        prune_hist(&bins, sptr[idx].score, sptr[idx].n1, score_cap,
+                   &(histp->entries));
+      }
+    }
+
+    fit_llens(&bins, rs);
+  }
+
+  free_hist(&bins);
+
+  if (ppst->zsflag < 10) {
+    shuf_note[0] = '\0';
+  }
+  else if (ppst->zs_win > 0) {
+    sprintf(shuf_note,"(shuffled, win: %d)",ppst->zs_win);
+  }
+  else {
+    strncpy(shuf_note,"(shuffled)",sizeof(shuf_note));
+  }
+
+  inithistz(MAXHIST, histp);
+
+  sprintf(summary,"%s Expectation_n fit: rho(ln(x))= %6.4f+/-%6.3g; mu= %6.4f+/-%6.3f\n mean_var=%6.4f+/-%6.3f, 0's: %d Z-trim: %d B-trim: %d in %d/%d\n Lambda= %6.4f",
+          shuf_note,
+          rs->rho*LN_FACT,sqrt(rs->rho_e),rs->mu,sqrt(rs->mu_e), rs->mean_var,sqrt(rs->var_e),
+          bins.zero_s, rs->n_trimmed, rs->n1_trimmed, rs->nb_trimmed, rs->nb_tot,
+          PI_SQRT6/sqrt(rs->mean_var));
+  return REG_STATS;
+}
+
+
+/* set rs->mu and rs->mean_var from running sums over n scores; fewer than
+   two scores give the defaults 50.0/10.0, and a variance below 0.01 is
+   replaced by max(mu, 1.0) */
+static void
+proc_hist_n_moments(struct pstat_str *rs, double s_score, double s2_score, int n)
+{
+  if (n > 1) {
+    rs->mu = s_score/(double)n;
+    rs->mean_var = s2_score - (double)n * rs->mu * rs->mu;
+    rs->mean_var /= (double)(n-1);
+  }
+  else {
+    rs->mu = 50.0;
+    rs->mean_var = 10.0;
+  }
+
+  if (rs->mean_var < 0.01) {
+    rs->mean_var = (rs->mu > 1.0) ? rs->mu: 1.0;
+  }
+}
+
+/* proc_hist_n() - unscaled (AVE_STATS) statistics: mean and variance of the
+ * sampled scores with no length correction.
+ *
+ * Scores in 1..max_hscore (from calc_thresh()) are accumulated, then up to
+ * five trimming passes remove entries whose find_zn() z-score is below 20
+ * or above the calc_thresh() limit.  A trimmed entry is marked by negating
+ * sptr[i].n1, so the caller's array is modified.  Trimming stops early
+ * once a pass removes fewer than LHISTC entries.
+ *
+ * Only scores that the first pass actually accumulated are backed out of
+ * the running sums, the count and histp->entries; previously an entry with
+ * score <= 0 or score > max_hscore was subtracted although it had never
+ * been added, which skewed mu and mean_var.  Such entries are still
+ * counted in rs->n_trimmed and still marked.
+ *
+ * do_trim is accepted for signature compatibility and not consulted.
+ * Writes the summary into histp->stat_info and returns AVE_STATS.
+ */
+int
+proc_hist_n(struct stat_str *sptr, int nstats,
+            struct pstruct *ppst, struct hist_str *histp,
+            int do_trim, struct pstat_str *rs)
+{
+  int i, j;
+  double s_score, s2_score, ssd;
+  double ztrim;
+  int nit, max_hscore;
+  char s_string[128];
+  char *f_string;
+
+  f_string = &(histp->stat_info[0]);
+
+  max_hscore = calc_thresh(ppst, nstats, rs->ngLambda,
+                           rs->ngK, rs->ngH, &ztrim);
+
+  s_score = s2_score = 0.0;
+
+  histp->entries = 0;
+
+  for ( j = 0, i = 0; i < nstats; i++) {
+    if (sptr[i].score > 0 && sptr[i].score <= max_hscore) {
+      s_score += (ssd=(double)sptr[i].score);
+      s2_score += ssd * ssd;
+      histp->entries++;
+      j++;
+    }
+  }
+
+  proc_hist_n_moments(rs, s_score, s2_score, j);
+
+  /* now remove some scores */
+
+  nit = 5;
+  while (nit-- > 0) {
+    rs->n_trimmed = 0;
+
+    for (i=0; i< nstats; i++) {
+      if (sptr[i].n1 < 0) continue;     /* already trimmed */
+      ssd = find_zn(sptr[i].score,sptr[i].escore,sptr[i].n1,sptr[i].comp, rs);
+      if (ssd > ztrim || ssd < 20.0) {
+        /* back out only what the accumulation loop above put in */
+        if (sptr[i].score > 0 && sptr[i].score <= max_hscore) {
+          ssd = sptr[i].score;
+          s_score -= ssd;
+          s2_score -= ssd*ssd;
+          j--;
+          histp->entries--;
+        }
+        rs->n_trimmed++;
+        sptr[i].n1 = -sptr[i].n1;
+      }
+    }
+
+    proc_hist_n_moments(rs, s_score, s2_score, j);
+
+    if (rs->n_trimmed < LHISTC) {
+      break;
+    }
+  }
+
+  if (ppst->zsflag < 10) s_string[0]='\0';
+  else if (ppst->zs_win > 0)
+    sprintf(s_string,"(shuffled, win: %d)",ppst->zs_win);
+  else strncpy(s_string,"(shuffled)",sizeof(s_string));
+
+  sprintf(f_string,"%s unscaled statistics: mu= %6.4f var=%6.4f; Lambda= %6.4f",
+          s_string, rs->mu,rs->mean_var,PI_SQRT6/sqrt(rs->mean_var));
+  return AVE_STATS;
+}
+
+
+/*
+This routine calculates the maximum likelihood estimates for the
+extreme value distribution exp(-exp(-(-x-a)/b)) using the formula
+
+ <lambda> = x_m - sum{ x[i] * exp (-x[i]<lambda>)}/sum{exp (-x[i]<lambda>)}
+ <a> = -<1/lambda> log ( (1/nlib) sum { exp(-x[i]/<lambda> } )
+
+ The <a> parameter can be transformed into the K
+ of the formula: 1 - exp ( - K m n exp ( - lambda S ))
+ using the transformation: 1 - exp ( -exp -(lambda S + log(K m n) ))
+ 1 - exp ( -exp( - lambda ( S + log(K m n) / lambda))
+
+ a = log(K m n) / lambda
+ a lambda = log (K m n)
+ exp(a lambda) = K m n
+ but from above: a lambda = log (1/nlib sum{exp( -x[i]*lambda)})
+ so: K m n = (1/n sum{ exp( -x[i] *lambda)})
+ K = sum{}/(nlib m n )
+
+*/
+
+/* alloc_hist() - make sure the four per-bin arrays of *llen exist and are
+ * zeroed.
+ *
+ * The arrays hold llen->max+1 entries.  They are allocated only when
+ * llen->hist is NULL, so the caller must initialize llen->hist before the
+ * first call; on later calls the existing arrays are simply cleared.
+ * An allocation failure is fatal (message on stderr, exit(1)), matching
+ * the other allocations in this file, instead of being dereferenced.
+ */
+void
+alloc_hist(struct llen_str *llen)
+{
+  int max_llen, i;
+  size_t n_bins;
+
+  max_llen = llen->max;
+  n_bins = (size_t)(max_llen+1);
+
+  if (llen->hist == NULL) {
+    llen->hist = (int *)calloc(n_bins,sizeof(int));
+    llen->score_sums = (double *)calloc(n_bins,sizeof(double));
+    llen->score2_sums = (double *)calloc(n_bins,sizeof(double));
+    llen->score_var = (double *)calloc(n_bins,sizeof(double));
+
+    if (llen->hist == NULL || llen->score_sums == NULL ||
+        llen->score2_sums == NULL || llen->score_var == NULL) {
+      fprintf(stderr," cannot allocate %d length bins\n",max_llen+1);
+      exit(1);
+    }
+  }
+
+  /* needed when the arrays are being reused */
+  for (i=0; i< max_llen+1; i++) {
+    llen->hist[i] = 0;
+    llen->score_var[i] = llen->score_sums[i] = llen->score2_sums[i] = 0.0;
+  }
+}
+
+/* free_hist() - release the per-bin arrays of *llen, if any, and mark them
+   released by resetting llen->hist to NULL */
+void
+free_hist(struct llen_str *llen)
+{
+  if (llen->hist == NULL) return;
+
+  free(llen->score_var);
+  free(llen->score2_sums);
+  free(llen->score_sums);
+  free(llen->hist);
+  llen->hist = NULL;
+}
+
+/* inithist() - prepare *llen for a new round of addhist() calls: MAX_LLEN
+   bins, cleared arrays, and reset score/length extremes and reject count.
+   ppst and max_hscore are accepted but not used. */
+void
+inithist(struct llen_str *llen, struct pstruct *ppst, int max_hscore)
+{
+  llen->max = MAX_LLEN;         /* alloc_hist() sizes the arrays from this */
+  alloc_hist(llen);
+
+  llen->min_score = 10000;
+  llen->max_score = -1;
+
+  llen->min_length = 10000;
+  llen->max_length = 0;
+
+  llen->zero_s = 0;
+}
+
+/* addhist() - add one (score, length) observation to the length bins.
+ *
+ * Entries with score <= 0 or length < LENGTH_CUTOFF are not binned: they
+ * bump llen->zero_s and force llen->min_score to 0.  For accepted entries
+ * the score and length extremes are updated from the raw values, then the
+ * score is capped at max_hscore before it is added to the bin selected by
+ * (int)(LN_FACT*ln(length)+0.5), clamped to 0..llen->max.
+ */
+void
+addhist(struct llen_str *llen, int score, int length, int max_hscore)
+{
+  int bin;
+  double capped;
+
+  if (!(score > 0 && length >= LENGTH_CUTOFF)) {
+    llen->min_score = 0;
+    llen->zero_s++;
+    return;
+  }
+
+  if (score < llen->min_score) llen->min_score = score;
+  if (score > llen->max_score) llen->max_score = score;
+
+  if (length > llen->max_length) llen->max_length = length;
+  if (length < llen->min_length) llen->min_length = length;
+
+  bin = (int)(LN_FACT*log((double)length)+0.5);
+  if (bin < 0) bin = 0;
+  if (bin > llen->max) bin = llen->max;
+
+  capped = (double)((score > max_hscore) ? max_hscore : score);
+
+  llen->hist[bin]++;
+  llen->score_sums[bin] += capped;
+  llen->score2_sums[bin] += capped * capped;
+}
+
+/* histogram will go from z-scores of 20 .. 120 (min_hist/max_hist below); z-scores are scaled to mean 50 and s.d. 10 */
+
+
+/* inithistz() - set up the z-score histogram in *histp.
+ *
+ * The range is fixed at 20..120; the bin width (histint) and bin count
+ * (maxh) are derived from the requested number of bins mh.  The counter
+ * array is allocated on first use, which also sets histp->histflg (1 on
+ * success, 0 with a message on failure); an existing array is cleared and
+ * histflg is left alone.
+ */
+void
+inithistz(int mh, struct hist_str *histp )
+{
+  int bin;
+
+  histp->min_hist = 20;
+  histp->max_hist = 120;
+
+  histp->histint = (int)
+    ((double)(histp->max_hist - histp->min_hist + 2)/(double)mh+0.5);
+  histp->maxh = (int)
+    ((double)(histp->max_hist - histp->min_hist + 2)/(double)histp->histint+0.5);
+
+  if (histp->hist_a != NULL) {
+    for (bin = 0; bin < histp->maxh; bin++) {
+      histp->hist_a[bin] = 0;
+    }
+    return;
+  }
+
+  histp->hist_a = (int *)calloc((size_t)histp->maxh,sizeof(int));
+  if (histp->hist_a == NULL) {
+    fprintf(stderr," cannot allocate %d for histogram\n",histp->maxh);
+    histp->histflg = 0;
+  }
+  else {
+    histp->histflg = 1;
+  }
+}
+
+/* fasts/f will not show any histogram */
+/* addhistz() - deliberately does nothing in this version: no z-score is
+   recorded in the histogram */
+void
+addhistz(double zs, struct hist_str *histp)
+{
+  (void)zs;
+  (void)histp;
+}
+
+/* prune_hist() - undo one addhist() call for (score, length).
+ *
+ * Entries that addhist() would have rejected (score <= 0 or length below
+ * LENGTH_CUTOFF) are ignored.  Otherwise the capped score is removed from
+ * the bin chosen exactly as in addhist() and *entries is decremented.
+ */
+void
+prune_hist(struct llen_str *llen, int score, int length, int max_hscore,
+           long *entries)
+{
+  int bin;
+  double capped;
+
+  if (!(score > 0 && length >= LENGTH_CUTOFF)) return;
+
+  bin = (int)(LN_FACT*log((double)length)+0.5);
+  if (bin < 0) bin = 0;
+  if (bin > llen->max) bin = llen->max;
+
+  capped = (double)((score > max_hscore) ? max_hscore : score);
+
+  llen->hist[bin]--;
+  llen->score_sums[bin] -= capped;
+  llen->score2_sums[bin] -= capped * capped;
+
+  (*entries)--;
+}
+
+/* fit_llen: no trimming
+ (1) regress scores vs log(n) using weighted variance
+ (2) calculate mean variance after length regression
+
+ Reads the per-bin counts and sums in *llen (bins are indexed by
+ LN_FACT*ln(length), see addhist()) and writes rho, rho_e, mu, mu_e,
+ mean_var, var_e, rho2, mu2, var_cutoff and nb_tot into *pr.
+
+ llen->fit_flag is cleared when fewer than 5 bins hold more than one
+ score, when the regression determinant is not above 0.001, or when the
+ mean residual variance is not above 0.01.  In the first two cases the
+ function returns early with rho = 0 and mu set to the weighted mean.
+*/
+
+void
+fit_llen(struct llen_str *llen, struct pstat_str *pr)
+{
+ int j;
+ int n;
+ int n_size;
+ double x, y2, u, z;
+ double mean_x, mean_y, var_x, var_y, covar_xy;
+ double mean_y2, covar_xy2, var_y2, dllj;
+
+ double sum_x, sum_y, sum_x2, sum_xy, sum_v, delta, n_w;
+
+/* now fit scores to best linear function of log(n), using
+ simple linear regression */
+
+ for (llen->min=0; llen->min < llen->max; llen->min++)
+ if (llen->hist[llen->min]) break;
+ llen->min--; /* start one bin before the first occupied one; addhist() only
+                 bins lengths >= LENGTH_CUTOFF, whose bin index is at least
+                 (int)(LN_FACT*ln(10)+0.5) = 23, so this stays >= 0 */
+
+ for (n_size=0,j = llen->min; j < llen->max; j++) {
+ if (llen->hist[j] > 1) {
+ dllj = (double)llen->hist[j];
+ llen->score_var[j] = llen->score2_sums[j]/dllj
+ - (llen->score_sums[j]/dllj)*(llen->score_sums[j]/dllj);
+ llen->score_var[j] /= (double)(llen->hist[j]-1);
+ if (llen->score_var[j] <= 0.1 ) llen->score_var[j] = 0.1; /* floor keeps the 1/score_var weights finite */
+ n_size++; /* counts bins holding more than one score */
+ }
+ }
+
+ pr->nb_tot = n_size;
+
+ /* weighted sums for the regression: x is the bin centre, every term is
+    weighted by 1/score_var[j] */
+ n_w = 0.0;
+ sum_x = sum_y = sum_x2 = sum_xy = sum_v = 0;
+ for (j = llen->min; j < llen->max; j++)
+ if (llen->hist[j] > 1) {
+ x = j + 0.5;
+ dllj = (double)llen->hist[j];
+ n_w += dllj/llen->score_var[j];
+ sum_x += dllj * x / llen->score_var[j] ;
+ sum_y += llen->score_sums[j] / llen->score_var[j];
+ sum_x2 += dllj * x * x /llen->score_var[j];
+ sum_xy += x * llen->score_sums[j]/llen->score_var[j];
+ }
+
+ if (n_size < 5 ) { /* too few usable bins for a regression */
+ llen->fit_flag=0;
+ pr->rho = 0;
+ pr->mu = sum_y/n_w; /* NOTE(review): n_w is 0.0 when no bin qualified, giving NaN */
+ return;
+ }
+ else {
+ delta = n_w * sum_x2 - sum_x * sum_x;
+ if (delta > 0.001) {
+ pr->rho = (n_w * sum_xy - sum_x * sum_y)/delta;
+ pr->rho_e = n_w/delta;
+ pr->mu = (sum_x2 * sum_y - sum_x * sum_xy)/delta;
+ pr->mu_e = sum_x2/delta;
+ }
+ else { /* (nearly) singular system: no usable slope */
+ llen->fit_flag = 0;
+ pr->rho = 0;
+ pr->mu = sum_y/n_w;
+ return;
+ }
+ }
+
+ /* repeats the delta/rho/mu computation from the branch above with the
+    same inputs */
+ delta = n_w * sum_x2 - sum_x * sum_x;
+ pr->rho = (n_w * sum_xy - sum_x * sum_y)/delta;
+ pr->mu = (sum_x2 * sum_y - sum_x * sum_xy)/delta;
+
+ n = 0;
+ mean_x = mean_y = mean_y2 = 0.0;
+ var_x = var_y = 0.0;
+ covar_xy = covar_xy2 = 0.0;
+
+ /* unweighted moments over the same bins; note this loop (and the next)
+    includes bin llen->max, while the loops above stop before it */
+ for (j = llen->min; j <= llen->max; j++)
+ if (llen->hist[j] > 1 ) {
+ n += llen->hist[j];
+ x = (double)j + 0.5;
+ mean_x += (double)llen->hist[j] * x;
+ mean_y += llen->score_sums[j];
+ var_x += (double)llen->hist[j] * x * x;
+ var_y += llen->score2_sums[j];
+ covar_xy += x * llen->score_sums[j];
+ }
+ mean_x /= n; mean_y /= n;
+ var_x = var_x / n - mean_x * mean_x;
+ var_y = var_y / n - mean_y * mean_y;
+
+ covar_xy = covar_xy / n - mean_x * mean_y;
+/*
+ pr->rho = covar_xy / var_x;
+ pr->mu = mean_y - pr->rho * mean_x;
+*/
+ mean_y2 = covar_xy2 = var_y2 = 0.0;
+ for (j = llen->min; j <= llen->max; j++)
+ if (llen->hist[j] > 1) {
+ x = (double)j + 0.5;
+ u = pr->rho * x + pr->mu; /* score predicted for this bin */
+ y2 = llen->score2_sums[j] - 2.0 * llen->score_sums[j] * u + llen->hist[j] * u * u; /* sum over the bin of (score - u)^2 */
+/*
+ dllj = (double)llen->hist[j];
+ fprintf(stderr,"%.2f\t%d\t%g\t%g\n",x/LN_FACT,llen->hist[j],
+ llen->score_sums[j]/dllj,y2/dllj);
+*/
+ mean_y2 += y2;
+ var_y2 += y2 * y2;
+ covar_xy2 += x * y2;
+ /* fprintf(stderr,"%6.1f %4d %8d %8d %7.2f %8.2f\n",
+ x,llen->hist[j],llen->score_sums[j],llen->score2_sums[j],u,y2); */
+ }
+
+ pr->mean_var = mean_y2 /= (double)n; /* mean squared residual about the fitted line */
+ covar_xy2 = covar_xy2 / (double)n - mean_x * mean_y2;
+
+ if (pr->mean_var <= 0.01) { /* degenerate variance: flag the fit and substitute max(mu, 1.0) */
+ llen->fit_flag = 0;
+ pr->mean_var = (pr->mu > 1.0) ? pr->mu: 1.0;
+ }
+
+ /*
+ fprintf(stderr," rho1/mu1: %.4f/%.4f mean_var %.4f\n",
+ pr->rho*LN_FACT,pr->mu,pr->mean_var);
+ */
+ if (n > 1) pr->var_e = (var_y2/n - mean_y2 * mean_y2)/(n-1);
+ else pr->var_e = 0.0;
+
+ /* second regression: residual variance as a linear function of x */
+ if (llen->fit_flag) {
+ pr->rho2 = covar_xy2 / var_x;
+ pr->mu2 = pr->mean_var - pr->rho2 * mean_x;
+ }
+ else {
+ pr->rho2 = 0;
+ pr->mu2 = pr->mean_var;
+ }
+
+ /* z is the sequence length at which the predicted variance is evaluated
+    to obtain var_cutoff; it is never taken below 2*LENGTH_CUTOFF */
+ if (pr->rho2 < 0.0 )
+ z = (pr->rho2 * LN_FACT*log((double)llen->max_length) + pr->mu2 > 0.0) ? llen->max_length : exp((-1.0 - pr->mu2 / pr->rho2)/LN_FACT);
+ else z = pr->rho2 ? exp((1.0 - pr->mu2 / pr->rho2)/LN_FACT) : LENGTH_CUTOFF;
+ if (z < 2*LENGTH_CUTOFF) z = 2*LENGTH_CUTOFF;
+
+ pr->var_cutoff = pr->rho2 * LN_FACT*log(z) + pr->mu2;
+}
+
+/* fit_llens: trim high variance bins
+ (1) regress scores vs log(n) using weighted variance
+ (2) regress residuals vs log(n)
+ (3) remove high variance bins
+ (4) calculate mean variance after length regression
+
+ Works on the bins in *llen and updates *pr.  Bins whose deviation from
+ the predicted variance exceeds three times the root-mean value over all
+ usable bins have their count zeroed, which excludes them from later
+ fits; pr->nb_trimmed and pr->n1_trimmed count the removed bins and the
+ scores they held.  Step (4) is done by calling fit_llen() at the end.
+ Unlike fit_llen(), the first regression here has no guard on the number
+ of usable bins or on the size of the determinant.
+*/
+
+void
+fit_llens(struct llen_str *llen, struct pstat_str *pr)
+{
+ int j;
+ int n, n_u2;
+ double x, y, y2, u, u2, v, z;
+ double mean_x, mean_y, var_x, var_y, covar_xy;
+ double mean_y2, covar_xy2;
+ double mean_u2, mean_3u2, dllj;
+ double sum_x, sum_y, sum_x2, sum_xy, sum_v, delta, n_w;
+
+/* now fit scores to best linear function of log(n), using
+ simple linear regression */
+
+ for (llen->min=0; llen->min < llen->max; llen->min++)
+ if (llen->hist[llen->min]) break;
+ llen->min--; /* start one bin before the first occupied one */
+
+ for (j = llen->min; j < llen->max; j++) {
+ if (llen->hist[j] > 1) {
+ dllj = (double)llen->hist[j];
+ llen->score_var[j] = (double)llen->score2_sums[j]/dllj
+ - (llen->score_sums[j]/dllj)*(llen->score_sums[j]/dllj);
+ llen->score_var[j] /= (double)(llen->hist[j]-1);
+ if (llen->score_var[j] <= 1.0 ) llen->score_var[j] = 1.0; /* floor is 1.0 here, 0.1 in fit_llen() */
+ }
+ }
+
+ /* weighted sums for the regression: x is the bin centre, every term is
+    weighted by 1/score_var[j] */
+ n_w = 0.0;
+ sum_x = sum_y = sum_x2 = sum_xy = sum_v = 0;
+ for (j = llen->min; j < llen->max; j++)
+ if (llen->hist[j] > 1) {
+ x = j + 0.5;
+ dllj = (double)llen->hist[j];
+ n_w += dllj/llen->score_var[j];
+ sum_x += dllj * x / llen->score_var[j] ;
+ sum_y += llen->score_sums[j] / llen->score_var[j];
+ sum_x2 += dllj * x * x /llen->score_var[j];
+ sum_xy += x * llen->score_sums[j]/llen->score_var[j];
+ }
+
+ delta = n_w * sum_x2 - sum_x * sum_x; /* NOTE(review): not checked against zero before the divisions below */
+ pr->rho = (n_w * sum_xy - sum_x * sum_y)/delta;
+ pr->mu = (sum_x2 * sum_y - sum_x * sum_xy)/delta;
+
+/* printf(" rho1/mu1: %.2f/%.2f\n",pr->rho*LN_FACT,pr->mu); */
+
+ n = 0;
+ mean_x = mean_y = mean_y2 = 0.0;
+ var_x = var_y = 0.0;
+ covar_xy = covar_xy2 = 0.0;
+
+ /* unweighted moments over the same bins (this loop includes bin llen->max) */
+ for (j = llen->min; j <= llen->max; j++)
+ if (llen->hist[j] > 1 ) {
+ n += llen->hist[j];
+ x = (double)j + 0.5;
+ dllj = (double)llen->hist[j];
+ mean_x += dllj * x;
+ mean_y += llen->score_sums[j];
+ var_x += dllj * x * x;
+ var_y += llen->score2_sums[j];
+ covar_xy += x * llen->score_sums[j];
+ }
+ mean_x /= n; mean_y /= n;
+ var_x = var_x / n - mean_x * mean_x;
+ var_y = var_y / n - mean_y * mean_y;
+
+ covar_xy = covar_xy / n - mean_x * mean_y;
+/* pr->rho = covar_xy / var_x;
+ pr->mu = mean_y - pr->rho * mean_x;
+*/
+
+ mean_y2 = covar_xy2 = 0.0;
+ for (j = llen->min; j <= llen->max; j++)
+ if (llen->hist[j] > 1) {
+ x = (double)j + 0.5;
+ u = pr->rho * x + pr->mu; /* score predicted for this bin */
+ y2 = llen->score2_sums[j] - 2 * llen->score_sums[j] * u + llen->hist[j] * u * u; /* sum over the bin of (score - u)^2 */
+ mean_y2 += y2;
+ covar_xy2 += x * y2;
+ }
+
+ /* second regression: residual variance as a linear function of x */
+ mean_y2 /= n;
+ covar_xy2 = covar_xy2 / n - mean_x * mean_y2;
+ pr->rho2 = covar_xy2 / var_x;
+ pr->mu2 = mean_y2 - pr->rho2 * mean_x;
+
+ /* z is the sequence length at which the predicted variance is evaluated
+    to obtain var_cutoff; it is never taken below 2*LENGTH_CUTOFF */
+ if (pr->rho2 < 0.0 )
+ z = (pr->rho2 * LN_FACT*log((double)llen->max_length) + pr->mu2 > 0.0) ? llen->max_length : exp((-1.0 - pr->mu2 / pr->rho2)/LN_FACT);
+ else z = pr->rho2 ? exp((1.0 - pr->mu2 / pr->rho2)/LN_FACT) : LENGTH_CUTOFF;
+ if (z < 2* LENGTH_CUTOFF) z = 2*LENGTH_CUTOFF;
+
+ pr->var_cutoff = pr->rho2*LN_FACT*log(z) + pr->mu2;
+
+/* fprintf(stderr,"\nminimum allowed predicted variance (%0.2f) at n = %.0f\n",
+ pr->var_cutoff,z);
+*/
+ /* score_var[j] is reused from here on: it now holds the squared gap
+    between each bin's observed and predicted residual sums, or -1.0
+    for bins with at most one score */
+ mean_u2 = 0.0;
+ n_u2 = 0;
+ for ( j = llen->min; j < llen->max; j++) {
+ y = j+0.5;
+ dllj = (double)llen->hist[j];
+ x = pr->rho * y + pr->mu; /* predicted mean score for the bin */
+ v = pr->rho2 * y + pr->mu2; /* predicted variance for the bin, floored at var_cutoff */
+ if (v < pr->var_cutoff) v = pr->var_cutoff;
+ if (llen->hist[j]> 1) {
+ u2 = (llen->score2_sums[j] - 2 * x * llen->score_sums[j] + dllj * x * x) - v*dllj;
+ mean_u2 += llen->score_var[j] = u2*u2/(llen->hist[j]-1);
+ n_u2++;
+ /* fprintf(stderr," %d (%d) u2: %.2f v*ll: %.2f %.2f\n",
+ j,llen->hist[j],u2,v*dllj,sqrt(llen->score_var[j])); */
+ }
+ else llen->score_var[j] = -1.0;
+ }
+
+ mean_u2 = sqrt(mean_u2/(double)n_u2); /* NOTE(review): n_u2 is 0 when no bin holds more than one score */
+ /* fprintf(stderr," mean s.d.: %.2f\n",mean_u2); */
+
+ mean_3u2 = mean_u2*3.0;
+
+ for (j = llen->min; j < llen->max; j++) {
+ if (llen->hist[j] <= 1) continue;
+ if (sqrt(llen->score_var[j]) > mean_3u2) {
+ /* fprintf(stderr," removing %d %d %.2f\n",
+ j, (int)(exp((double)j/LN_FACT)-0.5),
+ sqrt(llen->score_var[j]));
+ */
+ pr->nb_trimmed++;
+ pr->n1_trimmed += llen->hist[j];
+ llen->hist[j] = 0; /* a zero count keeps the bin out of every later hist[j] > 1 test */
+ }
+ }
+ fit_llen(llen, pr);
+}
+
+
+/* find_z() - public entry point: forwards to whichever z-score function
+   find_zp currently points at */
+double find_z(int score, double escore, int length, double comp, struct pstat_str *pu) {
+  double zs;
+
+  zs = (*find_zp)(score, escore, length, comp, pu);
+  return zs;
+}
+
+/* find_zr() - REG_STATS z-score: the score minus the value predicted from
+   rho and mu at LN_FACT*ln(length), divided by sqrt(mean_var) and scaled
+   to mean 50, s.d. 10.  Returns 0.0 for score <= 0 or for lengths below
+   LENGTH_CUTOFF.  escore and comp are not used. */
+double find_zr(int score, double escore, int length, double comp,
+               struct pstat_str *rs)
+{
+  double scaled_ln, dev;
+
+  if (score <= 0 || length < LENGTH_CUTOFF) return 0.0;
+
+  scaled_ln = LN_FACT*log((double)(length));
+  dev = ((double)score - rs->rho * scaled_ln - rs->mu) / sqrt(rs->mean_var);
+
+  return (50.0 + dev*10.0);
+}
+
+/* find_zt() - z-score surrogate from an escore: -log2(escore).  When
+   rs->eval_is_pval is 0 the escore is first divided by rs->zdb_size.
+   A non-positive escore yields the fixed ceiling 744.440071/ln(2).
+   score, length and comp are not used. */
+double find_zt(int score, double escore, int length, double comp,
+               struct pstat_str *rs)
+{
+  if (rs->eval_is_pval == 0) {
+    escore = escore / rs->zdb_size;
+  }
+
+  return (escore > 0.0) ? -log(escore)/M_LN2 : 744.440071/M_LN2;
+}
+
+/* find_zn() - AVE_STATS z-score: (score - mu)/sqrt(mean_var) scaled to
+   mean 50, s.d. 10, with no length correction.  escore, length and comp
+   are not used. */
+double find_zn(int score, double escore, int length, double comp,
+               struct pstat_str *rs)
+{
+  double dev = ((double)score - rs->mu) / sqrt(rs->mean_var);
+
+  return (50.0 + dev*10.0);
+}
+
+/* z_to_E() - expectation for a z value under the extreme value
+   distribution: n * (1 - exp(-e)) with e = exp(-PI_SQRT6*zs - EULER_G),
+   where n is entries, or db.entries when entries < 1.  For e <= .01 the
+   value n*e is returned instead; zs above ZS_MAX gives 0.0. */
+double
+z_to_E(double zs, long entries, struct db_str db)
+{
+  double tail, n_seq;
+
+  if (zs > ZS_MAX) return 0.0;
+
+  if (entries < 1) { n_seq = db.entries; }
+  else { n_seq = entries; }
+
+  tail = exp(-PI_SQRT6 * zs - EULER_G);
+  if (tail > .01) {
+    return n_seq * (1.0 - exp(-tail));
+  }
+  return n_seq * tail;
+}
+
+/* zs_to_p() - identity mapping: the value passed in is returned unchanged */
+double
+zs_to_p(double zs)
+{
+  double p = zs;
+
+  return p;
+}
+
+/* this version assumes the probability is in the ->zscore variable,
+ which is provided by this file after last_scale()
+*/
+
+/* zs_to_bit() - add log2(n0*n1) to zs.
+ *
+ *   zs      - value to convert
+ *   n0, n1  - the two sequence lengths
+ *
+ * The product is formed in double precision.  Multiplying the two ints
+ * first overflows once n0*n1 exceeds INT_MAX (for example two sequences
+ * of 100,000 residues), which is undefined behaviour and gave a wrong
+ * logarithm.
+ */
+double
+zs_to_bit(double zs, int n0, int n1)
+{
+  return zs+log((double)n0*(double)n1)/M_LN2 ;
+}
+
+/* zs_to_E() - E-value for a bit-like zs value: k * exp(-zs*ln(2)).
+ *
+ *   zs       - value to convert; above ZS_MAX the result is 0.0
+ *   n1       - length of the library sequence (DNA/RNA scaling only)
+ *   dnaseq   - sequence type; SEQT_DNA/SEQT_RNA select length scaling
+ *   entries  - number of library entries; db.entries is used when < 1
+ *   db       - library totals (entries, length, carry)
+ *
+ * For DNA/RNA k is the total library length divided by n1, otherwise it
+ * is the number of entries; k is never taken below 1.0.
+ *
+ * The total length is db.length + db.carry*LONG_MAX, following the
+ * commented-out carry bookkeeping elsewhere in this file (carry is bumped
+ * and LONG_MAX subtracted from length on overflow).  The carry term is
+ * therefore added to the length; it used to be multiplied into k.
+ */
+double
+zs_to_E(double zs,int n1, int dnaseq, long entries, struct db_str db)
+{
+  double e, k;
+
+  if (zs > ZS_MAX ) return 0.0;
+
+  if (entries < 1) entries = db.entries;
+
+  if (dnaseq == SEQT_DNA || dnaseq == SEQT_RNA) {
+    k = (double)db.length;
+    if (db.carry > 0) { k += (double)db.carry * (double)LONG_MAX;}
+    k /= (double)n1;
+  }
+  else k = (double)entries;
+
+  if (k < 1.0) k = 1.0;
+
+  zs *= M_LN2;
+  if ( zs > 100.0) e = 0.0;     /* exp(-zs) is negligible beyond this point */
+  else e = exp(-zs);
+  return k * e;
+}
+
+/* E_to_zs() - scaled z-score (mean 50, s.d. 10) for an expectation E over
+   `entries` sequences, assuming an extreme value distribution (or, with
+   NORMAL_DIST, via np_to_z(), returning 0.0 when that reports an error) */
+double
+E_to_zs(double E, long entries)
+{
+  double z;
+  double per_seq = E/(double)entries;
+#ifdef NORMAL_DIST
+  int err;
+
+  z = np_to_z(1.0-per_seq,&err);
+
+  return err ? 0.0 : z*10.0+50.0;
+#else
+
+  z = (log(per_seq)+EULER_G)/(-PI_SQRT6);
+  return z*10.0+50.0;
+#endif
+}
+
+/* zs_to_Ec() - complement of the expectation for a scaled z-score under
+   the extreme value distribution: entries * exp(-e), with
+   e = exp(-PI_SQRT6*z - EULER_G) and z = (zs-50)/10.  For e <= .01 the
+   value entries*(1-e) is used.  Returns 0.0 when entries < 5 and 1.0 when
+   z exceeds ZS_MAX. */
+double
+zs_to_Ec(double zs, long entries)
+{
+  double tail, z_unit, frac;
+
+  if (entries < 5) return 0.0;
+
+  z_unit = (zs - 50.0)/10.0;
+  if (z_unit > ZS_MAX) return 1.0;
+
+  tail = exp(-PI_SQRT6 * z_unit - EULER_G);
+  if (tail > .01) frac = exp(-tail);
+  else frac = 1.0 - tail;
+
+  return (double)entries * frac;
+}
+
+/* E1_to_s() - integer score surrogate for an expectation: -log10(e_val)
+   truncated to int, and not below 0 unless NORMAL_DIST is defined.
+   Returns 0 when n1 is below LENGTH_CUTOFF.  n0, db_size and pu are not
+   used. */
+int
+E1_to_s(double e_val, int n0, int n1, int db_size,
+        void *pu) {
+  int s_val;
+
+  if (n1 < LENGTH_CUTOFF) return 0;
+
+  s_val = (int)(-log(e_val)/log(10.0));
+
+#ifndef NORMAL_DIST
+  if (s_val < 0) s_val = 0;
+#endif
+  return s_val;
+}
+
+/* sort_escore() - sort v[0..n-1] into ascending order in place with a
+   shell sort (gap halves each round) */
+void
+sort_escore(double *v, int n)
+{
+  int step, hi, lo;
+  double held;
+
+  for (step = n/2; step > 0; step /= 2) {
+    for (hi = step; hi < n; hi++) {
+      lo = hi - step;
+      /* bubble the out-of-order pair down through this gap sequence */
+      while (lo >= 0 && !(v[lo] <= v[lo+step])) {
+        held = v[lo];
+        v[lo] = v[lo+step];
+        v[lo+step] = held;
+        lo -= step;
+      }
+    }
+  }
+}
+
+/* scale_tat - compute 'a', 'b', 'c' coefficients for scaling fasts/f
+   escores
+   5-May-2003 - also calculate index for high ties
+
+   escore     - escores, expected in ascending order (the caller sorts
+                them); leading negative values are skipped
+   nstats     - number of values in escore[]
+   db_entries - number of library entries used to scale the ranks
+   do_trim    - when set, the first 5% of the values (at most 500) are
+                left out of the fit
+   rs         - receives tat_a, tat_b, tat_c and tie_j
+
+   Runs of equal escores share the x value of the middle of the run.
+   The skip over negative escores is bounded by nstats, so an array that
+   holds only negative values is no longer read past its end; in that
+   case nothing is fitted and tat_a/b/c are left as they were.
+   An allocation failure is fatal.
+*/
+void
+scale_tat(double *escore, int nstats,
+          long db_entries, int do_trim,
+          struct pstat_str *rs)
+{
+  int i, j, k, start;
+  double *x, *lnx, *lny;
+
+  /* sort_escore(escore, nstats); */
+
+  while (nstats > 0 && *escore<0.0) {escore++; nstats--; }
+
+  if (nstats <= 0) {
+    /* no usable escores: skip the fit but still set the tie index */
+    rs->tie_j = 0.005 * db_entries;
+    return;
+  }
+
+  x = (double *) calloc(nstats, sizeof(double));
+  if(x == NULL) {
+    fprintf(stderr, "Couldn't calloc tatE/x\n");
+    exit(1);
+  }
+
+  lnx = (double *) calloc(nstats,sizeof(double));
+  if(lnx == NULL) {
+    fprintf(stderr, "Couldn't calloc tatE/lnx\n");
+    exit(1);
+  }
+
+  lny = (double *) calloc(nstats,sizeof(double));
+  if(lny == NULL) {
+    fprintf(stderr, "Couldn't calloc tatE/lny\n");
+    exit(1);
+  }
+
+  for(i = 0 ; i < nstats ; ) {
+
+    lny[i] = log(escore[i]);
+
+    /* j ends one past the run of values equal to escore[i] */
+    for(j = i+1 ; j < nstats ; j++) {
+      if(escore[j] != escore[i]) break;
+    }
+
+    x[i] = ((((double)i + (double)(j - i - 1)/2.0)*(double)nstats/(double)db_entries)+1.0)/(double)nstats;
+    lnx[i] = log(x[i]);
+
+    /* every member of the run gets the same x, ln(x) and ln(y) */
+    for(k = i+1 ; k < j ; k++) {
+      lny[k]=lny[i];
+      x[k] = x[i];
+      lnx[k]=lnx[i];
+    }
+    i = k;
+  }
+
+  if (!do_trim) {
+    start = 0;
+  } else {
+    start = 0.05 * (double) nstats;
+    start = start > 500 ? 500 : start;
+  }
+
+  linreg(lny, x, lnx, nstats, &rs->tat_a, &rs->tat_b, &rs->tat_c, start);
+
+  /* I have the coefficients I need - a, b, c; free arrays */
+
+  free(lny);
+  free(lnx);
+  free(x);
+
+  /* calculate tie_j - the index below which all scores are considered
+     positional ties */
+
+  rs->tie_j = 0.005 * db_entries;
+}
+
+/* linreg() - least-squares fit of lny[i] to a*lnx[i] + b*x[i] + c over
+   i = start .. n-1, solved with Cramer's rule on the 3x3 normal
+   equations.  The coefficients are returned through a, b and c; the
+   determinant of the system is not checked for zero. */
+void
+linreg(double *lny, double *x, double *lnx, int n,
+       double *a, double *b, double *c, int start) {
+
+  double s_yl = 0.0, s_yx = 0.0, s_y = 0.0;     /* sums of y*lnx, y*x, y */
+  double s_ll = 0.0, s_lx = 0.0, s_l = 0.0;     /* sums of lnx*lnx, lnx*x, lnx */
+  double s_xx = 0.0, s_x = 0.0;                 /* sums of x*x, x */
+  double s_1 = 0.0;                             /* number of points, as a double */
+  double sys_det;
+  int idx;
+
+  idx = start;
+  while (idx < n) {
+    s_yl += lny[idx] * lnx[idx];
+    s_yx += lny[idx] * x[idx];
+    s_y += lny[idx];
+
+    s_ll += lnx[idx] * lnx[idx];
+    s_lx += lnx[idx] * x[idx];
+    s_l += lnx[idx];
+
+    s_xx += x[idx] * x[idx];
+    s_x += x[idx];
+
+    s_1 += 1.0;
+    idx++;
+  }
+
+  sys_det = det(s_ll, s_lx, s_l, s_lx, s_xx, s_x, s_l, s_x, s_1);
+
+  *a = det(s_yl, s_lx, s_l, s_yx, s_xx, s_x, s_y, s_x, s_1) / sys_det;
+  *b = det(s_ll, s_yl, s_l, s_lx, s_yx, s_x, s_l, s_y, s_1) / sys_det;
+  *c = det(s_ll, s_lx, s_yl, s_lx, s_xx, s_yx, s_l, s_x, s_y) / sys_det;
+
+}
+
+/* det() - determinant of the 3x3 matrix given row by row, expanded along
+   the first row */
+double det(double a11, double a12, double a13,
+           double a21, double a22, double a23,
+           double a31, double a32, double a33)
+{
+  double minor1 = a22 * a33 - a32 * a23;
+  double minor2 = a21 * a33 - a31 * a23;
+  double minor3 = a21 * a32 - a31 * a22;
+  double total;
+
+  total = a11 * minor1;
+  total -= a12 * minor2;
+  total += a13 * minor3;
+
+  return total;
+}
+
+/* last_stats() - final statistics pass once the search is finished.
+ *
+ *   aa0, n0          - query sequence and length, passed to calc_spacefactor()
+ *   sptr, nstats     - sampled scores (used when escores are not available)
+ *   bestp_arr, nbest - best hits; sorted here with sortbeste()
+ *   m_msg            - supplies escore_flg, nm0 and db.entries
+ *   ppst             - supplies zsflag and nsq
+ *   histp            - entries is reset to 0; stat_info receives the summary
+ *   rs_sp            - in/out statistics state, allocated when *rs_sp is NULL
+ *
+ * For zsflag 1..4 the escores that are <= 1.0 are collected and sorted.
+ * With more than 100 of them scale_tat() fits the scaling coefficients,
+ * have_tat is set and find_zp becomes find_zt; otherwise only the
+ * space-factor summary is written.  For any other zsflag have_tat is
+ * cleared and stat_info is emptied.  Allocation failures are fatal.
+ *
+ * Fixes: sizeof is cast before being printed with %ld; the obs_escore
+ * failure message reports the count actually requested (nobs, it used to
+ * print nbest); a request for zero elements asks for one instead, so an
+ * implementation that returns NULL for calloc(0, ...) is not mistaken for
+ * being out of memory.
+ */
+void
+last_stats(const unsigned char *aa0, int n0,
+           struct stat_str *sptr, int nstats,
+           struct beststr **bestp_arr, int nbest,
+           const struct mngmsg *m_msg, struct pstruct *ppst,
+           struct hist_str *histp, struct pstat_str **rs_sp)
+{
+  double *obs_escore;
+  int i, nobs, do_trim;
+  long db_entries;
+  struct pstat_str *rs_s;
+
+  if (*rs_sp == NULL) {
+    if ((rs_s=(struct pstat_str *)calloc(1,sizeof(struct pstat_str)))==NULL) {
+      fprintf(stderr," cannot allocate rs_s: %ld\n",(long)sizeof(struct pstat_str));
+      exit(1);
+    }
+    else *rs_sp = rs_s;
+  }
+  else rs_s = *rs_sp;
+
+  histp->entries = 0;
+
+  sortbeste(bestp_arr,nbest);
+
+  rs_s->spacefactor =
+    calc_spacefactor(aa0, n0, m_msg->nm0,ppst->nsq);
+
+  if (ppst->zsflag >= 1 && ppst->zsflag <= 4) {
+    if (m_msg->escore_flg) {
+      nobs = nbest;
+      do_trim = 1;
+    }
+    else {
+      nobs = nstats;
+      do_trim = 0;
+    }
+
+    if ((obs_escore = (double *)calloc((nobs > 0) ? nobs : 1,sizeof(double)))==NULL) {
+      fprintf(stderr," cannot allocate obs_escore[%d]\n",nobs);
+      exit(1);
+    }
+
+    /* keep only escores <= 1.0; nobs becomes the number kept */
+    if (m_msg->escore_flg) {
+      for (i=nobs=0; i<nbest; i++) {
+        if (bestp_arr[i]->rst.escore<= 1.00)
+          obs_escore[nobs++]=bestp_arr[i]->rst.escore;
+      }
+      db_entries = m_msg->db.entries;
+    }
+    else {
+      for (i=nobs=0; i<nstats; i++) {
+        if (sptr[i].escore <= 1.00 ) obs_escore[nobs++]=sptr[i].escore;
+      }
+      db_entries = nobs;
+    }
+
+    sortbesto(obs_escore,nobs);
+    if (nobs > 100) {
+      scale_tat(obs_escore,nobs,db_entries,do_trim,rs_s);
+      rs_s->have_tat=1;
+      sprintf(histp->stat_info,"scaled Tatusov statistics (%d): tat_a: %6.4f tat_b: %6.4f tat_c: %6.4f",
+              nobs,rs_s->tat_a, rs_s->tat_b, rs_s->tat_c);
+    }
+    else {
+      rs_s->have_tat=0;
+      sprintf(histp->stat_info,"Space_factor %.4g scaled statistics",
+              rs_s->spacefactor);
+    }
+    free(obs_escore);
+  }
+  else {
+    rs_s->have_tat=0;
+    histp->stat_info[0] = '\0';
+  }
+  if (rs_s->have_tat) {
+    find_zp = &find_zt;
+  }
+
+}
+
+/* scale_scores() rescales, in place, the escore of each of the nbest
+   entries in bptr[] and derives zscore from the rescaled value.
+
+   bptr[] is sorted with sortbeste() first because the rank-based
+   correction below depends on each entry's position in the list.
+
+   Without Tatusov parameters (rs->have_tat == 0) every escore is simply
+   multiplied by rs->spacefactor.  With them, entries are handled in
+   groups: the leading group extends through index 0.005 * db.entries,
+   and every group also absorbs following entries whose escore equals
+   that of the group's first entry.  All members of a group receive the
+   same correction, computed from the group's mid-rank.
+
+   Finally each escore > 0.01 is converted with 1 - exp(-e), zscore is
+   set to -log2(escore) (or a fixed ceiling when escore is not
+   positive), and escore is multiplied by ppst->zdb_size. */
+
+void
+scale_scores(struct beststr **bptr, int nbest, struct db_str db,
+             struct pstruct *ppst, struct pstat_str *rs)
+{
+  int first, past, idx;
+  double rank_freq, correction;
+  double tat_a, tat_b, tat_c;
+  struct beststr *hit;
+
+  /* the rescaling is rank based, so the list has to be in order */
+  sortbeste(bptr,nbest);
+
+  if (rs->have_tat) {
+    tat_a = rs->tat_a;
+    tat_b = rs->tat_b;
+    tat_c = rs->tat_c;
+
+    first = 0;
+    while (first < nbest) {
+      /* find the end of the group that starts at <first> */
+      past = first + 1;
+      while (past < nbest &&
+             (past <= (0.005 * db.entries) ||
+              bptr[past]->rst.escore == bptr[first]->rst.escore)) {
+        past++;
+      }
+
+      /* observed frequency at the middle rank of the group */
+      rank_freq = ((double)first + ((double)(past - first - 1)/ 2.0) + 1.0)/(double)db.entries;
+
+      /* one correction shared by every member of the group */
+      correction = rank_freq/exp(tat_a*log(rank_freq) + tat_b*rank_freq + tat_c);
+      for (idx = first; idx < past; idx++) {
+        bptr[idx]->rst.escore *= correction;
+      }
+
+      first = past;
+    }
+  }
+  else {
+    for (idx = 0; idx < nbest; idx++) {
+      bptr[idx]->rst.escore *= rs->spacefactor;
+    }
+  }
+
+  for (idx = 0; idx < nbest; idx++) {
+    hit = bptr[idx];
+
+    if (hit->rst.escore > 0.01) {
+      hit->rst.escore = 1.0 - exp(-hit->rst.escore);
+    }
+
+    if (hit->rst.escore > 0.0) {
+      hit->zscore = -log(hit->rst.escore)/M_LN2;
+    }
+    else {
+      hit->zscore = 744.440071/M_LN2;
+    }
+
+    hit->rst.escore *= ppst->zdb_size;
+  }
+
+  rs->zdb_size = ppst->zdb_size;
+  rs->eval_is_pval = 0;
+}
+
+/* scale_one_score() returns escore rescaled for an entry at list
+   position ipos.
+
+   Without Tatusov parameters the result is escore * rs->spacefactor.
+   Otherwise positions below rs->tie_j are replaced by rs->tie_j/2, and
+   escore is multiplied by obs / exp(a*log(obs) + b*obs + c), where
+   obs = (ipos + 1) / db.entries and a, b, c are rs->tat_a/b/c. */
+double scale_one_score (int ipos, double escore,
+                        struct db_str db,
+                        struct pstat_str *rs) {
+  double rank_freq;
+
+  if (rs->have_tat) {
+    if (ipos < rs->tie_j) {
+      ipos = rs->tie_j/2;
+    }
+
+    rank_freq = ((double)ipos + 1.0)/(double)db.entries;
+
+    return escore * (rank_freq/exp(rs->tat_a*log(rank_freq) + rs->tat_b*rank_freq + rs->tat_c));
+  }
+
+  return escore * rs->spacefactor;
+}
+
+/* calc_spacefactor() returns the factor used to scale E()-values.
+
+   Unless FASTF is defined the result is simply 2^nm0 - 1.
+
+   With FASTF, aa0[] is read as nm0 rows laid out with a stride of
+   nmoff = (n0 - nm0 + 1)/nm0 + 1, of which the first nmoff - 1 entries
+   of each row are used as columns.  For every column the residue codes
+   (expected to be < nsq) are tallied, the number of arrangements of
+   each length 0..nm0 is derived from those tallies, and the result is
+   the sum over lengths 1..nm0 of the per-column products divided by
+   factorial(len, 1).
+
+   Exits the program if the work arrays cannot be allocated. */
+double calc_spacefactor(const unsigned char *aa0, int n0,
+                        int nm0, int nsq) {
+
+#if !defined(FASTF)
+  return pow(2.0, (double) nm0) - 1.0;
+#else
+
+  int row, col, res, len, take, max_take;
+  int n_arr, binom;
+  int stride, ncols;
+  int **counts;
+  int **factors;
+  double col_prod, result = 0.0;
+
+  stride = (n0 - nm0 + 1)/nm0+1;
+  ncols = stride - 1;
+
+  /* counts[res][col]: occurrences of residue <res> in column <col>;
+     one contiguous slab addressed through a row-pointer table */
+  counts = (int **) calloc(nsq, sizeof(int *));
+  if(counts == NULL) {
+    fprintf(stderr, "couldn't calloc counts array!\n");
+    exit(1);
+  }
+
+  counts[0] = (int *) calloc(nsq * ncols, sizeof(int));
+  if(counts[0] == NULL) {
+    fprintf(stderr, "couldn't calloc counts array!\n");
+    exit(1);
+  }
+
+  for(res = 0 ; res < nsq ; res++) {
+    counts[res] = counts[0] + (res * ncols);
+  }
+
+  for(row = 0 ; row < nm0 ; row++) {
+    for(col = 0 ; col < ncols ; col++) {
+      counts[ aa0[stride * row + col] ] [ col ] ++;
+    }
+  }
+
+  /* factors[len][col]: number of arrangements of length <len> in <col> */
+  factors = (int **) calloc(nm0 + 1, sizeof(int *));
+  if(factors == NULL) {
+    fprintf(stderr, "Couldn't calloc factors array!\n");
+    exit(1);
+  }
+
+  factors[0] = (int *) calloc((nm0 + 1) * ncols, sizeof(int));
+  if(factors[0] == NULL) {
+    fprintf(stderr, "Couldn't calloc factors array!\n");
+    exit(1);
+  }
+
+  for(len = 0 ; len <= nm0 ; len++) {
+    factors[len] = factors[0] + (len * ncols);
+  }
+
+  /*
+    this algorithm was adapted from the GAP4 library's NrArrangement function:
+    The GAP Group, GAP --- Groups, Algorithms, and Programming,
+    Version 4.1; Aachen, St Andrews, 1999.
+    (http://www-gap.dcs.st-and.ac.uk/ gap)
+  */
+
+  for(col = 0 ; col < ncols ; col++) {
+
+    /* there is exactly one way to pick nothing */
+    factors[0][col] = 1;
+
+    for(res = 0 ; res < nsq ; res++) {
+
+      /* residues absent from this column contribute nothing */
+      if(counts[res][col] == 0) { continue; }
+
+      /* update longest first, so factors[len - take] still holds the
+         value computed without residue <res> */
+      for(len = nm0 ; len >= 0 ; len--) {
+        n_arr = 0;
+        binom = 1;
+        max_take = min(counts[res][col], len);
+
+        /* <take> copies of <res> combined with len - take of the
+           residues already processed; binom tracks C(len, take) */
+        for(take = 0 ; take <= max_take ; take++) {
+          n_arr += binom * factors[len - take][col];
+          binom = (int) ((float) binom * (float) (len - take) / (float) (take + 1));
+        }
+
+        factors[len][col] = n_arr;
+      }
+    }
+  }
+
+  result = 0.0;
+  for(len = 1 ; len <= nm0 ; len++) {
+    col_prod = 1.0;
+    for(col = 0 ; col < ncols ; col++) {
+      col_prod *= (double) factors[len][col];
+    }
+    col_prod /= factorial(len, 1);
+    result += col_prod;
+  }
+
+  free(counts[0]);
+  free(counts);
+  free(factors[0]);
+  free(factors);
+
+  return result;
+#endif
+}
+
+/* sortbesto() sorts obs[0..nobs-1] into ascending order, in place,
+   with a Shell sort driven by a fixed table of decreasing gaps. */
+void sortbesto (double *obs, int nobs)
+{
+  static const int shell_gaps[] = { 1391376, 463792, 198768, 86961, 33936,
+                                    13776, 4592, 1968, 861, 336,
+                                    112, 48, 21, 7, 3, 1 };
+  const int n_gaps = (int)(sizeof(shell_gaps)/sizeof(shell_gaps[0]));
+  int g, step, pos, slot;
+  double held;
+
+  for (g = 0; g < n_gaps; g++) {
+    step = shell_gaps[g];
+
+    /* gapped insertion sort; gaps >= nobs fall straight through */
+    for (pos = step; pos < nobs; pos++) {
+      held = obs[pos];
+
+      /* slide larger elements up by one gap until held fits */
+      for (slot = pos; slot >= step && obs[slot - step] > held; slot -= step) {
+        obs[slot] = obs[slot - step];
+      }
+      obs[slot] = held;
+    }
+  }
+}
+
+/* pstat_info() - write every field of *pu into info_str as text, one
+   group of "name: value" pairs per line, so the statistics can be
+   recomputed independently.
+
+   info_str    destination buffer; overwritten by the first line, later
+               lines are appended
+   info_str_n  size passed to SAFE_STRNCPY/SAFE_STRNCAT for info_str
+   comment     text placed at the start of every line
+   pu          statistics to report
+
+   Each line is formatted into a local MAX_STR buffer with snprintf(), so
+   a long comment truncates the line instead of overrunning the buffer. */
+void
+pstat_info(char *info_str, int info_str_n, char *comment, struct pstat_str *pu) {
+  char pstat_buf[MAX_STR];
+
+  snprintf(pstat_buf,sizeof(pstat_buf),"%s zsflag: %d\n",comment,pu->zsflag);
+  SAFE_STRNCPY(info_str,pstat_buf,info_str_n);
+  snprintf(pstat_buf,sizeof(pstat_buf),"%s ngLambda: %g; ngK: %g; ngH: %g\n",comment,pu->ngLambda,pu->ngK,pu->ngH);
+  SAFE_STRNCAT(info_str,pstat_buf,info_str_n);
+
+  snprintf(pstat_buf,sizeof(pstat_buf),"%s rho: %g; rho_e: %g; mu: %g; mu_e: %g;\n",comment,
+           pu->rho,pu->rho_e,pu->mu,pu->mu_e);
+  SAFE_STRNCAT(info_str,pstat_buf,info_str_n);
+
+  /* was "%gg", which printed a stray 'g' after the var_e value */
+  snprintf(pstat_buf,sizeof(pstat_buf),"%s mean_var: %g; var_e: %g\n",comment,
+           pu->mean_var, pu->var_e);
+  SAFE_STRNCAT(info_str,pstat_buf,info_str_n);
+
+  snprintf(pstat_buf,sizeof(pstat_buf),"%s rho2: %g; mu2: %g; var_cutoff: %g\n",comment,
+           pu->rho2, pu->mu2,pu->var_cutoff);
+  SAFE_STRNCAT(info_str,pstat_buf,info_str_n);
+
+  snprintf(pstat_buf,sizeof(pstat_buf),"%s n_trimmed: %d; n1_trimmed: %d; nb_trimmed: %d; nb_tot: %d\n",comment,
+           pu->n_trimmed, pu->n1_trimmed,pu->nb_trimmed, pu->nb_tot);
+  SAFE_STRNCAT(info_str,pstat_buf,info_str_n);
+
+  snprintf(pstat_buf,sizeof(pstat_buf),"%s tat_a: %g; tat_b: %g; tat_c: %g; spacefactor: %g\n",comment,
+           pu->tat_a, pu->tat_b,pu->tat_c, pu->spacefactor);
+  SAFE_STRNCAT(info_str,pstat_buf,info_str_n);
+
+  snprintf(pstat_buf,sizeof(pstat_buf),"%s have_tat: %d; tie_j: %d; eval_is_pval: %d; zdb_size: %ld\n",comment,
+           pu->have_tat,pu->tie_j,pu->eval_is_pval,pu->zdb_size);
+  SAFE_STRNCAT(info_str,pstat_buf,info_str_n);
+}
diff --git a/src/showrss.c b/src/showrss.c
new file mode 100644
index 0000000..d737c3b
--- /dev/null
+++ b/src/showrss.c
@@ -0,0 +1,82 @@
+/* $Id: showrss.c 625 2011-03-23 17:21:38Z wrp $ */
+
+/* copyright (c) 1996, 1997, 1998, 1999, 2014 by William R. Pearson and the
+ The Rector & Visitors of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "defs.h"
+#include "structs.h"
+#include "param.h"
+#include "best_stats.h"
+
+extern double
+zs_to_E(double zs, int n1, int isdna, long entries,struct db_str db);
+extern double zs_to_bit(double zs, int n0, int n1);
+extern double zs_to_p(double zs);
+
+extern char *prog_func;
+
+/* showbest() - print the shuffle summary for the first entry of bptr[].
+
+   Writes to fp: the program function name with the number of shuffles
+   and the shuffle type (window or uniform), then the unshuffled score
+   with its bit score and probability, then the number of times such a
+   score is expected among pst.zdb_size sequences.
+
+   Only bptr[0] is examined; aa0, aa1, maxn, nbest, qlib, info_gstring2
+   and f_str are not used here. */
+void showbest (FILE *fp, unsigned char **aa0, unsigned char *aa1, int maxn,
+               struct beststr **bptr, int nbest, int qlib, struct mngmsg *m_msg,
+               struct pstruct pst, struct db_str db,
+               char *info_gstring2, void **f_str)
+{
+  char *rlabel;
+  struct beststr *top;
+  int raw_score;
+  double bits, pval, expect;
+
+  /* label from its last blank onward; the whole label if it has none */
+  rlabel = strrchr(m_msg->label,' ');
+  if (rlabel == NULL) {
+    rlabel = m_msg->label;
+  }
+
+  fprintf(fp,"\n %s - %d shuffles; ",prog_func,m_msg->shuff_max);
+  if (m_msg->shuff_wid <= 0) {
+    fprintf(fp," uniform shuffle\n");
+  }
+  else {
+    fprintf(fp," window shuffle, window size: %d\n",m_msg->shuff_wid);
+  }
+
+  top = bptr[0];
+  raw_score = top->score[0];
+
+  bits = zs_to_bit(top->zscore,m_msg->n0,top->n1);
+  pval = zs_to_p(top->zscore);
+  fprintf(fp," unshuffled %s score: %d; bits(s=%d|n_l=%d): %4.1f p(%d) < %g\n",
+          rlabel,raw_score,raw_score,top->n1,
+          bits,raw_score,pval);
+
+  expect = zs_to_E(top->zscore,top->n1,0,pst.zdb_size,db);
+  fprintf(fp,"For %ld sequences, a score >= %d is expected %4.4g times\n\n",
+          pst.zdb_size,raw_score,expect);
+}
+
+/* showalign() - stub: the body is empty, so no alignment output is
+   produced by this file and every argument is ignored. */
+void showalign (FILE *fp,
+                unsigned char *aa0, unsigned char *aa1, int maxn,
+                struct beststr **bptr, int nbest, int qlib,
+                const struct mngmsg *m_msg,
+                struct pstruct *ppst,
+                void *f_str,
+                char *info_gstring2)
+{
+  /* nothing to do */
+}
+
+/* aancpy() - copy at most count residues from `from` to `to`,
+   translating residue codes to characters.
+
+   A code that is <= pst.nsq is replaced by pst.sq[code]; any other
+   value is copied unchanged.  Copying stops early at a zero byte in
+   `from`.  `to` is always NUL-terminated, so it must have room for
+   count + 1 bytes.
+
+   Codes are read as unsigned char: where plain char is signed, a byte
+   >= 0x80 would otherwise compare as negative, pass the <= pst.nsq
+   test, and index pst.sq[] with a negative subscript. */
+void
+aancpy(char *to, char *from, int count,
+       struct pstruct pst)
+{
+  char *tp;
+  unsigned char code;
+
+  tp=to;
+  while (count-- && *from) {
+    code = (unsigned char)*from++;
+    if (code <= pst.nsq) *tp++ = pst.sq[code];
+    else *tp++ = (char)code;
+  }
+  *tp='\0';
+}
diff --git a/src/smith_waterman_altivec.c b/src/smith_waterman_altivec.c
new file mode 100644
index 0000000..9f9b614
--- /dev/null
+++ b/src/smith_waterman_altivec.c
@@ -0,0 +1,3086 @@
+
+/* Implementation of the Wozniak "anti-diagonal" vectorization
+ strategy for Smith-Waterman comparison, Wozniak (1997) Comp.
+ Appl. Biosci. 13:145-150
+
+ November, 2004
+*/
+
+/*
+ Written by Erik Lindahl, Stockholm Bioinformatics Center, 2004.
+ Please send bug reports and/or suggestions to lindahl at sbc.su.se.
+*/
+
+#include <stdio.h>
+
+#include "defs.h"
+#include "param.h"
+#include "dropgsw2.h"
+
+#ifdef SW_ALTIVEC
+
+int
+smith_waterman_altivec_word(unsigned char * query_sequence,
+ unsigned short * query_profile_word,
+ int query_length,
+ unsigned char * db_sequence,
+ int db_length,
+ unsigned short bias,
+ unsigned short gap_open,
+ unsigned short gap_extend,
+ struct f_struct * f_str)
+{
+ int i,j,k;
+ unsigned short * p;
+ unsigned short score;
+ unsigned char * p_dbseq;
+ int alphabet_size = f_str->alphabet_size;
+ unsigned short * workspace = (unsigned short *)f_str->workspace;
+
+ vector unsigned short Fup,Hup1,Hup2,E,F,H,tmp;
+ vector unsigned char perm;
+ vector unsigned short v_maxscore;
+ vector unsigned short v_bias,v_gapopen,v_gapextend;
+ vector unsigned short v_score;
+ vector unsigned short v_score_q1;
+ vector unsigned short v_score_q2;
+ vector unsigned short v_score_q3;
+ vector unsigned short v_score_load;
+ vector unsigned char queue1_to_score = (vector unsigned char)(16,17,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
+ vector unsigned char queue2_to_queue1 = (vector unsigned char)(0,1,18,19,4,5,6,7,8,9,10,11,12,13,14,15);
+ vector unsigned char queue3_to_queue2 = (vector unsigned char)(16,16,16,16,16,21,16,0,16,1,16,2,16,3,16,4);
+ vector unsigned char queue3_with_load = (vector unsigned char)(23,5,6,7,8,25,9,10,11,27,12,13,29,14,31,16);
+
+ /* Load the bias to all elements of a constant */
+ v_bias = vec_lde(0,&bias);
+ perm = vec_lvsl(0,&bias);
+ v_bias = vec_perm(v_bias,v_bias,perm);
+ v_bias = vec_splat(v_bias,0);
+
+ /* Load gap opening penalty to all elements of a constant */
+ v_gapopen = vec_lde(0,&gap_open);
+ perm = vec_lvsl(0,&gap_open);
+ v_gapopen = vec_perm(v_gapopen,v_gapopen,perm);
+ v_gapopen = vec_splat(v_gapopen,0);
+
+ /* Load gap extension penalty to all elements of a constant */
+ v_gapextend = vec_lde(0,&gap_extend);
+ perm = vec_lvsl(0,&gap_extend);
+ v_gapextend = vec_perm(v_gapextend,v_gapextend,perm);
+ v_gapextend = vec_splat(v_gapextend,0);
+
+ v_maxscore = vec_xor(v_maxscore,v_maxscore);
+
+ // Zero out the storage vector
+ k = 2*(db_length+7);
+
+ for(i=0,j=0;i<k;i++,j+=16)
+ {
+ // borrow the zero value in v_maxscore to have something to store
+ vec_st(v_maxscore,j,workspace);
+ }
+
+ for(i=0;i<query_length;i+=8)
+ {
+ // fetch first data asap.
+ p_dbseq = db_sequence;
+ k = *p_dbseq++;
+ v_score_load = vec_ld(16*k,query_profile_word);
+
+ // zero lots of stuff.
+ // We use both the VPERM and VSIU unit to knock off some cycles.
+
+ E = vec_splat_u16(0);
+ F = vec_xor(F,F);
+ H = vec_splat_u16(0);
+ Hup2 = vec_xor(Hup2,Hup2);
+ v_score_q1 = vec_splat_u16(0);
+ v_score_q2 = vec_xor(v_score_q2,v_score_q2);
+ v_score_q3 = vec_splat_u16(0);
+
+ // reset pointers to the start of the saved data from the last row
+ p = workspace;
+
+ // PROLOGUE 1
+ // prefetch next residue
+ k = *p_dbseq++;
+
+ // Create the actual diagonal score vector
+ // and update the queue of incomplete score vectors
+
+ v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+
+ // prefetch score for next step
+ v_score_load = vec_ld(16*k,query_profile_word);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(0, p);
+ Hup1 = vec_ld(16, p);
+ p += 16; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,14);
+ Hup1 = vec_sld(Hup1,H,14);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Save value to use for next diagonal H
+ Hup2 = Hup1;
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+ // PROLOGUE 2
+ // prefetch next residue
+ k = *p_dbseq++;
+
+ // Create the actual diagonal score vector
+ // and update the queue of incomplete score vectors
+
+ v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+
+ // prefetch score for next step
+ v_score_load = vec_ld(16*k,query_profile_word);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(0, p);
+ Hup1 = vec_ld(16, p);
+ p += 16; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,14);
+ Hup1 = vec_sld(Hup1,H,14);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Save value to use for next diagonal H
+ Hup2 = Hup1;
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+ // PROLOGUE 3
+ // prefetch next residue
+ k = *p_dbseq++;
+
+ // Create the actual diagonal score vector
+ // and update the queue of incomplete score vectors
+
+ v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+
+ // prefetch score for next step
+ v_score_load = vec_ld(16*k,query_profile_word);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(0, p);
+ Hup1 = vec_ld(16, p);
+ p += 16; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,14);
+ Hup1 = vec_sld(Hup1,H,14);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Save value to use for next diagonal H
+ Hup2 = Hup1;
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+ // PROLOGUE 4
+ // prefetch next residue
+ k = *p_dbseq++;
+
+ // Create the actual diagonal score vector
+ // and update the queue of incomplete score vectors
+
+ v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+
+ // prefetch score for next step
+ v_score_load = vec_ld(16*k,query_profile_word);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(0, p);
+ Hup1 = vec_ld(16, p);
+ p += 16; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,14);
+ Hup1 = vec_sld(Hup1,H,14);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Save value to use for next diagonal H
+ Hup2 = Hup1;
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+ // PROLOGUE 5
+ // prefetch next residue
+ k = *p_dbseq++;
+
+ // Create the actual diagonal score vector
+ // and update the queue of incomplete score vectors
+
+ v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+
+ // prefetch score for next step
+ v_score_load = vec_ld(16*k,query_profile_word);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(0, p);
+ Hup1 = vec_ld(16, p);
+ p += 16; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,14);
+ Hup1 = vec_sld(Hup1,H,14);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Save value to use for next diagonal H
+ Hup2 = Hup1;
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+ // PROLOGUE 6
+ // prefetch next residue
+ k = *p_dbseq++;
+
+ // Create the actual diagonal score vector
+ // and update the queue of incomplete score vectors
+
+ v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+
+ // prefetch score for next step
+ v_score_load = vec_ld(16*k,query_profile_word);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(0, p);
+ Hup1 = vec_ld(16, p);
+ p += 16; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,14);
+ Hup1 = vec_sld(Hup1,H,14);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Save value to use for next diagonal H
+ Hup2 = Hup1;
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+ // PROLOGUE 7
+ // prefetch next residue
+ k = *p_dbseq++;
+
+ // Create the actual diagonal score vector
+ // and update the queue of incomplete score vectors
+
+ v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+
+ // prefetch score for next step
+ v_score_load = vec_ld(16*k,query_profile_word);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(0, p);
+ Hup1 = vec_ld(16, p);
+ p += 16; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,14);
+ Hup1 = vec_sld(Hup1,H,14);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Save value to use for next diagonal H
+ Hup2 = Hup1;
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+ // PROLOGUE 8
+ // prefetch next residue
+ k = *p_dbseq++;
+
+ // Create the actual diagonal score vector
+ // and update the queue of incomplete score vectors
+
+ v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+
+ // prefetch score for next step
+ v_score_load = vec_ld(16*k,query_profile_word);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(0, p);
+ Hup1 = vec_ld(16, p);
+ p += 16; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,14);
+ Hup1 = vec_sld(Hup1,H,14);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Save value to use for next diagonal H
+ Hup2 = Hup1;
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+ // reset pointers to the start of the saved data from the last row
+ p = workspace;
+
+ for(j=8;j<db_length;j+=8)
+ {
+ // STEP 1
+
+ // prefetch next residue
+ k = *p_dbseq++;
+
+ // Create the actual diagonal score vector
+ // and update the queue of incomplete score vectors
+
+ v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+
+ // prefetch score for next step
+ v_score_load = vec_ld(16*k,query_profile_word);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(256, p);
+ Hup1 = vec_ld(272, p);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 16; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,14);
+ Hup1 = vec_sld(Hup1,H,14);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+ // STEP 2
+
+ // prefetch next residue
+ k = *p_dbseq++;
+
+ // Create the actual diagonal score vector
+ // and update the queue of incomplete score vectors
+
+ v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+
+ // prefetch score for next step
+ v_score_load = vec_ld(16*k,query_profile_word);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(256, p);
+ Hup2 = vec_ld(272, p);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 16; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,14);
+ Hup2 = vec_sld(Hup2,H,14);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup2,v_gapopen);
+ F = vec_max(F,tmp);
+
+ // add score to H
+ H = vec_adds(Hup1,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+ // STEP 3
+
+ // prefetch next residue
+ k = *p_dbseq++;
+
+ // Create the actual diagonal score vector
+ // and update the queue of incomplete score vectors
+
+ v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+
+ // prefetch score for next step
+ v_score_load = vec_ld(16*k,query_profile_word);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(256, p);
+ Hup1 = vec_ld(272, p);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 16; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,14);
+ Hup1 = vec_sld(Hup1,H,14);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+ // STEP 4
+
+ // prefetch next residue
+ k = *p_dbseq++;
+
+ // Create the actual diagonal score vector
+ // and update the queue of incomplete score vectors
+
+ v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+
+ // prefetch score for next step
+ v_score_load = vec_ld(16*k,query_profile_word);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(256, p);
+ Hup2 = vec_ld(272, p);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 16; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,14);
+ Hup2 = vec_sld(Hup2,H,14);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup2,v_gapopen);
+ F = vec_max(F,tmp);
+
+ // add score to H
+ H = vec_adds(Hup1,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+ // STEP 5
+
+ // prefetch next residue
+ k = *p_dbseq++;
+
+ // Create the actual diagonal score vector
+ // and update the queue of incomplete score vectors
+
+ v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+
+ // prefetch score for next step
+ v_score_load = vec_ld(16*k,query_profile_word);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(256, p);
+ Hup1 = vec_ld(272, p);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 16; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,14);
+ Hup1 = vec_sld(Hup1,H,14);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+ // STEP 6
+
+ // prefetch next residue
+ k = *p_dbseq++;
+
+ // Create the actual diagonal score vector
+ // and update the queue of incomplete score vectors
+
+ v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+
+ // prefetch score for next step
+ v_score_load = vec_ld(16*k,query_profile_word);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(256, p);
+ Hup2 = vec_ld(272, p);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 16; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,14);
+ Hup2 = vec_sld(Hup2,H,14);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup2,v_gapopen);
+ F = vec_max(F,tmp);
+
+ // add score to H
+ H = vec_adds(Hup1,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+ // STEP 7
+
+ // prefetch next residue
+ k = *p_dbseq++;
+
+ // Create the actual diagonal score vector
+ // and update the queue of incomplete score vectors
+
+ v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+
+ // prefetch score for next step
+ v_score_load = vec_ld(16*k,query_profile_word);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(256, p);
+ Hup1 = vec_ld(272, p);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 16; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,14);
+ Hup1 = vec_sld(Hup1,H,14);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+ // STEP 8
+
+ // prefetch next residue
+ k = *p_dbseq++;
+
+ // Create the actual diagonal score vector
+ // and update the queue of incomplete score vectors
+
+ v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+
+ // prefetch score for next step
+ v_score_load = vec_ld(16*k,query_profile_word);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(256, p);
+ Hup2 = vec_ld(272, p);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 16; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,14);
+ Hup2 = vec_sld(Hup2,H,14);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup2,v_gapopen);
+ F = vec_max(F,tmp);
+
+ // add score to H
+ H = vec_adds(Hup1,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+ }
+
+ v_score_load = vec_splat_u16(0);
+
+ for(;j<db_length+7;j++)
+ {
+ // Create the actual diagonal score vector
+ // and update the queue of incomplete score vectors
+ //
+ // This could of course be done with only vec_perm or vec_sel,
+ // but since they use different execution units we have found
+ // it to be slightly faster to mix them.
+ v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 16; // move ahead 32 bytes
+
+ // v_score_load contains all zeros
+ Fup = vec_sld(v_score_load,F,14);
+ Hup1 = vec_sld(v_score_load,H,14);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Save value to use for next diagonal H
+ Hup2 = Hup1;
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+ }
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+
+ query_profile_word += 8*alphabet_size;
+ }
+
+ // find largest score in the v_maxscore vector
+ tmp = vec_sld(v_maxscore,v_maxscore,8);
+ v_maxscore = vec_max(v_maxscore,tmp);
+ tmp = vec_sld(v_maxscore,v_maxscore,4);
+ v_maxscore = vec_max(v_maxscore,tmp);
+ tmp = vec_sld(v_maxscore,v_maxscore,2);
+ v_maxscore = vec_max(v_maxscore,tmp);
+
+ // store in temporary variable
+ vec_ste(v_maxscore,0,&score);
+
+ // return largest score
+ return score;
+}
+
+int
+smith_waterman_altivec_byte(unsigned char * query_sequence,
+ unsigned char * query_profile_byte,
+ int query_length,
+ unsigned char * db_sequence,
+ int db_length,
+ unsigned char bias,
+ unsigned char gap_open,
+ unsigned char gap_extend,
+ struct f_struct * f_str)
+{
+ int i,j,k,k8;
+ int overflow;
+ unsigned char * p;
+ unsigned char score;
+ int alphabet_size = f_str->alphabet_size;
+ unsigned char * workspace = (unsigned char *)f_str->workspace;
+
+ vector unsigned char Fup,Hup1,Hup2,E,F,H,tmp;
+ vector unsigned char perm;
+ vector unsigned char v_maxscore;
+ vector unsigned char v_bias,v_gapopen,v_gapextend;
+ vector unsigned char v_score;
+ vector unsigned char v_score_q1;
+ vector unsigned char v_score_q2;
+ vector unsigned char v_score_q3;
+ vector unsigned char v_score_q4;
+ vector unsigned char v_score_q5;
+ vector unsigned char v_score_load1;
+ vector unsigned char v_score_load2;
+ vector unsigned char v_zero;
+
+ vector unsigned char queue1_to_score = (vector unsigned char)(16,1,2,3,4,5,6,7,24,9,10,11,12,13,14,15);
+ vector unsigned char queue2_to_queue1 = (vector unsigned char)(16,17,2,3,4,5,6,7,24,25,10,11,12,13,14,15);
+ vector unsigned char queue3_to_queue2 = (vector unsigned char)(16,17,18,3,4,5,6,7,24,25,26,11,12,13,14,15);
+ vector unsigned char queue4_to_queue3 = (vector unsigned char)(16,17,18,19,4,5,6,7,24,25,26,27,12,13,14,15);
+ vector unsigned char queue5_to_queue4 = (vector unsigned char)(16,17,18,19,20,2,3,4,24,25,26,27,28,10,11,12);
+ vector unsigned char queue5_with_load = (vector unsigned char)(19,20,21,5,6,22,7,23,27,28,29,13,14,30,15,31);
+ vector unsigned char merge_score_load = (vector unsigned char)(0,1,2,3,4,5,6,7,24,25,26,27,28,29,30,31);
+
+ v_zero = vec_splat_u8(0);
+
+ /* Load the bias to all elements of a constant */
+ v_bias = vec_lde(0,&bias);
+ perm = vec_lvsl(0,&bias);
+ v_bias = vec_perm(v_bias,v_bias,perm);
+ v_bias = vec_splat(v_bias,0);
+
+ /* Load gap opening penalty to all elements of a constant */
+ v_gapopen = vec_lde(0,&gap_open);
+ perm = vec_lvsl(0,&gap_open);
+ v_gapopen = vec_perm(v_gapopen,v_gapopen,perm);
+ v_gapopen = vec_splat(v_gapopen,0);
+
+ /* Load gap extension penalty to all elements of a constant */
+ v_gapextend = vec_lde(0,&gap_extend);
+ perm = vec_lvsl(0,&gap_extend);
+ v_gapextend = vec_perm(v_gapextend,v_gapextend,perm);
+ v_gapextend = vec_splat(v_gapextend,0);
+
+ v_maxscore = vec_xor(v_maxscore,v_maxscore);
+
+ // Zero out the storage vector
+ k = (db_length+15);
+ for(i=0,j=0;i<k;i++,j+=32)
+ {
+ // borrow the zero value in v_maxscore to have something to store
+ vec_st(v_maxscore,j,workspace);
+ vec_st(v_maxscore,j+16,workspace);
+ }
+
+ for(i=0;i<query_length;i+=16)
+ {
+ // zero lots of stuff.
+ // We use both the VPERM and VSIU unit to knock off some cycles.
+
+ E = vec_splat_u8(0);
+ F = vec_xor(F,F);
+ H = vec_splat_u8(0);
+ Hup2 = vec_xor(Hup2,Hup2);
+ v_score_q1 = vec_splat_u8(0);
+ v_score_q2 = vec_xor(v_score_q2,v_score_q2);
+ v_score_q3 = vec_splat_u8(0);
+ v_score_q4 = vec_xor(v_score_q4,v_score_q4);
+ v_score_q5 = vec_splat_u8(0);
+
+ // reset pointers to the start of the saved data from the last row
+ p = workspace;
+
+ // start directly and prefetch score column
+ k = db_sequence[0];
+ k8 = k;
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = v_score_load1;
+ v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
+
+ // PROLOGUE 1
+ // prefetch next residue
+ k = db_sequence[1];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+ // prefetch score for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(0, p);
+ Hup1 = vec_ld(16, p);
+ p += 32; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup1 = vec_sld(Hup1,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+
+ // PROLOGUE 2
+ // prefetch next residue
+ k = db_sequence[2];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch score for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(0, p);
+ Hup2 = vec_ld(16, p);
+ p += 32; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup2 = vec_sld(Hup2,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup2,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup1,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+ // PROLOGUE 3
+ // prefetch next residue
+ k = db_sequence[3];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch score for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(0, p);
+ Hup1 = vec_ld(16, p);
+ p += 32; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup1 = vec_sld(Hup1,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+ // PROLOGUE 4
+ // prefetch next residue
+ k = db_sequence[4];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch score for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(0, p);
+ Hup2 = vec_ld(16, p);
+ p += 32; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup2 = vec_sld(Hup2,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup2,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup1,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+ // PROLOGUE 5
+ // prefetch next residue
+ k = db_sequence[5];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch score for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(0, p);
+ Hup1 = vec_ld(16, p);
+ p += 32; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup1 = vec_sld(Hup1,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+ // PROLOGUE 6
+ // prefetch next residue
+ k = db_sequence[6];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch score for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(0, p);
+ Hup2 = vec_ld(16, p);
+ p += 32; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup2 = vec_sld(Hup2,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup2,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup1,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+ // PROLOGUE 7
+ // prefetch next residue
+ k = db_sequence[7];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch score for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(0, p);
+ Hup1 = vec_ld(16, p);
+ p += 32; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup1 = vec_sld(Hup1,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+ // PROLOGUE 8
+ // prefetch next residue
+ k = db_sequence[8];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch score for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(0, p);
+ Hup2 = vec_ld(16, p);
+ p += 32; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup2 = vec_sld(Hup2,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup2,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup1,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+
+ // PROLOGUE 9
+ // prefetch next residue
+ k = db_sequence[9];
+ k8 = db_sequence[1];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch score for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(0, p);
+ Hup1 = vec_ld(16, p);
+ p += 32; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup1 = vec_sld(Hup1,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+ // PROLOGUE 10
+ // prefetch next residue
+ k = db_sequence[10];
+ k8 = db_sequence[2];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch score for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(0, p);
+ Hup2 = vec_ld(16, p);
+ p += 32; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup2 = vec_sld(Hup2,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup2,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup1,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+
+ // PROLOGUE 11
+ // prefetch next residue
+ k = db_sequence[11];
+ k8 = db_sequence[3];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch score for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(0, p);
+ Hup1 = vec_ld(16, p);
+ p += 32; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup1 = vec_sld(Hup1,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+ // PROLOGUE 12
+ // prefetch next residue
+ k = db_sequence[12];
+ k8 = db_sequence[4];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch score for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(0, p);
+ Hup2 = vec_ld(16, p);
+ p += 32; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup2 = vec_sld(Hup2,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup2,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup1,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+
+ // PROLOGUE 13
+ // prefetch next residue
+ k = db_sequence[13];
+ k8 = db_sequence[5];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch score for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(0, p);
+ Hup1 = vec_ld(16, p);
+ p += 32; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup1 = vec_sld(Hup1,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+ // PROLOGUE 14
+ // prefetch next residue
+ k = db_sequence[14];
+ k8 = db_sequence[6];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch score for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(0, p);
+ Hup2 = vec_ld(16, p);
+ p += 32; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup2 = vec_sld(Hup2,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup2,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup1,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+ // PROLOGUE 15
+ // prefetch next residue
+ k = db_sequence[15];
+ k8 = db_sequence[7];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch score for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(0, p);
+ Hup1 = vec_ld(16, p);
+ p += 32; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup1 = vec_sld(Hup1,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+ // PROLOGUE 16
+ // prefetch next residue
+ k = db_sequence[16];
+ k8 = db_sequence[8];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch score for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(0, p);
+ Hup2 = vec_ld(16, p);
+ p += 32; // move ahead 32 bytes
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup2 = vec_sld(Hup2,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup2,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup1,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+ p = workspace;
+
+ for(j=16;j<db_length;j+=16)
+ {
+ // STEP 1
+
+ // prefetch next residue
+ k = db_sequence[j+1];
+ k8 = db_sequence[j-7];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+ // prefetch scores for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(512, p);
+ Hup1 = vec_ld(528, p);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 32;
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup1 = vec_sld(Hup1,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+
+
+ // STEP 2
+
+ // prefetch next residue
+ k = db_sequence[j+2];
+ k8 = db_sequence[j-6];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch scores for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(512, p);
+ Hup2 = vec_ld(528, p);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 32;
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup2 = vec_sld(Hup2,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup2,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup1,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+
+
+
+ // STEP 3
+
+ // prefetch next residue
+ k = db_sequence[j+3];
+ k8 = db_sequence[j-5];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch scores for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(512, p);
+ Hup1 = vec_ld(528, p);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 32;
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup1 = vec_sld(Hup1,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+
+
+
+ // STEP 4
+
+ // prefetch next residue
+ k = db_sequence[j+4];
+ k8 = db_sequence[j-4];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch scores for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(512, p);
+ Hup2 = vec_ld(528, p);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 32;
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup2 = vec_sld(Hup2,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup2,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup1,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+
+
+
+ // STEP 5
+
+ // prefetch next residue
+ k = db_sequence[j+5];
+ k8 = db_sequence[j-3];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch scores for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(512, p);
+ Hup1 = vec_ld(528, p);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 32;
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup1 = vec_sld(Hup1,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+
+
+
+ // STEP 6
+
+ // prefetch next residue
+ k = db_sequence[j+6];
+ k8 = db_sequence[j-2];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch scores for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(512, p);
+ Hup2 = vec_ld(528, p);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 32;
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup2 = vec_sld(Hup2,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup2,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup1,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+
+
+
+ // STEP 7
+
+ // prefetch next residue
+ k = db_sequence[j+7];
+ k8 = db_sequence[j-1];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch scores for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(512, p);
+ Hup1 = vec_ld(528, p);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 32;
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup1 = vec_sld(Hup1,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+
+
+
+ // STEP 8
+
+ // prefetch next residue
+ k = db_sequence[j+8];
+ k8 = db_sequence[j];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch scores for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(512, p);
+ Hup2 = vec_ld(528, p);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 32;
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup2 = vec_sld(Hup2,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup2,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup1,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+
+
+
+
+
+ // STEP 9
+
+ // prefetch next residue
+ k = db_sequence[j+9];
+ k8 = db_sequence[j+1];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch scores for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(512, p);
+ Hup1 = vec_ld(528, p);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 32;
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup1 = vec_sld(Hup1,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+ // STEP 10
+
+ // prefetch next residue
+ k = db_sequence[j+10];
+ k8 = db_sequence[j+2];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch scores for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(512, p);
+ Hup2 = vec_ld(528, p);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 32;
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup2 = vec_sld(Hup2,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup2,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup1,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+ // STEP 11
+
+ // prefetch next residue
+ k = db_sequence[j+11];
+ k8 = db_sequence[j+3];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch scores for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(512, p);
+ Hup1 = vec_ld(528, p);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 32;
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup1 = vec_sld(Hup1,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+ // STEP 12
+
+ // prefetch next residue
+ k = db_sequence[j+12];
+ k8 = db_sequence[j+4];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch scores for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(512, p);
+ Hup2 = vec_ld(528, p);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 32;
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup2 = vec_sld(Hup2,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup2,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup1,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+ // STEP 13
+
+ // prefetch next residue
+ k = db_sequence[j+13];
+ k8 = db_sequence[j+5];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch scores for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(512, p);
+ Hup1 = vec_ld(528, p);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 32;
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup1 = vec_sld(Hup1,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+ // STEP 14
+
+ // prefetch next residue
+ k = db_sequence[j+14];
+ k8 = db_sequence[j+6];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch scores for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(512, p);
+ Hup2 = vec_ld(528, p);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 32;
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup2 = vec_sld(Hup2,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup2,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup1,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+ // STEP 15
+
+ // prefetch next residue
+ k = db_sequence[j+15];
+ k8 = db_sequence[j+7];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+ // prefetch scores for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(512, p);
+ Hup1 = vec_ld(528, p);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 32;
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup1 = vec_sld(Hup1,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+ // STEP 16
+
+ // prefetch next residue
+ k = db_sequence[j+16];
+ k8 = db_sequence[j+8];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch scores for next step
+ v_score_load1 = vec_ld(16*k,query_profile_byte);
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+
+ // load values of F and H from previous row (one unit up)
+ Fup = vec_ld(512, p);
+ Hup2 = vec_ld(528, p);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 32;
+
+ // shift into place so we have complete F and H vectors
+ // that refer to the values one unit up from each cell
+ // that we are currently working on.
+ Fup = vec_sld(Fup,F,15);
+ Hup2 = vec_sld(Hup2,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup2,v_gapopen);
+ F = vec_max(F,tmp);
+
+ v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
+
+ // add score to H
+ H = vec_adds(Hup1,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+
+ }
+
+ for(;j<db_length+15;j++)
+ {
+ k8 = db_sequence[j-7];
+
+ v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
+ v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
+ v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
+ v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
+ v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
+ v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
+
+
+ // prefetch scores for next step
+ v_score_load2 = vec_ld(16*k8,query_profile_byte);
+ v_score_load1 = vec_perm(v_zero,v_score_load2,merge_score_load);
+
+ // save old values of F and H to use on next row
+ vec_st(F, 0, p);
+ vec_st(H, 16, p);
+ p += 32; // move ahead 32 bytes
+
+ Fup = vec_sld(v_zero,F,15);
+ Hup1 = vec_sld(v_zero,H,15);
+
+ // do the dynamic programming
+
+ // update E value
+ E = vec_subs(E,v_gapextend);
+ tmp = vec_subs(H,v_gapopen);
+ E = vec_max(E,tmp);
+
+ // update F value
+ F = vec_subs(Fup,v_gapextend);
+ tmp = vec_subs(Hup1,v_gapopen);
+ F = vec_max(F,tmp);
+
+ // add score to H
+ H = vec_adds(Hup2,v_score);
+ H = vec_subs(H,v_bias);
+
+ // set H to max of H,E,F
+ H = vec_max(H,E);
+ H = vec_max(H,F);
+
+ // Save value to use for next diagonal H
+ Hup2 = Hup1;
+
+ // Update highest score encountered this far
+ v_maxscore = vec_max(v_maxscore,H);
+ }
+ vec_st(F, 512, p);
+ vec_st(H, 528, p);
+
+ query_profile_byte += 16*alphabet_size;
+
+ // End of this row (actually 16 rows due to SIMD).
+ // Before we continue, check for overflow.
+ tmp = vec_subs(vec_splat_u8(-1),v_bias);
+ overflow = vec_any_ge(v_maxscore,tmp);
+
+
+ }
+
+ if(overflow)
+ {
+ return 255;
+ }
+ else
+ {
+ // find largest score in the v_maxscore vector
+ tmp = vec_sld(v_maxscore,v_maxscore,8);
+ v_maxscore = vec_max(v_maxscore,tmp);
+ tmp = vec_sld(v_maxscore,v_maxscore,4);
+ v_maxscore = vec_max(v_maxscore,tmp);
+ tmp = vec_sld(v_maxscore,v_maxscore,2);
+ v_maxscore = vec_max(v_maxscore,tmp);
+ tmp = vec_sld(v_maxscore,v_maxscore,1);
+ v_maxscore = vec_max(v_maxscore,tmp);
+
+ // store in temporary variable
+ vec_ste(v_maxscore,0,&score);
+
+ // return largest score
+ return score;
+ }}
+
+
+#else
+
+/* No Altivec support. Avoid compiler complaints about empty object */
+
+int sw_dummy;
+
+#endif
diff --git a/src/smith_waterman_altivec.h b/src/smith_waterman_altivec.h
new file mode 100644
index 0000000..0437552
--- /dev/null
+++ b/src/smith_waterman_altivec.h
@@ -0,0 +1,26 @@
+/* $Id: smith_waterman_altivec.h 625 2011-03-23 17:21:38Z wrp $ */
+/* $Revision: 625 $ */
+
+int /* NOTE(review): return contract is not visible from this header; confirm against the definition in smith_waterman_altivec.c */
+smith_waterman_altivec_word(const unsigned char * query_sequence,
+ unsigned short * query_profile_word, /* 16-bit profile; the byte variant declared below takes unsigned char * here */
+ const int query_length,
+ const unsigned char * db_sequence,
+ const int db_length,
+ unsigned short bias, /* unsigned short here, unsigned char in the byte variant */
+ unsigned short gap_open, /* unsigned short here, unsigned char in the byte variant */
+ unsigned short gap_extend, /* unsigned short here, unsigned char in the byte variant */
+ struct f_struct * f_str); /* struct f_struct is not declared in this header, so it must be declared before this header is included */
+
+
+int /* the visible end of the byte implementation returns 255 when any lane of the running maximum reaches 255 - bias (overflow), otherwise the largest score found */
+smith_waterman_altivec_byte(const unsigned char * query_sequence,
+ unsigned char * query_profile_byte, /* read with vec_ld(16*k, ...) where k is a db_sequence residue code; advanced by 16*alphabet_size after each strip of 16 query rows */
+ const int query_length,
+ const unsigned char * db_sequence, /* NOTE(review): the unrolled loop reads db_sequence[j+16] while j < db_length, i.e. beyond db_length -- presumably the caller pads the buffer; confirm */
+ const int db_length, /* bounds the loops over database positions (j < db_length, then j < db_length+15) */
+ unsigned char bias, /* presumably the source of v_bias, which is subtracted (saturating) from H after the profile score is added -- TODO confirm */
+ unsigned char gap_open, /* presumably the source of v_gapopen, subtracted (saturating) from H when a gap is opened -- TODO confirm */
+ unsigned char gap_extend, /* presumably the source of v_gapextend, subtracted (saturating) from E and F when a gap is extended -- TODO confirm */
+ struct f_struct * f_str); /* not declared in this header, so it must be declared before inclusion; NOTE(review): presumably supplies the workspace buffer, as f_str->workspace does in the SSE2 version */
+
diff --git a/src/smith_waterman_sse2.c b/src/smith_waterman_sse2.c
new file mode 100644
index 0000000..dbc1cfa
--- /dev/null
+++ b/src/smith_waterman_sse2.c
@@ -0,0 +1,432 @@
+/******************************************************************
+ Copyright 2006 by Michael Farrar. All rights reserved.
+ This program may not be sold or incorporated into a commercial product,
+ in whole or in part, without written consent of Michael Farrar. For
+ further information regarding permission for use or reproduction, please
+ contact: Michael Farrar at farrar.michael at gmail.com.
+*******************************************************************/
+
+/*
+ Written by Michael Farrar, 2006.
+ Please send bug reports and/or suggestions to farrar.michael at gmail.com.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "defs.h"
+#include "param.h"
+#include "dropgsw2.h"
+#include "smith_waterman_sse2.h"
+
+#ifdef __SUNPRO_C
+#include <sunmedia_intrin.h>
+#else
+#include <emmintrin.h>
+#endif
+
+#ifdef SW_SSE2
+/* smith_waterman_sse2_word(): best Smith-Waterman local score computed with 8 signed 16-bit lanes per SSE2 vector; lane values are kept biased by -32768 and the bias is removed on return. query_sequence is not referenced in this function. */
+int
+smith_waterman_sse2_word(const unsigned char * query_sequence,
+ unsigned short * query_profile_word,
+ const int query_length,
+ const unsigned char * db_sequence,
+ const int db_length,
+ unsigned short gap_open,
+ unsigned short gap_extend,
+ struct f_struct * f_str)
+{
+ int i, j, k;
+ short score;
+
+ int cmp;
+ int iter = (query_length + 7) / 8; /* number of 8-lane vectors needed to cover the query, rounded up */
+
+
+ __m128i *p;
+ __m128i *workspace = (__m128i *) f_str->workspace; /* NOTE(review): accessed with aligned load/store and indexed up to 3*iter vectors -- confirm alignment and size where it is allocated */
+
+ __m128i E, F, H;
+
+ __m128i v_maxscore;
+ __m128i v_gapopen;
+ __m128i v_gapextend;
+
+ __m128i v_min;
+ __m128i v_minimums; /* assigned below but not read again in this function */
+ __m128i v_temp;
+
+ __m128i *pHLoad, *pHStore;
+ __m128i *pE;
+
+ __m128i *pScore;
+
+ /* Load gap opening penalty to all elements of a constant */
+ v_gapopen = _mm_setzero_si128(); /* Apple Devel */
+ v_gapopen = _mm_insert_epi16 (v_gapopen, gap_open, 0);
+ v_gapopen = _mm_shufflelo_epi16 (v_gapopen, 0);
+ v_gapopen = _mm_shuffle_epi32 (v_gapopen, 0);
+
+ /* Load gap extension penalty to all elements of a constant */
+ v_gapextend = _mm_setzero_si128(); /* Apple Devel */
+ v_gapextend = _mm_insert_epi16 (v_gapextend, gap_extend, 0);
+ v_gapextend = _mm_shufflelo_epi16 (v_gapextend, 0);
+ v_gapextend = _mm_shuffle_epi32 (v_gapextend, 0);
+
+ /* set every 16-bit lane of v_maxscore to 0x8000 (-32768): the */
+ /* compare-equal yields all ones and the shift left by 15 keeps only */
+ /* the sign bit. -32768 is the biased zero, giving the full short range. */
+ v_maxscore = _mm_setzero_si128(); /* Apple Devel */
+ v_maxscore = _mm_cmpeq_epi16 (v_maxscore, v_maxscore);
+ v_maxscore = _mm_slli_epi16 (v_maxscore, 15);
+
+ v_minimums = _mm_shuffle_epi32 (v_maxscore, 0);
+ /* v_min keeps 0x8000 in the lowest 16-bit lane only (byte shift right by 14); it refills lane 0 after H or F is shifted up one lane */
+ v_min = _mm_shuffle_epi32 (v_maxscore, 0);
+ v_min = _mm_srli_si128 (v_min, 14);
+
+ /* Fill the first 2*iter workspace vectors with the biased zero (-32768), not with literal zeros */
+ k = 2 * iter;
+
+ p = workspace;
+ for (i = 0; i < k; i++)
+ {
+ _mm_store_si128 (p++, v_maxscore);
+ }
+ /* workspace layout: E (iter vectors) followed by two H buffers of iter vectors each; the third region is stored to before it is loaded because the buffers are swapped below */
+ pE = workspace;
+ pHStore = pE + iter;
+ pHLoad = pHStore + iter;
+
+ for (i = 0; i < db_length; ++i)
+ {
+ /* profile row for this database residue: iter vectors starting at db_sequence[i] * iter */
+ pScore = (__m128i *) query_profile_word + db_sequence[i] * iter;
+
+ /* bias all elements in F to -32768 */
+ F = _mm_setzero_si128(); /* Apple Devel */
+ F = _mm_cmpeq_epi16 (F, F);
+ F = _mm_slli_epi16 (F, 15);
+
+ /* start from the last H vector of the previous pass, shifted up one lane (2 bytes) with lane 0 reset to -32768 */
+ H = _mm_load_si128 (pHStore + iter - 1);
+ H = _mm_slli_si128 (H, 2);
+ H = _mm_or_si128 (H, v_min);
+ /* swap the H buffers: the previous pass's results become the load side */
+ p = pHLoad;
+ pHLoad = pHStore;
+ pHStore = p;
+
+ for (j = 0; j < iter; j++)
+ {
+ /* load E values */
+ E = _mm_load_si128 (pE + j);
+
+ /* add score to H (signed saturating add) */
+ H = _mm_adds_epi16 (H, *pScore++);
+
+ /* Update highest score encountered thus far */
+ v_maxscore = _mm_max_epi16 (v_maxscore, H);
+
+ /* get max from H, E and F */
+ H = _mm_max_epi16 (H, E);
+ H = _mm_max_epi16 (H, F);
+
+ /* save H values */
+ _mm_store_si128 (pHStore + j, H);
+
+ /* subtract the gap open penalty from H */
+ H = _mm_subs_epi16 (H, v_gapopen);
+
+ /* update E value */
+ E = _mm_subs_epi16 (E, v_gapextend);
+ E = _mm_max_epi16 (E, H);
+
+ /* update F value */
+ F = _mm_subs_epi16 (F, v_gapextend);
+ F = _mm_max_epi16 (F, H);
+
+ /* save E values */
+ _mm_store_si128 (pE + j, E);
+
+ /* load the next h value */
+ H = _mm_load_si128 (pHLoad + j);
+ }
+
+ /* reset pointers to the start of the saved data */
+ j = 0;
+ H = _mm_load_si128 (pHStore + j);
+
+ /* the computed F value is for the given column. since */
+ /* we are at the end, we need to shift the F value over */
+ /* to the next column. */
+ F = _mm_slli_si128 (F, 2);
+ F = _mm_or_si128 (F, v_min);
+ v_temp = _mm_subs_epi16 (H, v_gapopen);
+ v_temp = _mm_cmpgt_epi16 (F, v_temp);
+ cmp = _mm_movemask_epi8 (v_temp);
+ /* keep correcting the stored H/E values while any lane still has F > H - gap_open */
+ while (cmp != 0x0000)
+ {
+ E = _mm_load_si128 (pE + j);
+
+ H = _mm_max_epi16 (H, F);
+
+ /* save H values */
+ _mm_store_si128 (pHStore + j, H);
+
+ /* update E in case the new H value would change it */
+ H = _mm_subs_epi16 (H, v_gapopen);
+ E = _mm_max_epi16 (E, H);
+ _mm_store_si128 (pE + j, E);
+
+ /* update F value */
+ F = _mm_subs_epi16 (F, v_gapextend);
+
+ j++;
+ if (j >= iter)
+ {
+ j = 0;
+ F = _mm_slli_si128 (F, 2);
+ F = _mm_or_si128 (F, v_min);
+ }
+ H = _mm_load_si128 (pHStore + j);
+
+ v_temp = _mm_subs_epi16 (H, v_gapopen);
+ v_temp = _mm_cmpgt_epi16 (F, v_temp);
+ cmp = _mm_movemask_epi8 (v_temp);
+ }
+ }
+
+ /* find largest score in the v_maxscore vector: fold by 8, 4 and 2 bytes so lane 0 ends up holding the maximum of all 8 lanes */
+ v_temp = _mm_srli_si128 (v_maxscore, 8);
+ v_maxscore = _mm_max_epi16 (v_maxscore, v_temp);
+ v_temp = _mm_srli_si128 (v_maxscore, 4);
+ v_maxscore = _mm_max_epi16 (v_maxscore, v_temp);
+ v_temp = _mm_srli_si128 (v_maxscore, 2);
+ v_maxscore = _mm_max_epi16 (v_maxscore, v_temp);
+
+ /* extract the largest score (lane 0) */
+ score = _mm_extract_epi16 (v_maxscore, 0);
+
+ /* return largest score with the -32768 bias removed (32768 added back) */
+
+ /* fix for Mac OSX clang 4.1 */
+ /*
+#ifdef __clang__
+ if (score < 0) score += 32768;
+ return score;
+#else
+ */
+ return score + 32768;
+ /* #endif */
+}
+/* smith_waterman_sse2_byte(): best Smith-Waterman local score computed with 16 unsigned 8-bit lanes per SSE2 vector using saturating arithmetic; returns 255 when score + bias reaches 255 (possible overflow). query_sequence is not referenced in this function. */
+int
+smith_waterman_sse2_byte(const unsigned char * query_sequence,
+ unsigned char * query_profile_byte,
+ const int query_length,
+ const unsigned char * db_sequence,
+ const int db_length,
+ unsigned char bias,
+ unsigned char gap_open,
+ unsigned char gap_extend,
+ struct f_struct * f_str)
+{
+ int i, j, k;
+ int score;
+
+ int dup;
+ int cmp;
+ int iter = (query_length + 15) / 16; /* number of 16-lane vectors needed to cover the query, rounded up */
+
+ __m128i *p;
+ __m128i *workspace = (__m128i *) f_str->workspace; /* NOTE(review): accessed with aligned load/store and indexed up to 3*iter vectors -- confirm alignment and size where it is allocated */
+
+ __m128i E, F, H;
+
+ __m128i v_maxscore;
+ __m128i v_bias;
+ __m128i v_gapopen;
+ __m128i v_gapextend;
+
+ __m128i v_temp;
+ __m128i v_zero;
+
+ __m128i *pHLoad, *pHStore;
+ __m128i *pE;
+
+ __m128i *pScore;
+
+ /* Load the bias to all elements of a constant: duplicate the byte into a 16-bit word, then broadcast the word */
+ dup = ((short) bias << 8) | bias;
+ v_bias = _mm_setzero_si128();
+ v_bias = _mm_insert_epi16 (v_bias, dup, 0);
+ v_bias = _mm_shufflelo_epi16 (v_bias, 0);
+ v_bias = _mm_shuffle_epi32 (v_bias, 0);
+
+ /* Load gap opening penalty to all elements of a constant */
+ dup = ((short) gap_open << 8) | gap_open;
+ v_gapopen = _mm_setzero_si128();
+ v_gapopen = _mm_insert_epi16 (v_gapopen, dup, 0);
+ v_gapopen = _mm_shufflelo_epi16 (v_gapopen, 0);
+ v_gapopen = _mm_shuffle_epi32 (v_gapopen, 0);
+
+ /* Load gap extension penalty to all elements of a constant */
+ dup = ((short) gap_extend << 8) | gap_extend;
+ v_gapextend = _mm_setzero_si128();
+ v_gapextend = _mm_insert_epi16 (v_gapextend, dup, 0);
+ v_gapextend = _mm_shufflelo_epi16 (v_gapextend, 0);
+ v_gapextend = _mm_shuffle_epi32 (v_gapextend, 0);
+
+ /* initialize the max score */
+ /* v_maxscore = _mm_xor_si128 (v_maxscore, v_maxscore); - Apple Devel*/
+ v_maxscore = _mm_setzero_si128(); /* Apple Devel */
+
+ /* create a constant of all zeros for comparison */
+ /* v_zero = _mm_xor_si128 (v_zero, v_zero); - Apple Devel */
+ v_zero = _mm_setzero_si128(); /* Apple Devel */
+
+ /* Zero out the first 2*iter workspace vectors (v_maxscore is still all zeros here) */
+ k = iter * 2;
+
+ p = workspace;
+ for (i = 0; i < k; i++)
+ {
+ _mm_store_si128 (p++, v_maxscore);
+ }
+ /* workspace layout: E (iter vectors) followed by two H buffers of iter vectors each; the third region is stored to before it is loaded because the buffers are swapped below */
+ pE = workspace;
+ pHStore = pE + iter;
+ pHLoad = pHStore + iter;
+
+ for (i = 0; i < db_length; ++i)
+ {
+ /* profile row for this database residue: iter vectors starting at db_sequence[i] * iter */
+ pScore = (__m128i *) query_profile_byte + db_sequence[i] * iter;
+
+ /* zero out F value. */
+ /* F = _mm_xor_si128 (F, F); -Apple Devel */
+ F = _mm_setzero_si128(); /* Apple Devel */
+
+ /* start from the last H vector of the previous pass, shifted up one byte lane (zero enters lane 0) */
+ H = _mm_load_si128 (pHStore + iter - 1);
+ H = _mm_slli_si128 (H, 1);
+ /* swap the H buffers: the previous pass's results become the load side */
+ p = pHLoad;
+ pHLoad = pHStore;
+ pHStore = p;
+
+ for (j = 0; j < iter; j++)
+ {
+ /* load values E. */
+ E = _mm_load_si128 (pE + j);
+
+ /* add the profile value with unsigned saturation, then subtract the bias (result clamps at 0) */
+ H = _mm_adds_epu8 (H, *pScore++);
+ H = _mm_subs_epu8 (H, v_bias);
+
+ /* Update highest score encountered thus far */
+ v_maxscore = _mm_max_epu8 (v_maxscore, H);
+
+ /* get max from H, E and F */
+ H = _mm_max_epu8 (H, E);
+ H = _mm_max_epu8 (H, F);
+
+ /* save H values */
+ _mm_store_si128 (pHStore + j, H);
+
+ /* subtract the gap open penalty from H */
+ H = _mm_subs_epu8 (H, v_gapopen);
+
+ /* update E value */
+ E = _mm_subs_epu8 (E, v_gapextend);
+ E = _mm_max_epu8 (E, H);
+
+ /* update F value */
+ F = _mm_subs_epu8 (F, v_gapextend);
+ F = _mm_max_epu8 (F, H);
+
+ /* save E values */
+ _mm_store_si128 (pE + j, E);
+
+ /* load the next h value */
+ H = _mm_load_si128 (pHLoad + j);
+ }
+
+ /* reset pointers to the start of the saved data */
+ j = 0;
+ H = _mm_load_si128 (pHStore + j);
+
+ /* the computed F value is for the given column. since */
+ /* we are at the end, we need to shift the F value over */
+ /* to the next column. */
+ F = _mm_slli_si128 (F, 1);
+ v_temp = _mm_subs_epu8 (H, v_gapopen);
+ v_temp = _mm_subs_epu8 (F, v_temp);
+ v_temp = _mm_cmpeq_epi8 (v_temp, v_zero);
+ cmp = _mm_movemask_epi8 (v_temp);
+ /* cmp is 0xffff only when the saturating difference F - (H - gap_open) is zero in all 16 lanes, i.e. F <= H - gap_open everywhere */
+ while (cmp != 0xffff)
+ {
+ E = _mm_load_si128 (pE + j);
+
+ H = _mm_max_epu8 (H, F);
+
+ /* save H values */
+ _mm_store_si128 (pHStore + j, H);
+
+ /* update E in case the new H value would change it */
+ H = _mm_subs_epu8 (H, v_gapopen);
+ E = _mm_max_epu8 (E, H);
+ _mm_store_si128 (pE + j, E);
+
+ /* update F value */
+ F = _mm_subs_epu8 (F, v_gapextend);
+
+ j++;
+ if (j >= iter)
+ {
+ j = 0;
+ F = _mm_slli_si128 (F, 1);
+ }
+ H = _mm_load_si128 (pHStore + j);
+
+ v_temp = _mm_subs_epu8 (H, v_gapopen);
+ v_temp = _mm_subs_epu8 (F, v_temp);
+ v_temp = _mm_cmpeq_epi8 (v_temp, v_zero);
+ cmp = _mm_movemask_epi8 (v_temp);
+ }
+ }
+
+ /* find largest score in the v_maxscore vector: fold by 8, 4, 2 and 1 bytes so byte 0 ends up holding the maximum of all 16 lanes */
+ v_temp = _mm_srli_si128 (v_maxscore, 8);
+ v_maxscore = _mm_max_epu8 (v_maxscore, v_temp);
+ v_temp = _mm_srli_si128 (v_maxscore, 4);
+ v_maxscore = _mm_max_epu8 (v_maxscore, v_temp);
+ v_temp = _mm_srli_si128 (v_maxscore, 2);
+ v_maxscore = _mm_max_epu8 (v_maxscore, v_temp);
+ v_temp = _mm_srli_si128 (v_maxscore, 1);
+ v_maxscore = _mm_max_epu8 (v_maxscore, v_temp);
+
+ /* extract the low 16-bit word and keep only byte 0, which holds the maximum */
+ score = _mm_extract_epi16 (v_maxscore, 0);
+ score = score & 0x00ff;
+
+ /* check if we might have overflowed: score + bias >= 255 is reported as 255 */
+ if (score + bias >= 255)
+ {
+ score = 255;
+ }
+
+ /* return largest score */
+ return score;
+}
+#else
+
+/* No SSE2 support. Avoid compiler complaints about empty object */
+
+int sw_dummy;
+
+#endif
diff --git a/src/smith_waterman_sse2.h b/src/smith_waterman_sse2.h
new file mode 100755
index 0000000..8ff004d
--- /dev/null
+++ b/src/smith_waterman_sse2.h
@@ -0,0 +1,43 @@
+
+/* $Id: smith_waterman_sse2.h 625 2011-03-23 17:21:38Z wrp $ */
+/* $Revision: 625 $ */
+
+/******************************************************************
+ Copyright 2006 by Michael Farrar. All rights reserved.
+ This program may not be sold or incorporated into a commercial product,
+ in whole or in part, without written consent of Michael Farrar. For
+ further information regarding permission for use or reproduction, please
+ contact: Michael Farrar at farrar.michael at gmail.com.
+*******************************************************************/
+
+/*
+ Written by Michael Farrar, 2006.
+ Please send bug reports and/or suggestions to farrar.michael at gmail.com.
+*/
+
+#ifndef SMITH_WATERMAN_SSE2_H
+#define SMITH_WATERMAN_SSE2_H
+
+int
+smith_waterman_sse2_word(const unsigned char * query_sequence,
+ unsigned short * query_profile_word,
+ const int query_length,
+ const unsigned char * db_sequence,
+ const int db_length,
+ unsigned short gap_open,
+ unsigned short gap_extend,
+ struct f_struct * f_str);
+
+
+int
+smith_waterman_sse2_byte(const unsigned char * query_sequence,
+ unsigned char * query_profile_byte,
+ const int query_length,
+ const unsigned char * db_sequence,
+ const int db_length,
+ unsigned char bias,
+ unsigned char gap_open,
+ unsigned char gap_extend,
+ struct f_struct * f_str);
+
+#endif /* SMITH_WATERMAN_SSE2_H */
diff --git a/src/structs.h b/src/structs.h
new file mode 100644
index 0000000..9065b89
--- /dev/null
+++ b/src/structs.h
@@ -0,0 +1,196 @@
+/* $Id: structs.h 1259 2014-05-28 19:28:06Z wrp $ */
+
+#include "rstruct.h"
+
+#include "aln_structs.h"
+
+#include "param.h"
+
+struct hist_str {
+ int histflg;
+ int *hist_a;
+ int histint, min_hist, max_hist, maxh;
+ long entries;
+ int z_calls;
+ char stat_info[MAX_STR];
+};
+
+struct db_str {
+ long entries;
+ unsigned long length;
+ int carry;
+};
+
+struct mng_thr { /* structure to keep track of thread buffers */
+ int max_work_buf; /* number of threads buffers */
+ int max_buf2_res; /* number of results/thread buffer */
+ int max_chain_seqs; /* number of sequences/seqr chain */
+ int seq_buf_size; /* size of sequence buffer in each max_seq_cnt chain */
+};
+
+/* values required for successive calls to getlib() */
+struct lib_seq_info {
+ int ldnaseq;
+ int term_code;
+ int maxn;
+ int dupn;
+ int l_overlap;
+ int maxt3;
+};
+
+/* used by help system to describe optional arguments */
+struct opt_def_str {
+ char opt_char;
+ int has_arg;
+ char *opt_str;
+ char *opt_descr_s;
+ char *opt_descr_l;
+ int opt_rank;
+ int fmt_type;
+ int i_param1;
+ int i_param2;
+ double d_param1;
+ double d_param2;
+ char *s_param;
+};
+
+struct markx_str {
+ /* values to be copied into m_msg to modify format */
+ int markx;
+ int nohist;
+ int ashow;
+ int show_code;
+ int long_info;
+ int aln_llen;
+ int aln_llcntx;
+ int aln_llcntx_set;
+ int std_output;
+
+ char *out_file;
+ FILE *out_fd;
+ struct markx_str *next;
+};
+
+struct mngmsg /* Message from host to manager */
+{
+ char pgm_name[MAX_FN]; /* program name from argv[0] */
+ long max_memK; /* maximum amount of sequence buffer memory */
+ int cur_seqr_cnt; /* current number of seqr chains */
+ int n0; /* query length ^qm_msg */
+ int nm0; /* number of segments ^qm_msg */
+ int nmoff; /* length of fastf segment */
+ unsigned char *aa0a; /* annotation array */
+ struct annot_str *annot_p; /* annot_str for query */
+ unsigned char ann_arr[MAX_FN]; /* annotation characters */
+ int ann_arr_n; /* number of annotation characters */
+ char *ann_arr_def[MAX_FN]; /* definitions of ann_arr characters */
+ int ann_flg; /* have annotation array, characters */
+ int have_ann; /* have annotation on this query */
+ char tname[MAX_FN]; /* Query sequence name */
+ int tnamesize; /* Query name size */
+ char lname[MAX_LSTR]; /* Library file name */
+ char link_lname[MAX_LSTR]; /* link-library name */
+ char annot0_sname[MAX_LSTR]; /* query annotation script name */
+ char annot1_sname[MAX_LSTR]; /* library annotation script name */
+ int max_tot; /* function defined total sequence area */
+ int qdnaseq; /* query is protein (0)/dna (1) */
+ int q_overlap; /* overlap when segmenting long query sequence */
+ long q_offset; /* q_offset, l_offset are set outside getlib()
+ and report the number of residues that were
+ skipped/read-previously to get to this
+ version of aa0/aa1 (0-based) */
+ long q_off; /* q_off, l_off are set by getlib(); starting
+ at 1 but modified by @C: the position of
+ the first residue of aa0/aa1 in the
+ original sequence */
+ int qframe; /* number of possible query frames */
+ int nframe; /* frame for TFASTA */
+ int nitt1; /* nframe-1 */
+ int thr_fact; /* fudge factor for threads */
+ int s_int; /* sampling interval for statistics */
+ int ql_start; /* starting query sequence */
+ int ql_stop; /* ending query sequence */
+ int pbuf_siz; /* buffer size for sequences send in p2_complib */
+ char qtitle[MAX_STR]; /* query title */
+ char ltitle[MAX_STR]; /* library title */
+ char flstr[MAX_FN]; /* FASTLIBS string */
+ int std_output; /* produce normal output */
+ char outfile[MAX_FN];
+ FILE *outfd; /* indicates outfile is already open */
+ char label [MAX_SSTR]; /* Output label, "opt", "s-w", "initn init1", "initn opt" */
+ char alabel[MAX_SSTR]; /* Output label, "Smith-Waterman", "banded Smith-Waterman", etc */
+ char f_id0[4]; /* function id for markx==10 */
+ char f_id1[4]; /* function id for markx==10 */
+ char sqnam[4]; /* "aa" or "nt" */
+ char sqtype[10]; /* "DNA" or "protein" */
+ int long_info; /* long description flag*/
+ long sq0off, sq1off; /* virtual offset into aa0, aa1 */
+ int markx; /* alignment display type */
+ int tot_markx; /* markx as summ of all alternative markx */
+ struct markx_str *markx_list; /* list of alternate outputs */
+ int nbr_seq; /* number of library sequences */
+ int n1_high; /* upper limit on sequence length */
+ int n1_low; /* lower limit on sequence length */
+ double e_cut; /* e_value for display */
+ double e_low; /* e_value for display */
+ int e_cut_set; /* e_value deliberately set */
+ int zsflag; /* zsflag for all searches */
+ int zsflag2; /* zsflag2 for all searches */
+ int pamd1; /* 1st dimension of pam matrix */
+ int pamd2; /* 2nd dimension of pam matrix */
+ int revcomp; /* flag to do reverse complement */
+ int quiet; /* quiet option */
+ int nrelv; /* number of interesting scores */
+ int srelv; /* number of scores to show in showbest */
+ int arelv; /* number of scores to show at alignment */
+ int z_bits; /* z_bits==1: show bit score, ==0 show z-score */
+ int tot_ident; /* tot_ident=1 -> no mismatches for 100% identity */
+ char alab[3][24]; /* labels for alignment scores */
+ int nohist; /* no histogram option */
+ int do_showbest; /* do not showbest() */
+ int nshow; /* number shown in showbest() */
+ int nskip; /* number skipped by e_low */
+ int mshow; /* number of scores to show */
+ int mshow_set; /* mshow set with -b */
+ int mshow_min; /* at least mshow scores must be shown, but limited by e_cut */
+ int ashow; /* number of alignments to show */
+ int ashow_set; /* ashow set with -d */
+ int nmlen; /* length of name label */
+ int show_code; /* show alignment code in -m 9; ==1 => identity only, ==2 alignment code*/
+ int tot_show_code; /* show alignment for all outputs */
+ int pre_load_done; /* set after pre_load_best() call */
+ int align_done; /* do_walign() called */
+ unsigned char *aa1save_buf_b; /* buffer for re_getlib() sequences for alignments */
+ char *bline_buf_b; /* buffer for re_getlib() sequences for alignments */
+ int thold; /* threshold */
+ int max_repeat; /* max number of non-intersecting local alignments */
+ int last_calc_flg; /* needs a last calculation stage */
+ int qshuffle; /* shuffle the query and do additional comparisons ^qm_msg*/
+ int shuff_max; /* number of shuffles to perform */
+ int shuff_max_save; /* number of shuffles to perform */
+ int shuff_node; /* number of shuffles/worker node */
+ int shuff_wid;
+ int stages; /* number of stages */
+ double Lambda, K, H; /* (ungapped) Karlin-Altschul parameters */
+ int escore_flg; /* use escore calculated by do_work() */
+ struct lib_seq_info ldb_info; /* maxn, maxt, l_overlap, ldnaseq */
+ struct db_str db; /* the database size as read */
+ struct db_str ldb; /* the database size via save_best() */
+ struct score_count_s s_info; /* counts of different score types */
+ struct score_count_s ss_info; /* counts of different score types from shuffles */
+ struct hist_str hist;
+ void *pstat_void; /* pstat structure for standard statistics */
+ void *pstat_void2; /* pstat structure for zsflag > 2 */
+ struct a_struct aln; /* has llen, llnctx, llnctx_flg, showall */
+ struct a_res_str a_res; /* has individual alignment coordinates */
+ char dfile [MAX_FN]; /* file for dumping scores to */
+};
+
+struct lib_struct {
+ char *file_name; /* this library file */
+ int lib_type; /* the library type can be specified here, for files in an indirect list */
+ /* struct lib_mol_info *lib_params; */ /* parameters (ldnaseq, term_code, constant for all libraries */
+ struct lmf_str *acc_file_p;
+ struct lmf_str *m_file_p; /* magic *m_file_p for reading */
+ struct lib_struct *next; /* next in the list */
+};
diff --git a/src/tatstats.c b/src/tatstats.c
new file mode 100644
index 0000000..ed5413e
--- /dev/null
+++ b/src/tatstats.c
@@ -0,0 +1,583 @@
+/* $Id: tatstats.c 1202 2013-07-20 12:55:32Z wrp $ */
+/* $Revision: 1202 $ */
+
+/* copyright (c) 1996, 1997, 1998, 1999, 2014 by William R. Pearson and the
+ The Rector & Visitors of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <string.h>
+
+#include "defs.h"
+#include "param.h"
+#define XTERNAL
+#include "upam.h"
+#undef XTERNAL
+#include "tatstats.h"
+#include "best_stats.h"
+#define XTERNAL
+#include "uascii.h"
+
+/* calc_priors() - calculate frequencies of amino-acids, possibly with counts */
+/* generate_tatprobs() - build the table of score probabilities if the
+ sequences are not too long */
+
+double
+det(double a11, double a12, double a13,
+ double a21, double a22, double a23,
+ double a31, double a32, double a33);
+
+double power(double r, int p)
+{
+  /* r raised to the integer power p by repeated squaring; a negative p yields the reciprocal */
+  double result = 1.0;
+  int invert = (p < 0);
+
+  if (r == 0.0) { return (p == 0) ? 1.0 : 0.0; }
+  if (invert) { p = -p; }
+
+  for ( ; p > 0; p >>= 1) {
+    if (p & 1) { result *= r; }
+    if (p >> 1) { r *= r; }	/* square only while higher exponent bits remain */
+  }
+  return invert ? 1.0 / result : result;
+}
+
+double
+factorial (int a, int b) {
+  /* falling product a*(a-1)*...*(b+1), i.e. a!/b! when a >= b >= 0; */
+  /* 1.0 when a == 0 or a <= b, since no factor is multiplied in */
+  double product = 1.0;
+  int term;
+
+  if (a == 0) { return 1.0; }
+
+  for (term = a; term > b; term--) {
+    product *= (double) term;
+  }
+  return product;
+}
+/* calc_priors() - fill priors[1..nsq-1] with residue frequencies: the cached f_str->priors (n1 == 0 and already set), built-in background counts (n1 == 0), or counts from aa1[0..n1-1] blended with pseudocts * f_str->priors */
+void
+calc_priors(double *priors,
+	    struct pstruct *ppst,
+	    struct f_struct *f_str,
+	    const unsigned char *aa1, int n1,
+	    int pseudocts)
+{
+  long *counts, sum;
+  int i, n_counts;
+
+  if (ppst->nsq > MAXUC) {
+    fprintf(stderr,"*** ERROR *** tatstats.c:calc_priors nsq [ %d] out of range [%d]\n",
+	    ppst->nsq, MAXUC);
+  }
+  n_counts = ppst->nsq;	/* counts[] gets exactly nsq slots: valid indices are 0..nsq-1 */
+
+  if ((counts = (long *)calloc(n_counts,sizeof(long)))==NULL) {
+    fprintf(stderr,"*** ERROR *** tatstats.c:calc_priors cannot allocate counts[] for priors\n");
+    exit(1);
+  }
+  /* no sequence given and f_str->priors already filled in: just copy the cached values */
+  if (n1 == 0 && f_str->priors[1] > 0.0) {
+    for(i = 1 ; i < ppst->nsq ; i++) {
+      priors[i] = f_str->priors[i];
+    }
+    free(counts);
+    return;
+  }
+
+  /* pre-initialize counts/priors for all library sequences */
+  if(n1 == 0) {
+    if (ppst->dnaseq==SEQT_PROT ) {
+
+      /* Robinson & Robinson residue counts from Stephen Altschul */
+      counts[pascii['A']] = 35155;    /* A */
+      counts[pascii['R']] = 23105;    /* R */
+      counts[pascii['N']] = 20212;    /* N */
+      counts[pascii['D']] = 24161;    /* D */
+      counts[pascii['C']] = 8669;     /* C */
+      counts[pascii['Q']] = 19208;    /* Q */
+      counts[pascii['E']] = 28354;    /* E */
+      counts[pascii['G']] = 33229;    /* G */
+      counts[pascii['H']] = 9906;     /* H */
+      counts[pascii['I']] = 23161;    /* I */
+      counts[pascii['L']] = 40625;    /* L */
+      counts[pascii['K']] = 25872;    /* K */
+      counts[pascii['M']] = 10101;    /* M */
+      counts[pascii['F']] = 17367;    /* F */
+      counts[pascii['P']] = 23435;    /* P */
+      counts[pascii['S']] = 32070;    /* S */
+      counts[pascii['T']] = 26311;    /* T */
+      counts[pascii['W']] = 5990;     /* W */
+      counts[pascii['Y']] = 14488;    /* Y */
+      counts[pascii['V']] = 29012;    /* V */
+      counts[pascii['B']] = 0;        /* B */
+      counts[pascii['X']] = 0;        /* X */
+      counts[pascii['Z']] = 0;        /* Z */
+      counts[pascii['U']] = 0;        /* U */
+      counts[pascii['*']] = 0;        /* * */
+      counts[pascii['O']] = 0;        /* O */
+      counts[pascii['J']] = 0;        /* J */
+    }
+    else { /* SEQT_DNA */
+      counts[pascii['A']] = 250;
+      counts[pascii['C']] = 250;
+      counts[pascii['G']] = 250;
+      counts[pascii['T']] = 250;
+      for (i=5; i<n_counts; i++) counts[i]=0;
+    }
+  }
+  else { /* initialize counts for THIS library sequence */
+    for(i = 0 ; i < n1 ; i++) {
+      if(aa1[i] >= ppst->nsq || aa1[i] < 1) continue;	/* was '>': a code equal to nsq indexed one slot past the end of counts[] */
+      counts[aa1[i]]++;
+    }
+  }
+  /* total over residues 1..nsq-1; index 0 is never counted */
+  sum = 0;
+  for(i = 1 ; i < ppst->nsq ; i++) sum += counts[i];
+
+  for(i = 1 ; i < ppst->nsq ; i++) {
+    if(n1 == 0) { /* pre-initialize */
+      priors[i] = (double) counts[i] / (double) sum;
+    } else { /* THIS library sequence */
+      priors[i] = ( ((double) pseudocts * f_str->priors[i]) + (double) counts[i] ) / ( (double) sum + (double) pseudocts );
+    }
+  }
+  free(counts);
+  return;
+}
+
+int
+max_score(int *scores, int nsq) {
+  /* largest of scores[1..nsq-1]; index 0 is skipped, -BIGNUM when nsq <= 1 */
+  int best = -BIGNUM;
+  int idx = 1;
+
+  while (idx < nsq) {
+    best = (scores[idx] > best) ? scores[idx] : best;
+    idx++;
+  }
+  return best;
+}
+
+int
+min_score(int *scores, int nsq) {
+  /* smallest of scores[1..nsq-1]; index 0 is skipped, BIGNUM when nsq <= 1 */
+  int lowest = BIGNUM;
+  int idx = 1;
+  while (idx < nsq) {
+    lowest = (scores[idx] < lowest) ? scores[idx] : lowest;
+    idx++;
+  }
+  return lowest;
+}
+/* calc_tatusov(): sums lengths and scores over 'this' plus the chain starting at 'last' (followed through ->prev), obtains a score distribution (precalculated for FASTS/FASTM when usable, otherwise via generate_tatprobs()), integrates it at sumscore and scales the result with factorial() or power(). Stores the distribution in this->newtat. aa1, pseudocts, do_opt and zsflag are only referenced by commented-out code. */
+double
+calc_tatusov ( struct slink *last,
+ struct slink *this,
+ const unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1,
+ int **pam2, int nsq,
+ struct f_struct *f_str,
+ int pseudocts,
+ int do_opt,
+ int zsflag
+ )
+{
+ int i, is, j, k;
+
+ double *priors, my_priors[MAXUC], tatprob, left_tatprob, right_tatprob; /* my_priors is only referenced by the commented-out calc_priors() call below */
+ unsigned char *query = NULL;
+ int length, maxlength, sumlength, sumscore, tmp, seg;
+ int start, stop;
+ struct slink *sl;
+ int N;
+ double *tatprobsptr;
+
+#if defined(FASTS) || defined(FASTM)
+ int index = 0;
+ int notokay = 0;
+#endif
+
+ struct tat_str *oldtat = NULL, *newtat = NULL;
+
+#if defined(FASTS) || defined(FASTM)
+ start = this->vp->start - this->vp->dp + f_str->noff;
+ stop = this->vp->stop - this->vp->dp + f_str->noff;
+ tmp = stop - start + 1;
+#else
+ /*
+ FASTF alignments can also hang off the end of library sequences,
+ but no query residues are used up in the process, so the segment
+ length is the number of query positions flagged in vp->used[]
+ */
+ tmp = 0;
+ for(i = 0, j = 0 ; i < n0 ; i++) {
+ if (this->vp->used[i] == 1) {tmp++; }
+ }
+#endif
+
+ sumlength = maxlength = length = tmp;
+ seg = 1;
+ sumscore = this->vp->score;
+
+#if defined(FASTS) || defined(FASTM)
+ if(f_str->aa0b[start] == start && f_str->aa0e[stop] == stop) {
+ index |= (1 << f_str->aa0i[start]);
+ } else {
+ notokay |= (1 << f_str->aa0i[start]);
+ }
+#endif
+
+ for(sl = last; sl != NULL ; sl = sl->prev) {
+
+#if defined(FASTS) || defined(FASTM)
+ start = sl->vp->start - sl->vp->dp + f_str->noff;
+ stop = sl->vp->stop - sl->vp->dp + f_str->noff;
+ tmp = stop - start + 1;
+#else
+ tmp = 0;
+ for(i = 0, j = 0 ; i < n0 ; i++) {
+ if(sl->vp->used[i] == 1) {
+ tmp++;
+ }
+ }
+#endif
+ sumlength += tmp;
+ maxlength = tmp > maxlength ? tmp : maxlength;
+ seg++;
+ sumscore += sl->vp->score;
+
+#if defined(FASTS) || defined(FASTM)
+ if(f_str->aa0b[start] == start && f_str->aa0e[stop] == stop) {
+ index |= (1 << f_str->aa0i[start]);
+ } else {
+ notokay |= (1 << f_str->aa0i[start]);
+ }
+#endif
+
+ }
+
+ tatprob = -1.0;
+
+#if defined(FASTS) || defined(FASTM)
+
+ /* for T?FASTS, we try to use what we've precalculated: */
+
+ /* with z = 3, do_opt is true, but we can use precalculated - with
+ all other z's we can use precalculated only if !do_opt */
+ if(!notokay && f_str->tatprobs != NULL) {
+ /* create our own newtat and copy f_str's tat into it */
+ index--;
+
+ newtat = (struct tat_str *) malloc(sizeof(struct tat_str));
+ if(newtat == NULL) {
+ fprintf(stderr, "Couldn't calloc memory for newtat.\n");
+ exit(1);
+ }
+
+ memcpy(newtat, f_str->tatprobs[index], sizeof(struct tat_str));
+
+ newtat->probs = (double *) calloc(f_str->tatprobs[index]->highscore - f_str->tatprobs[index]->lowscore + 1, sizeof(double));
+ if(newtat->probs == NULL) {
+ fprintf(stderr, "Couldn't calloc memory for newtat->probs.\n");
+ exit(1);
+ }
+
+ memcpy(newtat->probs, f_str->tatprobs[index]->probs,
+ (f_str->tatprobs[index]->highscore - f_str->tatprobs[index]->lowscore + 1) * sizeof(double));
+
+
+ tatprob = f_str->intprobs[index][sumscore - f_str->tatprobs[index]->lowscore];
+ /*
+ if (tatprob > 0.0 && tatprob < 1e-50) {
+ fprintf(stderr," tatprob[%d][%d] near zero: %lf\n",index,sumscore - f_str->tatprobs[index]->lowscore,tatprob);
+ }
+ */
+ } else { /* we need to recalculate from scratch */
+#endif
+
+ /* for T?FASTF, we're always recalculating from scratch: */
+
+ query = (unsigned char *) calloc(length, sizeof(unsigned char));
+ if(query == NULL) {
+ fprintf(stderr, "Couldn't calloc memory for query.\n");
+ exit(1);
+ }
+
+#if defined(FASTS) || defined(FASTM)
+ start = this->vp->start - this->vp->dp + f_str->noff;
+ for(i = 0, j = 0 ; i < length ; i++) {
+ query[j++] = aa0[start + i];
+ }
+#else
+ for(i = 0, j = 0 ; i < n0 ; i++) {
+ if (this->vp->used[i] == 1) {query[j++] = aa0[i];}
+ }
+#endif
+
+ /* calc_priors - not currently implemented for aa1 dependent */
+ /*
+ if( (do_opt && zsflag == 2) || zsflag == 4 ) {
+ priors = &my_priors[0];
+ calc_priors(priors, f_str, aa1, n1, pseudocts);
+ } else {
+ priors = f_str->priors;
+ }
+ */
+
+ priors = f_str->priors;
+ oldtat = (last != NULL ? last->tat : NULL);
+
+ generate_tatprobs(query, 0, length - 1, priors, pam2, nsq, &newtat, oldtat);
+
+ free(query);
+#if defined(FASTS) || defined(FASTM)
+ } /* close the FASTS-specific if-else from above */
+#endif
+
+ this->newtat = newtat;
+
+ if(tatprob < 0.0) { /* hasn't been set by precalculated FASTS intprobs */
+
+ /* integrate probabilities >= sumscore */
+ tatprobsptr = newtat->probs;
+
+ is = i = newtat->highscore - newtat->lowscore;
+ N = sumscore - newtat->lowscore;
+
+ right_tatprob = 0;
+ for ( ; i >= N; i--) {
+ right_tatprob += tatprobsptr[i];
+ }
+ /* NOTE(review): left_tatprob starts at probs[0] even when N == 0, where index 0 was already added to right_tatprob; N is also not range-checked against [0, highscore-lowscore] -- confirm both are intended */
+ left_tatprob = tatprobsptr[0];
+ for (i = 1 ; i < N ; i++ ) {
+ left_tatprob += tatprobsptr[i];
+ }
+
+ if (right_tatprob < left_tatprob) {tatprob = right_tatprob;}
+ else {tatprob = 1.0 - left_tatprob;}
+
+ tatprob /= (right_tatprob+left_tatprob);
+ }
+ /* widen n1 by 2*(maxlength-1) before scaling -- presumably to allow segments overhanging either end of the library sequence; TODO confirm */
+ if (maxlength > 0) {
+ n1 += 2 * (maxlength - 1);
+ }
+
+#ifndef FASTM
+ tatprob *= factorial(n1 - sumlength + seg, n1 - sumlength);
+#else
+ tatprob *= power(n1 - sumlength,seg)/(1<<seg);
+#endif
+ /* values above 0.01 are converted with 1 - exp(-x); smaller values are returned unchanged */
+ if(tatprob > 0.01)
+ tatprob = 1.0 - exp(-tatprob);
+
+ return tatprob;
+}
+
+/* generates the probability of every score the query[begin..end] segment can produce, extending
+ oldtat's distribution when one is given; the newly allocated tat_str is returned through *tatarg */
+void
+generate_tatprobs(const unsigned char *query,
+ int begin,
+ int end,
+ double *priors,
+ int **pam2,
+ int nsq,
+ struct tat_str **tatarg,
+ struct tat_str *oldtat)
+{
+
+ int i, j, k, l, m, n, N, highscore, lowscore;
+ int *lowrange = NULL, *highrange = NULL;
+ int show_probs = 0; /* only referenced by commented-out debugging code below */
+ double *probs = NULL, *newprobs = NULL, *priorptr, tmp;
+ struct tat_str *tatprobs = NULL;
+ int *pamptr, *pamptrsave;
+ int last_zero; /* assigned below but never read in this function */
+
+ if((tatprobs = (struct tat_str *) calloc(1, sizeof(struct tat_str)))==NULL) {
+ fprintf(stderr, "Couldn't allocate individual tatprob struct.\n");
+ exit(1);
+ }
+
+ n = end - begin + 1;
+
+ if ( (lowrange = (int *) calloc(n, sizeof(int))) == NULL ) {
+ fprintf(stderr, "Couldn't allocate memory for lowrange.\n");
+ exit(1);
+ }
+
+ if ( (highrange = (int *) calloc(n, sizeof(int))) == NULL ) {
+ fprintf(stderr, "Couldn't allocate memory for highrange.\n");
+ exit(1);
+ }
+
+ /* calculate the absolute highest and lowest score possible for this */
+ /* segment. Also, set the range we need to iterate over at each position */
+ /* in the query: */
+ if(oldtat == NULL) {
+ highscore = lowscore = 0;
+ } else {
+ highscore = oldtat->highscore;
+ lowscore = oldtat->lowscore;
+ }
+
+ for ( i = 0 ; i < n ; i++ ) {
+
+ if (query[begin+i] == 0) break; /* stop widening the range at a 0 residue code; later highrange[]/lowrange[] entries stay 0 from calloc */
+
+ highscore =
+ (highrange[i] = highscore + max_score(pam2[query[begin + i]], nsq));
+
+ lowscore =
+ (lowrange[i] = lowscore + min_score(pam2[query[begin + i]], nsq));
+
+ /*
+ fprintf(stderr, "i: %d, max: %d, min: %d, high[i]: %d, low[i]: %d, high: %d, low: %d, char: %d\n",
+ i,
+ max_score(pam2[query[begin + i]], nsq),
+ min_score(pam2[query[begin + i]], nsq),
+ highrange[i], lowrange[i],
+ highscore, lowscore, query[begin + i]);
+ */
+ }
+
+ /*
+ if (lowscore == -55 && highscore==34) {
+ show_probs = 1;
+ fprintf(stderr,"Range: low: %d -- high %d\n",lowscore, highscore);
+ }
+ */
+ /* allocate an array of probabilities for all possible scores */
+ /* i.e. if highest score possible is 50 and lowest score possible */
+ /* is -20, then there are 50 - (-20) + 1 = 71 possible different */
+ /* scores (including 0): */
+ N = highscore - lowscore;
+ if ( (probs = (double *) calloc(N + 1, sizeof(double))) == NULL ) {
+ fprintf(stderr, "Couldn't allocate probability matrix : %d.\n", N + 1);
+ exit(1);
+ }
+
+ if(oldtat == NULL) {
+ /* for the first position, iterate over the only possible scores, */
+ /* summing the priors for the amino acids that can yield each score. */
+ pamptr = pam2[query[begin]];
+ for ( i = 1 ; i < nsq ; i++ ) {
+ if(priors[i] > 0.0) {
+ /*
+ fprintf(stderr," updated: %d(%d)[%d]: %0.4g\n",pamptr[i]-lowscore,pamptr[i],i,priors[i]);
+ */
+ probs[(pamptr[i] - lowscore)] += priors[i];
+ }
+ }
+ } else {
+ /* Need to copy the data out of oldtat->probs into probs, offset by the amount lowscore dropped below oldtat->lowscore */
+ memcpy( &probs[oldtat->lowscore - lowscore],
+ oldtat->probs,
+ (oldtat->highscore - oldtat->lowscore + 1) * sizeof(double));
+ }
+
+ if ( (newprobs = (double *) calloc(N + 1, sizeof(double))) == NULL ) {
+ fprintf(stderr, "Couldn't allocate newprobs matrix.\n");
+ exit(1);
+ }
+
+ /* now for each remaining residue in the segment ... */
+ /* i is the position in the query (position 0 was handled above unless oldtat was supplied) */
+ for ( i = (oldtat == NULL ? 1 : 0) ; i < n ; i++ ) {
+
+ pamptrsave = pam2[query[begin + i]];
+
+ /* ... calculate new probability distribution .... */
+ /* ... for each possible score j (limited to current range) ... */
+ /* j is the possible score, as an index relative to lowscore */
+ for ( j = lowrange[i] - lowscore,
+ k = highrange[i] - lowscore ;
+ j <= k ;
+ j++ ) {
+
+ tmp = 0.0;
+ pamptr = &pamptrsave[1];
+ priorptr = &priors[1];
+ /* ... for each of the possible alignment scores at this position ... */
+ for ( l = 1 ;
+ l < nsq ;
+ l++) {
+
+ /*
+ if (*priorptr == 0.0) {
+ priorptr++;
+ pamptr++;
+ continue;
+ }
+ */
+ /* make sure we don't go past highest possible score, or past
+ the lowest possible score; not sure why this can happen */
+ m = j - *pamptr++;
+ if ( m <= N && m >= 0 ) {
+ /* update the probability of getting score j: */
+ tmp += probs[m] * *priorptr;
+ /*
+ if (show_probs && j==N)
+ fprintf(stderr,"probs[%d]: %lg i: %d j: %d l: %d(%c) pam2: %d this: %lg prior: %lg tmp: %lg\n",m,probs[m],i,j,l,NCBIstdaa[l],*(pamptr-1),(probs[m]* *priorptr),*priorptr, tmp);
+ */
+ }
+ priorptr++;
+ }
+
+ /* if (tmp >= 0.0 && tmp < 1e-50) {
+ fprintf(stderr," tmp[%d] near zero: %lg\n",j,tmp);
+ } */
+ newprobs[j] += tmp;
+ }
+
+ /* save the new set of probabilities, get rid of old; we don't
+ necessarily have to copy/clear all N+1 slots, we could use
+ high/low score boundaries -- not sure that's worth the
+ effort. */
+ memcpy(probs, newprobs, (N + 1) * sizeof(double));
+ memset(newprobs, 0, (N + 1) * sizeof(double));
+ }
+ /* the loop below runs exactly once (i == N): only the highest-score bin is checked for a near-zero probability */
+ last_zero = -100;
+ for (i=N; i < N+1; i++) {
+ tmp = probs[i];
+ if (tmp >= 0.0 && tmp < 1e-200) {
+ if (i == 1 || i == N) {
+ fprintf(stderr," *** tatstats.c/generate_tatprobs() probs[%d] near zero: %lg\n",i+lowscore,tmp);
+ }
+ last_zero = i;
+ }
+ }
+
+ free(newprobs);
+ free(highrange);
+ free(lowrange);
+
+ tatprobs->probs = probs; /* probs is handed to the returned struct, not freed here */
+ /* tatprobs->intprobs = intprobs; */
+ tatprobs->lowscore = lowscore;
+ tatprobs->highscore = highscore;
+
+ *tatarg = tatprobs;
+}
diff --git a/src/tatstats.h b/src/tatstats.h
new file mode 100644
index 0000000..9c5899a
--- /dev/null
+++ b/src/tatstats.h
@@ -0,0 +1,160 @@
+/* $Id: tatstats.h 1254 2014-01-29 16:03:40Z wrp $ */
+
+#ifndef MAXSQ
+#include "param.h"
+#endif
+
+#ifndef MAXSAV
+#ifdef FASTS
+#define MAXSAV 25
+#else
+#define MAXSAV 10
+#endif
+#endif
+
+#if defined(IBM_AIX) && defined(MAXSEG)
+#undef MAXSEG
+#endif
+#define MAXSEG 30
+
+struct savestr {
+ int score; /* pam score with segment optimization */
+ int score0; /* pam score of best single segment */
+ int start0; /* score from global match */
+ int dp; /* diagonal of match */
+ int start; /* start of match in lib seq */
+ int stop; /* end of match in lib seq */
+ int exact; /* exact match */
+#if defined(FASTF)
+ int *used; /* array of positions in aa0 that were used */
+#endif
+};
+
+struct dstruct { /* diagonal structure for saving current run */
+ int score; /* hash score of current match */
+ int start; /* start of current match */
+ int stop; /* end of current match */
+ struct savestr *dmax; /* location in vmax[] where best score data saved */
+};
+
+struct tat_str { /* score-probability table; filled in at the end of generate_tatprobs() */
+ double *probs; /* probability per score; entry i is reported as score i + lowscore */
+ int lowscore; /* score offset of probs[0] */
+ int highscore; /* NOTE(review): presumably the score of the last probs[] entry -- confirm in tatstats.c */
+};
+
+struct f_struct {
+ struct dstruct *diag;
+ struct savestr *vmax; /* best matches saved for one sequence */
+ struct savestr **vptr;
+ struct slink *sarr;
+ struct savestr *lowmax;
+ int maxsav; /* max number of peptide alignments saved in search */
+ int maxsav_w; /* max number of peptide alignments saved in alignment */
+ int shuff_cnt;
+ int nsave;
+ int ndo;
+ int noff;
+ int nm0; /* number of fragments */
+#if defined(FASTS) || defined(FASTM)
+ int *nmoff; /* offset number, start */
+ int *nm_u;
+ int *aa0b; /* beginning of each segment */
+ int *aa0e; /* end of each segment */
+ int *aa0i; /* index of each segment */
+ int *aa0s; /* max score of each segment */
+ int *aa0l; /* longest possible peptide match */
+#else
+ int nmoff; /* offset number, start */
+ unsigned char *aa0;
+ int aa0ix;
+#endif
+ unsigned char *aa0t; /* temp location for peptides */
+ int aa0t_off; /* offset between aa0 and aa0t for correct coordinates */
+ int *aa0ti; /* temp index for peptides */
+ int hmask; /* hash constants */
+ int *pamh1; /* pam based array */
+ int *pamh2; /* pam based kfact array */
+#if defined(FASTS) || defined(FASTM)
+ int *link, *harr, *l_end; /* hash arrays */
+#else
+ struct hlstr *link, *harr; /* hash arrays */
+#endif
+ int kshft; /* shift width */
+ int nsav, lowscor; /* number of saved runs, worst saved run */
+ unsigned char *aa1x; /* contains translated codons 111222333 */
+ unsigned char *aa1y; /* contains translated codons 123123123 */
+ int n10;
+ int *waa;
+ int *res;
+ int max_res;
+ double *priors;
+#if defined(FASTS) || defined(FASTM)
+ struct tat_str **tatprobs; /* array of pointers to tat structs */
+ double **intprobs; /* array of integrated tatprobs */
+#endif
+ int dotat;
+ double spacefactor;
+};
+
+struct slink {
+ int score;
+ double tatprob;
+ struct tat_str *tat;
+ struct tat_str *newtat;
+ struct savestr *vp;
+ struct slink *next;
+ struct slink *prev;
+};
+
+struct segstr {
+ double tatprob;
+ int length;
+};
+
+void generate_tatprobs(const unsigned char *query,
+ int begin,
+ int end,
+ double *priors,
+ int **pam2,
+ int nsq,
+ struct tat_str **tatarg, struct tat_str *oldtat);
+
+double
+calc_tatusov ( struct slink *last,
+ struct slink *this,
+ const unsigned char *aa0, int n0,
+ const unsigned char *aa1, int n1,
+ int **pam2, int nsq,
+ struct f_struct *f_str,
+ int pseudocts,
+ int do_opt,
+ int zsflag
+ );
+
+double seg_tatprob(struct slink *start,
+ const unsigned char *aa0,
+ int n0,
+ const unsigned char *aa1,
+ int n1,
+ struct f_struct *f_str,
+ struct pstruct *ppst,
+ int do_opt);
+
+void calc_priors(double *priors,
+ struct pstruct *ppst,
+ struct f_struct *f_str,
+ const unsigned char *aa1,
+ int n1, int pseudocts);
+
+double factorial (int a, int b);
+
+int max_score(int *scores, int nsq);
+
+int min_score(int *scores, int nsq);
+
+double calc_spacefactor(struct f_struct *f_str);
+
+void linreg(double *lnx, double *x, double *lny,
+ int n,
+ double *a, double *b, double *c, int start);
diff --git a/src/thr_buf_structs.h b/src/thr_buf_structs.h
new file mode 100644
index 0000000..ea5324e
--- /dev/null
+++ b/src/thr_buf_structs.h
@@ -0,0 +1,119 @@
+/* thr_buf_structs.h - structures for passing buffers of sequences to threads */
+
+/* $Id: thr_buf_structs.h 793 2011-07-03 00:03:55Z wrp $ */
+
+/* copyright (c) 2007, 2014 by William R. Pearson and The Rector &
+ Visitors of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+#include <sys/types.h>
+
+struct thr_str {
+ int worker;
+ void *status;
+ int max_work_buf;
+ int qframe;
+ struct pstruct *ppst;
+ const struct mngmsg *m_msp;
+ int qshuffle;
+ int nrshuffle;
+ unsigned char *aa0; /* query sequence */
+ int n0; /* query sequence length */
+ int nm0;
+ int max_tot;
+ char info_lib_range[MAX_SSTR];
+ void **f_str_ap;
+};
+
+
+/* this structure passes library sequences to the worker threads
+ and returns scores */
+
+struct buf2_hdr_s {
+ int buf2_cnt; /* number of buf2 records */
+ /* int buf2_stats_cnt; */ /* number of stats scores */
+ int buf2_type; /* search, shuffle, etc */
+ int stop_work;
+ int have_data;
+ int have_results;
+ int have_best_save;
+ int shuff_cnt;
+ int worker_idx;
+ struct seq_record *seq_b; /* pointer to contig. actual sequence data */
+ struct mseq_record *mseq_b; /* pointer to contig. seq. meta-info */
+ struct seqr_chain *my_chain; /* pointer to the seqr_chain providing data for this buffer */
+ int seqr_cnt; /* count of seq_records, which can be less than buf2_cnt */
+ int seq_record_continuous; /* contiguous seq_record/aa1b */
+ unsigned char *aa1b_start; /* pointer to contiguous aa1b buffer */
+ int aa1b_size; /* allocated size of aa1b buffer -- needed for PCOMPLIB */
+ int aa1b_used; /* used size of aa1b buffer -- needed for PCOMPLIB */
+ int my_id;
+ int my_worker;
+};
+
+
+/* this structure contains a single sequence record, with all the
+ information necessary to calculate a score */
+
+struct buf2_data_s {
+ struct seq_record *seq; /* pointer to sequence */
+ struct mseq_record *mseq; /* pointer to sequence meta data */
+ int frame; /* query frame used for returning results, indexes into aa0[], f_str[]
+ also used in best_stats.h bbp->frame */
+ int repeat_thresh; /* threshold for sub-alignment */
+ int stats_idx; /* where to save for statistics */
+ int seq_dup; /* duplicate entry for alternate frame */
+ struct beststr *best_save; /* if beststr is pointing to this record, where is it saved? */
+};
+
+struct buf2_res_s {
+ struct rstruct rst;
+ int is_valid_stat;
+ int qr_score;
+ struct rstruct r_rst;
+ double qr_escore;
+};
+
+/*
+struct buf2_stats_s {
+ int stats_idx;
+ int valid_stat;
+ int iscore;
+ int n1;
+ double escore;
+ double comp;
+ double H;
+};
+*/
+
+struct buf2_ares_s {
+ int have_ares;
+ struct a_res_str *a_res;
+ int best_idx;
+};
+
+#define BUF2_DOWORK 0x1
+#define BUF2_DOSHUF 0x2
+#define BUF2_DOOPT 0x4
+#define BUF2_DOALIGN 0x8
+
+struct buf_head {
+ struct buf2_hdr_s hdr; /* meta-information */
+ struct buf2_data_s *buf2_data; /* input (sequence) data */
+ struct buf2_res_s *buf2_res; /* score type results */
+ /* struct buf2_stats_s *buf2_stats; */ /* statistics values */
+ struct buf2_ares_s *buf2_ares; /* alignment results */
+ struct score_count_s s_cnt_info; /* statistics info */
+};
diff --git a/src/thr_bufs2.h b/src/thr_bufs2.h
new file mode 100644
index 0000000..d881ca2
--- /dev/null
+++ b/src/thr_bufs2.h
@@ -0,0 +1,41 @@
+/***************************************/
+/* thread global variable declarations */
+/***************************************/
+
+/* $Id: thr_bufs2.h 625 2011-03-23 17:21:38Z wrp $ */
+
+#ifndef MAX_WORKERS
+#define MAX_WORKERS 2
+#endif
+
+#ifndef XTERNAL
+struct buf_head **worker_buf; /* pointers to full buffers */
+struct buf_head **reader_buf; /* pointers to empty buffers */
+
+/* protected by worker_mutex/worker_cond_var */
+ /* indices into full-buffers ptrs */
+int worker_buf_workp; /* modified by get_wbuf() */
+int worker_buf_readp; /* modified by put_rbuf() */
+int num_worker_bufs;
+int reader_done;
+
+/* protected by reader_mutex/reader_cond_var */
+ /* indices into empty-buffers ptrs */
+int reader_buf_workp; /* modified by put_wbuf() */
+int reader_buf_readp; /* modified by get_rbuf(), main()-- after rbuf_wait */
+int num_reader_bufs;
+int reader_wait;
+
+/* protected by start_mutex/start_cond_var */
+int start_thread=1; /* start-up predicate, 0 starts */
+#else
+extern struct buf_head **worker_buf;
+extern struct buf_head **reader_buf;
+extern int num_worker_bufs, reader_done;
+extern int num_reader_bufs, reader_wait;
+extern int worker_buf_workp, worker_buf_readp;
+extern int reader_buf_workp, reader_buf_readp;
+
+extern int start_thread;
+#endif
+
diff --git a/src/uascii.h b/src/uascii.h
new file mode 100644
index 0000000..983a2dd
--- /dev/null
+++ b/src/uascii.h
@@ -0,0 +1,59 @@
+/* Concurrent read version */
+/* ascii.gbl ascii translation to amino acids */
+/* modified 10-Mar-1987 for B, Z */
+
+/* $Id: uascii.h 989 2012-07-24 19:37:38Z wrp $ */
+
+#define NA 123
+#define NANN 60 /* changed 24-July-2012 because NCBIstdaa_ext_n = 56 */
+#define ESS 59 /* code for ',' in FASTS,FASTF, FASTM */
+#define EL 125
+#define ES 126
+#define AAMASK 127
+
+#ifndef XTERNAL
+/* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 */
+/* 32 ! " # $ % & ' ( ) * + , - . / */
+/* 48 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
+/* 64 @ A B C D E F G H I J K L M N O */
+/* 80 P Q R S T U V W X Y Z [ \ ] ^ _ */
+/* 96 ` a b c d e f g h i j k l m n o */
+/*112 p q r s t u v w x y z { | } ~ ^? */
+
+int aascii[128]={
+ EL,NA,NA,NA,NA,NA,NA,NA,NA,NA,EL,NA,NA,EL,NA,NA, /* 15 */
+ NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA, /* 31 */
+ NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,24,NA,NA,NA,NA,NA, /* 47 */
+ NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA, /* 63 */
+ NA, 1,21, 5, 4, 7,14, 8, 9,10,25,12,11,13, 3,26, /* 79 */
+ 15, 6, 2,16,17,27,20,18,23,19,22,NA,NA,NA,NA,NA, /* 95 */
+ NA, 1,21, 5, 4, 7,14, 8, 9,10,25,12,11,13, 3,26, /*111 */
+ 15, 6, 2,16,17,27,20,18,23,19,22,NA,NA,NA,NA,NA}; /*127 */
+
+int nascii[128]={ /* lookup table indexed by 7-bit ASCII code; NA, EL and ES are #defined near the top of this header */
+/* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ @ A B C D E F G H I J K L M N O
+ P Q R S T U V W X Y Z */
+ EL,NA,NA,NA,NA,NA,NA,NA,NA,NA,EL,NA,NA,EL,NA,NA,
+ NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,
+ NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,ES,NA,NA,16,NA,NA,
+ NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,ES,NA,NA,ES,NA,
+ NA, 1,15, 2,12,NA,NA, 3,13,NA,NA,11,NA, 8,16,NA,
+ 6, 7, 6,10, 4, 5,14, 9,17, 7,NA,NA,NA,NA,NA,NA,
+ NA, 1,15, 2,12,NA,NA, 3,13,NA,NA,11,NA, 8,16,NA,
+ 6, 7, 6,10, 4, 5,14, 9,17, 7,NA,NA,NA,NA,NA,NA};
+
+int *pascii;
+int qascii[128];
+int lascii[128];
+int l_ann_ascii[128];
+#else
+#define AAMASK 127
+extern int aascii[128];
+extern int nascii[128];
+
+extern int *pascii;
+extern int qascii[128];
+extern int lascii[128];
+extern int l_ann_ascii[128];
+#endif
diff --git a/src/upam.h b/src/upam.h
new file mode 100644
index 0000000..e36d59b
--- /dev/null
+++ b/src/upam.h
@@ -0,0 +1,872 @@
+/* Concurrent read version */
+/* 20-June-1986 universal pam file */
+
+/* $Id: upam.h 1124 2013-03-13 20:24:57Z wrp $ */
+/* $Revision: 1124 $ */
+
+/* modified to accommodate both lower and upper case amino acid numbers;
+ as a result MAXSQ was raised to 50 (it is now 60, see the #define below)
+*/
+
+
+#ifndef UPAM_GBL_DEF
+#define UPAM_GBL_DEF
+
+#define TERM 25
+#define EOSEQ 0
+#define MAXSQ 60 /* increased to accommodate ESS (defined as 59 in uascii.h) */
+#define MAXUC 28 /* same value as NCBIstdaa_n */
+#define MAXLC 56 /* same value as NCBIstdaa_ext_n */
+
+#define MAXHASH 32
+#define NMAP MAXHASH+1 /* NOTE(review): expands without parentheses, so e.g. 2*NMAP is 2*MAXHASH+1; safe only as a stand-alone value */
+
+struct std_pam_str {
+ char abbrev[6]; /* argument name */
+ char name[10]; /* canonical name */
+ int *pam; /* data pointer */
+ float scale; /* pam scale (ln(2)/2, ln(2)/3 */
+ float ulambda; /* lambda (ungapped) */
+ float entropy; /* bits/position */
+ float tfract_id; /* target fract id */
+ int gdel, ggap; /* gdel, ggap */
+};
+
+#ifndef XTERNAL
+
+int pamoff=0;
+
+/*extern int gdelval, ggapval;*/
+
+/* char sqnam[]="aa"; */
+/* char sqtype[]="protein"; */
+
+
+/* this alphabet covers 56 values, so that libraries can include lower-case characters which will be re-mapped unless -S */
+char *NCBIstdaa = "-ABCDEFGHIKLMNPQRSTVWXYZU*OJ";
+char *NCBIstdaa_l = "-ABCDEFGHIKLMNPQRSTVWXYZU*OJ-ABCDEFGHIKLMNPQRSTVWXYZU*OJ";
+char NCBIstdaa_n = 28;
+
+char *NCBIstdaa_ext = "-ABCDEFGHIKLMNPQRSTVWXYZU*OJ-abcdefghiklmnpqrstvwxyzu*oj";
+char NCBIstdaa_ext_n = 56;
+
+/* the residue ordering used for the internal pam matrices */
+char *pam_sq;
+int pam_sq_n;
+char *apam_sq = "\0ARNDCQEGHILKMFPSTWYVBZX*";
+int apam_sq_n = 25;
+char *npam_sq = "\0ACGTURYMWSKDHVBNX";
+int npam_sq_n = 17;
+
+/* these values have been replaced by NCBIstdaa and *pam_sq */
+/*
+char aa[MAXSQ+1] = {"\0ARNDCQEGHILKMFPSTWYVBZX*JARNDCQEGHILKMFPSTWYVBZX*J\0"};
+char aax[MAXSQ+1] = {"\0ARNDCQEGHILKMFPSTWYVBZX*Jarndcqeghilkmfpstwyvbzx*j\0"};
+*/
+char pssm_aa[26] = {"\0ARNDCQEGHILKMFPSTWYVBZX*"};
+
+char othx[MAXSQ+1] = {"OUou\0"};
+int noth = 2;
+int nothx = 4;
+
+int naa = 25; /* this should be calculated from aa[] */
+int naax = 50;
+
+/* haa[] used to map all valid amino acid codes into a hash value;
+ now, there is an additional hash value - not-mapped - NM */
+
+/* this has been expanded to accommodate '*' */
+ /* 0 A R N D C Q E G H I L K M F P S T W Y V B Z X * J */
+/*
+int haa[MAXSQ+1] = {
+ NMAP,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,3,7,NMAP,NMAP,10,
+ 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,3,7,NMAP,NMAP,10};
+*/
+int h_NCBIstdaa[MAXSQ+1] = {
+ /* - A B C D E F G H I K L M N P Q R S T V W X Y Z U * O J */
+ NMAP,1,13,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,NMAP,21, 5, 3,NMAP,10,11,
+ NMAP,1,13,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,NMAP,21, 5, 3,NMAP,10,11
+};
+
+int h_NCBIstdaa_ext[MAXSQ+1] = {
+ /* - A B C D E F G H I K L M N P Q R S T V W X Y Z U * O J */
+ NMAP,1,13,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,NMAP,21, 5, 3,NMAP,10,11,
+ NMAP,NMAP,NMAP,NMAP,NMAP,NMAP,NMAP,NMAP,NMAP,NMAP,NMAP,NMAP,NMAP,NMAP,
+ NMAP,NMAP,NMAP,NMAP,NMAP,NMAP,NMAP,NMAP,NMAP,NMAP,NMAP,NMAP,NMAP,NMAP,
+};
+
+/*
+int haax[MAXSQ+1] = {
+ NMAP, 1,2,3,4,5,6,7,8,9,
+ 10,11,12,13,14,15,16,17,18,19,
+ 20,3,7,NMAP,NMAP,10,NMAP,NMAP,NMAP,NMAP,
+ NMAP,NMAP,NMAP,NMAP,NMAP,NMAP,NMAP,NMAP,NMAP,NMAP,
+ NMAP,NMAP,NMAP,NMAP,NMAP,NMAP,NMAP,NMAP,NMAP,NMAP,
+ NMAP};
+*/
+
+/*
+ PAM 250 substitution matrix, scale = ln(2)/3 = 0.231049
+ Expected score = -0.844, Entropy = 0.354 bits
+ Lowest score = -8, Highest score = 17
+*/
+int apam250[450] = {
+ 2,
+-2, 6,
+ 0, 0, 2,
+ 0,-1, 2, 4,
+-2,-4,-4,-5,12,
+ 0, 1, 1, 2,-5, 4,
+ 0,-1, 1, 3,-5, 2, 4,
+ 1,-3, 0, 1,-3,-1, 0, 5,
+-1, 2, 2, 1,-3, 3, 1,-2, 6,
+-1,-2,-2,-2,-2,-2,-2,-3,-2, 5,
+-2,-3,-3,-4,-6,-2,-3,-4,-2, 2, 6,
+-1, 3, 1, 0,-5, 1, 0,-2, 0,-2,-3, 5,
+-1, 0,-2,-3,-5,-1,-2,-3,-2, 2, 4, 0, 6,
+-4,-4,-4,-6,-4,-5,-5,-5,-2, 1, 2,-5, 0, 9,
+ 1, 0,-1,-1,-3, 0,-1,-1, 0,-2,-3,-1,-2,-5, 6,
+ 1, 0, 1, 0, 0,-1, 0, 1,-1,-1,-3, 0,-2,-3, 1, 2,
+ 1,-1, 0, 0,-2,-1, 0, 0,-1, 0,-2, 0,-1,-3, 0, 1, 3,
+-6, 2,-4,-7,-8,-5,-7,-7,-3,-5,-2,-3,-4, 0,-6,-2,-5,17,
+-3,-4,-2,-4, 0,-4,-4,-5, 0,-1,-1,-4,-2, 7,-5,-3,-3, 0,10,
+ 0,-2,-2,-2,-2,-2,-2,-1,-2, 4, 2,-2, 2,-1,-1,-1, 0,-6,-2, 4,
+ 0,-1, 2, 3,-4, 1, 2, 0, 1,-2,-3, 1,-2,-5,-1, 0, 0,-5,-3,-2, 2,
+ 0, 0, 1, 3,-5, 3, 3,-1, 2,-2,-3, 0,-2,-5, 0, 0,-1,-6,-4,-2, 2, 3,
+ 0,-1, 0,-1,-3,-1,-1,-1,-1,-1,-1,-1,-1,-2,-1, 0, 0,-4,-2,-1,-1,-1,-1,
+ 0,-1, 0,-1,-3,-1,-1,-1,-1,-1,-1,-1,-1,-2,-1, 0, 0,-4,-2,-1,-1,-1,-1, 8};
+
+/*
+ This matrix was produced by "pam" Version 1.0.6 [28-Jul-93]
+ PAM 120 substitution matrix, scale = ln(2)/2 = 0.346574
+ Expected score = -1.64, Entropy = 0.979 bits
+ Lowest score = -8, Highest score = 12
+*/
+int apam120[450] = {
+ 3,
+ -3, 6,
+ 0,-1, 4,
+ 0,-3, 2, 5,
+ -3,-4,-5,-7, 9,
+ -1, 1, 0, 1,-7, 6,
+ 0,-3, 1, 3,-7, 2, 5,
+ 1,-4, 0, 0,-5,-3,-1, 5,
+ -3, 1, 2, 0,-4, 3,-1,-4, 7,
+ -1,-2,-2,-3,-3,-3,-3,-4,-4, 6,
+ -3,-4,-4,-5,-7,-2,-4,-5,-3, 1, 5,
+ -2, 2, 1,-1,-7, 0,-1,-3,-2,-2,-4, 5,
+ -2,-1,-3,-4,-6,-1,-4,-4,-4, 1, 3, 0, 8,
+ -4,-4,-4,-7,-6,-6,-6,-5,-2, 0, 0,-6,-1, 8,
+ 1,-1,-2,-2,-3, 0,-1,-2,-1,-3,-3,-2,-3,-5, 6,
+ 1,-1, 1, 0,-1,-2,-1, 1,-2,-2,-4,-1,-2,-3, 1, 3,
+ 1,-2, 0,-1,-3,-2,-2,-1,-3, 0,-3,-1,-1,-4,-1, 2, 4,
+ -7, 1,-5,-8,-8,-6,-8,-8,-5,-7,-5,-5,-7,-1,-7,-2,-6, 12,
+ -4,-6,-2,-5,-1,-5,-4,-6,-1,-2,-3,-6,-4, 4,-6,-3,-3,-1, 8,
+ 0,-3,-3,-3,-2,-3,-3,-2,-3, 3, 1,-4, 1,-3,-2,-2, 0,-8,-3, 5,
+ 0,-2, 3, 4,-6, 0, 3, 0, 1,-3,-4, 0,-4,-5,-2, 0, 0,-6,-3,-3, 4,
+ -1,-1, 0, 3,-7, 4, 4,-2, 1,-3,-3,-1,-2,-6,-1,-1,-2,-7,-5,-3, 2, 4,
+ -1,-2,-1,-2,-4,-1,-1,-2,-2,-1,-2,-2,-2,-3,-2,-1,-1,-5,-3,-1,-1,-1,-2,
+ -1,-2,-1,-2,-4,-1,-1,-2,-2,-1,-2,-2,-2,-3,-2,-1,-1,-5,-3,-1,-1,-1,-2, 6};
+
+/*
+#
+# VTML_10
+#
+# This matrix was produced from: vtml_10qij.mat using robinson2.back background frequencies
+#
+# VTML_10 substitution matrix, Units = bits/2.0
+# Expected score = -3.859581 bits; Entropy = 3.462930 bits
+# Target fraction identity = 0.9107
+# Lowest Score = -20, Highest Score= 12
+#
+*/
+int a_vt10[450] = {
+ 7,
+ -9, 8,
+ -8, -7, 9,
+ -8, -16, -3, 8,
+ -5, -9, -10, -18, 11,
+ -7, -4, -6, -6, -17, 9,
+ -7, -14, -7, -3, -18, -3, 8,
+ -6, -9, -7, -8, -10, -10, -8, 7,
+ -9, -5, -4, -6, -9, -4, -8, -9, 11,
+ -9, -10, -11, -15, -8, -12, -12, -19, -11, 8,
+ -9, -10, -11, -19, -17, -8, -10, -13, -8, -3, 7,
+ -8, -2, -5, -7, -17, -4, -4, -9, -7, -10, -10, 8,
+ -7, -8, -9, -11, -7, -6, -10, -12, -15, -3, -2, -7, 11,
+ -10, -12, -13, -20, -17, -10, -18, -13, -6, -6, -5, -18, -4, 9,
+ -6, -8, -10, -8, -11, -7, -8, -10, -8, -12, -9, -7, -12, -10, 8,
+ -4, -7, -4, -7, -5, -6, -6, -6, -6, -11, -10, -7, -10, -8, -6, 7,
+ -5, -8, -5, -8, -8, -7, -7, -10, -7, -6, -9, -6, -6, -10, -8, -3, 8,
+ -11, -10, -12, -12, -20, -19, -20, -11, -8, -8, -8, -10, -16, -4, -11, -10, -19, 12,
+ -10, -9, -8, -17, -7, -16, -9, -13, -3, -9, -8, -10, -15, -1, -19, -8, -10, -4, 10,
+ -5, -11, -11, -11, -6, -9, -9, -12, -10, -1, -5, -9, -5, -8, -10, -10, -6, -17, -9, 8,
+ -8, -11, 3, 2, -14, -6, -5, -7, -5, -13, -15, -6, -10, -16, -9, -5, -6, -12, -12, -11, 8,
+ -7, -9, -6, -4, -17, 3, 2, -9, -6, -12, -9, -4, -8, -14, -7, -6, -7, -19, -12, -9, -4, 8,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
+};
+
+/*
+#
+# VTML_20
+#
+# This matrix was produced from: vtml_20qij.mat using robinson2.back background frequencies
+#
+# VTML_20 substitution matrix, Units = bits/2.0
+# Expected score = -2.889610 bits; Entropy = 2.921119 bits
+# Target fraction identity = 0.8312
+# Lowest Score = -16, Highest Score= 12
+#
+*/
+int a_vt20[450] = {
+ 7,
+ -7, 8,
+ -6, -5, 8,
+ -6, -12, -1, 8,
+ -3, -7, -8, -14, 11,
+ -5, -2, -4, -4, -13, 8,
+ -5, -10, -5, -1, -14, -1, 7,
+ -4, -7, -5, -6, -8, -8, -6, 7,
+ -7, -3, -3, -5, -7, -2, -6, -7, 10,
+ -7, -8, -9, -13, -6, -10, -10, -15, -9, 8,
+ -7, -8, -9, -15, -13, -7, -8, -11, -7, -1, 6,
+ -6, 0, -3, -5, -14, -2, -2, -7, -5, -8, -8, 8,
+ -5, -6, -7, -9, -5, -5, -8, -10, -12, -1, 0, -5, 10,
+ -8, -10, -10, -16, -13, -8, -14, -11, -5, -4, -3, -14, -2, 9,
+ -4, -6, -8, -6, -9, -5, -6, -8, -6, -10, -7, -6, -10, -8, 8,
+ -2, -6, -2, -5, -4, -4, -5, -4, -4, -9, -8, -5, -8, -6, -4, 7,
+ -3, -6, -4, -6, -6, -5, -6, -8, -5, -4, -7, -4, -4, -8, -6, -1, 8,
+ -9, -8, -10, -11, -16, -15, -16, -9, -6, -6, -6, -8, -12, -3, -9, -8, -15, 12,
+ -8, -7, -6, -14, -5, -12, -7, -10, -1, -7, -6, -8, -11, 1, -15, -6, -8, -2, 9,
+ -3, -9, -9, -9, -4, -7, -7, -10, -8, 1, -3, -8, -3, -6, -8, -8, -4, -13, -7, 7,
+ -6, -8, 3, 3, -11, -4, -3, -5, -4, -11, -12, -4, -8, -13, -7, -3, -5, -10, -10, -9, 8,
+ -5, -6, -4, -3, -13, 3, 3, -7, -4, -10, -7, -2, -6, -11, -5, -4, -5, -15, -9, -7, -2, 7,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
+};
+
+/*
+#
+# VTML_40
+#
+# This matrix was produced from: vtml_40qij.mat using robinson2.back background frequencies
+#
+# VTML_40 substitution matrix, Units = bits/2.0
+# Expected score = -1.963330 bits; Entropy = 2.266217 bits
+# Target fraction identity = 0.6968
+# Lowest Score = -13, Highest Score= 12
+#
+*/
+int a_vt40[450] = {
+ 6,
+ -5, 8,
+ -4, -3, 8,
+ -4, -8, 0, 7,
+ -1, -5, -6, -10, 11,
+ -3, 0, -2, -3, -10, 8,
+ -3, -7, -3, 1, -11, 0, 7,
+ -2, -5, -3, -4, -6, -6, -5, 7,
+ -5, -1, -1, -3, -5, 0, -4, -5, 10,
+ -5, -6, -7, -10, -4, -7, -8, -12, -7, 7,
+ -5, -6, -7, -11, -9, -5, -7, -9, -5, 1, 6,
+ -4, 2, -2, -3, -10, -1, -1, -5, -3, -6, -6, 7,
+ -4, -4, -5, -7, -3, -3, -6, -8, -8, 0, 1, -4, 9,
+ -6, -8, -8, -13, -10, -7, -11, -9, -3, -2, -1, -10, -1, 9,
+ -3, -5, -6, -4, -7, -4, -4, -6, -4, -8, -5, -4, -7, -7, 8,
+ 0, -4, -1, -3, -2, -3, -3, -3, -3, -6, -6, -3, -6, -5, -3, 6,
+ -1, -4, -2, -4, -4, -3, -4, -6, -3, -3, -5, -3, -2, -6, -4, 0, 7,
+ -7, -6, -8, -9, -12, -12, -12, -7, -4, -4, -4, -7, -9, -1, -7, -6, -11, 12,
+ -6, -5, -4, -10, -3, -9, -5, -8, 0, -5, -4, -6, -7, 2, -11, -5, -6, 0, 9,
+ -1, -7, -7, -7, -2, -5, -6, -8, -6, 3, -1, -6, -1, -4, -6, -6, -2, -10, -5, 7,
+ -4, -5, 4, 3, -8, -2, -1, -3, -2, -8, -9, -2, -6, -10, -5, -2, -3, -8, -7, -7, 7,
+ -3, -4, -2, -1, -10, 4, 3, -5, -2, -7, -6, -1, -4, -9, -4, -3, -3, -12, -7, -5, 0, 7,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
+};
+
+/*
+#
+# VTML_80
+#
+# This matrix was produced from: vtml_80qij.mat using robinson2.back background frequencies
+#
+# VTML_80 substitution matrix, Units = bits/2.0
+# Expected score = -1.137019 bits; Entropy = 1.390771 bits
+# Target fraction identity = 0.5024
+# Lowest Score = -9, Highest Score= 12
+#
+*/
+int a_vt80[450] = {
+ 5,
+ -3, 7,
+ -2, -1, 7,
+ -2, -5, 1, 7,
+ 0, -4, -4, -7, 10,
+ -2, 1, -1, -1, -6, 6,
+ -2, -3, -1, 2, -7, 1, 6,
+ -1, -3, -1, -2, -4, -4, -3, 6,
+ -3, 0, 0, -1, -3, 0, -2, -4, 9,
+ -3, -5, -5, -7, -2, -5, -5, -8, -4, 6,
+ -3, -4, -5, -8, -6, -3, -5, -7, -3, 2, 5,
+ -2, 3, -1, -2, -6, 0, 0, -3, -1, -4, -4, 6,
+ -2, -3, -4, -5, -2, -2, -4, -6, -5, 2, 2, -2, 8,
+ -4, -5, -6, -9, -6, -5, -7, -7, -1, -1, 0, -7, 0, 8,
+ -1, -3, -4, -3, -5, -2, -2, -4, -3, -5, -4, -2, -5, -5, 7,
+ 1, -2, 0, -1, -1, -2, -1, -1, -1, -4, -4, -1, -4, -3, -1, 5,
+ 0, -2, -1, -2, -2, -2, -2, -4, -2, -1, -3, -1, -1, -4, -3, 1, 6,
+ -5, -4, -6, -7, -9, -8, -8, -5, -2, -2, -2, -5, -5, 1, -6, -5, -8, 12,
+ -4, -3, -3, -7, -2, -6, -4, -6, 2, -3, -2, -4, -4, 3, -8, -3, -4, 1, 8,
+ 0, -4, -5, -5, -1, -4, -4, -6, -4, 3, 0, -4, 0, -2, -4, -4, -1, -6, -4, 6,
+ -2, -3, 4, 4, -5, -1, 0, -1, 0, -6, -6, -1, -4, -7, -3, 0, -1, -6, -5, -5, 7,
+ -2, -1, -1, 0, -6, 4, 3, -3, -1, -5, -4, 0, -3, -6, -2, -1, -2, -8, -5, -4, 0, 6,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
+};
+
+/*
+#
+# VTML_120
+#
+# This matrix was produced from: vtml_120qij.mat using robinson2.back background frequencies
+#
+# VTML_120 substitution matrix, Units = bits/2.0
+# Expected score = -0.723803 bits; Entropy = 0.938201 bits
+# Target fraction identity = 0.3748
+# Lowest Score = -7, Highest Score= 11
+#
+*/
+int a_vt120[450] = {
+ 4,
+ -2, 6,
+ -1, -1, 6,
+ -1, -3, 2, 6,
+ 0, -3, -3, -5, 9,
+ -1, 1, 0, 0, -5, 5,
+ -1, -2, 0, 2, -5, 1, 5,
+ 0, -2, -1, -1, -3, -3, -2, 6,
+ -2, 0, 1, -1, -3, 1, -1, -3, 8,
+ -2, -3, -4, -5, -1, -4, -4, -6, -3, 5,
+ -2, -3, -4, -6, -4, -3, -4, -5, -2, 2, 5,
+ -1, 3, 0, -1, -5, 1, 1, -2, 0, -3, -3, 5,
+ -1, -2, -3, -4, -1, -2, -3, -5, -3, 2, 2, -2, 7,
+ -3, -4, -4, -7, -5, -4, -5, -5, 0, 0, 1, -5, 1, 7,
+ -1, -2, -2, -2, -4, -1, -2, -3, -2, -4, -3, -1, -4, -4, 7,
+ 1, -1, 1, -1, 0, -1, -1, -1, -1, -3, -3, -1, -2, -2, -1, 4,
+ 0, -2, 0, -1, -1, -1, -1, -2, -1, 0, -2, -1, -1, -3, -2, 1, 5,
+ -4, -3, -5, -6, -7, -6, -6, -4, -1, -2, -2, -4, -4, 2, -5, -4, -6, 11,
+ -3, -2, -2, -5, -1, -4, -3, -5, 2, -2, -1, -3, -3, 4, -6, -2, -3, 2, 8,
+ 0, -3, -4, -4, 0, -3, -3, -4, -3, 3, 1, -3, 1, -1, -3, -3, 0, -4, -2, 5,
+ -1, -2, 4, 4, -4, 0, 1, -1, 0, -4, -5, 0, -3, -5, -2, 0, 0, -5, -3, -4, 6,
+ -1, 0, 0, 0, -5, 3, 3, -2, 0, -4, -3, 1, -2, -4, -1, -1, -1, -6, -3, -3, 0, 5,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
+};
+
+/*
+# VTML160
+#
+# This matrix was produced with scripts written by
+# Tobias Mueller and Sven Rahmann [June-2001].
+#
+# VTML160 substitution matrix, Units = Third-Bits
+# Expected Score = -1.297840 Third-Bits
+# Lowest Score = -7, Highest Score = 16
+#
+# Entropy H = 0.562489 Bits
+#
+# 30-Jun-2001
+*/
+int a_vt160[450] = {
+ 5,
+ -2, 7,
+ -1, 0, 7,
+ -1,-3, 3, 7,
+ 1,-3,-3,-5,13,
+ -1, 2, 0, 1,-4, 6,
+ -1,-1, 0, 3,-5, 2, 6,
+ 0,-3, 0,-1,-2,-3,-2, 8,
+ -2, 1, 1, 0,-2, 2,-1,-3, 9,
+ -1,-4,-4,-6,-1,-4,-5,-7,-4, 6,
+ -2,-3,-4,-6,-4,-2,-4,-6,-3, 3, 6,
+ -1, 4, 0, 0,-4, 2, 1,-2, 0,-4,-3, 5,
+ -1,-2,-3,-5,-1,-1,-3,-5,-3, 2, 4,-2, 8,
+ -3,-5,-5,-7,-4,-4,-6,-6, 0, 0, 2,-5, 1, 9,
+ 0,-2,-2,-1,-3,-1,-1,-3,-2,-4,-3,-1,-4,-5, 9,
+ 1,-1, 1, 0, 1, 0, 0, 0,-1,-3,-3,-1,-3,-3, 0, 4,
+ 1,-1, 0,-1, 0,-1,-1,-2,-1,-1,-2,-1,-1,-3,-1, 2, 5,
+ -5,-4,-5,-7,-7,-6,-7,-5,-1,-2,-1,-5,-4, 3,-5,-4,-6,16,
+ -3,-3,-2,-5,-1,-4,-3,-5, 3,-2,-1,-3,-2, 6,-6,-2,-3, 4,10,
+ 0,-4,-4,-4, 1,-3,-3,-5,-3, 4, 2,-3, 1,-1,-3,-2, 0,-5,-3, 5,
+ -1,-2, 5, 6,-4, 0, 2,-1, 0,-5,-5, 0,-4,-6,-2, 1, 0,-6,-3,-4, 5,
+ -1, 0, 0, 3,-5, 4, 5,-2, 0,-4,-3, 2,-3,-5,-1, 0,-1,-7,-4,-3, 2, 5,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ -7,-7,-7,-7,-7,-7,-7,-7,-7,-7,-7,-7,-7,-7,-7,-7,-7,-7,-7,-7,-7,-7,-7, 6};
+
+/*
+#
+# VTML_200
+#
+# This matrix was produced from: vtml_200qij.mat using vtml_P.mat background frequencies
+#
+# VTML_200 substitution matrix, Units = bits/3.0
+# Expected score = -0.358430 bits; Entropy = 0.412084 bits
+# Target fraction identity = 0.2295
+# Lowest Score = -6, Highest Score= 15
+#
+*/
+int a_vt200[450] = {
+ 4,
+ -2, 7,
+ -1, 0, 6,
+ -1, -2, 3, 6,
+ 1, -3, -2, -4, 12,
+ -1, 2, 1, 1, -3, 5,
+ -1, -1, 1, 3, -4, 2, 5,
+ 0, -2, 0, -1, -2, -2, -1, 8,
+ -2, 1, 1, 0, -2, 2, 0, -2, 8,
+ -1, -3, -4, -5, 0, -3, -4, -6, -3, 5,
+ -2, -3, -4, -5, -3, -2, -4, -5, -2, 3, 5,
+ -1, 4, 1, 0, -4, 2, 1, -2, 0, -3, -3, 5,
+ -1, -2, -3, -4, -1, -1, -3, -4, -3, 2, 3, -2, 6,
+ -3, -4, -4, -6, -3, -3, -5, -5, 0, 0, 2, -5, 1, 8,
+ 0, -1, -2, -1, -3, -1, -1, -2, -2, -4, -3, -1, -3, -4, 9,
+ 1, -1, 1, 0, 1, 0, 0, 0, 0, -3, -3, 0, -2, -3, 0, 4,
+ 1, -1, 0, -1, 0, 0, -1, -2, -1, -1, -2, 0, -1, -3, -1, 2, 4,
+ -4, -3, -5, -6, -6, -6, -6, -5, -1, -2, -1, -4, -3, 3, -4, -4, -5, 15,
+ -3, -2, -2, -4, 0, -3, -3, -5, 3, -2, -1, -3, -2, 5, -5, -2, -3, 4, 9,
+ 0, -3, -3, -4, 1, -2, -3, -4, -3, 4, 2, -3, 2, -1, -3, -2, 0, -4, -2, 4,
+ -1, -1, 4, 4, -3, 1, 2, 0, 0, -4, -4, 0, -3, -5, -1, 0, 0, -5, -3, -3, 6,
+ -1, 0, 1, 2, -3, 3, 3, -1, 1, -3, -3, 1, -2, -4, -1, 0, 0, -6, -3, -2, 2, 5,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
+};
+
+/*
+ Matrix made by matblas from blosum50.iij
+ BLOSUM Clustered Scoring Matrix in 1/3 Bit Units
+ Blocks Database = /data/blocks_5.0/blocks.dat
+ Cluster Percentage: >= 50
+ Entropy = 0.4808, Expected = -0.3573
+*/
+/*
+ A R N D C Q E G H I L K M F P S T W Y V B Z X * */
+int abl50[450] = {
+ 5,
+ -2, 7,
+ -1,-1, 7,
+ -2,-2, 2, 8,
+ -1,-4,-2,-4,13,
+ -1, 1, 0, 0,-3, 7,
+ -1, 0, 0, 2,-3, 2, 6,
+ 0,-3, 0,-1,-3,-2,-3, 8,
+ -2, 0, 1,-1,-3, 1, 0,-2,10,
+ -1,-4,-3,-4,-2,-3,-4,-4,-4, 5,
+ -2,-3,-4,-4,-2,-2,-3,-4,-3, 2, 5,
+ -1, 3, 0,-1,-3, 2, 1,-2, 0,-3,-3, 6,
+ -1,-2,-2,-4,-2, 0,-2,-3,-1, 2, 3,-2, 7,
+ -3,-3,-4,-5,-2,-4,-3,-4,-1, 0, 1,-4, 0, 8,
+ -1,-3,-2,-1,-4,-1,-1,-2,-2,-3,-4,-1,-3,-4,10,
+ 1,-1, 1, 0,-1, 0,-1, 0,-1,-3,-3, 0,-2,-3,-1, 5,
+ 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 2, 5,
+ -3,-3,-4,-5,-5,-1,-3,-3,-3,-3,-2,-3,-1, 1,-4,-4,-3,15,
+ -2,-1,-2,-3,-3,-1,-2,-3, 2,-1,-1,-2, 0, 4,-3,-2,-2, 2, 8,
+ 0,-3,-3,-4,-1,-3,-3,-4,-4, 4, 1,-3, 1,-1,-3,-2, 0,-3,-1, 5,
+ -2,-1, 4, 5,-3, 0, 1,-1, 0,-4,-4, 0,-3,-4,-2, 0, 0,-5,-3,-4, 5,
+ -1, 0, 0, 1,-3, 4, 5,-2, 0,-3,-3, 1,-1,-4,-1, 0,-1,-2,-2,-3, 2, 5,
+ -1,-1,-1,-1,-2,-1,-1,-2,-1,-1,-1,-1,-1,-2,-2,-1, 0,-3,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-2,-1,-1,-2,-1,-1,-1,-1,-1,-2,-2,-1, 0,-3,-1,-1,-1,-1,-1, 7};
+
+/*
+ A R N D C Q E G H I L K M F P S T W Y V B Z X * */
+int a_md10[450]= {
+ 11, /* A */
+-12, 12, /* R */
+-12,-13, 13, /* N */
+-11,-18, -3, 12, /* D */
+-13,-10,-14,-20, 17, /* C */
+-13, -5,-11,-13,-19, 13, /* Q */
+-10,-15,-12, -2,-22, -5, 12, /* E */
+ -8, -9,-11, -9,-12,-16, -9, 11, /* G */
+-16, -5, -5,-10,-12, -3,-15,-16, 16, /* H */
+-13,-17,-14,-19,-17,-20,-19,-21,-18, 12, /* I */
+-15,-14,-19,-21,-16,-12,-20,-21,-13, -7, 10, /* L */
+-14, -2, -6,-15,-21, -6, -8,-15,-13,-17,-18, 12, /* K */
+-13,-14,-15,-18,-15,-14,-18,-19,-15, -4, -4,-12, 16, /* M */
+-18,-22,-19,-22,-11,-22,-23,-22,-14,-11, -6,-23,-14, 14, /* F */
+ -7,-12,-17,-18,-18, -8,-17,-16,-10,-19,-10,-16,-17,-17, 13, /* P */
+ -5,-10, -4,-12, -7,-13,-15, -7,-11,-14,-13,-13,-15,-11, -6, 11, /* S */
+ -4,-12, -7,-14,-14,-13,-15,-14,-13, -7,-16,-10, -7,-19, -9, -4, 12, /* T */
+-21, -9,-21,-21,-10,-17,-21,-13,-21,-21,-13,-21,-17,-13,-21,-15,-18, 18, /* W */
+-20,-17,-12,-13, -7,-16,-21,-20, -3,-15,-16,-20,-17, -3,-20,-12,-17,-12, 15, /* Y */
+ -6,-17,-17,-15,-12,-17,-14,-13,-19, -1, -8,-18, -5,-12,-16,-14,-10,-16,-18, 11, /* V */
+-12,-15, 5, 5,-17,-12, -7,-10, -7,-16,-20,-11,-17,-21,-17, -8,-10,-22,-13,-16, 13, /* B */
+-16,-18,-17, -8,-32, 1, 9,-17,-17,-29,-26,-11,-24,-34,-21,-21,-21,-29,-29,-22, -9, 13, /* Z */
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* X */
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 9}; /* * */
+
+int a_md20[450] = {
+ 10,
+-10, 12,
+ -9,-10, 13,
+ -8,-14, -1, 12,
+-10, -7,-11,-16, 17,
+-10, -3, -8, -9,-16, 13,
+ -7,-11, -9, 1,-19, -3, 11,
+ -5, -6, -8, -6, -9,-12, -7, 11,
+-12, -3, -2, -7, -9, 0,-12,-13, 15,
+-10,-14,-11,-16,-14,-16,-16,-17,-14, 12,
+-12,-11,-15,-18,-13, -9,-17,-18,-10, -4, 10,
+-11, 0, -4,-12,-17, -3, -5,-12, -9,-14,-15, 12,
+ -9,-11,-12,-15,-12,-11,-15,-16,-12, -1, -2, -9, 15,
+-15,-19,-16,-19, -8,-18,-20,-19,-11, -8, -4,-19,-10, 13,
+ -5, -9,-13,-15,-14, -5,-14,-12, -7,-15, -7,-13,-14,-14, 12,
+ -2, -8, -1, -9, -4,-10,-12, -5, -8,-11,-10,-10,-12, -8, -3, 10,
+ -1, -9, -4,-11,-10,-10,-12,-11,-10, -4,-12, -7, -4,-15, -7, -1, 11,
+-17, -6,-18,-18, -7,-14,-18,-10,-17,-17,-10,-17,-14,-10,-18,-12,-15, 18,
+-16,-14, -9,-11, -4,-12,-18,-17, 0,-12,-12,-17,-14, 0,-16, -9,-13, -9, 14,
+ -3,-14,-14,-12, -9,-14,-11,-11,-15, 2, -5,-15, -2, -9,-13,-11, -7,-13,-14, 11,
+ -9,-12, 6, 6,-14, -9, -4, -7, -4,-13,-17, -8,-13,-18,-14, -5, -7,-19,-10,-13, 12,
+-12,-13,-13, -4,-27, 4, 10,-13,-12,-24,-21, -6,-20,-29,-17,-17,-17,-24,-24,-18, -6, 12,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 9 };
+
+/* a_md40: lower-triangular scoring table stored row by row: 24 rows, row k
+ holding k scores (300 explicit values; the remaining slots of the 450-entry
+ array are zero-initialized). Registered in std_pams[] under the name "MD40".
+ NOTE(review): the residue order of the rows is defined outside this part of
+ the header -- confirm it matches the other a*[] tables. */
+int a_md40[450] = {
+ 9,
+ -7, 11,
+ -6, -6, 12,
+ -6,-10, 1, 11,
+ -7, -5, -8,-13, 16,
+ -7, 0, -5, -6,-12, 12,
+ -5, -8, -5, 3,-15, 0, 11,
+ -3, -4, -5, -4, -7, -9, -4, 10,
+ -9, 0, 0, -4, -6, 2, -8,-10, 14,
+ -6,-10, -8,-12,-11,-12,-12,-13,-11, 11,
+ -9, -9,-12,-14,-10, -6,-13,-14, -7, -1, 9,
+ -8, 3, -1, -8,-12, -1, -3, -9, -6,-11,-12, 11,
+ -6, -8, -9,-12, -9, -8,-11,-12, -9, 1, 1, -7, 14,
+-11,-15,-12,-15, -5,-14,-16,-15, -7, -5, -1,-16, -7, 13,
+ -2, -6, -9,-11,-11, -3,-11, -9, -4,-11, -5,-10,-10,-11, 12,
+ 0, -5, 1, -6, -2, -7, -8, -2, -6, -8, -7, -7, -8, -6, -1, 9,
+ 1, -6, -2, -8, -7, -7, -8, -7, -7, -2, -9, -5, -2,-11, -4, 1, 10,
+-14, -4,-14,-15, -4,-11,-15, -7,-13,-13, -8,-13,-11, -7,-14, -9,-12, 18,
+-13,-10, -6, -8, -2, -9,-14,-13, 2, -9, -9,-13,-11, 2,-13, -7,-10, -6, 14,
+ -1,-11,-10, -9, -7,-11, -8, -8,-12, 4, -2,-12, 0, -6, -9, -7, -4,-10,-11, 10,
+ -6, -8, 6, 6,-10, -6, -1, -4, -2,-10,-13, -5,-10,-14,-10, -3, -5,-15, -7,-10, 11,
+ -8, -8, -8, 0,-21, 6, 10, -8, -7,-18,-16, -3,-15,-23,-12,-12,-12,-19,-18,-14, -3, 11,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 9};
+
+/*
+ Matrix made by matblas from blosum62.iij
+ * column uses minimum score
+ BLOSUM Clustered Scoring Matrix in 1/2 Bit Units
+ Blocks Database = /data/blocks_5.0/blocks.dat
+ Cluster Percentage: >= 62
+ Entropy = 0.6979, Expected = -0.5209
+*/
+
+/* abl62: lower triangle of the matrix, one row per line: 24 rows, row k
+ holding k scores (300 explicit values; the rest of the 450 slots are
+ zero-initialized). Registered in std_pams[] as "BL62" (abbreviations
+ "BL62" and "BP62") with BIT2_SCALE. */
+int abl62[450] = {
+ 4,
+ -1, 5,
+ -2, 0, 6,
+ -2,-2, 1, 6,
+ 0,-3,-3,-3, 9,
+ -1, 1, 0, 0,-3, 5,
+ -1, 0, 0, 2,-4, 2, 5,
+ 0,-2, 0,-1,-3,-2,-2, 6,
+ -2, 0, 1,-1,-3, 0, 0,-2, 8,
+ -1,-3,-3,-3,-1,-3,-3,-4,-3, 4,
+ -1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,
+ -1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,
+ -1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5,
+ -2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,
+ -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,
+ 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4,
+ 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,
+ -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11,
+ -2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,
+ 0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,
+ -2,-1, 3, 4,-3, 0, 1,-1, 0,-3,-4, 0,-3,-3,-2, 0,-1,-4,-3,-3, 4,
+ -1, 0, 0, 1,-3, 3, 4,-2, 0,-3,-3, 1,-1,-3,-1, 0,-1,-3,-2,-2, 1, 4,
+ 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-1,-1,-1,
+ 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-1,-1,-1, 6};
+
+/* blosum80 in 1/2 bit units (previous versions had 1/3 bit units) */
+/*
+ Matrix made by matblas from blosum80.iij
+ * column uses minimum score
+ BLOSUM Clustered Scoring Matrix in 1/2 Bit Units
+ Blocks Database = /data/blocks_5.0/blocks.dat
+ Cluster Percentage: >= 80
+ Entropy = 0.9868, Expected = -0.7442
+*/
+
+/* abl80: lower triangle of the matrix, one row per line: 24 rows, row k
+ holding k scores (300 explicit values; the rest of the 450 slots are
+ zero-initialized). Registered in std_pams[] as "BL80" with BIT2_SCALE. */
+int abl80[450] = {
+ 5,
+ -2, 6,
+ -2,-1, 6,
+ -2,-2, 1, 6,
+ -1,-4,-3,-4, 9,
+ -1, 1, 0,-1,-4, 6,
+ -1,-1,-1, 1,-5, 2, 6,
+ 0,-3,-1,-2,-4,-2,-3, 6,
+ -2, 0, 0,-2,-4, 1, 0,-3, 8,
+ -2,-3,-4,-4,-2,-3,-4,-5,-4, 5,
+ -2,-3,-4,-5,-2,-3,-4,-4,-3, 1, 4,
+ -1, 2, 0,-1,-4, 1, 1,-2,-1,-3,-3, 5,
+ -1,-2,-3,-4,-2, 0,-2,-4,-2, 1, 2,-2, 6,
+ -3,-4,-4,-4,-3,-4,-4,-4,-2,-1, 0,-4, 0, 6,
+ -1,-2,-3,-2,-4,-2,-2,-3,-3,-4,-3,-1,-3,-4, 8,
+ 1,-1, 0,-1,-2, 0, 0,-1,-1,-3,-3,-1,-2,-3,-1, 5,
+ 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-2,-1,-1,-2,-2, 1, 5,
+ -3,-4,-4,-6,-3,-3,-4,-4,-3,-3,-2,-4,-2, 0,-5,-4,-4,11,
+ -2,-3,-3,-4,-3,-2,-3,-4, 2,-2,-2,-3,-2, 3,-4,-2,-2, 2, 7,
+ 0,-3,-4,-4,-1,-3,-3,-4,-4, 3, 1,-3, 1,-1,-3,-2, 0,-3,-2, 4,
+ -2,-2, 4, 4,-4, 0, 1,-1,-1,-4,-4,-1,-3,-4,-2, 0,-1,-5,-3,-4, 4,
+ -1, 0, 0, 1,-4, 3, 4,-3, 0,-4,-3, 1,-2,-4,-2, 0,-1,-4,-3,-3, 0, 4,
+ -1,-1,-1,-2,-3,-1,-1,-2,-2,-2,-2,-1,-1,-2,-2,-1,-1,-3,-2,-1,-2,-1,-1,
+ -1,-1,-1,-2,-3,-1,-1,-2,-2,-2,-2,-1,-1,-2,-2,-1,-1,-3,-2,-1,-2,-1,-1, 6};
+
+/* OPTIMA_5 matrix: Kann et al. (2000) Proteins 41:498-503 */
+
+/* aopt5: lower triangle of the matrix, one row per line: 24 rows, row k
+ holding k scores (300 explicit values; the rest of the 450 slots are
+ zero-initialized). Registered in std_pams[] as "OPTIMA5" (abbreviation
+ "OPT5") with BIT5_SCALE. */
+int aopt5[450] = {
+ 7,
+ -2,11,
+ -4, 1,12,
+ -4,-4, 4,13,
+ 1,-6,-6,-6,20,
+ -1, 2, 0, 0,-6, 9,
+ -2, 0, 1, 4,-8, 4, 8,
+ 1,-4, 1,-2,-6,-4,-5,13,
+ -4, 1, 2,-2,-6, 0, 0,-4,17,
+ -1,-6,-6,-7,-1,-6,-7,-8,-6, 7,
+ -1,-4,-7,-9,-1,-5,-6,-8,-5, 6, 6,
+ -2, 6, 0,-1,-6, 3, 3,-4,-1,-6,-4, 7,
+ -2,-2,-4,-6,-2, 0,-4,-6,-4, 2, 5,-2,10,
+ -4,-6,-6,-7,-4,-6,-6,-6,-2, 1, 3,-6, 0,11,
+ -1,-4,-4,-1,-6,-2,-1,-4,-4,-6,-7,-2,-4,-8,15,
+ 2,-2, 2, 1,-2, 0, 0, 0,-2,-4,-4, 1,-2,-4,-2, 7,
+ 0,-2, 0,-2,-2,-2,-1,-3,-4,-2,-3,-2,-1,-4,-2, 4, 9,
+ -6,-6,-8,-8,-4,-4,-6,-4,-4,-6,-3,-6,-2, 3,-8,-6,-4,22,
+ -4,-3,-4,-4,-4,-2,-4,-6, 4,-2, 0,-4,-2, 7,-6,-4,-4, 4,14,
+ 1,-7,-6,-6,-1,-4,-6,-6,-6, 7, 3,-5, 2, 0,-4,-5, 1,-6,-2, 8,
+ -4,-1, 8,10,-6, 0, 3, 0, 1,-7,-8, 0,-5,-6,-2, 1,-1,-8,-4,-6, 9,
+ -1, 1, 0, 3,-7, 6, 6,-4, 0,-6,-6, 3,-2,-6,-2, 0,-1,-5,-3,-5, 2, 6,
+ -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
+ -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,10};
+
+/* DNA alphabet
+
+ A, C, G, T, U 1-4, 5
+ R, Y 6, 7
+ M (A or C) 8
+ W (A or T) 9
+ S (C or G) 10
+ K (G or T) 11
+ D (not C) 12
+ H (not G) 13
+ V (not T) 14
+ B (not A) 15
+ N 16
+ X 17
+*/
+
+/* Code-to-letter tables for the nucleotide alphabet; slot 0 is a '\0'
+ placeholder, codes 1-17 are A..X and codes 18-34 repeat the same 17 symbols.
+ nt: both halves print as upper-case letters
+ ntx: the second half (codes 18-34) prints as lower-case letters
+ ntc: complement letter for each code (A<->T, C<->G, U->A, R<->Y, M<->K,
+ D<->H, V<->B; W, S, N, X map to themselves), lower-case in the
+ second half */
+char nt[MAXSQ+1] ={"\0ACGTURYMWSKDHVBNXACGTURYMWSKDHVBNX\0"};
+char ntx[MAXSQ+1]={"\0ACGTURYMWSKDHVBNXacgturymwskdhvbnx\0"};
+char ntc[MAXSQ+1]={"\0TGCAAYRKWSMHDBVNXtgcaayrkwsmhdbvnx\0"};
+
+/* nt complement to encoding */
+ /* A:T C:G G:C T:A U:A */
+/* gc_nt[c] is the code of the complement of nucleotide code c (codes 1-17 as
+ listed in the DNA alphabet comment above); slot 0 is unused. */
+int gc_nt[MAXSQ+1]={ 0, 4, 3, 2, 1, 1,
+ /* R:Y Y:R M:K W:W */
+ 7, 6, 11, 9,
+ /* S:S K:M D:H H:D */
+ 10, 8, 13, 12,
+ /* V:B B:V N:N X:N -- columns follow the alphabet order (V=14, B=15).
+ NOTE(review): the X slot holds 16 (the code of N) rather than 17;
+ confirm that mapping X to N on complement is intended. */
+ 15, 14, 16, 16};
+
+/* nnt: number of nucleotide codes (A..X); nntx: the same alphabet counted
+ twice (upper-case codes 1-17 plus the second copy, codes 18-34). */
+int nnt = 17;
+int nntx = 34;
+
+/* hnt[c] / hntx[c]: small-integer value for nucleotide code c:
+ A=0, C=1, G=2, T=U=3; each ambiguity code R..B is given the value of one
+ of its bases; slot 0, N and X are NMAP.
+ hnt repeats the same values for the second copy of the alphabet
+ (codes 18-34), while hntx maps every code above 16 to NMAP.
+ NOTE(review): presumably these feed the k-mer lookup; confirm with callers. */
+int hnt[MAXSQ+1] = {
+ NMAP,0,1,2,3,3,0,1,0,0,1,2,0,0,0,1,NMAP,
+ NMAP,0,1,2,3,3,0,1,0,0,1,2,0,0,0,1,NMAP,NMAP};
+int hntx[MAXSQ+1] = {
+ NMAP,0,1,2,3,3,0,1,0,0,1,2,0,0,0,1,NMAP,
+ NMAP,NMAP,NMAP,NMAP,NMAP,NMAP,NMAP,NMAP,NMAP,
+ NMAP,NMAP,NMAP,NMAP,NMAP,NMAP,NMAP,NMAP,NMAP};
+
+/* npam: nucleotide scoring table, lower triangle stored row by row in the
+ order of the DNA alphabet (A C G T U R Y M W S K D H V B N X): 17 rows,
+ row k holding k scores (153 explicit values; the rest of the 450 slots are
+ zero-initialized). Identical unambiguous bases score +5, different ones
+ -4, T and U are treated as identical, and every N or X pairing scores -1. */
+int npam[450] = {
+/* A C G T U R Y M W S K D H V B N X */
+ 5, /* A */
+ -4, 5, /* C */
+ -4,-4, 5, /* G */
+ -4,-4,-4, 5, /* T */
+ -4,-4,-4, 5, 5, /* U */
+ 2,-1, 2,-1,-1, 2, /* R (A G)*/
+ -1, 2,-1, 2, 2,-2, 2, /* Y (C T)*/
+ 2, 2,-1,-1,-1,-1,-1, 2, /* M (A C)*/
+ 2,-1,-1, 2, 2, 1, 1, 1, 2, /* W (A T)*/
+ -1, 2, 2,-1,-1, 1, 1, 1,-1, 2, /* S (C G)*/
+ -1,-1, 2, 2, 2, 1, 1,-1, 1, 1, 2, /* K (G T)*/
+ 1,-2, 1, 1, 1, 1,-1,-1, 1,-1, 1, 1, /* D (!C) */
+ 1, 1,-2, 1, 1,-1, 1, 1, 1,-1,-1,-1, 1, /* H (!G) */
+ 1, 1, 1,-2,-2, 1,-1, 1,-1, 1,-1,-1,-1, 1, /* V (!T) */
+ -2, 1, 1, 1, 1,-1, 1,-1,-1, 1, 1,-1,-1,-1, 1, /* B (!A) */
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, /* N */
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1}; /* X */
+/* A C G T U R Y M W S K D H V B N */
+
+/* pam is only declared here; it is not pointed at any table in this header.
+ NOTE(review): presumably set by the matrix-selection code to one of the
+ 1-D tables above -- confirm with the code that assigns it. */
+int *pam; /* Pam matrix- 1D */
+/* int *pam12; */
+/* int *pam12x; */
+int pamh1[MAXSQ+1]; /* used for kfact replacement */
+
+/* according to Reese and Pearson (2002) Bioinformatics 18:1500-1507
+the most effective gap penalties for matrices in 1/3 bit units are:
+ open = 25 - 0.1 * Pam_distance; ext = 5
+*/
+
+/* 16-Nov-2010 modified for modern gap open/gap extend
+ */
+
+
+/* must be ordered by entropy to adjust scoring matrix for query
+ length */
+
+#include <math.h>
+#ifndef M_LN2
+#define M_LN2 0.69314718055994530942
+#endif
+
+/* ln(2)/n: value used in the "scale" column of std_pams[] for tables whose
+ scores are in 1/n-bit units (n = 2, 3, 5).
+ The bodies are parenthesized so that the macros expand to a single value
+ inside any expression, e.g. x / BIT2_SCALE is x / (M_LN2/2.0) and not
+ (x / M_LN2) / 2.0. */
+#define BIT2_SCALE (M_LN2/2.0)
+#define BIT3_SCALE (M_LN2/3.0)
+#define BIT5_SCALE (M_LN2/5.0)
+
+/* Table of the built-in scoring matrices, terminated by an entry with empty
+ names and a NULL matrix. Several abbreviations may refer to the same
+ matrix (e.g. "P10", "M10" and "MD10" all select a_md10).
+ NOTE(review): the comment above asks for ordering by entropy, but the
+ entropy column is not strictly descending (e.g. MD10 3.46293 after 3.4474,
+ P20 2.9397 after VT20 2.921119, VT120 0.938201 after BL80 0.9128), and the
+ "VT200" entry carries the name "VTM200" -- confirm both are intended. */
+/* abbrev, name, matrix, scale, ulambda, entropy, tfract_id, gopen, gext */
+struct std_pam_str std_pams[] = {
+ {"VT10", "VT10", a_vt10, BIT2_SCALE, 0.2299, 3.4474, 0.9107, -16, -2},
+ {"P10", "MD10", a_md10, BIT3_SCALE, 0.2299, 3.4474, 0.9107, -23, -4},
+ {"M10", "MD10", a_md10, BIT3_SCALE, 0.2299, 3.4474, 0.9107, -23, -4},
+ {"MD10", "MD10", a_md10, BIT3_SCALE, 0.2299, 3.46293, 0.9107, -23, -4},
+ {"VT20", "VT20", a_vt20, BIT2_SCALE, 0.2300, 2.921119, 0.8312, -15, -2},
+ {"P20", "MD20", a_md20, BIT3_SCALE, 0.2300, 2.9397, 0.822, -22, -4},
+ {"M20", "MD20", a_md20, BIT3_SCALE, 0.2300, 2.9397, 0.822, -22, -4},
+ {"MD20", "MD20", a_md20, BIT3_SCALE, 0.2300, 2.9397, 0.822, -22, -4},
+ {"VT40", "VT40", a_vt40, BIT2_SCALE, 0.2305, 2.266217, 0.6968, -13, -1},
+ {"P40", "MD40", a_md40, BIT3_SCALE, 0.2305, 2.2284, 0.679, -21, -4},
+ {"M40", "MD40", a_md40, BIT3_SCALE, 0.2305, 2.2284, 0.679, -21, -4},
+ {"MD40", "MD40", a_md40, BIT3_SCALE, 0.2305, 2.2284, 0.679, -21, -4},
+ {"VT80", "VT80", a_vt80, BIT2_SCALE, 0.2305, 1.390771, 0.5024, -11, -1},
+ {"BL80", "BL80", abl80, BIT2_SCALE, 0.2259, 0.9128, 0.392, -10, -2},
+ {"VT120","VT120", a_vt120, BIT2_SCALE, 0.3416, 0.938201, 0.3748, -11, -1},
+ {"P120","PAM120", apam120, BIT3_SCALE, 0.3416, 0.9062, 0.353, -14, -3},
+ {"BL62", "BL62", abl62, BIT2_SCALE, 0.3716, 0.6979, 0.302, -11, -1},
+ {"BP62", "BL62", abl62, BIT2_SCALE, 0.3716, 0.6979, 0.302, -11, -1},
+ {"VT160","VT160", a_vt160, BIT3_SCALE, 0.2263, 0.617215, 0.2884, -12, -2},
+ {"BL50", "BL50", abl50, BIT3_SCALE, 0.2318, 0.4850, 0.273, -10, -2},
+ {"OPT5","OPTIMA5", aopt5, BIT5_SCALE, 0.1432, 0.4560, 0.262, -18, -2},
+ {"VT200","VTM200",a_vt200, BIT3_SCALE, 0.2252, 0.4121, 0.2295, -10, -2},
+ {"P250", "PAM250",apam250, BIT3_SCALE, 0.2252, 0.3207, 0.185, -10, -2},
+ {"\0", "\0", NULL, 0.0, 0.0, 0.0, 0.0, 0, 0}
+};
+
+/* Robinson & Robinson counts (based on old aa[] ordering) */
+/* Slot 0 is unused; slots 1-20 hold the counts for the twenty amino acids in
+ the order given by the per-line comments; the B, Z, X and '*' slots are 0. */
+long rrcounts[25] = {
+ 0,
+ 35155, /* A */
+ 23105, /* R */
+ 20212, /* N */
+ 24161, /* D */
+ 8669, /* C */
+ 19208, /* Q */
+ 28354, /* E */
+ 33229, /* G */
+ 9906, /* H */
+ 23161, /* I */
+ 40625, /* L */
+ 25872, /* K */
+ 10101, /* M */
+ 17367, /* F */
+ 23435, /* P */
+ 32070, /* S */
+ 26311, /* T */
+ 5990, /* W */
+ 14488, /* Y */
+ 29012, /* V */
+ 0, /* B */
+ 0, /* Z */
+ 0, /* X */
+ 0 /* * */
+};
+
+/* sum of the 20 non-zero rrcounts[] entries; keep in step with the table */
+long rrtotal = 450431;
+#else
+
+/* extern char sqnam[]; */
+/* extern char sqtype[]; */
+/* extern int gdelval, ggapval; */
+extern int pamoff;
+
+extern char *NCBIstdaa;
+extern char *NCBIstdaa_l;
+extern char NCBIstdaa_n;
+extern char *NCBIstdaa_ext;
+extern char NCBIstdaa_ext_n;
+extern char *pam_sq;
+extern char *apam_sq;
+extern char *npam_sq;
+extern int pam_sq_n;
+extern int apam_sq_n;
+extern int npam_sq_n;
+
+/*
+extern char aa[MAXSQ+1];
+extern char aax[MAXSQ+1];
+*/
+extern char pssm_aa[26];
+extern char othx[MAXSQ+1];
+extern char nt[MAXSQ+1];
+extern char ntx[MAXSQ+1];
+extern char ntc[MAXSQ+1];
+extern int gc_nt[MAXSQ+1];
+
+extern int naa;
+extern int naax;
+extern int noth;
+extern int nothx;
+extern int nnt;
+extern int nntx;
+
+extern int h_NCBIstdaa[MAXSQ+1];
+extern int h_NCBIstdaa_ext[MAXSQ+1];
+/*
+extern int haa[MAXSQ+1];
+extern int haax[MAXSQ+1];
+*/
+extern int hnt[MAXSQ+1];
+extern int hntx[MAXSQ+1];
+/* extern int had[MAXSQ+1]; */
+
+extern int apam250[450];
+extern int apam120[450];
+extern int a_vt10[450];
+extern int a_vt20[450];
+extern int a_vt40[450];
+extern int a_vt80[450];
+extern int a_vt120[450];
+extern int a_vt160[450];
+extern int a_vt200[450];
+extern int a_md10[450];
+extern int a_md20[450];
+extern int a_md40[450];
+extern int abl50[450];
+extern int abl62[450];
+extern int abl80[450];
+extern int aopt5[450];
+extern int npam[450];
+extern int *pam;
+/* extern int *pam12; */
+/* extern int *pam12x; */
+extern int pamh1[MAXSQ+1];
+extern long rrcounts[25];
+extern long rrtotal;
+
+extern struct std_pam_str std_pams[];
+#endif
+#endif
diff --git a/src/url_subs.c b/src/url_subs.c
new file mode 100644
index 0000000..76d8070
--- /dev/null
+++ b/src/url_subs.c
@@ -0,0 +1,383 @@
+/* $Id: url_subs.c $ */
+
+/* copyright (c) 1998, 1999, 2014 by William R. Pearson and the
+ The Rector & Visitors of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+/* 30 Dec 2004 - modify REF_URL to accommodate current Entrez */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "defs.h"
+#include "structs.h"
+#include "param.h"
+
+#ifndef DEF_PROT_LIB
+#define DEF_PROT_LIB "q"
+#endif
+
+extern int seq_pos(int pos, int rev, int off);
+
+char *display_domains(char, struct annot_entry **s_annot_arr_p, int n_domains);
+char *web_encode(const char *);
+
+/* Write one JSON object member  "label": "value"  to fp.
+   Every member except the first one of an object (first != 0) is preceded
+   by ",\n", so the caller never has to emit a trailing comma.
+   label and value are written verbatim; no JSON escaping is applied. */
+void encode_json_str(FILE *fp, const char *label, const char *value, int first) {
+  const char *sep = first ? "" : ",\n";
+
+  fprintf(fp, "%s \"%s\": \"%s\"", sep, label, value);
+}
+
+/* Write one JSON object member  "label": value  (value as a decimal long)
+   to fp; members after the first (first == 0) are preceded by ",\n". */
+void encode_json_long(FILE *fp, const char *label, long value, int first) {
+  const char *sep = first ? "" : ",\n";
+
+  fprintf(fp, "%s \"%s\": %ld", sep, label, value);
+}
+
+/* Write a floating-point member using a caller-supplied printf format:
+ fmt must consume exactly a string (label) followed by a double (value).
+ Unlike encode_json_str()/encode_json_long(), "first" is ignored here and no
+ ",\n" separator is written, so fmt has to contain any separator it needs.
+ NOTE(review): confirm callers rely on that, or whether the separator was
+ simply forgotten. */
+void encode_json_dfmt(FILE *fp, const char *label, double value, char *fmt, int first) {
+ fprintf(fp, fmt, label, value);
+}
+
+/* Placeholder: intentionally writes nothing. It is still called from
+ do_url1() when JSON output is requested, so the alignment coordinates are
+ currently absent from the emitted JSON object. */
+void encode_json_aln(FILE *fp, const struct a_struct *aln_p, long q_offset, long l_offset, int first) {
+}
+
+/* Write  "label": [ "line1", "line2", ... ]  to fp, one array element per
+   '\n'-separated line of annot_s.
+
+   fp      - output stream
+   label   - member name
+   annot_s - text to split; it is only read, never modified
+   first   - non-zero when this is the first member of the enclosing object,
+             otherwise a ",\n" separator is written first
+
+   The lines are printed straight out of annot_s with a bounded "%.*s", so no
+   temporary copy (and no allocation-failure path) is needed.  A final line
+   that is not terminated by '\n' is emitted as well; previously it was
+   silently dropped.  Line text is written verbatim (no JSON escaping). */
+void encode_json_lines(FILE *fp, const char *label, const char *annot_s, int first) {
+  const char *line_p, *nl_p;
+  int n_out = 0;	/* number of array elements written so far */
+
+  if (!first) {fprintf(fp, ",\n");}
+  fprintf(fp, " \"%s\": [\n",label);
+
+  for (line_p = annot_s; (nl_p = strchr(line_p,'\n')) != NULL; line_p = nl_p+1) {
+    if (n_out++ > 0) fprintf(fp, ",\n");
+    fprintf(fp," \"%.*s\"",(int)(nl_p - line_p), line_p);
+  }
+
+  /* unterminated last line, if any */
+  if (*line_p != '\0') {
+    if (n_out > 0) fprintf(fp, ",\n");
+    fprintf(fp," \"%s\"",line_p);
+  }
+
+  fprintf(fp, "\n ]");
+}
+
+/* Write  "label": [ { "start":.., "stop":.., "description":".." }, ... ]  to
+   fp, one object per domain annotation in annot_p.
+
+   fp      - output stream
+   label   - member name
+   annot_p - annotation set; only entries whose label is '-' are written,
+             with pos and end reported 1-based (pos+1, end+1)
+   first   - non-zero when this is the first member of the enclosing object,
+             otherwise a ",\n" separator is written first
+
+   The element separator is keyed on the number of objects already written,
+   not on the array index: entries that are skipped because they are not
+   domains must not produce a leading comma (which happened whenever entry 0
+   was not a domain and made the output invalid JSON).
+   Descriptions are written verbatim (no JSON escaping). */
+void encode_json_domains(FILE *fp, const char *label, const struct annot_str *annot_p, int first) {
+  int i;
+  int n_out = 0;	/* domain objects written so far */
+
+  if (!first) {fprintf(fp, ",\n");}
+  fprintf(fp, "\"%s\": [\n",label);
+  for (i=0; i < annot_p->n_annot; i++) {
+    if (annot_p->s_annot_arr_p[i]->label != '-') continue;
+    if (n_out++ > 0) fprintf(fp, ",\n");
+    fprintf(fp, " { \"start\":%ld, \"stop\":%ld, \"description\":\"%s\" }",
+	    annot_p->s_annot_arr_p[i]->pos+1,annot_p->s_annot_arr_p[i]->end+1,annot_p->s_annot_arr_p[i]->comment);
+  }
+  fprintf(fp,"\n ]");
+}
+
+/* do_url1(): write the per-alignment HTML links and, optionally, a JSON
+   description of the hit to fp.
+
+   fp          - output stream
+   m_msp       - search parameters; f_id0 (program id), lname (library name),
+                 qtitle (query title), n0 and ldb_info.ldnaseq are read
+   ppst        - not used
+   l_name      - library sequence identifier; the text after the first '|'
+                 (or all of it when there is no '|') becomes the accession
+   n1          - library sequence length
+   aln_p       - alignment boundaries and offsets
+   annot_var_s - annotation/variant text for the alignment (may be NULL/empty)
+   q_annot_p, l_annot_p - query/library annotations (may be NULL)
+
+   Links are written only when at least one of the environment variables
+   REF_URL, SRCH_URL, SRCH_URL1, DOMAIN_PLOT_URL is set; the JSON block is
+   written only when JSON_HTML is set.
+
+   SECURITY NOTE: the values of REF_URL, SRCH_URL, SRCH_URL1 and
+   DOMAIN_PLOT_URL are used directly as printf format strings.  They must
+   come from a trusted configuration and must contain exactly the conversions
+   listed in the comments next to each use.
+
+   Changes relative to the previous version:
+   - pgm is always NUL-terminated after the strncpy() calls;
+   - the "_x" suffix test no longer reads before my_l_name[] for names
+     shorter than two characters;
+   - a NULL result from web_encode() is printed as "" instead of being
+     passed to a %s conversion;
+   - temporary strings start out NULL and are released unconditionally
+     (free(NULL) is a no-op). */
+void do_url1(FILE *fp, const struct mngmsg *m_msp, const struct pstruct *ppst,
+	     char *l_name, int n1,
+	     const struct a_struct *aln_p, const char *annot_var_s,
+	     const struct annot_str *q_annot_p,
+	     const struct annot_str *l_annot_p )
+{
+  char my_q_name[200], my_l_name[200], json_l_name[200];
+  char *db, *bp;
+  char pgm[10], o_pgm[10], lib[MAX_LSTR];
+  char *enc_annot_s, *q_domain_s, *l_domain_s, *tmp_domain_s, *enc_domain_s;
+  int n_tmp_domain;
+  size_t my_l_len;
+  long q_offset, l_offset;
+  char *ref_url, *lbp=NULL;
+  char *srch_url, *srch_url1, *dom_url;
+
+  /* set the database */
+  if (m_msp->ldb_info.ldnaseq==SEQT_DNA) db="nucleotide";
+  else db="Protein";
+
+  /* set the program type */
+  if (strncmp(m_msp->f_id0,"rss",3)==0) {
+    strncpy(pgm,"fa",sizeof(pgm));
+  }
+  else if (strncmp(m_msp->f_id0,"rfx",3)==0) {
+    strncpy(pgm,"fx",sizeof(pgm));
+  }
+  else { strncpy(pgm,m_msp->f_id0,sizeof(pgm)); }
+  /* strncpy() does not terminate when the source fills the buffer */
+  pgm[sizeof(pgm)-1] = '\0';
+
+  SAFE_STRNCPY(o_pgm, pgm, sizeof(o_pgm));
+
+  /* get a library name (probably does not work for %, + abbreviations */
+  if (m_msp->lname[0]!='%') {
+    SAFE_STRNCPY(lib,m_msp->lname,sizeof(lib));
+  }
+  else {
+    /* a leading '%' is written URL-encoded */
+    SAFE_STRNCPY(lib,"%25",sizeof(lib));
+    SAFE_STRNCAT(lib,&m_msp->lname[1],sizeof(lib));
+  }
+  lib[sizeof(lib)-1]='\0';
+
+  /* accession = text after the first '|', or the whole name */
+  if ((lbp = strchr(l_name,'|'))==NULL) {
+    lbp = l_name;
+  }
+  else {
+    lbp++;
+  }
+
+  /* query name = first word of the query title */
+  SAFE_STRNCPY(my_q_name,m_msp->qtitle,sizeof(my_q_name));
+  if ((bp=strchr(my_q_name,' '))!=NULL) *bp='\0';
+
+  SAFE_STRNCPY(my_l_name,lbp,sizeof(my_l_name));
+
+  if (pgm[0]=='t' || !strcmp(pgm,"fx") || !strcmp(pgm,"fy") ) {
+    /* drop everything from the first ':' and a trailing "_<char>" suffix */
+    if ((lbp=strchr(my_l_name,':'))!=NULL) *lbp='\0';
+    my_l_len = strlen(my_l_name);
+    if (my_l_len >= 2 && my_l_name[my_l_len-2] == '_') {
+      my_l_name[my_l_len-2] = '\0';
+    }
+  }
+
+  /* change the program name for fastx, tfastx, tfasta */
+  /* fastx returns proteins */
+  if (strcmp(pgm,"fx")==0 || strcmp(pgm,"fy")==0) {SAFE_STRNCPY(pgm,"fa",sizeof(pgm));}
+  else if (strcmp(pgm,"ff")==0) {SAFE_STRNCPY(pgm,"fa",sizeof(pgm));}
+  else if (pgm[0]=='t') {
+    SAFE_STRNCPY(pgm,"fx",sizeof(pgm));
+    SAFE_STRNCPY(lib,DEF_PROT_LIB,sizeof(lib));
+  }
+
+  fflush(fp);
+
+  q_offset = aln_p->q_offset;
+  l_offset = aln_p->l_offset;
+
+  /* set up ref_url, srch_url, srch_url1, dom_url */
+
+  fflush(fp);
+
+  ref_url = getenv("REF_URL");
+  srch_url = getenv("SRCH_URL");
+  srch_url1 = getenv("SRCH_URL1");
+  dom_url = getenv("DOMAIN_PLOT_URL");
+
+  if (ref_url || srch_url || srch_url1 || dom_url) {
+    fprintf(fp,"<!-- LINK_START %s -->",l_name);
+
+    /* REF_URL should provide */
+    /* "<A HREF=\"http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=%s&fcmd=Search&doptcmd1=DocSum&term=%s\">Entrez lookup</A> " */
+    if (ref_url != NULL) {fprintf(fp,ref_url,db,my_l_name);}
+
+    /* SRCH_URL should provide */
+    /* "<A HREF=\"http://localhost/fasta_www2/searchfa.cgi?dummy=%s&query=%s&db=fasta_www.cgi&lib=%s&pgm=%s&start=%ld&stop=%ld&n1=%d&o_pgm=%s\">Re-search database</A> " */
+    if (srch_url != NULL) {
+      fprintf(fp,srch_url,my_q_name, my_l_name,db,lib,pgm,
+	      l_offset+aln_p->amin1+1,l_offset+aln_p->amax1,n1,m_msp->f_id0);
+    }
+
+    /* SRCH_URL1 should provide: */
+    /* "<A HREF=\"http://localhost/fasta_www2/searchxf.cgi?dummy=%s&query=%s&db=%s&lib=%s&pgm=%s&start=%ld&stop=%ld&n1=%d&o_pgm=%s\">General re-search</A>\n" */
+
+    if (srch_url1 != NULL) {
+      fprintf(fp,srch_url1,my_q_name, my_l_name,db,lib,pgm,
+	      l_offset+aln_p->amin1+1,l_offset+aln_p->amax1,n1,m_msp->f_id0);
+    }
+
+    if (dom_url!=NULL) {
+      /* every temporary starts out NULL so that it can be freed blindly */
+      enc_annot_s = q_domain_s = l_domain_s = tmp_domain_s = enc_domain_s = NULL;
+
+      if (annot_var_s && annot_var_s[0]) {
+	enc_annot_s = web_encode(annot_var_s);
+      }
+
+      if (q_annot_p && q_annot_p->n_domains > 0) {
+	q_domain_s = display_domains('q',q_annot_p->s_annot_arr_p, q_annot_p->n_annot);
+      }
+      if (l_annot_p && l_annot_p->n_domains > 0) {
+	l_domain_s = display_domains('l',l_annot_p->s_annot_arr_p, l_annot_p->n_annot);
+      }
+
+      /* combine domain strings */
+      n_tmp_domain = 0;
+      if (q_domain_s) n_tmp_domain += strlen(q_domain_s)+1;
+      if (l_domain_s) n_tmp_domain += strlen(l_domain_s)+1;
+      if (n_tmp_domain > 0) {
+	if ((tmp_domain_s=(char *)calloc(n_tmp_domain,sizeof(char)))==NULL) {
+	  fprintf(stderr,"*** error [%s:%d] *** cannot allocate tmp_domain_s[%d]\n",
+		  __FILE__, __LINE__,n_tmp_domain);
+	}
+	else {
+	  tmp_domain_s[0] = '\0';
+	  if (q_domain_s) SAFE_STRNCAT(tmp_domain_s, q_domain_s, n_tmp_domain);
+	  if (l_domain_s) SAFE_STRNCAT(tmp_domain_s, l_domain_s, n_tmp_domain);
+	  enc_domain_s = web_encode(tmp_domain_s);
+	}
+      }
+
+      /* appropriate format string: */
+      /*
+	pgm=%s		-- program abbrev that created alignment
+	q_name=%s	-- query info
+	q_cstart=%ld
+	q_cstop=%ld
+	q_astart=%ld
+	q_astop=%ld
+	l_name=%s	-- library info
+	l_cstart=%ld
+	l_cstop=%ld
+	l_astart=%ld
+	l_astop=%ld
+	region=%s	-- aligned domain and variant information
+	doms=%s
+
+	DOMAIN_PLOT_URL = "pgm=%s;q_name=%s;q_cstart=%ld;q_cstop=%ld&q_astart=%ld&q_astop=%ld&l_name=%s&l_cstart=%ld&l_cstop=%ld&l_astart=%ld&l_astop=%ld&regions=%s&doms=%s"
+      */
+
+      /* think about the alternative of running a script
+	 rather than embedding it */
+
+      /* an encoding that could not be produced is sent as an empty field */
+      fprintf(fp,dom_url,o_pgm,
+	      my_q_name, q_offset+seq_pos(1,aln_p->qlrev,2),q_offset+seq_pos(m_msp->n0,aln_p->qlrev,2),
+	      q_offset+seq_pos(aln_p->amin0+1,aln_p->qlrev,1), q_offset+seq_pos(aln_p->amax0, aln_p->qlrev,2),
+	      my_l_name, l_offset+seq_pos(1,aln_p->llrev,2), l_offset+seq_pos(n1,aln_p->llrev,2),
+	      l_offset+seq_pos(aln_p->amin1+1,aln_p->llrev,1),l_offset+seq_pos(aln_p->amax1,aln_p->llrev,2),
+	      enc_annot_s ? enc_annot_s : "",
+	      enc_domain_s ? enc_domain_s : "");
+
+      free(enc_domain_s);
+      free(tmp_domain_s);
+      free(l_domain_s);
+      free(q_domain_s);
+      free(enc_annot_s);
+    }
+
+    fprintf(fp,"\n<!-- LINK_STOP -->");
+    fflush(fp);
+  }
+
+  /*
+  if ((srch_url2 = getenv("SRCH_URL2"))==NULL)
+    fprintf(fp,"<A HREF=\"http://fasta.bioch.virginia.edu/fasta/cgi/lalignx.cgi?seq1=\"%s\"&in_seq1=\"FASTA\"&seq2=\"%s\"&in_seq2=\"Accession\"&ssr2=%ld:%ld\">lalign</A>\n<p>\n",my_l_name,db,lib,pgm,l_offset+aln_p->amin1+1,l_offset+aln_p->amax1,n1);
+  else
+    fprintf(fp,srch_url1,my_l_name,db,lib,pgm,
+	    l_offset+aln_p->amin1+1,l_offset+aln_p->amax1,n1);
+  */
+
+
+  if (getenv("JSON_HTML")) {
+
+    /* the library name becomes part of a javascript identifier: */
+    /* replace '|' with '_' */
+    SAFE_STRNCPY(json_l_name, l_name, sizeof(json_l_name));
+    for (bp=strchr(json_l_name,'|'); bp; bp=strchr(bp+1,'|')) { *bp = '_'; }
+
+    /* replace '.' with '_' */
+    for (bp=strchr(json_l_name,'.'); bp; bp=strchr(bp+1,'.')) { *bp = '_'; }
+
+    fprintf(fp,"\n<script type=\"text/javascript\">\n//<![CDATA[\n var json_%s = {\n",json_l_name);
+    encode_json_str(fp, "db", db, 1);
+    encode_json_str(fp, "l_acc", l_name, 0);
+    encode_json_str(fp, "acc", my_l_name, 0);
+    encode_json_str(fp, "lib", lib, 0);
+    encode_json_str(fp, "pgm", pgm, 0);
+    encode_json_str(fp, "o_pgm", m_msp->f_id0, 0);
+    encode_json_aln(fp, aln_p, q_offset, l_offset, 0);
+    if (annot_var_s && annot_var_s[0]) { encode_json_lines(fp, "annot", annot_var_s, 0); }
+    if (q_annot_p && q_annot_p->n_domains > 0) { encode_json_domains(fp, "q_domains", q_annot_p, 0); }
+    if (l_annot_p && l_annot_p->n_domains > 0) { encode_json_domains(fp, "l_domains", l_annot_p, 0); }
+
+    fprintf(fp, "\n}\n//]]>\n</script>");
+    fflush(fp);
+  }
+}
+
+/* display_domains(): build a text description of the domain annotations.
+
+   target      - single character prefixed to every line ('q' or 'l' in the
+                 calls made by do_url1())
+   annot_arr_p - array of n_annots annotation entries; only entries whose
+                 label is '-' (domains) are reported
+   n_annots    - number of entries in annot_arr_p
+
+   Returns a malloc()ed, NUL-terminated string with one line per domain,
+       <target>Domain:\t<pos+1>-<end+1>\t<comment>\n
+   which the caller must free(), or NULL when memory cannot be obtained.
+
+   Each line is formatted with snprintf() so that a long comment cannot
+   overrun line[]; an over-long line is truncated but keeps its final '\n'.
+   The result buffer grows as often as needed and every realloc() result is
+   checked before it replaces the old pointer. */
+char *display_domains(char target, struct annot_entry **annot_arr_p, int n_annots) {
+  char *domain_s, *new_s;
+  char line[MAX_STR];
+  int i, n_line;
+  size_t n_domain_s = MAX_LSTR;	/* bytes allocated for domain_s */
+  size_t n_used = 0;		/* bytes used, excluding the final '\0' */
+  size_t line_len, n_wanted;
+
+  /* since (currently) annot_var_s is MAX_LSTR, do the same for domain_s */
+  if ((domain_s = (char *)calloc(n_domain_s, sizeof(char)))==NULL) {
+    fprintf(stderr,"*** error [%s:%d] *** cannot allocate domain_s[%d]\n",__FILE__, __LINE__,(int)n_domain_s);
+    return NULL;
+  }
+
+  for (i=0; i < n_annots; i++) {
+    /* annot_arr_p[] has both domains and non domains, but n_domains only counts domains */
+    if (annot_arr_p[i]->label != '-') continue;
+
+    n_line = snprintf(line, sizeof(line), "%cDomain:\t%ld-%ld\t%s\n",
+		      target, annot_arr_p[i]->pos+1, annot_arr_p[i]->end+1, annot_arr_p[i]->comment);
+    if (n_line < 0) continue;	/* formatting error: skip this entry */
+
+    line_len = (size_t)n_line;
+    if (line_len >= sizeof(line)) {
+      /* truncated: keep the records newline-separated */
+      line_len = sizeof(line)-1;
+      line[line_len-1] = '\n';
+    }
+
+    n_wanted = n_used + line_len + 1;
+    if (n_wanted > n_domain_s) {
+      /* grow by half, or to the required size if that is still too small */
+      n_domain_s += n_domain_s/2;
+      if (n_domain_s < n_wanted) n_domain_s = n_wanted;
+      if ((new_s = (char *)realloc(domain_s, n_domain_s))==NULL) {
+	fprintf(stderr,"*** error [%s:%d] *** cannot allocate domain_s[%d]\n",__FILE__, __LINE__,(int)n_domain_s);
+	free(domain_s);
+	return NULL;
+      }
+      domain_s = new_s;
+    }
+
+    memcpy(domain_s + n_used, line, line_len + 1);	/* includes the '\0' */
+    n_used += line_len;
+  }
+
+  /* give back the unused tail; if that fails the larger buffer is still valid */
+  if ((new_s = (char *)realloc(domain_s, n_used + 1)) != NULL) {
+    domain_s = new_s;
+  }
+  domain_s[n_used]='\0';
+
+  return domain_s;
+}
+
+/* take an annotation string *annot_var_s and convert problematic characters to their web encoding */
+/* ' ' (space) %20 */
+/* '|' %7C */
+/* ';' %3B */
+/* '=' %3D */
+/* '\n' %0A */
+
+static char bad_chars[] = "\n =;|";
+
+/* web_encode(): return a percent-encoded copy of annot_var_s.
+
+   ASCII letters and digits are copied unchanged; every other byte is
+   written as '%' followed by two lower-case hexadecimal digits.
+
+   Returns a malloc()ed, NUL-terminated string that the caller must free(),
+   or NULL when the buffer cannot be allocated.
+
+   Fixes relative to the previous version:
+   - the digit test compared against the integer 9 instead of the character
+     '9', so '0'..'9' were percent-encoded;
+   - bytes are examined as unsigned char: with a signed char type a byte
+     >= 0x80 was printed by "%02x" as eight hex digits, which overran the
+     three bytes reserved for each input character;
+   - the result of the final realloc() is checked. */
+char *web_encode(const char *annot_var_s) {
+  static const char hex_digits[] = "0123456789abcdef";
+  size_t n_enc;
+  char *enc_s, *fit_s, *dp;
+  const unsigned char *sp;
+
+  /* make string largest possible size: 3 output bytes per input byte */
+  n_enc = strlen(annot_var_s)*3 + 1;
+  if ((enc_s = (char *)calloc(n_enc,sizeof(char)))==NULL) {
+    fprintf(stderr,"*** error [%s:%d] *** cannot allocate tmp_annot_s[%d]\n",__FILE__, __LINE__,(int)n_enc);
+    return NULL;
+  }
+
+  dp = enc_s;
+  for (sp = (const unsigned char *)annot_var_s; *sp ; sp++) {
+    if ((*sp >= '0' && *sp <= '9') ||
+	(*sp >= 'A' && *sp <= 'Z') ||
+	(*sp >= 'a' && *sp <= 'z')) {
+      *dp++ = (char)*sp;
+    }
+    else {
+      *dp++ = '%';
+      *dp++ = hex_digits[*sp >> 4];
+      *dp++ = hex_digits[*sp & 0x0f];
+    }
+  }
+  *dp = '\0';
+
+  /* trim to the length actually used; on failure keep the larger buffer */
+  n_enc = (size_t)(dp - enc_s);
+  if ((fit_s = (char *)realloc(enc_s, n_enc+1)) != NULL) {
+    enc_s = fit_s;
+  }
+  enc_s[n_enc] = '\0';
+
+  return enc_s;
+}
diff --git a/src/uthr_subs.h b/src/uthr_subs.h
new file mode 100644
index 0000000..ba0a270
--- /dev/null
+++ b/src/uthr_subs.h
@@ -0,0 +1,52 @@
+/***************************************/
+/* thread global variable declarations */
+/***************************************/
+
+/* $Id: uthr_subs.h 625 2011-03-23 17:21:38Z wrp $ */
+
+/* MAX_WORKERS sizes threads[]; it may be overridden before this header is
+   included or on the compiler command line. */
+#ifndef MAX_WORKERS
+#define MAX_WORKERS 2
+#endif
+/* parenthesized so that it stays one value inside larger expressions
+   (e.g. n / NUM_WORK_BUF or NUM_WORK_BUF * size) */
+#define NUM_WORK_BUF (2*MAX_WORKERS)
+
+#include <synch.h>
+#include <thread.h>
+
+#define check(status,string) \
+ if (status == -1) perror(string) /* error macro for thread calls */
+
+/* The thread globals are defined in the one translation unit that includes
+ this header without XTERNAL defined; every file that defines XTERNAL first
+ gets matching extern declarations instead. Keep the two lists in step. */
+#ifndef XTERNAL
+
+thread_t threads[MAX_WORKERS];
+
+/* mutex stuff */
+
+mutex_t reader_mutex; /* empty buffer pointer structure lock */
+mutex_t worker_mutex; /* full buffer pointer structure lock */
+
+/* condition variable stuff */
+
+cond_t reader_cond_var; /* condition variable for reader */
+cond_t worker_cond_var; /* condition variable for workers */
+
+mutex_t start_mutex; /* start-up synchronisation lock */
+cond_t start_cond_var; /* start-up synchronisation condition variable */
+
+#else
+
+extern thread_t threads[];
+
+/* mutex stuff */
+
+extern mutex_t reader_mutex;
+extern mutex_t worker_mutex;
+
+/* condition variable stuff */
+
+extern cond_t reader_cond_var;
+extern cond_t worker_cond_var;
+
+extern mutex_t start_mutex;
+extern cond_t start_cond_var;
+
+#endif
diff --git a/src/wm_align.c b/src/wm_align.c
new file mode 100644
index 0000000..3d04bcc
--- /dev/null
+++ b/src/wm_align.c
@@ -0,0 +1,581 @@
+/* $Id: wm_align.c 1166 2013-05-30 01:05:55Z wrp $ */
+
+/* algorithms and code provided by Webb Miller, Penn State
+ University */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "defs.h"
+#include "rstruct.h"
+#include "aln_structs.h"
+
+struct swstr {int H, E;};
+
+int
+NW_ALIGN(int IW, const unsigned char *B,
+ int M, int N,
+ int **W, int G, int H, int *S, int *NC);
+
+static int
+CHECK_SCORE(int IW, const unsigned char *B,
+ int M, int N,
+ int *S, int **W, int G, int H, int *nres, int *sw);
+
+/* sw_walign() is here, rather than in dropgsw2.c, because it is also
+ used by dropnfa.c
+*/
+
+struct a_res_str *
+merge_ares_chains(struct a_res_str *cur_ares,
+ struct a_res_str *tmp_ares,
+ int score_ix,
+ const char *msg);
+
+/* sw_walign(): local alignment of a score profile against a sequence.
+
+ pam2p - profile: pam2p[j][c] is the score of profile position j against
+ residue code c
+ n0 - number of profile positions
+ aa1 - sequence, scanned until a 0 code is found; &aa1[K-1] is handed to
+ NW_ALIGN, which is aa1[-1] when the alignment starts at K == 0, so
+ the byte before aa1 must be addressable (nsw_malign() reserves one)
+ n1 - only recorded in a_res->n1
+ q, r - gap penalties, subtracted from scores: the first gap position
+ costs q+r, each further one r
+ ss - work array with at least n0 entries; overwritten
+ a_res - receives n1, min0/max0, min1/max1 and, through NW_ALIGN, res[]
+ and nres
+
+ Returns the best local score, or 0 (leaving a_res untouched) when no
+ positive score exists. */
+int
+sw_walign (int **pam2p, int n0,
+ const unsigned char *aa1, int n1,
+ int q, int r,
+ struct swstr *ss,
+ struct a_res_str *a_res
+ )
+{
+ const unsigned char *aa1p;
+ register int i, j;
+ register struct swstr *ssj;
+ int e, f, h, p;
+ int qr;
+ int score;
+ int cost, I, J, K, L;
+
+ qr = q + r;
+
+ /* initialize 0th row */
+ for (ssj=ss; ssj<ss+n0; ssj++) {
+ ssj->H = 0;
+ ssj->E = -q;
+ }
+
+ /* I = saved position in aa1
+ J = saved position in aa0
+ */
+ score = I = J = 0;
+ aa1p = aa1;
+ i = 0;
+ /* forward pass: one iteration per residue of aa1, keeping a single row
+ (ss[]) of H and E values; h/p carry the current and the diagonal H, f the
+ gap state running along the row */
+ while (*aa1p) {
+ h = p = 0;
+ f = -q;
+ /* pwaa = waa + (*aa1p++ * n0); */
+ for (ssj = ss, j=0; j < n0; ssj++, j++) {
+ if ((h = h - qr) > (f = f - r)) f = h;
+ if ((h = ssj->H - qr) > (e = ssj->E - r)) e = h;
+ /* h = p + *pwaa++; */
+ h = p + pam2p[j][*aa1p];
+ /* local alignment: never below 0, and at least as good as either gap */
+ if (h < 0 ) h = 0;
+ if (h < f ) h = f;
+ if (h < e ) h = e;
+ p = ssj->H;
+ ssj->H = h;
+ ssj->E = e;
+ /* remember where the best score ends (first occurrence wins) */
+ if (h > score) {
+ score = h;
+ I = i;
+ /* J = (int)(ssj-ss); */
+ J = j;
+ }
+ }
+ i++;
+ aa1p++;
+ } /* done with forward pass */
+ if (score <= 0) return 0;
+
+ /* to get the start point, go backwards */
+
+ /* K = begin in aa1
+ L = begin in aa0
+ */
+ cost = K = L = 0;
+ for (ssj=ss+J; ssj>=ss; ssj--) {
+ ssj->H=ssj->E= -1;
+ }
+
+ /* reverse pass over the rectangle that ends at (I,J); p is 0 only for the
+ first row so that the path is anchored at the end cell. The scan stops
+ as soon as the accumulated cost reaches the forward score. */
+ for (i=I,aa1p=aa1+I; i>=0; i--) {
+ h = f = -1;
+ p = (i == I) ? 0 : -1;
+ for (ssj = ss+J, j=J; ssj>=ss; ssj--,j--) {
+ f = max (f,h-q)-r;
+ ssj->E=max(ssj->E,ssj->H-q)-r;
+ h = max(max(ssj->E,f), p+pam2p[j][aa1[i]]);
+ p = ssj->H;
+ ssj->H=h;
+ if (h > cost) {
+ cost = h;
+ K = i;
+ L = (int)(ssj-ss);
+ if (cost >= score) goto found;
+ }
+ }
+ }
+
+found:
+
+ /* printf(" %d: L: %3d-%3d/%3d; K: %3d-%3d/%3d\n",score,L,J,n0,K,I,n1); */
+
+ /* min0/min1 are the 0-based start coordinates, max0/max1 one past the end */
+ a_res->n1 = n1;
+ a_res->max0 = J+1; a_res->min0 = L; a_res->max1 = I+1; a_res->min1 = K;
+
+ /* align the bounded region globally to fill a_res->res / a_res->nres; the
+ value returned by NW_ALIGN is not used */
+ NW_ALIGN(L,&aa1[K-1],J-L+1,I-K+1,pam2p,q,r,a_res->res,&a_res->nres);
+
+ return score;
+}
+
+/* nsw_malign is a recursive interface to nw/sw_walign() that is called
+ from do_walign(). nsw_malign() first does an alignment, then checks
+ to see if the score is greater than the threshold. If so, it tries
+ doing a left and right alignment.
+
+ 2009-Mar-22 -- This version generalizes the strategy for
+ partitioning the solution by taking the *_walign function as an
+ argument
+
+ 2009-May-1 -- add code to ensure that returned a_res->chain is
+ sorted by score. One strategy is to simply always insert at the
+ appropriate place, which requires re-searching from the top each
+ time (which is not a big deal, since the list is short). We simply
+ need another argument to nsw_malign, which is the head of the list.
+ */
+/* nsw_malign(): align with (*fn_walign)(), then recursively look for further
+   alignments in the parts of aa1 to the left and to the right of the one
+   just found.
+
+   pam2p        - scoring profiles; pam2p[0] is used for the alignment and
+                  pam2p[pam_ix] for re-scoring it
+   n0           - profile length
+   aa1, n1      - library sequence and its length
+   score_thresh - a sub-alignment is kept only if its score exceeds this
+   max_res      - number of ints allocated for each alignment encoding
+   gdelval, ggapval - gap penalties handed to fn_walign and CHECK_SCORE
+   ss           - work array for fn_walign
+   cur_ares     - result node to fill in; its res[] is allocated here
+   fn_walign    - alignment function (same signature as sw_walign())
+   do_rep       - when 0, only the first alignment is produced
+
+   Returns the head of the result chain as produced by merge_ares_chains().
+
+   Every allocation is checked; like the existing check on the res[] array,
+   a failure is reported on stderr and ends the program, instead of the NULL
+   pointer being written through. */
+struct a_res_str *
+nsw_malign (int ***pam2p, int pam_ix, int n0,
+	    const unsigned char *aa1, int n1,
+	    int score_thresh, int max_res,
+	    int gdelval, int ggapval,
+	    struct swstr *ss,
+	    struct a_res_str *cur_ares,
+	    int (*fn_walign)
+	    (
+	     int **pam2p, int n0,
+	     const unsigned char *aa1, int n1,
+	     int q, int r,
+	     struct swstr *ss,
+	     struct a_res_str *a_res
+	     ),
+	    int do_rep
+	    )
+{
+  struct a_res_str *tmpl_ares, *tmpr_ares, *this_ares;
+  unsigned char *local_aa1;
+  int nc, score_ix;
+  int min_alen;
+  int max_sub_score = -1;
+
+  min_alen = min(MIN_LOCAL_LEN,n0);
+
+  /* now we need alignment storage - get it */
+  if ((cur_ares->res = (int *)calloc((size_t)max_res,sizeof(int)))==NULL) {
+    fprintf(stderr," *** cannot allocate alignment results array %d\n",max_res);
+    exit(1);
+  }
+
+  score_ix = 0;
+
+  cur_ares->next = NULL;
+
+  cur_ares->sw_score = (*fn_walign)(pam2p[0], n0, aa1, n1,
+				    gdelval, ggapval,
+				    ss, cur_ares);
+
+  /* The scores in a_res->rst include low-complexity alignment.
+     Re-calculate the score of the optimal alignment using the -S matrix.
+
+     This makes sense for secondary HSP's, but not for the initial
+     HSP, because the -S score for the non-S alignment could be smaller
+     than the original optimal -S score (it cannot be higher).
+  */
+  CHECK_SCORE(cur_ares->min0, &aa1[cur_ares->min1-1],
+	      cur_ares->max0 - cur_ares->min0, cur_ares->max1-cur_ares->min1,
+	      cur_ares->res,
+	      pam2p[pam_ix], gdelval, ggapval,
+	      &nc, &cur_ares->rst.score[0]);
+
+  if (!do_rep || cur_ares->rst.score[score_ix] <= score_thresh) { return cur_ares;}
+
+  if (cur_ares->min1 >= min_alen) { /* try the left  */
+    /* allocate a_res */
+    if ((tmpl_ares = (struct a_res_str *)calloc(1, sizeof(struct a_res_str)))==NULL) {
+      fprintf(stderr," *** error [%s:%d] cannot allocate left alignment result\n",
+	      __FILE__, __LINE__);
+      exit(1);
+    }
+
+    /* private copy of aa1[0..min1-1] with one spare byte in front (the
+       alignment functions address aa1[-1]) and a 0 terminator behind */
+    if ((local_aa1 = (unsigned char *)calloc(cur_ares->min1+2,sizeof(unsigned char)))==NULL) {
+      fprintf(stderr," *** error [%s:%d] cannot allocate left sequence copy [%d]\n",
+	      __FILE__, __LINE__, cur_ares->min1+2);
+      exit(1);
+    }
+    local_aa1++;
+    memcpy(local_aa1,aa1,cur_ares->min1);
+    /*
+    save_res = aa1[cur_ares->min1];
+    aa1[cur_ares->min1] = '\0';
+    */
+    tmpl_ares = nsw_malign(pam2p, pam_ix, n0, local_aa1, cur_ares->min1,
+			   score_thresh, max_res,
+			   gdelval, ggapval, ss, tmpl_ares,
+			   fn_walign, do_rep);
+    free(--local_aa1);
+    /*
+    aa1[cur_ares->min1] = save_res;
+    */
+
+    if (tmpl_ares->rst.score[score_ix] > score_thresh) {
+      max_sub_score = tmpl_ares->rst.score[score_ix];
+    }
+    else {
+      if (tmpl_ares->res) free(tmpl_ares->res);
+      free(tmpl_ares);
+      tmpl_ares=NULL;
+    }
+  }
+  else {tmpl_ares = NULL;}
+
+  if (n1 - cur_ares->max1 >= min_alen) { /* try the right  */
+    /* allocate a_res */
+    if ((tmpr_ares = (struct a_res_str *)calloc(1, sizeof(struct a_res_str)))==NULL) {
+      fprintf(stderr," *** error [%s:%d] cannot allocate right alignment result\n",
+	      __FILE__, __LINE__);
+      exit(1);
+    }
+
+    /* find boundaries */
+
+    /* private copy of aa1[max1..n1-1], laid out like the left-hand copy */
+    if ((local_aa1 = (unsigned char *)calloc(n1-cur_ares->max1+2,sizeof(unsigned char)))==NULL) {
+      fprintf(stderr," *** error [%s:%d] cannot allocate right sequence copy [%d]\n",
+	      __FILE__, __LINE__, n1-cur_ares->max1+2);
+      exit(1);
+    }
+    local_aa1++;
+    memcpy(local_aa1,aa1+cur_ares->max1,n1-cur_ares->max1);
+    /*
+    save_res = aa1[cur_ares->max1-1];
+    aa1[cur_ares->max1-1] = '\0';
+    */
+
+    tmpr_ares = nsw_malign(pam2p, pam_ix, n0, local_aa1, n1 - cur_ares->max1,
+			   score_thresh, max_res,
+			   gdelval, ggapval, ss, tmpr_ares,
+			   fn_walign, do_rep);
+
+    free(--local_aa1);
+    /*
+    aa1[cur_ares->max1-1] = save_res;
+    */
+
+    if (tmpr_ares->rst.score[score_ix] > score_thresh) {
+      /* adjust the left boundary: the copy started at aa1[max1] */
+      for (this_ares = tmpr_ares; this_ares; this_ares = this_ares->next) {
+	this_ares->min1 += cur_ares->max1;
+	this_ares->max1 += cur_ares->max1;
+      }
+
+      if (tmpr_ares->rst.score[score_ix] >= max_sub_score) {
+	max_sub_score = tmpr_ares->rst.score[score_ix];
+      }
+    }
+    else {
+      if (tmpr_ares->res) free(tmpr_ares->res);
+      free(tmpr_ares);
+      tmpr_ares = NULL;
+    }
+  }
+  else {tmpr_ares = NULL;}
+
+  /* We have checked both left and right, and better score is in max_sub_score.
+     If both scores are <= score_thresh, then forget it */
+
+  if (max_sub_score <= score_thresh) {
+    if (tmpl_ares) {
+      if (tmpl_ares->res) {free(tmpl_ares->res);}
+      free(tmpl_ares);
+    }
+    if (tmpr_ares) {
+      if (tmpr_ares->res) {free(tmpr_ares->res);}
+      free(tmpr_ares);
+    }
+    return cur_ares;
+  }
+
+  cur_ares = merge_ares_chains(cur_ares, tmpl_ares, score_ix, "left");
+  cur_ares = merge_ares_chains(cur_ares, tmpr_ares, score_ix, "right");
+
+  return cur_ares;
+}
+
+/* Helper macros for the alignment code below. They are not self-contained:
+ gap() reads the variables q and r, and DEL()/INS() read and write *last
+ and *sapp, all of which must be in scope where the macro is used.
+ gap() evaluates its argument twice, so pass expressions without side
+ effects. DEL()/INS() expand to a brace block and are used without a
+ trailing semicolon (e.g. "DEL(1) INS(N)"), so they must not be rewritten
+ as do { } while (0). */
+#define gap(k) ((k) <= 0 ? 0 : q+r*(k)) /* k-symbol indel cost */
+
+/* Append "Delete k" op */
+/* a delete is stored as a negative count; when the previous op (*last) was
+ already a delete the two are merged by extending the last script entry */
+#define DEL(k) \
+{ if (*last < 0) \
+ *last = (*sapp)[-1] -= (k); \
+ else { \
+ *last = (*sapp)[0] = -(k); \
+ (*sapp)++; \
+ } \
+}
+
+/* Append "Insert k" op */
+/* an insert is stored as a positive count and merged with a preceding
+ insert in the same way */
+#define INS(k) \
+{ if (*last > 0) \
+ *last = (*sapp)[-1] += (k); \
+ else { \
+ *last = (*sapp)[0] = (k); \
+ (*sapp)++; \
+ } \
+}
+
+/* nw_align(iw,B,M,N,tb,te,...,sapp,last) returns the score of an optimum conversion between
+ profile rows w[iw..iw+M-1] (the "A" of the original align()) and B[1..N] that begins(ends)
+ with a delete if tb(te) is zero and appends such a conversion to the current script (*sapp). */
+/* Script ops written here: 0 = aligned pair, +k = INS(k), -k = DEL(k); the problem is split at row M/2 and solved recursively. */
+static int
+nw_align(int iw, /* beginning of alignment in pam2p profile */
+ const unsigned char *B, /* second sequence aa1 */
+ int M, int N, /* length of profile, aa1 */
+ int tb, int te, /* begin/end boundary values: NW_ALIGN passes -G, the recursion passes -q or 0 */
+ int **w, int q, int r, /* pam2p profile, open, ext */
+ struct swstr *f_ss, /* forward, reverse row matrix */
+ struct swstr *r_ss,
+ int dir, /* dir [0..3] is not currently used */
+ int **sapp, int *last) /* script write cursor and last op written (updated by DEL/INS) */
+{
+
+ int midi, midj, type; /* Midpoint, type, and cost */
+ int midc;
+ int c1, c2; /* assigned from the type 1 recursive calls, never read */
+
+ register int i, j;
+ register int c, e, d, s;
+ int qr, t, *wa;
+
+/* print_seq_prof(A,M,B,N,w,iw); */
+
+/* m = g + h; */
+ qr = q + r; /* open + extend: penalty for starting a one-symbol gap */
+
+/* Boundary cases: M <= 1 or N == 0 */
+
+ if (N <= 0) { /* B is exhausted: delete the M remaining profile rows */
+ if (M > 0) {DEL(M)}
+ return -gap(M);
+ }
+
+ if (M <= 1) {
+ if (M <= 0) { /* profile is exhausted: insert the N remaining residues */
+ INS(N)
+ return -gap(N);
+ }
+/* M == 1: align the single row to the best B[midj], or delete it (midj == 0) */
+ if (tb < te) tb = te; /* use the larger of the two boundary values */
+ midc = (tb-r) - gap(N); /* score for deleting the row and inserting all of B */
+ midj = 0;
+/* wa = w[A[1]]; */
+ wa = w[iw];
+ for (j = 1; j <= N; j++) {
+ c = -gap(j-1) + wa[B[j]] - gap(N-j); /* insert j-1, align the row to B[j], insert N-j */
+ if (c > midc) { midc = c; midj = j;}
+ }
+ if (midj == 0) { DEL(1) INS(N) }
+ else {
+ if (midj > 1) { INS(midj-1)}
+ *last = (*sapp)[0] = 0; /* op 0 = aligned pair */
+ (*sapp)++;
+ if (midj < N) { INS(N-midj)}
+ }
+ return midc;
+ }
+
+/* Divide: Find optimum midpoint (midi,midj) of cost midc */
+
+ midi = M/2; /* Forward phase: */
+ f_ss[0].H = 0; /* Compute H(M/2,k) & E(M/2,k) for all k */
+ f_ss[0].E = t = -q;
+ for (j = 1; j <= N; j++) {
+ f_ss[j].H = t = t-r;
+ f_ss[j].E = t-q;
+ }
+ t = tb;
+ for (i = 1; i <= midi; i++) {
+ s = f_ss[0].H; /* s carries the previous row's value for column j-1 (the diagonal) */
+ f_ss[0].H = c = t = t-r;
+ e = t-q;
+/* wa = w[A[i]]; */
+ wa = w[iw+i-1];
+ for (j = 1; j <= N; j++) {
+ if ((c = c - qr) > (e = e - r)) e = c; /* e: best score ending in a gap along this row (consumes B) */
+ if ((c = f_ss[j].H - qr) > (d = f_ss[j].E - r)) d = c; /* d: best score ending in a gap down this column (consumes a profile row) */
+ c = s + wa[B[j]]; /* diagonal move: profile row i against B[j] */
+ if (e > c) c = e;
+ if (d > c) c = d;
+ s = f_ss[j].H; /* save the old value before it is overwritten */
+ f_ss[j].H = c;
+ f_ss[j].E = d;
+ }
+ }
+ f_ss[0].E = f_ss[0].H;
+
+ r_ss[N].H = 0; /* Reverse phase: */
+ t = -q; /* Compute R(M/2,k) & S(M/2,k) for all k */
+
+ for (j = N-1; j >= 0; j--) {
+ r_ss[j].H = t = t-r;
+ r_ss[j].E = t-q;
+ }
+
+ t = te;
+ for (i = M-1; i >= midi; i--) { /* same recurrence as the forward phase, run from the far corner back to row midi */
+ s = r_ss[N].H;
+ r_ss[N].H = c = t = t-r;
+ e = t-q;
+/* wa = w[A[i+1]]; */
+ wa = w[iw+i];
+ for (j = N-1; j >= 0; j--) {
+ if ((c = c - qr) > (e = e - r)) { e = c; }
+ if ((c = r_ss[j].H - qr) > (d = r_ss[j].E - r)) { d = c; }
+ c = s + wa[B[j+1]];
+ if (e > c) c = e;
+ if (d > c) c = d;
+ s = r_ss[j].H;
+ r_ss[j].H = c;
+ r_ss[j].E = d;
+ }
+ }
+ r_ss[N].E = r_ss[N].H;
+
+ midc = f_ss[0].H+r_ss[0].H; /* Find optimal midpoint */
+ midj = 0;
+ type = 1; /* type 1: split at (midi,midj) using the H values */
+
+ for (j = 0; j <= N; j++) {
+ if ((c = f_ss[j].H + r_ss[j].H) >= midc) {
+ if (c > midc || (f_ss[j].H != f_ss[j].E && r_ss[j].H == r_ss[j].E)) { /* on ties, move only when this H/E condition holds */
+ midc = c;
+ midj = j;
+ }
+ }
+ }
+/* type 2: both halves end in a column gap at midj; q is added back, presumably because each half charged the gap-open once - confirm */
+ for (j = N; j >= 0; j--) {
+ if ((c = f_ss[j].E + r_ss[j].E + q) > midc) {
+ midc = c;
+ midj = j;
+ type = 2;
+ }
+ }
+
+/* Conquer: recursively around midpoint */
+
+ if (type == 1) {
+ c1 = nw_align(iw,B,midi,midj,tb,-q,w,q,r,f_ss, r_ss,0,sapp,last);
+ c2 = nw_align(iw+midi,B+midj,M-midi,N-midj,-q,te,w,q,r,f_ss, r_ss,1,sapp,last);
+ }
+ else { /* type 2: the two profile rows around the midpoint are emitted as one DEL(2) between the sub-alignments */
+ nw_align(iw,B,midi-1,midj,tb,0,w,q,r,f_ss, r_ss,2,sapp,last);
+ DEL(2);
+ nw_align(iw+midi+1,B+midj,M-midi-1,N-midj,0,te,w,q,r,f_ss,r_ss,3,sapp,last);
+ }
+ return midc;
+}
+
+/* Interface and top level of comparator */
+/* NW_ALIGN: allocates two work rows of N+2 swstr entries, calls nw_align() to write the edit script into S, re-scores S with CHECK_SCORE() (which sets *NC), reports a mismatch on stderr, and returns nw_align()'s score. */
+int
+NW_ALIGN(int IW, const unsigned char *B,
+ int M, int N,
+ int **W, int G, int H, int *S, int *NC)
+{
+ struct swstr *f_ss, *r_ss;
+ int *sapp, last;
+ int c, ck, sw; /* sw receives CHECK_SCORE's *sw_score and is not used afterwards */
+
+ sapp = S; /* script cursor starts at the caller's buffer */
+ last = 0; /* no previous op yet */
+
+ if ((f_ss = (struct swstr *) calloc (N+2, sizeof (struct swstr)))
+ == NULL) {
+ fprintf (stderr, " *** cannot allocate f_ss array %3d\n", N+2);
+ exit (1);
+ }
+ f_ss++; /* advance past the first element; undone before free() */
+
+ if ((r_ss = (struct swstr *) calloc (N+2, sizeof (struct swstr)))
+ == NULL) {
+ fprintf (stderr, " *** cannot allocate r_ss array %3d\n", N+2);
+ exit (1);
+ }
+ r_ss++; /* advance past the first element; undone before free() */
+
+ /* print_seq_prof(A,M,W,IW); */
+ c = nw_align(IW,B,M,N,-G,-G,W,G,H,f_ss, r_ss,0,&sapp,&last); /* OK, do it */
+
+ ck = CHECK_SCORE(IW,B,M,N,S,W,G,H,NC, &sw);
+ if (c != ck) { /* warn only; the nw_align() score is still returned */
+ fprintf(stderr," *** Check_score error. %d != %d ***\n",c,ck);
+ }
+
+ f_ss--; r_ss--; /* restore the pointers returned by calloc() */
+ free(r_ss); free(f_ss);
+
+ return c;
+}
+
+/* CHECK_SCORE - return the score of the alignment stored in S */
+/* Walks the edit script S until i reaches M and j reaches N. op 0 scores w[iw+i][B[j+1]] and advances both; op > 0 advances j by op and op < 0 advances i by -op, each costing g + |op|*h. Sets *NC to the number of alignment columns and *sw_score to the best running local score (the total score when GGSEARCH is defined). */
+static int
+CHECK_SCORE(int iw, const unsigned char *B,
+ int M, int N,
+ int *S, int **w,
+ int g, int h, int *NC, int *sw_score)
+{
+ register int i, j, op, nc;
+ int itmp;
+ int score;
+ int l_score, mx_l_score; /* running local score (clamped at 0) and its maximum */
+
+ /* print_seq_prof(A,M,w,iw); */
+
+ score = i = j = nc = l_score = mx_l_score = 0;
+#ifdef SHOW_ALIGN_SCORE
+ printf("====start\n");
+ printf("#i j pam2 score l_score mx_l_score\n");
+#endif
+ while (i < M || j < N) {
+ op = *S++;
+ if (op == 0) { /* aligned pair: B is indexed from 1, the profile from iw */
+ itmp = w[iw+i][B[++j]];
+ score += itmp;
+ i++;
+ nc++;
+ l_score += itmp;
+ if (l_score < 0) l_score = 0;
+ if (l_score > mx_l_score) mx_l_score = l_score;
+#ifdef SHOW_ALIGN_SCORE
+ printf("%d\t%d\t%d\t%d\t%d\t%d\n",i, j, itmp, score, l_score, mx_l_score);
+#endif
+ }
+ else if (op > 0) { /* insert: op residues of B with no profile row */
+ score = score - (g+op*h);
+ j += op;
+ nc += op;
+ l_score -= (g+op*h);
+ if (l_score < 0) l_score = 0;
+#ifdef SHOW_ALIGN_SCORE
+ printf("%d\t%d\t%d\t%d\t%d\t%d\n",i, j, -(g+op*h) ,score, l_score, mx_l_score);
+#endif
+ } else { /* delete: -op profile rows with no residue of B (op is negative) */
+ score = score - (g-op*h);
+ i -= op;
+ nc -= op;
+ l_score -= (g-op*h);
+ if (l_score < 0) l_score = 0;
+#ifdef SHOW_ALIGN_SCORE
+ printf("%d\t%d\t%d\t%d\t%d\t%d\n",i, j, -(g-op*h), score, l_score, mx_l_score); /* format fixed: was "\%d", not a valid escape */
+#endif
+ }
+ }
+#ifdef SHOW_ALIGN_SCORE
+ printf("%d\t%d\tend\t%d\t%d\n====\n",i, j, score, mx_l_score);
+#endif
+ *NC = nc;
+ /* used to return mx_l_score, which is wrong when CHECK_SCORE is used for global alignments */
+#ifndef GGSEARCH
+ *sw_score = mx_l_score;
+#else
+ *sw_score = score;
+#endif
+ return score;
+}
+
+
diff --git a/src/work_thr2.c b/src/work_thr2.c
new file mode 100644
index 0000000..f335042
--- /dev/null
+++ b/src/work_thr2.c
@@ -0,0 +1,492 @@
+/* $Id: work_thr2.c $ */
+
+/* copyright (c) 1996, 1997, 1998, 1999, 2014 by William R. Pearson
+ and The Rector & Visitors of the University of Virginia */
+
+/* Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under this License is distributed on an "AS
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ express or implied. See the License for the specific language
+ governing permissions and limitations under the License.
+*/
+
+/* work_thr2.c - threaded worker */
+
+/* modified 21-Oct-1998 to work with reverse complement for DNA */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <sys/types.h>
+#include <signal.h>
+
+#include "defs.h" /* various constants */
+#include "best_stats.h" /* defines beststr */
+#include "structs.h"
+#include "param.h" /* pstruct rstruct */
+#include "thr_buf_structs.h"
+
+/***************************************/
+/* thread global variable declarations */
+/***************************************/
+
+#ifndef PCOMPLIB
+#define XTERNAL
+#include "thr_bufs2.h"
+#undef XTERNAL
+#else
+#include "msg.h"
+#define XTERNAL
+#include "uascii.h"
+#undef XTERNAL
+#ifdef MPI_SRC
+#include "mpi.h"
+#endif
+#endif
+
+void alloc_pam (int, int, struct pstruct *);
+int **alloc_pam2p(int **,int, int);
+void revcomp(unsigned char *seq, int n, int *c_nt);
+
+#if defined(WIN32) || !defined(THR_EXIT)
+void pthread_exit(void *);
+#define THR_EXIT pthread_exit
+#else
+void THR_EXIT(void *);
+#endif
+
+#ifdef DEBUG
+extern struct buf_head *lib_buf2_list;
+#endif
+
+/* functions getting/sending buffers to threads (thr_sub.c) */
+extern void wait_thr(void);
+extern int get_wbuf(struct buf_head **cur_buf, int max_work_buf);
+extern void put_wbuf(struct buf_head *cur_buf, int max_work_buf);
+
+/* dropxx.c functions */
+#include "drop_func.h"
+
+extern void *my_srand();
+extern unsigned int my_nrand(int, void *);
+extern void qshuffle(unsigned char *aa0, int n0, int nm0, void *);
+extern void free_pam2p(int **);
+
+void init_aa0(unsigned char **aa0, int n0, int nm0,
+ unsigned char **aa0s, unsigned char **aa1s,
+ int qframe, int qshuffle_flg, int max_tot,
+ struct pstruct *ppst, void **f_str, void **qf_str,
+ void *my_rand_state);
+
+extern void
+buf_do_work(unsigned char **aa0, int n0, struct buf_head *lib_bhead_p,
+ int max_frame, struct pstruct *ppst, void **f_str);
+extern void
+buf_qshuf_work(unsigned char *aa0s, int n0, struct buf_head *lib_bhead_p,
+ int max_frame, struct pstruct *ppst, void *qf_str, int score_ix);
+extern void
+buf_shuf_work(unsigned char **aa0, int n0, unsigned char *aa1s,
+ struct buf_head *lib_bhead_p, int max_frame, struct pstruct *ppst,
+ void **f_str, int score_ix, void *);
+
+void
+buf_do_align(unsigned char **aa0, int n0,
+ struct buf_head *lib_bhead_p,
+ struct pstruct *ppst, const struct mngmsg *my_msp,
+ void **f_str);
+
+#ifndef PCOMPLIB
+#define FIRSTNODE 0
+void
+work_thread (struct thr_str *work_info) /* threaded build: entry point, one call per worker thread */
+#else
+#if defined(TFAST)
+extern void aainit(int tr_type, int debug);
+#endif
+
+int g_worker;
+
+void work_comp(int my_worker) /* PCOMPLIB build: entry point, my_worker is this worker's number */
+#endif
+{ /* Sets up a per-worker copy of the scoring parameters and query, then loops: get_wbuf() -> score/shuffle/align the buffer -> put_wbuf(); frees its copies when the loop ends. */
+ struct buf_head *cur_buf, *my_cur_buf;
+ char info_lib_range[MAX_FN];
+ unsigned char *aa1s=NULL;
+#ifndef PCOMPLIB
+
+ const struct mngmsg *my_msp;
+ int my_worker;
+#else
+#ifdef MPI_SRC
+ struct mngmsg *my_msp;
+ MPI_Status mpi_status;
+ int buf_alloc_flag = 0;
+#endif
+ struct mngmsg my_msg;
+ int int_msg_b[4];
+ struct buf2_data_s *my_buf2_data;
+ struct buf2_res_s *my_buf2_res;
+ struct buf2_ares_s *my_buf2_ares;
+ struct seq_record *my_seq_buf;
+ unsigned char *my_aa1b_buf;
+#endif
+ int i, j, npam, n0, nm0;
+ int max_work_buf, max_buf2_res, max_chain_seqs, seq_buf_size;
+ void *my_rand_state;
+
+ struct pstruct my_pst, *my_ppst;
+ unsigned char *aa0[6], *aa0s;
+ void *f_str[6], *qf_str;
+
+ my_rand_state=my_srand(); /* per-worker random state, passed to init_aa0() and buf_shuf_work() */
+
+#ifndef PCOMPLIB
+ my_worker = work_info->worker;
+ max_work_buf = work_info->max_work_buf;
+ wait_thr(); /* wait for start_thread predicate to drop to 0 */
+
+ my_msp = work_info->m_msp;
+#else /* PCOMPLIB */
+
+#ifdef DEBUG
+/* fprintf(stderr,"%d: work_comp started\n",my_worker); */
+#endif
+ g_worker = my_worker;
+ my_msp = &my_msg;
+
+#ifdef MPI_SRC
+ pcomp_loop:
+ /* first message: four ints giving the buffer sizes for this search */
+ MPI_Recv(int_msg_b,4,MPI_INT,0, STARTTYPE0,MPI_COMM_WORLD,
+ &mpi_status);
+
+ max_work_buf = int_msg_b[0];
+ max_buf2_res = int_msg_b[1];
+ max_chain_seqs = int_msg_b[2];
+ seq_buf_size = int_msg_b[3];
+
+ /* quit the main loop with a message of 0 max_work_buf */
+ if (max_work_buf == 0) { goto pcomp_final;}
+
+ MPI_Recv((void *)my_msp,sizeof(struct mngmsg),MPI_BYTE,0,STARTTYPE1,MPI_COMM_WORLD,
+ &mpi_status);
+
+ MPI_Recv((void *)&my_pst,(int)sizeof(struct pstruct),MPI_BYTE,0,STARTTYPE2,MPI_COMM_WORLD,
+ &mpi_status);
+ my_ppst = &my_pst;
+
+#endif /* MPI_SRC */
+
+ if (!buf_alloc_flag) { /* first pass only: allocate this worker's private buffers */
+ buf_alloc_flag = 1;
+ /* must allocate buffers for data, sequences, results */
+ if ((my_cur_buf = cur_buf = (struct buf_head *)calloc(1,sizeof(struct buf_head)))==NULL) {
+ fprintf(stderr,"cannot allocate buf_head\n");
+ exit(1);
+ }
+
+ /* allocate results array */
+ if ((my_buf2_res = (struct buf2_res_s*)calloc(max_buf2_res+1,sizeof(struct buf2_res_s)))==NULL) {
+ fprintf(stderr,"cannot allocate buf2_res[%d]\n",max_buf2_res);
+ exit(1);
+ }
+ cur_buf->buf2_res = my_buf2_res;
+
+ /* allocate buffers for ares alignment encodings */
+ if ((my_buf2_ares = (struct buf2_ares_s*)calloc(max_buf2_res+1,sizeof(struct buf2_ares_s)))==NULL) {
+ fprintf(stderr,"cannot allocate buf2_ares[%d]\n",max_buf2_res);
+ exit(1);
+ }
+ cur_buf->buf2_ares = my_buf2_ares;
+
+ /* allocate buffers for data */
+ if ((my_buf2_data = (struct buf2_data_s*)calloc(max_buf2_res+1,sizeof(struct buf2_data_s)))==NULL) {
+ fprintf(stderr,"cannot allocate buf2_data[%d]\n",max_buf2_res);
+ exit(1);
+ }
+ cur_buf->buf2_data = my_buf2_data;
+
+ /* also must allocate seq_records */
+ if ((my_seq_buf =
+ (struct seq_record *)calloc((size_t)(max_buf2_res+1), sizeof(struct seq_record)))
+ ==NULL) {
+ fprintf(stderr,"%d: cannot allocate seq_record buffer[%d]\n",my_worker,max_buf2_res+1);
+ exit(1);
+ }
+ cur_buf->buf2_data[0].seq = cur_buf->hdr.seq_b = my_seq_buf;
+
+ if ((my_aa1b_buf = (unsigned char *)calloc((size_t)(seq_buf_size+1),sizeof(unsigned char)))
+ ==NULL) {
+ fprintf(stderr,"%d: cannot allocate sequence buffer[%d]\n",my_worker, seq_buf_size);
+ exit(1);
+ }
+ else { /* now associate the my_aa1b_buf with cur_buf */
+ my_aa1b_buf++; /* skip the first byte of the allocation */
+ cur_buf->hdr.aa1b_start = cur_buf->buf2_data[0].seq->aa1b = my_aa1b_buf;
+ cur_buf->hdr.aa1b_size = seq_buf_size;
+ }
+ }
+ else { /* later passes: re-attach the buffers allocated on the first pass */
+ cur_buf = my_cur_buf;
+ cur_buf->buf2_data = my_buf2_data;
+ cur_buf->buf2_data[0].seq = cur_buf->hdr.seq_b = my_seq_buf;
+ cur_buf->buf2_res = my_buf2_res;
+ cur_buf->buf2_ares = my_buf2_ares;
+ cur_buf->hdr.aa1b_start = cur_buf->buf2_data[0].seq->aa1b = my_aa1b_buf;
+ cur_buf->hdr.aa1b_size = seq_buf_size;
+ }
+
+#if defined(TFAST)
+ /* set up translation tables: faatran.c */
+ aainit(my_ppst->tr_type,my_ppst->debug_lib);
+#endif
+
+#endif /* PCOMPLIB */
+
+ /* the pam allocation stuff is very different for threaded vs PCOMPLIB,
+ so the code is separate */
+#if !defined(PCOMPLIB)
+ /* make certain that all but 0 have their own copy of pst */
+ if (my_worker== 0) {
+ my_ppst=work_info->ppst;
+ }
+ else {
+ my_ppst = &my_pst;
+ memcpy(my_ppst,work_info->ppst,sizeof(struct pstruct));
+ /* #else we already have the stuff in my_pst from initialization */
+
+ my_ppst->pam2p[0] = my_ppst->pam2p[1] = NULL;
+
+ alloc_pam(MAXSQ, MAXSQ, my_ppst);
+
+ npam = my_pst.nsqx; /* npam is only set on this (worker != 0) path */
+
+ /* allocate local copy of pam2[][] */
+ for (i=0; i<npam; i++) {
+ for (j=0; j<npam; j++) {
+ my_pst.pam2[0][i][j] = work_info->ppst->pam2[0][i][j];
+ my_pst.pam2[1][i][j] = work_info->ppst->pam2[1][i][j];
+ }
+ }
+ }
+#endif
+#if defined(PCOMPLIB) /* PCOMPLIB */
+ my_ppst = &my_pst; /* for all workers */
+ alloc_pam(my_msg.pamd1,my_msg.pamd2,my_ppst);
+#ifdef MPI_SRC
+ MPI_Recv(&my_pst.pam2[0][0][0],my_msg.pamd1*my_msg.pamd2,MPI_INT,0,
+ STARTTYPE3, MPI_COMM_WORLD,&mpi_status);
+
+ MPI_Recv(&my_pst.pam2[1][0][0],my_msg.pamd1*my_msg.pamd2,MPI_INT,0,
+ STARTTYPE3, MPI_COMM_WORLD,&mpi_status);
+ /* no code for profiles */
+
+ /* get pascii (only for fasty/tfasty */
+ pascii = aascii;
+ MPI_Recv(pascii, sizeof(aascii), MPI_BYTE, 0, STARTTYPE4, MPI_COMM_WORLD, &mpi_status);
+#endif
+#endif
+
+ /* fill in info_lib_range */
+ if (my_worker == FIRSTNODE) {
+ /* label library size limits */
+ if (my_ppst->n1_low > 0 && my_ppst->n1_high < BIGNUM) {
+ sprintf(info_lib_range," (range: %d-%d)",my_ppst->n1_low,my_ppst->n1_high);}
+ else if (my_ppst->n1_low > 0) {
+ sprintf(info_lib_range," (range: >%d)",my_ppst->n1_low);}
+ else if (my_ppst->n1_high < BIGNUM) {
+ sprintf(info_lib_range," (range: <%d)",my_ppst->n1_high);}
+ else {
+ info_lib_range[0]='\0';
+ }
+ info_lib_range[sizeof(info_lib_range)-1]='\0';
+#ifndef PCOMPLIB
+ strncpy(work_info->info_lib_range,info_lib_range,MAX_SSTR); /* NOTE(review): strncpy() does not terminate the destination if the source is MAX_SSTR chars or longer */
+ /* this does not work on some architectures */
+ work_info->f_str_ap = &f_str[0];
+#endif
+ }
+
+#ifdef PCOMPLIB
+#ifdef MPI_SRC
+ /* send back sync message */
+ int_msg_b[0]=my_worker;
+ MPI_Send(int_msg_b,1,MPI_INT,0,MSEQTYPE0,MPI_COMM_WORLD);
+ if (my_worker == FIRSTNODE) {
+ MPI_Send(info_lib_range,MAX_FN,MPI_BYTE,0,MSEQTYPE0,MPI_COMM_WORLD);
+ }
+#endif
+#endif
+
+ /* do the aa0[] stuff after m_msg/my_pst are initialized, for later
+ inclusion in a loop */
+
+#ifdef PCOMPLIB
+#ifdef MPI_SRC
+ MPI_Recv(int_msg_b,2,MPI_INT,0,
+ QSEQTYPE0, MPI_COMM_WORLD, &mpi_status);
+
+ n0 = int_msg_b[0];
+ nm0 = int_msg_b[1];
+#endif
+#else /* COMP_THR */
+ n0 = my_msp->n0;
+ nm0 = my_msp->nm0;
+ if (my_worker != FIRSTNODE) {
+ /* if this is a pssm search, allocate local copy of pam2p[][]*/
+ if (work_info->ppst->pam_pssm && work_info->ppst->pam2p[0]) {
+ my_ppst->pam2p[0] = alloc_pam2p(my_ppst->pam2p[0],n0,npam);
+ my_ppst->pam2p[1] = alloc_pam2p(my_ppst->pam2p[1],n0,npam);
+
+ for (i=0; i<n0; i++) {
+ for (j=0; j < npam; j++) {
+ my_pst.pam2p[0][i][j] = work_info->ppst->pam2p[0][i][j];
+ my_pst.pam2p[1][i][j] = work_info->ppst->pam2p[1][i][j];
+ }
+ }
+ }
+ }
+#endif
+ /* private copy of the query, with a leading '\0' byte in front of aa0[0] */
+ if ((aa0[0]=(unsigned char *)calloc((size_t)n0+2+SEQ_PAD,sizeof(unsigned char)))
+ ==NULL) {
+ fprintf(stderr," cannot allocate aa00[%d] for worker %d\n",
+ n0, my_worker);
+ exit(1);
+ }
+ *aa0[0]='\0';
+ aa0[0]++; /* undone by the free(aa0[0]-1) in the clean-up section */
+
+#ifndef PCOMPLIB
+ memcpy(aa0[0],work_info->aa0,n0+1);
+#else
+#ifdef MPI_SRC
+ /* get aa0[0] from host */
+ MPI_Recv(aa0[0],n0+1,MPI_BYTE,0,
+ QSEQTYPE1,MPI_COMM_WORLD, &mpi_status);
+
+ /* also get annotation if available */
+ if (my_msp->ann_flg && my_msp->aa0a != NULL) {
+ if ((my_msp->aa0a = (unsigned char *)calloc(my_msp->n0+2,sizeof(char)))==NULL) {
+ fprintf(stderr, "*** error -- cannot allocate annotation array\n");
+ exit(1);
+ }
+ MPI_Recv(my_msp->aa0a, (my_msp->n0+2)*sizeof(char), MPI_BYTE, 0,
+ QSEQTYPE1, MPI_COMM_WORLD, &mpi_status);
+ }
+#endif
+#endif
+
+ init_aa0(aa0, n0, nm0, &aa0s, &aa1s,
+ my_msp->qframe, my_msp->qshuffle, my_msp->max_tot,
+ my_ppst, &f_str[0], &qf_str, my_rand_state);
+
+/* **************************************************************** */
+/* main work loop */
+
+ while (get_wbuf(&cur_buf,max_work_buf)) {
+
+ if (cur_buf->hdr.stop_work) break;
+
+ /* exit thread on specific command -- this option is not used
+ for threads - get_wbuf() stops when rbuf_done() sets reader_done==1
+ but it is used for PCOMPLIB
+ */
+
+ if (cur_buf->hdr.buf2_cnt <= 0) { /* buffers can be empty */
+ cur_buf->hdr.have_results = 0;
+ goto res_done;
+ }
+ /* buf2_type is a bit mask: each BUF2_DO* bit selects one kind of work on this buffer */
+ if (cur_buf->hdr.buf2_type & BUF2_DOWORK) {
+
+ buf_do_work(aa0, n0, cur_buf, my_msp->nitt1, my_ppst, f_str);
+
+ if (my_msp->qshuffle) {
+ buf_qshuf_work(aa0s, n0, cur_buf, my_msp->nitt1,
+ my_ppst, qf_str, my_ppst->score_ix);
+ }
+ }
+
+ if (cur_buf->hdr.buf2_type & BUF2_DOSHUF) {
+ buf_shuf_work(aa0, n0, aa1s, cur_buf, my_msp->nitt1,
+ my_ppst, f_str, my_ppst->score_ix, my_rand_state);
+ }
+
+ /*
+ if (cur_buf->hdr.buf2_type & BUF2_DOOPT) {
+ buf_do_opt(aa0, n0, cur_buf, my_ppst, f_str);
+ }
+ */
+
+ if (cur_buf->hdr.buf2_type & BUF2_DOALIGN) {
+ buf_do_align(aa0, n0, cur_buf, my_ppst, my_msp, f_str);
+ }
+ cur_buf->hdr.have_results = 1;
+
+ res_done:
+ cur_buf->hdr.have_data = 0;
+ /* hand the buffer back */
+ put_wbuf(cur_buf,max_work_buf);
+
+ } /* end main while */
+
+/* **************************************************************** */
+/* all done - clean-up */
+
+ close_work(aa0[0], n0, my_ppst, &f_str[0]);
+ free(aa0[0]-1); /* aa0[0] was advanced by one after calloc() */
+ if (my_msp->qframe == 2) {
+ close_work(aa0[1], n0, my_ppst, &f_str[1]);
+ free(aa0[1]-1);
+ }
+
+ if (my_msp->qshuffle) {
+ close_work(aa0s, n0, my_ppst, &qf_str);
+ free(aa0s-1);
+ }
+
+ free(aa1s-1); /* presumably init_aa0() returns aa1s offset by one as well - confirm */
+
+#ifdef PCOMPLIB
+ if (my_msp->ann_flg && my_msp->aa0a) { free(my_msp->aa0a);}
+#endif
+
+ if (my_worker) { /* worker 0 of the threaded build uses the shared pstruct, so it frees nothing here */
+ free(my_pst.pam2[1][0]);
+ free(my_pst.pam2[0][0]);
+ free(my_pst.pam2[1]);
+ free(my_pst.pam2[0]);
+ }
+
+ if (my_worker && my_pst.pam_pssm) {
+ free_pam2p(my_pst.pam2p[0]);
+ free_pam2p(my_pst.pam2p[1]);
+ }
+
+/* **************************************************************** */
+/* and exit */
+
+#ifdef DEBUG
+ /* fprintf(stderr,"worker [%d] done\n",my_worker); */
+#endif
+
+#ifndef PCOMPLIB
+ free(my_rand_state);
+ THR_EXIT(&work_info->status);
+#else
+ /* the PCOMPLIB version loops after a search, waiting for another max_work_buf */
+ /* max_work_buf==0 signals end of queries */
+ goto pcomp_loop;
+
+ pcomp_final:
+ free(my_rand_state);
+#endif
+} /* end work_thread */
diff --git a/test/results/README b/test/results/README
new file mode 100644
index 0000000..e8131ca
--- /dev/null
+++ b/test/results/README
@@ -0,0 +1 @@
+Placeholder file to create test/results/ directory.
diff --git a/test/test.bat b/test/test.bat
new file mode 100644
index 0000000..61fc390
--- /dev/null
+++ b/test/test.bat
@@ -0,0 +1,73 @@
+rem "test.bat: runs the FASTA36 programs found in ..\bin on the sequences in ..\seq and writes each output under results\"
+rem "starting fasta36_t - protein on win32"
+rem ""
+..\bin\fasta36_t -q -m 6 -Z 100000 ..\seq\mgstm1.aa:1-100 q > results\test_m1.ok2_t.html
+..\bin\fasta36_t -S -q -z 11 -O results\test_m1.ok2_t_p25 -s P250 ..\seq\mgstm1.aa:100-218 q
+rem "done"
+rem "starting fastxy36_t"
+..\bin\fastx36_t -m 9c -S -q ..\seq\mgtt2_x.seq q 1 > results\test_t2.xk1_t
+..\bin\fasty36_t -S -q ..\seq\mgtt2_x.seq q > results\test_t2.yk2_t
+..\bin\fastx36_t -m 9c -S -q -z 2 ..\seq\mgstm1.esq a > results\test_m1.xk2_tz2
+..\bin\fasty36_t -S -q -z 2 ..\seq\mgstm1.esq a > results\test_m1.yk2_tz2
+rem "done"
+rem "starting fastxy36_t rev"
+..\bin\fastx36_t -m 9c -q -m 5 ..\seq\mgstm1.rev q > results\test_m1.xk2r_t
+..\bin\fasty36_t -q -m 5 -M 200-300 -z 2 ..\seq\mgstm1.rev q > results\test_m1.yk2r_tz2
+..\bin\fasty36_t -q -m 5 -z 11 ..\seq\mgstm1.rev q > results\test_m1.yk2rz11_t
+rem "done"
+rem "starting ssearch36_t"
+..\bin\ssearch36_t -m 9c -S -z 3 -q ..\seq\mgstm1.aa q > results\test_m1.ss_tz3
+..\bin\ssearch36_t -q -M 200-300 -z 2 -Z 100000 -s P250 ..\seq\mgstm1.aa q > results\test_m1.ss_t_p25
+rem "done"
+rem "starting prss/prfx36"
+..\bin\ssearch36_t -q -k 1000 -A ..\seq\mgstm1.aa ..\seq\xurt8c.aa > results\test_m1.rss
+..\bin\fastx36_t -q -k 1000 -A ..\seq\mgstm1.esq ..\seq\xurt8c.aa > results\test_m1.rfx
+rem "done"
+rem "starting fasta36_t - DNA"
+..\bin\fasta36_t -S -q -z 2 ..\seq\mgstm1.seq %M 4 > results\test_m1.ok4_tz2
+..\bin\fasta36_t -S -q ..\seq\mgstm1.rev %M 4 > results\test_m1.ok4r_t
+rem "done"
+rem "starting tfastxy36_t"
+..\bin\tfastx36_t -m 9c -q -i -3 -m 6 ..\seq\mgstm1.aa %m > results\test_m1.tx2_t.html
+..\bin\tfasty36_t -q -i -3 -N 5000 ..\seq\mgstm1.aa %m > results\test_m1.ty2_t
+rem "done"
+rem "starting fastf36_t"
+..\bin\fastf36_t -q ..\seq\m1r.aa q > results\test_mf.ff_t
+..\bin\fastf36 -q ..\seq\m1r.aa q > results\test_mf.ff_s
+rem "done"
+rem "starting tfastf36_t"
+..\bin\tfastf36_t -q ..\seq\m1r.aa %m > results\test_mf.tf_tr
+rem "done"
+rem "starting fasts36_t"
+..\bin\fasts36_t -q -V '*?@' ..\seq\ngts.aa q > results\test_m1.fs1_t
+..\bin\fasts36_t -q ..\seq\ngt.aa q > results\test_m1.fs_t
+..\bin\fasts36_t -q -n ..\seq\mgstm1.nts m > results\test_m1.nfs_t
+rem "done"
+rem "starting tfasts36_t"
+..\bin\tfasts36_t -q ..\seq\n0.aa %m > results\test_m1.ts_r
+rem "done"
+rem "starting fasta36 - protein"
+..\bin\fasta36 -q -z 2 ..\seq\mgstm1.aa q 1 > results\test_m1.ok1z2
+..\bin\fasta36 -q -s P250 ..\seq\mgstm1.aa q > results\test_m1.ok2_p25
+rem "done"
+rem "starting fastx3"
+..\bin\fastx36 -m 9c -q ..\seq\mgstm1.esq q > results\test_m1.ok2x
+rem "done"
+rem "starting fasty3"
+..\bin\fasty36 -q ..\seq\mgstm1.esq q > results\test_m1.ok2y
+rem "done"
+rem "starting fasta36 - DNA "
+..\bin\fasta36 -m 9c -q ..\seq\mgstm1.seq M 4 > results\test_m1.ok4
+rem "done"
+rem "starting ssearch3"
+..\bin\ssearch36 -S -q -z 2 ..\seq\mgstm1.aa q > results\test_m1.ss_z2
+..\bin\ssearch36 -q -s P250 ..\seq\mgstm1.aa q > results\test_m1.ss_p25
+rem "done"
+rem "starting tfastxy3"
+..\bin\tfastx36 -q ..\seq\mgstm1.aa M > results\test_m1.tx2
+..\bin\tfasty36 -m 9c -q ..\seq\mgstm1.aa M > results\test_m1.ty2
+rem "done"
+rem "starting fasts36"
+..\bin\fasts36 -q -V '@?*' ..\seq\ngts.aa q > results\test_m1.fs1
+..\bin\fasts36 -q ..\seq\ngt.aa q > results\test_m1.fs
+rem "done"
diff --git a/test/test.sh b/test/test.sh
new file mode 100755
index 0000000..15fd49d
--- /dev/null
+++ b/test/test.sh
@@ -0,0 +1,79 @@
+#!/bin/sh
+echo ""
+echo "STARTING FASTA36" $(date) "on" $(hostname)
+echo $(uname -a)
+echo ""
+if test ! -d results; then # make sure the output directory exists
+ mkdir results
+fi
+echo "starting fasta36 - protein" $(date)
+../bin/fasta36 -q -m 6 -Z 100000 ../seq/mgstm1.aa:1-100 q > results/test_m1.ok2.html
+../bin/fasta36 -S -q -z 11 -O results/test_m1.ok2_p25 -s P250 ../seq/mgstm1.aa:100-218 q
+echo "done"
+echo "starting fastxy36" $(date)
+../bin/fastx36 -m 9c -S -q ../seq/mgtt2_x.seq q 1 > results/test_t2.xk1
+../bin/fasty36 -S -q ../seq/mgtt2_x.seq q > results/test_t2.yk2
+../bin/fastx36 -m 9c -S -q -z 2 ../seq/mgstm1.esq a > results/test_m1.xk2z2
+../bin/fasty36 -S -q -z 2 ../seq/mgstm1.esq a > results/test_m1.yk2z2
+echo "done"
+echo "starting fastxy36 rev" $(date)
+../bin/fastx36 -m 9c -q -m 5 ../seq/mgstm1.rev q > results/test_m1.xk2r
+../bin/fasty36 -q -m 5 -M 200-300 -z 2 ../seq/mgstm1.rev q > results/test_m1.yk2rz2
+../bin/fasty36 -q -m 5 -z 11 ../seq/mgstm1.rev q > results/test_m1.yk2rz11
+echo "done"
+echo "starting ssearch36" $(date)
+../bin/ssearch36 -m 9c -S -z 3 -q ../seq/mgstm1.aa q > results/test_m1.ssz3
+../bin/ssearch36 -q -M 200-300 -z 2 -Z 100000 -s P250 ../seq/mgstm1.aa q > results/test_m1.ss_p25
+echo "done"
+if test -e ../bin/ssearch36s; then # these runs happen only when the ssearch36s binary exists
+ echo "starting ssearch36s" $(date)
+ ../bin/ssearch36s -m 9c -S -z 3 -q ../seq/mgstm1.aa q > results/test_m1.sssz3
+ ../bin/ssearch36s -q -M 200-300 -z 2 -Z 100000 -s P250 ../seq/mgstm1.aa q > results/test_m1.sss_p25
+ echo "done"
+fi
+echo "starting prss36(ssearch/fastx)" $(date)
+../bin/ssearch36 -q -k 1000 -a ../seq/mgstm1.aa ../seq/xurt8c.aa > results/test_m1.rss
+../bin/fastx36 -q -k 1000 ../seq/mgstm1.esq ../seq/xurt8c.aa > results/test_m1.rfx
+echo "done"
+echo "starting ggsearch36/glsearch36" $(date)
+../bin/ggsearch36 -q -m 9i -w 80 ../seq/hahu.aa q > results/test_h1.gg
+../bin/glsearch36 -q -m 9i -w 80 ../seq/hahu.aa q > results/test_h1.gl
+../bin/ggsearch36 -q ../seq/gtt1_drome.aa q > results/test_t1.gg
+../bin/glsearch36 -q ../seq/gtt1_drome.aa q > results/test_t1.gl
+echo "done"
+echo "starting fasta36 - DNA" $(date)
+../bin/fasta36 -S -q ../seq/mgstm1.nt %RMB 4 > results/test_m1.ok4
+../bin/fasta36 -S -q ../seq/mgstm1.rev %RMB 4 > results/test_m1.ok4r
+echo "done"
+#echo "starting tfasta36" `date`
+#tfasta36 -q ../seq/mgstm1.aa %RMB > results/test_m1.tk2
+#echo "done"
+echo "starting tfastxy36" $(date)
+../bin/tfastx36 -m 9c -q -i -3 -m 6 ../seq/mgstm1.aa %p > results/test_m1.tx2.html
+../bin/tfasty36 -q -i -3 -N 5000 ../seq/mgstm1.aa %p > results/test_m1.ty2
+echo "done"
+echo "starting fastf36" $(date)
+../bin/fastf36 -q ../seq/m1r.aa q > results/test_mf.ff
+../bin/fastf36 -q ../seq/m1r.aa q > results/test_mf.ff_s
+echo "done"
+echo "starting tfastf36" $(date)
+../bin/tfastf36 -q ../seq/m1r.aa %r > results/test_mf.tfr
+echo "done"
+echo "starting fasts36" $(date)
+../bin/fasts36 -q -V '*?@' ../seq/ngts.aa q > results/test_m1.fs1
+../bin/fasts36 -q ../seq/ngt.aa q > results/test_m1.fs
+../bin/fasts36 -q -n ../seq/mgstm1.nts m > results/test_m1.nfs
+echo "starting fastm36" $(date)
+../bin/fastm36 -q ../seq/ngts.aa q > results/test_m1.fm
+../bin/fastm36 -q -n ../seq/mgstm1.nts m > results/test_m1.nfm
+echo "done"
+echo "starting tfasts36" $(date)
+../bin/tfasts36 -q ../seq/n0.aa %r > results/test_m1.ts_r
+echo "starting lalign36" $(date)
+../bin/lalign36 -k 1000 -q ../seq/mchu.aa ../seq/mchu.aa > results/test_mc.lal
+../bin/lalign36 -z 3 -q ../seq/mchu.aa ../seq/mchu.aa > results/test_mc.lal_z3
+../bin/lalign36 -s BL62 -f -11 -g -1 -q ../seq/mchu.aa ../seq/mchu.aa > results/test_mc.lal_bl62
+../bin/lalign36 -k 1000 -q ../seq/mwkw.aa ../seq/mwkw.aa > results/test_mw.lal
+../bin/lalign36 -z 3 -q ../seq/mwkw.aa ../seq/mwkw.aa > results/test_mw.lal_z3
+../bin/lalign36 -s BL62 -f -11 -g -1 -q ../seq/mwkw.aa ../seq/mwkw.aa > results/test_mw.lal_bl62
+echo "FINISHED" $(date)
diff --git a/test/test2.sh b/test/test2.sh
new file mode 100755
index 0000000..dc499b6
--- /dev/null
+++ b/test/test2.sh
@@ -0,0 +1,53 @@
+#!/bin/sh
+echo ""
+echo "starting FASTA36" $(date) "on" $(hostname)
+echo $(uname -a)
+echo ""
+echo "starting fasta36 - protein" $(date)
+if test ! -d results; then # make sure the output directory exists
+ mkdir results
+fi
+../bin/fasta36 -S -z 21 -s BP62 -c O ../seq/mgstm1.aa q > results/test2o_m1.ok2_bp62
+../bin/fasta36 -S -z 21 -s BP62 ../seq/mgstm1.aa q > results/test2_m1.ok2_bp62
+../bin/fasta36 -S -z 21 -c O ../seq/mgstm1.aa q > results/test2o_m1.ok2_z21
+../bin/fasta36 -S -z 21 ../seq/mgstm1.aa q > results/test2_m1.ok2_z21
+../bin/fasta36 -S -m BB ../seq/mgstm1.aa q > results/test2_m1.ok2mB
+../bin/fasta36 -S -m 8CC ../seq/mgstm1.aa q > results/test2_m1.ok2m8CC
+../bin/fasta36 -S -m 8CB ../seq/mgstm1.aa q > results/test2_m1.ok2m8CB
+echo "done"
+echo "starting fastxy36" $(date)
+../bin/fastx36 -m 9c -S -c O -q ../seq/mgtt2_x.seq q > results/test2o_t2.xk2m9c
+../bin/fastx36 -m 8C -S -q ../seq/mgtt2_x.seq q > results/test2_t2.xk2m8C
+../bin/fastx36 -m 8CB -S -q ../seq/mgtt2_x.seq q > results/test2_t2.xk2m8CB
+../bin/fastx36 -m BB -S -q -H ../seq/mgtt2_x.seq q > results/test2_t2.xk2mB
+../bin/fasty36 -S -c O -q ../seq/mgtt2_x.seq q > results/test2o_t2.yk2
+../bin/fasty36 -S -c O -q ../seq/mgtt2_x.seq q > results/test2_t2.yk2
+../bin/fasty36 -S -m8 -q ../seq/mgtt2_x.seq q > results/test2_t2.yk2m8
+../bin/fastx36 -m 9c -c O -S -q -z 22 ../seq/mgstm1.esq q > results/test2o_m1.xk2m9cz22
+../bin/fastx36 -m 8C -S -q ../seq/mgstm1.esq q > results/test2_m1.xk2m8Cz22
+../bin/fastx36 -m 8CB -S -q ../seq/mgstm1.esq q > results/test2_m1.xk2m8CBz22
+../bin/fasty36 -S -c O -q -z 21 ../seq/mgstm1.esq q > results/test2o_m1.yk2z21
+../bin/fasty36 -S -q -z 21 ../seq/mgstm1.esq q > results/test2_m1.yk2z21
+echo "done"
+echo "starting ssearch36" $(date)
+../bin/ssearch36 -m 9c -S -z 22 -q ../seq/mgstm1.aa q > results/test2_m1.ssm9cz22
+../bin/ssearch36 -m 9C -S -z 21 -q ../seq/mgstm1.aa q > results/test2_m1.ssm9Cz21
+../bin/ssearch36 -m 8C -S -q ../seq/mgstm1.aa q > results/test2_m1.ssm8C
+../bin/ssearch36 -m 8CB -S -q ../seq/mgstm1.aa q > results/test2_m1.ssm8CB
+echo "done"
+echo "starting fasta36 - DNA" $(date)
+../bin/fasta36 -S -q -c O -r "+2/-4" ../seq/mgstm1.nt %RMB 4 > results/test2o_m1.ok4z1r24
+../bin/fasta36 -S -q -r "+2/-4" ../seq/mgstm1.nt %RMB 4 > results/test2_m1.ok4z1r24
+../bin/fasta36 -S -q -c O ../seq/mgstm1.rev %RMB 4 > results/test2o_m1.ok4r
+../bin/fasta36 -S -q ../seq/mgstm1.rev %RMB 4 > results/test2_m1.ok4r
+../bin/fasta36 -m BB -q ../seq/mgstm1.rev %RMB > results/test2_m1.ok6mB
+../bin/fasta36 -m 8C -q ../seq/mgstm1.rev %RMB > results/test2_m1.ok6m8C
+../bin/fasta36 -m 8CB -q ../seq/mgstm1.rev %RMB > results/test2_m1.ok6m8CB
+echo "done"
+echo "starting tfastxy36" $(date)
+../bin/tfastx36 -c O -m 9c -q -i -3 ../seq/mgstm1.aa %p > results/test2o_m1.tx2
+../bin/tfastx36 -m 8C -q -i -3 ../seq/mgstm1.aa %p > results/test2_m1.tx2_m8C
+../bin/tfastx36 -m 8CB -q -i -3 ../seq/mgstm1.aa %p > results/test2_m1.tx2_m8CB
+../bin/tfasty36 -c O -q -i -3 -N 5000 ../seq/mgstm1.aa %p > results/test2o_m1.ty2
+../bin/tfasty36 -q -i -3 -N 5000 ../seq/mgstm1.aa %p > results/test2_m1.ty2
+echo "done" $(date)
diff --git a/test/test2G.sh b/test/test2G.sh
new file mode 100755
index 0000000..64b6e9a
--- /dev/null
+++ b/test/test2G.sh
@@ -0,0 +1,79 @@
+#!/bin/sh
+# Runs the FASTA36 programs with -XM2G on query files from ../seq; one output file per run under results/.
+bindir=../bin seqdir=../seq outdir=results
+echo ""
+echo "STARTING FASTA36" $(date) "on" $(hostname)
+echo $(uname -a)
+echo ""
+[ -d "$outdir" ] || mkdir "$outdir"
+echo "starting fasta36 - protein" $(date)
+"$bindir/fasta36" -XM2G -q -m 6 -Z 100000 "$seqdir/mgstm1.aa:1-100" q > "$outdir/test2G_m1.ok2.html"
+"$bindir/fasta36" -q -XM2G -S -z 11 -O "$outdir/test2G_m1.ok2_p25" -s P250 "$seqdir/mgstm1.aa:100-218" q
+echo "done"
+echo "starting fastxy36" $(date)
+"$bindir/fastx36" -q -XM2G -m 9c -S "$seqdir/mgtt2_x.seq" q 1 > "$outdir/test2G_t2.xk1"
+"$bindir/fasty36" -q -XM2G -S "$seqdir/mgtt2_x.seq" q > "$outdir/test2G_t2.yk2"
+"$bindir/fastx36" -q -XM2G -m 9c -S -z 2 "$seqdir/mgstm1.esq" a > "$outdir/test2G_m1.xk2z2"
+"$bindir/fasty36" -q -XM2G -S -z 2 "$seqdir/mgstm1.esq" a > "$outdir/test2G_m1.yk2z2"
+echo "done"
+echo "starting fastxy36 rev" $(date)
+"$bindir/fastx36" -q -XM2G -m 9c -m 5 "$seqdir/mgstm1.rev" q > "$outdir/test2G_m1.xk2r"
+"$bindir/fasty36" -q -XM2G -m 5 -M 200-300 -z 2 "$seqdir/mgstm1.rev" q > "$outdir/test2G_m1.yk2rz2"
+"$bindir/fasty36" -q -XM2G -m 5 -z 11 "$seqdir/mgstm1.rev" q > "$outdir/test2G_m1.yk2rz11"
+echo "done"
+echo "starting ssearch36" $(date)
+"$bindir/ssearch36" -q -XM2G -m 9c -S -z 3 "$seqdir/mgstm1.aa" q > "$outdir/test2G_m1.ssz3"
+"$bindir/ssearch36" -q -XM2G -M 200-300 -z 2 -Z 100000 -s P250 "$seqdir/mgstm1.aa" q > "$outdir/test2G_m1.ss_p25"
+echo "done"
+if [ -e "$bindir/ssearch36s" ]; then
+ echo "starting ssearch36s" $(date)
+ "$bindir/ssearch36s" -q -XM2G -m 9c -S -z 3 "$seqdir/mgstm1.aa" q > "$outdir/test2G_m1.sssz3"
+ "$bindir/ssearch36s" -q -XM2G -M 200-300 -z 2 -Z 100000 -s P250 "$seqdir/mgstm1.aa" q > "$outdir/test2G_m1.sss_p25"
+ echo "done"
+fi
+echo "starting prss36(ssearch/fastx)" $(date)
+"$bindir/ssearch36" -q -XM2G -k 1000 -a "$seqdir/mgstm1.aa" "$seqdir/xurt8c.aa" > "$outdir/test2G_m1.rss"
+"$bindir/fastx36" -q -XM2G -k 1000 "$seqdir/mgstm1.esq" "$seqdir/xurt8c.aa" > "$outdir/test2G_m1.rfx"
+echo "done"
+echo "starting ggsearch36/glsearch36" $(date)
+"$bindir/ggsearch36" -q -XM2G -m 9i -w 80 "$seqdir/hahu.aa" q > "$outdir/test2G_h1.gg"
+"$bindir/glsearch36" -q -XM2G -m 9i -w 80 "$seqdir/hahu.aa" q > "$outdir/test2G_h1.gl"
+"$bindir/ggsearch36" -q -XM2G "$seqdir/gtt1_drome.aa" q > "$outdir/test2G_t1.gg"
+"$bindir/glsearch36" -q -XM2G "$seqdir/gtt1_drome.aa" q > "$outdir/test2G_t1.gl"
+echo "done"
+echo "starting fasta36 - DNA" $(date)
+"$bindir/fasta36" -S -q -XM2G "$seqdir/mgstm1.nt" %RMB 4 > "$outdir/test2G_m1.ok4"
+"$bindir/fasta36" -S -q -XM2G "$seqdir/mgstm1.rev" %RMB 4 > "$outdir/test2G_m1.ok4r"
+echo "done"
+#echo "starting tfasta36" `date`
+#tfasta36 -q -XM2G ../seq/mgstm1.aa %RMB > results/test2G_m1.tk2
+#echo "done"
+echo "starting tfastxy36" $(date)
+"$bindir/tfastx36" -m 9c -q -XM2G -i -3 -m 6 "$seqdir/mgstm1.aa" %p > "$outdir/test2G_m1.tx2.html"
+"$bindir/tfasty36" -q -XM2G -i -3 -N 5000 "$seqdir/mgstm1.aa" %p > "$outdir/test2G_m1.ty2"
+echo "done"
+echo "starting fastf36" $(date)
+"$bindir/fastf36" -q -XM2G "$seqdir/m1r.aa" q > "$outdir/test2G_mf.ff"
+"$bindir/fastf36" -q -XM2G "$seqdir/m1r.aa" q > "$outdir/test2G_mf.ff_s"
+echo "done"
+echo "starting tfastf36" $(date)
+"$bindir/tfastf36" -q -XM2G "$seqdir/m1r.aa" %r > "$outdir/test2G_mf.tfr"
+echo "done"
+echo "starting fasts36" $(date)
+"$bindir/fasts36" -q -XM2G -V '*?@' "$seqdir/ngts.aa" q > "$outdir/test2G_m1.fs1"
+"$bindir/fasts36" -q -XM2G "$seqdir/ngt.aa" q > "$outdir/test2G_m1.fs"
+"$bindir/fasts36" -q -XM2G -n "$seqdir/mgstm1.nts" m > "$outdir/test2G_m1.nfs"
+echo "starting fastm36" $(date)
+"$bindir/fastm36" -q -XM2G "$seqdir/ngts.aa" q > "$outdir/test2G_m1.fm"
+"$bindir/fastm36" -q -XM2G -n "$seqdir/mgstm1.nts" m > "$outdir/test2G_m1.nfm"
+echo "done"
+echo "starting tfasts36" $(date)
+"$bindir/tfasts36" -q -XM2G "$seqdir/n0.aa" %r > "$outdir/test2G_m1.ts_r"
+echo "starting lalign36" $(date)
+"$bindir/lalign36" -q -XM2G -k 1000 "$seqdir/mchu.aa" "$seqdir/mchu.aa" > "$outdir/test2G_mc.lal"
+"$bindir/lalign36" -q -XM2G -z 3 "$seqdir/mchu.aa" "$seqdir/mchu.aa" > "$outdir/test2G_mc.lal_z3"
+"$bindir/lalign36" -q -XM2G -s BL62 -f -11 -g -1 "$seqdir/mchu.aa" "$seqdir/mchu.aa" > "$outdir/test2G_mc.lal_bl62"
+"$bindir/lalign36" -q -XM2G -k 1000 "$seqdir/mwkw.aa" "$seqdir/mwkw.aa" > "$outdir/test2G_mw.lal"
+"$bindir/lalign36" -z 3 -q -XM2G "$seqdir/mwkw.aa" "$seqdir/mwkw.aa" > "$outdir/test2G_mw.lal_z3"
+"$bindir/lalign36" -q -XM2G -s BL62 -f -11 -g -1 "$seqdir/mwkw.aa" "$seqdir/mwkw.aa" > "$outdir/test2G_mw.lal_bl62"
+echo "FINISHED" $(date)
diff --git a/test/test2V.sh b/test/test2V.sh
new file mode 100755
index 0000000..26bf7ed
--- /dev/null
+++ b/test/test2V.sh
@@ -0,0 +1,28 @@
+#!/bin/sh
+# Runs fasta36, fastx36/fasty36, ssearch36 and ggsearch36 with -V options that name the ../scripts/ann_*.pl scripts.
+# Each run writes one file under results/, which is created on first use.
+echo ""
+echo "starting FASTA36" $(date) "on" $(hostname)
+echo $(uname -a)
+echo ""
+echo "starting fasta36 - protein" $(date)
+[ -d results ] || mkdir results
+../bin/fasta36 -V q\!../scripts/ann_feats_up_www2.pl -V \!../scripts/ann_feats_up_www2.pl -S -z 21 -s BP62 ../seq/gstm1_human.vaa q > results/test2V_m1.ok2_bp62
+../bin/fasta36 -V q\!../scripts/ann_feats_up_www2.pl -V \!../scripts/ann_feats_up_www2.pl -S -z 21 ../seq/gstm1_human.vaa q > results/test2V_m1.ok2_z21
+../bin/fasta36 -V q\!../scripts/ann_feats_up_www2.pl -V \!../scripts/ann_feats_up_www2.pl -S -m BB ../seq/gstm1_human.vaa q > results/test2V_m1.ok2mB
+echo "done"
+echo "starting fastxy36" $(date)
+../bin/fastx36 -V \!../scripts/ann_feats_up_www2.pl -m 9c -S -q ../seq/mgtt2_x.seq q > results/test2V_t2.xk2m9c
+../bin/fastx36 -V \!../scripts/ann_feats_up_www2.pl -m BB -S -q ../seq/mgtt2_x.seq q > results/test2V_t2.xk2mB
+../bin/fastx36 -V \!../scripts/ann_feats_up_www2.pl -m 9c -S -q -z 22 ../seq/gstm1b_human.nt q > results/test2V_m1.xk2m9cz22
+../bin/fasty36 -V \!../scripts/ann_feats_up_www2.pl -S -q -z 21 ../seq/gstm1b_human.nt q > results/test2V_m1.yk2z21
+echo "done"
+echo "starting ssearch36" $(date)
+../bin/ssearch36 -V q\!../scripts/ann_pfam_www.pl -V \!../scripts/ann_pfam_www.pl -m 9c -S -z 22 -q ../seq/gstm1_human.vaa q > results/test2V_m1.ssm9cz22
+../bin/ssearch36 -V q\!../scripts/ann_pfam_www.pl -V \!../scripts/ann_pfam_www.pl -m 9C -S -z 21 -q ../seq/gstm1_human.vaa q > results/test2V_m1.ssm9Cz21
+../bin/ssearch36 -V q\!../scripts/ann_pfam_www.pl -V \!../scripts/ann_pfam_www.pl -m 8CC -S -q ../seq/gstm1_human.vaa q > results/test2V_m1.ssm8CC
+echo "done" $(date)
+echo "starting ggsearch36" $(date)
+../bin/ggsearch36 -V q\!../scripts/ann_feats_up_www2.pl -V \!../scripts/ann_feats_up_www2.pl -m 9c -S -q ../seq/gstm1_human.vaa q > results/test2V_m1.ggm9c
+../bin/ggsearch36 -V q\!../scripts/ann_feats_up_www2.pl -V \!../scripts/ann_feats_up_www2.pl -m 9C -S -z 21 -q ../seq/gstm1_human.vaa q > results/test2V_m1.ggm9Cz21
+echo "done" $(date)
diff --git a/test/test_mpi.pbs b/test/test_mpi.pbs
new file mode 100644
index 0000000..a09c9e1
--- /dev/null
+++ b/test/test_mpi.pbs
@@ -0,0 +1,59 @@
+#!/bin/sh
+#PBS -l nodes=4:ppn=4
+#PBS -l mem=16GB
+#PBS -l walltime=2:00:00
+
+echo ""
+echo "starting FASTA36" `date` "on" `hostname`
+echo `uname -a`
+echo ""
+if [ ! -d ${TEST_DIR}/test/mpi_results ]; then
+ mkdir ${TEST_DIR}/test/mpi_results
+fi
+echo "starting fasta36_mpi - protein" `date`
+mpiexec -comm mpich2-pmi ${TEST_DIR}/bin/fasta36_mpi -q -m 6 -Z 100000 ${TEST_DIR}/seq/mgstm1.aa:1-100 q > ${TEST_DIR}/test/mpi_results/test_m1.ok2_mpi.html
+mpiexec -comm mpich2-pmi ${TEST_DIR}/bin/fasta36_mpi -S -q -z 11 -O ${TEST_DIR}/test/mpi_results/test_m1.ok2_mpi_p25 -s P250 ${TEST_DIR}/seq/mgstm1.aa:100-218 q
+echo "done"
+echo "starting fastxy36_mpi" `date`
+mpiexec -comm mpich2-pmi ${TEST_DIR}/bin/fastx36_mpi -m 9c -S -q ${TEST_DIR}/seq/mgtt2_x.seq q 1 > ${TEST_DIR}/test/mpi_results/test_t2.xk1_mpi
+mpiexec -comm mpich2-pmi ${TEST_DIR}/bin/fasty36_mpi -S -q ${TEST_DIR}/seq/mgtt2_x.seq q > ${TEST_DIR}/test/mpi_results/test_t2.yk2_mpi
+mpiexec -comm mpich2-pmi ${TEST_DIR}/bin/fastx36_mpi -m 9c -S -q -z 2 ${TEST_DIR}/seq/mgstm1.esq a > ${TEST_DIR}/test/mpi_results/test_m1.xk2_mpi_z2
+mpiexec -comm mpich2-pmi ${TEST_DIR}/bin/fasty36_mpi -S -q -z 2 ${TEST_DIR}/seq/mgstm1.esq a > ${TEST_DIR}/test/mpi_results/test_m1.yk2_mpi_z2
+echo "done"
+echo "starting fastxy36_mpi rev" `date`
+mpiexec -comm mpich2-pmi ${TEST_DIR}/bin/fastx36_mpi -m 9c -q -m 5 ${TEST_DIR}/seq/mgstm1.rev q > ${TEST_DIR}/test/mpi_results/test_m1.xk2r_mpi
+mpiexec -comm mpich2-pmi ${TEST_DIR}/bin/fasty36_mpi -q -m 5 -M 200-300 -z 2 ${TEST_DIR}/seq/mgstm1.rev q > ${TEST_DIR}/test/mpi_results/test_m1.yk2r_mpi_z2
+mpiexec -comm mpich2-pmi ${TEST_DIR}/bin/fasty36_mpi -q -m 5 -z 11 ${TEST_DIR}/seq/mgstm1.rev q > ${TEST_DIR}/test/mpi_results/test_m1.yk2rz11_mpi
+echo "done"
+echo "starting ssearch36_mpi" `date`
+mpiexec -comm mpich2-pmi ${TEST_DIR}/bin/ssearch36_mpi -m 9c -S -z 3 -q ${TEST_DIR}/seq/mgstm1.aa q > ${TEST_DIR}/test/mpi_results/test_m1.ss_mpi_z3
+mpiexec -comm mpich2-pmi ${TEST_DIR}/bin/ssearch36_mpi -q -M 200-300 -z 2 -Z 100000 -s P250 ${TEST_DIR}/seq/mgstm1.aa q > ${TEST_DIR}/test/mpi_results/test_m1.ss_mpi_p25
+echo "done"
+echo "starting ggsearch36/glsearch36" `date`
+mpiexec -comm mpich2-pmi ${TEST_DIR}/bin/ggsearch36_t -q -m 9i -w 80 ${TEST_DIR}/seq/hahu.aa q > results/test_h1.gg_t
+mpiexec -comm mpich2-pmi ${TEST_DIR}/bin/glsearch36_t -q -m 9i -w 80 ${TEST_DIR}/seq/hahu.aa q > results/test_h1.gl_t
+mpiexec -comm mpich2-pmi ${TEST_DIR}/bin/ggsearch36_t -q ${TEST_DIR}/seq/gtt1_drome.aa q > results/test_t1.gg_t
+mpiexec -comm mpich2-pmi ${TEST_DIR}/bin/glsearch36_t -q ${TEST_DIR}/seq/gtt1_drome.aa q > results/test_t1.gl_t
+echo "done"
+echo "starting fasta36_t - DNA" `date`
+mpiexec -comm mpich2-pmi ${TEST_DIR}/bin/fasta36_t -S -q ${TEST_DIR}/seq/mgstm1.nt %R 4 > results/test_m1.ok4_t
+echo "done"
+echo "starting tfastxy36_t" `date`
+mpiexec -comm mpich2-pmi ${TEST_DIR}/bin/tfastx36_t -m 9c -q -i -3 -m 6 ${TEST_DIR}/seq/mgstm1.aa %p > results/test_m1.tx2_t.html
+mpiexec -comm mpich2-pmi ${TEST_DIR}/bin/tfasty36_t -q -i -3 -N 5000 ${TEST_DIR}/seq/mgstm1.aa %p > results/test_m1.ty2_t
+echo "done"
+echo "starting fastf36_t" `date`
+mpiexec -comm mpich2-pmi ${TEST_DIR}/bin/fastf36_t -q ${TEST_DIR}/seq/m1r.aa q > results/test_mf.ff_t
+mpiexec -comm mpich2-pmi ${TEST_DIR}/bin/fastf36 -q ${TEST_DIR}/seq/m1r.aa q > results/test_mf.ff_s
+echo "done"
+echo "starting tfastf36_t" `date`
+mpiexec -comm mpich2-pmi ${TEST_DIR}/bin/tfastf36_t -q ${TEST_DIR}/seq/m1r.aa %r > results/test_mf.tf_tr
+echo "done"
+echo "starting fasts36_t" `date`
+mpiexec -comm mpich2-pmi ${TEST_DIR}/bin/fasts36_t -q -V '*?@' ${TEST_DIR}/seq/ngts.aa q > results/test_m1.fs1_t
+mpiexec -comm mpich2-pmi ${TEST_DIR}/bin/fasts36_t -q ${TEST_DIR}/seq/ngt.aa q > results/test_m1.fs_t
+mpiexec -comm mpich2-pmi ${TEST_DIR}/bin/fasts36_t -q -n ${TEST_DIR}/seq/mgstm1.nts m > results/test_m1.nfs_t
+echo "done"
+echo "starting tfasts36_t" `date`
+mpiexec -comm mpich2-pmi ${TEST_DIR}/bin/tfasts36_t -q ${TEST_DIR}/seq/n0.aa %r > results/test_m1.ts_r
+echo "done" `date`
diff --git a/test/test_mpi1.pbs b/test/test_mpi1.pbs
new file mode 100644
index 0000000..8a1b7cb
--- /dev/null
+++ b/test/test_mpi1.pbs
@@ -0,0 +1,27 @@
+#!/bin/sh
+#PBS -l nodes=4:ppn=4
+#PBS -l mem=8GB
+#PBS -l walltime=60:00
+# PBS job: runs the *_mpi builds of fasta36, fastx36/fasty36, ssearch36 and ggsearch36/glsearch36 under mpiexec.
+# Every run writes one file to ${TEST_DIR}/test/mpi_results, which is created if missing.
+: "${TEST_DIR:?TEST_DIR must name the directory that holds bin/, seq/ and test/}"
+echo ""
+echo "starting FASTA36" $(date) "on" $(hostname)
+echo $(uname -a)
+echo ""
+[ -d "${TEST_DIR}/test/mpi_results" ] || mkdir "${TEST_DIR}/test/mpi_results"
+echo "starting fasta36_mpi - protein" $(date)
+mpiexec -comm mpich2-pmi "${TEST_DIR}/bin/fasta36_mpi" -q -m 9c -Z 100000 -d 10 "${TEST_DIR}/seq/prot_test.lseg" q > "${TEST_DIR}/test/mpi_results/test_plib.ok2_mpi"
+mpiexec -comm mpich2-pmi "${TEST_DIR}/bin/fasta36_mpi" -S -q -z 21 -s BP62 -d 10 -E 1e-6 "${TEST_DIR}/seq/prot_test.lseg" q > "${TEST_DIR}/test/mpi_results/test_plib.ok2_mpi_BP62"
+echo "done"
+echo "starting fastxy36_mpi" $(date)
+mpiexec -comm mpich2-pmi "${TEST_DIR}/bin/fastx36_mpi" -m 9c -S -q "${TEST_DIR}/seq/gst.nlib" q 1 > "${TEST_DIR}/test/mpi_results/test_nlib.xk1_mpi"
+mpiexec -comm mpich2-pmi "${TEST_DIR}/bin/fasty36_mpi" -S -q "${TEST_DIR}/seq/gst.nlib" q > "${TEST_DIR}/test/mpi_results/test_nlib.yk2_mpi"
+echo "done"
+echo "starting ssearch36_mpi" $(date)
+mpiexec -comm mpich2-pmi "${TEST_DIR}/bin/ssearch36_mpi" -m 9c -S -z 21 -d 10 -E 1e-6 -q "${TEST_DIR}/seq/prot_test.lseg" q > "${TEST_DIR}/test/mpi_results/test_plib.ss_mpi_z21"
+echo "done"
+echo "starting ggsearch36/glsearch36" $(date)
+mpiexec -comm mpich2-pmi "${TEST_DIR}/bin/ggsearch36_mpi" -q -m 9i -w 80 -d 5 "${TEST_DIR}/seq/prot_test.lseg" q > "${TEST_DIR}/test/mpi_results/test_plib.gg_mpi"
+mpiexec -comm mpich2-pmi "${TEST_DIR}/bin/glsearch36_mpi" -q -m 9i -w 80 -d 5 "${TEST_DIR}/seq/prot_test.lseg" q > "${TEST_DIR}/test/mpi_results/test_plib.gl_mpi"
+echo "done" $(date)
diff --git a/test/test_mpi2.pbs b/test/test_mpi2.pbs
new file mode 100644
index 0000000..ae86eb8
--- /dev/null
+++ b/test/test_mpi2.pbs
@@ -0,0 +1,33 @@
+#!/bin/sh
+#PBS -l nodes=4:ppn=4
+#PBS -l mem=16GB
+#PBS -l walltime=1:00:00
+
+echo ""
+echo "starting FASTA36" `date` "on" `hostname`
+echo `uname -a`
+echo ""
+if [ ! -d ${TEST_DIR}/test/mpi_results ]; then
+ mkdir ${TEST_DIR}/test/mpi_results
+fi
+echo "starting fasta36_mpi - DNA" `date`
+mpiexec -comm mpich2-pmi ${TEST_DIR}/bin/fasta36_mpi -S -q ${TEST_DIR}/seq/dna_test_s.nlib %R 4 >${TEST_DIR}/test/mpi_results/test_nlib.ok4_mpi
+echo "done"
+echo "starting tfastxy36_mpi" `date`
+mpiexec -comm mpich2-pmi ${TEST_DIR}/bin/tfastx36_mpi -m 9c -q -i -3 ${TEST_DIR}/seq/prot_test_s.lseg %p >${TEST_DIR}/test/mpi_results/test_plib.tx2_mpi
+mpiexec -comm mpich2-pmi ${TEST_DIR}/bin/tfasty36_mpi -q -i -3 -N 5000 ${TEST_DIR}/seq/prot_test_s.lseg %p >${TEST_DIR}/test/mpi_results/test_plib.ty2_mpi
+echo "done"
+echo "starting fastf36_mpi" `date`
+mpiexec -comm mpich2-pmi ${TEST_DIR}/bin/fastf36_mpi -q ${TEST_DIR}/seq/m1r.aa q >${TEST_DIR}/test/mpi_results/test_mf.ff_mpi
+echo "done"
+echo "starting tfastf36_mpi" `date`
+mpiexec -comm mpich2-pmi ${TEST_DIR}/bin/tfastf36_mpi -q ${TEST_DIR}/seq/m1r.aa %r >${TEST_DIR}/test/mpi_results/test_mf.tf_mpir
+echo "done"
+echo "starting fasts36_mpi" `date`
+mpiexec -comm mpich2-pmi ${TEST_DIR}/bin/fasts36_mpi -q -V '*?@' ${TEST_DIR}/seq/ngts.aa q >${TEST_DIR}/test/mpi_results/test_m1.fs1_mpi
+mpiexec -comm mpich2-pmi ${TEST_DIR}/bin/fasts36_mpi -q ${TEST_DIR}/seq/ngt.aa q >${TEST_DIR}/test/mpi_results/test_m1.fs_mpi
+mpiexec -comm mpich2-pmi ${TEST_DIR}/bin/fasts36_mpi -q -n ${TEST_DIR}/seq/mgstm1.nts m >${TEST_DIR}/test/mpi_results/test_m1.nfs_mpi
+echo "done"
+echo "starting tfasts36_mpi" `date`
+mpiexec -comm mpich2-pmi ${TEST_DIR}/bin/tfasts36_mpi -q ${TEST_DIR}/seq/n0.aa %r >${TEST_DIR}/test/mpi_results/test_m1.ts_r
+echo "done" `date`
diff --git a/test/test_s.sh b/test/test_s.sh
new file mode 100755
index 0000000..4d90841
--- /dev/null
+++ b/test/test_s.sh
@@ -0,0 +1,47 @@
+#!/bin/csh -f
+# Runs the fasta36 programs found on the search path; outputs are test_* files in the working directory.
+echo ""; echo "starting fasta36 - protein" `date` "on" `hostname`
+echo `uname -a`
+echo ""
+fasta36 -q -m 6 -Z 100000 ../seq/mgstm1.aa:1-100 q > test_m1.ok2.html
+fasta36 -S -q -z 11 -O test_m1.ok2_p25 -s P250 ../seq/mgstm1.aa:100-218 q
+# fastx36 / fasty36 with mgtt2_x.seq and mgstm1.esq
+echo "done"; echo "starting fastxy36" `date`
+fastx36 -m 9 -S -q ../seq/mgtt2_x.seq q > test_t2.xk2
+fasty36 -S -q ../seq/mgtt2_x.seq q > test_t2.yk2
+fastx36 -m 9 -S -q -z 2 ../seq/mgstm1.esq a > test_m1.xk2z2
+fasty36 -S -q -z 2 ../seq/mgstm1.esq a > test_m1.yk2z2
+# fastx36 / fasty36 with mgstm1.rev; from here on queries are bare file names (no ../seq/ prefix)
+echo "done"; echo "starting fastxy36 rev" `date`
+fastx36 -m 9 -q -m 5 mgstm1.rev q > test_m1.xk2r
+fasty36 -q -m 5 -M 200-300 -z 2 mgstm1.rev q > test_m1.yk2rz2
+fasty36 -q -m 5 -z 11 mgstm1.rev q > test_m1.yk2rz11
+# ssearch36 with mgstm1.aa
+echo "done"; echo "starting ssearch36" `date`
+ssearch36 -m 9 -S -z 3 -q mgstm1.aa q > test_m1.ssz3
+ssearch36 -q -M 200-300 -z 2 -Z 100000 -s P250 mgstm1.aa q > test_m1.ss_p25
+# fasta36 with mgstm1.seq and mgstm1.rev against %RMB
+echo "done"; echo "starting fasta36 - DNA" `date`
+fasta36 -q -z 2 mgstm1.seq %RMB 4 > test_m1.ok4z2
+fasta36 -q mgstm1.rev %RMB 4 > test_m1.ok4r
+# tfasta36 with mgstm1.aa against %RMB
+echo "done"; echo "starting tfasta36" `date`
+tfasta36 -q mgstm1.aa %RMB > test_m1.tk2
+# tfastx36 / tfasty36 with mgstm1.aa against %p
+echo "done"; echo "starting tfastxy36" `date`
+tfastx36 -m 9 -q -i -3 -m 6 mgstm1.aa %p > test_m1.tx2.html
+tfasty36 -q -i -3 -N 5000 mgstm1.aa %p > test_m1.ty2
+# fastf36 with m1r.aa
+echo "done"; echo "starting fastf36" `date`
+fastf36 -q m1r.aa q > test_mf.ff_s
+# tfastf36 with m1r.aa against %r
+echo "done"; echo "starting tfastf36" `date`
+tfastf36 -q -E 0.0001 m1r.aa %r > test_mf.tf_r
+# fasts36 with n0.aa
+echo "done"; echo "starting fasts36" `date`
+fasts36 -q n0.aa q > test_m1.fs_s
+# tfasts36 with n0.aa against %r
+echo "done"; echo "starting tfasts36" `date`
+tfasts36 -q n0.aa %r > test_m1.ts_r
+echo "done"
+echo "done" `date`
diff --git a/test/test_z.sh b/test/test_z.sh
new file mode 100755
index 0000000..b4507b2
--- /dev/null
+++ b/test/test_z.sh
@@ -0,0 +1,22 @@
+#!/bin/csh -f
+# Runs fasta36 and ssearch36 once per -z value in each list; one output file per run under results/ (created if missing).
+if ( ! -d results ) mkdir results
+echo "starting fasta36 - protein" `date`
+foreach z ( 1 2 3 11 12 21 22)
+  ../bin/fasta36 -q -z $z -d 0 ../seq/mgstm1.aa a > results/test_m1_a.ok2_${z}
+  ../bin/fasta36 -q -z $z -d 0 ../seq/oohu.aa a > results/test_m1_b.ok2_${z}
+  ../bin/fasta36 -q -S -z $z -d 0 ../seq/prio_atepa.aa a > results/test_m1_c.ok2S_${z}
+  ../bin/fasta36 -q -S -z $z -d 0 ../seq/h10_human.aa a > results/test_m1_d.ok2S_${z}
+  ../bin/fasta36 -c -1 -q -z $z -d 0 ../seq/mgstm1.aa a > results/test_m1_a.c1_ok2_${z}
+  ../bin/fasta36 -c -1 -q -z $z -d 0 ../seq/oohu.aa a > results/test_m1_b.c1_ok2_${z}
+  ../bin/fasta36 -c -1 -q -S -z $z -d 0 ../seq/prio_atepa.aa a > results/test_m1_c.c1_ok2S_${z}
+  ../bin/fasta36 -c -1 -q -S -z $z -d 0 ../seq/h10_human.aa a > results/test_m1_d.c1_ok2S_${z}
+end
+echo "done"; echo "starting ssearch36" `date`
+foreach z ( 1 2 3 11 21 22)
+  ../bin/ssearch36 -q -z $z -d 0 ../seq/mgstm1.aa a > results/test_m1_a.ssS_${z}
+  ../bin/ssearch36 -q -z $z -d 0 ../seq/oohu.aa a > results/test_m1_b.ssS_${z}
+  ../bin/ssearch36 -q -sBL62 -d 0 -S -f -11 -z $z ../seq/prio_atepa.aa a > results/test_m1_c.ssSbl62_${z}
+  ../bin/ssearch36 -q -sBL62 -d 0 -S -f -11 -z $z ../seq/h10_human.aa a > results/test_m1_d.ssSbl62_${z}
+end
+echo "done"; echo "done" `date`
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/fasta3.git
More information about the debian-med-commit
mailing list