[med-svn] [Git][med-team/last-align][upstream] New upstream version 1044

Steffen Möller gitlab at salsa.debian.org
Sat Dec 21 10:32:42 GMT 2019



Steffen Möller pushed to branch upstream at Debian Med / last-align


Commits:
60c6d42f by Steffen Moeller at 2019-12-20T23:28:05Z
New upstream version 1044
- - - - -


25 changed files:

- ChangeLog.txt
- examples/multiMito.maf
- makefile
- src/Alignment.cc
- src/Alignment.hh
- src/GappedXdropAligner.cc
- src/GappedXdropAligner.hh
- src/GappedXdropAligner2qual.cc
- src/GappedXdropAligner3frame.cc
- src/GappedXdropAligner3framePssm.cc
- + src/GappedXdropAlignerDna.cc
- src/GappedXdropAlignerInl.hh
- src/GappedXdropAlignerPssm.cc
- src/ScoreMatrixRow.hh
- src/alp/njn_matrix.hpp
- src/alp/njn_random.cpp
- src/alp/njn_vector.hpp
- src/lastal.cc
- src/makefile
- src/mcf_contiguous_queue.hh
- src/mcf_gap_costs.cc
- src/mcf_gap_costs.hh
- + src/mcf_reverse_queue.hh
- src/mcf_simd.hh
- src/version.hh


Changes:

=====================================
ChangeLog.txt
=====================================
@@ -1,8 +1,146 @@
+2019-12-20  Martin C. Frith  <Martin C. Frith>
+
+	* src/GappedXdropAligner.hh, src/GappedXdropAlignerDna.cc,
+	src/mcf_simd.hh, test/last-test.out, test/last-test.sh:
+	Maybe make lastal gapped alignment a bit faster
+	[11810fcff80c] [tip]
+
+2019-12-19  Martin C. Frith  <Martin C. Frith>
+
+	* src/GappedXdropAligner.cc, src/GappedXdropAligner.hh,
+	src/GappedXdropAlignerDna.cc, src/makefile,
+	src/mcf_reverse_queue.hh, src/mcf_simd.hh:
+	Maybe make lastal gapped alignment a bit faster
+	[2be7c5f313a5]
+
+2019-12-16  Martin C. Frith  <Martin C. Frith>
+
+	* src/Alignment.cc, src/GappedXdropAligner.hh,
+	src/GappedXdropAlignerDna.cc, src/mcf_simd.hh:
+	Make lastal DNA gapped alignment faster
+	[daa684e4956e]
+
+	* src/Alignment.cc, src/GappedXdropAligner.hh,
+	src/GappedXdropAlignerDna.cc, src/ScoreMatrixRow.hh,
+	src/mcf_simd.hh:
+	Make lastal DNA gapped alignment faster
+	[7fdb95aea8e3]
+
+	* src/Alignment.cc, src/Alignment.hh, src/GappedXdropAlignerDna.cc,
+	src/lastal.cc, src/mcf_simd.hh:
+	Refactor
+	[b1fc3a37fe34]
+
+2019-12-12  Martin C. Frith  <Martin C. Frith>
+
+	* src/Alignment.cc, src/GappedXdropAligner.cc,
+	src/GappedXdropAligner.hh, src/GappedXdropAlignerDna.cc,
+	src/GappedXdropAlignerPssm.cc, src/makefile, src/mcf_simd.hh, test
+	/last-test.out, test/last-test.sh:
+	Make lastal DNA gapped alignment faster
+	[a02f4aab6116]
+
+	* src/GappedXdropAligner.cc, src/GappedXdropAlignerPssm.cc,
+	src/mcf_simd.hh:
+	Refactor
+	[d21b748f5540]
+
+	* src/GappedXdropAligner.cc, src/GappedXdropAligner.hh,
+	src/GappedXdropAlignerPssm.cc:
+	Refactor
+	[84db4d2edb22]
+
+	* src/GappedXdropAligner.cc, src/GappedXdropAligner2qual.cc,
+	src/GappedXdropAlignerInl.hh, src/GappedXdropAlignerPssm.cc:
+	Refactor
+	[759bd401fd1b]
+
+2019-12-11  Martin C. Frith  <Martin C. Frith>
+
+	* src/GappedXdropAligner.cc, src/GappedXdropAligner2qual.cc,
+	src/GappedXdropAlignerPssm.cc, test/last-train-test.out:
+	Change lastal probabilities slightly
+	[1cd65b7f8fdb]
+
+	* src/GappedXdropAligner2qual.cc:
+	Refactor
+	[f888698950e8]
+
+	* src/GappedXdropAligner.cc, src/GappedXdropAligner2qual.cc,
+	src/GappedXdropAligner3frame.cc,
+	src/GappedXdropAligner3framePssm.cc, src/GappedXdropAlignerInl.hh,
+	src/GappedXdropAlignerPssm.cc:
+	Refactor & robustify the code
+	[6ee595437d4c]
+
+	* src/Alignment.cc, src/GappedXdropAligner.cc,
+	src/GappedXdropAligner.hh, src/GappedXdropAligner2qual.cc,
+	src/GappedXdropAlignerInl.hh, src/GappedXdropAlignerPssm.cc,
+	src/mcf_gap_costs.cc, src/mcf_gap_costs.hh:
+	Refactor
+	[14a1fdd74624]
+
+2019-12-10  Martin C. Frith  <Martin C. Frith>
+
+	* src/GappedXdropAligner.cc, src/GappedXdropAlignerPssm.cc,
+	src/alp/njn_matrix.hpp, src/alp/njn_random.cpp,
+	src/alp/njn_vector.hpp, src/makefile, src/mcf_simd.hh:
+	Robustify compilation
+	[5b00a436e53c]
+
+2019-12-09  Martin C. Frith  <Martin C. Frith>
+
+	* src/GappedXdropAligner.cc, src/GappedXdropAligner.hh,
+	src/GappedXdropAlignerPssm.cc, src/mcf_simd.hh:
+	Refactor
+	[e0434cbfa780]
+
+	* src/GappedXdropAligner.cc, src/GappedXdropAligner.hh,
+	src/GappedXdropAligner2qual.cc, src/GappedXdropAligner3frame.cc,
+	src/GappedXdropAligner3framePssm.cc, src/GappedXdropAlignerPssm.cc:
+	Refactor
+	[29b88752b6d6]
+
+	* makefile, src/makefile:
+	Fix compile error (thanks: V Brendel & D Eccles)
+	[cf7b477fbeaf]
+
+	* src/GappedXdropAligner.cc, src/GappedXdropAlignerPssm.cc,
+	src/mcf_contiguous_queue.hh:
+	Refactor
+	[4b2847c49990]
+
+	* src/GappedXdropAligner.cc, src/GappedXdropAlignerPssm.cc:
+	Refactor
+	[8410315ff484]
+
+	* src/GappedXdropAligner.cc, src/GappedXdropAligner2qual.cc,
+	src/GappedXdropAlignerPssm.cc:
+	Refactor
+	[c614e7fba41c]
+
+	* src/GappedXdropAligner.cc, src/GappedXdropAligner2qual.cc,
+	src/GappedXdropAlignerPssm.cc:
+	Refactor
+	[ede8e6df999a]
+
+	* src/GappedXdropAligner.cc, src/GappedXdropAlignerPssm.cc,
+	src/mcf_contiguous_queue.hh:
+	Refactor
+	[aa391569f221]
+
+	* examples/multiMito.maf, src/GappedXdropAligner.cc,
+	src/GappedXdropAligner2qual.cc, src/GappedXdropAlignerInl.hh,
+	src/GappedXdropAlignerPssm.cc, test/last-test.out, test/last-train-
+	test.out, test/maf-swap-test.out:
+	Change lastal probabilities slightly
+	[c24e24e0c606]
+
 2019-11-28  Martin C. Frith  <Martin C. Frith>
 
 	* doc/last-train.txt, scripts/last-train, test/last-train-test.out:
 	train: double scale if integer-rounding is bad
-	[887c0e6339d1] [tip]
+	[887c0e6339d1]
 
 2019-11-27  Martin C. Frith  <Martin C. Frith>
 


=====================================
examples/multiMito.maf
=====================================
@@ -3,7 +3,7 @@ s humanMito    598 349 + 16571 CAAAGCAATACACTGAAAATGTTTAGACGGGCTCACATCAC-CCCATAA
 s mouseMito     18 346 + 16299 CAAAGCAAAGCACTGAAAATGCTTAGATGGA-TAATTGTAT-CCCATAAACACAAAGGTTTGGTCCTGGCCTTATAATTAATTAGAGGTAAAATTACACATGCAAACCTCCATAGACCGGTGTAAAATCCCTTAAACAT-TTACTTAAAATTTAAGGAGAGGGT-ATCAAGCACATTAAAAT--AGCTTAAGACACCTTGCCTA-GCCACACCCCCACGGGACT-CAGCAGTGATAAATATTAAGCAATAAACGAAAGTTTGACTAAGTTATACCT--CTTAGGGTTGGTAAATTTCGTGCCAGCCACCGCGGTCATACGATTAACCCAAACTAATTA-TCTTCGGCGTAAAACGTG
 s chickenMito 1246 353 + 16775 CAAAGCATGGCACTGAAGATGCCAAGATGGTACCTACTATA-CCTGTGGGCAAA-AGACTTAGTCCTAACCTTTCTATTGGTTTTTGCTAGACATATACATGCAAGTATCCGCATCCCAGTGAAAATGCCCCCAAACCTTTCTTCCCAAGCAAAAGGAGCAGGT-ATCAGGCACACTCAGCAGTAGCCCAAGACGCCTTGCTTAAGCCACACCCCCACGGGTACTCAGCAGTAATTAACCTTAAGCAATAAGTGTAAACTTGACTTAGCCATAGCAACCC-AGGGTTGGTAAATCTTGTGCCAGCCACCGCGGTCATACAAGAAACCCAAATCAATAGCTACCCGGCGTAAAGAGTG
 s fuguMito      16 350 + 16447 CAAAGCAGAGTACTGAAGATGCTAAGATGGGCCCTGAAAAGTCCCGCAGGCACAAAAGCTTGGTCCTGACTTTACTAACAACTCTGATCAAACTTACACATGCAAGTATCCGCATCCCAGTGAAaatgccccccgcc---ccccgtcCGGAAATAGGGAGTTGGTATCAGGCACACAAATTTGTAGCCCATGACACCTAGCTTT-GCCACGCCCCCAAGGGAATTCAGCAGTGATAAACATTAAGCCATAAGTGAAAACTTGACTTAGTTATGAT--CTAAAGAGTCGGTAAAACTCGTGCCAGCCACCGCGGTTATACGAGAGACCCAAGTTGTTAG-CCAACGGCGTAAAGGGTG
-p                              '.37<?ABBCGLPTWXYYYXUMKIC:1'%#"!+-.///045~68;DMRTV\]ZZZ`fghghjjljig_a_^XVSSSUYZYUSSRRQRU]a`_chilmmmmmke\TKJHHFECBBAAA>8542---,,,,,,,*)($"$"~*+-........./12222222357~9:::96/,%##""""""""/9BIKSVVWXWWVWW`fhlm~mkjllkgeeegllhffb^_~ejmmmmmlg_\\\[[\\[_gljjlihillihikgfeaejf^UKB?=<7/%#$"*,/8AHJSX_be`bdgghlmmmkkmmkkljjlljkmlicilmmjifd^[ZTRPOOOOOMM~MNOPT\^fhaXROIGE@>
+p                              ',159<>>?@DIMQTUVVVVTMKIC:1'%#"!+-.///045~68;DMRTV\]ZZZ`fghghjjljig_a_^XVSSSUYZYUSSRRQRU]a`_chilmmmmmke\TKJHHFECBBAAA>8542---,,,,,,,*)($"$"~*+-........./12222222357~9:::96/,%##""""""""/9BIKSVVWXWWVWW`fhlm~mkjllkgeeegllhffb^_~ejmmmmmlg_\\\[[\\[_gljjlihillihikgfeaejf^UKB?=<7/%#$"*,/8AHJSX_be`bdgghlmmmkkmmkkljjlljkmlicilmmjifd^[ZTRPOOOOOMM~MNOPT\^fhaXROIGE@>
 p                              %*/37:;<<=BFIKLLMMMMMMMMMMMLLLKKKKKKKLLNO~TX]bgkosuvwyz~~~~~~~~~~~~~~~{wrmmlllkjggfffffffffeeedb^ZVQLHC>944210.*))))))))))(((((((((((((((((~((((((((((((((((((((((((~)))))))))))))))))))-1355899;<<>>@AFJOTY~]bglquz~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~}yuqlggeb^YTTSSTTUZ_chmquxz{~~~~~~~~~~~~~~~~~~~~~~~~~~~~~{wrmhd_ZUQLKKKJJHH~HHHHHHHHGGFDA=<<;:
 p                              (())))))))))))))))))))))))))(((((((''''''~'''''''''&%$#)+,-.015<BEIKLQVZ^afgbZWPPMMLLKJJJJHHHJQWYafilmmjjichhc`^_^^]_`_VQH>;:8887776632,$++#,01246<>DFNQUWWVUTTZZZ[]~ef_ULIB?>=<;;;;;;>>GPUUVRPIIG at 744432,*##-47 at INKHGFEA8/,+%$""/9BLU]b`cghiihgc`_biljjib_[ZZUUTTTUVXZ__]_^VMJC:0-,+%#""/579 at CJLSUZ\]Z[[~~~~~~~~~~~~~~~~~~~`a`\\[Z^`ba`\YWWRJG at 7.##()+-47=???><940+'
 p                              ''(((((((((((((((((((((((((((((((((((((((~((((((((((((()**+.016:?CFHIMRW[`cefgggggggghhhhhhhhilnosx|~~~~~~zywtplkiiggfda]YTONNNNNNNNNMMMMLLLMMMNOOPQRSX]adfghhhjklmm~oppqppppomlkkjjkkmmqsttvwwwwwvtqqoonlkjglqvz~~~~~~~|xsniedbafkotx{}~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~{wvvutqninsw|~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~{vutppnnnmljgc^ZYWTPKGB============<:73/*%
@@ -115,7 +115,7 @@ p                               cb`^aghlmid_abdigf`ZYY`cee_ab`djiiibed`b`]\__cb`
 p                               ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~{zxutttuuuvwwyzzzyyyz{~~~~~~~~~~~~~~~~~~~~~~~~~~~~~{{yyy{{~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~}|{zxwwwvvvuusponkjjjhhhgggggghhiiiiikkkllmqtuvy{|~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~}||}~~~~~~~~~~~~|{|{|||~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~}}~~}}|{zyyy{|~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|{|{{}~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~||zxwwwyz~~~~~~~~~}ytssssuvvwwwvvww~wwx{|}~~~~~~~~~~{vrqol~~~h~~~geeeeddddeeeeeeeeffffffffffgggijjlllmmmmnqtuuuuuvvvy{|~~~~~~~~~~~~~~~~~~}|{zxwwx{|}}}~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~}}~}~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~zvqlkjfbbaaaaaaabbccccccccccdddddddeeefijkmnoopstux{||}~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~{vrmhgeb^]\[ZZZ]_`~~~dhklmmnsx}~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|yup~~~kgb]\ZZZ\]^^~~~~~^^^^^^^]\ZZZZZZZZZZZZZZZZZ[[[[[[[[[[\\_abbcccccdddfglquz~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|yxwvvvvvvw{}~~~~~}{zyvutqmiddbaaaaaaaaaaaaaccefghiijnprsstx|~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|{{||~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~}~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 p                               SRNNOOQTTPNMLKJJKKKKQRVWYZaehjjljjiceeaejhfbYXWUU]`^_^``^ca_acggebhhghjjmmmlibejgffdadaded_a^_^][\XXWVVYbhd\SSSSSTTTSRRQQRRTV]ZXQNMLKF>;:99999999::::;;ABELPPPQRSRQQPPRTXYY[adhifc`\]][\[XYYY]`^a\VTSQQRTW_a_`_ca_billged`]^^ailkkmmkebhiig``deb`cjlhffbZZY[cjmmmkklihebhg`bcc`b`ghcihceeaekjjmmkebhgab]Z[YXYWSSSSTTUXXXWTPOOLJE<:94+++*****))&$$$$$$#####$$$$$%%%%%%%%&&&'*./1233336?IQVW[``gkjkmmkdb`dadgciicgc_`~~~~~~~~~~~~~~~~`ab`ZXX[\[^`^_fijlkjmmlib_`_cb_b`]^^]_\\`fhjd`a_chijhgchjhd\\bdjhbdfcgjjg_XVUTTTUW`deglmjd[XUTTRRRRUUUVVTTUXZ_gjeadgbejge``ghijebfhiiib^^bhgcfc_`_^`fd\YXWW_ec_a]]_`_cfadfab_`_gjjjhfbYYWPNLKJHHHHKSYZZ[`_\WTTVY`ciebgeaggcggchg`ZXY_gklihillic`a_dda^`bghljiihgc_abb`cjllhfa^]\[]_dedailmmmjca_bijjlhgb[\\\dhijebilkg`\]ZZZ[[^`ggfa[\dhijdb_bdkmmmlhgb_[[[bdkljc`cjjjmmkecbb^_][[[Z[adgbYYXVVVVY__giifedefijlkdb[[cbZWVVWW]eecc^_`abdahijjd\\\[aeaeigfebggd^]ZZ[ZZ\]_a_edahkmjjje]VVVVX[[bcagdaekmmmjihcfcZZ]\``dd^[XVWZ^cfhb_^]__][Z[__^`a`gkkg`XUUWY^^ejhga]]__bc[[[YY_ehica`^_\XWW[^fkljjh`ZYYahjhbXOLJJJIHHHHHJLSV\\\[\\Z[]]ejhbegcjlmmmkklia^__]^]\]]]ehchebhkjecafgjjmmkkmkebfgchgaccagiihgcihaYWUUUUVZZ\]YY\\`e`]]]\][[[VVUUTVUUU\^^]ahjdb`daYPPOPPPRXY\dhdb`ddaba`gijkebgebhijmmmmmmkebgf_``_gkkic^^`^^`_djlkg``_]_a`b`]]__beijgbeeaekmmmkecahkih~~ciljeb`dkmkhc[WV~~VZZ]_]YXXZY[[WUOLEA9211100000//////-,!!"#$&&((/1566~6655320,"$$"~~~$$$%%%%%%%%%%%%%%%%%%%&&&&')+2566788888:CJMNONNOPQRRQQQQXZ\^glljcZZY[[YVTNNNNMMNPVZZZ]`\ZYXWRRRSUUWXZ`hlkebillhfaba`gica^ZZ]\YXXXZ[bdddb[[cdcc`bdjic_bc]\]`fhkigc`cjmmmjc]]WRQQRVZYWXX]`fdbahhbekmlhggebhhggg`a`^ZZZ^^_aa`eglmmlh_\ZZVUSRQPQQSQQRQPOOPQRRUUUTTUXYXUUVUUSQOKKKIHHHJKNUXWXYZ_``^XXXWWYYZ^ehd\[\`efbfb_a\SPPPVXWWXY]``fgfc]TQJA>=74~~~.+)#&&#()+--.7>AABJMNNNPPPVWTTV^`_defkkf^VSQNLKIC@=8/"#$"#%()~~~*+-.~~~~~../012458~?BCCCCCCCCCBBBA;:5421110000001:CJLLMNMMMMNKJJJKNR\dggebgd[[`^`^_dadg`aimmmkkmmjgaa[YY^`bjmmmmmmkklhc[UUWY\ehilmmmlha[[_[\[ZVMGEDCCBBCDDDFFFGIIJJKLLLNNOOPQQPQQQQPPPPQPPQPPQSX]cdbghfaXSRRSXafklkg_`^\YY]]ccailmligd]URPPPQSUX_ailkec`deea[RI?>=9777777789;>??ADLQROOQY^a`fdbikkhcghjjlihhiheaffaehimmlia^]YY^]`ca_a^`bcddb^[XWQI@?<;:5321000000111
 p                               iiiiiiiiiiihhhhhhhhhijkkmnsx|~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~}yupkjihgea`_^][WWUUSQQQPOMIEDDDCCCCCCCCCCCCCCCCBBBAAA@@@@@@???????@@@@CDEGGHHHJKKKKKLMMOOPQUZ_cgikknoprstvvwxxxxxxxwvvvwz|}}~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~}|yuusrqonmkjifeda]YTPKFEC@@>;;9664211111111000/.....--,,,,+++++++*****************************+./036779:?CGJLLOQRW\`ejnqsttvvyzz}~~~~~~~~~~~~~~~~~~~~~~~~~~~}}~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~{wrqqppppqqqrrrrrtuy}~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|wsrpmllkkkkkkklmmmmnnnnnnotx|~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|{zyyyy{{~~~~~~~~~~~~~~}||{|}}~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|{{|}~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~yxxyyxxyyz~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~}}~}~~}}~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~}|}||~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|yu~~pomjfb]\ZWSOJEA@~~?????>=<<<<<<<;962-)$##############################~############~~~#######################################$(+-.///114566778<AEILMMNNNNNNNNNNNNNNNNNOSVXY]`bccccccddffiklquy|~~~~~~~~~~~~~~~}yxxwuuuvwx}~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~}yyyz}~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|wsnihffeb_^^^^^^^^^]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]^^^^^^_______________________________^]]]]]]]\\]]\\\\\[ZZYVSOJJ~~~HEA<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<;;:9888752.)$$$$$$$$~~~$$$$~~~~~$$$$$$$$$~$$$$$$$$$$$$$$$$$$$$$$$$$$$$%%&'''''''''''''''()-13555666666677788:;;<AEJOTX\`bcceffglpuz~~~~~~~~~~~~~~~~~~~~~yupkgb]\\[[ZYVSNNMMMMMMMMMMMMMMMMMMMMNNNNNNNNNNNNNNNNOOOOOOOOPTX[\]_abbbbbbchmquy{}}~~~~~~~~~~~~~~~~~{vvtsssty~~~~~~~~~~}zuqlhcbbaaaaaaaaabfikkorsttttux{}}~~~~~~~~~~~~~~~~~~~~~~~~~~~yupkfb]\\[[ZYVVTTSQNIE@;7641-,***)))((''''''''
-p                               $$$%%%%%%%%%%%%%%%%%%&&&&''((()*+.016:?DHKMNNPPPQTVV[_``^a_\\\\_efhd`cdagd`ca^^cdcb__eijdb`c`ba]\]XYXXX[dhbYONNNNOQZ`ggaccaggbcb]\^]`^VTSSTSRPPOOOPPPRZ_acc\SPPONHFFFFFFMPWY_bffb]VRQPONLLLNORVW[aa`_^efaeiebghljjmmmlhffb__fkmlhfbZZ]ekjjibded__bijd_a`YYY[ab`ghbekjjjjmmmihd`baghcjmmjjjjljie_^``fdafc`cifd_[[YXXVVXWVUUSSSPNHE><52222222222000000000000111679<===========>>>?ABFGJKLT\bb_ZZ\\`hlkecahlkebikjf^\]afhkjjjjlkkljcigffgf``ehiiic__fijlkjlmlibdfacb_a`]^^^[[[_ehjeaeefhbegfchhgbZXXY^`^cfcgjkjcZWUTTUUW`deglmmlhffhljjlkdb_ba^^ccacahgeda^`b^^`_edbhijjjjjmmmkjkeaejjca_biic`]]_\YSJIHHHHHHHHHHHHHIQVVWVUUTQIG@?????@ADKPRSXYY[[YVUV[beiebgdaddacc_\\ZYZ`fkljca_billkebghijjjjjljijigc_abb~~~~~~~~~~~~~~~~~~~~~~~~~~cfkihc`cjlihiljkljjkebiijib]]]\_bgfd^XXZ^_^`_]acjlmmjc`a`^djjjkga`_bijjmmkdb``^c_\\[[\ab`dihgcZWUUW^dc`baaab`djlkklia^^XXXWVWZ`acfa_]\^_fhhe\YYZYahijlihillkhbbafe__cikibekmmmmjjiaXUUVVX^^cgilmmmmmmmjihcgc[Z]]`fgf_\YXXY[\_ca_^]_`_]]bca_adc_^[[YSQRRTWX`ffe`ZXYYac_abb`dijg`abdga^^afgiciiih`ZYYahjlmlg_XWYWUUWW\]_egkg`]^][\^]egbdb^]_^cjlkkmmlihc__^\]]]ehchebgdaghljjjjmlhgbeiimmmmmmkebgdaeecikhfa]^^ZXXUUWUUVVVVPPQQQUVY]\]^][][Z[```^bijec`edaggf`bdihbeklihijjlkkmmjjkebhijljkmmmmmmkfbhg`abdkmmmmlic`bc`djg_`_XUSSUW]_]]cggc`cjllibekjd[SIF?<5,+!"!""#$$%)0222210,#)$-5:DLQQRUW[dghhhca[SQJGFD at 7.--*)('''~'&&%%%%$$$####,1234443~~~/"#%"#%))*+-/56899999::;;<<<===>@BIMNOOOOOOPPPQRRRRRRRRTSTX`eefgkg^VLKKHHHHHHFEDCCCDDFFFFFHKLMMNONNNPQUWY^bafdaeebfilkkmlh`][ZZ\]YXXXZ[cdddb[[_^]\[]`fd[WW]\[[\cejjjljjmljchg`a_]WWX[\YWWVUUUXXY_^XX\ejhggebhgeda^__\[]\^__ab]]\WWTQJA?====<<<;;;;;;<<<<<<==>>>>>>>>>>>>>>=============>>>>>>>>>>>>???????@@@@AJT\cgcgebilmkh`a_^[\^_^ZXYY`a`hlmmkkl~~~ihe~~~affc\WW\djhfgdb`fedcc`WWX[djihijebge^URPOMGDDA:~~~60/-,~~~'%$$'&%$$'))*,1246~=FMPVXZ]YQHCA@???==<<<<<<<;;;<<<<<<<==>>ABCENSUTTVV[\YWWUUUZZ\\ad``[SPPQY^``\YY`daYVVXX[\XX\dig`bb`c`^\]]`hiikhbdf`a_bc\\[XQNKJIHEEECCA@????AAAA@=5332222222222211111111111246:;<======>>@AHJT]d`WVWWVWZZab`hlmmjif^VVUUVWW]__bjlia]]`cahklhb\ZWVTSTTTSSTUUUUWX\\XVV_daZZabahkjd`bdhilihhiihbdgbehilljf`\XTONMLLLLLLKKIEA=<::863..,++++*******))('
+p                               $$$%%%%%%%%%%%%%%%%%%&&&&''((()*+.016:?DHKMNNPPPQTVV[_``^a_\\\\_efhd`cdagd`ca^^cdcb__eijdb`c`ba]\]XYXXX[dhbYONNNNOQZ`ggaccaggbcb]\^]`^VTSSTSRPPOOOPPPRZ_acc\SPPONHFFFFFFMPWY_bffb]VRQPONLLLNORVW[aa`_^efaeiebghljjmmmlhffb__fkmlhfbZZ]ekjjibded__bijd_a`YYY[ab`ghbekjjjjmmmihd`baghcjmmjjjjljie_^``fdafc`cifd_[[YXXVVXWVUUSSSPNHE><52222222222000000000000111679<===========>>>?ABFGJKLT\bb_ZZ\\`hlkecahlkebikjf^\]afhkjjjjlkkljcigffgf``ehiiic__fijlkjlmlibdfacb_a`]^^^[[[_ehjeaeefhbegfchhgbZXXY^`^cfcgjkjcZWUTTUUW`deglmmlhffhljjlkdb_ba^^ccacahgeda^`b^^`_edbhijjjjjmmmkjkeaejjca_biic`]]_\YSJIHHHHHHHHHHHHHIQVVWVUUTQIG@?????@ADKPRSXYY[[YVUV[beiebgdaddacc_\\ZYZ`fkljca_billkebghijjjjjljijigc_abb~~~~~~~~~~~~~~~~~~~~~~~~~~cfkihc`cjlihiljkljjkebiijib]]]\_bgfd^XXZ^_^`_]acjlmmjc`a`^djjjkga`_bijjmmkdb``^c_\\[[\ab`dihgcZWUUW^dc`baaab`djlkklia^^XXXWVWZ`acfa_]\^_fhhe\YYZYahijlihillkhbbafe__cikibekmmmmjjiaXUUVVX^^cgilmmmmmmmjihcgc[Z]]`fgf_\YXXY[\_ca_^]_`_]]bca_adc_^[[YSQRRTWX`ffe`ZXYYac_abb`dijg`abdga^^afgiciiih`ZYYahjlmlg_XWYWUUWW\]_egkg`]^][\^]egbdb^]_^cjlkkmmlihc__^\]]]ehchebgdaghljjjjmlhgbeiimmmmmmkebgdaeecikhfa]^^ZXXUUWUUVVVVPPQQQUVY]\]^][][Z[```^bijec`edaggf`bdihbeklihijjlkkmmjjkebhijljkmmmmmmkfbhg`abdkmmmmlic`bc`djg_`_XUSSUW]_]]cggc`cjllibekjd[SIF?<5,+!"!""#$$%)0222210,#)$-5:DLQQRUW[dghhhca[SQJGFD at 7.--*)('''~'&&%%%%$$$####,1234443~~~/"#%"#%))*+-/56899999::;;<<<===>@BIMNOOOOOOPPPQRRRRRRRRTSTX`eefgkg^VLKKHHHHHHFEDCCCDDFFFFFHKLMMNONNNPQUWY^bafdaeebfilkkmlh`][ZZ\]YXXXZ[cdddb[[_^]\[]`fd[WW]\[[\cejjjljjmljchg`a_]WWX[\YWWVUUUXXY_^XX\ejhggebhgeda^__\[]\^__ab]]\WWTQJA?====<<<;;;;;;<<<<<<==>>>>>>>>>>>>>>=============>>>>>>>>>>>>???????@@@@AJT\cgcgebilmkh`a_^[\^_^ZXYY`a`hlmmkkl~~~ihe~~~affc\WW\djhfgdb`fedcc`WWX[djihijebge^URPOMGDDA:~~~60/-,~~~'%$$'&%$$'))*,1246~=FMPVXZ]YQHCA@???==<<<<<<<;;;<<<<<<<==>>ABCENSTTTUVZ[XVVUTTYZ\\ad``[SPPQY^``\YY`daYVVXX[\XX\dig`bb`c`^\]]`hiikhbdf`a_bc\\[XQNKJIHEEECCA@????AAAA@=5332222222222211111111111246:;<======>>@AHJT]d`WVWWVWZZab`hlmmjif^VVUUVWW]__bjlia]]`cahklhb\ZWVTSTTTSSTUUUUWX\\XVV_daZZabahkjd`bdhilihhiihbdgbehilljf`\XTONMLLLLLLKKIEA=<::863..,++++*******))('
 p                               """##################$$$$%%&&&'(),./48=BFIKLLNNNORTUY]`abefgjlmqvz|}~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|ytpooonnnoooonnnmlkiiheeedddcbbbbbbbbbbbcccdimrvz}~~~~~~~~~~~~~~~~~~~~~~~~~~|{zzz{|~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|wsnid`[ZZYYXWWVTSROLHCB@@@?????>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>???????@@ACDDFFIKLPTVWWWXZ[_dimprrw{}~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~zyyxvtpommmlllllllllllllllkiiheba`_______`dfghjkkmmnnnosx{~~~~~~~~~~~~~~}}}~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~}|}||~~~~~~~~~~~~~~~~~~~~~~~~~~~~}yxxxz{~~~~~~~~~~~~~~~~~|xwvutrqqqrttx|~~~}}~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~}zvusssrrssssrrssstuwxxyzzyyyyy}~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|{zvrnie`[ZXUTSSSSSSUUUVVVVVVVVVVVW[^`abbbbbbcccccccb`]\ZZYYXVSRQNJIIII~IIIHHHGFFFEEEDDDDDDDDD~~~DDDDDDDDDEEILNORTUVVVVVVVVVVVVVVVWWWXXXXXXXXXYZZZZZZZ[[[[\^```aaa```^^][[[[[ZZZZZZZZZZZZ[[\`cefffffghikmmprruwwz|}~~~~~~~~~~}yxxwuuuvwx}~~~~~~~~~~~~~~~~~~~~}|}~~~~~~~~~~~~~~~~~~}|}~~~}}||~~~~~~~~~~~~~~~~~~~~~|wvuqqqpponlhdca^^\YVQPPPPPPPPPPPPPPPPPPPPPQQQQQQQQQQQQQQQQQQQQQQQQQQQQRRRSSSSTUUUUUUUUUVVXX[]^cgknopsuvz~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~{wrmihhgggeb_ZVQPNKGBBAA?<<:7~~~4/.-,~~~++****************~**********************************************************+++++,,,,,,,,-.///////0000000000000000000000000000000///////...-,))'''''&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&''''''''''())++.001112223458:;@DIMQSTUUUVVY[[^`aehjkkklmnnooooonmmlllllllllllllmmmmmnnqrsssvxy~~~~~~~~~~~~~~~~~~~{{yvrnie`[VRMLJJIIIIIHGEB>:97653/+*(((''''&&&&&&&%#
 
 a


=====================================
makefile
=====================================
@@ -1,4 +1,4 @@
-CXXFLAGS = -msse4.1 -O3 -std=c++11 -pthread -DHAS_CXX_THREADS
+CXXFLAGS = -msse4 -O3 -std=c++11 -pthread -DHAS_CXX_THREADS
 all:
 	@cd src && $(MAKE) CXXFLAGS="$(CXXFLAGS)"
 


=====================================
src/Alignment.cc
=====================================
@@ -60,7 +60,8 @@ static bool isNext( const SegmentPair& x, const SegmentPair& y ){
 void Alignment::makeXdrop( Centroid& centroid,
 			   GreedyXdropAligner& greedyAligner, bool isGreedy,
 			   const uchar* seq1, const uchar* seq2, int globality,
-			   const ScoreMatrixRow* scoreMatrix, int smMax,
+			   const ScoreMatrixRow* scoreMatrix,
+			   int smMax, int smMin,
 			   const mcf::GapCosts& gap, int maxDrop,
 			   int frameshiftCost, size_t frameSize,
 			   const ScoreMatrixRow* pssm2,
@@ -87,7 +88,7 @@ void Alignment::makeXdrop( Centroid& centroid,
   std::vector<char>& columnAmbiguityCodes = extras.columnAmbiguityCodes;
   extend( blocks, columnAmbiguityCodes, centroid, greedyAligner, isGreedy,
 	  seq1, seq2, seed.beg1(), seed.beg2(), false, globality,
-	  scoreMatrix, smMax, maxDrop, gap, frameshiftCost,
+	  scoreMatrix, smMax, smMin, maxDrop, gap, frameshiftCost,
 	  frameSize, pssm2, sm2qual, qual1, qual2, alph,
 	  extras, gamma, outputType );
 
@@ -107,7 +108,7 @@ void Alignment::makeXdrop( Centroid& centroid,
   std::vector<char> forwardAmbiguities;
   extend( forwardBlocks, forwardAmbiguities, centroid, greedyAligner, isGreedy,
 	  seq1, seq2, seed.end1(), seed.end2(), true, globality,
-	  scoreMatrix, smMax, maxDrop, gap, frameshiftCost,
+	  scoreMatrix, smMax, smMin, maxDrop, gap, frameshiftCost,
 	  frameSize, pssm2, sm2qual, qual1, qual2, alph,
 	  extras, gamma, outputType );
 
@@ -279,8 +280,8 @@ void Alignment::extend( std::vector< SegmentPair >& chunks,
 			const uchar* seq1, const uchar* seq2,
 			size_t start1, size_t start2,
 			bool isForward, int globality,
-			const ScoreMatrixRow* sm, int smMax, int maxDrop,
-			const mcf::GapCosts& gap,
+			const ScoreMatrixRow* sm, int smMax, int smMin,
+			int maxDrop, const mcf::GapCosts& gap,
 			int frameshiftCost, size_t frameSize,
 			const ScoreMatrixRow* pssm2,
                         const TwoQualityScoreMatrix& sm2qual,
@@ -324,25 +325,38 @@ void Alignment::extend( std::vector< SegmentPair >& chunks,
     --start2;
   }
 
+  bool isSimdMatrix = (alph.size == 4 && !globality && gap.isAffine &&
+		       smMin >= SCHAR_MIN &&
+		       maxDrop + smMax * 2 - smMin < UCHAR_MAX);
+  for (int i = 0; i < 4; ++i)
+    for (int j = 0; j < 4; ++j)
+      if (sm[i][j] != sm[alph.numbersToLowercase[i]][j])
+	isSimdMatrix = false;
+
   int extensionScore =
-    isGreedy  ? greedyAligner.align( seq1 + start1, seq2 + start2,
-				     isForward, sm, maxDrop, alph.size )
-    : sm2qual ? aligner.align2qual( seq1 + start1, qual1 + start1,
-				    seq2 + start2, qual2 + start2,
-				    isForward, globality, sm2qual,
-				    del.openCost, del.growCost,
-				    ins.openCost, ins.growCost,
-				    gap.pairCost, maxDrop, smMax )
-    : pssm2   ? aligner.alignPssm( seq1 + start1, pssm2 + start2,
-				   isForward, globality,
+    isGreedy  ? greedyAligner.align(seq1 + start1, seq2 + start2,
+				    isForward, sm, maxDrop, alph.size)
+    : sm2qual ? aligner.align2qual(seq1 + start1, qual1 + start1,
+				   seq2 + start2, qual2 + start2,
+				   isForward, globality, sm2qual,
 				   del.openCost, del.growCost,
 				   ins.openCost, ins.growCost,
-				   gap.pairCost, maxDrop, smMax )
-    :           aligner.align( seq1 + start1, seq2 + start2,
-			       isForward, globality, sm,
-			       del.openCost, del.growCost,
-			       ins.openCost, ins.growCost,
-			       gap.pairCost, maxDrop, smMax );
+				   gap.pairCost, gap.isAffine, maxDrop, smMax)
+    : pssm2   ? aligner.alignPssm(seq1 + start1, pssm2 + start2,
+				  isForward, globality,
+				  del.openCost, del.growCost,
+				  ins.openCost, ins.growCost,
+				  gap.pairCost, gap.isAffine, maxDrop, smMax)
+    : isSimdMatrix ? aligner.alignDna(seq1 + start1, seq2 + start2,
+				      isForward, sm,
+				      del.openCost, del.growCost,
+				      ins.openCost, ins.growCost,
+				      maxDrop, smMax, alph.numbersToUppercase)
+    :           aligner.align(seq1 + start1, seq2 + start2,
+			      isForward, globality, sm,
+			      del.openCost, del.growCost,
+			      ins.openCost, ins.growCost,
+			      gap.pairCost, gap.isAffine, maxDrop, smMax);
 
   if( extensionScore == -INF ){
     score = -INF;  // avoid score overflow
@@ -356,6 +370,11 @@ void Alignment::extend( std::vector< SegmentPair >& chunks,
     if( isGreedy ){
       while( greedyAligner.getNextChunk( end1, end2, size ) )
 	chunks.push_back( SegmentPair( end1 - size, end2 - size, size ) );
+    } else if (isSimdMatrix && !pssm2 && !sm2qual) {
+      while (aligner.getNextChunkDna(end1, end2, size,
+				     del.openCost, del.growCost,
+				     ins.openCost, ins.growCost))
+	chunks.push_back(SegmentPair(end1 - size, end2 - size, size));
     }else{
       while( aligner.getNextChunk( end1, end2, size,
 				   del.openCost, del.growCost,


=====================================
src/Alignment.hh
=====================================
@@ -80,7 +80,7 @@ struct Alignment{
   void makeXdrop( Centroid& centroid,
 		  GreedyXdropAligner& greedyAligner, bool isGreedy,
 		  const uchar* seq1, const uchar* seq2, int globality,
-		  const ScoreMatrixRow* scoreMatrix, int smMax,
+		  const ScoreMatrixRow* scoreMatrix, int smMax, int smMin,
 		  const mcf::GapCosts& gap, int maxDrop,
 		  int frameshiftCost, size_t frameSize,
 		  const ScoreMatrixRow* pssm2,
@@ -133,7 +133,7 @@ struct Alignment{
 	       const uchar* seq1, const uchar* seq2,
 	       size_t start1, size_t start2,
 	       bool isForward, int globality,
-	       const ScoreMatrixRow* sm, int smMax, int maxDrop,
+	       const ScoreMatrixRow* sm, int smMax, int smMin, int maxDrop,
 	       const mcf::GapCosts& gap,
 	       int frameshiftCost, size_t frameSize,
 	       const ScoreMatrixRow* pssm2,


=====================================
src/GappedXdropAligner.cc
=====================================
@@ -54,25 +54,6 @@
 
 namespace cbrc {
 
-// Puts 2 "dummy" antidiagonals at the start, so that we can safely
-// look-back from subsequent antidiagonals.
-void GappedXdropAligner::init() {
-  scoreOrigins.resize(0);
-  scoreEnds.resize(1);
-
-  initAntidiagonal(0, xdropPadLen);
-  initAntidiagonal(0, xdropPadLen * 2);
-
-  const SimdInt mNegInf = simdSet1(-INF);
-  simdStore(&xScores[xdropPadLen], mNegInf);
-  simdStore(&yScores[0], mNegInf);
-  simdStore(&yScores[xdropPadLen], mNegInf);
-  simdStore(&zScores[0], mNegInf);
-  simdStore(&zScores[xdropPadLen], mNegInf);
-
-  bestAntidiagonal = 0;
-}
-
 int GappedXdropAligner::align(const uchar *seq1,
                               const uchar *seq2,
                               bool isForward,
@@ -83,103 +64,95 @@ int GappedXdropAligner::align(const uchar *seq1,
                               int insExistenceCost,
                               int insExtensionCost,
                               int gapUnalignedCost,
+			      bool isAffine,
                               int maxScoreDrop,
                               int maxMatchScore) {
-  const SimdInt mNegInf = simdSet1(-INF);
-  const SimdInt mDelOpenCost = simdSet1(delExistenceCost);
-  const SimdInt mDelGrowCost = simdSet1(delExtensionCost);
-  const SimdInt mInsOpenCost = simdSet1(insExistenceCost);
-  const SimdInt mInsGrowCost = simdSet1(insExtensionCost);
+  const SimdInt mNegInf = simdFill(-INF);
+  const SimdInt mDelOpenCost = simdFill(delExistenceCost);
+  const SimdInt mDelGrowCost = simdFill(delExtensionCost);
+  const SimdInt mInsOpenCost = simdFill(insExistenceCost);
+  const SimdInt mInsGrowCost = simdFill(insExtensionCost);
   const int seqIncrement = isForward ? 1 : -1;
-  const bool isAffine = isAffineGaps(delExistenceCost, delExtensionCost,
-				     insExistenceCost, insExtensionCost,
-				     gapUnalignedCost);
 
-  size_t maxSeq1begs[] = { 0, 9 };
-  size_t minSeq1ends[] = { 1, 0 };
+  size_t seq1beg = 0;
+  size_t seq1end = 1;
+  size_t diagPos = xdropPadLen - 1;
+  size_t horiPos = xdropPadLen * 2 - 1;
+  size_t thisPos = xdropPadLen * 2;
 
   int bestScore = 0;
-  SimdInt mBestScore = simdSet1(0);
+  SimdInt mBestScore = simdZero();
   int bestEdgeScore = -INF;
   size_t bestEdgeAntidiagonal = 0;
 
   init();
-  seq1queue.clear();
+  pssmQueue.clear();
   seq2queue.clear();
 
-  for (int i = 0; i < simdLen-1; ++i) {
-    const int *scores = scorer[*seq1];
-    seq1queue.push(scores, 0);
-    seq1 += seqIncrement * !isDelimiter(0, scores);
-    seq2queue.push(0, 0);
-  }
+  bool isDelimiter1 = isDelimiter(0, scorer[*seq1]);
+  bool isDelimiter2 = isDelimiter(*seq2, scorer[0]);
 
-  for (size_t antidiagonal = 0; /* noop */; ++antidiagonal) {
-    size_t seq1beg = std::min(maxSeq1begs[0], maxSeq1begs[1]);
-    size_t seq1end = std::max(minSeq1ends[0], minSeq1ends[1]);
+  for (int i = 0; i < simdLen; ++i) {
+    const int *x = scorer[*seq1];
+    pssmQueue.push(x, i);
+    seq1 += seqIncrement * !isDelimiter(0, x);
+    seq2queue.push(*seq2, i);
+  }
 
-    if (seq1beg >= seq1end) break;
+  seq2 += seqIncrement;
 
-    size_t scoreEnd = scoreEnds.back();
+  for (size_t antidiagonal = 0; /* noop */; ++antidiagonal) {
     int numCells = seq1end - seq1beg;
-
-    initAntidiagonal(seq1end, scoreEnd + xdropPadLen + numCells);
-
-    if (seq1end + (simdLen-1) > seq1queue.size()) {
-      const int *scores = scorer[*seq1];
-      seq1queue.push(scores, seq1beg);
-      seq1 += seqIncrement * !isDelimiter(0, scores);
-    }
-    const const_int_ptr *s1 = &seq1queue[seq1beg];
-
-    size_t seq2pos = antidiagonal - seq1beg;
-    if (seq2pos + simdLen > seq2queue.size()) {
-      seq2queue.push(*seq2, seq2pos - numCells + 1);
-      seq2 += seqIncrement;
+    int n = numCells - 1;
+
+    const const_int_ptr *s1 = &pssmQueue.fromEnd(n + simdLen);
+    const uchar *s2 = seq2queue.begin();
+
+    initAntidiagonal(seq1end, thisPos, numCells);
+    thisPos += xdropPadLen;
+    Score *x0 = &xScores[thisPos];
+    Score *y0 = &yScores[thisPos];
+    Score *z0 = &zScores[thisPos];
+    const Score *y1 = &yScores[horiPos];
+    const Score *z1 = &zScores[horiPos + 1];
+    const Score *x2 = &xScores[diagPos];
+
+    if (!globality && (isDelimiter1 || isDelimiter2)) {
+      updateMaxScoreDrop(maxScoreDrop, n, maxMatchScore);
     }
-    const uchar *s2 = &seq2queue[seq2pos + (simdLen-1)];
-
-    if (!globality && isDelimiter(*s2, *scorer))
-      updateMaxScoreDrop(maxScoreDrop, numCells, maxMatchScore);
 
     int minScore = bestScore - maxScoreDrop;
-    SimdInt mMinScore = simdSet1(minScore);
-
-    Score *x0 = &xScores[scoreEnd];
-    Score *y0 = &yScores[scoreEnd];
-    Score *z0 = &zScores[scoreEnd];
-    const Score *y1 = &yScores[hori(antidiagonal, seq1beg)];
-    const Score *z1 = &zScores[vert(antidiagonal, seq1beg)];
-    const Score *x2 = &xScores[diag(antidiagonal, seq1beg)];
-
-    simdStore(x0, mNegInf);  x0 += xdropPadLen;
-    simdStore(y0, mNegInf);  y0 += xdropPadLen;
-    simdStore(z0, mNegInf);  z0 += xdropPadLen;
+    SimdInt mMinScore = simdFill(minScore);
 
-    if (globality && isDelimiter(*s2, *scorer)) {
-      const Score *z2 = &zScores[diag(antidiagonal, seq1beg)];
+    if (globality && isDelimiter2) {
+      const Score *z2 = &zScores[diagPos];
       int b = maxValue(x2[0], z1[0]-insExtensionCost, z2[0]-gapUnalignedCost);
-      if (b >= minScore)
-	updateBest1(bestEdgeScore, bestEdgeAntidiagonal, bestSeq1position,
-		    b, antidiagonal, seq1beg);
+      updateBest1(bestEdgeScore, bestEdgeAntidiagonal, bestSeq1position,
+		  minScore, b, antidiagonal, seq1beg);
+    }
+
+    if (globality && isDelimiter1) {
+      const Score *y2 = &yScores[diagPos];
+      int b = maxValue(x2[n], y1[n]-delExtensionCost, y2[n]-gapUnalignedCost);
+      updateBest1(bestEdgeScore, bestEdgeAntidiagonal, bestSeq1position,
+		  minScore, b, antidiagonal, seq1end-1);
     }
 
     if (isAffine) {
       for (int i = 0; i < numCells; i += simdLen) {
 	SimdInt s = simdSet(
 #ifdef __SSE4_1__
-//#ifdef __AVX2__
-#ifdef WANT_AVX2
-			    s1[7][s2[-7]],
-			    s1[6][s2[-6]],
-			    s1[5][s2[-5]],
-			    s1[4][s2[-4]],
+#ifdef __AVX2__
+			    s1[7][s2[7]],
+			    s1[6][s2[6]],
+			    s1[5][s2[5]],
+			    s1[4][s2[4]],
 #endif
-			    s1[3][s2[-3]],
-			    s1[2][s2[-2]],
-			    s1[1][s2[-1]],
+			    s1[3][s2[3]],
+			    s1[2][s2[2]],
+			    s1[1][s2[1]],
 #endif
-			    s1[0][s2[-0]]);
+			    s1[0][s2[0]]);
 	SimdInt x = simdLoad(x2+i);
 	SimdInt y = simdSub(simdLoad(y1+i), mDelGrowCost);
 	SimdInt z = simdSub(simdLoad(z1+i), mInsGrowCost);
@@ -190,7 +163,7 @@ int GappedXdropAligner::align(const uchar *seq1,
 	simdStore(y0+i, simdMax(simdSub(b, mDelOpenCost), y));
 	simdStore(z0+i, simdMax(simdSub(b, mInsOpenCost), z));
 	s1 += simdLen;
-	s2 -= simdLen;
+	s2 += simdLen;
       }
 
       int newBestScore = simdHorizontalMax(mBestScore);
@@ -199,8 +172,8 @@ int GappedXdropAligner::align(const uchar *seq1,
 	bestAntidiagonal = antidiagonal;
       }
     } else {
-      const Score *y2 = &yScores[diag(antidiagonal, seq1beg)];
-      const Score *z2 = &zScores[diag(antidiagonal, seq1beg)];
+      const Score *y2 = &yScores[diagPos];
+      const Score *z2 = &zScores[diagPos];
       for (int i = 0; i < numCells; ++i) {
         int x = x2[i];
         int y = maxValue(y1[i] - delExtensionCost, y2[i] - gapUnalignedCost);
@@ -217,26 +190,33 @@ int GappedXdropAligner::align(const uchar *seq1,
         }
         else x0[i] = y0[i] = z0[i] = -INF;
 	++s1;
-	--s2;
+	++s2;
       }
     }
 
-    const int *seq1back = seq1queue[seq1end - 1];
+    diagPos = horiPos;
+    horiPos = thisPos - 1;
+    thisPos += numCells;
 
-    if (globality && isDelimiter(0, seq1back)) {
-      const Score *y2 = &yScores[diag(antidiagonal, seq1beg)];
-      int n = numCells - 1;
-      int b = maxValue(x2[n], y1[n]-delExtensionCost, y2[n]-gapUnalignedCost);
-      if (b >= minScore)
-	updateBest1(bestEdgeScore, bestEdgeAntidiagonal, bestSeq1position,
-		    b, antidiagonal, seq1end-1);
+    if (x0[n] > -INF / 2) {
+      ++seq1end;
+      const int *x = scorer[*seq1];
+      pssmQueue.push(x, n + simdLen);
+      seq1 += seqIncrement * !isDelimiter(0, x);
+      isDelimiter1 = isDelimiter(0, pssmQueue.fromEnd(simdLen));
     }
 
-    if (!globality && isDelimiter(0, seq1back))
-      updateMaxScoreDrop(maxScoreDrop, numCells, maxMatchScore);
-
-    const Score *x0base = x0 - seq1beg;
-    updateFiniteEdges(maxSeq1begs, minSeq1ends, x0base, x0+numCells, numCells);
+    if (x0[0] > -INF / 2) {
+      uchar y = *seq2;
+      seq2queue.push(y, n + simdLen);
+      seq2 += seqIncrement;
+      isDelimiter2 = isDelimiter(y, scorer[0]);
+    } else {
+      ++seq1beg;
+      ++diagPos;
+      ++horiPos;
+      if (seq1beg == seq1end) break;
+    }
   }
 
   if (globality) {


=====================================
src/GappedXdropAligner.hh
=====================================
@@ -46,6 +46,7 @@
 #define GAPPED_XDROP_ALIGNER_HH
 
 #include "mcf_contiguous_queue.hh"
+#include "mcf_reverse_queue.hh"
 #include "mcf_simd.hh"
 #include "ScoreMatrixRow.hh"
 
@@ -61,10 +62,13 @@ typedef unsigned char uchar;
 typedef const int *const_int_ptr;
 
 typedef int Score;
+typedef uchar TinyScore;
 
 class TwoQualityScoreMatrix;
 
-const int xdropPadLen = simdLen;
+const int xdropPadLen = simdBytes;
+
+const int droppedTinyScore = UCHAR_MAX;
 
 class GappedXdropAligner {
  public:
@@ -78,6 +82,7 @@ class GappedXdropAligner {
 	    int insExistenceCost,
 	    int insExtensionCost,
             int gapUnalignedCost,
+	    bool isAffine,
             int maxScoreDrop,
             int maxMatchScore);
 
@@ -91,6 +96,7 @@ class GappedXdropAligner {
 		int insExistenceCost,
 		int insExtensionCost,
                 int gapUnalignedCost,
+		bool isAffine,
                 int maxScoreDrop,
                 int maxMatchScore);
 
@@ -107,9 +113,26 @@ class GappedXdropAligner {
 		 int insExistenceCost,
 		 int insExtensionCost,
                  int gapUnalignedCost,
+		 bool isAffine,
                  int maxScoreDrop,
                  int maxMatchScore);
 
+  // Like "align", but maybe faster for DNA.  Assumes that
+  // scorer[i<4][j<4] fits in signed char.  Each sequence element is
+  // first mapped through "toUnmasked".  If an unmasked sequence
+  // element >= 4 appears, alignDna falls back to a slower algorithm.
+  int alignDna(const uchar *seq1,
+	       const uchar *seq2,
+	       bool isForward,
+	       const ScoreMatrixRow *scorer,
+	       int delExistenceCost,
+	       int delExtensionCost,
+	       int insExistenceCost,
+	       int insExtensionCost,
+	       int maxScoreDrop,
+	       int maxMatchScore,
+	       const uchar *toUnmasked);
+
   // Call this repeatedly to get each gapless chunk of the alignment.
   // The chunks are returned in far-to-near order.  The chunk's end
   // coordinates in each sequence (relative to the start of extension)
@@ -125,6 +148,15 @@ class GappedXdropAligner {
 		    int insExtensionCost,
                     int gapUnalignedCost);
 
+  // After "alignDna", must use this instead of "getNextChunk"
+  bool getNextChunkDna(size_t &end1,
+		       size_t &end2,
+		       size_t &length,
+		       int delExistenceCost,
+		       int delExtensionCost,
+		       int insExistenceCost,
+		       int insExtensionCost);
+
   // Like "align", but it aligns a protein sequence to a DNA sequence.
   // The DNA should be provided as 3 protein sequences, one for each
   // reading frame.  seq2frame0 is the in-frame start point.
@@ -225,8 +257,9 @@ class GappedXdropAligner {
   std::vector<size_t> scoreOrigins;  // score origin for each antidiagonal
   std::vector<size_t> scoreEnds;  // score end pos for each antidiagonal
 
-  ContiguousQueue<const int *> seq1queue;
-  ContiguousQueue<uchar> seq2queue;
+  ContiguousQueue<const int *> pssmQueue;
+  ContiguousQueue<uchar> seq1queue;
+  ReverseQueue<uchar> seq2queue;
 
   // Our position during the trace-back:
   size_t bestAntidiagonal;
@@ -240,9 +273,31 @@ class GappedXdropAligner {
     }
   }
 
-  void init();
+  void initAntidiagonal(size_t seq1end, size_t thisEnd, int numCells) {
+    const SimdInt mNegInf = simdFill(-INF);
+    size_t nextEnd = thisEnd + xdropPadLen + numCells;
+    scoreEnds.push_back(nextEnd);
+    scoreOrigins.push_back(nextEnd - seq1end);
+    resizeScoresIfSmaller(nextEnd + (simdLen-1));
+    for (int i = 0; i < xdropPadLen; i += simdLen) {
+      simdStore(&xScores[thisEnd + i], mNegInf);
+      simdStore(&yScores[thisEnd + i], mNegInf);
+      simdStore(&zScores[thisEnd + i], mNegInf);
+    }
+  }
 
-  void initAntidiagonal(size_t seq1end, size_t scoreEnd) {
+  // Puts 2 "dummy" antidiagonals at the start, so that we can safely
+  // look-back from subsequent antidiagonals
+  void init() {
+    scoreOrigins.resize(0);
+    scoreEnds.resize(1);
+    initAntidiagonal(0, 0, 0);
+    initAntidiagonal(0, xdropPadLen, 0);
+    xScores[xdropPadLen - 1] = 0;
+    bestAntidiagonal = 0;
+  }
+
+  void initAntidiagonal3(size_t seq1end, size_t scoreEnd) {
     scoreOrigins.push_back(scoreEnd - seq1end);
     resizeScoresIfSmaller(scoreEnd + (simdLen-1));
     scoreEnds.push_back(scoreEnd);
@@ -260,6 +315,53 @@ class GappedXdropAligner {
   }
 
   void init3();
+
+  // Everything below here is for alignDna & getNextChunkDna
+
+  std::vector<TinyScore> xTinyScores;
+  std::vector<TinyScore> yTinyScores;
+  std::vector<TinyScore> zTinyScores;
+
+  std::vector<int> scoreRises;  // increase of best score, per antidiagonal
+
+  void resizeTinyScoresIfSmaller(size_t size) {
+    if (xTinyScores.size() < size) {
+      xTinyScores.resize(size);
+      yTinyScores.resize(size);
+      zTinyScores.resize(size);
+    }
+  }
+
+  void initAntidiagonalTiny(size_t seq1end, size_t thisEnd, int numCells) {
+    const SimdInt mNegInf = simdOnes();
+    size_t nextEnd = thisEnd + xdropPadLen + numCells;
+    scoreEnds.push_back(nextEnd);
+    scoreOrigins.push_back(nextEnd - seq1end);
+    resizeTinyScoresIfSmaller(nextEnd + (simdBytes-1));
+    simdStore(&xTinyScores[thisEnd], mNegInf);
+    simdStore(&yTinyScores[thisEnd], mNegInf);
+    simdStore(&zTinyScores[thisEnd], mNegInf);
+  }
+
+  void initTiny(int scoreOffset) {
+    scoreOrigins.resize(0);
+    scoreEnds.resize(1);
+    initAntidiagonalTiny(0, 0, 0);
+    initAntidiagonalTiny(0, xdropPadLen, 0);
+    xTinyScores[xdropPadLen - 1] = scoreOffset;
+    bestAntidiagonal = 0;
+    scoreRises.resize(2);
+  }
+
+  void calcBestSeq1positionTiny(int scoreOffset) {
+    size_t seq1beg = seq1start(bestAntidiagonal);
+    const TinyScore *x2 = &xTinyScores[diag(bestAntidiagonal, seq1beg)];
+    const TinyScore *x2beg = x2;
+    int target = scoreOffset - scoreRises[bestAntidiagonal] -
+      scoreRises[bestAntidiagonal + 1] - scoreRises[bestAntidiagonal + 2];
+    while (*x2 != target) ++x2;
+    bestSeq1position = x2 - x2beg + seq1beg;
+  }
 };
 
 }


=====================================
src/GappedXdropAligner2qual.cc
=====================================
@@ -22,15 +22,16 @@ int GappedXdropAligner::align2qual(const uchar *seq1,
 				   int insExistenceCost,
 				   int insExtensionCost,
                                    int gapUnalignedCost,
+				   bool isAffine,
                                    int maxScoreDrop,
                                    int maxMatchScore) {
-  const SimdInt mNegInf = simdSet1(-INF);
-  const bool isAffine = isAffineGaps(delExistenceCost, delExtensionCost,
-				     insExistenceCost, insExtensionCost,
-				     gapUnalignedCost);
+  const int seqIncrement = isForward ? 1 : -1;
 
-  size_t maxSeq1begs[] = { 0, 9 };
-  size_t minSeq1ends[] = { 1, 0 };
+  size_t seq1beg = 0;
+  size_t seq1end = 1;
+  size_t diagPos = xdropPadLen - 1;
+  size_t horiPos = xdropPadLen * 2 - 1;
+  size_t thisPos = xdropPadLen * 2;
 
   int bestScore = 0;
   int bestEdgeScore = -INF;
@@ -39,15 +40,8 @@ int GappedXdropAligner::align2qual(const uchar *seq1,
   init();
 
   for (size_t antidiagonal = 0; /* noop */; ++antidiagonal) {
-    size_t seq1beg = std::min(maxSeq1begs[0], maxSeq1begs[1]);
-    size_t seq1end = std::max(minSeq1ends[0], minSeq1ends[1]);
-
-    if (seq1beg >= seq1end) break;
-
-    size_t scoreEnd = scoreEnds.back();
-    size_t numCells = seq1end - seq1beg;
-
-    initAntidiagonal(seq1end, scoreEnd + xdropPadLen + numCells);
+    int numCells = seq1end - seq1beg;
+    int n = numCells - 1;
 
     size_t seq2pos = antidiagonal - seq1beg;
 
@@ -56,33 +50,40 @@ int GappedXdropAligner::align2qual(const uchar *seq1,
     const uchar *s2 = isForward ? seq2 + seq2pos : seq2 - seq2pos;
     const uchar *q2 = isForward ? qual2 + seq2pos : qual2 - seq2pos;
 
-    if (!globality && isDelimiter2qual(*s2))
-      updateMaxScoreDrop(maxScoreDrop, numCells, maxMatchScore);
+    initAntidiagonal(seq1end, thisPos, numCells);
+    thisPos += xdropPadLen;
+    Score *x0 = &xScores[thisPos];
+    Score *y0 = &yScores[thisPos];
+    Score *z0 = &zScores[thisPos];
+    const Score *y1 = &yScores[horiPos];
+    const Score *z1 = &zScores[horiPos + 1];
+    const Score *x2 = &xScores[diagPos];
+
+    bool isDelimiter1 = isDelimiter2qual(s1[n * seqIncrement]);
+    bool isDelimiter2 = isDelimiter2qual(s2[0]);
+
+    if (!globality && (isDelimiter1 || isDelimiter2)) {
+      updateMaxScoreDrop(maxScoreDrop, n, maxMatchScore);
+    }
 
     int minScore = bestScore - maxScoreDrop;
 
-    Score *x0 = &xScores[scoreEnd];
-    Score *y0 = &yScores[scoreEnd];
-    Score *z0 = &zScores[scoreEnd];
-    const Score *y1 = &yScores[hori(antidiagonal, seq1beg)];
-    const Score *z1 = &zScores[vert(antidiagonal, seq1beg)];
-    const Score *x2 = &xScores[diag(antidiagonal, seq1beg)];
-
-    simdStore(x0, mNegInf);  x0 += xdropPadLen;
-    simdStore(y0, mNegInf);  y0 += xdropPadLen;
-    simdStore(z0, mNegInf);  z0 += xdropPadLen;
-
-    const Score *x0last = x0 + numCells - 1;
-    const Score *x0base = x0 - seq1beg;
-
-    if (globality && isDelimiter2qual(*s2)) {
-      const Score *z2 = &zScores[diag(antidiagonal, seq1beg)];
-      int b = maxValue(*x2, *z1 - insExtensionCost, *z2 - gapUnalignedCost);
-      if (b >= minScore)
-	updateBest1(bestEdgeScore, bestEdgeAntidiagonal, bestSeq1position,
-		    b, antidiagonal, seq1beg);
+    if (globality && isDelimiter2) {
+      const Score *z2 = &zScores[diagPos];
+      int b = maxValue(x2[0], z1[0]-insExtensionCost, z2[0]-gapUnalignedCost);
+      updateBest1(bestEdgeScore, bestEdgeAntidiagonal, bestSeq1position,
+		  minScore, b, antidiagonal, seq1beg);
     }
 
+    if (globality && isDelimiter1) {
+      const Score *y2 = &yScores[diagPos];
+      int b = maxValue(x2[n], y1[n]-delExtensionCost, y2[n]-gapUnalignedCost);
+      updateBest1(bestEdgeScore, bestEdgeAntidiagonal, bestSeq1position,
+		  minScore, b, antidiagonal, seq1end-1);
+    }
+
+    const Score *x0last = x0 + n;
+
     if (isAffine) {
       if (isForward)
         while (1) {
@@ -123,8 +124,8 @@ int GappedXdropAligner::align2qual(const uchar *seq1,
           --s1;  --q1;  ++s2;  ++q2;  ++x0;  ++y0;  ++z0;  ++y1;  ++z1;  ++x2;
         }
     } else {
-      const Score *y2 = &yScores[diag(antidiagonal, seq1beg)];
-      const Score *z2 = &zScores[diag(antidiagonal, seq1beg)];
+      const Score *y2 = &yScores[diagPos];
+      const Score *z2 = &zScores[diagPos];
       while (1) {
         int x = *x2;
         int y = maxValue(*y1 - delExtensionCost, *y2 - gapUnalignedCost);
@@ -142,23 +143,25 @@ int GappedXdropAligner::align2qual(const uchar *seq1,
         else *x0 = *y0 = *z0 = -INF;
         if (x0 == x0last) break;
         ++x0;  ++y0;  ++z0;  ++y1;  ++z1;  ++x2;  ++y2;  ++z2;
-        if (isForward) { ++s1;  ++q1;  --s2;  --q2; }
-        else           { --s1;  --q1;  ++s2;  ++q2; }
+	s1 += seqIncrement;  q1 += seqIncrement;
+	s2 -= seqIncrement;  q2 -= seqIncrement;
       }
     }
 
-    if (globality && isDelimiter2qual(*s1)) {
-      const Score *y2 = &yScores[diag(antidiagonal, seq1end-1)];
-      int b = maxValue(*x2, *y1 - delExtensionCost, *y2 - gapUnalignedCost);
-      if (b >= minScore)
-	updateBest1(bestEdgeScore, bestEdgeAntidiagonal, bestSeq1position,
-		    b, antidiagonal, seq1end-1);
-    }
+    diagPos = horiPos;
+    horiPos = thisPos - 1;
+    thisPos += numCells;
 
-    if (!globality && isDelimiter2qual(*s1))
-      updateMaxScoreDrop(maxScoreDrop, numCells, maxMatchScore);
+    if (x0[0] > -INF / 2) {
+      ++seq1end;
+    }
 
-    updateFiniteEdges(maxSeq1begs, minSeq1ends, x0base, x0 + 1, numCells);
+    if (x0[-n] <= -INF / 2) {
+      ++seq1beg;
+      ++diagPos;
+      ++horiPos;
+      if (seq1beg >= seq1end) break;
+    }
   }
 
   if (globality) {


=====================================
src/GappedXdropAligner3frame.cc
=====================================
@@ -62,13 +62,13 @@ void GappedXdropAligner::init3() {
   scoreOrigins.resize(0);
   scoreEnds.resize(1);
 
-  initAntidiagonal(0, 2);
-  initAntidiagonal(0, 4);
-  initAntidiagonal(0, 6);
-  initAntidiagonal(0, 8);
-  initAntidiagonal(0, 10);
-  initAntidiagonal(0, 12);
-  initAntidiagonal(0, 14);
+  initAntidiagonal3(0, 2);
+  initAntidiagonal3(0, 4);
+  initAntidiagonal3(0, 6);
+  initAntidiagonal3(0, 8);
+  initAntidiagonal3(0, 10);
+  initAntidiagonal3(0, 12);
+  initAntidiagonal3(0, 14);
 
   std::fill_n(xScores.begin(), 14, -INF);
   std::fill_n(yScores.begin(), 14, -INF);
@@ -122,7 +122,7 @@ int GappedXdropAligner::align3(const uchar *seq1,
     size_t scoreEnd = scoreEnds.back();
     size_t numCells = seq1end - seq1beg;
 
-    initAntidiagonal(seq1end, scoreEnd + numCells + 2);  // + 2 pad cells
+    initAntidiagonal3(seq1end, scoreEnd + numCells + 2);  // + 2 pad cells
 
     const uchar *seq2 =
         whichFrame(antidiagonal, seq2frame0, seq2frame1, seq2frame2);
@@ -220,7 +220,7 @@ int GappedXdropAligner::align3(const uchar *seq1,
     }
 
     if (isDelimiter(*s1, *scorer))
-      updateMaxScoreDrop(maxScoreDrop, numCells, maxMatchScore);
+      updateMaxScoreDrop(maxScoreDrop, numCells-1, maxMatchScore);
 
     updateFiniteEdges3(maxSeq1begs, minSeq1ends, x0base, x0 + 1, numCells);
   }


=====================================
src/GappedXdropAligner3framePssm.cc
=====================================
@@ -34,7 +34,7 @@ int GappedXdropAligner::align3pssm(const uchar *seq,
     size_t scoreEnd = scoreEnds.back();
     size_t numCells = seq1end - seq1beg;
 
-    initAntidiagonal(seq1end, scoreEnd + numCells + 2);  // + 2 pad cells
+    initAntidiagonal3(seq1end, scoreEnd + numCells + 2);  // + 2 pad cells
 
     const ScoreMatrixRow *pssm =
         whichFrame(antidiagonal, pssmFrame0, pssmFrame1, pssmFrame2);
@@ -132,7 +132,7 @@ int GappedXdropAligner::align3pssm(const uchar *seq,
     }
 
     if (isDelimiter(*s1, *pssmFrame2) && isDelimiter(*s1, *pssmFrame0))
-      updateMaxScoreDrop(maxScoreDrop, numCells, maxMatchScore);
+      updateMaxScoreDrop(maxScoreDrop, numCells-1, maxMatchScore);
 
     updateFiniteEdges3(maxSeq1begs, minSeq1ends, x0base, x0 + 1, numCells);
   }


=====================================
src/GappedXdropAlignerDna.cc
=====================================
@@ -0,0 +1,275 @@
+// Author: Martin C. Frith 2019
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include "GappedXdropAligner.hh"
+#include "GappedXdropAlignerInl.hh"
+
+//#include <iostream>  // for debugging
+
+namespace cbrc {
+
+const int seqLoadLen = simdBytes;
+
+const int delimiter = 4;
+
+int GappedXdropAligner::alignDna(const uchar *seq1,
+				 const uchar *seq2,
+				 bool isForward,
+				 const ScoreMatrixRow *scorer,
+				 int delOpenCost,
+				 int delGrowCost,
+				 int insOpenCost,
+				 int insGrowCost,
+				 int maxScoreDrop,
+				 int maxMatchScore,
+				 const uchar *toUnmasked) {
+  int badScoreDrop = maxScoreDrop + 1;
+
+  delGrowCost = std::min(delGrowCost, badScoreDrop);
+  delOpenCost = std::min(delOpenCost, badScoreDrop - delGrowCost);
+
+  insGrowCost = std::min(insGrowCost, badScoreDrop);
+  insOpenCost = std::min(insOpenCost, badScoreDrop - insGrowCost);
+
+  const SimdInt mNegInf = simdOnes();
+  const SimdInt mDelOpenCost = simdFill1(delOpenCost);
+  const SimdInt mDelGrowCost = simdFill1(delGrowCost);
+  const SimdInt mInsOpenCost = simdFill1(insOpenCost);
+  const SimdInt mInsGrowCost = simdFill1(insGrowCost);
+  const int seqIncrement = isForward ? 1 : -1;
+  const int scoreOffset = maxMatchScore * 2;
+
+  const SimdInt scorer4x4 =
+    simdSet1(
+#ifdef __AVX2__
+		 scorer[3][3], scorer[3][2], scorer[3][1], scorer[3][0],
+		 scorer[2][3], scorer[2][2], scorer[2][1], scorer[2][0],
+		 scorer[1][3], scorer[1][2], scorer[1][1], scorer[1][0],
+		 scorer[0][3], scorer[0][2], scorer[0][1], scorer[0][0],
+#endif
+		 scorer[3][3], scorer[3][2], scorer[3][1], scorer[3][0],
+		 scorer[2][3], scorer[2][2], scorer[2][1], scorer[2][0],
+		 scorer[1][3], scorer[1][2], scorer[1][1], scorer[1][0],
+		 scorer[0][3], scorer[0][2], scorer[0][1], scorer[0][0]);
+
+  size_t seq1beg = 0;
+  size_t seq1end = 1;
+  size_t diagPos = xdropPadLen - 1;
+  size_t horiPos = xdropPadLen * 2 - 1;
+  size_t thisPos = xdropPadLen * 2;
+
+  int bestScore = 0;
+  SimdInt mBestScore = mNegInf;
+  SimdInt mBadScore = simdFill1(scoreOffset + badScoreDrop);
+  SimdInt mScoreRise1 = simdZero();
+  SimdInt mScoreRise2 = simdZero();
+
+  initTiny(scoreOffset);
+  seq1queue.clear();
+  seq2queue.clear();
+
+  bool isDna = (toUnmasked[*seq1] < 4 && toUnmasked[*seq2] < 4);
+
+  for (int i = 0; i < seqLoadLen; ++i) {
+    uchar x = toUnmasked[*seq1];
+    seq1queue.push(x, i);
+    seq1 += seqIncrement * (x != delimiter);
+    seq2queue.push(toUnmasked[*seq2], i);
+  }
+
+  seq2 += seqIncrement;
+
+  for (size_t antidiagonal = 0; /* noop */; ++antidiagonal) {
+    int numCells = seq1end - seq1beg;
+    int n = numCells - 1;
+
+    const uchar *s1 = &seq1queue.fromEnd(n + seqLoadLen);
+    const uchar *s2 = seq2queue.begin();
+
+    initAntidiagonalTiny(seq1end, thisPos, numCells);
+    thisPos += xdropPadLen;
+    TinyScore *x0 = &xTinyScores[thisPos];
+    TinyScore *y0 = &yTinyScores[thisPos];
+    TinyScore *z0 = &zTinyScores[thisPos];
+    const TinyScore *y1 = &yTinyScores[horiPos];
+    const TinyScore *z1 = &zTinyScores[horiPos + 1];
+    const TinyScore *x2 = &xTinyScores[diagPos];
+
+    const SimdInt mScoreRise12 = simdAdd1(mScoreRise1, mScoreRise2);
+    const SimdInt mDelGrowCost1 = simdAdd1(mDelGrowCost, mScoreRise1);
+    const SimdInt mInsGrowCost1 = simdAdd1(mInsGrowCost, mScoreRise1);
+
+    if (isDna) {
+      for (int i = 0; i < numCells; i += simdBytes) {
+	SimdInt fwd1 = simdLoad(s1+i);
+	SimdInt rev2 = simdLoad(s2+i);
+	SimdInt j = simdOr(simdLeft(fwd1, 2), rev2);
+	SimdInt s = simdChoose1(scorer4x4, j);
+	SimdInt x = simdAdds1(simdLoad(x2+i), mScoreRise12);
+	SimdInt y = simdAdds1(simdLoad(y1+i), mDelGrowCost1);
+	SimdInt z = simdAdds1(simdLoad(z1+i), mInsGrowCost1);
+	SimdInt b = simdMin1(simdMin1(x, y), z);
+	SimdInt isDrop = simdEq1(simdMin1(b, mBadScore), mBadScore);
+	mBestScore = simdMin1(b, mBestScore);
+	simdStore(x0+i, simdOr(simdSub1(b, s), isDrop));
+	simdStore(y0+i, simdMin1(simdAdds1(b, mDelOpenCost), y));
+	simdStore(z0+i, simdMin1(simdAdds1(b, mInsOpenCost), z));
+      }
+    } else {
+      bool isDelimiter1 = (s1[n] == delimiter);
+      bool isDelimiter2 = (s2[0] == delimiter);
+      if (isDelimiter1 || isDelimiter2) {
+	badScoreDrop = std::min(badScoreDrop, n * maxMatchScore);
+	mBadScore = simdFill1(scoreOffset + badScoreDrop);
+      }
+
+      for (int i = 0; i < numCells; i += simdBytes) {
+	SimdInt s = simdSet1(
+#ifdef __SSE4_1__
+#ifdef __AVX2__
+			     scorer[s1[31]][s2[31]],
+			     scorer[s1[30]][s2[30]],
+			     scorer[s1[29]][s2[29]],
+			     scorer[s1[28]][s2[28]],
+			     scorer[s1[27]][s2[27]],
+			     scorer[s1[26]][s2[26]],
+			     scorer[s1[25]][s2[25]],
+			     scorer[s1[24]][s2[24]],
+			     scorer[s1[23]][s2[23]],
+			     scorer[s1[22]][s2[22]],
+			     scorer[s1[21]][s2[21]],
+			     scorer[s1[20]][s2[20]],
+			     scorer[s1[19]][s2[19]],
+			     scorer[s1[18]][s2[18]],
+			     scorer[s1[17]][s2[17]],
+			     scorer[s1[16]][s2[16]],
+#endif
+			     scorer[s1[15]][s2[15]],
+			     scorer[s1[14]][s2[14]],
+			     scorer[s1[13]][s2[13]],
+			     scorer[s1[12]][s2[12]],
+			     scorer[s1[11]][s2[11]],
+			     scorer[s1[10]][s2[10]],
+			     scorer[s1[9]][s2[9]],
+			     scorer[s1[8]][s2[8]],
+			     scorer[s1[7]][s2[7]],
+			     scorer[s1[6]][s2[6]],
+			     scorer[s1[5]][s2[5]],
+			     scorer[s1[4]][s2[4]],
+			     scorer[s1[3]][s2[3]],
+			     scorer[s1[2]][s2[2]],
+			     scorer[s1[1]][s2[1]],
+#endif
+			     scorer[s1[0]][s2[0]]);
+
+	SimdInt x = simdAdds1(simdLoad(x2+i), mScoreRise12);
+	SimdInt y = simdAdds1(simdLoad(y1+i), mDelGrowCost1);
+	SimdInt z = simdAdds1(simdLoad(z1+i), mInsGrowCost1);
+	SimdInt b = simdMin1(simdMin1(x, y), z);
+	SimdInt isDrop = simdEq1(simdMin1(b, mBadScore), mBadScore);
+	mBestScore = simdMin1(b, mBestScore);
+	simdStore(x0+i, simdOr(simdSub1(b, s), isDrop));
+	simdStore(y0+i, simdMin1(simdAdds1(b, mDelOpenCost), y));
+	simdStore(z0+i, simdMin1(simdAdds1(b, mInsOpenCost), z));
+	s1 += simdBytes;
+	s2 += simdBytes;
+      }
+      if (isDelimiter2) x0[0] = droppedTinyScore;
+      if (isDelimiter1) x0[n] = droppedTinyScore;  // maybe n=0
+    }
+
+    mScoreRise2 = mScoreRise1;
+    mScoreRise1 = simdZero();
+    int newBestScore = simdHorizontalMin1(mBestScore);
+    int rise = 0;
+    if (newBestScore < scoreOffset) {
+      rise = scoreOffset - newBestScore;
+      bestScore += rise;
+      bestAntidiagonal = antidiagonal;
+      mBestScore = mNegInf;
+      mScoreRise1 = simdFill1(rise);
+    }
+    scoreRises.push_back(rise);
+
+    diagPos = horiPos;
+    horiPos = thisPos - 1;
+    thisPos += numCells;
+
+    if (x0[n] != droppedTinyScore) {
+      ++seq1end;
+      uchar x = toUnmasked[*seq1];
+      seq1queue.push(x, n + seqLoadLen);
+      seq1 += seqIncrement * (x != delimiter);
+      uchar z = seq1queue.fromEnd(seqLoadLen);
+      if (z >= 4) {
+	isDna = false;
+      }
+    }
+
+    if (x0[0] != droppedTinyScore) {
+      uchar y = toUnmasked[*seq2];
+      seq2queue.push(y, n + seqLoadLen);
+      seq2 += seqIncrement;
+      if (y >= 4) {
+	isDna = false;
+      }
+    } else {
+      ++seq1beg;
+      ++diagPos;
+      ++horiPos;
+      if (seq1beg == seq1end) break;
+    }
+  }
+
+  calcBestSeq1positionTiny(scoreOffset);
+  return bestScore;
+}
+
+bool GappedXdropAligner::getNextChunkDna(size_t &end1,
+					 size_t &end2,
+					 size_t &length,
+					 int delOpenCost,
+					 int delGrowCost,
+					 int insOpenCost,
+					 int insGrowCost) {
+  if (bestAntidiagonal == 0) return false;
+
+  end1 = bestSeq1position;
+  end2 = bestAntidiagonal - bestSeq1position;
+
+  int x, y, z;
+  while (1) {
+    size_t h = hori(bestAntidiagonal, bestSeq1position);
+    size_t v = vert(bestAntidiagonal, bestSeq1position);
+    size_t d = diag(bestAntidiagonal, bestSeq1position);
+    x = xTinyScores[d] + scoreRises[bestAntidiagonal];
+    y = yTinyScores[h] + delGrowCost;
+    z = zTinyScores[v] + insGrowCost;
+    if (x > y || x > z || bestAntidiagonal == 0) break;
+    bestAntidiagonal -= 2;
+    bestSeq1position -= 1;
+  }
+
+  length = end1 - bestSeq1position;
+  if (bestAntidiagonal == 0) return true;
+
+  while (1) {
+    bool isDel = (y <= z);
+    bestAntidiagonal -= 1;
+    if (isDel) bestSeq1position -= 1;
+    size_t h = hori(bestAntidiagonal, bestSeq1position);
+    size_t v = vert(bestAntidiagonal, bestSeq1position);
+    size_t d = diag(bestAntidiagonal, bestSeq1position);
+    x = xTinyScores[d] + scoreRises[bestAntidiagonal];
+    y = yTinyScores[h] + delGrowCost;
+    z = zTinyScores[v] + insGrowCost;
+    if (isDel) {
+      y -= delOpenCost;
+    } else {
+      z -= insOpenCost;
+    }
+    if (x <= y && x <= z) return true;
+  }
+}
+
+}


=====================================
src/GappedXdropAlignerInl.hh
=====================================
@@ -59,13 +59,6 @@ T whichFrame(size_t antidiagonal, T frame0, T frame1, T frame2) {
   }
 }
 
-inline bool isAffineGaps(int delExistenceCost, int delExtensionCost,
-			 int insExistenceCost, int insExtensionCost,
-			 int gapUnalignedCost) {
-  return gapUnalignedCost >= delExtensionCost + insExtensionCost +
-    std::max(delExistenceCost, insExistenceCost);
-}
-
 // The next two functions will stop the alignment at delimiters.  But
 // this is not guaranteed if bestScore > INF / 2.  We could avoid this
 // restriction by replacing -INF / 2 with bestScore - INF.
@@ -97,10 +90,11 @@ inline void checkGappedXdropScore(int bestScore) {
 inline void updateBest1(int &bestScore,
 			size_t &bestAntidiagonal,
 			size_t &bestSeq1position,
+			int minScore,
 			int score,
 			size_t antidiagonal,
 			size_t seq1position) {
-  if (score > bestScore) {
+  if (score >= minScore && score > bestScore) {
     bestScore = score;
     bestAntidiagonal = antidiagonal;
     bestSeq1position = seq1position;
@@ -119,26 +113,13 @@ inline void GappedXdropAligner::updateBest(int &bestScore, int score,
 }
 
 inline void updateMaxScoreDrop(int &maxScoreDrop,
-                               size_t numCells, int maxMatchScore) {
+                               int maxMatches, int maxMatchScore) {
   // If the current antidiagonal touches a sentinel/delimiter, then
   // maxMatches is the maximum possible number of matches starting
   // from the next antidiagonal.
-  int maxMatches = static_cast<int>(numCells - 1);
   maxScoreDrop = std::min(maxScoreDrop, maxMatches * maxMatchScore - 1);
 }
 
-inline void updateFiniteEdges(size_t *maxSeq1begs, size_t *minSeq1ends,
-                              const Score *x0base, const Score *x0end,
-                              size_t numCells) {
-  const Score *x0beg = x0end - numCells;
-
-  maxSeq1begs[0] = maxSeq1begs[1] + 1;
-  maxSeq1begs[1] = finiteBeg(x0beg, x0end) - x0base;
-
-  minSeq1ends[0] = minSeq1ends[1];
-  minSeq1ends[1] = finiteEnd(x0beg, x0end) - x0base + 1;
-}
-
 inline void updateFiniteEdges3(size_t *maxSeq1begs, size_t *minSeq1ends,
                                const Score *x0base, const Score *x0end,
                                size_t numCells) {


=====================================
src/GappedXdropAlignerPssm.cc
=====================================
@@ -14,96 +14,86 @@ int GappedXdropAligner::alignPssm(const uchar *seq,
 				  int insExistenceCost,
 				  int insExtensionCost,
                                   int gapUnalignedCost,
+				  bool isAffine,
                                   int maxScoreDrop,
                                   int maxMatchScore) {
   const int *vectorOfMatchScores = *pssm;
-  const SimdInt mNegInf = simdSet1(-INF);
-  const SimdInt mDelOpenCost = simdSet1(delExistenceCost);
-  const SimdInt mDelGrowCost = simdSet1(delExtensionCost);
-  const SimdInt mInsOpenCost = simdSet1(insExistenceCost);
-  const SimdInt mInsGrowCost = simdSet1(insExtensionCost);
+  const SimdInt mNegInf = simdFill(-INF);
+  const SimdInt mDelOpenCost = simdFill(delExistenceCost);
+  const SimdInt mDelGrowCost = simdFill(delExtensionCost);
+  const SimdInt mInsOpenCost = simdFill(insExistenceCost);
+  const SimdInt mInsGrowCost = simdFill(insExtensionCost);
   const int seqIncrement = isForward ? 1 : -1;
-  const bool isAffine = isAffineGaps(delExistenceCost, delExtensionCost,
-				     insExistenceCost, insExtensionCost,
-				     gapUnalignedCost);
 
-  size_t maxSeq1begs[] = { 0, 9 };
-  size_t minSeq1ends[] = { 1, 0 };
+  size_t seq1beg = 0;
+  size_t seq1end = 1;
+  size_t diagPos = xdropPadLen - 1;
+  size_t horiPos = xdropPadLen * 2 - 1;
+  size_t thisPos = xdropPadLen * 2;
 
   int bestScore = 0;
-  SimdInt mBestScore = simdSet1(0);
+  SimdInt mBestScore = simdZero();
   int bestEdgeScore = -INF;
   size_t bestEdgeAntidiagonal = 0;
 
   init();
   seq1queue.clear();
-  seq2queue.clear();
+  pssmQueue.clear();
 
-  // xxx the queue names are flipped: seq1queue <=> seq2queue
+  bool isDelimiter1 = isDelimiter(*seq, vectorOfMatchScores);
+  bool isDelimiter2 = isDelimiter(0, vectorOfMatchScores);
 
-  for (int i = 0; i < simdLen-1; ++i) {
-    uchar c = *seq;
-    seq2queue.push(c, 0);
-    seq += seqIncrement * !isDelimiter(c, vectorOfMatchScores);
-    seq1queue.push(vectorOfMatchScores, 0);
+  for (int i = 0; i < simdLen; ++i) {
+    uchar x = *seq;
+    seq1queue.push(x, i);
+    seq += seqIncrement * !isDelimiter(x, vectorOfMatchScores);
+    pssmQueue.push(vectorOfMatchScores, i);
   }
 
-  for (size_t antidiagonal = 0; /* noop */; ++antidiagonal) {
-    size_t seq1beg = std::min(maxSeq1begs[0], maxSeq1begs[1]);
-    size_t seq1end = std::max(minSeq1ends[0], minSeq1ends[1]);
-
-    if (seq1beg >= seq1end) break;
+  pssm += seqIncrement;
 
-    size_t scoreEnd = scoreEnds.back();
+  for (size_t antidiagonal = 0; /* noop */; ++antidiagonal) {
     int numCells = seq1end - seq1beg;
-
-    initAntidiagonal(seq1end, scoreEnd + xdropPadLen + numCells);
-
-    if (seq1end + (simdLen-1) > seq2queue.size()) {
-      uchar c = *seq;
-      seq2queue.push(c, seq1beg);
-      seq += seqIncrement * !isDelimiter(c, vectorOfMatchScores);
+    int n = numCells - 1;
+
+    const uchar *s1 = &seq1queue.fromEnd(n + simdLen);
+    const const_int_ptr *s2 = &pssmQueue.fromEnd(1);
+
+    initAntidiagonal(seq1end, thisPos, numCells);
+    thisPos += xdropPadLen;
+    Score *x0 = &xScores[thisPos];
+    Score *y0 = &yScores[thisPos];
+    Score *z0 = &zScores[thisPos];
+    const Score *y1 = &yScores[horiPos];
+    const Score *z1 = &zScores[horiPos + 1];
+    const Score *x2 = &xScores[diagPos];
+
+    if (!globality && (isDelimiter1 || isDelimiter2)) {
+      updateMaxScoreDrop(maxScoreDrop, n, maxMatchScore);
     }
-    const uchar *s1 = &seq2queue[seq1beg];
-
-    size_t seq2pos = antidiagonal - seq1beg;
-    if (seq2pos + simdLen > seq1queue.size()) {
-      seq1queue.push(*pssm, seq2pos - numCells + 1);
-      pssm += seqIncrement;
-    }
-    const const_int_ptr *s2 = &seq1queue[seq2pos + (simdLen-1)];
-
-    if (!globality && isDelimiter(0, *s2))
-      updateMaxScoreDrop(maxScoreDrop, numCells, maxMatchScore);
 
     int minScore = bestScore - maxScoreDrop;
-    SimdInt mMinScore = simdSet1(minScore);
-
-    Score *x0 = &xScores[scoreEnd];
-    Score *y0 = &yScores[scoreEnd];
-    Score *z0 = &zScores[scoreEnd];
-    const Score *y1 = &yScores[hori(antidiagonal, seq1beg)];
-    const Score *z1 = &zScores[vert(antidiagonal, seq1beg)];
-    const Score *x2 = &xScores[diag(antidiagonal, seq1beg)];
+    SimdInt mMinScore = simdFill(minScore);
 
-    simdStore(x0, mNegInf);  x0 += xdropPadLen;
-    simdStore(y0, mNegInf);  y0 += xdropPadLen;
-    simdStore(z0, mNegInf);  z0 += xdropPadLen;
-
-    if (globality && isDelimiter(0, *s2)) {
-      const Score *z2 = &zScores[diag(antidiagonal, seq1beg)];
+    if (globality && isDelimiter2) {
+      const Score *z2 = &zScores[diagPos];
       int b = maxValue(x2[0], z1[0]-insExtensionCost, z2[0]-gapUnalignedCost);
-      if (b >= minScore)
-	updateBest1(bestEdgeScore, bestEdgeAntidiagonal, bestSeq1position,
-		    b, antidiagonal, seq1beg);
+      updateBest1(bestEdgeScore, bestEdgeAntidiagonal, bestSeq1position,
+		  minScore, b, antidiagonal, seq1beg);
+    }
+
+    if (globality && isDelimiter1) {
+      const Score *y2 = &yScores[diagPos];
+      int b = maxValue(x2[n], y1[n]-delExtensionCost, y2[n]-gapUnalignedCost);
+      updateBest1(bestEdgeScore, bestEdgeAntidiagonal, bestSeq1position,
+		  minScore, b, antidiagonal, seq1end-1);
     }
 
     if (isAffine) {
       for (int i = 0; i < numCells; i += simdLen) {
 	SimdInt s = simdSet(
 #ifdef __SSE4_1__
-//#ifdef __AVX2__
-#ifdef WANT_AVX2
+#ifdef __AVX2__
 			    s2[-7][s1[7]],
 			    s2[-6][s1[6]],
 			    s2[-5][s1[5]],
@@ -133,8 +123,8 @@ int GappedXdropAligner::alignPssm(const uchar *seq,
 	bestAntidiagonal = antidiagonal;
       }
     } else {
-      const Score *y2 = &yScores[diag(antidiagonal, seq1beg)];
-      const Score *z2 = &zScores[diag(antidiagonal, seq1beg)];
+      const Score *y2 = &yScores[diagPos];
+      const Score *z2 = &zScores[diagPos];
       for (int i = 0; i < numCells; ++i) {
         int x = x2[i];
         int y = maxValue(y1[i] - delExtensionCost, y2[i] - gapUnalignedCost);
@@ -155,22 +145,30 @@ int GappedXdropAligner::alignPssm(const uchar *seq,
       }
     }
 
-    uchar seq1back = seq2queue[seq1end - 1];
-
-    if (globality && isDelimiter(seq1back, vectorOfMatchScores)) {
-      const Score *y2 = &yScores[diag(antidiagonal, seq1beg)];
-      int n = numCells - 1;
-      int b = maxValue(x2[n], y1[n]-delExtensionCost, y2[n]-gapUnalignedCost);
-      if (b >= minScore)
-	updateBest1(bestEdgeScore, bestEdgeAntidiagonal, bestSeq1position,
-		    b, antidiagonal, seq1end-1);
+    diagPos = horiPos;
+    horiPos = thisPos - 1;
+    thisPos += numCells;
+
+    if (x0[n] > -INF / 2) {
+      ++seq1end;
+      uchar x = *seq;
+      seq1queue.push(x, n + simdLen);
+      seq += seqIncrement * !isDelimiter(x, vectorOfMatchScores);
+      isDelimiter1 = isDelimiter(seq1queue.fromEnd(simdLen),
+				 vectorOfMatchScores);
     }
 
-    if (!globality && isDelimiter(seq1back, vectorOfMatchScores))
-      updateMaxScoreDrop(maxScoreDrop, numCells, maxMatchScore);
-
-    const Score *x0base = x0 - seq1beg;
-    updateFiniteEdges(maxSeq1begs, minSeq1ends, x0base, x0+numCells, numCells);
+    if (x0[0] > -INF / 2) {
+      const int *y = *pssm;
+      pssmQueue.push(y, n + simdLen);
+      pssm += seqIncrement;
+      isDelimiter2 = isDelimiter(0, y);
+    } else {
+      ++seq1beg;
+      ++diagPos;
+      ++horiPos;
+      if (seq1beg == seq1end) break;
+    }
   }
 
   if (globality) {


=====================================
src/ScoreMatrixRow.hh
=====================================
@@ -15,10 +15,17 @@ enum { scoreMatrixRowSize = ALPHABET_CAPACITY };
 
 typedef int ScoreMatrixRow[scoreMatrixRowSize];
 
-// An "infinite" score.  Delimiters at the ends of sequences get a
-// score of -INF.  We want it high enough to terminate alignments
-// immediately, but not so high that it causes overflow errors.
-enum { INF = INT_MAX / 2 };
+// Substitution score for delimiter symbols at the ends of sequences.
+// It should be highly negative, to terminate alignments immediately,
+// but not so negative that it causes overflow errors.
+
+// The delimiter score when using short ints:
+const int shortDelimiterScore = SHRT_MIN/2 + SCHAR_MIN;
+
+// We want: short(delimiterScore) = shortDelimiterScore
+const int delimiterScore = INT_MIN/2 + (unsigned short)shortDelimiterScore;
+
+enum { INF = -delimiterScore };
 
 }
 


=====================================
src/alp/njn_matrix.hpp
=====================================
@@ -597,7 +597,7 @@ namespace Njn {
         for (size_t i = 0; i < this->getM (); i++) {
             delete [] d_matrix_p [i]; d_matrix_p [i] = 0;
         }
-        if (this->getM () > 0) delete [] d_matrix_p; d_matrix_p = 0;
+        if (this->getM () > 0) { delete [] d_matrix_p; d_matrix_p = 0; }
 
         d_m = 0;
         d_n = 0;


=====================================
src/alp/njn_random.cpp
=====================================
@@ -85,7 +85,7 @@ void Random::seed (long x)
 long Random::number () // uniform random x : 0 <= x <= exp2 (31) - 1
 
 {
-	long	r;
+	long r;
 
 	r = *rK;
 	r += *rJ--;


=====================================
src/alp/njn_vector.hpp
=====================================
@@ -330,7 +330,7 @@ namespace Njn {
    template <typename T> 
    void Vector <T>::free2 ()
    {
-      if (getM () > 0) delete [] d_vector_p; d_vector_p = 0;
+      if (getM () > 0) { delete [] d_vector_p; d_vector_p = 0; }
 
       d_m = 0;
    }


=====================================
src/lastal.cc
=====================================
@@ -673,7 +673,8 @@ void alignGapped( LastAligner& aligner,
 
     // do gapped extension from each end of the seed:
     aln.makeXdrop( aligner.centroid, aligner.greedyAligner, args.isGreedy,
-		   dis.a, dis.b, args.globality, dis.m, scoreMatrix.maxScore,
+		   dis.a, dis.b, args.globality,
+		   dis.m, scoreMatrix.maxScore, scoreMatrix.minScore,
 		   gapCosts, dis.d, args.frameshiftCost, frameSize,
 		   dis.p, dis.t, dis.i, dis.j, alph, extras );
     ++gappedExtensionCount;
@@ -742,8 +743,8 @@ void alignFinish( LastAligner& aligner, const AlignmentPot& gappedAlns,
       probAln.seed = aln.seed;
       probAln.makeXdrop( centroid, aligner.greedyAligner, args.isGreedy,
 			 dis.a, dis.b, args.globality,
-			 dis.m, scoreMatrix.maxScore, gapCosts, dis.d,
-                         args.frameshiftCost, frameSize,
+			 dis.m, scoreMatrix.maxScore, scoreMatrix.minScore,
+			 gapCosts, dis.d, args.frameshiftCost, frameSize,
 			 dis.p, dis.t, dis.i, dis.j, alph, extras,
 			 args.gamma, args.outputType );
       assert( aln.score != -INF );


=====================================
src/makefile
=====================================
@@ -1,6 +1,5 @@
-CXXFLAGS = -msse4.1 -O3 -Wall -Wextra -Wcast-qual -Wswitch-enum	\
--Wundef -Wcast-align -pedantic -g -std=c++11 -pthread		\
--DHAS_CXX_THREADS
+CXXFLAGS = -msse4 -O3 -Wall -Wextra -Wcast-qual -Wswitch-enum -Wundef	\
+-Wcast-align -pedantic -g -std=c++11 -pthread -DHAS_CXX_THREADS
 # -Wconversion
 # -fomit-frame-pointer ?
 
@@ -32,12 +31,12 @@ SubsetSuffixArraySort.o lastdb.o
 alignObj0 = Alphabet.o CyclicSubsetSeed.o LambdaCalculator.o		\
 ScoreMatrix.o SubsetMinimizerFinder.o TantanMasker.o fileMap.o		\
 tantan.o LastalArguments.o GappedXdropAligner.o				\
-GappedXdropAlignerPssm.o GappedXdropAligner2qual.o			\
-GappedXdropAligner3frame.o mcf_gap_costs.o GeneticCode.o	\
-GreedyXdropAligner.o LastEvaluer.o OneQualityScoreMatrix.o		\
-QualityPssmMaker.o TwoQualityScoreMatrix.o gaplessXdrop.o		\
-gaplessPssmXdrop.o gaplessTwoQualityXdrop.o cbrc_linalg.o		\
-mcf_substitution_matrix_stats.o $(alpObj)
+GappedXdropAlignerDna.o GappedXdropAlignerPssm.o			\
+GappedXdropAligner2qual.o GappedXdropAligner3frame.o mcf_gap_costs.o	\
+GeneticCode.o GreedyXdropAligner.o LastEvaluer.o			\
+OneQualityScoreMatrix.o QualityPssmMaker.o TwoQualityScoreMatrix.o	\
+gaplessXdrop.o gaplessPssmXdrop.o gaplessTwoQualityXdrop.o		\
+cbrc_linalg.o mcf_substitution_matrix_stats.o $(alpObj)
 
 alignObj4 = Alignment.o AlignmentPot.o AlignmentWrite.o Centroid.o	\
 DiagonalTable.o MultiSequence.o MultiSequenceQual.o SegmentPair.o	\
@@ -146,9 +145,9 @@ depend:
 	mv m makefile
 Alignment.o Alignment.o8: Alignment.cc Alignment.hh ScoreMatrixRow.hh SegmentPair.hh \
  mcf_gap_costs.hh Alphabet.hh Centroid.hh GappedXdropAligner.hh \
- mcf_contiguous_queue.hh mcf_simd.hh OneQualityScoreMatrix.hh \
- mcf_substitution_matrix_stats.hh GeneticCode.hh GreedyXdropAligner.hh \
- TwoQualityScoreMatrix.hh
+ mcf_contiguous_queue.hh mcf_reverse_queue.hh mcf_simd.hh \
+ OneQualityScoreMatrix.hh mcf_substitution_matrix_stats.hh GeneticCode.hh \
+ GreedyXdropAligner.hh TwoQualityScoreMatrix.hh
 AlignmentPot.o AlignmentPot.o8: AlignmentPot.cc AlignmentPot.hh Alignment.hh \
  ScoreMatrixRow.hh SegmentPair.hh mcf_gap_costs.hh
 AlignmentWrite.o AlignmentWrite.o8: AlignmentWrite.cc Alignment.hh ScoreMatrixRow.hh \
@@ -159,28 +158,32 @@ AlignmentWrite.o AlignmentWrite.o8: AlignmentWrite.cc Alignment.hh ScoreMatrixRo
  Alphabet.hh
 Alphabet.o Alphabet.o8: Alphabet.cc Alphabet.hh
 Centroid.o Centroid.o8: Centroid.cc Centroid.hh GappedXdropAligner.hh \
- mcf_contiguous_queue.hh mcf_simd.hh ScoreMatrixRow.hh mcf_gap_costs.hh \
- SegmentPair.hh OneQualityScoreMatrix.hh mcf_substitution_matrix_stats.hh \
+ mcf_contiguous_queue.hh mcf_reverse_queue.hh mcf_simd.hh \
+ ScoreMatrixRow.hh mcf_gap_costs.hh SegmentPair.hh \
+ OneQualityScoreMatrix.hh mcf_substitution_matrix_stats.hh \
  GappedXdropAlignerInl.hh
 CyclicSubsetSeed.o CyclicSubsetSeed.o8: CyclicSubsetSeed.cc CyclicSubsetSeed.hh \
  CyclicSubsetSeedData.hh zio.hh mcf_zstream.hh stringify.hh
 DiagonalTable.o DiagonalTable.o8: DiagonalTable.cc DiagonalTable.hh
 GappedXdropAligner.o GappedXdropAligner.o8: GappedXdropAligner.cc GappedXdropAligner.hh \
- mcf_contiguous_queue.hh mcf_simd.hh ScoreMatrixRow.hh \
- GappedXdropAlignerInl.hh
+ mcf_contiguous_queue.hh mcf_reverse_queue.hh mcf_simd.hh \
+ ScoreMatrixRow.hh GappedXdropAlignerInl.hh
 GappedXdropAligner2qual.o GappedXdropAligner2qual.o8: GappedXdropAligner2qual.cc \
- GappedXdropAligner.hh mcf_contiguous_queue.hh mcf_simd.hh \
- ScoreMatrixRow.hh GappedXdropAlignerInl.hh TwoQualityScoreMatrix.hh \
- mcf_substitution_matrix_stats.hh
+ GappedXdropAligner.hh mcf_contiguous_queue.hh mcf_reverse_queue.hh \
+ mcf_simd.hh ScoreMatrixRow.hh GappedXdropAlignerInl.hh \
+ TwoQualityScoreMatrix.hh mcf_substitution_matrix_stats.hh
 GappedXdropAligner3frame.o GappedXdropAligner3frame.o8: GappedXdropAligner3frame.cc \
- GappedXdropAligner.hh mcf_contiguous_queue.hh mcf_simd.hh \
- ScoreMatrixRow.hh GappedXdropAlignerInl.hh
+ GappedXdropAligner.hh mcf_contiguous_queue.hh mcf_reverse_queue.hh \
+ mcf_simd.hh ScoreMatrixRow.hh GappedXdropAlignerInl.hh
 GappedXdropAligner3framePssm.o GappedXdropAligner3framePssm.o8: GappedXdropAligner3framePssm.cc \
- GappedXdropAligner.hh mcf_contiguous_queue.hh mcf_simd.hh \
+ GappedXdropAligner.hh mcf_contiguous_queue.hh mcf_reverse_queue.hh \
+ mcf_simd.hh ScoreMatrixRow.hh GappedXdropAlignerInl.hh
+GappedXdropAlignerDna.o GappedXdropAlignerDna.o8: GappedXdropAlignerDna.cc GappedXdropAligner.hh \
+ mcf_contiguous_queue.hh mcf_reverse_queue.hh mcf_simd.hh \
  ScoreMatrixRow.hh GappedXdropAlignerInl.hh
 GappedXdropAlignerPssm.o GappedXdropAlignerPssm.o8: GappedXdropAlignerPssm.cc GappedXdropAligner.hh \
- mcf_contiguous_queue.hh mcf_simd.hh ScoreMatrixRow.hh \
- GappedXdropAlignerInl.hh
+ mcf_contiguous_queue.hh mcf_reverse_queue.hh mcf_simd.hh \
+ ScoreMatrixRow.hh GappedXdropAlignerInl.hh
 GeneticCode.o GeneticCode.o8: GeneticCode.cc GeneticCode.hh GeneticCodeData.hh \
  Alphabet.hh zio.hh mcf_zstream.hh
 GreedyXdropAligner.o GreedyXdropAligner.o8: GreedyXdropAligner.cc GreedyXdropAligner.hh \
@@ -204,8 +207,9 @@ OneQualityScoreMatrix.o OneQualityScoreMatrix.o8: OneQualityScoreMatrix.cc \
  ScoreMatrixRow.hh qualityScoreUtil.hh stringify.hh
 QualityPssmMaker.o QualityPssmMaker.o8: QualityPssmMaker.cc QualityPssmMaker.hh \
  ScoreMatrixRow.hh qualityScoreUtil.hh stringify.hh
-ScoreMatrix.o ScoreMatrix.o8: ScoreMatrix.cc ScoreMatrix.hh ScoreMatrixData.hh \
- qualityScoreUtil.hh stringify.hh zio.hh mcf_zstream.hh
+ScoreMatrix.o ScoreMatrix.o8: ScoreMatrix.cc ScoreMatrix.hh ScoreMatrixRow.hh \
+ ScoreMatrixData.hh qualityScoreUtil.hh stringify.hh zio.hh \
+ mcf_zstream.hh
 SegmentPair.o SegmentPair.o8: SegmentPair.cc SegmentPair.hh
 SegmentPairPot.o SegmentPairPot.o8: SegmentPairPot.cc SegmentPairPot.hh SegmentPair.hh
 SubsetMinimizerFinder.o SubsetMinimizerFinder.o8: SubsetMinimizerFinder.cc \
@@ -244,11 +248,12 @@ lastal.o lastal.o8: lastal.cc last.hh Alphabet.hh MultiSequence.hh \
  alp/sls_falp_alignment_evaluer.hpp alp/sls_fsa1_pvalues.hpp \
  GeneticCode.hh SubsetMinimizerFinder.hh SubsetSuffixArray.hh \
  CyclicSubsetSeed.hh Centroid.hh GappedXdropAligner.hh \
- mcf_contiguous_queue.hh mcf_simd.hh mcf_gap_costs.hh SegmentPair.hh \
- AlignmentPot.hh Alignment.hh SegmentPairPot.hh ScoreMatrix.hh \
- TantanMasker.hh tantan.hh DiagonalTable.hh GreedyXdropAligner.hh \
- gaplessXdrop.hh gaplessPssmXdrop.hh gaplessTwoQualityXdrop.hh zio.hh \
- mcf_zstream.hh threadUtil.hh version.hh
+ mcf_contiguous_queue.hh mcf_reverse_queue.hh mcf_simd.hh \
+ mcf_gap_costs.hh SegmentPair.hh AlignmentPot.hh Alignment.hh \
+ SegmentPairPot.hh ScoreMatrix.hh TantanMasker.hh tantan.hh \
+ DiagonalTable.hh GreedyXdropAligner.hh gaplessXdrop.hh \
+ gaplessPssmXdrop.hh gaplessTwoQualityXdrop.hh zio.hh mcf_zstream.hh \
+ threadUtil.hh version.hh
 lastdb.o lastdb.o8: lastdb.cc last.hh Alphabet.hh MultiSequence.hh \
  ScoreMatrixRow.hh VectorOrMmap.hh Mmap.hh fileMap.hh stringify.hh \
  SequenceFormat.hh qualityScoreUtil.hh LastdbArguments.hh \


=====================================
src/mcf_contiguous_queue.hh
=====================================
@@ -13,25 +13,19 @@ template <typename T> class ContiguousQueue {
 public:
   void clear() {
     v.clear();
-    start = 0;
   }
 
-  size_t size() const { return start + v.size(); }
-
-  void push(const T &item, size_t newStart) {
-    size_t oldSize = newStart - start;
-    if (oldSize > v.size() / 2) {
-      v.erase(v.begin(), v.begin() + oldSize);
-      start = newStart;
+  void push(const T &item, size_t numOfOldItemsToKeep) {
+    if (numOfOldItemsToKeep <= v.size() / 2) {
+      v.erase(v.begin(), v.end() - numOfOldItemsToKeep);
     }
     v.push_back(item);
   }
 
-  const T &operator [] (size_t n) const { return v[n - start]; }
+  const T &fromEnd(int n) const { return v.end()[-n]; }
 
 private:
   std::vector<T> v;
-  size_t start;
 };
 
 }


=====================================
src/mcf_gap_costs.cc
=====================================
@@ -39,6 +39,9 @@ void GapCosts::assign(const std::vector<int> &delOpenCosts,
     pairCost = delPieces[0].openCost + delPieces[0].growCost +
       insPieces[0].openCost + insPieces[0].growCost;
   }
+  isAffine = (delPieces.size() < 2 && insPieces.size() < 2 &&
+	      pairCost >= delPieces[0].growCost + insPieces[0].growCost +
+	      std::max(delPieces[0].openCost, insPieces[0].openCost));
 }
 
 int GapCosts::cost(int refInsertLen, int qryInsertLen) const {


=====================================
src/mcf_gap_costs.hh
=====================================
@@ -31,6 +31,8 @@ struct GapCosts {
   std::vector<Piece> insPieces;
   int pairCost;
 
+  bool isAffine;
+
   // Assign piecewise linear open and grow costs, and one pairCost.
   // If unalignedPairCost <= 0, assign non-generalized costs.
   // Throw a runtime_error if any growCost is <= 0.


=====================================
src/mcf_reverse_queue.hh
=====================================
@@ -0,0 +1,60 @@
+// Author: Martin C. Frith 2019
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#ifndef MCF_REVERSE_QUEUE
+#define MCF_REVERSE_QUEUE
+
+#include <new>
+#include <stdlib.h>
+#include <string.h>
+
+namespace mcf {
+
+template <typename T> class ReverseQueue {
+public:
+  ReverseQueue() : buf(0), beg(0), end(0) {}
+
+  void clear() {
+    beg = end;
+  }
+
+  void push(const T &item, unsigned numOfOldItemsToKeep) {
+    if (beg == buf) {
+      size_t len = end - beg;
+      if (numOfOldItemsToKeep < len / 2) {
+	beg = end - numOfOldItemsToKeep;
+	memcpy(beg, buf, numOfOldItemsToKeep * sizeof(T));
+      } else {
+	size_t newLen = len * 2 + 128;
+	size_t newBytes = newLen * sizeof(T);
+	if (newBytes <= len * sizeof(T)) throw std::bad_alloc();
+	void *p = malloc(newBytes);
+	if (!p) throw std::bad_alloc();
+	end = static_cast<T *>(p) + newLen;
+	beg = end - numOfOldItemsToKeep;
+	memcpy(beg, buf, numOfOldItemsToKeep * sizeof(T));
+	free(buf);
+	buf = static_cast<T *>(p);
+      }
+    }
+    --beg;
+    *beg = item;
+  }
+
+  const T *begin() const {
+    return beg;
+  }
+
+  ~ReverseQueue() {
+    free(buf);
+  }
+
+private:
+  T *buf;
+  T *beg;
+  T *end;
+};
+
+}
+
+#endif


=====================================
src/mcf_simd.hh
=====================================
@@ -8,11 +8,20 @@
 
 namespace mcf {
 
-//#if defined __AVX2__
-#if defined WANT_AVX2
+#if defined __AVX2__
 
 typedef __m256i SimdInt;
 
+const int simdBytes = 32;
+
+static inline SimdInt simdZero() {
+  return _mm256_setzero_si256();
+}
+
+static inline SimdInt simdOnes() {
+  return _mm256_set1_epi32(-1);
+}
+
 static inline SimdInt simdLoad(const void *p) {
   return _mm256_loadu_si256((const SimdInt *)p);
 }
@@ -21,37 +30,112 @@ static inline void simdStore(void *p, SimdInt x) {
   _mm256_storeu_si256((SimdInt *)p, x);
 }
 
+static inline SimdInt simdOr(SimdInt x, SimdInt y) {
+  return _mm256_or_si256(x, y);
+}
+
 static inline SimdInt simdBlend(SimdInt x, SimdInt y, SimdInt mask) {
   return _mm256_blendv_epi8(x, y, mask);
 }
 
 const int simdLen = 8;
+const int simdLen2 = 16;
 
 static inline SimdInt simdSet(int i7, int i6, int i5, int i4,
 			      int i3, int i2, int i1, int i0) {
   return _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0);
 }
 
-static inline SimdInt simdSet1(int x) {
+static inline SimdInt simdSet2(short iF, short iE, short iD, short iC,
+			       short iB, short iA, short i9, short i8,
+			       short i7, short i6, short i5, short i4,
+			       short i3, short i2, short i1, short i0) {
+  return _mm256_set_epi16(iF, iE, iD, iC, iB, iA, i9, i8,
+			  i7, i6, i5, i4, i3, i2, i1, i0);
+}
+
+static inline SimdInt simdSet1(char jF, char jE, char jD, char jC,
+			       char jB, char jA, char j9, char j8,
+			       char j7, char j6, char j5, char j4,
+			       char j3, char j2, char j1, char j0,
+			       char iF, char iE, char iD, char iC,
+			       char iB, char iA, char i9, char i8,
+			       char i7, char i6, char i5, char i4,
+			       char i3, char i2, char i1, char i0) {
+  return _mm256_set_epi8(jF, jE, jD, jC, jB, jA, j9, j8,
+			 j7, j6, j5, j4, j3, j2, j1, j0,
+			 iF, iE, iD, iC, iB, iA, i9, i8,
+			 i7, i6, i5, i4, i3, i2, i1, i0);
+}
+
+static inline SimdInt simdFill(int x) {
   return _mm256_set1_epi32(x);
 }
 
+static inline SimdInt simdFill2(short x) {
+  return _mm256_set1_epi16(x);
+}
+
+static inline SimdInt simdFill1(char x) {
+  return _mm256_set1_epi8(x);
+}
+
+static inline SimdInt simdEq1(SimdInt x, SimdInt y) {
+  return _mm256_cmpeq_epi8(x, y);
+}
+
 static inline SimdInt simdGt(SimdInt x, SimdInt y) {
   return _mm256_cmpgt_epi32(x, y);
 }
 
+static inline SimdInt simdGt2(SimdInt x, SimdInt y) {
+  return _mm256_cmpgt_epi16(x, y);
+}
+
 static inline SimdInt simdAdd(SimdInt x, SimdInt y) {
   return _mm256_add_epi32(x, y);
 }
 
+static inline SimdInt simdAdd2(SimdInt x, SimdInt y) {
+  return _mm256_add_epi16(x, y);
+}
+
+static inline SimdInt simdAdd1(SimdInt x, SimdInt y) {
+  return _mm256_add_epi8(x, y);
+}
+
+static inline SimdInt simdAdds1(SimdInt x, SimdInt y) {
+  return _mm256_adds_epu8(x, y);
+}
+
 static inline SimdInt simdSub(SimdInt x, SimdInt y) {
   return _mm256_sub_epi32(x, y);
 }
 
+static inline SimdInt simdSub2(SimdInt x, SimdInt y) {
+  return _mm256_sub_epi16(x, y);
+}
+
+static inline SimdInt simdSub1(SimdInt x, SimdInt y) {
+  return _mm256_sub_epi8(x, y);
+}
+
+static inline SimdInt simdLeft(SimdInt x, int bits) {
+  return _mm256_slli_epi32(x, bits);
+}
+
 static inline SimdInt simdMax(SimdInt x, SimdInt y) {
   return _mm256_max_epi32(x, y);
 }
 
+static inline SimdInt simdMax2(SimdInt x, SimdInt y) {
+  return _mm256_max_epi16(x, y);
+}
+
+static inline SimdInt simdMin1(SimdInt x, SimdInt y) {
+  return _mm256_min_epu8(x, y);
+}
+
 static inline int simdHorizontalMax(SimdInt x) {
   __m128i z = _mm256_castsi256_si128(x);
   z = _mm_max_epi32(z, _mm256_extracti128_si256(x, 1));
@@ -60,10 +144,40 @@ static inline int simdHorizontalMax(SimdInt x) {
   return _mm_cvtsi128_si32(z);
 }
 
+static inline int simdHorizontalMax2(SimdInt x) {
+  __m128i z = _mm256_castsi256_si128(x);
+  z = _mm_max_epi16(z, _mm256_extracti128_si256(x, 1));
+  z = _mm_sub_epi16(_mm_set1_epi16(32767), z);
+  z = _mm_minpos_epu16(z);
+  return 32767 - _mm_extract_epi16(z, 0);
+}
+
+static inline int simdHorizontalMin1(SimdInt x) {
+  __m128i z = _mm256_castsi256_si128(x);
+  z = _mm_min_epu8(z, _mm256_extracti128_si256(x, 1));
+  z = _mm_min_epu8(z, _mm_srli_epi16(z, 8));
+  z = _mm_minpos_epu16(z);
+  return _mm_extract_epi16(z, 0);
+}
+
+static inline SimdInt simdChoose1(SimdInt items, SimdInt choices) {
+  return _mm256_shuffle_epi8(items, choices);
+}
+
 #elif defined __SSE4_1__
 
 typedef __m128i SimdInt;
 
+const int simdBytes = 16;
+
+static inline SimdInt simdZero() {
+  return _mm_setzero_si128();
+}
+
+static inline SimdInt simdOnes() {
+  return _mm_set1_epi32(-1);
+}
+
 static inline SimdInt simdLoad(const void *p) {
   return _mm_loadu_si128((const SimdInt *)p);
 }
@@ -72,48 +186,130 @@ static inline void simdStore(void *p, SimdInt x) {
   _mm_storeu_si128((SimdInt *)p, x);
 }
 
+static inline SimdInt simdOr(SimdInt x, SimdInt y) {
+  return _mm_or_si128(x, y);
+}
+
 static inline SimdInt simdBlend(SimdInt x, SimdInt y, SimdInt mask) {
   return _mm_blendv_epi8(x, y, mask);  // SSE4.1
 }
 
 const int simdLen = 4;
+const int simdLen2 = 8;
 
 static inline SimdInt simdSet(int i3, int i2, int i1, int i0) {
   return _mm_set_epi32(i3, i2, i1, i0);
 }
 
-static inline SimdInt simdSet1(int x) {
+static inline SimdInt simdSet2(short i7, short i6, short i5, short i4,
+			       short i3, short i2, short i1, short i0) {
+  return _mm_set_epi16(i7, i6, i5, i4, i3, i2, i1, i0);
+}
+
+static inline SimdInt simdSet1(char iF, char iE, char iD, char iC,
+			       char iB, char iA, char i9, char i8,
+			       char i7, char i6, char i5, char i4,
+			       char i3, char i2, char i1, char i0) {
+  return _mm_set_epi8(iF, iE, iD, iC, iB, iA, i9, i8,
+		      i7, i6, i5, i4, i3, i2, i1, i0);
+}
+
+static inline SimdInt simdFill(int x) {
   return _mm_set1_epi32(x);
 }
 
+static inline SimdInt simdFill2(short x) {
+  return _mm_set1_epi16(x);
+}
+
+static inline SimdInt simdFill1(char x) {
+  return _mm_set1_epi8(x);
+}
+
+static inline SimdInt simdEq1(SimdInt x, SimdInt y) {
+  return _mm_cmpeq_epi8(x, y);
+}
+
 static inline SimdInt simdGt(SimdInt x, SimdInt y) {
   return _mm_cmpgt_epi32(x, y);
 }
 
+static inline SimdInt simdGt2(SimdInt x, SimdInt y) {
+  return _mm_cmpgt_epi16(x, y);
+}
+
 static inline SimdInt simdAdd(SimdInt x, SimdInt y) {
   return _mm_add_epi32(x, y);
 }
 
+static inline SimdInt simdAdd2(SimdInt x, SimdInt y) {
+  return _mm_add_epi16(x, y);
+}
+
+static inline SimdInt simdAdd1(SimdInt x, SimdInt y) {
+  return _mm_add_epi8(x, y);
+}
+
+static inline SimdInt simdAdds1(SimdInt x, SimdInt y) {
+  return _mm_adds_epu8(x, y);
+}
+
 static inline SimdInt simdSub(SimdInt x, SimdInt y) {
   return _mm_sub_epi32(x, y);
 }
 
+static inline SimdInt simdSub2(SimdInt x, SimdInt y) {
+  return _mm_sub_epi16(x, y);
+}
+
+static inline SimdInt simdSub1(SimdInt x, SimdInt y) {
+  return _mm_sub_epi8(x, y);
+}
+
+static inline SimdInt simdLeft(SimdInt x, int bits) {
+  return _mm_slli_epi32(x, bits);
+}
+
 static inline SimdInt simdMax(SimdInt x, SimdInt y) {
   return _mm_max_epi32(x, y);  // SSE4.1
 }
 
+static inline SimdInt simdMax2(SimdInt x, SimdInt y) {
+  return _mm_max_epi16(x, y);
+}
+
+static inline SimdInt simdMin1(SimdInt x, SimdInt y) {
+  return _mm_min_epu8(x, y);
+}
+
 static inline int simdHorizontalMax(SimdInt x) {
   x = simdMax(x, _mm_shuffle_epi32(x, 0x4E));
   x = simdMax(x, _mm_shuffle_epi32(x, 0xB1));
   return _mm_cvtsi128_si32(x);
 }
 
+static inline int simdHorizontalMax2(SimdInt x) {
+  x = simdSub2(simdFill2(32767), x);
+  x = _mm_minpos_epu16(x);  // SSE4.1
+  return 32767 - _mm_extract_epi16(x, 0);
+}
+
+static inline int simdHorizontalMin1(SimdInt x) {
+  x = _mm_min_epu8(x, _mm_srli_epi16(x, 8));
+  x = _mm_minpos_epu16(x);  // SSE4.1
+  return _mm_extract_epi16(x, 0);
+}
+
+static inline SimdInt simdChoose1(SimdInt items, SimdInt choices) {
+  return _mm_shuffle_epi8(items, choices);
+}
+
 #else
 
 typedef int SimdInt;
 const int simdLen = 1;
 static inline int simdSet(int x) { return x; }
-static inline int simdSet1(int x) { return x; }
+static inline int simdFill(int x) { return x; }
 static inline int simdLoad(const int *p) { return *p; }
 static inline void simdStore(int *p, int x) { *p = x; }
 static inline int simdGt(int x, int y) { return x > y; }


=====================================
src/version.hh
=====================================
@@ -1 +1 @@
-"1021"
+"1044"



View it on GitLab: https://salsa.debian.org/med-team/last-align/commit/60c6d42fe266f0fa717ddee8358bfbb1b71731c1

-- 
View it on GitLab: https://salsa.debian.org/med-team/last-align/commit/60c6d42fe266f0fa717ddee8358bfbb1b71731c1
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20191221/592f2c90/attachment-0001.html>


More information about the debian-med-commit mailing list